diff --git a/.dockerignore b/.dockerignore index fe1ff54b7..22a52b8b8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,5 +4,4 @@ */.mypy_cache */.pytest_cache */build -*/*/_mars **/node_modules \ No newline at end of file diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 04edad397..25b9f87eb 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -43,9 +43,11 @@ jobs: - uses: isort/isort-action@master with: sortPaths: "python/xorbits" - configuration: "--check-only --profile black --diff --skip-glob xorbits/_mars/" + configuration: "--check-only --profile black --diff --skip python/xorbits/_mars/" - name: mypy run: pip install mypy && cd python && mypy xorbits + - name: codespell + run: pip install codespell && cd python && codespell xorbits - name: Set up Node.js uses: actions/setup-node@v1 with: diff --git a/.gitignore b/.gitignore index aa3aa9542..88c75be57 100644 --- a/.gitignore +++ b/.gitignore @@ -119,6 +119,16 @@ venv.bak/ # mkdocs documentation /site +# cython compiled files +python/xorbits/_mars/*.c* +python/xorbits/_mars/*.h* +python/xorbits/_mars/core/**/*.c* +python/xorbits/_mars/learn/cluster/*.c* +python/xorbits/_mars/learn/utils/*.c* +python/xorbits/_mars/lib/*.c* +python/xorbits/_mars/oscar/**/*.c* +python/xorbits/_mars/serialization/*.c* + # mypy .mypy_cache/ .dmypy.json @@ -132,9 +142,6 @@ dmypy.json .vscode *.iml -# soft link -python/xorbits/_mars - # web staff node_modules/ static/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 97bb4bb60..000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "third_party/_mars"] - path = third_party/_mars - url = https://github.com/xprobe-inc/mars.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 30f2f5c25..a3988130a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,15 +18,23 @@ repos: rev: 5.12.0 hooks: - id: isort - args: [--profile=black] + args: [--sp, python/setup.cfg] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.991 + rev: v1.0.0 hooks: - id: mypy additional_dependencies: [tokenize-rt==3.2.0] - args: [--config-file, python/setup.cfg] + exclude: _mars + args: [--ignore-missing-imports, --follow-imports, skip] - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.0.0-alpha.4 # Use the sha or tag you want to point at hooks: - id: prettier types_or: [html, javascript] + args: [--ignore-path, python/xorbits/_mars] + - repo: https://github.com/codespell-project/codespell + rev: v2.2.2 + hooks: + - id: codespell + exclude: _mars/lib + args: [ --config, python/setup.cfg] diff --git a/python/setup.cfg b/python/setup.cfg index b8f471686..22084a4dd 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -168,13 +168,16 @@ exclude = ci/ dist/ docs/ - xorbits/_mars/* + xorbits/_mars/lib/nvutils.py + xorbits/_mars/lib/uhashring/* + xorbits/_mars/lib/version.py + per-file-ignores = */core/adapter.py: F401 [codespell] ignore-words-list = hist,rcall,fpr,ser,nd,inout,ot,Ba,ba,asend,hart,coo,splitted,datas,fro -skip = .idea,.git,./build,./docs/build,./xorbits/_mars/lib,node_modules,static,generated,*.po,*.ts,*.json,*.c,*.cpp,*.cfg +skip = .idea,.git,./build,./docs/build,xorbits/_mars/lib,node_modules,static,generated,*.po,*.ts,*.json,*.c,*.cpp,*.cfg [isort] profile = black diff --git a/python/setup.py b/python/setup.py index 5fdba7d35..ab2f504bb 100644 --- a/python/setup.py +++ b/python/setup.py @@ -61,15 +61,6 @@ repo_root = os.path.dirname(os.path.abspath(__file__)) 
os.chdir(repo_root) -# create symlink for mars -absolute_path = os.path.join(repo_root, os.path.join("xorbits", "_mars")) -source_path = os.path.join("..", "..", "third_party", "_mars", "mars") -try: - os.symlink(source_path, absolute_path, target_is_directory=True) -except FileExistsError: - # symlink exists already, skip - pass - cythonize_kw = dict(language_level=sys.version_info[0]) cy_extension_kw = dict() diff --git a/python/xorbits/_mars/__init__.py b/python/xorbits/_mars/__init__.py new file mode 100644 index 000000000..3e3beccbb --- /dev/null +++ b/python/xorbits/_mars/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import _version +from .config import options +from .core.context import get_context +from .deploy.oscar import new_cluster_in_ray, new_ray_session +from .session import execute, fetch, fetch_log, new_session, stop_server + +__version__ = _version.get_versions()["version"] diff --git a/python/xorbits/_mars/_resource.pyx b/python/xorbits/_mars/_resource.pyx new file mode 100644 index 000000000..63803adcf --- /dev/null +++ b/python/xorbits/_mars/_resource.pyx @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cdef class Resource: + cdef readonly: + float num_cpus + float num_gpus + float mem_bytes + + def __init__(self, float num_cpus=0, float num_gpus=0, float mem_bytes=0): + self.num_cpus = num_cpus + self.num_gpus = num_gpus + self.mem_bytes = mem_bytes + + def __eq__(self, Resource other): + cdef bint ret = ( + self.mem_bytes == other.mem_bytes + and self.num_gpus == other.num_gpus + and self.num_cpus == other.num_cpus + ) + return ret + + cdef bint _le(self, Resource other) nogil: + # memory first, then gpu, cpu last + cdef bint ret = ( + self.mem_bytes <= other.mem_bytes + and self.num_gpus <= other.num_gpus + and self.num_cpus <= other.num_cpus + ) + return ret + + def __gt__(self, Resource other): + return not self._le(other) + + def __le__(self, Resource other): + return self._le(other) + + def __add__(self, Resource other): + return Resource( + num_cpus=self.num_cpus + other.num_cpus, + num_gpus=self.num_gpus + other.num_gpus, + mem_bytes=self.mem_bytes + other.mem_bytes, + ) + + def __sub__(self, Resource other): + return Resource( + num_cpus=self.num_cpus - other.num_cpus, + num_gpus=self.num_gpus - other.num_gpus, + mem_bytes=self.mem_bytes - other.mem_bytes, + ) + + def __neg__(self): + return Resource( + num_cpus=-self.num_cpus, + num_gpus=-self.num_gpus, + mem_bytes=-self.mem_bytes, + ) + + def __repr__(self): + return f"Resource(num_cpus={self.num_cpus}, num_gpus={self.num_gpus}, mem_bytes={self.mem_bytes})" + +ZeroResource = Resource(num_cpus=0, num_gpus=0, mem_bytes=0) diff --git a/python/xorbits/_mars/_utils.pxd b/python/xorbits/_mars/_utils.pxd new file mode 100644 index 000000000..d875ff78c --- /dev/null +++ b/python/xorbits/_mars/_utils.pxd @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +cdef class TypeDispatcher: + cdef dict _handlers + cdef dict _lazy_handlers + cdef dict _inherit_handlers + cdef object __weakref__ + + cpdef void register(self, object type_, object handler) + cpdef void unregister(self, object type_) + cdef _reload_lazy_handlers(self) + cpdef get_handler(self, object type_) + + +cpdef str to_str(s, encoding=*) +cpdef bytes to_binary(s, encoding=*) +cpdef unicode to_text(s, encoding=*) +cpdef register_tokenizer(cls, handler) +cpdef void reset_id_random_seed() except * +cpdef bytes new_random_id(int byte_len) diff --git a/python/xorbits/_mars/_utils.pyx b/python/xorbits/_mars/_utils.pyx new file mode 100644 index 000000000..a7740f8df --- /dev/null +++ b/python/xorbits/_mars/_utils.pyx @@ -0,0 +1,508 @@ +# distutils: language = c++ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import importlib
+import itertools
+import os
+import pickle
+import pkgutil
+import time
+import types
+import uuid
+import warnings
+from datetime import date, datetime, timedelta, tzinfo
+from enum import Enum
+from functools import lru_cache, partial
+from random import getrandbits
+from weakref import WeakSet
+
+import cloudpickle
+import numpy as np
+import pandas as pd
+
+cimport cython
+from cpython cimport PyBytes_FromStringAndSize
+from libc.stdint cimport uint8_t, uint32_t, uint_fast64_t
+from libc.stdlib cimport free, malloc
+
+from .lib.cython.libcpp cimport mt19937_64
+
+try:
+    from pandas.tseries.offsets import Tick as PDTick
+except ImportError:
+    PDTick = None
+
+from .lib.mmh3 import hash as mmh_hash
+from .lib.mmh3 import hash_bytes as mmh_hash_bytes
+from .lib.mmh3 import hash_from_buffer as mmh3_hash_from_buffer
+
+
+cdef bint _has_cupy = bool(pkgutil.find_loader('cupy'))
+cdef bint _has_cudf = bool(pkgutil.find_loader('cudf'))
+cdef bint _has_sqlalchemy = bool(pkgutil.find_loader('sqlalchemy'))
+cdef bint _has_interval_array_inclusive = hasattr(
+    pd.arrays.IntervalArray, "inclusive"
+)
+
+
+cdef extern from "MurmurHash3.h":
+    void MurmurHash3_x64_128(const void * key, Py_ssize_t len, uint32_t seed, void * out)
+
+
+cdef bytes _get_mars_key(const uint8_t[:] bufferview):
+    cdef const uint8_t *data = &bufferview[0]
+    cdef uint8_t out[16]
+    MurmurHash3_x64_128(data, len(bufferview), 0, out)
+    out[0] |= 0xC0
+    return PyBytes_FromStringAndSize(<char*>out, 16)
+
+
+cpdef str to_str(s, encoding='utf-8'):
+    if type(s) is str:
+        return s
+    elif isinstance(s, bytes):
+        return (<bytes>s).decode(encoding)
+    elif isinstance(s, str):
+        return str(s)
+    elif s is None:
+        return s
+    else:
+        raise TypeError(f"Could not convert from {s} to str.")
+
+
+cpdef bytes to_binary(s, encoding='utf-8'):
+    if type(s) is bytes:
+        return s
+    elif isinstance(s, unicode):
+        return (<unicode>s).encode(encoding)
+    elif isinstance(s, bytes):
+        return bytes(s)
+    elif s is None:
+        return None
+    else:
+        raise TypeError(f"Could not convert from {s} to bytes.")
+
+
+cpdef unicode to_text(s, encoding='utf-8'):
+    if type(s) is unicode:
+        return s
+    elif isinstance(s, bytes):
+        return (<bytes>s).decode('utf-8')
+    elif isinstance(s, unicode):
+        return unicode(s)
+    elif s is None:
+        return None
+    else:
+        raise TypeError(f"Could not convert from {s} to unicode.")
+
+
+_type_dispatchers = WeakSet()
+
+
+NamedType = collections.namedtuple("NamedType", ["name", "type_"])
+
+
+cdef class TypeDispatcher:
+    def __init__(self):
+        self._handlers = dict()
+        self._lazy_handlers = dict()
+        # store inherited handlers to facilitate unregistering
+        self._inherit_handlers = dict()
+
+        _type_dispatchers.add(self)
+
+    cpdef void register(self, object type_, object handler):
+        if isinstance(type_, str):
+            self._lazy_handlers[type_] = handler
+        elif type(type_) is not NamedType and isinstance(type_, tuple):
+            for t in type_:
+                self.register(t, handler)
+        else:
+            self._handlers[type_] = handler
+
+    cpdef void unregister(self, object type_):
+        if type(type_) is not NamedType and isinstance(type_, tuple):
+            for t in type_:
+                self.unregister(t)
+        else:
+            self._lazy_handlers.pop(type_, None)
+            self._handlers.pop(type_, None)
+            self._inherit_handlers.clear()
+
+    cdef _reload_lazy_handlers(self):
+        for k, v in self._lazy_handlers.items():
+            mod_name, obj_name = k.rsplit('.', 1)
+            with warnings.catch_warnings():
+                # the lazy imported cudf will warn no device found,
+                # when we set visible device to -1 for CPU processes,
+                # ignore the warning to not distract users
+                warnings.simplefilter("ignore")
+                mod = importlib.import_module(mod_name, __name__)
+            self.register(getattr(mod, obj_name), v)
+        self._lazy_handlers = dict()
+
+    cpdef get_handler(self, object type_):
+        try:
+            return self._handlers[type_]
+        except KeyError:
+            pass
+
+        try:
+            return self._inherit_handlers[type_]
+        except KeyError:
+            self._reload_lazy_handlers()
+            if type(type_) is NamedType:
+                named_type = partial(NamedType, type_.name)
+                mro = itertools.chain(
+                    *zip(map(named_type, type_.type_.__mro__),
+                         type_.type_.__mro__)
+                )
+            else:
+                mro = type_.__mro__
+            for clz in mro:
+                # only lookup self._handlers for mro clz
+                handler = self._handlers.get(clz)
+                if handler is not None:
+                    self._inherit_handlers[type_] = handler
+                    return handler
+            raise KeyError(f'Cannot dispatch type {type_}')
+
+    def __call__(self, object obj, *args, **kwargs):
+        return self.get_handler(type(obj))(obj, *args, **kwargs)
+
+    @staticmethod
+    def reload_all_lazy_handlers():
+        for dispatcher in _type_dispatchers:
+            (<TypeDispatcher>dispatcher)._reload_lazy_handlers()
+
+
+cdef inline build_canonical_bytes(tuple args, kwargs):
+    if kwargs:
+        args = args + (kwargs,)
+    return pickle.dumps(tokenize_handler(args))
+
+
+def tokenize(*args, **kwargs):
+    return _get_mars_key(build_canonical_bytes(args, kwargs)).hex()
+
+
+def tokenize_int(*args, **kwargs):
+    return mmh_hash(build_canonical_bytes(args, kwargs))
+
+
+cdef class Tokenizer(TypeDispatcher):
+    def __call__(self, object obj, *args, **kwargs):
+        try:
+            return self.get_handler(type(obj))(obj, *args, **kwargs)
+        except KeyError:
+            if hasattr(obj, '__mars_tokenize__') and not isinstance(obj, type):
+                if len(args) == 0 and len(kwargs) == 0:
+                    return obj.__mars_tokenize__()
+                else:
+                    obj = obj.__mars_tokenize__()
+                    return self.get_handler(type(obj))(obj, *args, **kwargs)
+            if callable(obj):
+                if PDTick is not None and not isinstance(obj, PDTick):
+                    return tokenize_function(obj)
+
+            try:
+                return cloudpickle.dumps(obj)
+            except:
+                raise TypeError(f'Cannot generate token for {obj}, type: {type(obj)}') from None
+
+
+cdef inline list iterative_tokenize(object ob):
+    cdef list dq = [ob]
+    cdef int dq_pos = 0
+    cdef list h_list = []
+    while dq_pos < len(dq):
+        x = dq[dq_pos]
+        dq_pos += 1
+        if type(x) in _primitive_types:
+            h_list.append(x)
+        elif isinstance(x, (list, tuple)):
+            dq.extend(x)
+        elif isinstance(x, set):
+            dq.extend(sorted(x))
+        elif isinstance(x, dict):
+            dq.extend(sorted(x.items()))
+        else:
+            h_list.append(tokenize_handler(x))
+
+        if dq_pos >= 64 and len(dq) < dq_pos * 2:  # pragma: no cover
+            dq = dq[dq_pos:]
+            dq_pos = 0
+    return h_list
+
+
+cdef inline tuple tokenize_numpy(ob):
+    cdef int offset
+
+    if not ob.shape:
+        return str(ob), ob.dtype
+    if hasattr(ob, 'mode') and getattr(ob, 'filename', None):
+        if hasattr(ob.base, 'ctypes'):
+            offset = (ob.ctypes.get_as_parameter().value -
+                      ob.base.ctypes.get_as_parameter().value)
+        else:
+            offset = 0  # root memmaps have mmap object as base
+        return (ob.filename, os.path.getmtime(ob.filename), ob.dtype,
+                ob.shape, ob.strides, offset)
+    if ob.dtype.hasobject:
+        try:
+            data = 
mmh_hash_bytes('-'.join(ob.flat).encode('utf-8', errors='surrogatepass')) + except UnicodeDecodeError: + data = mmh_hash_bytes(b'-'.join([to_binary(x) for x in ob.flat])) + except TypeError: + try: + data = mmh_hash_bytes(pickle.dumps(ob, pickle.HIGHEST_PROTOCOL)) + except: + # nothing can do, generate uuid + data = uuid.uuid4().hex + else: + try: + data = mmh_hash_bytes(ob.ravel().view('i1').data) + except (BufferError, AttributeError, ValueError): + data = mmh_hash_bytes(ob.copy().ravel().view('i1').data) + return data, ob.dtype, ob.shape, ob.strides + + +cdef inline _extract_range_index_attr(object range_index, str attr): + try: + return getattr(range_index, attr) + except AttributeError: # pragma: no cover + return getattr(range_index, '_' + attr) + + +cdef list tokenize_pandas_index(ob): + cdef long long start + cdef long long stop + cdef long long end + if isinstance(ob, pd.RangeIndex): + start = _extract_range_index_attr(ob, 'start') + stop = _extract_range_index_attr(ob, 'stop') + step = _extract_range_index_attr(ob, 'step') + # for range index, there is no need to get the values + return iterative_tokenize([ob.name, getattr(ob, 'names', None), slice(start, stop, step)]) + else: + return iterative_tokenize([ob.name, getattr(ob, 'names', None), ob.values]) + + +cdef list tokenize_pandas_series(ob): + return iterative_tokenize([ob.name, ob.dtype, ob.values, ob.index]) + + +cdef list tokenize_pandas_dataframe(ob): + l = [block.values for block in ob._data.blocks] + l.extend([ob.columns, ob.index]) + return iterative_tokenize(l) + + +cdef list tokenize_pandas_categorical(ob): + l = ob.to_list() + l.append(ob.shape) + return iterative_tokenize(l) + + +cdef list tokenize_pd_extension_dtype(ob): + return iterative_tokenize([ob.name]) + + +cdef list tokenize_categories_dtype(ob): + return iterative_tokenize([ob.categories, ob.ordered]) + + +cdef list tokenize_interval_dtype(ob): + return iterative_tokenize([type(ob).__name__, ob.subtype]) + + +cdef list tokenize_pandas_time_arrays(ob): + return iterative_tokenize([ob.asi8, ob.dtype]) + + +cdef list tokenize_pandas_tick(ob): + return iterative_tokenize([ob.freqstr]) + + +cdef list tokenize_pandas_interval_arrays(ob): # pragma: no cover + if _has_interval_array_inclusive: + return iterative_tokenize([ob.left, ob.right, ob.inclusive]) + else: + return iterative_tokenize([ob.left, ob.right, ob.closed]) + + +cdef list tokenize_sqlalchemy_data_type(ob): + return iterative_tokenize([repr(ob)]) + + +cdef list tokenize_sqlalchemy_selectable(ob): + return iterative_tokenize([str(ob)]) + + +cdef list tokenize_enum(ob): + cls = type(ob) + return iterative_tokenize([id(cls), cls.__name__, ob.name]) + + +@lru_cache(500) +def tokenize_function(ob): + if isinstance(ob, partial): + args = iterative_tokenize(ob.args) + keywords = iterative_tokenize(ob.keywords.items()) if ob.keywords else None + return tokenize_function(ob.func), args, keywords + else: + try: + if isinstance(ob, types.FunctionType): + return iterative_tokenize([pickle.dumps(ob, protocol=0), id(ob)]) + else: + return pickle.dumps(ob, protocol=0) + except: + pass + try: + return cloudpickle.dumps(ob, protocol=0) + except: + return str(ob) + + +@lru_cache(500) +def tokenize_pickled_with_cache(ob): + return pickle.dumps(ob) + + +def tokenize_cupy(ob): + from .serialization import serialize + header, _buffers = serialize(ob) + return iterative_tokenize([header, ob.data.ptr]) + + +def tokenize_cudf(ob): + from .serialization import serialize + header, buffers = serialize(ob) + return 
iterative_tokenize([header] + [(buf.ptr, buf.size) for buf in buffers])
+
+
+cdef Tokenizer tokenize_handler = Tokenizer()
+
+cdef set _primitive_types = {
+    int, float, str, unicode, bytes, complex, type(None), type, slice, date, datetime, timedelta
+}
+for t in _primitive_types:
+    tokenize_handler.register(t, lambda ob: ob)
+
+for t in (np.dtype, np.generic):
+    tokenize_handler.register(t, lambda ob: ob)
+
+for t in (list, tuple, dict, set):
+    tokenize_handler.register(t, iterative_tokenize)
+
+tokenize_handler.register(np.ndarray, tokenize_numpy)
+tokenize_handler.register(np.random.RandomState, lambda ob: iterative_tokenize(ob.get_state()))
+tokenize_handler.register(memoryview, lambda ob: mmh3_hash_from_buffer(ob))
+tokenize_handler.register(Enum, tokenize_enum)
+tokenize_handler.register(pd.Index, tokenize_pandas_index)
+tokenize_handler.register(pd.Series, tokenize_pandas_series)
+tokenize_handler.register(pd.DataFrame, tokenize_pandas_dataframe)
+tokenize_handler.register(pd.Categorical, tokenize_pandas_categorical)
+tokenize_handler.register(pd.CategoricalDtype, tokenize_categories_dtype)
+tokenize_handler.register(pd.IntervalDtype, tokenize_interval_dtype)
+tokenize_handler.register(tzinfo, tokenize_pickled_with_cache)
+tokenize_handler.register(pd.arrays.DatetimeArray, tokenize_pandas_time_arrays)
+tokenize_handler.register(pd.arrays.TimedeltaArray, tokenize_pandas_time_arrays)
+tokenize_handler.register(pd.arrays.PeriodArray, tokenize_pandas_time_arrays)
+tokenize_handler.register(pd.arrays.IntervalArray, tokenize_pandas_interval_arrays)
+tokenize_handler.register(pd.api.extensions.ExtensionDtype, tokenize_pd_extension_dtype)
+if _has_cupy:
+    tokenize_handler.register('cupy.ndarray', tokenize_cupy)
+if _has_cudf:
+    tokenize_handler.register('cudf.DataFrame', tokenize_cudf)
+    tokenize_handler.register('cudf.Series', tokenize_cudf)
+    tokenize_handler.register('cudf.Index', tokenize_cudf)
+
+if PDTick is not None:
+    tokenize_handler.register(PDTick, tokenize_pandas_tick)
+if _has_sqlalchemy:
+    tokenize_handler.register(
+        "sqlalchemy.sql.sqltypes.TypeEngine", tokenize_sqlalchemy_data_type
+    )
+    tokenize_handler.register(
+        "sqlalchemy.sql.Selectable", tokenize_sqlalchemy_selectable
+    )
+
+cpdef register_tokenizer(cls, handler):
+    tokenize_handler.register(cls, handler)
+
+
+@cython.nonecheck(False)
+@cython.cdivision(True)
+cpdef long long ceildiv(long long x, long long y) nogil:
+    return x // y + (x % y != 0)
+
+
+cdef class Timer:
+    cdef object _start
+    cdef readonly object duration
+
+    def __enter__(self):
+        self._start = time.time()
+        return self
+
+    def __exit__(self, *_):
+        self.duration = time.time() - self._start
+
+
+cdef mt19937_64 _rnd_gen
+cdef bint _rnd_is_seed_set = False
+
+
+cpdef void reset_id_random_seed() except *:
+    cdef bytes seed_bytes
+    global _rnd_is_seed_set
+
+    seed_bytes = getrandbits(64).to_bytes(8, "little")
+    _rnd_gen.seed((<uint_fast64_t *><char *>seed_bytes)[0])
+    _rnd_is_seed_set = True
+
+
+cpdef bytes new_random_id(int byte_len):
+    cdef uint_fast64_t *res_ptr
+    cdef uint_fast64_t res_data[4]
+    cdef int i, qw_num = byte_len >> 3
+    cdef bytes res
+
+    if not _rnd_is_seed_set:
+        reset_id_random_seed()
+
+    if (qw_num << 3) < byte_len:
+        qw_num += 1
+
+    if qw_num <= 4:
+        # use stack memory to accelerate
+        res_ptr = res_data
+    else:
+        res_ptr = <uint_fast64_t *>malloc(qw_num << 3)
+
+    try:
+        for i in range(qw_num):
+            res_ptr[i] = _rnd_gen()
+        return <bytes>((<char *>&(res_ptr[0]))[:byte_len])
+    finally:
+        # free memory if allocated by malloc
+        if res_ptr != res_data:
+            free(res_ptr)
+
+
+__all__ = ['to_str', 
'to_binary', 'to_text', 'TypeDispatcher', 'tokenize', 'tokenize_int', + 'register_tokenizer', 'ceildiv', 'Timer', 'reset_id_random_seed', 'new_random_id'] diff --git a/python/xorbits/_mars/_version.py b/python/xorbits/_mars/_version.py new file mode 100644 index 000000000..742480dba --- /dev/null +++ b/python/xorbits/_mars/_version.py @@ -0,0 +1,692 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.23 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys +from typing import Callable, Dict +import functools + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "pymars-" + cfg.versionfile_source = "mars/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + + popen_kwargs = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if 
process.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + f"{tag_prefix}[[:digit:]]*", + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. 
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver): + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces): + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + if pieces["distance"]: + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for _ in cfg.versionfile_source.split("/"): + root = os.path.dirname(root) + except NameError: + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/python/xorbits/_mars/config.py b/python/xorbits/_mars/config.py new file mode 100644 index 000000000..f20513450 --- /dev/null +++ b/python/xorbits/_mars/config.py @@ -0,0 +1,443 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import functools +import operator +import os +import threading +import warnings +from copy import deepcopy +from typing import Dict, Union + +_DEFAULT_REDIRECT_WARN = ( + "Option {source} has been replaced by {target} and " + "might be removed in a future release." +) + + +class OptionError(Exception): + pass + + +class Redirection: + def __init__(self, item, warn=None): + self._items = item.split(".") + self._warn = warn + self._warned = True + self._parent = None + + def bind(self, attr_dict): + self._parent = attr_dict + self.getvalue() + self._warned = False + + def getvalue(self): + if self._warn and not self._warned: + self._warned = True + warnings.warn(self._warn) + conf = self._parent.root + for it in self._items: + conf = getattr(conf, it) + return conf + + def setvalue(self, value): + if self._warn and not self._warned: + self._warned = True + warnings.warn(self._warn) + conf = self._parent.root + for it in self._items[:-1]: + conf = getattr(conf, it) + setattr(conf, self._items[-1], value) + + +class AttributeDict(dict): + def __init__(self, *args, **kwargs): + self._inited = False + self._parent = kwargs.pop("_parent", None) + self._root = None + super().__init__(*args, **kwargs) + self._inited = True + + @property + def root(self): + if self._root is not None: + return self._root + if self._parent is None: + self._root = self + else: + self._root = self._parent.root + return self._root + + def __getattr__(self, item): + if item in self: + val = self[item] + if isinstance(val, AttributeDict): + return val + elif isinstance(val[0], Redirection): + return val[0].getvalue() + else: + return val[0] + return object.__getattribute__(self, item) + + def __dir__(self): + return list(self.keys()) + + def register(self, key, value, validator=None): + if isinstance(validator, tuple): + validator = any_validator(*validator) + self[key] = value, validator + if isinstance(value, Redirection): + value.bind(self) + + def unregister(self, key): + del self[key] + + def _setattr(self, key, value, silent=False): + splits = key.split(".") + target = self + 
for k in splits[:-1]: + if not silent and ( + not isinstance(target, AttributeDict) or k not in target + ): + raise OptionError("You can only set the value of existing options") + target = target[k] + key = splits[-1] + + if not isinstance(value, AttributeDict): + validate = None + if key in target: + val = target[key] + validate = target[key][1] + if validate is not None: + if not validate(value): + raise ValueError(f"Cannot set value `{value}`") + if isinstance(val[0], Redirection): + val[0].setvalue(value) + else: + target[key] = value, validate + else: + target[key] = value, validate + else: + target[key] = value + + def __setattr__(self, key, value): + if key == "_inited": + super().__setattr__(key, value) + return + try: + object.__getattribute__(self, key) + super().__setattr__(key, value) + return + except AttributeError: + pass + + if not self._inited: + super().__setattr__(key, value) + else: + self._setattr(key, value) + + def to_dict(self): + d = dict() + for k, v in self.items(): + if isinstance(v, AttributeDict): + d.update( + {k + "." + sub_k: sub_v for sub_k, sub_v in v.to_dict().items()} + ) + elif isinstance(v[0], Redirection): + continue + else: + d[k] = v[0] + return d + + +class Config: + def __init__(self, config=None): + self._config = config or AttributeDict() + self._serialize_options = [] + + def __dir__(self): + return list(self._config.keys()) + + def __getattr__(self, item): + config = object.__getattribute__(self, "_config") + return getattr(config, item) + + def __setattr__(self, key, value): + if key.startswith("_"): + object.__setattr__(self, key, value) + return + setattr(self._config, key, value) + + def register_option(self, option, value, validator=None, serialize=False): + splits = option.split(".") + conf = self._config + if isinstance(validator, tuple): + validator = any_validator(*validator) + + for name in splits[:-1]: + config = conf.get(name) + if config is None: + val = AttributeDict(_parent=conf) + conf[name] = val + conf = val + elif not isinstance(config, dict): + raise AttributeError( + f"Fail to set option: {option}, conflict has encountered" + ) + else: + conf = config + + key = splits[-1] + if conf.get(key) is not None: + raise AttributeError(f"Fail to set option: {option}, option has been set") + + conf.register(key, value, validator) + if serialize: + self._serialize_options.append(option) + + def redirect_option(self, option, target, warn=_DEFAULT_REDIRECT_WARN): + redir = Redirection(target, warn=warn.format(source=option, target=target)) + self.register_option(option, redir) + + def unregister_option(self, option): + splits = option.split(".") + conf = self._config + for name in splits[:-1]: + config = conf.get(name) + if not isinstance(config, dict): + raise AttributeError( + f"Fail to unregister option: {option}, conflict has encountered" + ) + else: + conf = config + + key = splits[-1] + if key not in conf: + raise AttributeError( + f"Option {option} not configured, thus failed to unregister." 
+ ) + conf.unregister(key) + + def copy(self): + new_options = Config(deepcopy(self._config)) + return new_options + + def update(self, new_config: Union["Config", Dict]): + if not isinstance(new_config, dict): + new_config = new_config._config + for option, value in new_config.items(): + try: + self.register_option(option, value) + except AttributeError: + setattr(self, option, value) + + def get_serializable(self): + d = dict() + for k in self._serialize_options: + parts = k.split(".") + v = self + for p in parts: + v = getattr(v, p) + d[k] = v + return d + + def fill_serialized(self, d): + for k, v in d.items(): + parts = k.split(".") + cf = self + for p in parts[:-1]: + cf = getattr(cf, p) + setattr(cf, parts[-1], v) + + def to_dict(self): + return self._config.to_dict() + + +@contextlib.contextmanager +def option_context(config=None): + global_options = get_global_option() + + try: + config = config or dict() + local_options = Config(deepcopy(global_options._config)) + local_options.update(config) + _options_local.default_options = local_options + yield local_options + finally: + _options_local.default_options = global_options + + +def is_interactive(): + import __main__ as main + + return not hasattr(main, "__file__") + + +# validators +def any_validator(*validators): + def validate(x): + return any(validator(x) for validator in validators) + + return validate + + +def all_validator(*validators): + def validate(x): + return all(validator(x) for validator in validators) + + return validate + + +def _instance_check(typ, v): + return isinstance(v, typ) + + +is_null = functools.partial(operator.is_, None) +is_bool = functools.partial(_instance_check, bool) +is_integer = functools.partial(_instance_check, int) +is_float = functools.partial(_instance_check, float) +is_numeric = functools.partial(_instance_check, (float, int)) +is_string = functools.partial(_instance_check, str) +is_dict = functools.partial(_instance_check, dict) +is_list = functools.partial(_instance_check, list) + + +def is_in(vals): + def validate(x): + return x in vals + + return validate + + +default_options = Config() +default_options.register_option("tcp_timeout", 30, validator=is_integer) +default_options.register_option("verbose", False, validator=is_bool) +default_options.register_option("kv_store", ":inproc:", validator=is_string) +default_options.register_option("check_interval", 20, validator=is_integer) +default_options.register_option( + "show_progress", "auto", validator=any_validator(is_bool, is_string) +) +default_options.register_option("serialize_method", "pickle") + +# dataframe-related options +default_options.register_option( + "dataframe.mode.use_inf_as_na", False, validator=is_bool +) +default_options.register_option( + "dataframe.use_arrow_dtype", None, validator=any_validator(is_null, is_bool) +) +default_options.register_option( + "dataframe.arrow_array.pandas_only", None, validator=any_validator(is_null, is_bool) +) + +# learn options +assume_finite = os.environ.get("SKLEARN_ASSUME_FINITE") +if assume_finite is not None: + assume_finite = bool(assume_finite) +working_memory = os.environ.get("SKLEARN_WORKING_MEMORY") +if working_memory is not None: + working_memory = int(working_memory) +default_options.register_option( + "learn.assume_finite", assume_finite, validator=any_validator(is_null, is_bool) +) +default_options.register_option( + "learn.working_memory", working_memory, validator=any_validator(is_null, is_integer) +) + +# the number of combined chunks in tree reduction or tree add 
+default_options.register_option("combine_size", 4, validator=is_integer, serialize=True) + +# the default chunk store size +default_options.register_option( + "chunk_store_limit", 128 * 1024**2, validator=is_numeric +) +default_options.register_option( + "chunk_size", None, validator=any_validator(is_null, is_integer), serialize=True +) + +# rechunk +default_options.register_option( + "rechunk.threshold", 4, validator=is_integer, serialize=True +) +default_options.register_option( + "rechunk.chunk_size_limit", int(1e8), validator=is_integer, serialize=True +) + +default_options.register_option( + "bincount.chunk_size_limit", int(1e8), validator=is_integer, serialize=True +) + +# deploy +default_options.register_option("deploy.open_browser", True, validator=is_bool) + +# optimization +default_options.register_option("optimize_tileable_graph", True, validator=is_bool) + +# eager mode +default_options.register_option("eager_mode", False, validator=is_bool) + +# optimization +default_options.register_option( + "optimize.head_optimize_threshold", 1000, validator=is_integer +) + +# debug +default_options.register_option("warn_duplicated_execution", False, validator=is_bool) + +# client serialize type +default_options.register_option("client.serial_type", "arrow", validator=is_string) + +# custom log dir +default_options.register_option( + "custom_log_dir", None, validator=any_validator(is_null, is_string) +) + +# vineyard +default_options.register_option( + "vineyard.socket", os.environ.get("VINEYARD_IPC_SOCKET", None) +) +default_options.register_option( + "vineyard.enabled", os.environ.get("WITH_VINEYARD", None) is not None +) + +_options_local = threading.local() +_options_local.default_options = default_options + + +def get_global_option(): + ret = getattr(_options_local, "default_options", None) + if ret is None: + ret = _options_local.default_options = Config(deepcopy(default_options._config)) + + return ret + + +class OptionsProxy: + def __dir__(self): + return dir(get_global_option()) + + def __getattribute__(self, attr): + return getattr(get_global_option(), attr) + + def __setattr__(self, key, value): + setattr(get_global_option(), key, value) + + +options = OptionsProxy() + +options.redirect_option("tensor.chunk_store_limit", "chunk_store_limit") +options.redirect_option("tensor.chunk_size", "chunk_size") +options.redirect_option("tensor.rechunk.threshold", "rechunk.threshold") +options.redirect_option("tensor.rechunk.chunk_size_limit", "rechunk.chunk_size_limit") diff --git a/python/xorbits/_mars/config.yml b/python/xorbits/_mars/config.yml new file mode 100644 index 000000000..9f808f479 --- /dev/null +++ b/python/xorbits/_mars/config.yml @@ -0,0 +1 @@ +"@inherits": "@mars/deploy/oscar/config.yml" diff --git a/python/xorbits/_mars/conftest.py b/python/xorbits/_mars/conftest.py new file mode 100644 index 000000000..767010588 --- /dev/null +++ b/python/xorbits/_mars/conftest.py @@ -0,0 +1,290 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import concurrent.futures +import os +import subprocess +import time + +import psutil +import pytest +from mars.config import option_context +from mars.core.mode import is_build_mode, is_kernel_mode +from mars.lib.aio.lru import clear_all_alru_caches +from mars.oscar.backends.ray.communication import RayServer +from mars.oscar.backends.router import Router +from mars.utils import lazy_import + +ray = lazy_import("ray") +MARS_CI_BACKEND = os.environ.get("MARS_CI_BACKEND", "mars") + + +@pytest.fixture(autouse=True) +def auto_cleanup(request): + request.addfinalizer(clear_all_alru_caches) + + +@pytest.fixture(scope="module", autouse=True) +def check_router_cleaned(request): + def route_checker(): + if Router.get_instance() is not None: + assert len(Router.get_instance()._mapping) == 0 + assert len(Router.get_instance()._local_mapping) == 0 + + request.addfinalizer(route_checker) + + +@pytest.fixture(scope="module") +def ray_start_regular_shared(request): # pragma: no cover + yield from _ray_start_regular(request) + + +@pytest.fixture(scope="module") +def ray_start_regular_shared2(request): # pragma: no cover + os.environ["RAY_kill_idle_workers_interval_ms"] = "0" + param = getattr(request, "param", {}) + num_cpus = param.get("num_cpus", 64) + total_memory_mb = num_cpus * 2 * 1024**2 + try: + try: + job_config = ray.job_config.JobConfig(total_memory_mb=total_memory_mb) + except TypeError: + job_config = None + yield ray.init(num_cpus=num_cpus, job_config=job_config) + finally: + ray.shutdown() + Router.set_instance(None) + os.environ.pop("RAY_kill_idle_workers_interval_ms", None) + + +@pytest.fixture +def ray_start_regular(request): # pragma: no cover + yield from _ray_start_regular(request) + + +def _ray_start_regular(request): # pragma: no cover + param = getattr(request, "param", {}) + if not param.get("enable", True): + yield + elif ray and ray.is_initialized(): + yield + else: + num_cpus = param.get("num_cpus", 64) + total_memory_mb = num_cpus * 2 * 1024**2 + try: + try: + job_config = ray.job_config.JobConfig(total_memory_mb=total_memory_mb) + except TypeError: + job_config = None + yield ray.init(num_cpus=num_cpus, job_config=job_config) + finally: + ray.shutdown() + Router.set_instance(None) + RayServer.clear() + if "COV_CORE_SOURCE" in os.environ: + # Remove this when https://github.com/ray-project/ray/issues/16802 got fixed + subprocess.check_call(["ray", "stop", "--force"]) + + +@pytest.fixture(scope="module") +def ray_large_cluster_shared(request): # pragma: no cover + yield from _ray_large_cluster(request) + + +@pytest.fixture +def ray_large_cluster(request): # pragma: no cover + yield from _ray_large_cluster(request) + + +def _ray_large_cluster(request): # pragma: no cover + param = getattr(request, "param", {}) + num_nodes = param.get("num_nodes", 3) + num_cpus = param.get("num_cpus", 16) + from ray.cluster_utils import Cluster + + cluster = Cluster() + remote_nodes = [] + for i in range(num_nodes): + remote_nodes.append( + cluster.add_node(num_cpus=num_cpus, memory=num_cpus * 2 * 1024**3) + ) + if len(remote_nodes) == 1: + try: + job_config = ray.job_config.JobConfig( + total_memory_mb=num_nodes * 32 * 1024**3 + ) + except TypeError: + job_config = None + ray.init(address=cluster.address, job_config=job_config) + try: + yield cluster + finally: + Router.set_instance(None) + RayServer.clear() + ray.shutdown() + cluster.shutdown() + if "COV_CORE_SOURCE" in os.environ: + # Remove this when https://github.com/ray-project/ray/issues/16802 got fixed + subprocess.check_call(["ray", 
"stop", "--force"]) + + +@pytest.fixture +def stop_ray(request): # pragma: no cover + yield + if ray.is_initialized(): + ray.shutdown() + Router.set_instance(None) + + +@pytest.fixture +async def ray_create_mars_cluster(request, check_router_cleaned): + from mars.deploy.oscar.ray import _load_config, new_cluster + + ray_config = _load_config() + param = getattr(request, "param", {}) + supervisor_mem = param.get("supervisor_mem", 1 * 1024**3) + worker_num = param.get("worker_num", 2) + worker_cpu = param.get("worker_cpu", 2) + worker_mem = param.get("worker_mem", 256 * 1024**2) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=supervisor_mem, + worker_num=worker_num, + worker_cpu=worker_cpu, + worker_mem=worker_mem, + config=ray_config, + ) + try: + async with client: + yield client + finally: + Router.set_instance(None) + + +@pytest.fixture +def stop_mars(): + try: + yield + finally: + import mars + + mars.stop_server() + + +@pytest.fixture(scope="module") +def _new_test_session(check_router_cleaned): + from .deploy.oscar.tests.session import new_test_session + + sess = new_test_session( + address="test://127.0.0.1", + backend=MARS_CI_BACKEND, + init_local=True, + default=True, + timeout=300, + ) + with option_context({"show_progress": False}): + try: + yield sess + finally: + sess.stop_server(isolation=False) + Router.set_instance(None) + + +@pytest.fixture(scope="module") +def _new_integrated_test_session(check_router_cleaned): + from .deploy.oscar.tests.session import new_test_session + + sess = None + for i in range(3): + try: + sess = new_test_session( + address="127.0.0.1", + backend=MARS_CI_BACKEND, + init_local=True, + n_worker=2, + default=True, + timeout=300, + ) + except ChildProcessError: + time.sleep(1) + if i == 2: + raise + else: + continue + else: + break + with option_context({"show_progress": False}): + try: + yield sess + finally: + try: + sess.stop_server(isolation=False) + except concurrent.futures.TimeoutError: + Router.set_instance(None) + subprocesses = psutil.Process().children(recursive=True) + for proc in subprocesses: + proc.terminate() + for proc in subprocesses: + try: + proc.wait(1) + except (psutil.TimeoutExpired, psutil.NoSuchProcess): + pass + try: + proc.kill() + except psutil.NoSuchProcess: + pass + + +@pytest.fixture(scope="module") +def _new_gpu_test_session(check_router_cleaned): # pragma: no cover + from .deploy.oscar.tests.session import new_test_session + from .resource import cuda_count + + cuda_devices = list(range(min(cuda_count(), 2))) + + sess = new_test_session( + address="127.0.0.1", + backend=MARS_CI_BACKEND, + init_local=True, + n_worker=1, + n_cpu=1, + cuda_devices=cuda_devices, + default=True, + timeout=300, + ) + with option_context({"show_progress": False}): + try: + yield sess + finally: + sess.stop_server(isolation=False) + Router.set_instance(None) + + +@pytest.fixture +def setup(_new_test_session): + _new_test_session.as_default() + yield _new_test_session + assert not (is_build_mode() or is_kernel_mode()) + + +@pytest.fixture +def setup_cluster(_new_integrated_test_session): + _new_integrated_test_session.as_default() + yield _new_integrated_test_session + + +@pytest.fixture +def setup_gpu(_new_gpu_test_session): # pragma: no cover + _new_gpu_test_session.as_default() + yield _new_test_session diff --git a/python/xorbits/_mars/constants.py b/python/xorbits/_mars/constants.py new file mode 100644 index 000000000..b4b56cab4 --- /dev/null +++ b/python/xorbits/_mars/constants.py @@ -0,0 +1,20 
@@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Env key for Mars log absolute path +MARS_LOG_PATH_KEY = "MARS_LOG_PATH" +# Mars log file name prefix +MARS_LOG_PREFIX = "mars_" +# The prefix of the temporary directory where the Mars log is located +MARS_TMP_DIR_PREFIX = "mars_tmp" diff --git a/python/xorbits/_mars/contrib/__init__.py b/python/xorbits/_mars/contrib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/contrib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/contrib/dask/__init__.py b/python/xorbits/_mars/contrib/dask/__init__.py new file mode 100644 index 000000000..7a4ea89b8 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# noinspection PyUnresolvedReferences + +from ...utils import ModulePlaceholder + +try: + import dask +except ImportError: + convert_dask_collection = mars_scheduler = ModulePlaceholder("dask") +else: + from .converter import convert_dask_collection + from .scheduler import mars_scheduler diff --git a/python/xorbits/_mars/contrib/dask/converter.py b/python/xorbits/_mars/contrib/dask/converter.py new file mode 100644 index 000000000..e41f5e33f --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/converter.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dask import is_dask_collection, optimize
+from dask.bag import Bag
+
+from ...remote import spawn
+from .scheduler import mars_dask_get
+from .utils import reduce
+
+
+def convert_dask_collection(dc):
+    """
+    Convert a dask collection object into mars.core.Object via the remote API
+
+    Parameters
+    ----------
+    dc: dask collection
+        Dask collection object to be converted.
+
+    Returns
+    -------
+    Object
+        Mars Object.
+    """
+    if not is_dask_collection(dc):
+        raise TypeError(f"'{type(dc).__name__}' object is not a valid dask collection")
+
+    dc.__dask_graph__().validate()
+    dsk = optimize(dc)[0].__dask_graph__()
+
+    first_key = next(iter(dsk.keys()))
+    if isinstance(first_key, str):
+        key = [first_key]
+    elif isinstance(first_key, tuple):
+        key = sorted(
+            [i for i in dsk.keys() if i[0] == first_key[0]], key=lambda x: x[1]
+        )
+    else:
+        raise ValueError(
+            f"Dask collection object seems to be broken, with unexpected key type: '{type(first_key).__name__}'"
+        )
+    res = reduce(mars_dask_get(dsk, [key]))
+    if isinstance(dc, Bag):
+        return spawn(lambda x: list(x[0][0]), args=(res,))
+    else:
+        return res
diff --git a/python/xorbits/_mars/contrib/dask/scheduler.py b/python/xorbits/_mars/contrib/dask/scheduler.py
new file mode 100644
index 000000000..bdfe17f47
--- /dev/null
+++ b/python/xorbits/_mars/contrib/dask/scheduler.py
@@ -0,0 +1,102 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Tuple, Union
+
+from dask.core import ishashable, istask
+
+from ...deploy.oscar.session import execute
+from ...remote import spawn
+from .utils import reduce
+
+
+def mars_scheduler(dsk: dict, keys: Union[List[List[str]], List[str]]):
+    """
+    A Dask-Mars scheduler
+
+    This scheduler is intended to be compatible with the existing
+    Dask user interface; no callbacks are implemented.
+
+    Parameters
+    ----------
+    dsk: Dict
+        Dask graph, represented as a task DAG dictionary.
+    keys: Union[List[List[str]], List[str]]
+        1d or 2d list of Dask graph keys whose values we wish to compute and return.
+
+    Returns
+    -------
+    Object
+        Computed values corresponding to the provided keys with the same dimension.
+    """
+
+    if isinstance(keys, List) and not isinstance(keys[0], List):  # 1d keys
+        task = execute(mars_dask_get(dsk, keys))
+        if not isinstance(task, List):
+            task = [task]
+        return map(lambda x: x.fetch(), task)
+    else:  # 2d keys
+        res = execute(reduce(mars_dask_get(dsk, keys))).fetch()
+        if not isinstance(res, List):
+            return [[res]]
+        else:
+            return res
+
+
+def mars_dask_get(dsk: dict, keys: Union[List[List[str]], List[str]]):
+    """
+    A Dask-Mars conversion function. This function sends the dask graph layers
+    to the Mars Remote API, generating Mars objects corresponding to the provided keys.
+
+    Parameters
+    ----------
+    dsk: Dict
+        Dask graph, represented as a task DAG dictionary.
+ keys: Union[List[List[str]], List[str]] + 1d or 2d list of Dask graph keys whose values we wish to compute and return. + + Returns + ------- + Object + Spawned mars objects corresponding to the provided keys with same dimension. + """ + + def _get_arg(a): + # if arg contains layer index or callable objs, handle it + if ishashable(a) and a in dsk.keys(): + while ishashable(a) and a in dsk.keys(): + a = dsk[a] + return _spawn_task(a) + elif not isinstance(a, str) and hasattr(a, "__getitem__"): + if istask( + a + ): # TODO:Handle `SubgraphCallable`, which may contains dsk in it + return spawn(a[0], args=tuple(_get_arg(i) for i in a[1:])) + elif isinstance(a, dict): + return {k: _get_arg(v) for k, v in a.items()} + elif isinstance(a, List) or isinstance(a, Tuple): + return type(a)(_get_arg(i) for i in a) + return a + + def _spawn_task(task: tuple): + if not istask(task): + return _get_arg(task) + return spawn(task[0], args=tuple(_get_arg(a) for a in task[1:])) + + return [ + [_spawn_task(dsk[k]) for k in keys_d] + if isinstance(keys_d, List) + else _spawn_task(dsk[keys_d]) + for keys_d in keys + ] diff --git a/python/xorbits/_mars/contrib/dask/tests/__init__.py b/python/xorbits/_mars/contrib/dask/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/contrib/dask/tests/test_dask.py b/python/xorbits/_mars/contrib/dask/tests/test_dask.py new file mode 100644 index 000000000..967807410 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/tests/test_dask.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ....utils import lazy_import +from .. 
import convert_dask_collection, mars_scheduler + +dask_installed = lazy_import("dask") is not None +mimesis_installed = lazy_import("mimesis") is not None + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_delayed(setup_cluster): + import numpy as np + from dask import delayed + + def calc_chunk(n: int, i: int): + rs = np.random.RandomState(i) + a = rs.uniform(-1, 1, size=(n, 2)) + d = np.linalg.norm(a, axis=1) + return (d < 1).sum() + + def calc_pi(fs, N): + return sum(fs) * 4 / N + + N = 200_000_000 + n = 10_000_000 + + fs = [delayed(calc_chunk)(n, i) for i in range(N // n)] + pi = delayed(calc_pi)(fs, N) + + dask_res = pi.compute() + assert dask_res == pi.compute(scheduler=mars_scheduler) + assert dask_res == convert_dask_collection(pi).execute().fetch() + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_partitioned_dataframe(setup_cluster): + import numpy as np + import pandas as pd + from dask import dataframe as dd + from pandas._testing import assert_frame_equal + + data = np.random.randn(10000, 100) + df = dd.from_pandas( + pd.DataFrame(data, columns=[f"col{i}" for i in range(100)]), npartitions=4 + ) + df["col0"] = df["col0"] + df["col1"] / 2 + col2_mean = df["col2"].mean() + df = df[df["col2"] > col2_mean] + + dask_res = df.compute() + assert_frame_equal( + dask_res, df.compute(scheduler=mars_scheduler), check_index_type=False + ) + assert_frame_equal( + dask_res, convert_dask_collection(df).execute().fetch(), check_index_type=False + ) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_unpartitioned_dataframe(setup_cluster): + import pandas as pd + from dask import dataframe as dd + from pandas._testing import assert_frame_equal + from sklearn.datasets import load_iris + + boston = load_iris() + pd.DataFrame(boston.data, columns=boston["feature_names"]).to_csv( + "./boston_housing_data.csv" + ) + + df = dd.read_csv(r"./boston_housing_data.csv") + df["sepal length (cm)"] = df["sepal length (cm)"] / 2 + + dask_res = df.compute() + assert_frame_equal(dask_res, df.compute(scheduler=mars_scheduler)) + assert_frame_equal(dask_res, convert_dask_collection(df).execute().fetch()) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_array(setup_cluster): + import dask.array as da + from numpy.core.numeric import array_equal + + x = da.random.random((10000, 10000), chunks=(1000, 1000)) + y = x + x.T + z = y[::2, 5000:].mean(axis=1) + + dask_res = z.compute() + assert array_equal(dask_res, z.compute(scheduler=mars_scheduler)) + assert array_equal(dask_res, convert_dask_collection(z).execute().fetch()) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +@pytest.mark.skipif(not mimesis_installed, reason="mimesis not installed") +def test_bag(setup_cluster): + import dask + + b = dask.datasets.make_people() # Make records of people + result = ( + b.filter(lambda record: record["age"] > 30) + .map(lambda record: record["occupation"]) + .frequencies(sort=True) + .topk(10, key=1) + ) + + dask_res = result.compute() + assert dask_res == result.compute(scheduler=mars_scheduler) + assert dask_res == convert_dask_collection(result).execute().fetch() + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_dask_errors(): + with pytest.raises(TypeError): + convert_dask_collection({"foo": 0, "bar": 1}) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_multiple_objects(setup_cluster): + import 
dask + + def inc(x: int): + return x + 1 + + test_list = [dask.delayed(inc)(i) for i in range(10)] + test_tuple = tuple(dask.delayed(inc)(i) for i in range(10)) + test_dict = {str(i): dask.delayed(inc)(i) for i in range(10)} + + for test_obj in (test_list, test_tuple, test_dict): + assert dask.compute(test_obj) == dask.compute( + test_obj, scheduler=mars_scheduler + ) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_persist(setup_cluster): + import dask + + def inc(x): + return x + 1 + + a = dask.delayed(inc)(1) + task_mars_persist = dask.delayed(inc)(a.persist(scheduler=mars_scheduler)) + task_dask_persist = dask.delayed(inc)(a.persist()) + + assert task_dask_persist.compute() == task_mars_persist.compute( + scheduler=mars_scheduler + ) + + +@pytest.mark.skipif(not dask_installed, reason="dask not installed") +def test_partitioned_dataframe_persist(setup_cluster): + import numpy as np + import pandas as pd + from dask import dataframe as dd + from pandas._testing import assert_frame_equal + + data = np.random.randn(10000, 100) + df = dd.from_pandas( + pd.DataFrame(data, columns=[f"col{i}" for i in range(100)]), npartitions=4 + ) + df["col0"] = df["col0"] + df["col1"] / 2 + col2_mean = df["col2"].mean() + + df_mars_persist = df[df["col2"] > col2_mean.persist(scheduler=mars_scheduler)] + df_dask_persist = df[df["col2"] > col2_mean.persist()] + + assert_frame_equal( + df_dask_persist.compute(), df_mars_persist.compute(scheduler=mars_scheduler) + ) diff --git a/python/xorbits/_mars/contrib/dask/utils.py b/python/xorbits/_mars/contrib/dask/utils.py new file mode 100644 index 000000000..53f73cb28 --- /dev/null +++ b/python/xorbits/_mars/contrib/dask/utils.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from dask import is_dask_collection +from dask.array.core import _concatenate2 as array_concat +from dask.dataframe import concat as df_concat +from dask.utils import is_arraylike, is_dataframe_like, is_index_like, is_series_like + +from ...remote import spawn + + +def concat(objs: List): + """ + Concat the results of partitioned dask task executions. This function guess the + types of resulting list, then calls the corresponding native dask concat functions. + + Parameters + ---------- + objs: List + List of the partitioned dask task execution results, which will be concat. + + Returns + ------- + obj: + The concat result + + """ + if is_arraylike(objs[0]): + res = array_concat(objs, axes=[0]) # TODO: Add concat with args support + elif any( + (is_dataframe_like(objs[0]), is_series_like(objs[0]), is_index_like(objs[0])) + ): + res = df_concat(objs) + else: + res = objs + return res.compute() if is_dask_collection(res) else res + + +def reduce(objs: List[List]): + """ + Spawn a concat task for 2d-list objects + + Parameters + ---------- + objs: List + 2d-list of the partitioned dask task execution results, which will be concat. 
+ + Returns + ------- + obj: + The spawning concat task. + """ + return spawn(concat, args=([spawn(concat, args=(objs_d,)) for objs_d in objs],)) diff --git a/python/xorbits/_mars/core/__init__.py b/python/xorbits/_mars/core/__init__.py new file mode 100644 index 000000000..b26fc8950 --- /dev/null +++ b/python/xorbits/_mars/core/__init__.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# noinspection PyUnresolvedReferences +from ..typing import ChunkType, EntityType, OperandType, TileableType +from .base import ExecutionError +from .entity import ( + CHUNK_TYPE, + ENTITY_TYPE, + FUSE_CHUNK_TYPE, + OBJECT_CHUNK_TYPE, + OBJECT_TYPE, + TILEABLE_TYPE, + Chunk, + ChunkData, + Entity, + EntityData, + ExecutableTuple, + FuseChunk, + FuseChunkData, + HasShapeTileable, + HasShapeTileableData, + NotSupportTile, + Object, + ObjectChunk, + ObjectChunkData, + ObjectData, + OutputType, + Tileable, + TileableData, + _ExecuteAndFetchMixin, + get_chunk_types, + get_fetch_class, + get_output_types, + get_tileable_types, + recursive_tile, + register, + register_fetch_class, + register_output_types, + tile, + unregister, +) + +# noinspection PyUnresolvedReferences +from .graph import ( + DAG, + ChunkGraph, + ChunkGraphBuilder, + DirectedGraph, + GraphContainsCycleError, + TileableGraph, + TileableGraphBuilder, + TileContext, + TileStatus, +) +from .mode import enter_mode, is_build_mode, is_eager_mode, is_kernel_mode diff --git a/python/xorbits/_mars/core/base.py b/python/xorbits/_mars/core/base.py new file mode 100644 index 000000000..c3022c83f --- /dev/null +++ b/python/xorbits/_mars/core/base.py @@ -0,0 +1,149 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
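A compact sketch of the two dask-on-Mars entry points defined in contrib/dask above, mirroring the usage exercised in test_dask.py (illustrative; it assumes dask is installed and a Mars session is available):

import dask
from xorbits._mars.contrib.dask import convert_dask_collection, mars_scheduler

def inc(x):
    return x + 1

total = dask.delayed(sum)([dask.delayed(inc)(i) for i in range(5)])

# run the dask graph through Mars remote tasks instead of dask's own scheduler
print(total.compute(scheduler=mars_scheduler))

# or convert the collection into a Mars object and drive it with Mars directly
print(convert_dask_collection(total).execute().fetch())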
+ +from typing import Dict, Tuple, Type + +from ..serialization.serializables import Serializable, StringField +from ..serialization.serializables.core import SerializableSerializer +from ..utils import tokenize + + +class Base(Serializable): + _no_copy_attrs_ = {"_id"} + _init_update_key_ = True + + _key = StringField("key", default=None) + _id = StringField("id") + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self._init_update_key_ and (not hasattr(self, "_key") or not self._key): + self._update_key() + if not hasattr(self, "_id") or not self._id: + self._id = str(id(self)) + + @property + def _keys_(self): + cls = type(self) + member = "__keys_" + cls.__name__ + try: + return getattr(cls, member) + except AttributeError: + slots = sorted(self._FIELDS) + setattr(cls, member, slots) + return slots + + @property + def _copy_tags_(self): + cls = type(self) + member = f"__copy_tags_{cls.__name__}" + try: + return getattr(cls, member) + except AttributeError: + slots = sorted( + f.name for k, f in self._FIELDS.items() if k not in self._no_copy_attrs_ + ) + setattr(cls, member, slots) + return slots + + @property + def _values_(self): + values = [] + fields = self._FIELDS + for k in self._copy_tags_: + try: + values.append(fields[k].get(self)) + except AttributeError: + values.append(None) + return values + + def __mars_tokenize__(self): + try: + return self._key + except AttributeError: # pragma: no cover + self._update_key() + return self._key + + def _obj_set(self, k, v): + object.__setattr__(self, k, v) + + def _update_key(self): + self._obj_set("_key", tokenize(type(self).__name__, *self._values_)) + return self + + def reset_key(self): + self._obj_set("_key", None) + return self + + def __copy__(self): + return self.copy() + + def copy(self): + return self.copy_to(type(self)(_key=self.key)) + + def copy_to(self, target: "Base"): + target_fields = target._FIELDS + no_copy_attrs = self._no_copy_attrs_ + for k, field in self._FIELDS.items(): + if k in no_copy_attrs: + continue + try: + # Slightly faster than getattr. + value = field.__get__(self, k) + target_fields[k].set(target, value) + except AttributeError: + continue + + return target + + def copy_from(self, obj): + obj.copy_to(self) + + @property + def key(self): + return self._key + + @property + def id(self): + return self._id + + def to_kv(self, exclude_fields: Tuple[str], accept_value_types: Tuple[Type]): + fields = self._FIELDS + kv = {} + no_value = object() + for name, field in fields.items(): + if name not in exclude_fields: + value = getattr(self, name, no_value) + if value is not no_value and isinstance(value, accept_value_types): + kv[field.tag] = value + return kv + + +class BaseSerializer(SerializableSerializer): + def serial(self, obj: Base, context: Dict): + return super().serial(obj, context) + + +BaseSerializer.register(Base) + + +class MarsError(Exception): + pass + + +class ExecutionError(MarsError): + def __init__(self, nested_error: BaseException): + super().__init__(nested_error) + self.nested_error = nested_error diff --git a/python/xorbits/_mars/core/context.py b/python/xorbits/_mars/core/context.py new file mode 100644 index 000000000..6379d29ed --- /dev/null +++ b/python/xorbits/_mars/core/context.py @@ -0,0 +1,304 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from abc import ABC, abstractmethod +from typing import Dict, List + +from ..storage.base import StorageLevel +from ..typing import BandType, SessionType +from ..utils import classproperty + + +class Context(ABC): + """ + Context that providing API that can be + used inside `tile` and `execute`. + """ + + all_contexts = [] + + def __init__( + self, + session_id: str = None, + supervisor_address: str = None, + worker_address: str = None, + local_address: str = None, + band: BandType = None, + ): + if session_id is None: + # try to get session id from environment + session_id = os.environ.get("MARS_SESSION_ID") + if session_id is None: + raise ValueError("session_id should be provided to create a context") + if supervisor_address is None: + # try to get supervisor address from environment + supervisor_address = os.environ.get("MARS_SUPERVISOR_ADDRESS") + if supervisor_address is None: + raise ValueError( + "supervisor_address should be provided to create a context" + ) + + self.session_id = session_id + self.supervisor_address = supervisor_address + self.worker_address = worker_address + self.local_address = local_address + self.band = band + + @abstractmethod + def get_current_session(self) -> SessionType: + """ + Get current session + + Returns + ------- + session + """ + + @abstractmethod + def get_local_host_ip(self) -> str: + """ + Get local worker's host ip + + Returns + ------- + host_ip : str + """ + + @abstractmethod + def get_supervisor_addresses(self) -> List[str]: + """ + Get supervisor addresses. + + Returns + ------- + supervisor_addresses : list + """ + + @abstractmethod + def get_worker_addresses(self) -> List[str]: + """ + Get worker addresses. + + Returns + ------- + worker_addresses : list + """ + + @abstractmethod + def get_worker_bands(self) -> List[BandType]: + """ + Get worker bands. + + Returns + ------- + worker_bands : list + """ + + @abstractmethod + def get_total_n_cpu(self) -> int: + """ + Get number of cpus. + + Returns + ------- + number_of_cpu: int + """ + + @abstractmethod + def get_slots(self) -> int: + """ + Get num of slots of current band + + Returns + ------- + number_of_bands: int + """ + + @abstractmethod + def get_chunks_result(self, data_keys: List[str], fetch_only: bool = False) -> List: + """ + Get result of chunks. + + Parameters + ---------- + data_keys : list + Data keys. + fetch_only : bool + If fetch_only, only fetch data but not return. + + Returns + ------- + results : list + Result of chunks if not fetch_only, else return None + """ + + @abstractmethod + def get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + """ + Get meta of chunks. + + Parameters + ---------- + data_keys : list + Data keys. + fields : list + Fields to filter. + error : str + raise, ignore + + Returns + ------- + meta_list : list + Meta list. + """ + + @abstractmethod + def get_storage_info(self, address: str, level: StorageLevel): + """ + Get the customized storage backend info of requested storage backend level at given worker. + + Parameters + ---------- + address: str + The worker address. 
+ level: StorageLevel + The storage level to fetch the backend info. + + Returns + ------- + info: dict + Customized storage backend info dict of all workers. The key is + worker address, the value is the backend info dict. + """ + + @abstractmethod + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + """ + Create remote object. + + Parameters + ---------- + name : str + Object name. + object_cls + Object class. + args + kwargs + + Returns + ------- + ref + """ + + @abstractmethod + def get_remote_object(self, name: str): + """ + Get remote object + + Parameters + ---------- + name : str + Object name. + + Returns + ------- + ref + """ + + @abstractmethod + def destroy_remote_object(self, name: str): + """ + Destroy remote object. + + Parameters + ---------- + name : str + Object name. + """ + + @abstractmethod + def register_custom_log_path( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + """ + Register custom log path. + + Parameters + ---------- + session_id : str + Session ID. + tileable_op_key : str + Key of tileable's op. + chunk_op_key : str + Kye of chunk's op. + worker_address : str + Worker address. + log_path : str + Log path. + """ + + def new_custom_log_dir(self) -> str: + """ + New custom log dir. + + Returns + ------- + custom_log_dir : str + Custom log dir. + """ + + def set_running_operand_key(self, session_id: str, op_key: str): + """ + Set key of running operand. + + Parameters + ---------- + session_id : str + op_key : str + """ + + def set_progress(self, progress: float): + """ + Set progress of running operand. + + Parameters + ---------- + progress : float + """ + + def __enter__(self): + Context.all_contexts.append(self) + + def __exit__(self, *_): + Context.all_contexts.pop() + + @classproperty + def current(cls): + return cls.all_contexts[-1] if cls.all_contexts else None + + +def set_context(context: Context): + Context.all_contexts.append(context) + + +def get_context() -> Context: + return Context.current diff --git a/python/xorbits/_mars/core/custom_log.py b/python/xorbits/_mars/core/custom_log.py new file mode 100644 index 000000000..7b918ef21 --- /dev/null +++ b/python/xorbits/_mars/core/custom_log.py @@ -0,0 +1,188 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
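The Context API above is consumed by code that runs on workers during execution. A sketch of the typical access pattern (illustrative; the concrete Context subclass is supplied by the scheduling backend, and execute_on_worker is a hypothetical function):

from xorbits._mars.core.context import get_context

def execute_on_worker(data_keys):
    # inside an operand's execute(), a backend-provided Context is current
    ctx = get_context()
    print("session", ctx.session_id, "band", ctx.band)
    print(ctx.get_total_n_cpu(), "CPUs across",
          len(ctx.get_worker_addresses()), "workers")
    # fetch peer chunk results through the context instead of touching storage
    return ctx.get_chunks_result(data_keys)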
+ +import functools +import io +import os +import sys +import textwrap +import weakref +from typing import Callable, List, Type + +from ..typing import OperandType, SessionType, TileableType +from .context import Context + + +class _LogWrapper: + def __init__(self, ctx: Context, op: OperandType, log_path: str): + self.ctx = ctx + self.op = op + self.log_path = log_path + + self.file = open(log_path, "w") + self.stdout = sys.stdout + + self.raw_stdout = self.stdout + while isinstance(self.raw_stdout, _LogWrapper): + self.raw_stdout = self.raw_stdout.stdout + + # flag about registering log path + self.is_log_path_registered = False + + def __enter__(self): + self.file.__enter__() + # set stdout + sys.stdout = self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.__exit__(exc_type, exc_val, exc_tb) + # set back stdout + sys.stdout = self.stdout + + def _register_log_path(self): + if self.is_log_path_registered: + return + + # register log path + session_id = self.ctx.session_id + tileable_op_key = self.op.tileable_op_key + chunk_op_key = self.op.key + worker_addr = self.ctx.local_address + log_path = self.log_path + + self.ctx.register_custom_log_path( + session_id, tileable_op_key, chunk_op_key, worker_addr, log_path + ) + + self.is_log_path_registered = True + + def write(self, data): + self._register_log_path() + + # write into file + self.file.write(data) + # force flush to make sure `fetch_log` can get stdout in time + self.file.flush() + # write into previous stdout + self.raw_stdout.write(data) + + def flush(self): + self.raw_stdout.flush() + + +def redirect_custom_log(func: Callable[[Type, Context, OperandType], None]): + """ + Redirect stdout to a file by wrapping ``Operand.execute(ctx, op)`` + """ + + @functools.wraps(func) + def wrap(cls, ctx: Context, op: OperandType): + custom_log_dir = ctx.new_custom_log_dir() + + if custom_log_dir is None: + return func(cls, ctx, op) + + log_path = os.path.join(custom_log_dir, op.key) + + with _LogWrapper(ctx, op, log_path): + return func(cls, ctx, op) + + return wrap + + +_tileable_to_log_fetcher = weakref.WeakKeyDictionary() + + +class LogFetcher: + def __init__(self, tileable_op_key: str, session: SessionType): + self._tileable_op_key = tileable_op_key + self._session = session + self._chunk_op_key_to_result = dict() + self._chunk_op_key_to_offsets = dict() + + def __len__(self): + return len(self._chunk_op_key_to_result) + + @property + def chunk_op_keys(self) -> List[str]: + return list(self._chunk_op_key_to_result.keys()) + + @property + def results(self) -> list: + return list(self._chunk_op_key_to_result.values()) + + @property + def offsets(self) -> List[List[int]]: + return list(self._chunk_op_key_to_offsets.values()) + + def fetch(self, offsets: List[int] = None, sizes: List[int] = None): + if offsets is None: + offsets = self._chunk_op_key_to_offsets + + if sizes is None: + sizes = 1 * 1024**2 # 1M each time + + result: dict = self._session.fetch_tileable_op_logs( + self._tileable_op_key, offsets=offsets, sizes=sizes + ) + + if result is None: + return + + for chunk_key, chunk_result in result.items(): + self._chunk_op_key_to_result[chunk_key] = chunk_result["log"] + self._chunk_op_key_to_offsets[chunk_key] = chunk_result["offset"] + + def _display(self, representation: bool = True): + if len(self) == 1: + content = next(iter(self._chunk_op_key_to_result.values())) + return repr(content) if representation else str(content) + + sio = io.StringIO() + for chunk_op_key, content in self._chunk_op_key_to_result.items(): + 
sio.write( + textwrap.dedent( + f""" + Chunk op key: {chunk_op_key} + Out: + {content}""" + ) + ) + result = sio.getvalue() + return repr(result) if representation else str(result) + + def __repr__(self): + return self._display(True) + + def __str__(self): + return self._display(False) + + +def fetch( + tileables: List[TileableType], + session: SessionType, + offsets: List[int] = None, + sizes: List[int] = None, +): + log_fetchers = [] + for tileable in tileables: + tileable = tileable.data if hasattr(tileable, "data") else tileable + + if tileable not in _tileable_to_log_fetcher: + _tileable_to_log_fetcher[tileable] = LogFetcher(tileable.op.key, session) + + log_fetcher = _tileable_to_log_fetcher[tileable] + log_fetcher.fetch(offsets=offsets, sizes=sizes) + log_fetchers.append(log_fetcher) + return log_fetchers diff --git a/python/xorbits/_mars/core/entity/__init__.py b/python/xorbits/_mars/core/entity/__init__.py new file mode 100644 index 000000000..e0a4ee754 --- /dev/null +++ b/python/xorbits/_mars/core/entity/__init__.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .chunks import CHUNK_TYPE, Chunk, ChunkData +from .core import ENTITY_TYPE, Entity, EntityData +from .executable import ExecutableTuple, _ExecuteAndFetchMixin +from .fuse import FUSE_CHUNK_TYPE, FuseChunk, FuseChunkData +from .objects import ( + OBJECT_CHUNK_TYPE, + OBJECT_TYPE, + Object, + ObjectChunk, + ObjectChunkData, + ObjectData, +) +from .output_types import ( + OutputType, + get_chunk_types, + get_fetch_class, + get_output_types, + get_tileable_types, + register_fetch_class, + register_output_types, +) +from .tileables import ( + TILEABLE_TYPE, + HasShapeTileable, + HasShapeTileableData, + NotSupportTile, + Tileable, + TileableData, + register, + unregister, +) +from .utils import recursive_tile, tile diff --git a/python/xorbits/_mars/core/entity/chunks.py b/python/xorbits/_mars/core/entity/chunks.py new file mode 100644 index 000000000..e96b87aa3 --- /dev/null +++ b/python/xorbits/_mars/core/entity/chunks.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
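redirect_custom_log and LogFetcher above cooperate: the decorator captures an operand's stdout into a per-chunk file and registers its path through the context, and the fetch helper later pulls the captured text back via the session. A sketch of the decorator side (illustrative; DemoOperand is a hypothetical operand class):

from xorbits._mars.core.custom_log import redirect_custom_log

class DemoOperand:  # hypothetical operand, for illustration only
    @classmethod
    @redirect_custom_log
    def execute(cls, ctx, op):
        # anything printed here is written to a per-chunk log file under the
        # session's custom log dir and can be read back later via fetch_log
        print(f"executing chunk of op {op.key}")
        ...  # a real operand would compute and store its results here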
+
+from ...serialization.serializables import BoolField, FieldTypes, TupleField
+from ...utils import tokenize
+from .core import Entity, EntityData
+
+
+class ChunkData(EntityData):
+    __slots__ = ()
+
+    is_broadcaster = BoolField("is_broadcaster", default=False)
+    # If the operand is a shuffle mapper, this flag indicates whether the current chunk is a mapper chunk when
+    # the operand produces multiple chunks, such as TensorUnique.
+    is_mapper = BoolField("is_mapper", default=None)
+    # optional fields
+    _index = TupleField("index", FieldTypes.uint32)
+
+    def __repr__(self):
+        if self.op.stage is None:
+            return (
+                f"{type(self).__name__} <op={type(self.op).__name__}, key={self.key}>"
+            )
+        else:
+            return (
+                f"{type(self).__name__} <op={type(self.op).__name__}, stage={self.op.stage.name}, key={self.key}>"
+            )
+
+    @property
+    def index(self):
+        return getattr(self, "_index", None)
+
+    @property
+    def device(self):
+        return self.op.device
+
+    def _update_key(self):
+        object.__setattr__(
+            self,
+            "_key",
+            tokenize(
+                type(self).__name__,
+                *(getattr(self, k, None) for k in self._keys_ if k != "_index"),
+            ),
+        )
+
+
+class Chunk(Entity):
+    _allow_data_type_ = (ChunkData,)
+
+    def __repr__(self):
+        return f"{type(self).__name__}({self._data.__repr__()})"
+
+
+CHUNK_TYPE = (Chunk, ChunkData)
diff --git a/python/xorbits/_mars/core/entity/core.py b/python/xorbits/_mars/core/entity/core.py
new file mode 100644
index 000000000..b6bbeff56
--- /dev/null
+++ b/python/xorbits/_mars/core/entity/core.py
@@ -0,0 +1,152 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from ...serialization.serializables import ( + DictField, + FieldTypes, + ReferenceField, + Serializable, +) +from ...utils import AttributeDict +from ..base import Base + + +class EntityData(Base): + __slots__ = ("_siblings",) + type_name = None + + # required fields + _op = ReferenceField("op", "mars.core.operand.base.Operand") + # optional fields + _extra_params = DictField("extra_params", key_type=FieldTypes.string) + + def __init__(self, *args, **kwargs): + extras = AttributeDict( + (k, kwargs.pop(k)) for k in set(kwargs) - set(self._FIELDS) + ) + kwargs["_extra_params"] = kwargs.pop("_extra_params", extras) + super().__init__(*args, **kwargs) + + @property + def op(self): + return self._op + + @property + def inputs(self): + return self.op.inputs + + @inputs.setter + def inputs(self, new_inputs): + self.op.inputs = new_inputs + + def is_sparse(self): + return self.op.is_sparse() + + issparse = is_sparse + + @property + def extra_params(self): + return self._extra_params + + def build_graph(self, **kw): + from ..graph.builder.utils import build_graph + + return build_graph([self], **kw) + + def visualize(self, graph_attrs=None, node_attrs=None, **kw): + from graphviz import Source + + g = self.build_graph(**kw) + dot = g.to_dot( + graph_attrs=graph_attrs, + node_attrs=node_attrs, + result_chunk_keys={c.key for c in self.chunks}, + ) + + return Source(dot) + + def _need_execution(self): # pylint: disable=no-self-use + # some tileable may generate unknown meta, + # they need to be executed first + return False + + +class Entity(Serializable): + _allow_data_type_ = () + type_name = None + + _data = ReferenceField("data", EntityData) + + def __init__(self, data=None, **kw): + super().__init__(_data=data, **kw) + + def __dir__(self): + obj_dir = object.__dir__(self) + if self._data is not None: + obj_dir = sorted(set(dir(self._data) + obj_dir)) + return obj_dir + + def __str__(self): + return self._data.__str__() + + def __repr__(self): + return self._data.__repr__() + + def _check_data(self, data): + if data is not None and not isinstance(data, self._allow_data_type_): + raise TypeError(f"Expect {self._allow_data_type_}, got {type(data)}") + + @property + def data(self): + return self._data + + @data.setter + def data(self, new_data): + self._check_data(new_data) + self._data = new_data + + def __copy__(self): + return self.copy() + + def copy(self): + return self.copy_to(type(self)(None)) + + def copy_to(self, target): + target.data = self._data + return target + + def copy_from(self, obj): + self.data = obj.data + + def tiles(self): + from .tileables import handler + + new_entity = self.copy() + new_entity.data = handler.tiles(self.data) + return new_entity + + def __getattr__(self, attr): + return getattr(self._data, attr) + + def __setattr__(self, key, value): + try: + object.__setattr__(self, key, value) + except AttributeError: + return setattr(self._data, key, value) + + def _need_execution(self): + return self._data._need_execution() + + +ENTITY_TYPE = (Entity, EntityData) diff --git a/python/xorbits/_mars/core/entity/executable.py b/python/xorbits/_mars/core/entity/executable.py new file mode 100644 index 000000000..2b420efa5 --- /dev/null +++ b/python/xorbits/_mars/core/entity/executable.py @@ -0,0 +1,337 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import atexit +import concurrent.futures +import queue +import threading +from typing import List +from weakref import WeakKeyDictionary, ref + +from ...lib.aio import get_isolation +from ...typing import SessionType, TileableType +from ..mode import enter_mode + + +class DecrefRunner: + def __init__(self): + self._decref_thread = None + self._queue = queue.Queue() + + def start(self): + self._decref_thread = threading.Thread( + target=self._thread_body, name="DecrefThread" + ) + self._decref_thread.daemon = True + self._decref_thread.start() + + def _thread_body(self): + from ...deploy.oscar.session import SyncSession + from ...oscar.errors import ActorNotExist + + while True: + key, session_ref, fut = self._queue.get() + if key is None: + break + + session = session_ref() + if session is None: + fut.set_result(None) + continue + try: + s = SyncSession.from_isolated_session(session) + s.decref(key) + fut.set_result(None) + except (RuntimeError, ConnectionError, KeyError, ActorNotExist): + fut.set_result(None) + except ( + Exception + ) as ex: # pragma: no cover # noqa: E722 # nosec # pylint: disable=bare-except + fut.set_exception(ex) + finally: + del session + + def stop(self): + if self._decref_thread: # pragma: no branch + self._queue.put_nowait((None, None, None)) + self._decref_thread.join(1) + + def put(self, key: str, session_ref: ref): + if self._decref_thread is None: + self.start() + + fut = concurrent.futures.Future() + self._queue.put_nowait((key, session_ref, fut)) + return fut + + +_decref_runner = DecrefRunner() +atexit.register(_decref_runner.stop) + + +class _TileableSession: + def __init__(self, tileable: TileableType, session: SessionType): + self._sess_id = id(session) + key = tileable.key + + def cb(_, sess=ref(session)): + try: + cur_thread_ident = threading.current_thread().ident + decref_in_isolation = get_isolation().thread_ident == cur_thread_ident + except KeyError: + # isolation destroyed, no need to decref + return + + fut = _decref_runner.put(key, sess) + if not decref_in_isolation: + # if decref in isolation, means that this tileable + # is not required for main thread, thus we do not need + # to wait for decref, otherwise, wait a bit + try: + fut.result(0.5) + except concurrent.futures.TimeoutError: + # ignore timeout + pass + + self.tileable = ref(tileable, cb) + + def __eq__(self, other: "_TileableSession"): + return self._sess_id == other._sess_id + + +class _TileableDataCleaner: + def __init__(self): + self._tileable_to_sessions = WeakKeyDictionary() + + @enter_mode(build=True) + def register(self, tileable: TileableType, session: SessionType): + if tileable in self._tileable_to_sessions: + self._tileable_to_sessions[tileable].append( + _TileableSession(tileable, session) + ) + else: + self._tileable_to_sessions[tileable] = [_TileableSession(tileable, session)] + + +# we don't use __del__ to avoid potential Circular reference +_cleaner = _TileableDataCleaner() + + +def _get_session(executable: "_ExecutableMixin", session: SessionType = None): + from ...deploy.oscar.session import get_default_session + + # if session is not 
specified, use default session + if session is None: + session = get_default_session() + + return session + + +class _ExecutableMixin: + __slots__ = () + _executed_sessions: List[SessionType] + + def execute(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import execute + + session = _get_session(self, session) + return execute(self, session=session, **kw) + + def _check_session(self, session: SessionType, action: str): + if session is None: + if isinstance(self, tuple): + key = self[0].key + else: + key = self.key + raise ValueError( + f"Tileable object {key} must be executed first before {action}" + ) + + def _fetch(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import fetch + + session = _get_session(self, session) + self._check_session(session, "fetch") + return fetch(self, session=session, **kw) + + def fetch(self, session: SessionType = None, **kw): + return self._fetch(session=session, **kw) + + def fetch_log( + self, + session: SessionType = None, + offsets: List[int] = None, + sizes: List[int] = None, + ): + from ...deploy.oscar.session import fetch_log + + session = _get_session(self, session) + self._check_session(session, "fetch_log") + return fetch_log(self, session=session, offsets=offsets, sizes=sizes)[0] + + def _fetch_infos(self, fields=None, session=None, **kw): + from ...deploy.oscar.session import fetch_infos + + session = _get_session(self, session) + self._check_session(session, "fetch_infos") + return fetch_infos(self, fields=fields, session=session, **kw) + + def _attach_session(self, session: SessionType): + if session not in self._executed_sessions: + _cleaner.register(self, session) + self._executed_sessions.append(session) + + def _detach_session(self, session: SessionType): + if session in self._executed_sessions: + sessions = _cleaner._tileable_to_sessions.get(self, []) + if sessions: + sessions.remove(_TileableSession(self, session)) + if len(sessions) == 0: + del _cleaner._tileable_to_sessions[self] + self._executed_sessions.remove(session) + + +class _ExecuteAndFetchMixin: + __slots__ = () + + def _execute_and_fetch(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import ExecutionInfo, SyncSession, fetch + + session = _get_session(self, session) + fetch_kwargs = kw.pop("fetch_kwargs", dict()) + if session in self._executed_sessions: + # if has been executed, fetch directly. 
+ return self.fetch(session=session, **fetch_kwargs) + ret = self.execute(session=session, **kw) + if isinstance(ret, ExecutionInfo): + # wait=False + aio_task = ret.aio_task + + async def _wait(): + await aio_task + + def run(): + asyncio.run_coroutine_threadsafe(_wait(), loop=ret.loop).result() + return fetch(self, session=session, **fetch_kwargs) + + return SyncSession._execution_pool.submit(run) + else: + # wait=True + return self.fetch(session=session, **fetch_kwargs) + + +class _ToObjectMixin(_ExecuteAndFetchMixin): + __slots__ = () + + def to_object(self, session: SessionType = None, **kw): + return self._execute_and_fetch(session=session, **kw) + + +class ExecutableTuple(tuple, _ExecutableMixin, _ToObjectMixin): + def __init__(self, *args): + tuple.__init__(*args) + + self._fields_to_idx = None + self._fields = None + self._raw_type = None + + if len(args) == 1 and isinstance(args[0], tuple): + self._fields = getattr(args[0], "_fields", None) + if self._fields is not None: + self._raw_type = type(args[0]) + self._fields_to_idx = {f: idx for idx, f in enumerate(self._fields)} + + self._executed_sessions = [] + + def __getattr__(self, item): + if self._fields_to_idx is None or item not in self._fields_to_idx: + raise AttributeError(item) + return self[self._fields_to_idx[item]] + + def __dir__(self): + result = list(super().__dir__()) + list(self._fields or []) + return sorted(result) + + def __repr__(self): + if not self._fields: + return super().__repr__() + items = [] + for k, v in zip(self._fields, self): + items.append(f"{k}={v!r}") + return "%s(%s)" % (self._raw_type.__name__, ", ".join(items)) + + def execute(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import execute + + if len(self) == 0: + return self + + session = _get_session(self, session) + ret = execute(*self, session=session, **kw) + + if session not in self._executed_sessions: + self._executed_sessions.append(session) + + if kw.get("wait", True): + return self + else: + return ret + + def _fetch(self, session: SessionType = None, **kw): + from ...deploy.oscar.session import fetch + + session = _get_session(self, session) + self._check_session(session, "fetch") + return fetch(*self, session=session, **kw) + + def _fetch_infos(self, fields=None, session=None, **kw): + from ...deploy.oscar.session import fetch_infos + + session = _get_session(self, session) + self._check_session(session, "fetch_infos") + return fetch_infos(*self, fields=fields, session=session, **kw) + + def fetch(self, session: SessionType = None, **kw): + if len(self) == 0: + return tuple() + + session = _get_session(self, session) + ret = super().fetch(session=session, **kw) + if self._raw_type is not None: + ret = self._raw_type(*ret) + if len(self) == 1: + return (ret,) + return ret + + def fetch_log( + self, + session: SessionType = None, + offsets: List[int] = None, + sizes: List[int] = None, + ): + from ...deploy.oscar.session import fetch_log + + if len(self) == 0: + return [] + session = self._get_session(session=session) + return fetch_log(*self, session=session, offsets=offsets, sizes=sizes) + + def _get_session(self, session: SessionType = None): + if session is None: + for item in self: + session = _get_session(item, session) + if session is not None: + return session + return session diff --git a/python/xorbits/_mars/core/entity/fuse.py b/python/xorbits/_mars/core/entity/fuse.py new file mode 100644 index 000000000..0fafe2d9c --- /dev/null +++ b/python/xorbits/_mars/core/entity/fuse.py @@ -0,0 +1,73 @@ +# 
Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...serialization.serializables import ReferenceField +from .chunks import CHUNK_TYPE, Chunk, ChunkData + + +class FuseChunkData(ChunkData): + __slots__ = ("_inited",) + + _chunk = ReferenceField( + "chunk", CHUNK_TYPE, on_serialize=lambda x: x.data if hasattr(x, "data") else x + ) + + def __init__(self, *args, **kwargs): + self._inited = False + super().__init__(*args, **kwargs) + self._extra_params = {} + self._inited = True + + @property + def chunk(self): + return self._chunk + + @property + def composed(self): + # for compatibility, just return the topological ordering, + # once we apply optimization on the subgraph, + # `composed` is not needed any more and should be removed then. + assert getattr(self._op, "fuse_graph", None) is not None + fuse_graph = self._op.fuse_graph + return list(fuse_graph.topological_iter()) + + def __getattr__(self, attr): + if not self._inited: + return object.__getattribute__(self, attr) + if attr in self._extra_params: + return self._extra_params[attr] + try: + return getattr(self._chunk, attr) + except AttributeError: + return object.__getattribute__(self, attr) + + def __setattr__(self, attr, value): + if attr == "params": + self._chunk.params = value + else: + super().__setattr__(attr, value) + + @property + def nbytes(self): + return np.prod(self.shape) * self.dtype.itemsize + + +class FuseChunk(Chunk): + __slots__ = () + _allow_data_type_ = (FuseChunkData,) + + +FUSE_CHUNK_TYPE = (FuseChunkData, FuseChunk) diff --git a/python/xorbits/_mars/core/entity/objects.py b/python/xorbits/_mars/core/entity/objects.py new file mode 100644 index 000000000..cefe7d629 --- /dev/null +++ b/python/xorbits/_mars/core/entity/objects.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
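The `ExecutableTuple` defined above remembers the field names of a namedtuple it wraps and exposes them as attributes through `_fields_to_idx`. Below is a stripped-down, self-contained sketch of just that lookup; the class name `FieldTuple` and the sample namedtuple are illustrative, not part of the codebase:

from collections import namedtuple

class FieldTuple(tuple):
    """Stripped-down illustration of the named-field lookup in ExecutableTuple."""

    def __init__(self, *args):
        self._fields = None
        self._fields_to_idx = None
        if len(args) == 1 and isinstance(args[0], tuple):
            self._fields = getattr(args[0], "_fields", None)
            if self._fields is not None:
                # map each field name to its position in the tuple
                self._fields_to_idx = {f: i for i, f in enumerate(self._fields)}

    def __getattr__(self, item):
        if self._fields_to_idx is None or item not in self._fields_to_idx:
            raise AttributeError(item)
        return self[self._fields_to_idx[item]]

Pair = namedtuple("Pair", ["left", "right"])
ft = FieldTuple(Pair(1, 2))
assert tuple(ft) == (1, 2)
assert ft.left == 1 and ft.right == 2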
+ +from typing import Any, Dict + +from ...serialization.serializables import FieldTypes, ListField +from .chunks import Chunk, ChunkData +from .executable import _ToObjectMixin +from .tileables import Tileable, TileableData + + +class ObjectChunkData(ChunkData): + # chunk whose data could be any serializable + __slots__ = () + type_name = "Object" + + def __init__(self, op=None, index=None, **kw): + super().__init__(_op=op, _index=index, **kw) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "index": self.index, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: Any) -> Dict[str, Any]: + return dict() + + +class ObjectChunk(Chunk): + __slots__ = () + _allow_data_type_ = (ObjectChunkData,) + type_name = "Object" + + +class ObjectData(TileableData, _ToObjectMixin): + __slots__ = () + type_name = "Object" + + # optional fields + _chunks = ListField( + "chunks", + FieldTypes.reference(ObjectChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [ObjectChunk(it) for it in x] if x is not None else x, + ) + + def __init__(self, op=None, nsplits=None, chunks=None, **kw): + super().__init__(_op=op, _nsplits=nsplits, _chunks=chunks, **kw) + + def __repr__(self): + return f"Object " + + @property + def params(self): + # params return the properties which useful to rebuild a new tileable object + return dict() + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + # refresh params when chunks updated + # nothing needs to do for Object + pass + + +class Object(Tileable, _ToObjectMixin): + __slots__ = () + _allow_data_type_ = (ObjectData,) + type_name = "Object" + + +OBJECT_TYPE = (Object, ObjectData) +OBJECT_CHUNK_TYPE = (ObjectChunk, ObjectChunkData) diff --git a/python/xorbits/_mars/core/entity/output_types.py b/python/xorbits/_mars/core/entity/output_types.py new file mode 100644 index 000000000..b63f59508 --- /dev/null +++ b/python/xorbits/_mars/core/entity/output_types.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
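`ObjectChunkData.params` above validates updates by copying the incoming dict, popping every recognized key, and raising on whatever is left over. A small standalone sketch of that pop-then-raise pattern; the names here are invented for the example, and unlike this demo the Object chunk above simply discards `index` on update:

from typing import Any, Dict

class ParamsDemo:
    """Illustrates the pop-then-raise validation used by the params setters above."""

    def __init__(self, index=None):
        self.index = index

    @property
    def params(self) -> Dict[str, Any]:
        # only the properties needed to rebuild an equivalent chunk
        return {"index": self.index}

    @params.setter
    def params(self, new_params: Dict[str, Any]):
        params = new_params.copy()
        # consume the keys we understand; anything left is an error
        self.index = params.pop("index", self.index)
        if params:
            raise TypeError(f"Unknown params: {list(params)}")

c = ParamsDemo(index=(0, 1))
c.params = {"index": (1, 0)}
assert c.params == {"index": (1, 0)}
try:
    c.params = {"bogus": 42}
except TypeError:
    pass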
+ +import functools +from enum import Enum + +from .fuse import FUSE_CHUNK_TYPE +from .objects import OBJECT_CHUNK_TYPE, OBJECT_TYPE + + +class OutputType(Enum): + object = 1 + tensor = 2 + dataframe = 3 + series = 4 + index = 5 + scalar = 6 + categorical = 7 + dataframe_groupby = 8 + series_groupby = 9 + df_or_series = 10 + + @classmethod + def serialize_list(cls, output_types): + return [ot.value for ot in output_types] if output_types is not None else None + + @classmethod + def deserialize_list(cls, output_types): + return [cls(ot) for ot in output_types] if output_types is not None else None + + +_OUTPUT_TYPE_TO_CHUNK_TYPES = {OutputType.object: OBJECT_CHUNK_TYPE} +_OUTPUT_TYPE_TO_TILEABLE_TYPES = {OutputType.object: OBJECT_TYPE} +_OUTPUT_TYPE_TO_FETCH_CLS = {} + + +def register_output_types(output_type, tileable_types, chunk_types): + _OUTPUT_TYPE_TO_TILEABLE_TYPES[output_type] = tileable_types + _OUTPUT_TYPE_TO_CHUNK_TYPES[output_type] = chunk_types + + +def register_fetch_class(output_type, fetch_cls, fetch_shuffle_cls): + _OUTPUT_TYPE_TO_FETCH_CLS[output_type] = (fetch_cls, fetch_shuffle_cls) + + +def get_tileable_types(output_type): + return _OUTPUT_TYPE_TO_TILEABLE_TYPES[output_type] + + +def get_chunk_types(output_type): + return _OUTPUT_TYPE_TO_CHUNK_TYPES[output_type] + + +def get_fetch_class(output_type): + return _OUTPUT_TYPE_TO_FETCH_CLS[output_type] + + +@functools.lru_cache(100) +def _get_output_type_by_cls(cls): + for tp in OutputType.__members__.values(): + try: + tileable_types = _OUTPUT_TYPE_TO_TILEABLE_TYPES[tp] + chunk_types = _OUTPUT_TYPE_TO_CHUNK_TYPES[tp] + if issubclass(cls, (tileable_types, chunk_types)): + return tp + except KeyError: # pragma: no cover + continue + raise TypeError("Output can only be tensor, dataframe or series") + + +def get_output_types(*objs, unknown_as=None): + output_types = [] + for obj in objs: + if obj is None: + continue + elif isinstance(obj, FUSE_CHUNK_TYPE): + obj = obj.chunk + + try: + output_types.append(_get_output_type_by_cls(type(obj))) + except TypeError: + if unknown_as is not None: + output_types.append(unknown_as) + else: # pragma: no cover + raise + return output_types diff --git a/python/xorbits/_mars/core/entity/tests/__init__.py b/python/xorbits/_mars/core/entity/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/core/entity/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/core/entity/tests/test_utils.py b/python/xorbits/_mars/core/entity/tests/test_utils.py new file mode 100644 index 000000000..ac54b3cd6 --- /dev/null +++ b/python/xorbits/_mars/core/entity/tests/test_utils.py @@ -0,0 +1,81 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... import tensor as mt +from ....tensor.operands import TensorOperand, TensorOperandMixin +from ....utils import has_unknown_shape +from ... import recursive_tile + + +class _TestOperand(TensorOperand, TensorOperandMixin): + @classmethod + def tile(cls, op: "_TestOperand"): + data1, data2 = op.inputs + + data1 = mt.sort(data1) + data2 = mt.sort(data2) + data_all = mt.concatenate([data1, data2]) + s1 = mt.searchsorted(data1, data_all) + s2 = mt.searchsorted(data2, data_all) + result = yield from recursive_tile(mt.concatenate([s1, s2])) + # data1 will be yield by s1 + assert not has_unknown_shape(data1) + assert not has_unknown_shape(data2) + assert not has_unknown_shape(data_all) + return result + + +def test_recursive_tile(setup): + d1 = mt.random.rand(10, chunk_size=5) + d2 = mt.random.rand(10, chunk_size=5) + op = _TestOperand() + t = op.new_tensor([d1, d2], dtype=d1.dtype, shape=(20,), order=d1.order) + t.execute(extra_config={"check_duplicated_operand_keys": True}) + + +class _TestOperandWithDuplicatedSubmission(TensorOperand, TensorOperandMixin): + @classmethod + def tile(cls, op: "_TestOperand"): + data1 = op.inputs[0] + + data2 = yield from recursive_tile(data1 + 1) + yield data2.chunks + data3 = yield from recursive_tile(data1 + 2) + yield data3.chunks + + return (yield from recursive_tile(data2 + data3)) + + +def test_recursive_tile_with_duplicated_submission(setup): + raw = np.random.RandomState(0).rand(10) + d1 = mt.tensor(raw, chunk_size=5) + op = _TestOperandWithDuplicatedSubmission() + t = op.new_tensor( + [ + d1, + ], + dtype=d1.dtype, + shape=(10,), + order=d1.order, + ) + + with pytest.raises(RuntimeError, match="submitted repeatedly"): + t.execute(extra_config={"check_duplicated_submission": True}) + + result = t.execute(extra_config={"check_duplicated_submission": False}) + expected = 2 * raw + 3 + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/core/entity/tileables.py b/python/xorbits/_mars/core/entity/tileables.py new file mode 100644 index 000000000..7f7718a2a --- /dev/null +++ b/python/xorbits/_mars/core/entity/tileables.py @@ -0,0 +1,470 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
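`output_types.py` above keeps module-level registries from each `OutputType` member to its tileable and chunk classes, and resolves an object's output type with an `lru_cache`d walk over those registries. A cut-down, self-contained sketch of the same lookup using toy classes in place of real tileables:

import functools
from enum import Enum

class ToyOutputType(Enum):
    object = 1
    tensor = 2

class ToyObject:
    pass

class ToyTensor:
    pass

_TYPE_REGISTRY = {}

def register_output_types(output_type, classes):
    _TYPE_REGISTRY[output_type] = classes

@functools.lru_cache(100)
def get_output_type_by_cls(cls):
    # walk the registry and return the first member whose classes match
    for tp, classes in _TYPE_REGISTRY.items():
        if issubclass(cls, classes):
            return tp
    raise TypeError(f"no output type registered for {cls}")

register_output_types(ToyOutputType.object, (ToyObject,))
register_output_types(ToyOutputType.tensor, (ToyTensor,))
assert get_output_type_by_cls(ToyTensor) is ToyOutputType.tensor
assert get_output_type_by_cls(ToyObject) is ToyOutputType.object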
+ +import builtins +import inspect +import itertools +from operator import attrgetter +from typing import Callable, Generator, List +from weakref import WeakKeyDictionary, WeakSet + +import numpy as np + +from ...serialization.serializables import BoolField, FieldTypes, TupleField +from ...typing import ChunkType, OperandType, TileableType +from ...utils import on_deserialize_shape, on_serialize_nsplits, on_serialize_shape +from ..base import Base +from ..mode import enter_mode +from .chunks import Chunk +from .core import Entity, EntityData +from .executable import _ExecutableMixin + + +class NotSupportTile(Exception): + pass + + +class OperandTilesHandler: + _handlers = dict() + + @classmethod + def _get_op_cls(cls, op: OperandType): + if isinstance(op, type): + return op + return type(op) + + @classmethod + def register( + cls, op: OperandType, tile_handler: Callable[[OperandType], TileableType] + ): + cls._handlers[cls._get_op_cls(op)] = tile_handler + + @classmethod + def unregister(cls, op: OperandType): + del cls._handlers[cls._get_op_cls(op)] + + @classmethod + def get_handler( + cls, op: OperandType + ) -> Callable[[OperandType], List[TileableType]]: + op_cls = cls._get_op_cls(op) + return cls._handlers.get(op_cls, op_cls.tile) + + @classmethod + def tile( + cls, tileables: List[TileableType] + ) -> Generator[List[ChunkType], List[ChunkType], List[TileableType]]: + op = tileables[0].op + # pre tile + op.pre_tile(op) + tiled_result = None + try: + tile_handler = cls.get_handler(op) + if inspect.isgeneratorfunction(tile_handler): + # op.tile can be a generator function, + # each time an operand yield some chunks, + # they will be put into ChunkGraph and executed first. + # After execution, resume from the yield place. + tiled_result = yield from tile_handler(op) + else: + # without iterative tiling + tiled_result = tile_handler(op) + finally: + op.post_tile(op, tiled_result) + + if not isinstance(tiled_result, list): + tiled_result = [tiled_result] + tiled_results = [t.data if hasattr(t, "data") else t for t in tiled_result] + assert len(tileables) == len(tiled_results) + if any(inspect.isgenerator(r) for r in tiled_results): # pragma: no cover + raise TypeError(f"tiled result cannot be generator when tiling {op}") + cls._assign_to(tiled_results, tileables) + return tileables + + @classmethod + def _assign_to( + cls, + tile_after_tensor_datas: List["TileableData"], + tile_before_tensor_datas: List["TileableData"], + ): + assert len(tile_after_tensor_datas) == len(tile_before_tensor_datas) + + for tile_after_tensor_data, tile_before_tensor_data in zip( + tile_after_tensor_datas, tile_before_tensor_datas + ): + if tile_before_tensor_data is None: + # garbage collected + continue + tile_after_tensor_data.copy_to(tile_before_tensor_data) + tile_before_tensor_data.op.outputs = tile_before_tensor_datas + + @enter_mode(kernel=True) + def dispatch(self, op: OperandType): + op_cls = self._get_op_cls(op) + tiled = None + cause = None + + if op_cls in self._handlers: + tiled = self._handlers[op_cls](op) + else: + try: + tiled = op_cls.tile(op) + except NotImplementedError as ex: + cause = ex + for super_cls in op_cls.__mro__: + if super_cls in self._handlers: + h = self._handlers[op_cls] = self._handlers[super_cls] + tiled = h(op) + break + + if tiled is not None: + return tiled if isinstance(tiled, list) else [tiled] + else: + raise NotImplementedError(f"{type(op)} does not support tile") from cause + + +handler = OperandTilesHandler() +register = OperandTilesHandler.register +unregister = 
OperandTilesHandler.unregister + + +class _ChunksIndexer: + __slots__ = ("_tileable",) + + def __init__(self, tileable): + self._tileable = tileable + + def __getitem__(self, item): + """ + The indices for `cix` can be [x, y] or [x, :]. + For the former the result will be a single chunk, + and for the later the result will be a list of chunks (flattened). + + The length of indices must be the same with `chunk_shape` of tileable. + """ + if isinstance(item, int): + item = (item,) + if isinstance(item, tuple): + if len(item) == 0 and self._tileable.is_scalar(): + return self._tileable.chunks[0] + if len(item) != self._tileable.ndim: + raise ValueError( + f"Cannot get chunk by {item}, " + f"expect length {self._tileable.ndim}" + ) + slices, singleton = [], True + for it, dim in zip(item, self._tileable.chunk_shape): + if isinstance(it, slice): + slices.append(range(dim)[it]) + singleton = False + elif np.issubdtype(type(it), np.integer): + slices.append([it if it >= 0 else dim + it]) + else: + raise TypeError( + f"Cannot get chunk by {it}, " + f"invalid value has type {type(it)}" + ) + + indexes = tuple(zip(*itertools.product(*slices))) + + flat_index = np.ravel_multi_index(indexes, self._tileable.chunk_shape) + if singleton: + return self._tileable._chunks[flat_index[0]] + else: + return [self._tileable._chunks[idx] for idx in flat_index] + + raise ValueError(f"Cannot get {type(self._tileable).__name__} chunk by {item}") + + +class EntityDataModificationHandler: + def __init__(self): + self._data_to_entities = WeakKeyDictionary() + + def _add_observer(self, data, entity): + # only tileable data should be considered + assert isinstance(data, TileableData) + assert isinstance(entity, Tileable) + + if data not in self._data_to_entities: + self._data_to_entities[data] = WeakSet() + + self._data_to_entities[data].add(entity) + + @enter_mode(build=True) + def add_observer(self, data, entity): + self._add_observer(data, entity) + + def _update_observe_data(self, observer, data, new_data): + self._data_to_entities.get(data, set()).discard(observer) + self._add_observer(new_data, observer) + + @staticmethod + def _set_data(entity, data): + entity._data.detach(entity) + entity._data = data + data.attach(entity) + + @staticmethod + def _get_data(obj): + return obj.data if isinstance(obj, Entity) else obj + + @enter_mode(build=True) + def data_changed(self, old_data, new_data): + notified = set() + processed_data = set() + old_to_new = {old_data: new_data} + q = [old_data] + while len(q) > 0: + data = q.pop() + + # handle entities + for entity in data.entities: + self._set_data(entity, old_to_new[data]) + notified.add(entity) + + observers = { + ob + for ob in self._data_to_entities.pop(data, set()) + if ob not in notified + } + for ob in observers: + new_data = self._get_data(ob.op.on_input_modify(old_to_new[data])) + old_data = ob.data + self._update_observe_data(ob, ob.data, new_data) + old_to_new[old_data] = new_data + if old_data not in processed_data: + q.append(old_data) + processed_data.add(old_data) + notified.add(ob) + + if data.op.create_view: + old_input_data = data.inputs[0] + new_input_data = self._get_data( + data.op.on_output_modify(old_to_new[data]) + ) + old_to_new[old_input_data] = new_input_data + if old_input_data not in processed_data: + q.append(old_input_data) + processed_data.add(old_input_data) + + +entity_view_handler = EntityDataModificationHandler() + + +class TileableData(EntityData, _ExecutableMixin): + __slots__ = "_cix", "_entities", "_executed_sessions" + 
_no_copy_attrs_ = Base._no_copy_attrs_ | {"_cix"} + + # optional fields + # `nsplits` means the sizes of chunks for each dimension + _nsplits = TupleField( + "nsplits", + FieldTypes.tuple(FieldTypes.tuple(FieldTypes.uint64)), + on_serialize=on_serialize_nsplits, + ) + # cache tileable data, if true, this data will be materialized + cache = BoolField("cache", default=False) + + def __init__(self: TileableType, *args, **kwargs): + if kwargs.get("_nsplits", None) is not None: + kwargs["_nsplits"] = tuple(tuple(s) for s in kwargs["_nsplits"]) + + super().__init__(*args, **kwargs) + + try: + chunks = self._chunks + if chunks: + self._chunks = sorted(chunks, key=attrgetter("index")) + except AttributeError: # pragma: no cover + pass + self._entities = WeakSet() + self._executed_sessions = [] + + def __on_deserialize__(self): + super(TileableData, self).__on_deserialize__() + self._entities = WeakSet() + self._executed_sessions = [] + + @property + def chunk_shape(self): + if hasattr(self, "_nsplits") and self._nsplits is not None: + return tuple(map(len, self._nsplits)) + + @property + def chunks(self) -> List[Chunk]: + return getattr(self, "_chunks", None) + + @property + def nsplits(self): + return getattr(self, "_nsplits", None) + + @nsplits.setter + def nsplits(self, new_nsplits): + self._nsplits = new_nsplits + + @property + def params(self) -> dict: + # params return the properties which useful to rebuild a new tileable object + return dict() + + @property + def cix(self): + if self.ndim == 0: + return _ChunksIndexer(self) + + try: + if getattr(self, "_cix", None) is None: + self._cix = _ChunksIndexer(self) + return self._cix + except (TypeError, ValueError): + return _ChunksIndexer(self) + + @property + def entities(self): + return self._entities + + def is_coarse(self): + if not hasattr(self, "_chunks"): + return True + if not self._chunks: + return True + return False + + def attach(self, entity): + self._entities.add(entity) + + def detach(self, entity): + self._entities.discard(entity) + + +class Tileable(Entity): + def __init__(self, data: TileableType = None, **kw): + super().__init__(data=data, **kw) + data = self._data + if data is not None: + data.attach(self) + if data.op.create_view: + entity_view_handler.add_observer(data.inputs[0], self) + + def __copy__(self): + return self._view() + + def _view(self): + return super().copy() + + def copy(self: TileableType) -> TileableType: + new_op = self.op.copy() + if new_op.create_view: + # if the operand is a view, make it a copy + new_op.create_view = False + params = [] + for o in self.op.outputs: + param = o.params + param["_key"] = o.key + param.update(o.extra_params) + params.append(param) + new_outs = new_op.new_tileables( + self.op.inputs, kws=params, output_limit=len(params) + ) + pos = -1 + for i, out in enumerate(self.op.outputs): + # create a ref to copied one + new_out = new_outs[i] + if not hasattr(new_out.data, "_siblings"): + new_out.data._siblings = [] + new_out.data._siblings.append(out) + + if self._data is out: + pos = i + break + assert pos >= 0 + return new_outs[pos] + + @Entity.data.setter + def data(self, new_data): + self._check_data(new_data) + if self._data is None: + self._data = new_data + self._data.attach(self) + else: + entity_view_handler.data_changed(self._data, new_data) + + def execute(self, session=None, **kw): + result = self.data.execute(session=session, **kw) + if isinstance(result, TILEABLE_TYPE): + return self + else: + return result + + +TILEABLE_TYPE = (Tileable, TileableData) + + +class 
HasShapeTileableData(TileableData): + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + + @property + def ndim(self): + return len(self.shape) + + def __len__(self): + try: + return int(self.shape[0]) + except (IndexError, ValueError): # pragma: no cover + return 0 + + @property + def shape(self): + if hasattr(self, "_shape") and self._shape is not None: + return self._shape + if hasattr(self, "_nsplits") and self._nsplits is not None: + self._shape = tuple(builtins.sum(nsplit) for nsplit in self._nsplits) + return self._shape + + def _update_shape(self, new_shape): + self._shape = new_shape + + @property + def size(self): + return np.prod(self.shape).item() + + @property + def params(self): + # params return the properties which useful to rebuild a new tileable object + return {"shape": self.shape} + + def _equals(self, o): + return self is o + + +class HasShapeTileable(Tileable): + __slots__ = () + + @property + def shape(self): + return self._data.shape + + @property + def ndim(self): + return self._data.ndim + + @property + def size(self): + return self._data.size diff --git a/python/xorbits/_mars/core/entity/utils.py b/python/xorbits/_mars/core/entity/utils.py new file mode 100644 index 000000000..02b603f90 --- /dev/null +++ b/python/xorbits/_mars/core/entity/utils.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
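`HasShapeTileableData` recovers its shape by summing `nsplits` along each dimension, while `TileableData.chunk_shape` is just the number of splits per dimension. A quick worked example of those two relationships; the sample `nsplits` is arbitrary:

import numpy as np

# nsplits: for each dimension, the sizes of the chunks along that dimension
nsplits = ((5, 5), (4, 4, 2))

shape = tuple(sum(ns) for ns in nsplits)   # (10, 10)
chunk_shape = tuple(map(len, nsplits))     # (2, 3) -> 6 chunks in total
size = np.prod(shape).item()               # 100 elements

assert shape == (10, 10)
assert chunk_shape == (2, 3)
assert size == 100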
+ +from typing import Generator, List, Union + +from ...typing import ChunkType, TileableType +from ...utils import calc_nsplits, has_unknown_shape + + +def refresh_tileable_shape(tileable): + if tileable.shape is None or has_unknown_shape(tileable): + # update shape + nsplits = calc_nsplits({c.index: c.shape for c in tileable.chunks}) + shape = tuple(sum(ns) for ns in nsplits) + tileable._nsplits = nsplits + tileable._shape = shape + + +def tile(tileable, *tileables: TileableType): + from ..graph import ( + ChunkGraphBuilder, + TileableGraph, + TileableGraphBuilder, + TileContext, + ) + + raw_tileables = target_tileables = [tileable] + list(tileables) + target_tileables = [t.data if hasattr(t, "data") else t for t in target_tileables] + + tileable_graph = TileableGraph(target_tileables) + tileable_graph_builder = TileableGraphBuilder(tileable_graph) + next(tileable_graph_builder.build()) + + # tile + tile_context = TileContext() + chunk_graph_builder = ChunkGraphBuilder( + tileable_graph, fuse_enabled=False, tile_context=tile_context + ) + next(chunk_graph_builder.build()) + + if len(tileables) == 0: + return type(tileable)(tile_context[target_tileables[0]]) + else: + return [ + type(raw_t)(tile_context[t]) + for raw_t, t in zip(raw_tileables, target_tileables) + ] + + +def recursive_tile( + tileable: TileableType, *tileables: TileableType +) -> Generator[ + List[ChunkType], List[ChunkType], Union[TileableType, List[TileableType]] +]: + from .tileables import handler + + return_list = len(tileables) > 0 + if not return_list and isinstance(tileable, (list, tuple)): + return_list = True + raw = tileable + tileable = raw[0] + tileables = raw[1:] + + to_tile = [tileable] + list(tileables) + q = [t for t in to_tile if t.is_coarse()] + while q: + t = q[-1] + if t.is_coarse(): + # t may be put into q repeatedly, + # so we check if it's tiled or not + cs = [c for c in t.inputs if c.is_coarse()] + if cs: + q.extend(cs) + continue + for obj in handler.tile(t.op.outputs): + to_update_inputs = [] + chunks = [] + for inp in t.op.inputs: + chunks.extend(inp.chunks) + if has_unknown_shape(inp): + to_update_inputs.append(inp) + if obj is None: + yield chunks + to_update_inputs + else: + yield obj + to_update_inputs + q.pop() + + if not return_list: + return tileable + else: + return [tileable] + list(tileables) diff --git a/python/xorbits/_mars/core/entrypoints.py b/python/xorbits/_mars/core/entrypoints.py new file mode 100644 index 000000000..53ea7ee6b --- /dev/null +++ b/python/xorbits/_mars/core/entrypoints.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
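`refresh_tileable_shape` above rebuilds `nsplits` from the tiled chunks via `calc_nsplits({c.index: c.shape ...})`. The helper itself lives in the `_mars` utils module; the following is only a rough sketch of what such a computation has to do, not the actual implementation:

from typing import Dict, Tuple

def calc_nsplits_sketch(
    chunk_idx_to_shape: Dict[Tuple[int, ...], Tuple[int, ...]]
) -> Tuple[Tuple[int, ...], ...]:
    """Derive per-dimension chunk sizes from a mapping of chunk index -> chunk shape."""
    ndim = len(next(iter(chunk_idx_to_shape)))
    nsplits = []
    for dim in range(ndim):
        # walk along one axis while keeping all other indices at 0
        sizes = {
            idx[dim]: shape[dim]
            for idx, shape in chunk_idx_to_shape.items()
            if all(i == 0 for d, i in enumerate(idx) if d != dim)
        }
        nsplits.append(tuple(sizes[i] for i in sorted(sizes)))
    return tuple(nsplits)

# a 10x10 tileable split into 2x2 chunks of size 5x5
chunks = {(0, 0): (5, 5), (0, 1): (5, 5), (1, 0): (5, 5), (1, 1): (5, 5)}
assert calc_nsplits_sketch(chunks) == ((5, 5), (5, 5))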
+ +import functools +import logging +import warnings + +logger = logging.getLogger(__name__) + + +# from https://github.com/numba/numba/blob/master/numba/core/entrypoints.py +# Must put this here to avoid extensions re-triggering initialization +@functools.lru_cache(maxsize=None) +def init_extension_entrypoints(): + """Execute all `mars_extensions` entry points with the name `init` + If extensions have already been initialized, this function does nothing. + """ + from pkg_resources import iter_entry_points + + for entry_point in iter_entry_points("mars_extensions", "init"): + logger.info("Loading extension: %s", entry_point) + try: + func = entry_point.load() + func() + except Exception as e: + msg = "Mars extension module '{}' failed to load due to '{}({})'." + warnings.warn( + msg.format(entry_point.module_name, type(e).__name__, str(e)), + stacklevel=2, + ) + logger.info("Extension loading failed for: %s", entry_point) diff --git a/python/xorbits/_mars/core/graph/__init__.py b/python/xorbits/_mars/core/graph/__init__.py new file mode 100644 index 000000000..e298eabdd --- /dev/null +++ b/python/xorbits/_mars/core/graph/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .builder import ChunkGraphBuilder, TileableGraphBuilder, TileContext, TileStatus +from .core import DAG, DirectedGraph, GraphContainsCycleError +from .entity import ChunkGraph, EntityGraph, TileableGraph diff --git a/python/xorbits/_mars/core/graph/builder/__init__.py b/python/xorbits/_mars/core/graph/builder/__init__.py new file mode 100644 index 000000000..2e5f05c3e --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .chunk import ChunkGraphBuilder, TileContext, TileStatus +from .tileable import TileableGraphBuilder diff --git a/python/xorbits/_mars/core/graph/builder/base.py b/python/xorbits/_mars/core/graph/builder/base.py new file mode 100644 index 000000000..dbf56e3bc --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/base.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from abc import ABC, abstractmethod +from typing import Generator, List, Set, Union + +from ....typing import EntityType +from ..entity import ChunkGraph, EntityGraph, TileableGraph + + +def _default_inputs_selector(inputs: List[EntityType]) -> List[EntityType]: + return inputs + + +class AbstractGraphBuilder(ABC): + _graph: EntityGraph + + def __init__(self, graph: EntityGraph): + self._graph = graph + + def _process_node(self, entity: EntityType): + return entity + + def _select_inputs(self, inputs: List[EntityType]): + return inputs + + def _if_add_node( + self, node: EntityType, visited: Set + ): # pylint: disable=no-self-use + return node not in visited + + def _add_nodes( + self, + graph: Union[ChunkGraph, TileableGraph], + nodes: List[EntityType], + visited: Set, + ): + # update visited + visited.update(nodes) + + while len(nodes) > 0: + node = nodes.pop() + node = self._process_node(node) + + # mark node as visited + visited.add(node) + + # add node to graph if possible + if not graph.contains(node): + graph.add_node(node) + + children = self._select_inputs(node.inputs or []) + if children: + node.inputs = children + for c in children: + c = self._process_node(c) + if not graph.contains(c): + graph.add_node(c) + if not graph.has_successor(c, node): + graph.add_edge(c, node) + for out in c.op.outputs: + if self._if_add_node(out, visited): + nodes.append(out) + + @abstractmethod + def build(self) -> Generator[Union[EntityGraph, ChunkGraph], None, None]: + """ + Build a entity graph. + + Returns + ------- + graph : EntityGraph + Entity graph. + """ diff --git a/python/xorbits/_mars/core/graph/builder/chunk.py b/python/xorbits/_mars/core/graph/builder/chunk.py new file mode 100644 index 000000000..06d0a7a87 --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/chunk.py @@ -0,0 +1,430 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
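`AbstractGraphBuilder._add_nodes` walks backwards from the requested nodes through their inputs, adding each input plus an input-to-node edge and queueing further nodes to visit. A toy, dependency-free sketch of that traversal with a stand-in `Node` type; the real builder additionally consults `op.outputs` and the graph's own containment checks:

from dataclasses import dataclass, field
from typing import Dict, List, Set

@dataclass(eq=False)
class Node:
    name: str
    inputs: List["Node"] = field(default_factory=list)

def add_nodes(edges: Dict[Node, Set[Node]], nodes: List[Node], visited: Set[Node]):
    """Walk from `nodes` towards their inputs, recording input -> node edges."""
    visited.update(nodes)
    stack = list(nodes)
    while stack:
        node = stack.pop()
        visited.add(node)
        edges.setdefault(node, set())
        for inp in node.inputs:
            edges.setdefault(inp, set()).add(node)
            if inp not in visited:
                visited.add(inp)
                stack.append(inp)

a = Node("a")
b = Node("b", inputs=[a])
c = Node("c", inputs=[a, b])
graph: Dict[Node, Set[Node]] = {}
add_nodes(graph, [c], set())
assert graph[a] == {b, c} and graph[b] == {c}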
+ +import dataclasses +import functools +from typing import Callable, Dict, Generator, Iterable, List, Optional, Set, Type, Union + +from ....core import CHUNK_TYPE, FUSE_CHUNK_TYPE, TILEABLE_TYPE +from ....typing import ChunkType, EntityType, TileableType +from ....utils import build_fetch, copy_tileables +from ...entity.tileables import handler +from ...mode import enter_mode +from ..entity import ChunkGraph, TileableGraph +from .base import AbstractGraphBuilder + +tile_gen_type = Generator[List[ChunkType], List[ChunkType], List[TileableType]] +DEFAULT_UPDATED_PROGRESS = 0.4 + + +@dataclasses.dataclass +class _TileableHandler: + tileable: TileableType + handler: tile_gen_type + last_need_processes: List[EntityType] = None + + +@dataclasses.dataclass +class _TileableTileInfo: + curr_iter: int + # incremental progress for this iteration + tile_progress: float + # newly generated chunks by a tileable in this iteration + generated_chunks: List[ChunkType] = dataclasses.field(default_factory=list) + + +class TileContext(Dict[TileableType, TileableType]): + _tileables = Set[TileableType] + _tileable_to_progress: Dict[TileableType, float] + _tileable_to_tile_infos: Dict[TileableType, List[_TileableTileInfo]] + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self._tileables = None + self._tileable_to_progress = dict() + self._tileable_to_tile_infos = dict() + + def set_tileables(self, tileables: Set[TileableType]): + self._tileables = tileables + + def __setitem__(self, key, value): + self._tileable_to_progress.pop(key, None) + return super().__setitem__(key, value) + + def set_progress(self, tileable: TileableType, progress: float): + assert 0.0 <= progress <= 1.0 + last_progress = self._tileable_to_progress.get(tileable, 0.0) + self._tileable_to_progress[tileable] = max(progress, last_progress) + + def get_progress(self, tileable: TileableType) -> float: + if tileable in self: + return 1.0 + else: + return self._tileable_to_progress.get(tileable, 0.0) + + def get_all_progress(self) -> float: + return sum(self.get_progress(t) for t in self._tileables) / len(self._tileables) + + def record_tileable_tile_info( + self, tileable: TileableType, curr_iter: int, generated_chunks: List[ChunkType] + ): + if tileable not in self._tileable_to_tile_infos: + self._tileable_to_tile_infos[tileable] = [] + prev_progress = sum( + info.tile_progress for info in self._tileable_to_tile_infos[tileable] + ) + curr_progress = self.get_progress(tileable) + infos = self._tileable_to_tile_infos[tileable] + infos.append( + _TileableTileInfo( + curr_iter=curr_iter, + tile_progress=curr_progress - prev_progress, + generated_chunks=generated_chunks, + ) + ) + + def get_tileable_tile_infos(self) -> Dict[TileableType, List[_TileableTileInfo]]: + return {t: self._tileable_to_tile_infos.get(t, list()) for t in self._tileables} + + +@dataclasses.dataclass +class TileStatus: + entities: List[EntityType] = None + progress: float = None + + +class Tiler: + _cur_iter: int + _cur_chunk_graph: Optional[ChunkGraph] + _tileable_handlers: Iterable[_TileableHandler] + + def __init__( + self, + tileable_graph: TileableGraph, + tile_context: TileContext, + processed_chunks: Set[str], + chunk_to_fetch: Dict[ChunkType, ChunkType], + add_nodes: Callable, + ): + self._tileable_graph = tileable_graph + self._tile_context = tile_context + self._processed_chunks = processed_chunks + self._chunk_to_fetch = chunk_to_fetch + self._add_nodes = self._wrap_add_nodes(add_nodes) + self._curr_iter = 0 + self._cur_chunk_graph = None + 
self._tileable_handlers = ( + _TileableHandler(tileable, self._tile_handler(tileable)) + for tileable in tileable_graph.topological_iter() + ) + + def _wrap_add_nodes(self, add_nodes: Callable): + @functools.wraps(add_nodes) + def inner( + chunk_graph: ChunkGraph, + chunks: List[ChunkType], + visited: Set[ChunkType], + tileable: TileableType, + ): + prev_chunks = set(chunk_graph) + add_nodes(chunk_graph, chunks, visited) + new_chunks = set(chunk_graph) + self._tile_context.record_tileable_tile_info( + tileable, self._curr_iter, list(new_chunks - prev_chunks) + ) + + return inner + + @staticmethod + def _get_data(entity: EntityType): + return entity.data if hasattr(entity, "data") else entity + + def _tile_handler(self, tileable: TileableType) -> tile_gen_type: + from ....core.operand import Fetch + + tileable = self._get_data(tileable) + + if isinstance(tileable.op, Fetch) and not tileable.is_coarse(): + return [tileable] + + assert tileable.is_coarse() + + # copy tileable + tiled_tileables = copy_tileables( + tileable.op.outputs, + inputs=[self._tile_context[inp] for inp in tileable.inputs], + copy_key=True, + copy_id=False, + ) + tiled_tileables = [self._get_data(t) for t in tiled_tileables] + # start to tile + tiled_tileables = yield from handler.tile(tiled_tileables) + return tiled_tileables + + def _gen_tileable_handlers(self, next_tileable_handlers: List[_TileableHandler]): + for tile_handler in self._tileable_handlers: + tileable, handler = tile_handler.tileable, tile_handler.handler + if tileable in self._tile_context: + continue + if any( + inp not in self._tile_context + for inp in self._tileable_graph.predecessors(tileable) + ): + # predecessors not finished yet + next_tileable_handlers.append(_TileableHandler(tileable, handler)) + continue + + yield _TileableHandler(tileable, handler) + + def _tile( + self, + chunk_graph: ChunkGraph, + tileable: TileableType, + tile_handler: tile_gen_type, + next_tileable_handlers: List[_TileableHandler], + to_update_tileables: List[TileableType], + visited: Set[EntityType], + ): + try: + need_process = next(tile_handler) + + if isinstance(need_process, TileStatus): + # process tile that returns progress + self._tile_context.set_progress(tileable, need_process.progress) + need_process = need_process.entities + else: + # if progress not specified, we just update 0.4 * rest progress + progress = self._tile_context.get_progress(tileable) + new_progress = progress + (1.0 - progress) * DEFAULT_UPDATED_PROGRESS + self._tile_context.set_progress(tileable, new_progress) + + chunks = [] + if need_process is not None: + for t in need_process: + if isinstance(t, CHUNK_TYPE): + chunks.append(self._get_data(t)) + elif isinstance(t, TILEABLE_TYPE): + to_update_tileables.append(self._get_data(t)) + # not finished yet + self._add_nodes(chunk_graph, chunks.copy(), visited, tileable) + next_tileable_handlers.append( + _TileableHandler(tileable, tile_handler, need_process) + ) + # add intermediate chunks into result chunks + # to prevent them being pruned + chunk_graph.result_chunks.extend(c for c in chunks if c in chunk_graph) + except StopIteration as e: + # tile done + tiled_tileables = e.value + for out, tiled_tileable in zip(tileable.op.outputs, tiled_tileables): + out = self._get_data(out) + tiled_tileable = self._get_data(tiled_tileable) + + chunks = tiled_tileable.chunks + if chunks is None: # pragma: no cover + raise ValueError(f"tileable({out}) is still coarse after tile") + chunks = [self._get_data(c) for c in chunks] + self._tile_context[out] = 
tiled_tileable + self._add_nodes(chunk_graph, chunks, visited, tileable) + + def _gen_result_chunks( + self, + chunk_graph: ChunkGraph, + next_tileable_handlers: List[_TileableHandler], + ): + result_chunks = chunk_graph.result_chunks + tileable_graph = self._tileable_graph + result_chunk_set = set(result_chunks) + + def _add_result_chunk(c): + if c not in result_chunk_set: + result_chunks.append(c) + result_chunk_set.add(c) + + if next_tileable_handlers: + for tileable_handler in next_tileable_handlers: + tileable = tileable_handler.tileable + # tileable that tile not completed, scan their inputs + for inp_tileable in tileable_graph.iter_predecessors(tileable): + if ( + tileable_handler.last_need_processes is None + or tileable_graph.count_successors(inp_tileable) > 1 + ): + # if nothing yielded inside its tile, + # or the input has more than 1 successors, + # make sure their chunks in result, + # so that they will not be executed repeatedly + if inp_tileable in self._tile_context: + for chunk in self._tile_context[inp_tileable].chunks: + chunk = self._get_data(chunk) + if chunk in chunk_graph: + _add_result_chunk(chunk) + for tileable in tileable_graph.result_tileables: + if tileable in self._tile_context: + for chunk in self._tile_context[tileable].chunks: + chunk = self._get_data(chunk) + if chunk in chunk_graph: + _add_result_chunk(chunk) + if ( + chunk in self._chunk_to_fetch + and self._chunk_to_fetch[chunk] in chunk_graph + ): + _add_result_chunk(self._chunk_to_fetch[chunk]) + + def _iter(self): + chunk_graph = self._cur_chunk_graph + + to_update_tileables = [] + visited = set() + + if chunk_graph is not None: + # last tiled chunks, add them to processed + # so that fetch chunk can be generated. + # Use chunk key as the key to make sure the copied chunk can be build to a fetch. 
+ processed_chunks = ( + c.chunk.key if isinstance(c, FUSE_CHUNK_TYPE) else c.key + for c in chunk_graph.result_chunks + ) + self._processed_chunks.update(processed_chunks) + + result_chunks = [] + chunk_graph = self._cur_chunk_graph = ChunkGraph(result_chunks) + + next_tileable_handlers = [] + # tile + for tile_handler in self._gen_tileable_handlers(next_tileable_handlers): + self._tile( + chunk_graph, + tile_handler.tileable, + tile_handler.handler, + next_tileable_handlers, + to_update_tileables, + visited, + ) + self._tileable_handlers = next_tileable_handlers + # gen result chunks + self._gen_result_chunks(chunk_graph, next_tileable_handlers) + # prune unused chunks + prune_chunk_graph(chunk_graph) + + self._curr_iter += 1 + + return to_update_tileables + + def __iter__(self): + while self._tileable_handlers: + to_update_tileables = self._iter() + yield self._cur_chunk_graph + for t in to_update_tileables: + t.refresh_params() + + +def prune_chunk_graph(chunk_graph: ChunkGraph): + from ....core.operand import Fetch, ShuffleProxy, VirtualOperand + + result_set = set(chunk_graph.result_chunks) + stack = list(chunk_graph.result_chunks) + used = set() + while stack: + n = stack.pop() + if n in used: + continue + used.add(n) + stack.extend(chunk_graph.predecessors(n)) + if isinstance(n.op, ShuffleProxy): + stack.extend( + succ for succ in chunk_graph.iter_successors(n) if succ not in used + ) + + unused = {n for n in chunk_graph if n not in used} + for n in unused: + # for pruned chunks, we assume we will use them later, + # so we add the inputs of them into result chunks, + # to prevent from duplicated submission + for inp in chunk_graph.iter_predecessors(n): + if ( + inp in used + and inp not in result_set + and not isinstance(inp.op, (Fetch, VirtualOperand)) + ): + chunk_graph.result_chunks.append(inp) + result_set.add(inp) + # prune chunk + chunk_graph.remove_node(n) + + +class ChunkGraphBuilder(AbstractGraphBuilder): + _graph: TileableGraph + + def __init__( + self, + graph: TileableGraph, + fuse_enabled: bool = True, + tile_context: TileContext = None, + tiler_cls: Union[Type[Tiler], Callable] = None, + ): + super().__init__(graph) + self.fuse_enabled = fuse_enabled + self.tile_context = TileContext() if tile_context is None else tile_context + self.tile_context.set_tileables(set(graph)) + + self._processed_chunks: Set[str] = set() + self._chunk_to_fetch: Dict[ChunkType, ChunkType] = dict() + + tiler_cls = Tiler if tiler_cls is None else tiler_cls + self.tiler = tiler_cls( + self._graph, + self.tile_context, + self._processed_chunks, + self._chunk_to_fetch, + self._add_nodes, + ) + + def _process_node(self, entity: EntityType): + if entity.key in self._processed_chunks: + if entity not in self._chunk_to_fetch: + # gen fetch + fetch_chunk = build_fetch(entity).data + self._chunk_to_fetch[entity] = fetch_chunk + return self._chunk_to_fetch[entity] + return entity + + def _select_inputs(self, inputs: List[ChunkType]): + new_inputs = [] + for inp in inputs: + if inp.key in self._processed_chunks: + # gen fetch + if inp not in self._chunk_to_fetch: + fetch_chunk = build_fetch(inp).data + self._chunk_to_fetch[inp] = fetch_chunk + new_inputs.append(self._chunk_to_fetch[inp]) + else: + new_inputs.append(inp) + return new_inputs + + def _if_add_node(self, node: EntityType, visited: Set): + return node not in visited and node.key not in self._processed_chunks + + def _build(self) -> Iterable[Union[TileableGraph, ChunkGraph]]: + tile_iterator = iter(self.tiler) + while True: + try: + with 
enter_mode(build=True, kernel=True): + graph = next(tile_iterator) + yield graph + except StopIteration: + break + + def build(self) -> Generator[Union[TileableGraph, ChunkGraph], None, None]: + yield from self._build() diff --git a/python/xorbits/_mars/core/graph/builder/tileable.py b/python/xorbits/_mars/core/graph/builder/tileable.py new file mode 100644 index 000000000..65d3c9ecb --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/tileable.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Generator, Union + +from ...mode import enter_mode +from ..entity import ChunkGraph, TileableGraph +from .base import AbstractGraphBuilder + + +class TileableGraphBuilder(AbstractGraphBuilder): + _graph: TileableGraph + + def __init__(self, graph: TileableGraph): + super().__init__(graph=graph) + + @enter_mode(build=True, kernel=True) + def _build(self) -> Union[TileableGraph, ChunkGraph]: + self._add_nodes(self._graph, list(self._graph.result_tileables), set()) + return self._graph + + def build(self) -> Generator[Union[TileableGraph, ChunkGraph], None, None]: + yield self._build() diff --git a/python/xorbits/_mars/core/graph/builder/utils.py b/python/xorbits/_mars/core/graph/builder/utils.py new file mode 100644 index 000000000..e32d9e4f5 --- /dev/null +++ b/python/xorbits/_mars/core/graph/builder/utils.py @@ -0,0 +1,41 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
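Driving the two builders by hand follows the same recipe as the `tile()` helper in `entity/utils.py`: build the coarse `TileableGraph` first, then hand it to a `ChunkGraphBuilder` and iterate its generator. A usage sketch, assuming a compiled `xorbits._mars` build is importable:

# usage sketch; requires a built xorbits._mars package
from xorbits._mars import tensor as mt
from xorbits._mars.core.graph import (
    ChunkGraphBuilder,
    TileableGraph,
    TileableGraphBuilder,
    TileContext,
)

t = mt.random.rand(10, chunk_size=5) + 1

# 1. build the coarse-grained tileable graph
tileable_graph = TileableGraph([t.data])
next(TileableGraphBuilder(tileable_graph).build())

# 2. tile it into chunk graphs; each iteration of the generator yields
#    the chunk graph produced by that round of tiling
tile_context = TileContext()
builder = ChunkGraphBuilder(tileable_graph, fuse_enabled=False, tile_context=tile_context)
for chunk_graph in builder.build():
    print(len(chunk_graph), "chunks in this iteration")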
+ +import itertools +from typing import List, Union + +from ....typing import TileableType +from ...mode import enter_mode +from ..entity import ChunkGraph, TileableGraph +from .chunk import ChunkGraphBuilder +from .tileable import TileableGraphBuilder + + +@enter_mode(kernel=True) +def build_graph( + tileables: List[TileableType], + tile: bool = False, + fuse_enabled: bool = True, + **chunk_graph_build_kwargs +) -> Union[TileableGraph, ChunkGraph]: + tileables = list(itertools.chain(*(tileable.op.outputs for tileable in tileables))) + tileable_graph = TileableGraph(tileables) + tileable_graph_builder = TileableGraphBuilder(tileable_graph) + tileable_graph = next(tileable_graph_builder.build()) + if not tile: + return tileable_graph + chunk_graph_builder = ChunkGraphBuilder( + tileable_graph, fuse_enabled=fuse_enabled, **chunk_graph_build_kwargs + ) + return next(chunk_graph_builder.build()) diff --git a/python/xorbits/_mars/core/graph/core.pyx b/python/xorbits/_mars/core/graph/core.pyx new file mode 100644 index 000000000..9134ae4c2 --- /dev/null +++ b/python/xorbits/_mars/core/graph/core.pyx @@ -0,0 +1,464 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +from collections import deque +from io import StringIO + +logger = logging.getLogger(__name__) + + +cdef class DirectedGraph: + cdef readonly: + dict _nodes + dict _predecessors + dict _successors + + def __init__(self): + self._nodes = dict() + self._predecessors = dict() + self._successors = dict() + + def __iter__(self): + return iter(self._nodes) + + def __contains__(self, n): + return n in self._nodes + + def __len__(self): + return len(self._nodes) + + def __getitem__(self, n): + return self._successors[n] + + def contains(self, node): + return node in self._nodes + + def add_node(self, node, node_attr=None, **node_attrs): + if node_attr is None: + node_attr = node_attrs + else: + try: + node_attr.update(node_attrs) + except AttributeError: + raise TypeError('The node_attr argument must be a dictionary') + self._add_node(node, node_attr) + + cdef inline _add_node(self, node, dict node_attr=None): + if node_attr is None: + node_attr = dict() + if node not in self._nodes: + self._nodes[node] = node_attr + self._successors[node] = dict() + self._predecessors[node] = dict() + else: + self._nodes[node].update(node_attr) + + def remove_node(self, node): + if node not in self._nodes: + raise KeyError(f'Node {node} does not exist ' + f'in the directed graph') + + del self._nodes[node] + + for succ in self._successors[node]: + del self._predecessors[succ][node] + del self._successors[node] + + for pred in self._predecessors[node]: + del self._successors[pred][node] + del self._predecessors[node] + + def add_edge(self, u, v, edge_attr=None, **edge_attrs): + if edge_attr is None: + edge_attr = edge_attrs + else: + try: + edge_attr.update(edge_attrs) + except AttributeError: + raise TypeError('The edge_attr argument must be a dictionary') + self._add_edge(u, v, edge_attr) + + cdef inline 
_add_edge(self, u, v, edge_attr=None): + cdef: + dict u_succ, v_pred + + if u not in self._nodes: + raise KeyError(f'Node {u} does not exist in the directed graph') + if v not in self._nodes: + raise KeyError(f'Node {v} does not exist in the directed graph') + + if edge_attr is None: + edge_attr = dict() + + u_succ = self._successors[u] + if v in u_succ: + u_succ[v].update(edge_attr) + else: + u_succ[v] = edge_attr + + v_pred = self._predecessors[v] + if u not in v_pred: + # `update` is not necessary, as they point to the same object + v_pred[u] = edge_attr + + def remove_edge(self, u, v): + try: + del self._successors[u][v] + del self._predecessors[v][u] + except KeyError: + raise KeyError(f'Edge {u}->{v} does not exist in the directed graph') + + def has_successor(self, u, v): + return (u in self._successors) and (v in self._successors[u]) + + def has_predecessor(self, u, v): + return (u in self._predecessors) and (v in self._predecessors[u]) + + def iter_nodes(self, data=False): + if data: + return iter(self._nodes.items()) + return iter(self._nodes) + + def iter_successors(self, n): + try: + return iter(self._successors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + cpdef list successors(self, n): + try: + return list(self._successors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + def iter_predecessors(self, n): + try: + return iter(self._predecessors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + cpdef list predecessors(self, n): + try: + return list(self._predecessors[n]) + except KeyError: + raise KeyError(f'Node {n} does not exist in the directed graph') + + cpdef int count_successors(self, n): + return len(self._successors[n]) + + cpdef int count_predecessors(self, n): + return len(self._predecessors[n]) + + def iter_indep(self, bint reverse=False): + cdef dict preds + preds = self._predecessors if not reverse else self._successors + for n, p in preds.items(): + if len(p) == 0: + yield n + + cpdef int count_indep(self, reverse=False): + cdef: + dict preds + int result = 0 + preds = self._predecessors if not reverse else self._successors + for n, p in preds.items(): + if len(p) == 0: + result += 1 + return result + + def dfs(self, start=None, visit_predicate=None, successors=None, reverse=False): + cdef: + set visited = set() + list stack + bint visit_all = False + + if reverse: + pred_fun, succ_fun = self.successors, self.predecessors + else: + pred_fun, succ_fun = self.predecessors, self.successors + + if start: + if not isinstance(start, (list, tuple)): + start = [start] + stack = list(start) + else: + stack = list(self.iter_indep(reverse=reverse)) + + def _default_visit_predicate(n, visited): + cdef list preds + preds = pred_fun(n) + return not preds or all(pred in visited for pred in preds) + + successors = successors or succ_fun + visit_all = (visit_predicate == 'all') + visit_predicate = visit_predicate or _default_visit_predicate + + while stack: + node = stack.pop() + if node in visited: + continue + preds = self.predecessors(node) + if visit_all or visit_predicate(node, visited): + yield node + visited.add(node) + stack.extend(n for n in successors(node) if n not in visited) + else: + stack.append(node) + stack.extend(n for n in preds if n not in visited) + + def bfs(self, start=None, visit_predicate=None, successors=None, reverse=False): + cdef: + object queue + object node + set visited = set() + bint visit_all = False + + if 
reverse: + pred_fun, succ_fun = self.successors, self.predecessors + else: + pred_fun, succ_fun = self.predecessors, self.successors + + if start is not None: + if not isinstance(start, (list, tuple)): + start = [start] + queue = deque(start) + else: + queue = deque(self.iter_indep(reverse=reverse)) + + def _default_visit_predicate(n, visited): + preds = pred_fun(n) + return not preds or all(pred in visited for pred in preds) + + successors = successors or succ_fun + visit_all = (visit_predicate == 'all') + visit_predicate = visit_predicate or _default_visit_predicate + + while queue: + node = queue.popleft() + if node in visited: + continue + preds = pred_fun(node) + if visit_all or visit_predicate(node, visited): + yield node + visited.add(node) + queue.extend(n for n in successors(node) if n not in visited) + else: + queue.append(node) + queue.extend(n for n in preds if n not in visited) + + def copy(self): + cdef DirectedGraph graph = type(self)() + for n in self: + if n not in graph._nodes: + graph._add_node(n) + for succ in self.iter_successors(n): + if succ not in graph._nodes: + graph._add_node(succ) + graph._add_edge(n, succ) + return graph + + def copyto(self, DirectedGraph other_graph): + if other_graph is self: + return + + other_graph._nodes = self._nodes.copy() + other_graph._predecessors = self._predecessors.copy() + other_graph._successors = self._successors.copy() + + def build_undirected(self): + cdef DirectedGraph graph = DirectedGraph() + for n in self: + if n not in graph._nodes: + graph._add_node(n) + for succ in self._successors[n]: + if succ not in graph._nodes: + graph._add_node(succ) + graph._add_edge(n, succ) + graph._add_edge(succ, n) + return graph + + def build_reversed(self): + cdef DirectedGraph graph = type(self)() + for n in self: + if n not in graph._nodes: + graph._add_node(n) + for succ in self._successors[n]: + if succ not in graph._nodes: + graph._add_node(succ) + graph._add_edge(succ, n) + return graph + + @classmethod + def _repr_in_dot(cls, val): + if isinstance(val, bool): + return 'true' if val else 'false' + if isinstance(val, str): + return f'"{val}"' + return val + + def _extract_operands(self, node): + return [node.op] + + def to_dot( + self, + graph_attrs=None, + node_attrs=None, + trunc_key=5, result_chunk_keys=None, show_columns=False): + + sio = StringIO() + sio.write('digraph {\n') + sio.write('splines=curved\n') + sio.write('rankdir=BT\n') + + if graph_attrs: + sio.write('graph [{0}];\n'.format( + ' '.join(f'{k}={self._repr_in_dot(v)}' for k, v in graph_attrs.items()))) + if node_attrs: + sio.write('node [{0}];\n'.format( + ' '.join(f'{k}={self._repr_in_dot(v)}' for k, v in node_attrs.items()))) + + chunk_style = '[shape=box]' + operand_style = '[shape=circle]' + + visited = set() + + def get_col_names(obj): + if hasattr(obj, "dtypes"): + return f"\"{','.join(list(obj.dtypes.index))}\"" + elif hasattr(obj, "name"): + return f"\"{obj.name}\"" + else: + return "\"N/A\"" + + for node in self.iter_nodes(): + for op in self._extract_operands(node): + op_name = type(op).__name__ + if op.stage is not None: + op_name = f'{op_name}:{op.stage.name}' + if op.key in visited: + continue + for input_chunk in (op.inputs or []): + if input_chunk.key not in visited: + sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, trunc_key)}" {chunk_style}\n') + visited.add(input_chunk.key) + if op.key not in visited: + sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operand_style}\n') + visited.add(op.key) + sio.write(f'"Chunk:{self._gen_chunk_key(input_chunk, 
trunc_key)}" -> ' + f'"{op_name}:{op.key[:trunc_key]}"\n') + + for output_chunk in (op.outputs or []): + if output_chunk.key not in visited: + tmp_chunk_style = chunk_style + if result_chunk_keys and output_chunk.key in result_chunk_keys: + tmp_chunk_style = '[shape=box,style=filled,fillcolor=cadetblue1]' + sio.write(f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}" {tmp_chunk_style}\n') + visited.add(output_chunk.key) + if op.key not in visited: + sio.write(f'"{op_name}:{op.key[:trunc_key]}" {operand_style}\n') + visited.add(op.key) + sio.write(f'"{op_name}:{op.key[:trunc_key]}" -> ' + f'"Chunk:{self._gen_chunk_key(output_chunk, trunc_key)}"') + if show_columns: + sio.write(f' [ label={get_col_names(output_chunk)} ]') + sio.write("\n") + + sio.write('}') + return sio.getvalue() + + @classmethod + def _gen_chunk_key(cls, chunk, trunc_key): + if "_" in chunk.key: + key, index = chunk.key.split("_", 1) + return "_".join([key[:trunc_key], index]) + else: # pragma: no cover + return chunk.key[:trunc_key] + + def _repr_svg_(self): # pragma: no cover + from graphviz import Source + return Source(self.to_dot())._repr_svg_() + + def compose(self, list keys=None): + from ...optimizes.chunk_graph.fuse import Fusion + + return Fusion(self).compose(keys=keys) + + def decompose(self, nodes=None): + from ...optimizes.chunk_graph.fuse import Fusion + + Fusion(self).decompose(nodes=nodes) + + def view(self, filename='default', graph_attrs=None, node_attrs=None, result_chunk_keys=None, show_columns=False): # pragma: no cover + from graphviz import Source + + g = Source(self.to_dot(graph_attrs, node_attrs, result_chunk_keys=result_chunk_keys, show_columns=show_columns)) + g.view(filename=filename, cleanup=True) + + def to_dag(self): + dag = DAG() + dag._nodes = self._nodes.copy() + dag._predecessors = self._predecessors.copy() + dag._successors = self._successors.copy() + return dag + + +class GraphContainsCycleError(Exception): + pass + + +cdef class DAG(DirectedGraph): + def to_dag(self): + return self + + def topological_iter(self, succ_checker=None, reverse=False): + cdef: + dict preds, succs + set visited = set() + list stack + + if len(self) == 0: + return + + if reverse: + preds, succs = self._successors, self._predecessors + else: + preds, succs = self._predecessors, self._successors + + # copy predecessors and successors + succs = dict((k, set(v)) for k, v in succs.items()) + preds = dict((k, set(v)) for k, v in preds.items()) + + def _default_succ_checker(_, predecessors): + return len(predecessors) == 0 + + succ_checker = succ_checker or _default_succ_checker + + stack = list((p for p, l in preds.items() if len(l) == 0)) + if not stack: + raise GraphContainsCycleError + while stack: + node = stack.pop() + yield node + visited.add(node) + for succ in succs.get(node, {}): + if succ in visited: + raise GraphContainsCycleError + succ_preds = preds[succ] + succ_preds.remove(node) + if succ_checker(succ, succ_preds): + stack.append(succ) + if len(visited) != len(self): + raise GraphContainsCycleError diff --git a/python/xorbits/_mars/core/graph/entity.py b/python/xorbits/_mars/core/graph/entity.py new file mode 100644 index 000000000..8672cdbec --- /dev/null +++ b/python/xorbits/_mars/core/graph/entity.py @@ -0,0 +1,170 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABCMeta, abstractmethod +from typing import Dict, Iterable, List, Tuple, Union + +from ...core import Chunk, Tileable +from ...serialization.core import buffered +from ...serialization.serializables import BoolField, DictField, ListField, Serializable +from ...serialization.serializables.core import SerializableSerializer +from ...utils import tokenize +from .core import DAG + + +class EntityGraph(DAG, metaclass=ABCMeta): + @property + @abstractmethod + def results(self): + """ + Return result tileables or chunks. + + Returns + ------- + results + """ + + @results.setter + @abstractmethod + def results(self, new_results): + """ + Set result tileables or chunks. + + Parameters + ---------- + new_results + + Returns + ------- + + """ + + def copy(self) -> "EntityGraph": + graph = super().copy() + graph.results = self.results.copy() + return graph + + +class TileableGraph(EntityGraph, Iterable[Tileable]): + _result_tileables: List[Tileable] + # logic key is a unique and deterministic key for `TileableGraph`. For + # multiple runs the logic key will remain same if the computational logic + # doesn't change. And it can be used to some optimization when running a + # same `execute`, like HBO. + _logic_key: str + + def __init__(self, result_tileables: List[Tileable] = None): + super().__init__() + self._result_tileables = result_tileables + + @property + def result_tileables(self): + return self._result_tileables + + @property + def results(self): + return self._result_tileables + + @results.setter + def results(self, new_results): + self._result_tileables = new_results + + @property + def logic_key(self): + if not hasattr(self, "_logic_key") or self._logic_key is None: + token_keys = [] + for node in self.bfs(): + logic_key = node.op.get_logic_key() + if hasattr(node.op, "logic_key") and node.op.logic_key is None: + node.op.logic_key = logic_key + token_keys.append( + tokenize(logic_key, **node.extra_params) + if node.extra_params + else logic_key + ) + self._logic_key = tokenize(*token_keys) + return self._logic_key + + +class ChunkGraph(EntityGraph, Iterable[Chunk]): + _result_chunks: List[Chunk] + + def __init__(self, result_chunks: List[Chunk] = None): + super().__init__() + self._result_chunks = result_chunks + + @property + def result_chunks(self): + return self._result_chunks + + @property + def results(self): + return self._result_chunks + + @results.setter + def results(self, new_results): + self._result_chunks = new_results + + +class SerializableGraph(Serializable): + _is_chunk = BoolField("is_chunk") + # TODO(qinxuye): remove this logic when we handle fetch elegantly, + # now, the node in the graph and inputs for operand may be inconsistent, + # for example, an operand's inputs may be chunks, + # but in the graph, the predecessors are all fetch chunks, + # we serialize the fetch chunks first to make sure when operand's inputs + # are serialized, they will just be marked as serialized and skip serialization. 
+ _fetch_nodes = ListField("fetch_nodes") + _nodes = DictField("nodes") + _predecessors = DictField("predecessors") + _successors = DictField("successors") + _results = ListField("results") + + @classmethod + def from_graph(cls, graph: Union[TileableGraph, ChunkGraph]) -> "SerializableGraph": + from ..operand import Fetch + + is_chunk = isinstance(graph, ChunkGraph) + return SerializableGraph( + _is_chunk=is_chunk, + _fetch_nodes=[chunk for chunk in graph if isinstance(chunk.op, Fetch)], + _nodes=graph._nodes, + _predecessors=graph._predecessors, + _successors=graph._successors, + _results=graph.results, + ) + + def to_graph(self) -> Union[TileableGraph, ChunkGraph]: + graph_cls = ChunkGraph if self._is_chunk else TileableGraph + graph = graph_cls(self._results) + graph._nodes.update(self._nodes) + graph._predecessors.update(self._predecessors) + graph._successors.update(self._successors) + return graph + + +class GraphSerializer(SerializableSerializer): + @buffered + def serial(self, obj: Union[TileableGraph, ChunkGraph], context: Dict): + serializable_graph = SerializableGraph.from_graph(obj) + return (), [serializable_graph], False + + def deserial( + self, serialized: Tuple, context: Dict, subs: List + ) -> Union[TileableGraph, ChunkGraph]: + serializable_graph: SerializableGraph = subs[0] + return serializable_graph.to_graph() + + +GraphSerializer.register(EntityGraph) diff --git a/python/xorbits/_mars/core/graph/tests/__init__.py b/python/xorbits/_mars/core/graph/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/core/graph/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/core/graph/tests/test_graph.py b/python/xorbits/_mars/core/graph/tests/test_graph.py new file mode 100644 index 000000000..ed9be63bf --- /dev/null +++ b/python/xorbits/_mars/core/graph/tests/test_graph.py @@ -0,0 +1,211 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....tests.core import flaky +from ....utils import to_str +from .. 
import DAG, GraphContainsCycleError + + +def test_dag(): + r""" + 1 --- 4 + 2 --- 6 + \ / + 5 + / + 3 + """ + + dag = DAG() + [dag.add_node(i) for i in range(1, 7)] + dag.add_edge(1, 4) + dag.add_edge(2, 6) + dag.add_edge(2, 5) + dag.add_edge(5, 6) + dag.add_edge(3, 5) + + with pytest.raises(KeyError): + dag.add_edge(1, 10) + with pytest.raises(KeyError): + dag.add_edge(10, 1) + + assert set(dag[2]) == {5, 6} + assert list(dag.topological_iter()) == [3, 2, 5, 6, 1, 4] + + assert list(dag.dfs()) == [3, 2, 5, 6, 1, 4] + assert list(dag.bfs()) == [1, 2, 3, 4, 5, 6] + + dag.add_edge(6, 1) + dag.add_edge(1, 2) + + with pytest.raises(KeyError): + for _ in dag.iter_predecessors(-1): + pass + + with pytest.raises(KeyError): + for _ in dag.iter_successors(-1): + pass + + with pytest.raises(GraphContainsCycleError): + _ = list(dag.topological_iter()) + + dag.remove_edge(2, 5) + assert dag.has_successor(2, 5) is False + with pytest.raises(KeyError): + dag.remove_edge(2, 5) + + rev_dag = dag.build_reversed() + for n in dag: + assert n in rev_dag + assert ( + all(rev_dag.has_successor(n, pred) for pred in dag.predecessors(n)) is True + ) + + undigraph = dag.build_undirected() + for n in dag: + assert n in undigraph + assert ( + all(undigraph.has_predecessor(pred, n) for pred in dag.predecessors(n)) + is True + ) + assert ( + all(undigraph.has_successor(n, pred) for pred in dag.predecessors(n)) + is True + ) + + dag_copy = dag.copy() + for n in dag: + assert n in dag_copy + assert ( + all(dag_copy.has_successor(pred, n) for pred in dag_copy.predecessors(n)) + is True + ) + + +@flaky(max_runs=3) +def test_to_dot(): + arr = mt.random.randint(10, size=(10, 8), chunk_size=4) + arr_add = mt.random.randint(10, size=(10, 8), chunk_size=4) + arr2 = arr + arr_add + graph = arr2.build_graph(fuse_enabled=False, tile=True) + + dot = to_str(graph.to_dot(trunc_key=5)) + assert all(to_str(n.key)[:5] in dot for n in graph) is True + + +def test_tileable_graph_logic_key(): + # Tensor + t1 = mt.random.randint(10, size=(10, 8), chunk_size=4) + t2 = mt.random.randint(10, size=(10, 8), chunk_size=5) + graph1 = (t1 + t2).build_graph(tile=False) + tt1 = mt.random.randint(10, size=(10, 8), chunk_size=4) + tt2 = mt.random.randint(10, size=(10, 8), chunk_size=5) + graph2 = (tt1 + tt2).build_graph(tile=False) + assert graph1.logic_key == graph2.logic_key + t3 = mt.random.randint(10, size=(10, 8), chunk_size=6) + tt3 = mt.random.randint(10, size=(10, 8), chunk_size=6) + graph3 = (t1 + t3).build_graph(tile=False) + graph4 = (t1 + tt3).build_graph(tile=False) + assert graph1.logic_key != graph3.logic_key + assert graph3.logic_key == graph4.logic_key + t4 = mt.random.randint(10, size=(10, 8)) + graph5 = (t1 + t4).build_graph(tile=False) + assert graph1.logic_key != graph5.logic_key + + # Series + s1 = md.Series([1, 3, 5, mt.nan, 6, 8]) + s2 = md.Series(np.random.randn(1000), chunk_size=100) + graph1 = (s1 + s2).build_graph(tile=False) + ss1 = md.Series([1, 3, 5, mt.nan, 6, 8]) + ss2 = md.Series(np.random.randn(1000), chunk_size=100) + graph2 = (ss1 + ss2).build_graph(tile=False) + assert graph1.logic_key == graph2.logic_key + s3 = md.Series(np.random.randn(1000), chunk_size=200) + ss3 = md.Series(np.random.randn(1000), chunk_size=200) + graph3 = (s1 + s3).build_graph(tile=False) + graph4 = (s1 + ss3).build_graph(tile=False) + assert graph1.logic_key != graph3.logic_key + assert graph3.logic_key == graph4.logic_key + s4 = md.Series(np.random.randn(1000)) + graph5 = (s1 + s4).build_graph(tile=False) + assert graph1.logic_key != 
graph5.logic_key + + # DataFrame + df1 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=5 + ) + df2 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=4 + ) + graph1 = (df1 + df2).build_graph(tile=False) + ddf1 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=5 + ) + ddf2 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=4 + ) + graph2 = (ddf1 + ddf2).build_graph(tile=False) + assert graph1.logic_key == graph2.logic_key + df3 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=3 + ) + ddf3 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD"), chunk_size=3 + ) + graph3 = (df1 + df3).build_graph(tile=False) + graph4 = (df1 + ddf3).build_graph(tile=False) + assert graph1.logic_key != graph3.logic_key + assert graph3.logic_key == graph4.logic_key + df5 = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), columns=list("ABCD") + ) + graph5 = (df1 + df5).build_graph(tile=False) + assert graph1.logic_key != graph5.logic_key + graph6 = df1.describe().build_graph(tile=False) + graph7 = df2.describe().build_graph(tile=False) + assert graph6.logic_key != graph7.logic_key + graph8 = df1.apply(lambda x: x.max() - x.min()).build_graph(tile=False) + graph9 = df2.apply(lambda x: x.max() - x.min()).build_graph(tile=False) + assert graph8.logic_key != graph9.logic_key + assert ( + graph8.result_tileables[0].op.logic_key + == graph9.result_tileables[0].op.logic_key + ) + pieces1 = [df1[:3], df1[3:7], df1[7:]] + graph10 = md.concat(pieces1).build_graph(tile=False) + pieces2 = [df2[:3], df2[3:7], df2[7:]] + graph11 = md.concat(pieces2).build_graph(tile=False) + assert graph10.logic_key != graph11.logic_key + graph12 = md.merge(df1, df2, on="A", how="left").build_graph(tile=False) + graph13 = md.merge(df1, df3, on="A", how="left").build_graph(tile=False) + assert graph12.logic_key != graph13.logic_key + graph14 = df2.groupby("A").sum().build_graph(tile=False) + graph15 = df3.groupby("A").sum().build_graph(tile=False) + assert graph14.logic_key != graph15.logic_key + graph16 = ( + df2.groupby("A").apply(lambda x: x.max() - x.min()).build_graph(tile=False) + ) + graph17 = ( + df3.groupby("A").apply(lambda x: x.max() - x.min()).build_graph(tile=False) + ) + assert graph16.logic_key != graph17.logic_key + assert ( + graph16.result_tileables[0].op.logic_key + == graph17.result_tileables[0].op.logic_key + ) diff --git a/python/xorbits/_mars/core/mode.py b/python/xorbits/_mars/core/mode.py new file mode 100644 index 000000000..09ab86faf --- /dev/null +++ b/python/xorbits/_mars/core/mode.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
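The logic-key tests in the hunk above rely on one property: a graph's logic key is a hash over each operand's structural description (type, stage, extra params) in traversal order, so rebuilding the same computation yields the same key, while changing e.g. `chunk_size` yields a different one. A simplified illustration of that property, using `hashlib` as a stand-in for the real `tokenize` utility (the helpers below are hypothetical and not part of the diff):

import hashlib


def fake_tokenize(*values):
    # simplified stand-in for the `tokenize` utility used by the graph code
    return hashlib.sha1(repr(values).encode()).hexdigest()


def graph_logic_key(ops):
    # ops: iterable of (op_type_name, params) in a deterministic traversal order
    token_keys = [fake_tokenize(name, sorted(params.items())) for name, params in ops]
    return fake_tokenize(*token_keys)


a = graph_logic_key([("TensorRandint", {"chunk_size": 4}), ("TensorAdd", {})])
b = graph_logic_key([("TensorRandint", {"chunk_size": 4}), ("TensorAdd", {})])
c = graph_logic_key([("TensorRandint", {"chunk_size": 6}), ("TensorAdd", {})])
assert a == b and a != c  # same logic -> same key, different chunk_size -> different key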
+ +import functools +import inspect +import threading + +from ..config import options + +_internal_mode = threading.local() + + +def is_eager_mode(): + in_kernel = is_kernel_mode() + if not in_kernel: + return options.eager_mode + else: + # in kernel, eager mode always False + return False + + +def is_kernel_mode(): + try: + return bool(_internal_mode.kernel) + except AttributeError: + _internal_mode.kernel = None + return False + + +def is_build_mode(): + return bool(getattr(_internal_mode, "build", False)) + + +class _EnterModeFuncWrapper: + def __init__(self, mode_name_to_value): + self.mode_name_to_value = mode_name_to_value + + # as the wrapper may enter for many times + # record old values for each time + self.mode_name_to_value_list = list() + + def __enter__(self): + mode_name_to_old_value = dict() + for mode_name, value in self.mode_name_to_value.items(): + # record mode's old values + mode_name_to_old_value[mode_name] = getattr(_internal_mode, mode_name, None) + if value is None: + continue + # set value + setattr(_internal_mode, mode_name, value) + self.mode_name_to_value_list.append(mode_name_to_old_value) + + def __exit__(self, *_): + mode_name_to_old_value = self.mode_name_to_value_list.pop() + for mode_name in self.mode_name_to_value.keys(): + # set back old values + setattr(_internal_mode, mode_name, mode_name_to_old_value[mode_name]) + + def __call__(self, func): + mode_name_to_value = self.mode_name_to_value.copy() + if not inspect.iscoroutinefunction(func): + # sync + @functools.wraps(func) + def _inner(*args, **kwargs): + with enter_mode(**mode_name_to_value): + return func(*args, **kwargs) + + else: + # async + @functools.wraps(func) + async def _inner(*args, **kwargs): + with enter_mode(**mode_name_to_value): + return await func(*args, **kwargs) + + return _inner + + +def enter_mode(kernel=None, build=None): + mode_name_to_value = { + "kernel": kernel, + "build": build, + } + mode_name_to_value = {k: v for k, v in mode_name_to_value.items() if v is not None} + + return _EnterModeFuncWrapper(mode_name_to_value) diff --git a/python/xorbits/_mars/core/operand/__init__.py b/python/xorbits/_mars/core/operand/__init__.py new file mode 100644 index 000000000..7f86d5ace --- /dev/null +++ b/python/xorbits/_mars/core/operand/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
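`enter_mode` above works both as a context manager and as a decorator for sync or async functions, recording the previous thread-local flag values on a per-entry list so nested uses unwind correctly. A minimal usage sketch, assuming the package layout of this diff (`xorbits._mars.core.mode`):

from xorbits._mars.core.mode import enter_mode, is_build_mode, is_kernel_mode


@enter_mode(kernel=True)
def run_in_kernel():
    # inside the decorated call the kernel flag is set for this thread
    return is_kernel_mode()


assert run_in_kernel() is True
assert is_kernel_mode() is False  # restored after the call

with enter_mode(build=True):
    assert is_build_mode() is True
assert is_build_mode() is False  # restored on exit

Because the old values are pushed per entry, re-entering the same wrapper (for example through recursive decorated calls) restores the flags in LIFO order.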
+ +from .base import ( + HasInput, + LogicKeyGenerator, + Operand, + OperandStage, + OperatorLogicKeyGeneratorMixin, + VirtualOperand, +) +from .core import TileableOperandMixin, estimate_size, execute +from .fetch import Fetch, FetchMixin, FetchShuffle, ShuffleFetchType +from .fuse import Fuse, FuseChunkMixin +from .objects import ( + MergeDictOperand, + ObjectFetch, + ObjectFuseChunk, + ObjectFuseChunkMixin, + ObjectOperand, + ObjectOperandMixin, +) +from .shuffle import MapReduceOperand, ShuffleProxy diff --git a/python/xorbits/_mars/core/operand/base.py b/python/xorbits/_mars/core/operand/base.py new file mode 100644 index 000000000..e009d25cf --- /dev/null +++ b/python/xorbits/_mars/core/operand/base.py @@ -0,0 +1,387 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import weakref +from copy import deepcopy +from enum import Enum +from functools import partial +from typing import Any, Dict, List, Tuple, Type, Union + +from ...serialization.core import Placeholder +from ...serialization.serializables import ( + BoolField, + DictField, + FieldTypes, + Float32Field, + Int32Field, + ListField, + ReferenceField, + Serializable, + SerializableMeta, + StringField, + TupleField, +) +from ...serialization.serializables.core import SerializableSerializer +from ...typing import OperandType +from ...utils import AttributeDict, classproperty, tokenize +from ..base import Base +from ..entity.chunks import Chunk +from ..entity.core import ENTITY_TYPE, Entity, EntityData +from ..entity.output_types import OutputType +from ..entity.tileables import Tileable +from ..mode import enter_mode + + +class OperandMetaclass(SerializableMeta): + def __new__(mcs, name: str, bases: Tuple[Type], properties: Dict): + if "__call__" in properties: + # if __call__ is specified for an operand, + # make sure that entering user space + properties["__call__"] = enter_mode(kernel=False)(properties["__call__"]) + + return super().__new__(mcs, name, bases, properties) + + +class OperandStage(Enum): + map = 0 + reduce = 1 + combine = 2 + agg = 3 + + +class SchedulingHint(Serializable): + # worker to execute, only work for chunk op, + # if specified, the op should be executed on the specified worker + # only work for those operand that has no input + expect_worker = StringField("expect_worker", default=None) + # band to execute, only work for chunk op, + # if specified, the op should be executed on the specified band + # only work for those operand that has no input + expect_band = TupleField( + "expect_band", + FieldTypes.tuple(FieldTypes.string, FieldTypes.string), + default=None, + ) + # will this operand be assigned a worker or not + reassign_worker = BoolField("reassign_worker", default=False) + # mark a op as fuseable + fuseable = BoolField("fuseable", default=True) + # True means control dependency, False means data dependency + _pure_depends = ListField("pure_depends", FieldTypes.bool, default=None) + # useful when setting chunk index as priority, + # useful for 
those ops like read_csv: the first chunk + # needs to be executed no later than the later ones, + # because the range index of a later chunk is accumulated from + # the indexes of previous ones + # `gpu` indicates whether the operand should be executed on the GPU. + gpu = BoolField("gpu", default=None) + priority = Int32Field("priority", default=None) + + @classproperty + @functools.lru_cache(1) + def all_hint_names(cls): + return list(cls._FIELDS) + + def can_be_fused(self) -> bool: + if not self.fuseable: + return False + if self.reassign_worker: + return False + if self._pure_depends and any(depend for depend in self._pure_depends): + # control dependency exists + return False + return True + + + def _install_scheduling_hint_properties(cls: Type["Operand"]): + def get_hint(name): + def _get_val(operand: "Operand"): + if operand.scheduling_hint: + return getattr(operand.scheduling_hint, name) + + def _set_val(operand: "Operand", val: Any): + if not operand.scheduling_hint: + operand.scheduling_hint = SchedulingHint(**{name: val}) + else: + setattr(operand.scheduling_hint, name, val) + + return property(_get_val, _set_val) + + for hint_name in SchedulingHint.all_hint_names: + setattr(cls, hint_name, get_hint(hint_name)) + return cls + + + class OperatorLogicKeyGeneratorMixin: + """ + This generator generates a unique and deterministic key for an operator's compute logic. The key stays the + same across different runs if the compute logic doesn't change. It is used in subtask speculative + execution, HBO scheduling and so on. + """ + + def get_logic_key(self): + """Subclasses may need to override this method to ensure the key is unique and deterministic.""" + fields = self._get_logic_key_token_values() + try: + return tokenize(*fields) + except Exception as e: # pragma: no cover + raise ValueError( + f"Cannot generate logic key for operator {self} with fields {fields}" + ) from e + + def _get_logic_key_token_values(self): + token_values = [type(self).__module__, type(self).__name__] + if self.stage is not None: + token_values.append(self.stage.name) + return token_values + + + class LogicKeyGenerator: + def __init__(self): + self.operator_id_to_logic_key = {} + + def get_logic_key(self, op: "Operand"): + assert isinstance(op, Operand) + logic_key = self.operator_id_to_logic_key.get(op.id) + if logic_key is None: + logic_key = self.operator_id_to_logic_key[op.id] = op.get_logic_key() + return logic_key + + + @_install_scheduling_hint_properties + class Operand(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperandMetaclass): + """ + Operand base class. All operands should have a type, which can be Add, Subtract etc. + `sparse` indicates whether the operand is applied on a sparse tensor/chunk. + `device`: 0 means the CPU, otherwise it refers to a GPU device. + Operands can have inputs and outputs, + which should be :class:`mars.tensor.core.TensorData`, :class:`mars.tensor.core.ChunkData` etc.
+ """ + + attr_tag = "attr" + _init_update_key_ = False + _output_type_ = None + _no_copy_attrs_ = Base._no_copy_attrs_ | {"scheduling_hint"} + _cache_primitive_serial = True + + sparse = BoolField("sparse", default=False) + device = Int32Field("device", default=None) + # will this operand create a view of input data or not + create_view = BoolField("create_view", default=False) + stage = ReferenceField("stage", OperandStage, default=None) + memory_scale = Float32Field("memory_scale", default=None) + tileable_op_key = StringField("tileable_op_key", default=None) + extra_params = DictField("extra_params", key_type=FieldTypes.string) + # scheduling hint + scheduling_hint = ReferenceField("scheduling_hint", SchedulingHint, default=None) + + _inputs = ListField( + "inputs", FieldTypes.reference(EntityData), default_factory=list + ) + # outputs are weak-refs which are not pickle-able + _outputs = ListField( + "outputs", default=None, on_serialize=lambda outputs: [o() for o in outputs] + ) + _output_types = ListField( + "output_type", FieldTypes.reference(OutputType), default=None + ) + + def __init__(self: OperandType, *args, **kwargs): + self._parse_kwargs(kwargs) + super().__init__(*args, **kwargs) + + @classmethod + def _parse_kwargs(cls, kwargs: Dict[str, Any]): + extra_params = kwargs.pop("extra_params", {}) + kwargs["extra_params"] = extras = AttributeDict(extra_params) + kwargs["scheduling_hint"] = scheduling_hint = kwargs.get( + "scheduling_hint", SchedulingHint() + ) + for k in set(kwargs): + if k in cls._FIELDS: + continue + elif k in SchedulingHint.all_hint_names: + setattr(scheduling_hint, k, kwargs.pop(k)) + else: + extras[k] = kwargs.pop(k) + + def __repr__(self): + if self.stage is None: + return f"{type(self).__name__} " + else: + return f"{type(self).__name__} " + + @classmethod + def _get_entity_data(cls, entity): + if isinstance(entity, Entity): + return entity.data + return entity + + @classmethod + def _get_inputs_data(cls, inputs): + return [cls._get_entity_data(inp) for inp in inputs] + + def _set_inputs(self, inputs): + if inputs is not None: + inputs = self._get_inputs_data(inputs) + if hasattr(self, "check_inputs"): + self.check_inputs(inputs) + setattr(self, "_inputs", inputs) + + @property + def inputs(self) -> List[Union[ENTITY_TYPE]]: + inputs = self._inputs + if inputs is None: + inputs = self._inputs = [] + return inputs + + @inputs.setter + def inputs(self, vals): + self._set_inputs(vals) + + @property + def output_limit(self) -> int: + return 1 + + @property + def pure_depends(self): + val = self._pure_depends # pylint: disable=access-member-before-definition + if not val: + val = self._pure_depends = [False] * len(self.inputs or ()) + return val + + @property + def output_types(self): + return self._output_types + + @output_types.setter + def output_types(self, value): + self._output_types = value + + def _attach_outputs(self, *outputs): + self._outputs = [ + weakref.ref(self._get_entity_data(o)) if o is not None else o + for o in outputs + ] + + if len(self._outputs) > self.output_limit: + raise ValueError("Outputs' size exceeds limitation") + + @property + def outputs(self) -> List[Union[Chunk, Tileable]]: + outputs = self._outputs + if outputs: + return [ref() for ref in outputs] + + @outputs.setter + def outputs(self, outputs): + self._attach_outputs(*outputs) + + def is_sparse(self) -> bool: + return self.sparse + + issparse = is_sparse + + def is_gpu(self) -> bool: + return self.gpu + + @property + def retryable(self) -> bool: + return True + + def 
get_dependent_data_keys(self): + return [dep.key for dep in self.inputs or ()] + + def _get_output_type(self, output_idx): + if self.output_types: + try: + return self.output_types[output_idx] + except IndexError: + return self.output_types[0] + else: + return self._output_type_ + + def copy(self: OperandType) -> OperandType: + new_op = super().copy() + new_op.outputs = [] + # copy scheduling_hint + new_op.scheduling_hint = self.scheduling_hint.copy() + extra_params = self.extra_params + if extra_params: + new_op.extra_params = deepcopy(extra_params) + return new_op + + def on_output_modify(self, new_output): + # when `create_view` is True, if the output is modified, + # the modification should be set back to the input. + # This function is for this sort of usage. + # Remember, if `create_view` is False, this function should take no effect. + raise NotImplementedError + + def on_input_modify(self, new_input): + # when `create_view` is True, if the input is modified, + # this function could be used to respond the modification. + # Remember, if `create_view` is False, this function should take no effect. + raise NotImplementedError + + +class OperandSerializer(SerializableSerializer): + def serial(self, obj: Serializable, context: Dict): + res = super().serial(obj, context) + return res + + def deserial(self, serialized: Tuple, context: Dict, subs: List) -> Operand: + # convert outputs back to weak-refs + operand: Operand = super().deserial(serialized, context, subs) + for i, out in enumerate(operand._outputs): + + def cb(o, index): + outputs = operand._outputs + outputs[index] = weakref.ref(o) + + if len(outputs) > 1 and all( + not isinstance(o, Placeholder) for o in outputs + ): + # all replaced + # add siblings for multiple outputs + outputs = operand.outputs + for j in range(len(outputs)): + outputs[j]._siblings = outputs[:j] + outputs[j + 1 :] + + if isinstance(out, Placeholder): + out.callbacks.append(partial(cb, index=i)) + else: + cb(out, i) + return operand + + +OperandSerializer.register(Operand) + + +class VirtualOperand(Operand): + def get_dependent_data_keys(self): + return [] + + +class HasInput(Operand): + __slots__ = () + + @property + def input(self): + return self._input + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] diff --git a/python/xorbits/_mars/core/operand/core.py b/python/xorbits/_mars/core/operand/core.py new file mode 100644 index 000000000..3a05a55e8 --- /dev/null +++ b/python/xorbits/_mars/core/operand/core.py @@ -0,0 +1,529 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
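`_install_scheduling_hint_properties` in base.py above is a small property factory: for every `SchedulingHint` field it installs a property on `Operand` that reads through, and lazily creates, the operand's `scheduling_hint`, which is why `op.priority = 3` or `op.gpu = True` works without touching the hint object directly. A stripped-down sketch of that pattern with hypothetical `Hint`/`Op` classes (plain attributes instead of serializable fields, illustrative only):

class Hint:
    def __init__(self):
        self.priority = None
        self.gpu = None


def install_hint_properties(cls, names=("priority", "gpu")):
    def make_property(name):
        def _get(op):
            # read through the hint object when present
            if op.hint is not None:
                return getattr(op.hint, name)

        def _set(op, value):
            # create the hint lazily on first assignment
            if op.hint is None:
                op.hint = Hint()
            setattr(op.hint, name, value)

        return property(_get, _set)

    for name in names:
        setattr(cls, name, make_property(name))
    return cls


@install_hint_properties
class Op:
    def __init__(self):
        self.hint = None


op = Op()
op.priority = 3  # routed into the Hint object
assert op.hint.priority == 3 and op.priority == 3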
+ +import sys +from typing import Any, Callable, Dict, List, Tuple, Type, Union + +import numpy as np + +try: + from numpy.core._exceptions import UFuncTypeError +except ImportError: # pragma: no cover + UFuncTypeError = None + +from ...metrics import Metrics +from ...typing import ChunkType, OperandType, TileableType +from ...utils import calc_data_size +from ..context import Context +from ..entity import ( + ExecutableTuple, + OutputType, + get_chunk_types, + get_fetch_class, + get_output_types, + get_tileable_types, +) +from ..mode import is_eager_mode + +_op_type_to_executor: Dict[Type[OperandType], Callable] = dict() +_op_type_to_size_estimator: Dict[Type[OperandType], Callable] = dict() + + +op_executed_number = Metrics.counter( + "mars.operand.executed_number", "The number of executed operands.", ("op",) +) + + +class TileableOperandMixin: + __slots__ = () + + def check_inputs(self, inputs: List[TileableType]): + if not inputs: + return + + for inp in inputs: + if inp is not None and inp._need_execution(): + raise ValueError( + f"{inp} has unknown dtypes, " + f"it must be executed first before {str(type(self))}" + ) + + @classmethod + def _check_if_gpu(cls, inputs: List[TileableType]): + if not inputs: + return None + true_num = 0 + for inp in inputs: + op = getattr(inp, "op", None) + if op is None or op.gpu is None: + return None + true_num += int(op.gpu) + if true_num == len(inputs): + return True + elif true_num == 0: + return False + return None + + def _tokenize_output(self, output_idx: int, **kw): + return f"{self._key}_{output_idx}" + + def _create_chunk(self, output_idx: int, index: Tuple[int], **kw) -> ChunkType: + output_type = kw.pop("output_type", None) or self._get_output_type(output_idx) + if not output_type: + raise ValueError("output_type should be specified") + + if isinstance(output_type, (list, tuple)): + output_type = output_type[output_idx] + chunk_type, chunk_data_type = get_chunk_types(output_type) + kw["_i"] = output_idx + kw["op"] = self + kw["index"] = index + if output_type == OutputType.scalar: + # tensor + kw["order"] = "C_ORDER" + + # key of output chunks may only contain keys for its output ids + if "_key" not in kw: + kw["_key"] = self._tokenize_output(output_idx, **kw) + + data = chunk_data_type(**kw) + return chunk_type(data) + + def _new_chunks( + self, inputs: List[ChunkType], kws: List[Dict] = None, **kw + ) -> List[ChunkType]: + output_limit = kw.pop("output_limit", None) + if output_limit is None: + output_limit = self.output_limit + if isinstance(output_limit, float) and kws: + output_limit = len(kws) + + self.check_inputs(inputs) + self._set_inputs(inputs) + if self.gpu is None: + self.gpu = self._check_if_gpu(self._inputs) + if self._key is None: + self._update_key() + + chunks = [] + for j in range(output_limit): + create_chunk_kw = kw.copy() + if kws: + create_chunk_kw.update(kws[j]) + index = create_chunk_kw.pop("index", None) + chunk = self._create_chunk(j, index, **create_chunk_kw) + chunks.append(chunk) + + self.outputs = chunks + if len(chunks) > 1: + # for each output chunk, hold the reference to the other outputs + # so that either no one or everyone are gc collected + for j, t in enumerate(chunks): + t.data._siblings = [c.data for c in chunks[:j] + chunks[j + 1 :]] + return chunks + + def new_chunks( + self, inputs: List[ChunkType], kws: List[Dict] = None, **kwargs + ) -> List[ChunkType]: + """ + Create chunks. 
+ + A chunk is a node in a fine grained graph, all the chunk objects are created by + calling this function, it happens mostly in tiles. + The generated chunks will be set as this operand's outputs and each chunk will + hold this operand as it's op. + + Parameters + ---------- + inputs : list + Input chunks. + kws : dict + Kwargs for each output. + kwargs : dict + common kwargs for all outputs + + Returns + ------- + chunks : list + Output chunks. + + .. note:: + It's a final method, do not override. + Override the method `_new_chunks` if needed. + """ + return self._new_chunks(inputs, kws=kws, **kwargs) + + def new_chunk( + self, inputs: List[ChunkType], kws: List[Dict] = None, **kw + ) -> ChunkType: + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new chunk with more than 1 outputs") + + return self.new_chunks(inputs, kws=kws, **kw)[0] + + @staticmethod + def _fill_nan_shape(kw: dict): + nsplits = kw.get("nsplits") + shape = kw.get("shape") + if nsplits is not None and shape is not None: + nsplits = tuple(nsplits) + shape = list(shape) + for idx, (s, sp) in enumerate(zip(shape, nsplits)): + if not np.isnan(s): + continue + s = sum(sp) + if not np.isnan(s): + shape[idx] = s + kw["shape"] = tuple(shape) + kw["nsplits"] = nsplits + return kw + + def _create_tileable(self, output_idx: int, **kw) -> TileableType: + output_type = kw.pop("output_type", self._get_output_type(output_idx)) + if output_type is None: + raise ValueError("output_type should be specified") + + if isinstance(output_type, (list, tuple)): + output_type = output_type[output_idx] + + tileable_type, tileable_data_type = get_tileable_types(output_type) + kw["_i"] = output_idx + kw["op"] = self + if output_type == OutputType.scalar: + # tensor + kw["order"] = "C_ORDER" + + kw = self._fill_nan_shape(kw) + + # key of output chunks may only contain keys for its output ids + if "_key" not in kw: + kw["_key"] = self._tokenize_output(output_idx, **kw) + + data = tileable_data_type(**kw) + return tileable_type(data) + + def _new_tileables( + self, inputs: List[TileableType], kws: List[dict] = None, **kw + ) -> List[TileableType]: + output_limit = kw.pop("output_limit", None) + if output_limit is None: + output_limit = getattr(self, "output_limit") + + self._set_inputs(inputs) + if self.gpu is None: + self.gpu = self._check_if_gpu(self._inputs) + if getattr(self, "_key", None) is None: + self._update_key() # update key when inputs are set + + tileables = [] + for j in range(output_limit): + create_tensor_kw = kw.copy() + if kws: + create_tensor_kw.update(kws[j]) + tileable = self._create_tileable(j, **create_tensor_kw) + tileables.append(tileable) + + self.outputs = tileables + if len(tileables) > 1: + # for each output tileable, hold the reference to the other outputs + # so that either no one or everyone are gc collected + for j, t in enumerate(tileables): + t.data._siblings = [ + tileable.data for tileable in tileables[:j] + tileables[j + 1 :] + ] + return tileables + + def new_tileables( + self, inputs: List[TileableType], kws: List[dict] = None, **kw + ) -> List[TileableType]: + """ + Create tileable objects(Tensors or DataFrames). + + This is a base function for create tileable objects like tensors or dataframes, + it will be called inside the `new_tensors` and `new_dataframes`. + If eager mode is on, it will trigger the execution after tileable objects are created. + + Parameters + ---------- + inputs : list + Input tileables + kws : List[dict] + Kwargs for each output. 
+ kw : dict + Common kwargs for all outputs. + + Returns + ------- + tileables : list + Output tileables. + + .. note:: + It's a final method, do not override. + Override the method `_new_tileables` if needed. + """ + tileables = self._new_tileables(inputs, kws=kws, **kw) + if is_eager_mode(): + ExecutableTuple(tileables).execute() + return tileables + + def new_tileable( + self, inputs: List[TileableType], kws: List[Dict] = None, **kw + ) -> TileableType: + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new chunk with more than 1 outputs") + + return self.new_tileables(inputs, kws=kws, **kw)[0] + + @classmethod + def pre_tile(cls, op: OperandType): + """ + Operation before tile. + + Parameters + ---------- + op : OperandType + Operand to tile + """ + + @classmethod + def tile(cls, op: OperandType): + raise NotImplementedError + + @classmethod + def post_tile(cls, op: OperandType, results: List[TileableType]): + """ + Operation after tile. + + Parameters + ---------- + op : OperandType + Operand to tile. + results: list + List of tiled results. + """ + + @classmethod + def pre_execute(cls, ctx: Union[dict, Context], op: OperandType): + """ + Operation before execute. + + Parameters + ---------- + ctx : dict + Data store. + op : OperandType + Operand to execute. + """ + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: OperandType): + raise NotImplementedError + + @classmethod + def post_execute(cls, ctx: Union[dict, Context], op: OperandType): + """ + Operand before execute. + + Parameters + ---------- + ctx : dict + Data store + op : OperandType + Operand to execute. + """ + + @classmethod + def estimate_size(cls, ctx: dict, op: OperandType): + from .fetch import FetchShuffle + + # when sizes of all outputs are deterministic, return directly + outputs = op.outputs + if all( + not c.is_sparse() and hasattr(c, "nbytes") and not np.isnan(c.nbytes) + for c in outputs + ): + for out in outputs: + ctx[out.key] = (out.nbytes, out.nbytes) + return + + pure_dep_keys = set( + inp.key + for inp, is_dep in zip(op.inputs or (), op.pure_depends or ()) + if is_dep + ) + exec_sizes = [0] + for inp in op.inputs or (): + if inp.key in pure_dep_keys: + continue + try: + if isinstance(inp.op, FetchShuffle): + keys_and_shapes = inp.extra_params.get("_shapes", dict()).items() + else: + keys_and_shapes = [(inp.key, getattr(inp, "shape", None))] + + # execution size of a specific data chunk may be + # larger than stored type due to objects + for key, shape in keys_and_shapes: + exec_sizes.append(ctx[key][0]) + except KeyError: + if not op.sparse: + inp_size = calc_data_size(inp) + if not np.isnan(inp_size): + exec_sizes.append(inp_size) + if any(c.is_sparse() for c in op.inputs): + exec_size = sum(exec_sizes) + else: + exec_size = max(exec_sizes) + + total_out_size = 0 + chunk_sizes = dict() + for out in outputs: + try: + if not out.is_sparse(): + chunk_size = calc_data_size(out) + else: + chunk_size = exec_size + if np.isnan(chunk_size): + raise TypeError + chunk_sizes[out.key] = chunk_size + total_out_size += chunk_size + except (AttributeError, TypeError, ValueError): + pass + + exec_size = max(exec_size, total_out_size) + memory_scale = op.memory_scale or 1.0 + for out in outputs: + if out.key in ctx: + continue + if out.key in chunk_sizes: + result_size = chunk_sizes[out.key] + else: + result_size = max( + exec_size // len(outputs), + total_out_size // max(len(chunk_sizes), 1), + ) + try: + if getattr(out, "dtype", None) is not None and out.is_sparse(): + max_sparse_size = ( 
+ out.nbytes + + np.dtype(np.int64).itemsize * np.prod(out.shape) * out.ndim + ) + else: + max_sparse_size = np.nan + except TypeError: # pragma: no cover + max_sparse_size = np.nan + if not np.isnan(max_sparse_size): + result_size = min(result_size, max_sparse_size) + ctx[out.key] = (result_size, int(exec_size * memory_scale // len(outputs))) + + @classmethod + def concat_tileable_chunks(cls, tileable: TileableType): + raise NotImplementedError + + @classmethod + def create_tileable_from_chunks( + cls, chunks: List[ChunkType], inputs: List[TileableType] = None, **kw + ) -> TileableType: + raise NotImplementedError + + def get_fetch_op_cls(self, obj: ChunkType): + from .shuffle import ShuffleProxy + + output_types = get_output_types(obj, unknown_as=OutputType.object) + fetch_cls, fetch_shuffle_cls = get_fetch_class(output_types[0]) + if isinstance(self, ShuffleProxy): + cls = fetch_shuffle_cls + else: + cls = fetch_cls + + def _inner(**kw): + return cls(output_types=output_types, **kw) + + return _inner + + def get_fuse_op_cls(self, obj: ChunkType): + raise NotImplementedError + + @classmethod + def register_executor(cls, executor: Callable): + _op_type_to_executor[cls] = executor + + @classmethod + def unregister_executor(cls): + del _op_type_to_executor[cls] + + @classmethod + def register_size_estimator(cls, size_estimator: Callable): + _op_type_to_size_estimator[cls] = size_estimator + + @classmethod + def unregister_size_estimator(cls): + del _op_type_to_size_estimator[cls] + + +def execute(results: Dict[str, Any], op: OperandType): + try: + executor = _op_type_to_executor[type(op)] + except KeyError: + executor = type(op).execute + + # pre execute + op.pre_execute(results, op) + succeeded = False + try: + if UFuncTypeError is None: # pragma: no cover + return executor(results, op) + else: + # Cast `UFuncTypeError` to `TypeError` since subclasses of the former is unpickleable. + # The `UFuncTypeError` was introduced by numpy#12593 since v1.17.0. + try: + result = executor(results, op) + succeeded = True + if op.stage is not None: + op_name = f"{op.__class__.__name__}:{op.stage.name}" + else: + op_name = op.__class__.__name__ + op_executed_number.record(1, {"op": op_name}) + return result + except UFuncTypeError as e: # pragma: no cover + raise TypeError(str(e)).with_traceback(sys.exc_info()[2]) from None + except NotImplementedError: + for op_cls in type(op).__mro__: + if op_cls in _op_type_to_executor: + executor = _op_type_to_executor[op_cls] + _op_type_to_executor[type(op)] = executor + result = executor(results, op) + succeeded = True + return result + raise KeyError(f"No handler found for op: {op}") + finally: + if succeeded: + op.post_execute(results, op) + + +def estimate_size(results: Dict[str, Any], op: OperandType): + try: + size_estimator = _op_type_to_size_estimator[type(op)] + except KeyError: + size_estimator = type(op).estimate_size + + try: + return size_estimator(results, op) + except NotImplementedError: + for op_cls in type(op).__mro__: + if op_cls in _op_type_to_size_estimator: + size_estimator = _op_type_to_size_estimator[op_cls] + _op_type_to_size_estimator[type(op)] = size_estimator + return size_estimator(results, op) + raise KeyError(f"No handler found for op: {op} to estimate size") diff --git a/python/xorbits/_mars/core/operand/fetch.py b/python/xorbits/_mars/core/operand/fetch.py new file mode 100644 index 000000000..1e06f0922 --- /dev/null +++ b/python/xorbits/_mars/core/operand/fetch.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. 
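The module-level `execute()` in core.py above resolves an operand's executor from `_op_type_to_executor`, falls back to the operand class's own `execute`, and on `NotImplementedError` walks `type(op).__mro__` to find and cache a handler registered for a base class. A self-contained sketch of that dispatch-and-fallback idea with a toy registry (simplified signatures that drop the `results`/`ctx` mapping, not the mars API):

_registry = {}


def register_executor(op_cls, func):
    _registry[op_cls] = func


def run(op):
    executor = _registry.get(type(op), type(op).execute)
    try:
        return executor(op)
    except NotImplementedError:
        # fall back to a handler registered for a base class and cache it
        for op_cls in type(op).__mro__:
            if op_cls in _registry:
                _registry[type(op)] = _registry[op_cls]
                return _registry[op_cls](op)
        raise KeyError(f"No handler found for op: {op!r}")


class BaseOp:
    @staticmethod
    def execute(op):
        raise NotImplementedError


class AddOne(BaseOp):
    pass


register_executor(BaseOp, lambda op: "handled by BaseOp executor")
assert run(AddOne()) == "handled by BaseOp executor"
assert AddOne in _registry  # cached for subsequent dispatches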
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import enum + +from ... import opcodes +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + ListField, + ReferenceField, + StringField, +) +from .base import Operand +from .core import TileableOperandMixin + + +class Fetch(Operand): + _op_type_ = opcodes.FETCH + + source_key = StringField("source_key", default=None) + + +class FetchMixin(TileableOperandMixin): + def check_inputs(self, inputs): + # no inputs + if inputs and len(inputs) > 0: + raise ValueError(f"{type(self).__name__} has no inputs") + + @classmethod + def tile(cls, op): + raise NotImplementedError("Fetch tile cannot be handled by operand itself") + + @classmethod + def execute(cls, ctx, op): + """ + Fetch operand needs nothing to do. + """ + + +class FetchShuffle(Operand): + _op_type_ = opcodes.FETCH_SHUFFLE + + source_keys = ListField("source_keys", FieldTypes.string) + n_mappers = Int32Field("n_mappers") + n_reducers = Int32Field("n_reducers") + shuffle_fetch_type = ReferenceField("shuffle_fetch_type") + + +class ShuffleFetchType(enum.Enum): + FETCH_BY_KEY = 0 + FETCH_BY_INDEX = 1 diff --git a/python/xorbits/_mars/core/operand/fuse.py b/python/xorbits/_mars/core/operand/fuse.py new file mode 100644 index 000000000..d1d78e65a --- /dev/null +++ b/python/xorbits/_mars/core/operand/fuse.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...serialization.serializables import ReferenceField +from ..entity import FuseChunk, FuseChunkData, NotSupportTile +from ..graph import ChunkGraph +from .base import Operand + + +class Fuse(Operand): + __slots__ = ("_fuse_graph",) + _op_type_ = opcodes.FUSE + + fuse_graph = ReferenceField("fuse_graph", ChunkGraph) + + +class FuseChunkMixin: + __slots__ = () + + def _create_chunk(self, output_idx, index, **kw): + data = FuseChunkData(_index=index, _op=self, **kw) + return FuseChunk(data) + + @classmethod + def tile(cls, op): + raise NotSupportTile("FuseChunk is a chunk operand which does not support tile") diff --git a/python/xorbits/_mars/core/operand/objects.py b/python/xorbits/_mars/core/operand/objects.py new file mode 100644 index 000000000..091464c48 --- /dev/null +++ b/python/xorbits/_mars/core/operand/objects.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...serialization.serializables import BoolField +from ..entity import OutputType, register_fetch_class +from .base import Operand +from .core import TileableOperandMixin +from .fetch import Fetch, FetchMixin +from .fuse import Fuse, FuseChunkMixin + + +class ObjectOperand(Operand): + pass + + +class ObjectOperandMixin(TileableOperandMixin): + _output_type_ = OutputType.object + + def get_fuse_op_cls(self, obj): + return ObjectFuseChunk + + +class ObjectFuseChunkMixin(FuseChunkMixin, ObjectOperandMixin): + __slots__ = () + + +class ObjectFuseChunk(ObjectFuseChunkMixin, Fuse): + pass + + +class ObjectFetch(FetchMixin, ObjectOperandMixin, Fetch): + _output_type_ = OutputType.object + + def __init__(self, **kw): + kw.pop("output_types", None) + kw.pop("_output_types", None) + super().__init__(**kw) + + def _new_chunks(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_tileables(inputs, kws=kws, **kw) + + +register_fetch_class(OutputType.object, ObjectFetch, None) + + +class MergeDictOperand(ObjectOperand, ObjectOperandMixin): + _merge = BoolField("merge") + + def __init__(self, merge=None, **kw): + super().__init__(_merge=merge, **kw) + + @property + def merge(self): + return self._merge + + @classmethod + def concat_tileable_chunks(cls, tileable): + assert not tileable.is_coarse() + + op = cls(merge=True) + chunk = cls(merge=True).new_chunk(tileable.chunks) + return op.new_tileable([tileable], chunks=[chunk], nsplits=((1,),)) + + @classmethod + def execute(cls, ctx, op): + assert op.merge + inputs = [ctx[inp.key] for inp in op.inputs] + ctx[op.outputs[0].key] = next(inp for inp in inputs if inp) diff --git a/python/xorbits/_mars/core/operand/shuffle.py b/python/xorbits/_mars/core/operand/shuffle.py new file mode 100644 index 000000000..8c4404657 --- /dev/null +++ b/python/xorbits/_mars/core/operand/shuffle.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + StringField, + TupleField, +) +from . 
import FetchShuffle, ShuffleFetchType + from .base import Operand, OperandStage, VirtualOperand + + + class ShuffleProxy(VirtualOperand): + _op_type_ = opcodes.SHUFFLE_PROXY + n_mappers = Int32Field("n_mappers", default=0) + # `n_reducers` will be updated in `MapReduceOperand._new_chunks` + n_reducers = Int32Field("n_reducers", default=0) + + def _new_chunks(self, inputs, kws=None, **kw): + self.n_mappers = len(inputs) + return super()._new_chunks(inputs, kws, **kw) + + + class MapReduceOperand(Operand): + """ + An operand for shuffle execution which partitions data by the value in each record's partition key, and + sends the partitioned data from all mappers to all reducers. + """ + + # for reducer + reducer_index = TupleField("reducer_index", FieldTypes.uint64) + # Total number of reducers, which is also the number of shuffle blocks per mapper. + n_reducers = Int32Field("n_reducers") + # The reducer ordinal in all reducers. It's different from reducer_index, + # which might be a tuple. + # `reducer_ordinal` will be set in `_new_chunks`. + reducer_ordinal = Int32Field("reducer_ordinal") + reducer_phase = StringField("reducer_phase", default=None) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.stage == OperandStage.reduce: + # for reducers, the worker is assigned up front + self.scheduling_hint.reassign_worker = True + + def _new_chunks(self, inputs, kws=None, **kw): + if getattr(self, "reducer_index", None) is None: + index = None + if kws: + index = kws[0].get("index") + self.reducer_index = index or kw.get("index") + if self.stage == OperandStage.reduce: + # Operands such as `TensorIndexSetValue` may have multiple inputs, some of which won't be proxy chunks + proxy_operands = [c.op for c in inputs if isinstance(c.op, ShuffleProxy)] + if proxy_operands: + # For reduce chunks created with `FetchShuffle`, `proxy_operands` will be empty. + proxy = proxy_operands[0] + self.reducer_ordinal = proxy.n_reducers + proxy.n_reducers += 1 + return super()._new_chunks(inputs, kws, **kw) + + def get_dependent_data_keys(self): + from .fetch import FetchShuffle + + if self.stage == OperandStage.reduce: + inputs = self.inputs or () + deps = [] + for inp in inputs: + if isinstance(inp.op, ShuffleProxy): + deps.extend( + [(chunk.key, self.reducer_index) for chunk in inp.inputs or ()] + ) + elif isinstance(inp.op, FetchShuffle): + # fetch shuffle by index doesn't store data keys, so it won't run into this function. + assert inp.op.shuffle_fetch_type == ShuffleFetchType.FETCH_BY_KEY + deps.extend([(k, self.reducer_index) for k in inp.op.source_keys]) + else: + deps.append(inp.key) + return deps + return super().get_dependent_data_keys() + + def iter_mapper_keys(self, input_id=0): + # key is mapper chunk key, index is mapper chunk index. + input_chunk = self.inputs[input_id] + if isinstance(input_chunk.op, ShuffleProxy): + keys = [inp.key for inp in input_chunk.inputs] + else: + assert isinstance(input_chunk.op, FetchShuffle), input_chunk.op + if input_chunk.op.shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX: + # For fetch shuffle by index, all shuffle blocks of the same reducer are + # identified by their index, so chunk keys are not needed any more + # and mock keys are used here. + # Keep this in sync with the ray executor `execute_subtask`.
+                return list(range(input_chunk.op.n_mappers))
+            keys = input_chunk.op.source_keys
+        return keys
+
+    def iter_mapper_data(self, ctx, input_id=0, pop=False, skip_none=False):
+        for key in self.iter_mapper_keys(input_id):
+            try:
+                if pop:
+                    yield ctx.pop((key, self.reducer_index))
+                else:
+                    yield ctx[key, self.reducer_index]
+            except KeyError:
+                if not skip_none:  # pragma: no cover
+                    raise
+                if not pop:
+                    ctx[key, self.reducer_index] = None
+
+    def execute(self, ctx, op):
+        """The mapper stage must ensure that all mapper blocks are inserted into ctx
+        and that no reducer's blocks are missing. This is required by shuffle fetch
+        by index, where shuffle blocks are identified by their index instead of by
+        data keys. To keep operand implementations simple, we can sort `ctx` by key,
+        which is a (chunk key, reducer index) tuple, and relax the insertion order
+        requirements.
+        """
diff --git a/python/xorbits/_mars/core/operand/tests/__init__.py b/python/xorbits/_mars/core/operand/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/core/operand/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/core/operand/tests/test_core.py b/python/xorbits/_mars/core/operand/tests/test_core.py
new file mode 100644
index 000000000..736b84faf
--- /dev/null
+++ b/python/xorbits/_mars/core/operand/tests/test_core.py
@@ -0,0 +1,151 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ....dataframe import core  # noqa: F401  # pylint: disable=unused-variable
+from ... import OutputType
+from ..
import Operand, ShuffleProxy, TileableOperandMixin, estimate_size, execute + + +class MyOperand(Operand, TileableOperandMixin): + @classmethod + def execute(cls, ctx, op): + return 1 + + @classmethod + def estimate_size(cls, ctx, op): + return 1 + + +class MyOperand2(MyOperand): + @classmethod + def execute(cls, ctx, op): + raise NotImplementedError + + @classmethod + def estimate_size(cls, ctx, op): + raise NotImplementedError + + +class _OperandMixin(TileableOperandMixin): + @classmethod + def tile(cls, op): + out = op.outputs[0] + params = out.params.copy() + params["index"] = (0,) * out.ndim + chunk = op.copy().reset_key().new_chunk(None, kws=[params]) + new_params = out.params.copy() + new_params["chunks"] = [chunk] + new_params["nsplits"] = () + return op.copy().new_tileables(op.inputs, kws=[new_params]) + + +class MyOperand3(Operand, _OperandMixin): + @classmethod + def execute(cls, ctx, op): + raise ValueError("intend to fail") + + @classmethod + def post_execute(cls, ctx, op): # pragma: no cover + ctx[op.outputs[0].key] += 1 + + +class MyOperand4(Operand, _OperandMixin): + @classmethod + def post_execute(cls, ctx, op): + ctx[op.outputs[0].key] += 1 + + +class MyOperand5(MyOperand4): + pass + + +def test_execute(): + op = MyOperand(extra_params={"my_extra_params": 1}) + assert op.extra_params["my_extra_params"] == 1 + MyOperand.register_executor(lambda *_: 2) + assert execute(dict(), MyOperand(_key="1")) == 2 + assert execute(dict(), MyOperand2(_key="1")) == 2 + + MyOperand.unregister_executor() + assert execute(dict(), MyOperand(_key="1")) == 1 + MyOperand2.unregister_executor() + with pytest.raises(KeyError): + execute(dict(), MyOperand2(_key="1")) + + +def test_estimate_size(): + MyOperand.register_size_estimator(lambda *_: 2) + assert estimate_size(dict(), MyOperand(_key="1")) == 2 + assert estimate_size(dict(), MyOperand2(_key="1")) == 2 + + MyOperand.unregister_size_estimator() + assert estimate_size(dict(), MyOperand(_key="1")) == 1 + MyOperand2.unregister_size_estimator() + with pytest.raises(KeyError): + estimate_size(dict(), MyOperand2(_key="1")) + + +def test_unknown_dtypes(): + op = MyOperand(_output_types=[OutputType.dataframe]) + df = op.new_tileable(None, dtypes=None) + op2 = MyOperand(_output_types=[OutputType.scalar]) + with pytest.raises(ValueError) as exc_info: + op2.new_tileable([df]) + assert "executed first" in exc_info.value.args[0] + + +def test_post_execute(setup): + op = MyOperand3(_output_types=[OutputType.tensor]) + t = op.new_tileable(None, dtype=np.dtype(float), shape=()) + with pytest.raises(ValueError, match="intend to fail"): + t.execute() + + op = MyOperand5(_output_types=[OutputType.tensor]) + t2 = op.new_tileable(None, dtype=np.dtype(float), shape=()) + + def execute_error(*_): + raise ValueError("intend to fail again") + + with pytest.raises(ValueError, match="intend to fail again"): + operand_executors = {MyOperand4: execute_error} + t2.execute(extra_config={"operand_executors": operand_executors}).fetch() + + def execute_normally(ctx, op): + ctx[op.outputs[0].key] = 1 + + operand_executors = {MyOperand5: execute_normally} + assert ( + t2.execute(extra_config={"operand_executors": operand_executors}).fetch() == 2 + ) + + +def test_shuffle(setup): + from ....dataframe import DataFrame + + chunk_size, n_rows = 10, 100 + df = DataFrame( + pd.DataFrame(np.random.rand(n_rows, 3), columns=list("abc")), + chunk_size=chunk_size, + ) + chunk_graph = df.groupby(["a"]).apply(lambda x: x).build_graph(tile=True) + [proxy_chunk] = [c for c in chunk_graph if 
isinstance(c.op, ShuffleProxy)] + successors = chunk_graph.successors(proxy_chunk) + n_reducers = successors[0].op.n_reducers + assert n_reducers == len(successors), (n_reducers, len(successors)) + assert len(set(c.op.n_reducers for c in successors)) == 1 + assert sorted([c.op.reducer_ordinal for c in successors]) == list(range(n_reducers)) diff --git a/python/xorbits/_mars/core/tests/__init__.py b/python/xorbits/_mars/core/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/core/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/core/tests/test_context.py b/python/xorbits/_mars/core/tests/test_context.py new file mode 100644 index 000000000..07e065fe1 --- /dev/null +++ b/python/xorbits/_mars/core/tests/test_context.py @@ -0,0 +1,31 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import remote as mr +from ..context import get_context + + +def test_context(setup): + def func(): + ctx = get_context() + assert ctx is not None + + # no error should happen + mr.spawn(func).execute() + + # context should be reset after execution + # for test backend(test://xxx), + # the worker pool and client are in the same process + # if context is not reset, get_context() will still get one + assert get_context() is None diff --git a/python/xorbits/_mars/core/tests/test_entrypoints.py b/python/xorbits/_mars/core/tests/test_entrypoints.py new file mode 100644 index 000000000..250783851 --- /dev/null +++ b/python/xorbits/_mars/core/tests/test_entrypoints.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
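+
+# These tests register a fake "mars_extensions" entry point against the pymars
+# distribution and verify that extension initialization runs exactly once, and
+# that a broken extension only emits a warning instead of failing startup.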
+ +import sys +import types +import warnings + +import pkg_resources + + +class _DummyClass(object): + def __init__(self, value): + self.value = value + + def __repr__(self): + return "_DummyClass(%f, %f)" % self.value + + +def test_init_entrypoint(): + # FIXME: Python 2 workaround because nonlocal doesn't exist + counters = {"init": 0} + + def init_function(): + counters["init"] += 1 + + mod = types.ModuleType("_test_mars_extension") + mod.init_func = init_function + + try: + # will remove this module at the end of the test + sys.modules[mod.__name__] = mod + + # We are registering an entry point using the "mars" package + # ("distribution" in pkg_resources-speak) itself, though these are + # normally registered by other packages. + dist = "pymars" + entrypoints = pkg_resources.get_entry_map(dist) + my_entrypoint = pkg_resources.EntryPoint( + "init", # name of entry point + mod.__name__, # module with entry point object + attrs=["init_func"], # name of entry point object + dist=pkg_resources.get_distribution(dist), + ) + entrypoints.setdefault("mars_extensions", {})["init"] = my_entrypoint + + from .. import entrypoints + + # Allow reinitialization + entrypoints.init_extension_entrypoints.cache_clear() + + entrypoints.init_extension_entrypoints() + + # was our init function called? + assert counters["init"] == 1 + + # ensure we do not initialize twice + entrypoints.init_extension_entrypoints() + assert counters["init"] == 1 + finally: + # remove fake module + if mod.__name__ in sys.modules: + del sys.modules[mod.__name__] + + +def test_entrypoint_tolerance(): + # FIXME: Python 2 workaround because nonlocal doesn't exist + counters = {"init": 0} + + def init_function(): + counters["init"] += 1 + raise ValueError("broken") + + mod = types.ModuleType("_test_mars_bad_extension") + mod.init_func = init_function + + try: + # will remove this module at the end of the test + sys.modules[mod.__name__] = mod + + # We are registering an entry point using the "mars" package + # ("distribution" in pkg_resources-speak) itself, though these are + # normally registered by other packages. + dist = "pymars" + entrypoints = pkg_resources.get_entry_map(dist) + my_entrypoint = pkg_resources.EntryPoint( + "init", # name of entry point + mod.__name__, # module with entry point object + attrs=["init_func"], # name of entry point object + dist=pkg_resources.get_distribution(dist), + ) + entrypoints.setdefault("mars_extensions", {})["init"] = my_entrypoint + + from .. import entrypoints + + # Allow reinitialization + entrypoints.init_extension_entrypoints.cache_clear() + + with warnings.catch_warnings(record=True) as w: + entrypoints.init_extension_entrypoints() + + bad_str = "Mars extension module '_test_mars_bad_extension'" + for x in w: + if bad_str in str(x): + break + else: + raise ValueError("Expected warning message not found") + + # was our init function called? + assert counters["init"] == 1 + + finally: + # remove fake module + if mod.__name__ in sys.modules: + del sys.modules[mod.__name__] diff --git a/python/xorbits/_mars/core/tests/test_mode.py b/python/xorbits/_mars/core/tests/test_mode.py new file mode 100644 index 000000000..569bf85a5 --- /dev/null +++ b/python/xorbits/_mars/core/tests/test_mode.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .. import enter_mode, is_build_mode, is_eager_mode, is_kernel_mode + + +def test_enter_mode(): + from ...config import option_context, options + + @enter_mode(kernel=True) + def wrapped(): + return is_eager_mode() + + assert not options.eager_mode + assert not wrapped() + + with option_context({"eager_mode": True}): + assert options.eager_mode + assert not wrapped() + + @enter_mode(kernel=True) + def wrapped2(): + wrapped() + with option_context({"eager_mode": True}): + assert options.eager_mode + assert not is_eager_mode() + with enter_mode(kernel=False): + assert not is_kernel_mode() + assert is_kernel_mode() + + wrapped2() + + assert not is_kernel_mode() + assert not is_build_mode() + + @enter_mode(kernel=False) + def wrapped3(): + wrapped() + with option_context({"eager_mode": True}): + assert options.eager_mode + assert not is_kernel_mode() + with enter_mode(kernel=True, build=True): + assert is_kernel_mode() + assert is_build_mode() + assert not is_kernel_mode() + assert not is_build_mode() + with pytest.raises(ValueError): + with enter_mode(kernel=True, build=True): + raise ValueError("meant to raise error") + assert not is_kernel_mode() + assert not is_build_mode() + + @enter_mode(kernel=True) + def wrapped4(): + raise ValueError("meant to raise error") + + with pytest.raises(ValueError): + wrapped4() + assert not is_kernel_mode() + assert not is_build_mode() + + wrapped3() diff --git a/python/xorbits/_mars/dataframe/__init__.py b/python/xorbits/_mars/dataframe/__init__.py new file mode 100644 index 000000000..826df3298 --- /dev/null +++ b/python/xorbits/_mars/dataframe/__init__.py @@ -0,0 +1,119 @@ +# isort: skip_file +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .initializer import DataFrame, Series, Index + +# do imports to register operands +from .base.cut import cut +from .base.eval import mars_eval as eval # pylint: disable=redefined-builtin +from .base.get_dummies import get_dummies +from .base.melt import melt +from .base.qcut import qcut +from .base.to_numeric import to_numeric +from .base.value_counts import value_counts +from .contrib.raydataset import to_ray_mldataset, to_ray_dataset +from .datasource.from_tensor import dataframe_from_tensor, series_from_tensor +from .datasource.from_index import series_from_index +from .datasource.from_records import from_records +from .datasource.from_vineyard import from_vineyard +from .datasource.read_csv import read_csv +from .datasource.read_sql import read_sql, read_sql_table, read_sql_query +from .datasource.read_parquet import read_parquet +from .datasource.read_raydataset import ( + read_raydataset, + read_ray_dataset, + read_ray_mldataset, +) +from .datasource.date_range import date_range +from .fetch import DataFrameFetch, DataFrameFetchShuffle +from .merge import concat, merge +from .missing.checkna import isna, isnull, notna, notnull +from .reduction import CustomReduction, unique +from .tseries.to_datetime import to_datetime + +from . import arithmetic +from . import base +from . import indexing +from . import merge as merge_ +from . import missing +from . import reduction +from . import statistics +from . import sort +from . import groupby +from . import ufunc +from . import datastore +from . import window +from . import plotting + +del ( + reduction, + statistics, + arithmetic, + indexing, + merge_, + base, + groupby, + missing, + ufunc, + datastore, + sort, + window, + plotting, +) +del DataFrameFetch, DataFrameFetchShuffle + +# noinspection PyUnresolvedReferences +from .arrays import ArrowStringDtype, ArrowStringArray, ArrowListDtype, ArrowListArray +from .core import ( + CategoricalIndex, + DatetimeIndex, + Float64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, +) + +# noinspection PyUnresolvedReferences +from pandas import ( + Timedelta, + Timestamp, + offsets, + NaT, + Interval, + DateOffset, + BooleanDtype, + CategoricalDtype, + DatetimeTZDtype, + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + IntervalDtype, + SparseDtype, + StringDtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + PeriodDtype, +) + +try: + from pandas import NA, NamedAgg +except ImportError: # pragma: no cover + pass diff --git a/python/xorbits/_mars/dataframe/align.py b/python/xorbits/_mars/dataframe/align.py new file mode 100644 index 000000000..53f82ffcb --- /dev/null +++ b/python/xorbits/_mars/dataframe/align.py @@ -0,0 +1,975 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import operator + +import numpy as np +import pandas as pd + +from .. 
import opcodes as OperandDef +from ..core import OutputType +from ..core.operand import MapReduceOperand, OperandStage +from ..serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + KeyField, + ListField, +) +from .core import SERIES_CHUNK_TYPE +from .operands import DataFrameOperandMixin, DataFrameShuffleProxy +from .utils import ( + build_split_idx_to_origin_idx, + filter_dtypes, + filter_index_value, + hash_dtypes, + hash_index, + is_index_value_identical, + parse_index, + split_monotonic_index_min_max, + validate_axis, +) + + +class DataFrameIndexAlign(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_INDEX_ALIGN + + index_min = AnyField("index_min") + index_min_close = BoolField("index_min_close") + index_max = AnyField("index_max") + index_max_close = BoolField("index_max_close") + index_shuffle_size = Int32Field("index_shuffle_size", default=None) + column_min = AnyField("column_min") + column_min_close = BoolField("column_min_close") + column_max = AnyField("column_max") + column_max_close = BoolField("column_max_close") + column_shuffle_size = Int32Field("column_shuffle_size", default=None) + column_shuffle_segments = ListField("column_shuffle_segments", FieldTypes.series) + + input = KeyField("input") + + def __init__( + self, index_min_max=None, column_min_max=None, output_types=None, **kw + ): + if index_min_max is not None: + kw.update( + dict( + index_min=index_min_max[0], + index_min_close=index_min_max[1], + index_max=index_min_max[2], + index_max_close=index_min_max[3], + ) + ) + if column_min_max is not None: + kw.update( + dict( + column_min=column_min_max[0], + column_min_close=column_min_max[1], + column_max=column_min_max[2], + column_max_close=column_min_max[3], + ) + ) + super().__init__(_output_types=output_types, **kw) + + @property + def index_min_max(self): + if getattr(self, "index_min", None) is None: + return None + return ( + self.index_min, + self.index_min_close, + self.index_max, + self.index_max_close, + ) + + @property + def column_min_max(self): + if getattr(self, "column_min", None) is None: + return None + return ( + self.column_min, + self.column_min_close, + self.column_max, + self.column_max_close, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input = self._inputs[0] + + def build_map_chunk_kw(self, inputs, **kw): + if kw.get("index_value", None) is None and inputs[0].index_value is not None: + input_index_value = inputs[0].index_value + index_min_max = self.index_min_max + if index_min_max is not None: + kw["index_value"] = filter_index_value(input_index_value, index_min_max) + else: + kw["index_value"] = parse_index( + inputs[0].index_value.to_pandas(), + input_index_value, + type(self).__name__, + ) + if self.output_types[0] == OutputType.dataframe: + if ( + kw.get("columns_value", None) is None + and getattr(inputs[0], "columns_value", None) is not None + ): + input_columns_value = inputs[0].columns_value + input_dtypes = inputs[0].dtypes + column_min_max = self.column_min_max + if column_min_max is not None: + kw["columns_value"] = filter_index_value( + input_columns_value, column_min_max, store_data=True + ) + else: + kw["columns_value"] = parse_index( + inputs[0].columns_value.to_pandas(), + input_columns_value, + type(self).__name__, + ) + kw["dtypes"] = input_dtypes[kw["columns_value"].to_pandas()] + column_shuffle_size = self.column_shuffle_size + if column_shuffle_size is not None: + self.column_shuffle_segments = hash_dtypes( + input_dtypes, 
column_shuffle_size + ) + else: + if ( + kw.get("dtype", None) is None + and getattr(inputs[0], "dtype", None) is not None + ): + kw["dtype"] = inputs[0].dtype + if ( + kw.get("name", None) is None + and getattr(inputs[0], "name", None) is not None + ): + kw["name"] = inputs[0].name + return kw + + def build_reduce_chunk_kw(self, inputs, index, **kw): + kw["index"] = index + if ( + kw.get("index_value", None) is None + and inputs[0].inputs[0].index_value is not None + ): + index_align_map_chunks = inputs[0].inputs + if index_align_map_chunks[0].op.index_min_max is not None: + # shuffle on columns, all the DataFrameIndexAlignMap has the same index + kw["index_value"] = filter_index_value( + index_align_map_chunks[0].index_value, + index_align_map_chunks[0].op.index_min_max, + ) + else: + # shuffle on index + kw["index_value"] = parse_index( + index_align_map_chunks[0].index_value.to_pandas(), + [c.key for c in index_align_map_chunks], + type(self).__name__, + ) + if self.output_types[0] == OutputType.dataframe: + if ( + kw.get("columns_value", None) is None + and getattr(inputs[0].inputs[0], "columns_value", None) is not None + ): + index_align_map_chunks = inputs[0].inputs + if index_align_map_chunks[0].op.column_min_max is not None: + # shuffle on index + kw["columns_value"] = filter_index_value( + index_align_map_chunks[0].columns_value, + index_align_map_chunks[0].op.column_min_max, + store_data=True, + ) + kw["dtypes"] = index_align_map_chunks[0].dtypes[ + kw["columns_value"].to_pandas() + ] + else: + # shuffle on columns + all_dtypes = [ + c.op.column_shuffle_segments[index[1]] + for c in index_align_map_chunks + if c.index[0] == index_align_map_chunks[0].index[0] + ] + kw["dtypes"] = pd.concat(all_dtypes) + kw["columns_value"] = parse_index( + kw["dtypes"].index, store_data=True + ) + else: + if ( + kw.get("dtype", None) is None + and getattr(inputs[0].inputs[0], "dtype", None) is not None + ): + kw["dtype"] = inputs[0].inputs[0].dtype + if ( + kw.get("name", None) is None + and getattr(inputs[0].inputs[0], "name", None) is not None + ): + kw["name"] = inputs[0].inputs[0].name + return kw + + @classmethod + def execute_map(cls, ctx, op): + # TODO(QIN): add GPU support here + df = ctx[op.inputs[0].key] + + filters = [[], []] + + chunk = op.outputs[0] + if op.index_shuffle_size == -1: + # no shuffle and no min-max filter on index + filters[0].append(slice(None, None, None)) + elif op.index_shuffle_size is None: + # no shuffle on index + comp_op = operator.ge if op.index_min_close else operator.gt + index_cond = comp_op(df.index, op.index_min) + comp_op = operator.le if op.index_max_close else operator.lt + index_cond = index_cond & comp_op(df.index, op.index_max) + filters[0].append(index_cond) + else: + # shuffle on index + shuffle_size = op.index_shuffle_size + filters[0].extend(hash_index(df.index, shuffle_size)) + + if chunk.ndim == 1: + if len(filters[0]) == 1: + # no shuffle + ctx[chunk.key] = df.loc[filters[0][0]] + else: + for index_idx, index_filter in enumerate(filters[0]): + ctx[chunk.key, (index_idx,)] = ( + ctx.get_current_chunk().index, + df.loc[index_filter], + ) + return + + if op.column_shuffle_size == -1: + # no shuffle and no min-max filter on columns + filters[1].append(slice(None, None, None)) + if op.column_shuffle_size is None: + # no shuffle on columns + comp_op = operator.ge if op.column_min_close else operator.gt + columns_cond = comp_op(df.columns, op.column_min) + comp_op = operator.le if op.column_max_close else operator.lt + columns_cond = columns_cond & 
comp_op(df.columns, op.column_max) + filters[1].append(columns_cond) + else: + # shuffle on columns + shuffle_size = op.column_shuffle_size + filters[1].extend(hash_index(df.columns, shuffle_size)) + + if all(len(it) == 1 for it in filters): + # no shuffle + ctx[chunk.key] = df.loc[filters[0][0], filters[1][0]] + elif len(filters[0]) == 1: + # shuffle on columns + for column_idx, column_filter in enumerate(filters[1]): + shuffle_index = (chunk.index[0], column_idx) + ctx[chunk.key, shuffle_index] = ( + ctx.get_current_chunk().index, + df.loc[filters[0][0], column_filter], + ) + elif len(filters[1]) == 1: + # shuffle on index + for index_idx, index_filter in enumerate(filters[0]): + shuffle_index = (index_idx, chunk.index[1]) + ctx[chunk.key, shuffle_index] = ( + ctx.get_current_chunk().index, + df.loc[index_filter, filters[1][0]], + ) + else: + # full shuffle + shuffle_index_size = op.index_shuffle_size + shuffle_column_size = op.column_shuffle_size + out_idxes = itertools.product( + range(shuffle_index_size), range(shuffle_column_size) + ) + out_index_columns = itertools.product(*filters) + for out_idx, out_index_column in zip(out_idxes, out_index_columns): + index_filter, column_filter = out_index_column + ctx[chunk.key, out_idx] = ( + ctx.get_current_chunk().index, + df.loc[index_filter, column_filter], + ) + + @classmethod + def execute_reduce(cls, ctx, op: "DataFrameIndexAlign"): + chunk = op.outputs[0] + input_idx_to_df = dict(op.iter_mapper_data(ctx)) + row_idxes = sorted({idx[0] for idx in input_idx_to_df}) + if chunk.ndim == 2: + col_idxes = sorted({idx[1] for idx in input_idx_to_df}) + + ress = [] + for row_idx in row_idxes: + if chunk.ndim == 2: + row_dfs = [] + for col_idx in col_idxes: + row_dfs.append(input_idx_to_df[row_idx, col_idx]) + row_df = pd.concat(row_dfs, axis=1) + else: + row_df = input_idx_to_df[(row_idx,)] + + ress.append(row_df) + + ctx[chunk.key] = pd.concat(ress, axis=0) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + else: + cls.execute_reduce(ctx, op) + + +class _AxisMinMaxSplitInfo(object): + def __init__( + self, left_split, left_increase, right_split, right_increase, dummy=False + ): + self._left_split = left_split + self._right_split = right_split + self._dummy = dummy + + self._left_split_idx_to_origin_idx = build_split_idx_to_origin_idx( + self._left_split, left_increase + ) + self._right_split_idx_to_origin_idx = build_split_idx_to_origin_idx( + self._right_split, right_increase + ) + + def isdummy(self): + return self._dummy + + def get_origin_left_idx(self, idx): + return self._left_split_idx_to_origin_idx[idx][0] + + def get_origin_left_split(self, idx): + left_idx, left_inner_idx = self._left_split_idx_to_origin_idx[idx] + return self._left_split[left_idx][left_inner_idx] + + def get_origin_right_idx(self, idx): + return self._right_split_idx_to_origin_idx[idx][0] + + def get_origin_right_split(self, idx): + right_idx, right_inner_idx = self._right_split_idx_to_origin_idx[idx] + return self._right_split[right_idx][right_inner_idx] + + +class _MinMaxSplitInfo(object): + def __init__(self, row_min_max_split_info=None, col_min_max_split_info=None): + self.row_min_max_split_info = row_min_max_split_info + self.col_min_max_split_info = col_min_max_split_info + + def all_axes_can_split(self): + return ( + self.row_min_max_split_info is not None + and self.col_min_max_split_info is not None + ) + + def one_axis_can_split(self): + return (self.row_min_max_split_info is None) ^ ( + 
self.col_min_max_split_info is None + ) + + def no_axis_can_split(self): + return ( + self.row_min_max_split_info is None and self.col_min_max_split_info is None + ) + + def __getitem__(self, i): + return [self.row_min_max_split_info, self.col_min_max_split_info][i] + + def __setitem__(self, axis, axis_min_max_split_info): + assert axis in {0, 1} + if axis == 0: + self.row_min_max_split_info = axis_min_max_split_info + else: + self.col_min_max_split_info = axis_min_max_split_info + + def get_row_left_idx(self, out_idx): + return self.row_min_max_split_info.get_origin_left_idx(out_idx) + + def get_row_left_split(self, out_idx): + return self.row_min_max_split_info.get_origin_left_split(out_idx) + + def get_col_left_idx(self, out_idx): + return self.col_min_max_split_info.get_origin_left_idx(out_idx) + + def get_col_left_split(self, out_idx): + return self.col_min_max_split_info.get_origin_left_split(out_idx) + + def get_row_right_idx(self, out_idx): + return self.row_min_max_split_info.get_origin_right_idx(out_idx) + + def get_row_right_split(self, out_idx): + return self.row_min_max_split_info.get_origin_right_split(out_idx) + + def get_col_right_idx(self, out_idx): + return self.col_min_max_split_info.get_origin_right_idx(out_idx) + + def get_col_right_split(self, out_idx): + return self.col_min_max_split_info.get_origin_right_split(out_idx) + + def get_axis_idx(self, axis, left_or_right, out_idx): + if axis == 0: + if left_or_right == 0: + return self.get_row_left_idx(out_idx) + else: + assert left_or_right == 1 + return self.get_row_right_idx(out_idx) + else: + assert axis == 1 + if left_or_right == 0: + return self.get_col_left_idx(out_idx) + else: + assert left_or_right == 1 + return self.get_col_right_idx(out_idx) + + def get_axis_split(self, axis, left_or_right, out_idx): + if axis == 0: + if left_or_right == 0: + return self.get_row_left_split(out_idx) + else: + assert left_or_right == 1 + return self.get_row_right_split(out_idx) + else: + assert axis == 1 + if left_or_right == 0: + return self.get_col_left_split(out_idx) + else: + assert left_or_right == 1 + return self.get_col_right_split(out_idx) + + +def _get_chunk_index_min_max(index_chunks): + chunk_index_min_max = [] + for chunk in index_chunks: + min_val = chunk.min_val + min_val_close = chunk.min_val_close + max_val = chunk.max_val + max_val_close = chunk.max_val_close + if min_val is None or max_val is None: + chunk_index_min_max.append((None, True, None, True)) + else: + chunk_index_min_max.append((min_val, min_val_close, max_val, max_val_close)) + return chunk_index_min_max + + +def _get_monotonic_chunk_index_min_max(index, index_chunks): + chunk_index_min_max = _get_chunk_index_min_max(index_chunks) + if index.is_monotonic_decreasing: + return list(reversed(chunk_index_min_max)), False + + for j in range(len(chunk_index_min_max) - 1): + # overlap only if the prev max is close and curr min is close + # and they are identical + prev_max, prev_max_close = chunk_index_min_max[j][2:] + curr_min, curr_min_close = chunk_index_min_max[j + 1][:2] + if prev_max_close and curr_min_close and prev_max == curr_min: + return + return chunk_index_min_max, True + + +def _need_align_map( + input_chunk, + index_min_max, + column_min_max, + dummy_index_splits=False, + dummy_column_splits=False, +): + if isinstance(input_chunk, SERIES_CHUNK_TYPE): + if input_chunk.index_value is None: + return True + if input_chunk.index_value.min_max != index_min_max: + return True + else: + if not dummy_index_splits: + if ( + input_chunk.index_value is 
None + or input_chunk.index_value.min_max != index_min_max + ): + return True + if not dummy_column_splits: + if ( + input_chunk.columns_value is None + or input_chunk.columns_value.min_max != column_min_max + ): + return True + return False + + +def _is_index_identical(left, right): + if len(left) != len(right): + return False + for left_item, right_item in zip(left, right): + if left_item.key != right_item.key: + return False + return True + + +def _axis_need_shuffle(left_axis, right_axis, left_axis_chunks, right_axis_chunks): + if _is_index_identical(left_axis_chunks, right_axis_chunks): + return False + if ( + not left_axis.is_monotonic_increasing_or_decreasing + and len(left_axis_chunks) > 1 + ): + return True + if ( + not right_axis.is_monotonic_increasing_or_decreasing + and len(right_axis_chunks) > 1 + ): + return True + return False + + +def _calc_axis_splits(left_axis, right_axis, left_axis_chunks, right_axis_chunks): + if _axis_need_shuffle(left_axis, right_axis, left_axis_chunks, right_axis_chunks): + # do shuffle + out_chunk_size = max(len(left_axis_chunks), len(right_axis_chunks)) + return None, [np.nan for _ in range(out_chunk_size)] + else: + # no need to do shuffle on this axis + if _is_index_identical(left_axis_chunks, right_axis_chunks): + left_chunk_index_min_max = _get_chunk_index_min_max(left_axis_chunks) + right_splits = left_splits = [[c] for c in left_chunk_index_min_max] + right_increase = left_increase = None + elif len(left_axis_chunks) == 1 and len(right_axis_chunks) == 1: + left_splits = [_get_chunk_index_min_max(left_axis_chunks)] + left_increase = left_axis_chunks[0].is_monotonic_decreasing + right_splits = [_get_chunk_index_min_max(right_axis_chunks)] + right_increase = right_axis_chunks[0].is_monotonic_decreasing + else: + ( + left_chunk_index_min_max, + left_increase, + ) = _get_monotonic_chunk_index_min_max(left_axis, left_axis_chunks) + ( + right_chunk_index_min_max, + right_increase, + ) = _get_monotonic_chunk_index_min_max(right_axis, right_axis_chunks) + left_splits, right_splits = split_monotonic_index_min_max( + left_chunk_index_min_max, + left_increase, + right_chunk_index_min_max, + right_increase, + ) + splits = _AxisMinMaxSplitInfo( + left_splits, left_increase, right_splits, right_increase + ) + return splits, None + + +def _build_dummy_axis_split(chunk_shape): + axis_index_min_max, axis_increase = ( + [(i, True, i + 1, True) for i in range(chunk_shape)], + True, + ) + if len(axis_index_min_max) == 1: + left_splits, right_splits = [axis_index_min_max], [axis_index_min_max] + else: + left_splits, right_splits = split_monotonic_index_min_max( + axis_index_min_max, axis_increase, axis_index_min_max, axis_increase + ) + return _AxisMinMaxSplitInfo( + left_splits, axis_increase, right_splits, axis_increase, dummy=True + ) + + +def _gen_series_chunks(splits, out_shape, left_or_right, series): + out_chunks = [] + if splits[0] is not None: + # need no shuffle + for out_idx in range(out_shape[0]): + idx = splits.get_axis_idx(0, left_or_right, out_idx) + index_min_max = splits.get_axis_split(0, left_or_right, out_idx) + chunk = series.cix[(idx,)] + if _need_align_map(chunk, index_min_max, None): + align_op = DataFrameIndexAlign( + stage=OperandStage.map, + index_min_max=index_min_max, + column_min_max=None, + dtype=chunk.dtype, + sparse=series.issparse(), + output_types=[OutputType.series], + ) + params = align_op.build_map_chunk_kw( + [chunk], shape=(np.nan,), index=(out_idx,) + ) + out_chunk = align_op.new_chunk([chunk], **params) + else: + out_chunk 
= chunk + out_chunks.append(out_chunk) + else: + # gen map chunks + map_chunks = [] + for chunk in series.chunks: + map_op = DataFrameIndexAlign( + stage=OperandStage.map, + sparse=chunk.issparse(), + index_shuffle_size=out_shape[0], + output_types=[OutputType.series], + ) + params = map_op.build_map_chunk_kw( + [chunk], shape=(np.nan,), index=chunk.index + ) + map_chunks.append(map_op.new_chunk([chunk], **params)) + + proxy_chunk = DataFrameShuffleProxy(output_types=[OutputType.series]).new_chunk( + map_chunks, shape=() + ) + + # gen reduce chunks + for out_idx in range(out_shape[0]): + reduce_op = DataFrameIndexAlign( + stage=OperandStage.reduce, + n_reducers=out_shape[0], + i=out_idx, + sparse=proxy_chunk.issparse(), + output_types=[OutputType.series], + ) + params = reduce_op.build_reduce_chunk_kw( + [proxy_chunk], index=(out_idx,), shape=(np.nan,) + ) + out_chunks.append(reduce_op.new_chunk([proxy_chunk], **params)) + + return out_chunks + + +def _gen_dataframe_chunks(splits, out_shape, left_or_right, df): + out_chunks = [] + if splits.all_axes_can_split(): + # no shuffle for all axes + kw = { + "index_shuffle_size": -1 if splits[0].isdummy() else None, + "column_shuffle_size": -1 if splits[1].isdummy() else None, + } + for out_idx in itertools.product(*(range(s) for s in out_shape)): + row_idx = splits.get_axis_idx(0, left_or_right, out_idx[0]) + col_idx = splits.get_axis_idx(1, left_or_right, out_idx[1]) + index_min_max = splits.get_axis_split(0, left_or_right, out_idx[0]) + column_min_max = splits.get_axis_split(1, left_or_right, out_idx[1]) + chunk = df.cix[row_idx, col_idx] + if _need_align_map( + chunk, + index_min_max, + column_min_max, + splits[0].isdummy(), + splits[1].isdummy(), + ): + if splits[1].isdummy(): + dtypes = chunk.dtypes + else: + dtypes = filter_dtypes(chunk.dtypes, column_min_max) + chunk_kw = { + "index_value": chunk.index_value if splits[0].isdummy() else None, + "columns_value": chunk.columns_value + if splits[1].isdummy() + else None, + "dtypes": chunk.dtypes if splits[1].isdummy() else None, + } + align_op = DataFrameIndexAlign( + stage=OperandStage.map, + index_min_max=index_min_max, + column_min_max=column_min_max, + dtypes=dtypes, + sparse=chunk.issparse(), + output_types=[OutputType.dataframe], + **kw + ) + params = align_op.build_map_chunk_kw( + [chunk], shape=(np.nan, np.nan), index=out_idx, **chunk_kw + ) + out_chunk = align_op.new_chunk([chunk], **params) + else: + out_chunk = chunk + out_chunks.append(out_chunk) + elif splits.one_axis_can_split(): + # one axis needs shuffle + shuffle_axis = 0 if splits[0] is None else 1 + align_axis = 1 - shuffle_axis + + for align_axis_idx in range(out_shape[align_axis]): + if align_axis == 0: + kw = { + "index_min_max": splits.get_axis_split( + align_axis, left_or_right, align_axis_idx + ), + "index_shuffle_size": -1 if splits[0].isdummy() else None, + "column_shuffle_size": out_shape[shuffle_axis], + } + input_idx = splits.get_axis_idx( + align_axis, left_or_right, align_axis_idx + ) + else: + kw = { + "column_min_max": splits.get_axis_split( + align_axis, left_or_right, align_axis_idx + ), + "index_shuffle_size": out_shape[shuffle_axis], + "column_shuffle_size": -1 if splits[1].isdummy() else None, + } + input_idx = splits.get_axis_idx( + align_axis, left_or_right, align_axis_idx + ) + input_chunks = [c for c in df.chunks if c.index[align_axis] == input_idx] + map_chunks = [] + for j, input_chunk in enumerate(input_chunks): + chunk_kw = dict() + if align_axis == 0: + chunk_kw["index_value"] = ( + 
input_chunk.index_value if splits[0].isdummy() else None + ) + else: + chunk_kw["columns_value"] = ( + input_chunk.columns_value if splits[1].isdummy() else None + ) + chunk_kw["dtypes"] = input_chunk.dtypes + map_op = DataFrameIndexAlign( + stage=OperandStage.map, + sparse=input_chunk.issparse(), + output_types=[OutputType.dataframe], + **kw + ) + idx = [None, None] + idx[align_axis] = align_axis_idx + idx[shuffle_axis] = j + params = map_op.build_map_chunk_kw( + [input_chunk], shape=(np.nan, np.nan), index=tuple(idx), **chunk_kw + ) + map_chunks.append(map_op.new_chunk([input_chunk], **params)) + proxy_chunk = DataFrameShuffleProxy( + sparse=df.issparse(), output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + for j in range(out_shape[shuffle_axis]): + chunk_kw = dict() + if align_axis == 0: + chunk_kw["index_value"] = ( + proxy_chunk.inputs[0].inputs[0].index_value + if splits[0].isdummy() + else None + ) + else: + chunk_kw["columns_value"] = ( + proxy_chunk.inputs[0].inputs[0].columns_value + if splits[1].isdummy() + else None + ) + chunk_kw["dtypes"] = proxy_chunk.inputs[0].inputs[0].dtypes + reduce_idx = ( + (align_axis_idx, j) if align_axis == 0 else (j, align_axis_idx) + ) + reduce_op = DataFrameIndexAlign( + stage=OperandStage.reduce, + n_reducers=out_shape[shuffle_axis], + i=j, + sparse=proxy_chunk.issparse(), + output_types=[OutputType.dataframe], + ) + params = reduce_op.build_reduce_chunk_kw( + [proxy_chunk], shape=(np.nan, np.nan), index=reduce_idx, **chunk_kw + ) + out_chunks.append(reduce_op.new_chunk([proxy_chunk], **params)) + out_chunks.sort(key=lambda c: c.index) + else: + # all axes need shuffle + assert splits.no_axis_can_split() + + # gen map chunks + map_chunks = [] + for chunk in df.chunks: + map_op = DataFrameIndexAlign( + stage=OperandStage.map, + sparse=chunk.issparse(), + index_shuffle_size=out_shape[0], + column_shuffle_size=out_shape[1], + output_types=[OutputType.dataframe], + ) + params = map_op.build_map_chunk_kw( + [chunk], shape=(np.nan, np.nan), index=chunk.index + ) + map_chunks.append(map_op.new_chunk([chunk], **params)) + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + + # gen reduce chunks + out_indices = list(itertools.product(*(range(s) for s in out_shape))) + for out_idx in out_indices: + reduce_op = DataFrameIndexAlign( + stage=OperandStage.reduce, + n_reducers=len(out_indices), + i=out_idx, + sparse=proxy_chunk.issparse(), + output_types=[OutputType.dataframe], + ) + params = reduce_op.build_reduce_chunk_kw( + [proxy_chunk], index=out_idx, shape=(np.nan, np.nan) + ) + out_chunks.append(reduce_op.new_chunk([proxy_chunk], **params)) + + return out_chunks + + +def align_dataframe_dataframe(left, right, axis=None): + left_index_chunks = [c.index_value for c in left.cix[:, 0]] + right_index_chunks = [c.index_value for c in right.cix[:, 0]] + left_columns_chunks = [c.columns_value for c in left.cix[0, :]] + right_columns_chunks = [c.columns_value for c in right.cix[0, :]] + + axis = validate_axis(axis) if axis is not None else None + if axis is None or axis == 0: + index_splits, index_chunk_shape = _calc_axis_splits( + left.index_value, right.index_value, left_index_chunks, right_index_chunks + ) + else: + index_splits, index_chunk_shape = None, None + + if axis is None or axis == 1: + columns_splits, column_chunk_shape = _calc_axis_splits( + left.columns_value, + right.columns_value, + left_columns_chunks, + right_columns_chunks, + ) + else: + columns_splits, 
column_chunk_shape = None, None + + splits = _MinMaxSplitInfo(index_splits, columns_splits) + out_left_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._left_split))) + if index_splits is not None + else left.chunk_shape[0], + len(column_chunk_shape or list(itertools.chain(*columns_splits._left_split))) + if columns_splits is not None + else left.chunk_shape[1], + ) + if axis is None: + out_right_chunk_shape = out_left_chunk_shape + else: + out_right_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._right_split))) + if index_splits is not None + else right.chunk_shape[0], + len( + column_chunk_shape + or list(itertools.chain(*columns_splits._right_split)) + ) + if columns_splits is not None + else right.chunk_shape[1], + ) + left_chunks = _gen_dataframe_chunks(splits, out_left_chunk_shape, 0, left) + right_chunks = _gen_dataframe_chunks(splits, out_right_chunk_shape, 1, right) + + index_nsplits = columns_nsplits = None + if axis is None or axis == 0: + if _is_index_identical(left_index_chunks, right_index_chunks): + index_nsplits = left.nsplits[0] + else: + index_nsplits = [np.nan for _ in range(out_left_chunk_shape[0])] + if axis is None or axis == 1: + if _is_index_identical(left_columns_chunks, right_columns_chunks): + columns_nsplits = left.nsplits[1] + else: + columns_nsplits = [np.nan for _ in range(out_left_chunk_shape[1])] + + nsplits = [index_nsplits, columns_nsplits] + + out_chunk_shapes = (out_left_chunk_shape, out_right_chunk_shape) + return nsplits, out_chunk_shapes, left_chunks, right_chunks + + +def align_dataframe_series(left, right, axis="columns"): + axis = validate_axis(axis) + if axis == 1: + left_columns_chunks = [c.columns_value for c in left.cix[0, :]] + right_index_chunks = [c.index_value for c in right.chunks] + index_splits, chunk_shape = _calc_axis_splits( + left.columns_value, + right.index_value, + left_columns_chunks, + right_index_chunks, + ) + dummy_splits, dummy_nsplits = ( + _build_dummy_axis_split(left.chunk_shape[0]), + left.nsplits[0], + ) + out_chunk_shape = ( + len(dummy_nsplits), + len(chunk_shape or list(itertools.chain(*index_splits._left_split))), + ) + left_chunks = _gen_dataframe_chunks( + _MinMaxSplitInfo(dummy_splits, index_splits), out_chunk_shape, 0, left + ) + right_chunks = _gen_series_chunks( + _MinMaxSplitInfo(index_splits, None), (out_chunk_shape[1],), 1, right + ) + if _is_index_identical(left_columns_chunks, right_index_chunks): + index_nsplits = left.nsplits[1] + else: + index_nsplits = [np.nan for _ in range(out_chunk_shape[1])] + nsplits = [dummy_nsplits, index_nsplits] + else: + left_index_chunks = [c.index_value for c in left.cix[:, 0]] + right_index_chunks = [c.index_value for c in right.chunks] + index_splits, index_chunk_shape = _calc_axis_splits( + left.index_value, right.index_value, left_index_chunks, right_index_chunks + ) + + dummy_splits, dummy_nsplits = ( + _build_dummy_axis_split(left.chunk_shape[1]), + left.nsplits[1], + ) + out_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._left_split))), + len(dummy_nsplits), + ) + left_chunks = _gen_dataframe_chunks( + _MinMaxSplitInfo(index_splits, dummy_splits), out_chunk_shape, 0, left + ) + right_chunks = _gen_series_chunks( + _MinMaxSplitInfo(index_splits, None), (out_chunk_shape[0],), 1, right + ) + if _is_index_identical(left_index_chunks, right_index_chunks): + index_nsplits = left.nsplits[0] + else: + index_nsplits = [np.nan for _ in range(out_chunk_shape[0])] + nsplits = 
[index_nsplits, dummy_nsplits] + + return nsplits, out_chunk_shape, left_chunks, right_chunks + + +def align_series_series(left, right): + if is_index_value_identical(left, right): + # index identical, skip align + return left.nsplits, left.chunk_shape, left.chunks, right.chunks + + left_index_chunks = [c.index_value for c in left.chunks] + right_index_chunks = [c.index_value for c in right.chunks] + + index_splits, index_chunk_shape = _calc_axis_splits( + left.index_value, right.index_value, left_index_chunks, right_index_chunks + ) + + out_chunk_shape = ( + len(index_chunk_shape or list(itertools.chain(*index_splits._left_split))), + ) + splits = _MinMaxSplitInfo(index_splits, None) + + left_chunks = _gen_series_chunks(splits, out_chunk_shape, 0, left) + right_chunks = _gen_series_chunks(splits, out_chunk_shape, 1, right) + index_nsplits = [np.nan for _ in range(out_chunk_shape[0])] + nsplits = [index_nsplits] + return nsplits, out_chunk_shape, left_chunks, right_chunks diff --git a/python/xorbits/_mars/dataframe/arithmetic/__init__.py b/python/xorbits/_mars/dataframe/arithmetic/__init__.py new file mode 100644 index 000000000..5e80fa457 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/__init__.py @@ -0,0 +1,352 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
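+
+# This module wires arithmetic, comparison and bitwise operators onto Mars
+# DataFrame, Series and Index types, and wraps pandas' operator magic methods
+# so that expressions mixing pandas and Mars objects dispatch to the Mars
+# implementations.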
+ +import functools + +import pandas as pd + +try: + from pandas.core.arraylike import OpsMixin as PdOpsMixin +except ImportError: # pragma: no cover + PdOpsMixin = None + +from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE, is_build_mode +from ..ufunc.tensor import register_tensor_ufunc +from ..utils import wrap_notimplemented_exception +from .abs import DataFrameAbs, abs_ +from .add import DataFrameAdd, add, radd +from .arccos import DataFrameArccos +from .arccosh import DataFrameArccosh +from .arcsin import DataFrameArcsin +from .arcsinh import DataFrameArcsinh +from .arctan import DataFrameArctan +from .arctanh import DataFrameArctanh +from .around import DataFrameAround, around +from .bitwise_and import DataFrameAnd, bitand, rbitand +from .bitwise_or import DataFrameOr, bitor, rbitor +from .bitwise_xor import DataFrameXor, bitxor, rbitxor +from .ceil import DataFrameCeil +from .cos import DataFrameCos +from .cosh import DataFrameCosh +from .degrees import DataFrameDegrees +from .dot import dot, rdot +from .equal import DataFrameEqual, eq +from .exp import DataFrameExp +from .exp2 import DataFrameExp2 +from .expm1 import DataFrameExpm1 +from .floor import DataFrameFloor +from .floordiv import DataFrameFloorDiv, floordiv, rfloordiv +from .greater import DataFrameGreater, gt +from .greater_equal import DataFrameGreaterEqual, ge +from .invert import DataFrameNot, invert +from .is_ufuncs import DataFrameIsFinite, DataFrameIsInf, DataFrameIsNan +from .less import DataFrameLess, lt +from .less_equal import DataFrameLessEqual, le +from .log import DataFrameLog +from .log2 import DataFrameLog2 +from .log10 import DataFrameLog10 +from .mod import DataFrameMod, mod, rmod +from .multiply import DataFrameMul, mul, rmul +from .negative import DataFrameNegative, negative +from .not_equal import DataFrameNotEqual, ne +from .power import DataFramePower, power, rpower +from .radians import DataFrameRadians +from .sin import DataFrameSin +from .sinh import DataFrameSinh +from .sqrt import DataFrameSqrt +from .subtract import DataFrameSubtract, rsubtract, subtract +from .tan import DataFrameTan +from .tanh import DataFrameTanh +from .truediv import DataFrameTrueDiv, rtruediv, truediv + + +def _wrap_eq(): + @functools.wraps(eq) + def call(df, other, **kw): + if is_build_mode(): + return df._equals(other) + return _wrap_comparison(eq)(df, other, **kw) + + return call + + +def _wrap_comparison(func): + @functools.wraps(func) + def call(df, other, **kw): + if isinstance(df, DATAFRAME_TYPE) and isinstance(other, DATAFRAME_TYPE): + # index and columns should be identical + for index_type in ["index_value", "columns_value"]: + left, right = getattr(df, index_type), getattr(other, index_type) + if left.has_value() and right.has_value(): + # if df and other's index or columns has value + index_eq = left.to_pandas().equals(right.to_pandas()) + else: + index_eq = left.key == right.key + if not index_eq: + raise ValueError( + "Can only compare identically-labeled DataFrame object" + ) + return wrap_notimplemented_exception(func)(df, other, **kw) + + return call + + +_reverse_magic_names = { + "eq": "eq", + "ne": "ne", + "lt": "ge", + "le": "gt", + "gt": "le", + "ge": "lt", +} + + +def _wrap_pandas_magics(cls, magic_name: str): + magic_func_name = f"__{magic_name}__" + magic_rfunc_name = _reverse_magic_names.get(magic_name, f"__r{magic_name}__") + try: + raw_method = getattr(cls, magic_func_name) + except AttributeError: + return + + @functools.wraps(raw_method) + def wrapped(self, other): + if not 
isinstance(other, (DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE)): + return raw_method(self, other) + + try: + val = getattr(other, magic_rfunc_name)(self) + except AttributeError: # pragma: no cover + return raw_method(self, other) + + if val is NotImplemented: # pragma: no cover + return raw_method(self, other) + return val + + setattr(cls, magic_func_name, wrapped) + + +def _install(): + def _register_method(cls, name, func, wrapper=None): + if wrapper is None: + + @functools.wraps(func) + def wrapper(df, *args, **kwargs): + return func(df, *args, **kwargs) + + try: + if issubclass(cls, DATAFRAME_TYPE): + wrapper.__doc__ = func.__frame_doc__ + elif issubclass(cls, SERIES_TYPE): + wrapper.__doc__ = func.__series_doc__ + else: + wrapper = func + except AttributeError: + wrapper = func + + wrapper.__name__ = func.__name__ + setattr(cls, name, wrapper) + + def _register_bin_method(cls, name, func): + def call_df_fill(df, other, axis="columns", level=None, fill_value=None): + return func(df, other, axis=axis, level=level, fill_value=fill_value) + + def call_df_no_fill(df, other, axis="columns", level=None): + return func(df, other, axis=axis, level=level) + + def call_series_fill(df, other, level=None, fill_value=None, axis=0): + return func(df, other, axis=axis, level=level, fill_value=fill_value) + + def call_series_no_fill(df, other, level=None, axis=0): + return func(df, other, axis=axis, level=level) + + if issubclass(cls, DATAFRAME_TYPE): + call = ( + call_df_fill + if "fill_value" in func.__code__.co_varnames + else call_df_no_fill + ) + elif issubclass(cls, SERIES_TYPE): + call = ( + call_series_fill + if "fill_value" in func.__code__.co_varnames + else call_series_no_fill + ) + else: + call = None + return _register_method(cls, name, func, wrapper=call) + + # register mars tensor ufuncs + ufunc_ops = [ + # unary + DataFrameAbs, + DataFrameLog, + DataFrameLog2, + DataFrameLog10, + DataFrameSin, + DataFrameCos, + DataFrameTan, + DataFrameSinh, + DataFrameCosh, + DataFrameTanh, + DataFrameArcsin, + DataFrameArccos, + DataFrameArctan, + DataFrameArcsinh, + DataFrameArccosh, + DataFrameArctanh, + DataFrameRadians, + DataFrameDegrees, + DataFrameCeil, + DataFrameFloor, + DataFrameAround, + DataFrameExp, + DataFrameExp2, + DataFrameExpm1, + DataFrameSqrt, + DataFrameNot, + DataFrameIsNan, + DataFrameIsInf, + DataFrameIsFinite, + DataFrameNegative, + # binary + DataFrameAdd, + DataFrameEqual, + DataFrameFloorDiv, + DataFrameGreater, + DataFrameGreaterEqual, + DataFrameLess, + DataFrameLessEqual, + DataFrameAnd, + DataFrameOr, + DataFrameXor, + DataFrameMod, + DataFrameMul, + DataFrameNotEqual, + DataFramePower, + DataFrameSubtract, + DataFrameTrueDiv, + ] + for ufunc_op in ufunc_ops: + register_tensor_ufunc(ufunc_op) + + for entity in DATAFRAME_TYPE + SERIES_TYPE: + setattr(entity, "__abs__", abs_) + setattr(entity, "abs", abs_) + _register_method(entity, "round", around) + setattr(entity, "__invert__", invert) + + setattr(entity, "__add__", wrap_notimplemented_exception(add)) + setattr(entity, "__radd__", wrap_notimplemented_exception(radd)) + _register_bin_method(entity, "add", add) + _register_bin_method(entity, "radd", radd) + + setattr(entity, "__sub__", wrap_notimplemented_exception(subtract)) + setattr(entity, "__rsub__", wrap_notimplemented_exception(rsubtract)) + _register_bin_method(entity, "sub", subtract) + _register_bin_method(entity, "rsub", rsubtract) + + setattr(entity, "__mul__", wrap_notimplemented_exception(mul)) + setattr(entity, "__rmul__", 
wrap_notimplemented_exception(rmul)) + _register_bin_method(entity, "mul", mul) + _register_bin_method(entity, "multiply", mul) + _register_bin_method(entity, "rmul", rmul) + + setattr(entity, "__floordiv__", wrap_notimplemented_exception(floordiv)) + setattr(entity, "__rfloordiv__", wrap_notimplemented_exception(rfloordiv)) + setattr(entity, "__truediv__", wrap_notimplemented_exception(truediv)) + setattr(entity, "__rtruediv__", wrap_notimplemented_exception(rtruediv)) + setattr(entity, "__div__", wrap_notimplemented_exception(truediv)) + setattr(entity, "__rdiv__", wrap_notimplemented_exception(rtruediv)) + _register_bin_method(entity, "floordiv", floordiv) + _register_bin_method(entity, "rfloordiv", rfloordiv) + _register_bin_method(entity, "truediv", truediv) + _register_bin_method(entity, "rtruediv", rtruediv) + _register_bin_method(entity, "div", truediv) + _register_bin_method(entity, "rdiv", rtruediv) + + setattr(entity, "__mod__", wrap_notimplemented_exception(mod)) + setattr(entity, "__rmod__", wrap_notimplemented_exception(rmod)) + _register_bin_method(entity, "mod", mod) + _register_bin_method(entity, "rmod", rmod) + + setattr(entity, "__pow__", wrap_notimplemented_exception(power)) + setattr(entity, "__rpow__", wrap_notimplemented_exception(rpower)) + _register_bin_method(entity, "pow", power) + _register_bin_method(entity, "rpow", rpower) + + setattr(entity, "__eq__", _wrap_eq()) + setattr(entity, "__ne__", _wrap_comparison(ne)) + setattr(entity, "__lt__", _wrap_comparison(lt)) + setattr(entity, "__gt__", _wrap_comparison(gt)) + setattr(entity, "__ge__", _wrap_comparison(ge)) + setattr(entity, "__le__", _wrap_comparison(le)) + _register_bin_method(entity, "eq", eq) + _register_bin_method(entity, "ne", ne) + _register_bin_method(entity, "lt", lt) + _register_bin_method(entity, "gt", gt) + _register_bin_method(entity, "ge", ge) + _register_bin_method(entity, "le", le) + + setattr(entity, "__matmul__", dot) + setattr(entity, "__rmatmul__", rdot) + _register_method(entity, "dot", dot) + + setattr(entity, "__and__", wrap_notimplemented_exception(bitand)) + setattr(entity, "__rand__", wrap_notimplemented_exception(rbitand)) + + setattr(entity, "__or__", wrap_notimplemented_exception(bitor)) + setattr(entity, "__ror__", wrap_notimplemented_exception(rbitor)) + + setattr(entity, "__xor__", wrap_notimplemented_exception(bitxor)) + setattr(entity, "__rxor__", wrap_notimplemented_exception(rbitxor)) + + setattr(entity, "__neg__", wrap_notimplemented_exception(negative)) + + for entity in INDEX_TYPE: + setattr(entity, "__eq__", _wrap_eq()) + + if PdOpsMixin is not None and not hasattr( + pd, "_mars_df_arith_wrapped" + ): # pragma: no branch + # wrap pandas magic functions to intercept reverse operands + for magic_name in [ + "add", + "sub", + "mul", + "div", + "truediv", + "floordiv", + "mod", + "pow", + "and", + "or", + "xor", + "eq", + "ne", + "lt", + "le", + "gt", + "ge", + ]: + _wrap_pandas_magics(PdOpsMixin, magic_name) + + for pd_cls in (pd.DataFrame, pd.Series): + _wrap_pandas_magics(pd_cls, "matmul") + + pd._mars_df_arith_wrapped = True + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/arithmetic/abs.py b/python/xorbits/_mars/dataframe/arithmetic/abs.py new file mode 100644 index 000000000..8b706adae --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/abs.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameAbs(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ABS + _func_name = "abs" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorAbsolute + + return TensorAbsolute + + +def abs_(df): + op = DataFrameAbs() + return op(df) diff --git a/python/xorbits/_mars/dataframe/arithmetic/add.py b/python/xorbits/_mars/dataframe/arithmetic/add.py new file mode 100644 index 000000000..9c661f402 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/add.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameAdd(DataFrameBinopUfunc): + _op_type_ = OperandDef.ADD + + _func_name = "add" + _rfunc_name = "radd" + + @classproperty + def _operator(self): + return operator.add + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorAdd + + return TensorAdd + + +_add_example = """ +>>> a.add(b, fill_value=0).execute() +a 2.0 +b 1.0 +c 1.0 +d 1.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Addition", equiv="+", series_example=_add_example) +def add(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAdd(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +@bin_arithmetic_doc("Addition", equiv="+", series_example=_add_example) +def radd(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAdd(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/arccos.py b/python/xorbits/_mars/dataframe/arithmetic/arccos.py new file mode 100644 index 000000000..3cd366e39 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arccos.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
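
Editor's note: for reference, the eager pandas behaviour that `add`/`radd` reproduce chunk by chunk can be seen directly with the same data as `_add_example`; this illustration uses pandas only and is not part of the module.

import numpy as np
import pandas as pd

a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"])

# indices are unioned; a value missing on one side is treated as fill_value,
# while positions missing in both inputs (index "e") stay NaN
print(a.add(b, fill_value=0))
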
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArccos(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCCOS + _func_name = "arccos" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArccos + + return TensorArccos diff --git a/python/xorbits/_mars/dataframe/arithmetic/arccosh.py b/python/xorbits/_mars/dataframe/arithmetic/arccosh.py new file mode 100644 index 000000000..f2612b3d4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arccosh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArccosh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCCOSH + _func_name = "arccosh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArccosh + + return TensorArccosh diff --git a/python/xorbits/_mars/dataframe/arithmetic/arcsin.py b/python/xorbits/_mars/dataframe/arithmetic/arcsin.py new file mode 100644 index 000000000..8da1008c4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arcsin.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArcsin(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCSIN + _func_name = "arcsin" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArcsin + + return TensorArcsin diff --git a/python/xorbits/_mars/dataframe/arithmetic/arcsinh.py b/python/xorbits/_mars/dataframe/arithmetic/arcsinh.py new file mode 100644 index 000000000..4918323b1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arcsinh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
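
Editor's note: these unary operands all follow the same recipe — `_func_name` names the pandas/numpy function to apply and `tensor_op_type` points at the matching Mars tensor operand. Since pandas objects expose no `arcsin` method, execution falls back to the numpy ufunc (see `DataFrameUnaryOpMixin.execute` in `core.py` later in this diff). An eager sketch of that per-chunk computation:

import numpy as np
import pandas as pd

s = pd.Series([0.0, 0.5, 1.0])
print(hasattr(s, "arcsin"))   # False, so the numpy fallback branch applies
print(np.arcsin(s))           # element-wise arcsine, still a pandas Series
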
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArcsinh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCSINH + _func_name = "arcsinh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArcsinh + + return TensorArcsinh diff --git a/python/xorbits/_mars/dataframe/arithmetic/arctan.py b/python/xorbits/_mars/dataframe/arithmetic/arctan.py new file mode 100644 index 000000000..f3a45aba6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arctan.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArctan(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCTAN + _func_name = "arctan" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArctan + + return TensorArctan diff --git a/python/xorbits/_mars/dataframe/arithmetic/arctanh.py b/python/xorbits/_mars/dataframe/arithmetic/arctanh.py new file mode 100644 index 000000000..e4f9698d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/arctanh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameArctanh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.ARCTANH + _func_name = "arctanh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorArctanh + + return TensorArctanh diff --git a/python/xorbits/_mars/dataframe/arithmetic/around.py b/python/xorbits/_mars/dataframe/arithmetic/around.py new file mode 100644 index 000000000..d41f1a409 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/around.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. 
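
Editor's note: `classproperty` is imported from `...utils`; it is assumed here to behave like the minimal descriptor below (a property readable on the class itself), which is how `tensor_op_type` and `_operator` are consumed. Sketch only, not the actual helper.

class classproperty:
    def __init__(self, fget):
        self.fget = fget

    def __get__(self, obj, owner=None):
        # resolve against the class, whether accessed on the class or an instance
        return self.fget(owner if owner is not None else type(obj))

class Demo:
    @classproperty
    def answer(cls):
        return 42

print(Demo.answer)     # 42, no instance needed
print(Demo().answer)   # 42
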
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameAround(DataFrameUnaryUfunc): + _op_type_ = OperandDef.AROUND + _func_name = "around" + + _decimals = Int32Field("decimals") + + def __init__(self, decimals=None, output_types=None, **kw): + super().__init__(_decimals=decimals, output_types=output_types, **kw) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorAround + + return TensorAround + + @property + def decimals(self): + return self._decimals + + @classmethod + def execute(cls, ctx, op): + df = ctx[op.inputs[0].key] + func_name = getattr(cls, "_func_name") + if hasattr(df, func_name): + ctx[op.outputs[0].key] = getattr(df, func_name)(decimals=op.decimals) + else: + ctx[op.outputs[0].key] = getattr(np, func_name)(df, decimals=op.decimals) + + +def around(df, decimals=0, *args, **kwargs): + if len(args) > 0: + raise TypeError( + f"round() takes 0 positional arguments but {len(args)} was given" + ) + op = DataFrameAround(decimals=decimals, **kwargs) + return op(df) + + +around.__frame_doc__ = """ +Round a DataFrame to a variable number of decimal places. + +Parameters +---------- +decimals : int, dict, Series + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. +*args + Additional keywords have no effect but might be accepted for + compatibility with numpy. +**kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + +Returns +------- +DataFrame + A DataFrame with the affected columns rounded to the specified + number of decimal places. + +See Also +-------- +numpy.around : Round a numpy array to the given number of decimals. +Series.round : Round a Series to the given number of decimals. + +Examples +-------- +>>> import mars.dataframe as md +>>> df = md.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], +... 
columns=['dogs', 'cats']) +>>> df.execute() + dogs cats +0 0.21 0.32 +1 0.01 0.67 +2 0.66 0.03 +3 0.21 0.18 + +By providing an integer each column is rounded to the same number +of decimal places + +>>> df.round(1).execute() + dogs cats +0 0.2 0.3 +1 0.0 0.7 +2 0.7 0.0 +3 0.2 0.2 + +With a dict, the number of places for specific columns can be +specified with the column names as key and the number of decimal +places as value + +>>> df.round({'dogs': 1, 'cats': 0}).execute() + dogs cats +0 0.2 0.0 +1 0.0 1.0 +2 0.7 0.0 +3 0.2 0.0 + +Using a Series, the number of places for specific columns can be +specified with the column names as index and the number of +decimal places as value + +>>> decimals = md.Series([0, 1], index=['cats', 'dogs']) +>>> df.round(decimals).execute() + dogs cats +0 0.2 0.0 +1 0.0 1.0 +2 0.7 0.0 +3 0.2 0.0 +""" +around.__series_doc__ = """ +Round each value in a Series to the given number of decimals. + +Parameters +---------- +decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. + +Returns +------- +Series + Rounded values of the Series. + +See Also +-------- +numpy.around : Round values of an np.array. +DataFrame.round : Round values of a DataFrame. + +Examples +-------- +>>> import mars.tensor as mt +>>> import mars.dataframe as md +>>> s = md.Series([0.1, 1.3, 2.7]) +>>> s.round().execute() +0 0.0 +1 1.0 +2 3.0 +dtype: float64 +""" diff --git a/python/xorbits/_mars/dataframe/arithmetic/bitwise_and.py b/python/xorbits/_mars/dataframe/arithmetic/bitwise_and.py new file mode 100644 index 000000000..71970ab6b --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/bitwise_and.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc + + +class DataFrameAnd(DataFrameBinopUfunc): + _op_type_ = OperandDef.AND + + _bit_func_name = "__and__" + _bit_rfunc_name = "__rand__" + + @classproperty + def _operator(self): + return operator.and_ + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorBitand + + return TensorBitand + + +def bitand(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAnd(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +def rbitand(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameAnd(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/bitwise_or.py b/python/xorbits/_mars/dataframe/arithmetic/bitwise_or.py new file mode 100644 index 000000000..eb12d4964 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/bitwise_or.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. 
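
Editor's note on `DataFrameAround.execute` above: pandas objects expose `round` but not `around`, so the `hasattr` check fails and the numpy branch is taken. The eager per-chunk equivalent, as an illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({"dogs": [0.21, 0.01], "cats": [0.32, 0.67]})
print(hasattr(df, "around"))        # False -> numpy fallback branch
print(np.around(df, decimals=1))    # matches df.round(1)
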
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import TreeReductionBuilder, classproperty +from .core import DataFrameArithmeticTreeMixin, DataFrameBinopUfunc + + +class DataFrameOr(DataFrameBinopUfunc): + _op_type_ = OperandDef.OR + + _bit_func_name = "__or__" + _bit_rfunc_name = "__ror__" + + @classproperty + def _operator(self): + return operator.or_ + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorBitor + + return TensorBitor + + +class DataFrameTreeOr(DataFrameArithmeticTreeMixin, DataFrameOr): + _op_type_ = OperandDef.TREE_OR + + +def bitor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameOr(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +def rbitor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameOr(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) + + +def tree_dataframe_or( + *args, index=None, combine_size=None, axis="columns", level=None, fill_value=None +): + class MultiplyBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + op = DataFrameTreeOr( + axis=axis, + level=level, + fill_value=fill_value, + output_types=inputs[0].op.output_types, + ) + params = inputs[0].params.copy() + params["index"] = index + return op.new_chunk(inputs, **params) + + return MultiplyBuilder(combine_size).build(args) diff --git a/python/xorbits/_mars/dataframe/arithmetic/bitwise_xor.py b/python/xorbits/_mars/dataframe/arithmetic/bitwise_xor.py new file mode 100644 index 000000000..14230dfb6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/bitwise_xor.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc + + +class DataFrameXor(DataFrameBinopUfunc): + _op_type_ = OperandDef.XOR + + _bit_func_name = "__xor__" + _bit_rfunc_name = "__rxor__" + + @classproperty + def _operator(self): + return operator.xor + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorBitxor + + return TensorBitxor + + +def bitxor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameXor(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +def rbitxor(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameXor(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/ceil.py b/python/xorbits/_mars/dataframe/arithmetic/ceil.py new file mode 100644 index 000000000..f277b1b8f --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/ceil.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameCeil(DataFrameUnaryUfunc): + _op_type_ = OperandDef.CEIL + _func_name = "ceil" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorCeil + + return TensorCeil diff --git a/python/xorbits/_mars/dataframe/arithmetic/core.py b/python/xorbits/_mars/dataframe/arithmetic/core.py new file mode 100644 index 000000000..7fb06ecca --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/core.py @@ -0,0 +1,832 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
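
Editor's note: `DataFrameTreeOr` above combines its inputs with a `TreeReductionBuilder`, i.e. a tree reduction whose fan-in is bounded by `combine_size` rather than one long sequential chain. A generic, self-contained sketch of that idea (not the Mars API):

from functools import reduce
import operator

def tree_reduce(items, op, combine_size=4):
    # repeatedly combine groups of at most `combine_size` inputs until one value remains
    while len(items) > 1:
        items = [
            reduce(op, items[i:i + combine_size])
            for i in range(0, len(items), combine_size)
        ]
    return items[0]

print(tree_reduce([1, 2, 4, 8, 16, 32, 64], operator.or_))   # 127
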
+ +import copy +import itertools +from functools import reduce + +import numpy as np +import pandas as pd + +from ...core import CHUNK_TYPE, ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, Chunk, ChunkData +from ...utils import classproperty, get_dtype +from ..align import ( + align_dataframe_dataframe, + align_dataframe_series, + align_series_series, +) +from ..core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + is_chunk_meta_lazy, +) +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..ufunc.tensor import TensorUfuncMixin +from ..utils import ( + build_empty_df, + infer_dtype, + infer_dtypes, + infer_index_value, + parse_index, +) + + +class DataFrameBinOpMixin(DataFrameOperandMixin): + @classmethod + def _tile_both_dataframes(cls, op): + # if both of the inputs are DataFrames, axis is just ignored + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shapes, left_chunks, right_chunks = align_dataframe_dataframe( + left, right + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shapes[0])) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + out_chunk_indexes, left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [left_chunk, right_chunk], + shape=(nsplits[0][idx[0]], nsplits[1][idx[1]]), + index=idx, + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_both_series(cls, op): + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_series_series(left, right) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + range(out_shape[0]), left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [left_chunk, right_chunk], shape=(nsplits[0][idx],), index=(idx,) + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + + @classmethod + def _tile_dataframe_series(cls, op): + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series( + left, right, axis=op.axis + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, left_chunks): + if op.axis == "columns" or op.axis == 1: + series_chunk = right_chunks[out_idx[1]] + kw = { + "shape": (nsplits[0][out_idx[0]], nsplits[1][out_idx[1]]), + "index_value": df_chunk.index_value, + "dtypes_value": df_chunk.dtypes_value, + } + else: + series_chunk = right_chunks[out_idx[0]] + kw = { + "shape": (nsplits[0][out_idx[0]], nsplits[1][out_idx[1]]), + "columns_value": df_chunk.columns_value, + "dtypes_value": df_chunk.dtypes_value, + } + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([df_chunk, series_chunk], index=out_idx, **kw) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + 
index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_series_dataframe(cls, op): + left, right = op.lhs, op.rhs + df = op.outputs[0] + + nsplits, out_shape, right_chunks, left_chunks = align_dataframe_series( + right, left, axis=op.axis + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, right_chunks): + if op.axis == "columns" or op.axis == 1: + series_chunk = left_chunks[out_idx[1]] + kw = { + "shape": (df_chunk.shape[0], np.nan), + "index_value": df_chunk.index_value, + "dtypes_value": df_chunk.dtypes_value, + } + else: + series_chunk = left_chunks[out_idx[0]] + kw = { + "shape": (df_chunk.shape[0], np.nan), + "index_value": df_chunk.index_value, + "dtypes_value": df_chunk.dtypes_value, + } + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([series_chunk, df_chunk], index=out_idx, **kw) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_scalar(cls, op): + tileable = op.rhs if pd.api.types.is_scalar(op.lhs) else op.lhs + df = op.outputs[0] + out_chunks = [] + lazy_chunk_meta = is_chunk_meta_lazy(tileable.chunks[0]) + for chunk in tileable.chunks: + out_op = op.copy().reset_key() + if chunk.ndim == 2: + if lazy_chunk_meta: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + ) + out_chunk._set_tileable_meta( + tileable_key=df.key, + nsplits=tileable.nsplits, + index_value=df.index_value, + columns_value=df.columns_value, + dtypes=df.dtypes, + ) + else: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtypes=chunk.dtypes, + index_value=chunk.index_value, + columns_value=getattr(chunk, "columns_value"), + ) + else: + if lazy_chunk_meta: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtype=chunk.dtype, + name=getattr(chunk, "name"), + ) + out_chunk._set_tileable_meta( + tileable_key=df.key, + nsplits=tileable.nsplits, + index_value=df.index_value, + ) + else: + out_chunk = out_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtype=chunk.dtype, + index_value=chunk.index_value, + name=getattr(chunk, "name"), + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = tileable.nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_with_tensor(cls, op): + out = op.outputs[0] + axis = op.axis + if axis is None: + axis = 0 + + rhs_is_tensor = isinstance(op.rhs, TENSOR_TYPE) + tensor, other = (op.rhs, op.lhs) if rhs_is_tensor else (op.lhs, op.rhs) + if tensor.shape == other.shape: + tensor = yield from recursive_tile(tensor.rechunk(other.nsplits)) + else: + # shape differs only when dataframe add 1-d tensor, we need rechunk on columns axis. 
+ if axis in ["columns", 1] and other.ndim == 1: + # force axis == 0 if it's Series other than DataFrame + axis = 0 + rechunk_size = ( + other.nsplits[1] if axis == "columns" or axis == 1 else other.nsplits[0] + ) + if tensor.ndim > 0: + tensor = yield from recursive_tile(tensor.rechunk((rechunk_size,))) + + out_chunks = [] + for out_index in itertools.product(*(map(range, other.chunk_shape))): + tensor_chunk = tensor.cix[out_index[: tensor.ndim]] + other_chunk = other.cix[out_index] + out_op = op.copy().reset_key() + inputs = ( + [other_chunk, tensor_chunk] + if rhs_is_tensor + else [tensor_chunk, other_chunk] + ) + if isinstance(other_chunk, DATAFRAME_CHUNK_TYPE): + cum_splits = [0] + np.cumsum(other.nsplits[1]).tolist() + start = cum_splits[out_index[1]] + end = cum_splits[out_index[1] + 1] + chunk_dtypes = out.dtypes.iloc[start:end] + out_chunk = out_op.new_chunk( + inputs, + shape=other_chunk.shape, + index=other_chunk.index, + dtypes=chunk_dtypes, + index_value=other_chunk.index_value, + columns_value=other_chunk.columns_value, + ) + else: + out_chunk = out_op.new_chunk( + inputs, + shape=other_chunk.shape, + index=other_chunk.index, + dtype=out.dtype, + index_value=other_chunk.index_value, + name=other_chunk.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + if isinstance(other, SERIES_TYPE): + return new_op.new_seriess( + op.inputs, + other.shape, + nsplits=other.nsplits, + dtype=out.dtype, + name=other.name, + index_value=other.index_value, + chunks=out_chunks, + ) + else: + return new_op.new_dataframes( + op.inputs, + other.shape, + nsplits=other.nsplits, + dtypes=out.dtypes, + index_value=other.index_value, + columns_value=other.columns_value, + chunks=out_chunks, + ) + + @classmethod + def tile(cls, op): + if len(op.inputs) < 2: + return cls._tile_scalar(op) + elif isinstance(op.inputs[0], DATAFRAME_TYPE) and isinstance( + op.inputs[1], DATAFRAME_TYPE + ): + return cls._tile_both_dataframes(op) + elif isinstance(op.inputs[0], SERIES_TYPE) and isinstance( + op.inputs[1], SERIES_TYPE + ): + return cls._tile_both_series(op) + elif isinstance(op.inputs[0], DATAFRAME_TYPE) and isinstance( + op.inputs[1], SERIES_TYPE + ): + return cls._tile_dataframe_series(op) + elif isinstance(op.inputs[0], SERIES_TYPE) and isinstance( + op.inputs[1], DATAFRAME_TYPE + ): + return cls._tile_series_dataframe(op) + elif isinstance(op.inputs[0], TENSOR_TYPE) or isinstance( + op.inputs[1], TENSOR_TYPE + ): + return (yield from cls._tile_with_tensor(op)) + + @classmethod + def execute(cls, ctx, op): + if getattr(cls, "_func_name", None) is not None: + if len(op.inputs) == 2: + df, other = ctx[op.inputs[0].key], ctx[op.inputs[1].key] + if isinstance(op.inputs[0], SERIES_CHUNK_TYPE) and isinstance( + op.inputs[1], DATAFRAME_CHUNK_TYPE + ): + df, other = other, df + func_name = getattr(cls, "_rfunc_name") + else: + func_name = getattr(cls, "_func_name") + elif pd.api.types.is_scalar(op.lhs) or isinstance(op.lhs, np.ndarray): + df = ctx[op.rhs.key] + other = op.lhs + func_name = getattr(cls, "_rfunc_name") + else: + df = ctx[op.lhs.key] + other = op.rhs + func_name = getattr(cls, "_func_name") + if df.ndim == 2: + kw = dict(axis=op.axis) + else: + kw = dict() + if op.fill_value is not None: + # comparison function like eq does not have `fill_value` + kw["fill_value"] = op.fill_value + if op.level is not None: + # logical function like and may don't have `level` (for Series type) + kw["level"] = op.level + if hasattr(other, "ndim") and other.ndim == 0: + other = other.item() + 
ctx[op.outputs[0].key] = getattr(df, func_name)(other, **kw) + else: + inputs_iter = iter(op.inputs) + if not pd.api.types.is_scalar(op.lhs): + lhs = ctx[next(inputs_iter).key] + else: + lhs = op.lhs + if not pd.api.types.is_scalar(op.rhs): + rhs = ctx[next(inputs_iter).key] + else: + rhs = op.rhs + ctx[op.outputs[0].key] = cls._operator( + lhs, rhs + ) # pylint: disable=too-many-function-args + + @classproperty + def _operator(self): + raise NotImplementedError + + @classmethod + def _calc_properties(cls, x1, x2=None, axis="columns"): + is_chunk = isinstance(x1, CHUNK_TYPE) + + if isinstance(x1, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)) and ( + x2 is None + or pd.api.types.is_scalar(x2) + or isinstance(x2, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)) + ): + if not is_chunk: + if pd.api.types.is_scalar(x2): + dtypes = cls._operator(build_empty_df(x1.dtypes), x2).dtypes + elif x1.dtypes is not None and isinstance(x2, TENSOR_TYPE): + dtypes = pd.Series( + [infer_dtype(dt, x2.dtype, cls._operator) for dt in x1.dtypes], + index=x1.dtypes.index, + ) + else: # pragma: no cover + dtypes = x1.dtypes + return { + "shape": x1.shape, + "dtypes": dtypes, + "columns_value": x1.columns_value, + "index_value": x1.index_value, + } + else: + return {"shape": x1.shape} + + if isinstance(x1, (SERIES_TYPE, SERIES_CHUNK_TYPE)) and ( + x2 is None + or pd.api.types.is_scalar(x2) + or isinstance(x2, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)) + ): + x2_dtype = x2.dtype if hasattr(x2, "dtype") else type(x2) + x2_dtype = get_dtype(x2_dtype) + if hasattr(cls, "return_dtype"): + dtype = cls.return_dtype + else: + dtype = infer_dtype(x1.dtype, x2_dtype, cls._operator) + ret = {"shape": x1.shape, "dtype": dtype} + if pd.api.types.is_scalar(x2) or ( + hasattr(x2, "ndim") and (x2.ndim == 0 or x2.ndim == 1) + ): + ret["name"] = x1.name + if not is_chunk: + ret["index_value"] = x1.index_value + return ret + + if isinstance(x1, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)) and isinstance( + x2, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE) + ): + index_shape, column_shape, dtypes, columns, index = ( + np.nan, + np.nan, + None, + None, + None, + ) + + if ( + x1.columns_value is not None + and x2.columns_value is not None + and x1.columns_value.key == x2.columns_value.key + ): + dtypes = pd.Series( + [ + infer_dtype(dt1, dt2, cls._operator) + for dt1, dt2 in zip(x1.dtypes, x2.dtypes) + ], + index=x1.dtypes.index, + ) + columns = copy.copy(x1.columns_value) + column_shape = len(dtypes) + elif x1.dtypes is not None and x2.dtypes is not None: + dtypes = infer_dtypes(x1.dtypes, x2.dtypes, cls._operator) + columns = parse_index(dtypes.index, store_data=True) + column_shape = len(dtypes) + if x1.index_value is not None and x2.index_value is not None: + if x1.index_value.key == x2.index_value.key: + index = copy.copy(x1.index_value) + index_shape = x1.shape[0] + else: + index = infer_index_value(x1.index_value, x2.index_value) + if index.key == x1.index_value.key == x2.index_value.key and ( + not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0]) + ): + index_shape = ( + x1.shape[0] if not np.isnan(x1.shape[0]) else x2.shape[0] + ) + + return { + "shape": (index_shape, column_shape), + "dtypes": dtypes, + "columns_value": columns, + "index_value": index, + } + + if isinstance(x1, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)) and isinstance( + x2, (SERIES_TYPE, SERIES_CHUNK_TYPE) + ): + if axis == "columns" or axis == 1: + index_shape = x1.shape[0] + index = x1.index_value + column_shape, dtypes, columns = np.nan, None, None + if x1.columns_value is not None and 
x1.index_value is not None: + if x1.columns_value.key == x2.index_value.key: + dtypes = pd.Series( + [ + infer_dtype(dt, x2.dtype, cls._operator) + for dt in x1.dtypes + ], + index=x1.dtypes.index, + ) + columns = copy.copy(x1.columns_value) + column_shape = len(dtypes) + else: # pragma: no cover + dtypes = x1.dtypes # FIXME + columns = infer_index_value(x1.columns_value, x2.index_value) + column_shape = np.nan + else: + assert axis == "index" or axis == 0 + column_shape = x1.shape[1] + columns = x1.columns_value + dtypes = x1.dtypes + index_shape, index = np.nan, None + if x1.index_value is not None and x1.index_value is not None: + if x1.index_value.key == x2.index_value.key: + dtypes = pd.Series( + [ + infer_dtype(dt, x2.dtype, cls._operator) + for dt in x1.dtypes + ], + index=x1.dtypes.index, + ) + index = copy.copy(x1.index_value) + index_shape = x1.shape[0] + else: + if x1.dtypes is not None: + dtypes = pd.Series( + [ + infer_dtype(dt, x2.dtype, cls._operator) + for dt in x1.dtypes + ], + index=x1.dtypes.index, + ) + index = infer_index_value(x1.index_value, x2.index_value) + index_shape = np.nan + return { + "shape": (index_shape, column_shape), + "dtypes": dtypes, + "columns_value": columns, + "index_value": index, + } + + if isinstance(x1, (SERIES_TYPE, SERIES_CHUNK_TYPE)) and isinstance( + x2, (SERIES_TYPE, SERIES_CHUNK_TYPE) + ): + index_shape, dtype, index = np.nan, None, None + + dtype = infer_dtype(x1.dtype, x2.dtype, cls._operator) + if x1.index_value is not None and x2.index_value is not None: + if x1.index_value.key == x2.index_value.key: + index = copy.copy(x1.index_value) + index_shape = x1.shape[0] + else: + index = infer_index_value(x1.index_value, x2.index_value) + if index.key == x1.index_value.key == x2.index_value.key and ( + not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0]) + ): + index_shape = ( + x1.shape[0] if not np.isnan(x1.shape[0]) else x2.shape[0] + ) + + ret = {"shape": (index_shape,), "dtype": dtype, "index_value": index} + if x1.name == x2.name: + ret["name"] = x1.name + return ret + + raise NotImplementedError("Unknown combination of parameters") + + def _new_chunks(self, inputs, kws=None, **kw): + property_inputs = [ + inp + for inp in inputs + if isinstance( + inp, (DATAFRAME_CHUNK_TYPE, SERIES_CHUNK_TYPE, TENSOR_CHUNK_TYPE) + ) + ] + # use first two to infer(for tree operand) + property_inputs = property_inputs[:2] + if len(property_inputs) == 1: + properties = self._calc_properties(*property_inputs) + elif any(inp.ndim == 2 for inp in property_inputs): + df1, df2 = ( + property_inputs + if isinstance(property_inputs[0], DATAFRAME_CHUNK_TYPE) + else reversed(property_inputs) + ) + properties = self._calc_properties(df1, df2, axis=self.axis) + else: + if property_inputs[0].ndim < property_inputs[1].ndim or isinstance( + property_inputs[0], (TENSOR_TYPE, TENSOR_CHUNK_TYPE) + ): + property_inputs = reversed(property_inputs) + properties = self._calc_properties(*property_inputs) + + inputs = [inp for inp in inputs if isinstance(inp, (Chunk, ChunkData))] + + shape = properties.pop("shape") + if "shape" in kw: + shape = kw.pop("shape") + + for prop, value in properties.items(): + if kw.get(prop, None) is None: + kw[prop] = value + + return super()._new_chunks(inputs, shape=shape, kws=kws, **kw) + + def _check_inputs(self, x1, x2): + if isinstance(x1, TENSOR_TYPE) or isinstance(x2, TENSOR_TYPE): + tensor, other = (x1, x2) if isinstance(x1, TENSOR_TYPE) else (x2, x1) + if isinstance(other, DATAFRAME_TYPE): + if self.axis == "index" or self.axis == 0: + 
other_shape = tuple(reversed(other.shape)) + else: + other_shape = other.shape + if tensor.ndim == 2 and tensor.shape != other_shape: + raise ValueError( + f"Unable to coerce to DataFrame, shape must be {other_shape}: " + f"given {tensor.shape}" + ) + elif tensor.ndim == 1 and tensor.shape[0] != other_shape[1]: + raise ValueError( + f"Unable to coerce to Series, length must be {other_shape[1]}: " + f"given {tensor.shape[0]}" + ) + elif tensor.ndim > 2: + raise ValueError( + "Unable to coerce to Series/DataFrame, dim must be <= 2" + ) + if isinstance(other, SERIES_TYPE): + if tensor.ndim == 1 and (tensor.shape[0] != other.shape[0]): + raise ValueError( + f"Unable to coerce to Series, length must be {other.shape[0]}: " + f"given {tensor.shape[0]}" + ) + elif tensor.ndim > 1: + raise ValueError("Unable to coerce to Series, dim must be 1") + + def _call(self, x1, x2): + self._check_inputs(x1, x2) + if isinstance(x1, DATAFRAME_TYPE) or isinstance(x2, DATAFRAME_TYPE): + df1, df2 = (x1, x2) if isinstance(x1, DATAFRAME_TYPE) else (x2, x1) + kw = self._calc_properties(df1, df2, axis=self.axis) + if not pd.api.types.is_scalar(df2): + return self.new_dataframe([x1, x2], **kw) + else: + return self.new_dataframe([df1], **kw) + if isinstance(x1, SERIES_TYPE) or isinstance(x2, SERIES_TYPE): + s1, s2 = (x1, x2) if isinstance(x1, SERIES_TYPE) else (x2, x1) + kw = self._calc_properties(s1, s2) + if not pd.api.types.is_scalar(s2): + return self.new_series([x1, x2], **kw) + else: + return self.new_series([s1], **kw) + raise NotImplementedError( + "Only support add dataframe, series or scalar for now" + ) + + def __call__(self, x1, x2): + x1 = self._process_input(x1) + x2 = self._process_input(x2) + if isinstance(x1, SERIES_TYPE) and isinstance(x2, DATAFRAME_TYPE): + # reject invoking series's op on dataframe + raise NotImplementedError + return self._call(x1, x2) + + def rcall(self, x1, x2): + x1 = self._process_input(x1) + x2 = self._process_input(x2) + if isinstance(x1, SERIES_TYPE) and isinstance(x2, DATAFRAME_TYPE): + # reject invoking series's op on dataframe + raise NotImplementedError + return self._call(x2, x1) + + +class DataFrameBinOp(DataFrameOperand, DataFrameBinOpMixin): + axis = AnyField("axis", default=None) + level = AnyField("level", default=None) + fill_value = AnyField("fill_value", default=None) + lhs = AnyField("lhs") + rhs = AnyField("rhs") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(self._inputs) == 2: + self.lhs = self._inputs[0] + self.rhs = self._inputs[1] + else: + if isinstance(self.lhs, ENTITY_TYPE): + self.lhs = self._inputs[0] + elif pd.api.types.is_scalar(self.lhs): + self.rhs = self._inputs[0] + + +class DataFrameUnaryOpMixin(DataFrameOperandMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + index_dtypes_cache = dict() + for in_chunk in in_df.chunks: + out_op = op.copy().reset_key() + if out_df.ndim == 2: + try: + dtypes = index_dtypes_cache[in_chunk.index[1]] + except KeyError: + dtypes = out_df.dtypes[in_chunk.columns_value.to_pandas()] + index_dtypes_cache[in_chunk.index[1]] = dtypes + + out_chunk = out_op.new_chunk( + [in_chunk], + shape=in_chunk.shape, + dtypes=dtypes, + index=in_chunk.index, + index_value=in_chunk.index_value, + columns_value=in_chunk.columns_value, + ) + else: + out_chunk = out_op.new_chunk( + [in_chunk], + shape=in_chunk.shape, + 
index=in_chunk.index, + dtype=in_chunk.dtype, + index_value=in_chunk.index_value, + name=in_chunk.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + kw = out_df.params + kw["nsplits"] = in_df.nsplits + kw["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def execute(cls, ctx, op): + df = ctx[op.inputs[0].key] + func_name = getattr(cls, "_func_name") + if hasattr(df, func_name): + ctx[op.outputs[0].key] = getattr(df, func_name)() + else: + ctx[op.outputs[0].key] = getattr(np, func_name)(df) + + +class DataFrameUnaryOp(DataFrameOperand, DataFrameUnaryOpMixin): + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @classmethod + def _get_output_dtype(cls, df): + if df.ndim == 2: + return df.dtypes + else: + return df.dtype + + def __call__(self, df): + self.output_types = df.op.output_types + if df.ndim == 2: + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=self._get_output_dtype(df), + columns_value=df.columns_value, + index_value=df.index_value, + ) + else: + series = df + return self.new_series( + [series], + shape=series.shape, + name=series.name, + index_value=series.index_value, + dtype=self._get_output_dtype(series), + ) + + +class DataFrameArithmeticTreeMixin: + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[c.key] for c in op.inputs] + ctx[op.outputs[0].key] = reduce(op._operator, inputs) + + def _set_inputs(self, inputs): + inputs = self._get_inputs_data(inputs) + setattr(self, "_inputs", inputs) + + +class DataFrameUnaryUfunc(DataFrameUnaryOp, TensorUfuncMixin): + pass + + +class DataFrameBinopUfunc(DataFrameBinOp, TensorUfuncMixin): + pass diff --git a/python/xorbits/_mars/dataframe/arithmetic/cos.py b/python/xorbits/_mars/dataframe/arithmetic/cos.py new file mode 100644 index 000000000..0e66d4532 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/cos.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameCos(DataFrameUnaryUfunc): + _op_type_ = OperandDef.COS + _func_name = "cos" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorCos + + return TensorCos diff --git a/python/xorbits/_mars/dataframe/arithmetic/cosh.py b/python/xorbits/_mars/dataframe/arithmetic/cosh.py new file mode 100644 index 000000000..0df784e29 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/cosh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
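
Editor's note: `DataFrameUnaryOpMixin.tile` above is the embarrassingly parallel case — the operand is copied onto every input chunk and the chunk layout (`nsplits`) is passed through untouched. An eager sketch with plain pandas partitions standing in for chunks:

import numpy as np
import pandas as pd

chunks = [pd.Series([0.0, 0.5]), pd.Series([1.0, 1.5])]   # two row chunks
out_chunks = [np.cos(c) for c in chunks]                  # one output chunk per input chunk
print(pd.concat(out_chunks, ignore_index=True))
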
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameCosh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.COSH + _func_name = "cosh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorCosh + + return TensorCosh diff --git a/python/xorbits/_mars/dataframe/arithmetic/degrees.py b/python/xorbits/_mars/dataframe/arithmetic/degrees.py new file mode 100644 index 000000000..0ce1c5bf2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/degrees.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameDegrees(DataFrameUnaryUfunc): + _op_type_ = OperandDef.DEGREES + _func_name = "degrees" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorDegrees + + return TensorDegrees diff --git a/python/xorbits/_mars/dataframe/arithmetic/docstring.py b/python/xorbits/_mars/dataframe/arithmetic/docstring.py new file mode 100644 index 000000000..43434a3ce --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/docstring.py @@ -0,0 +1,442 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_flex_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). +Equivalent to ``{equiv}``, but with support to substitute a fill_value +for missing data in one of the inputs. With reverse version, `{reverse}`. +Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to +arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}} + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). For Series input, axis to match Series index on. 
+level : int or label + Broadcast across a level, matching Index values on the + passed MultiIndex level. +fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing. + +Returns +------- +DataFrame + Result of the arithmetic operation. + +See Also +-------- +DataFrame.add : Add DataFrames. +DataFrame.sub : Subtract DataFrames. +DataFrame.mul : Multiply DataFrames. +DataFrame.div : Divide DataFrames (float division). +DataFrame.truediv : Divide DataFrames (float division). +DataFrame.floordiv : Divide DataFrames (integer division). +DataFrame.mod : Calculate modulo (remainder after division). +DataFrame.pow : Calculate exponential power. + +Notes +----- +Mismatched indices will be unioned together. + +Examples +-------- +>>> import mars.dataframe as md +>>> df = md.DataFrame({{'angles': [0, 3, 4], +... 'degrees': [360, 180, 360]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> df.execute() + angles degrees +circle 0 360 +triangle 3 180 +rectangle 4 360 + +Add a scalar with operator version which return the same +results. + +>>> (df + 1).execute() + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +>>> df.add(1).execute() + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +Divide by constant with reverse version. + +>>> df.div(10).execute() + angles degrees +circle 0.0 36.0 +triangle 0.3 18.0 +rectangle 0.4 36.0 + +>>> df.rdiv(10).execute() + angles degrees +circle inf 0.027778 +triangle 3.333333 0.055556 +rectangle 2.500000 0.027778 + +Subtract a list and Series by axis with operator version. + +>>> (df - [1, 2]).execute() + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub([1, 2], axis='columns').execute() + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub(md.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), +... axis='index').execute() + angles degrees +circle -1 359 +triangle 2 179 +rectangle 3 359 + +Multiply a DataFrame of different shape with operator version. + +>>> other = md.DataFrame({{'angles': [0, 3, 4]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> other.execute() + angles +circle 0 +triangle 3 +rectangle 4 + +>>> (df * other).execute() + angles degrees +circle 0 NaN +triangle 9 NaN +rectangle 16 NaN + +>>> df.mul(other, fill_value=0).execute() + angles degrees +circle 0 0.0 +triangle 9 0.0 +rectangle 16 0.0 + +Divide by a MultiIndex by level. + +>>> df_multindex = md.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], +... 'degrees': [360, 180, 360, 360, 540, 720]}}, +... index=[['A', 'A', 'A', 'B', 'B', 'B'], +... ['circle', 'triangle', 'rectangle', +... 'square', 'pentagon', 'hexagon']]) +>>> df_multindex.execute() + angles degrees +A circle 0 360 + triangle 3 180 + rectangle 4 360 +B square 4 360 + pentagon 5 540 + hexagon 6 720 + +>>> df.div(df_multindex, level=1, fill_value=0).execute() + angles degrees +A circle NaN 1.0 + triangle 1.0 1.0 + rectangle 1.0 1.0 +B square 0.0 0.0 + pentagon 0.0 0.0 + hexagon 0.0 0.0 +""" + +_flex_doc_SERIES = """ +Return {desc} of series and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``series {equiv} other``, but with support to substitute a fill_value for +missing data in one of the inputs. 
+ +Parameters +---------- +other : Series or scalar value +fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result will be missing. +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + +Returns +------- +Series + The result of the operation. + +See Also +-------- +Series.{reverse} + +Examples +-------- +>>> import numpy as np +>>> import mars.dataframe as md +>>> a = md.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a.execute() +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 + +>>> b = md.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b.execute() +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +""" + +_flex_comp_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). +Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison +operators. + +Equivalent to `dataframe {equiv} other` with support to choose axis (rows or columns) +and level for comparison. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). +level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + +Returns +------- +DataFrame of bool + Result of the comparison. + +See Also +-------- +DataFrame.eq : Compare DataFrames for equality elementwise. +DataFrame.ne : Compare DataFrames for inequality elementwise. +DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. +DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. +DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. +DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + +Notes +----- +Mismatched indices will be unioned together. +`NaN` values are considered different (i.e. `NaN` != `NaN`). + +Examples +-------- +>>> df = pd.DataFrame({{'cost': [250, 150, 100], +... 'revenue': [100, 250, 300]}}, +... 
index=['A', 'B', 'C']) +>>> df.execute() + cost revenue +A 250 100 +B 150 250 +C 100 300 + +Comparison with a scalar, using either the operator or method: + +>>> (df == 100).execute() + cost revenue +A False True +B False False +C True False + +>>> df.eq(100).execute() + cost revenue +A False True +B False False +C True False + +When `other` is a :class:`Series`, the columns of a DataFrame are aligned +with the index of `other` and broadcast: + +>>> (df != pd.Series([100, 250], index=["cost", "revenue"])).execute() + cost revenue +A True True +B True False +C False True + +Use the method to control the broadcast axis: + +>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index').execute() + cost revenue +A True False +B True True +C True True +D True True + +When comparing to an arbitrary sequence, the number of columns must +match the number elements in `other`: + +>>> (df == [250, 100]).execute() + cost revenue +A True True +B False False +C False False + +Use the method to control the axis: + +>>> df.eq([250, 250, 100], axis='index').execute() + cost revenue +A True False +B False True +C True False + +Compare to a DataFrame of different shape. + +>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, +... index=['A', 'B', 'C', 'D']) +>>> other.execute() + revenue +A 300 +B 250 +C 100 +D 150 + +>>> df.gt(other).execute() + cost revenue +A False False +B False False +C False True +D False False + +Compare to a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], +... 'revenue': [100, 250, 300, 200, 175, 225]}}, +... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], +... ['A', 'B', 'C', 'A', 'B', 'C']]) +>>> df_multindex.execute() + cost revenue +Q1 A 250 100 + B 150 250 + C 100 300 +Q2 A 150 200 + B 300 175 + C 220 225 + +>>> df.le(df_multindex, level=1).execute() + cost revenue +Q1 A True True + B True True + C True True +Q2 A False True + B True False + C True False +""" + + +_flex_comp_doc_SERIES = """ +Return {desc} of series and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``series {equiv} other``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other : Series or scalar value +fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result will be missing. +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + +Returns +------- +Series + The result of the operation. 
+ +Examples +-------- +>>> import numpy as np +>>> import mars.dataframe as md +>>> a = md.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a.execute() +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 + +>>> b = md.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b.execute() +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +""" + + +def bin_arithmetic_doc( + desc, op_name=None, equiv=None, reverse=None, series_example=None +): + def wrapper(fun): + nonlocal op_name, reverse + op_name = op_name or fun.__name__ + if reverse is None: + reverse = op_name[1:] if op_name.startswith("r") else "r" + op_name + fun.__frame_doc__ = _flex_doc_FRAME.format( + desc=desc, op_name=op_name, equiv=equiv, reverse=reverse + ) + fun.__series_doc__ = _flex_doc_SERIES.format( + desc=desc, op_name=op_name, equiv=equiv, reverse=reverse + ) + if series_example is not None: # pragma: no branch + fun.__series_doc__ += "\n" + series_example.strip() + return fun + + return wrapper + + +def bin_compare_doc(desc, op_name=None, equiv=None, series_example=None): + def wrapper(fun): + nonlocal op_name + op_name = op_name or fun.__name__ + fun.__frame_doc__ = _flex_comp_doc_FRAME.format( + desc=desc, op_name=op_name, equiv=equiv + ) + fun.__series_doc__ = _flex_comp_doc_SERIES.format( + desc=desc, op_name=op_name, equiv=equiv + ) + if series_example is not None: # pragma: no branch + fun.__series_doc__ += "\n" + series_example.strip() + return fun + + return wrapper diff --git a/python/xorbits/_mars/dataframe/arithmetic/dot.py b/python/xorbits/_mars/dataframe/arithmetic/dot.py new file mode 100644 index 000000000..57473536a --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/dot.py @@ -0,0 +1,306 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, KeyField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE +from ...tensor.utils import decide_unify_split, validate_axis +from ..core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameDot(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DOT + + lhs = KeyField("lhs") + rhs = AnyField("rhs") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.lhs = self._inputs[0] + self.rhs = self._inputs[1] + + def __call__(self, lhs, rhs): + lhs = self._process_input(lhs) + rhs = self._process_input(rhs) + if not isinstance(rhs, (DATAFRAME_TYPE, SERIES_TYPE)): + rhs = astensor(rhs) + test_rhs = rhs + else: + test_rhs = rhs.to_tensor() + + test_ret = lhs.to_tensor().dot(test_rhs) + if test_ret.ndim == 0: + if isinstance(lhs, SERIES_TYPE) and isinstance(rhs, TENSOR_TYPE): + # return tensor + return test_ret + return self.new_scalar([lhs, rhs], dtype=test_ret.dtype) + elif test_ret.ndim == 1: + if lhs.ndim == 1: + if hasattr(rhs, "columns_value"): + index_value = rhs.columns_value + else: + # tensor + length = -1 if np.isnan(rhs.shape[1]) else rhs.shape[1] + pd_index = pd.RangeIndex(length) + index_value = parse_index(pd_index, store_data=True) + else: + assert rhs.ndim == 1 + index_value = lhs.index_value + return self.new_series( + [lhs, rhs], + shape=test_ret.shape, + dtype=test_ret.dtype, + index_value=index_value, + ) + else: + if isinstance(rhs, TENSOR_TYPE): + dtypes = pd.Series( + np.repeat(test_ret.dtype, test_ret.shape[1]), + index=pd.RangeIndex(test_ret.shape[1]), + ) + columns_value = parse_index(dtypes.index, store_data=True) + else: + dtypes = pd.Series( + np.repeat(test_ret.dtype, test_ret.shape[1]), + index=rhs.columns_value.to_pandas(), + ) + columns_value = rhs.columns_value + return self.new_dataframe( + [lhs, rhs], + shape=test_ret.shape, + index_value=lhs.index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + + @classmethod + def _align(cls, lhs, rhs): + if isinstance(rhs, TENSOR_TYPE): + # no need to align when rhs is a tensor + return lhs, rhs + + is_lhs_range_index = False + if isinstance(lhs, DATAFRAME_TYPE) and isinstance( + lhs.columns_value.value, IndexValue.RangeIndex + ): + is_lhs_range_index = True + if isinstance(lhs, SERIES_TYPE) and isinstance( + lhs.index_value.value, IndexValue.RangeIndex + ): + is_lhs_range_index = True + + is_rhs_range_index = False + if isinstance(rhs.index_value.value, IndexValue.RangeIndex): + is_rhs_range_index = True + + if not is_lhs_range_index or not is_rhs_range_index: + # TODO: e.g. 
use rhs.loc[lhs.columns_value.to_pandas()]
+        # when lhs is a DataFrame and lhs.columns is not a RangeIndex,
+        # and likewise when lhs is a Series
+        raise NotImplementedError
+
+        return lhs, rhs
+
+    @classmethod
+    def tile(cls, op):
+        from ..datasource.from_tensor import dataframe_from_tensor, series_from_tensor
+
+        lhs, rhs = op.lhs, op.rhs
+        lhs, rhs = cls._align(lhs, rhs)
+        out = op.outputs[0]
+
+        # wait until chunk sizes along the contracted axes are known
+        if any(np.isnan(ns) for ns in lhs.nsplits[-1]):
+            yield
+        if any(np.isnan(ns) for ns in rhs.nsplits[0]):
+            yield
+
+        nsplit = decide_unify_split(lhs.nsplits[-1], rhs.nsplits[0])
+        lhs_axis = validate_axis(lhs.ndim, -1)
+        lhs = yield from recursive_tile(lhs.rechunk({lhs_axis: nsplit}))
+        rhs = yield from recursive_tile(rhs.rechunk({0: nsplit}))
+
+        # delegate computation to tensor
+        lhs_tensor = lhs if isinstance(lhs, TENSOR_TYPE) else lhs.to_tensor()
+        rhs_tensor = rhs if isinstance(rhs, TENSOR_TYPE) else rhs.to_tensor()
+        ret = lhs_tensor.dot(rhs_tensor)
+
+        if isinstance(out, TENSOR_TYPE):
+            pass
+        elif ret.ndim == 1:
+            index = None
+            if isinstance(lhs, DATAFRAME_TYPE):
+                index = lhs.index
+            elif isinstance(rhs, DATAFRAME_TYPE):
+                index = rhs.dtypes.index
+            ret = series_from_tensor(ret, index=index)
+        elif ret.ndim == 2:
+            index = lhs.index
+            columns = None
+            if isinstance(rhs, DATAFRAME_TYPE):
+                columns = rhs.dtypes.index
+            ret = dataframe_from_tensor(ret, index=index, columns=columns)
+
+        tiled = yield from recursive_tile(ret)
+        return [tiled]
+
+
+def dot(df_or_series, other):
+    op = DataFrameDot(lhs=df_or_series, rhs=other)
+    return op(df_or_series, other)
+
+
+def rdot(df_or_series, other):
+    op = DataFrameDot(lhs=other, rhs=df_or_series)
+    return op(other, df_or_series)
+
+
+dot.__frame_doc__ = """
+Compute the matrix multiplication between the DataFrame and other.
+
+This method computes the matrix product between the DataFrame and the
+values of another Series, DataFrame or a numpy array.
+
+It can also be called using ``self @ other`` in Python >= 3.5.
+
+Parameters
+----------
+other : Series, DataFrame or array-like
+    The other object to compute the matrix product with.
+
+Returns
+-------
+Series or DataFrame
+    If other is a Series, return the matrix product between self and
+    other as a Series. If other is a DataFrame or a numpy.array, return
+    the matrix product of self and other in a DataFrame or a np.array.
+
+See Also
+--------
+Series.dot: Similar method for Series.
+
+Notes
+-----
+The dimensions of DataFrame and other must be compatible in order to
+compute the matrix multiplication. In addition, the column names of
+DataFrame and the index of other must contain the same values, as they
+will be aligned prior to the multiplication.
+
+The dot method for Series computes the inner product, instead of the
+matrix product here.
+
+Examples
+--------
+Here we multiply a DataFrame with a Series.
+
+>>> import mars.tensor as mt
+>>> import mars.dataframe as md
+>>> df = md.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+>>> s = md.Series([1, 1, 2, 1])
+>>> df.dot(s).execute()
+0   -4
+1    5
+dtype: int64
+
+Here we multiply a DataFrame with another DataFrame.
+
+>>> other = md.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+>>> df.dot(other).execute()
+   0  1
+0  1  4
+1  2  2
+
+Note that the dot method gives the same result as @
+
+>>> (df @ other).execute()
+   0  1
+0  1  4
+1  2  2
+
+The dot method also works if other is an np.array.
+
+>>> arr = mt.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+>>> df.dot(arr).execute()
+   0  1
+0  1  4
+1  2  2
+
+Note how shuffling of the objects does not change the result.
+
+>>> s2 = s.reindex([1, 0, 2, 3])
+>>> df.dot(s2).execute()
+0   -4
+1    5
+dtype: int64
+"""
+dot.__series_doc__ = """
+Compute the dot product between the Series and the columns of other.
+
+This method computes the dot product between the Series and another
+one, or the Series and each column of a DataFrame, or the Series and
+each column of an array.
+
+It can also be called using `self @ other` in Python >= 3.5.
+
+Parameters
+----------
+other : Series, DataFrame or array-like
+    The other object to compute the dot product with its columns.
+
+Returns
+-------
+scalar, Series or numpy.ndarray
+    Return the dot product of the Series and other if other is a
+    Series, the Series of the dot product of the Series and each
+    column of other if other is a DataFrame, or the numpy.ndarray of
+    the dot product between the Series and each column of the numpy
+    array.
+
+See Also
+--------
+DataFrame.dot: Compute the matrix product with the DataFrame.
+Series.mul: Multiplication of series and other, element-wise.
+
+Notes
+-----
+The Series and other have to share the same index if other is a Series
+or a DataFrame.
+
+Examples
+--------
+>>> import mars.tensor as mt
+>>> import mars.dataframe as md
+>>> s = md.Series([0, 1, 2, 3])
+>>> other = md.Series([-1, 2, -3, 4])
+>>> s.dot(other).execute()
+8
+>>> (s @ other).execute()
+8
+>>> df = md.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
+>>> s.dot(df).execute()
+0    24
+1    14
+dtype: int64
+>>> arr = mt.array([[0, 1], [-2, 3], [4, -5], [6, 7]])
+>>> s.dot(arr).execute()
+array([24, 14])
+"""
diff --git a/python/xorbits/_mars/dataframe/arithmetic/equal.py b/python/xorbits/_mars/dataframe/arithmetic/equal.py
new file mode 100644
index 000000000..2e8237230
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/arithmetic/equal.py
@@ -0,0 +1,56 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ...utils import classproperty
+from .core import DataFrameBinopUfunc
+from .docstring import bin_compare_doc
+
+
+class DataFrameEqual(DataFrameBinopUfunc):
+    _op_type_ = OperandDef.EQ
+
+    _func_name = "eq"
+    _rfunc_name = "eq"
+
+    return_dtype = np.dtype(bool)
+
+    @classproperty
+    def _operator(self):
+        return lambda lhs, rhs: lhs.eq(rhs)
+
+    @classproperty
+    def tensor_op_type(self):
+        from ...tensor.arithmetic import TensorEqual
+
+        return TensorEqual
+
+
+_eq_example = """
+>>> a.eq(b, fill_value=0).execute()
+a     True
+b    False
+c    False
+d    False
+e    False
+dtype: bool
+"""
+
+
+@bin_compare_doc("Equal to", equiv="==", series_example=_eq_example)
+def eq(df, other, axis="columns", level=None):
+    op = DataFrameEqual(axis=axis, level=level, lhs=df, rhs=other)
+    return op(df, other)
diff --git a/python/xorbits/_mars/dataframe/arithmetic/exp.py b/python/xorbits/_mars/dataframe/arithmetic/exp.py
new file mode 100644
index 000000000..2cbe3d544
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/arithmetic/exp.py
@@ -0,0 +1,28 @@
+# Copyright 2022-2023 XProbe Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameExp(DataFrameUnaryUfunc): + _op_type_ = OperandDef.EXP + _func_name = "exp" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorExp + + return TensorExp diff --git a/python/xorbits/_mars/dataframe/arithmetic/exp2.py b/python/xorbits/_mars/dataframe/arithmetic/exp2.py new file mode 100644 index 000000000..e83e0302a --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/exp2.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameExp2(DataFrameUnaryUfunc): + _op_type_ = OperandDef.EXP2 + _func_name = "exp2" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorExp2 + + return TensorExp2 diff --git a/python/xorbits/_mars/dataframe/arithmetic/expm1.py b/python/xorbits/_mars/dataframe/arithmetic/expm1.py new file mode 100644 index 000000000..e64b24b6a --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/expm1.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameExpm1(DataFrameUnaryUfunc): + _op_type_ = OperandDef.EXPM1 + _func_name = "expm1" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorExpm1 + + return TensorExpm1 diff --git a/python/xorbits/_mars/dataframe/arithmetic/floor.py b/python/xorbits/_mars/dataframe/arithmetic/floor.py new file mode 100644 index 000000000..719c1c4cd --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/floor.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameFloor(DataFrameUnaryUfunc): + _op_type_ = OperandDef.FLOOR + _func_name = "floor" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorFloor + + return TensorFloor diff --git a/python/xorbits/_mars/dataframe/arithmetic/floordiv.py b/python/xorbits/_mars/dataframe/arithmetic/floordiv.py new file mode 100644 index 000000000..15eb654ea --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/floordiv.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameFloorDiv(DataFrameBinopUfunc): + _op_type_ = OperandDef.FLOORDIV + + _func_name = "floordiv" + _rfunc_name = "rfloordiv" + + @classproperty + def _operator(self): + return operator.floordiv + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorFloorDiv + + return TensorFloorDiv + + +_floordiv_example = """ +>>> a.floordiv(b, fill_value=0).execute() +a 1.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Integer division", equiv="//", series_example=_floordiv_example) +def floordiv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameFloorDiv( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc("Integer division", equiv="//", series_example=_floordiv_example) +def rfloordiv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameFloorDiv( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/greater.py b/python/xorbits/_mars/dataframe/arithmetic/greater.py new file mode 100644 index 000000000..5f75f1de9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/greater.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameGreater(DataFrameBinopUfunc): + _op_type_ = OperandDef.GT + + _func_name = "gt" + _rfunc_name = "lt" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.gt(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorGreaterThan + + return TensorGreaterThan + + +_gt_example = """ +>>> a.gt(b, fill_value=0).execute() +a True +b False +c False +d False +e True +f False +dtype: bool +""" + + +@bin_compare_doc("Greater than", equiv=">", series_example=_gt_example) +def gt(df, other, axis="columns", level=None): + op = DataFrameGreater(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/greater_equal.py b/python/xorbits/_mars/dataframe/arithmetic/greater_equal.py new file mode 100644 index 000000000..7c0d30b05 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/greater_equal.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameGreaterEqual(DataFrameBinopUfunc): + _op_type_ = OperandDef.GE + + _func_name = "ge" + _rfunc_name = "le" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.ge(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorGreaterEqual + + return TensorGreaterEqual + + +_ge_example = """ +>>> a.ge(b, fill_value=0).execute() +a True +b True +c False +d False +e True +f False +dtype: bool +""" + + +@bin_compare_doc("Greater than or equal to", equiv=">=", series_example=_ge_example) +def ge(df, other, axis="columns", level=None): + op = DataFrameGreaterEqual(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/invert.py b/python/xorbits/_mars/dataframe/arithmetic/invert.py new file mode 100644 index 000000000..202f3e525 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/invert.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameNot(DataFrameUnaryUfunc): + _op_type_ = OperandDef.INVERT + _func_name = "__invert__" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorNot + + return TensorNot + + +def invert(df): + op = DataFrameNot() + return op(df) diff --git a/python/xorbits/_mars/dataframe/arithmetic/is_ufuncs.py b/python/xorbits/_mars/dataframe/arithmetic/is_ufuncs.py new file mode 100644 index 000000000..757b1f31e --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/is_ufuncs.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameIsUFuncMixin: + @classmethod + def _get_output_dtype(cls, df): + if df.ndim == 2: + return pd.Series(np.dtype(bool), index=df.dtypes.index) + else: + return np.dtype(bool) + + +class DataFrameIsNan(DataFrameIsUFuncMixin, DataFrameUnaryUfunc): + _op_type_ = OperandDef.ISNAN + _func_name = "isnan" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorIsNan + + return TensorIsNan + + +class DataFrameIsInf(DataFrameIsUFuncMixin, DataFrameUnaryUfunc): + _op_type_ = OperandDef.ISINF + _func_name = "isinf" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorIsInf + + return TensorIsInf + + +class DataFrameIsFinite(DataFrameIsUFuncMixin, DataFrameUnaryUfunc): + _op_type_ = OperandDef.ISFINITE + _func_name = "isfinite" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorIsFinite + + return TensorIsFinite diff --git a/python/xorbits/_mars/dataframe/arithmetic/less.py b/python/xorbits/_mars/dataframe/arithmetic/less.py new file mode 100644 index 000000000..88e5b2d02 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/less.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameLess(DataFrameBinopUfunc): + _op_type_ = OperandDef.LT + + _func_name = "lt" + _rfunc_name = "gt" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.lt(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLessThan + + return TensorLessThan + + +_lt_example = """ +>>> a.lt(b, fill_value=0).execute() +a False +b False +c True +d False +e False +f True +dtype: bool +""" + + +@bin_compare_doc("Less than", equiv="<", series_example=_lt_example) +def lt(df, other, axis="columns", level=None): + op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/less_equal.py b/python/xorbits/_mars/dataframe/arithmetic/less_equal.py new file mode 100644 index 000000000..78db91f6c --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/less_equal.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameLessEqual(DataFrameBinopUfunc): + _op_type_ = OperandDef.LE + + _func_name = "le" + _rfunc_name = "ge" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.le(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLessEqual + + return TensorLessEqual + + +_le_example = """ +>>> a.le(b, fill_value=0).execute() +a False +b True +c True +d False +e False +f True +dtype: bool +""" + + +@bin_compare_doc("Less than or equal to", equiv="<=", series_example=_le_example) +def le(df, other, axis="columns", level=None): + op = DataFrameLessEqual(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/log.py b/python/xorbits/_mars/dataframe/arithmetic/log.py new file mode 100644 index 000000000..df3d96a08 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/log.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameLog(DataFrameUnaryUfunc): + _op_type_ = OperandDef.LOG + _func_name = "log" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLog + + return TensorLog diff --git a/python/xorbits/_mars/dataframe/arithmetic/log10.py b/python/xorbits/_mars/dataframe/arithmetic/log10.py new file mode 100644 index 000000000..4f2d49423 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/log10.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameLog10(DataFrameUnaryUfunc): + _op_type_ = OperandDef.LOG10 + _func_name = "log10" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLog10 + + return TensorLog10 diff --git a/python/xorbits/_mars/dataframe/arithmetic/log2.py b/python/xorbits/_mars/dataframe/arithmetic/log2.py new file mode 100644 index 000000000..6fa3a42de --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/log2.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameLog2(DataFrameUnaryUfunc): + _op_type_ = OperandDef.LOG2 + _func_name = "log2" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorLog2 + + return TensorLog2 diff --git a/python/xorbits/_mars/dataframe/arithmetic/mod.py b/python/xorbits/_mars/dataframe/arithmetic/mod.py new file mode 100644 index 000000000..6cfc12592 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/mod.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameMod(DataFrameBinopUfunc): + _op_type_ = OperandDef.MOD + + _func_name = "mod" + _rfunc_name = "rmod" + + @classproperty + def _operator(self): + return operator.mod + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorMod + + return TensorMod + + +_mod_example = """ +>>> a.mod(b, fill_value=0).execute() +a 0.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Modulo", equiv="%", series_example=_mod_example) +def mod(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMod(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +@bin_arithmetic_doc("Modulo", equiv="%", series_example=_mod_example) +def rmod(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMod(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/multiply.py b/python/xorbits/_mars/dataframe/arithmetic/multiply.py new file mode 100644 index 000000000..b6ba3492f --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/multiply.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameMul(DataFrameBinopUfunc): + _op_type_ = OperandDef.MUL + + _func_name = "mul" + _rfunc_name = "rmul" + + @classproperty + def _operator(self): + return operator.mul + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorMultiply + + return TensorMultiply + + +_mul_example = """ +>>> a.multiply(b, fill_value=0).execute() +a 1.0 +b 0.0 +c 0.0 +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Multiplication", equiv="*", series_example=_mul_example) +def mul(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMul(axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other) + return op(df, other) + + +@bin_arithmetic_doc("Multiplication", equiv="*", series_example=_mul_example) +def rmul(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameMul(axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/negative.py b/python/xorbits/_mars/dataframe/arithmetic/negative.py new file mode 100644 index 000000000..c5312250b --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/negative.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameNegative(DataFrameUnaryUfunc): + _op_type_ = OperandDef.NEGATIVE + _func_name = "negative" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorNegative + + return TensorNegative + + +def negative(df): + op = DataFrameNegative() + return op(df) diff --git a/python/xorbits/_mars/dataframe/arithmetic/not_equal.py b/python/xorbits/_mars/dataframe/arithmetic/not_equal.py new file mode 100644 index 000000000..10571b55e --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/not_equal.py @@ -0,0 +1,56 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_compare_doc + + +class DataFrameNotEqual(DataFrameBinopUfunc): + _op_type_ = OperandDef.NE + + _func_name = "ne" + _rfunc_name = "ne" + + return_dtype = np.dtype(bool) + + @classproperty + def _operator(self): + return lambda lhs, rhs: lhs.ne(rhs) + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorNotEqual + + return TensorNotEqual + + +_ne_example = """ +>>> a.ne(b, fill_value=0).execute() +a False +b True +c True +d True +e True +dtype: bool +""" + + +@bin_compare_doc("Not equal to", equiv="!=", series_example=_ne_example) +def ne(df, other, axis="columns", level=None): + op = DataFrameNotEqual(axis=axis, level=level, lhs=df, rhs=other) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/power.py b/python/xorbits/_mars/dataframe/arithmetic/power.py new file mode 100644 index 000000000..acb9726c2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/power.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import operator + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFramePower(DataFrameBinopUfunc): + _op_type_ = OperandDef.POW + + _func_name = "pow" + _rfunc_name = "rpow" + + @classproperty + def _operator(self): + return operator.pow + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorPower + + return TensorPower + + +_pow_example = """ +>>> a.pow(b, fill_value=0).execute() +a 1.0 +b 1.0 +c 1.0 +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc( + "Exponential power", op_name="pow", equiv="**", series_example=_pow_example +) +def power(df, other, axis="columns", level=None, fill_value=None): + op = DataFramePower( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc( + "Exponential power", op_name="rpow", equiv="**", series_example=_pow_example +) +def rpower(df, other, axis="columns", level=None, fill_value=None): + op = DataFramePower( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/radians.py b/python/xorbits/_mars/dataframe/arithmetic/radians.py new file mode 100644 index 000000000..ac870a95b --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/radians.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameRadians(DataFrameUnaryUfunc): + _op_type_ = OperandDef.RADIANS + _func_name = "radians" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorRadians + + return TensorRadians diff --git a/python/xorbits/_mars/dataframe/arithmetic/sin.py b/python/xorbits/_mars/dataframe/arithmetic/sin.py new file mode 100644 index 000000000..d969f6a7c --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/sin.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameSin(DataFrameUnaryUfunc): + _op_type_ = OperandDef.SIN + _func_name = "sin" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSin + + return TensorSin diff --git a/python/xorbits/_mars/dataframe/arithmetic/sinh.py b/python/xorbits/_mars/dataframe/arithmetic/sinh.py new file mode 100644 index 000000000..3e1f98c2c --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/sinh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameSinh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.SINH + _func_name = "sinh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSinh + + return TensorSinh diff --git a/python/xorbits/_mars/dataframe/arithmetic/sqrt.py b/python/xorbits/_mars/dataframe/arithmetic/sqrt.py new file mode 100644 index 000000000..8bc063cb1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/sqrt.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameSqrt(DataFrameUnaryUfunc): + _op_type_ = OperandDef.SQRT + _func_name = "sqrt" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSqrt + + return TensorSqrt diff --git a/python/xorbits/_mars/dataframe/arithmetic/subtract.py b/python/xorbits/_mars/dataframe/arithmetic/subtract.py new file mode 100644 index 000000000..8765908a7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/subtract.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameSubtract(DataFrameBinopUfunc): + _op_type_ = OperandDef.SUB + + _func_name = "sub" + _rfunc_name = "rsub" + + @classproperty + def _operator(self): + return operator.sub + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorSubtract + + return TensorSubtract + + +_sub_example = """ +>>> a.subtract(b, fill_value=0).execute() +a 0.0 +b 1.0 +c 1.0 +d -1.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Subtraction", equiv="-", series_example=_sub_example) +def subtract(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameSubtract( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc("Subtraction", equiv="-", series_example=_sub_example) +def rsubtract(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameSubtract( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arithmetic/tan.py b/python/xorbits/_mars/dataframe/arithmetic/tan.py new file mode 100644 index 000000000..f737c0731 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tan.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameTan(DataFrameUnaryUfunc): + _op_type_ = OperandDef.TAN + _func_name = "tan" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorTan + + return TensorTan diff --git a/python/xorbits/_mars/dataframe/arithmetic/tanh.py b/python/xorbits/_mars/dataframe/arithmetic/tanh.py new file mode 100644 index 000000000..990801507 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tanh.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameUnaryUfunc + + +class DataFrameTanh(DataFrameUnaryUfunc): + _op_type_ = OperandDef.TANH + _func_name = "tanh" + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorTanh + + return TensorTanh diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/__init__.py b/python/xorbits/_mars/dataframe/arithmetic/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic.py new file mode 100644 index 000000000..ed6f681c1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic.py @@ -0,0 +1,1551 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import itertools +import operator +from dataclasses import dataclass +from typing import Callable + +import numpy as np +import pandas as pd +import pytest + +from .... import tensor as mt +from ....core import OperandType, OutputType, tile +from ....core.operand import OperandStage +from ....utils import dataslots +from ...align import DataFrameIndexAlign, DataFrameShuffleProxy +from ...core import IndexValue +from ...datasource.dataframe import DataFrameDataSource, from_pandas +from ...datasource.from_tensor import dataframe_from_tensor +from ...datasource.series import SeriesDataSource +from ...datasource.series import from_pandas as from_pandas_series +from ...utils import ( + build_split_idx_to_origin_idx, + filter_index_value, + hash_dtypes, + split_monotonic_index_min_max, +) +from .. 
import ( + DataFrameAbs, + DataFrameAdd, + DataFrameAnd, + DataFrameEqual, + DataFrameFloorDiv, + DataFrameGreater, + DataFrameGreaterEqual, + DataFrameLess, + DataFrameLessEqual, + DataFrameMod, + DataFrameMul, + DataFrameNot, + DataFrameNotEqual, + DataFrameOr, + DataFramePower, + DataFrameSubtract, + DataFrameTrueDiv, + DataFrameXor, +) + + +def comp_func(name, reverse_name): + def inner(lhs, rhs): + try: + return getattr(lhs, name)(rhs) + except AttributeError: + return getattr(rhs, reverse_name)(lhs) + + return inner + + +@dataslots +@dataclass +class FunctionOptions: + func: Callable + op: OperandType + func_name: str + rfunc_name: str + + +binary_functions = dict( + add=FunctionOptions( + func=operator.add, op=DataFrameAdd, func_name="add", rfunc_name="radd" + ), + subtract=FunctionOptions( + func=operator.sub, op=DataFrameSubtract, func_name="sub", rfunc_name="rsub" + ), + multiply=FunctionOptions( + func=operator.mul, op=DataFrameMul, func_name="mul", rfunc_name="rmul" + ), + floordiv=FunctionOptions( + func=operator.floordiv, + op=DataFrameFloorDiv, + func_name="floordiv", + rfunc_name="rfloordiv", + ), + truediv=FunctionOptions( + func=operator.truediv, + op=DataFrameTrueDiv, + func_name="truediv", + rfunc_name="rtruediv", + ), + mod=FunctionOptions( + func=operator.mod, op=DataFrameMod, func_name="mod", rfunc_name="rmod" + ), + power=FunctionOptions( + func=operator.pow, op=DataFramePower, func_name="pow", rfunc_name="rpow" + ), + equal=FunctionOptions( + func=comp_func("eq", "eq"), op=DataFrameEqual, func_name="eq", rfunc_name="eq" + ), + not_equal=FunctionOptions( + func=comp_func("ne", "ne"), + op=DataFrameNotEqual, + func_name="ne", + rfunc_name="ne", + ), + greater=FunctionOptions( + func=comp_func("gt", "lt"), op=DataFrameGreater, func_name="gt", rfunc_name="lt" + ), + less=FunctionOptions( + func=comp_func("lt", "gt"), op=DataFrameLess, func_name="lt", rfunc_name="gt" + ), + greater_equal=FunctionOptions( + func=comp_func("ge", "le"), + op=DataFrameGreaterEqual, + func_name="ge", + rfunc_name="le", + ), + less_equal=FunctionOptions( + func=comp_func("le", "ge"), + op=DataFrameLessEqual, + func_name="le", + rfunc_name="ge", + ), + logical_and=FunctionOptions( + func=operator.and_, op=DataFrameAnd, func_name="__and__", rfunc_name="and" + ), + logical_or=FunctionOptions( + func=operator.or_, op=DataFrameOr, func_name="__or__", rfunc_name="__ror__" + ), + logical_xor=FunctionOptions( + func=operator.xor, op=DataFrameXor, func_name="__xor__", rfunc_name="__rxor__" + ), +) + + +def to_boolean_if_needed(func_name, value, split_value=0.5): + if func_name in ["__and__", "__or__", "__xor__"]: + return value > split_value + else: + return value + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle(func_name, func_opts): + # all the axes are monotonic + # data1 with index split into [0...4], [5...9], + # columns [3...7], [8...12] + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + # columns [4...9], [10, 13] + data2 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(11, 1, -1), columns=np.arange(4, 14) + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + 
df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 11 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + # test df3's index and columns after tiling + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 11 # columns is recorded, so we can get it + + data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]] + data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)] + data2_columns_min_max = [(4, True, 9, True), (10, True, 13, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + data1_index_min_max, True, data2_index_min_max, False + ) + left_columns_splits, right_columns_splits = split_monotonic_index_min_max( + data1_columns_min_max, True, data2_columns_min_max, True + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx( + right_index_splits, False + ) + left_columns_idx_to_original_idx = build_split_idx_to_origin_idx( + left_columns_splits + ) + right_columns_idx_to_original_idx = build_split_idx_to_origin_idx( + right_columns_splits + ) + + assert df3.chunk_shape == (7, 7) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]] + left_col_idx, left_col_inner_idx = left_columns_idx_to_original_idx[idx[1]] + expect_df1_input = df1.cix[left_row_idx, left_col_idx].data + assert c.inputs[0].inputs[0] is expect_df1_input + left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx] + assert c.inputs[0].op.index_min == left_index_min_max[0] + assert c.inputs[0].op.index_min_close == left_index_min_max[1] + assert c.inputs[0].op.index_max == left_index_min_max[2] + assert c.inputs[0].op.index_max_close == left_index_min_max[3] + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + left_column_min_max = left_columns_splits[left_col_idx][left_col_inner_idx] + assert c.inputs[0].op.column_min == left_column_min_max[0] + assert c.inputs[0].op.column_min_close == left_column_min_max[1] + assert c.inputs[0].op.column_max == left_column_min_max[2] + assert c.inputs[0].op.column_max_close == left_column_min_max[3] + expect_left_columns = filter_index_value( + expect_df1_input.columns_value, left_column_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), expect_left_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[0].dtypes.index, expect_left_columns.to_pandas() + ) + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + 
assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + right_col_idx, right_col_inner_idx = right_columns_idx_to_original_idx[idx[1]] + expect_df2_input = df2.cix[right_row_idx, right_col_idx].data + assert c.inputs[1].inputs[0] is expect_df2_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data2.index)) + right_column_min_max = right_columns_splits[right_col_idx][right_col_inner_idx] + assert c.inputs[1].op.column_min == right_column_min_max[0] + assert c.inputs[1].op.column_min_close == right_column_min_max[1] + assert c.inputs[1].op.column_max == right_column_min_max[2] + assert c.inputs[1].op.column_max_close == right_column_min_max[3] + expect_right_columns = filter_index_value( + expect_df2_input.columns_value, left_column_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), expect_right_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[1].dtypes.index, expect_right_columns.to_pandas() + ) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series_with_align_map(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + s1 = df1[3] + + df2 = func_opts.func(df1, s1) + df1, df2, s1 = tile(df1, df2, s1) + + assert df2.shape == (df1.shape[0], np.nan) + assert df2.index_value.key == df1.index_value.key + + data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]] + data2_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + + left_columns_splits, right_index_splits = split_monotonic_index_min_max( + data1_columns_min_max, True, data2_index_min_max, True + ) + + left_columns_idx_to_original_idx = build_split_idx_to_origin_idx( + left_columns_splits + ) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits) + + assert df2.chunk_shape == (2, 7) + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side (dataframe) + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_col_idx, left_col_inner_idx = left_columns_idx_to_original_idx[idx[1]] + expect_df1_input = df1.cix[idx[0], left_col_idx].data + assert c.inputs[0].inputs[0] is expect_df1_input + left_column_min_max = left_columns_splits[left_col_idx][left_col_inner_idx] + assert c.inputs[0].op.column_min == left_column_min_max[0] + assert c.inputs[0].op.column_min_close == left_column_min_max[1] + assert c.inputs[0].op.column_max == left_column_min_max[2] + assert c.inputs[0].op.column_max_close == left_column_min_max[3] + expect_left_columns = filter_index_value( + expect_df1_input.columns_value, left_column_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), expect_left_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[0].dtypes.index, expect_left_columns.to_pandas() + ) + + # test the right 
side (series) + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[1]] + expect_s1_input = s1.cix[(right_row_idx,)].data + assert c.inputs[1].inputs[0] is expect_s1_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data1[3].index)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series_identical(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(10) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + s1 = from_pandas_series(data1[3], chunk_size=5) + + df2 = func_opts.func(df1, s1) + df1, df2, s1 = tile(df1, df2, s1) + + assert df2.shape == (10, 10) + assert df2.index_value.key == df1.index_value.key + assert df2.columns_value.key == df1.columns_value.key + assert df2.columns_value.key == s1.index_value.key + + assert df2.chunk_shape == (2, 2) + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + assert c.shape == (5, 5) + assert c.index_value.key == df1.cix[c.index].index_value.key + assert c.index_value.key == df2.cix[c.index].index_value.key + assert c.columns_value.key == df1.cix[c.index].columns_value.key + assert c.columns_value.key == df2.cix[c.index].columns_value.key + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df1.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df2.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.dtypes.index, df1.cix[c.index].columns_value.to_pandas() + ) + + # test the left side + assert isinstance(c.inputs[0].op, DataFrameDataSource) + assert c.inputs[0] is df1.cix[c.index].data + # test the right side + assert isinstance(c.inputs[1].op, SeriesDataSource) + assert c.inputs[1] is s1.cix[(c.index[1],)].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series_with_shuffle(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 9, 3, 2, 1, 5, 8, 6, 7, 10], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + s1 = from_pandas_series(data1[10], chunk_size=6) + + df2 = func_opts.func(df1, s1) + + # test df2's index and columns + assert df2.shape == (df1.shape[0], np.nan) + assert df2.index_value.key == df1.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df2.columns_value.key != df1.columns_value.key + + df1, df2, s1 = tile(df1, df2, s1) + + assert df2.chunk_shape == (2, 2) + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in 
c.inputs[0].inputs[0].inputs + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + pd.testing.assert_index_equal( + c.inputs[0].index_value.to_pandas(), c.index_value.to_pandas() + ) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[0].inputs[0].inputs, df1.cix[idx[0], :] + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (idx[0], j) + assert ci.op.column_shuffle_size + shuffle_segments = ci.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ci.inputs[0] is ic.data + + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + assert c.inputs[1].op.output_types[0] == OutputType.series + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[1].inputs[0].inputs, s1.chunks + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (j,) + assert ci.op.index_shuffle_size + assert ci.inputs[0] is ic.data + + # make sure shuffle proxies' key are different + proxy_keys = set() + for i in range(df2.chunk_shape[0]): + cs = [c for c in df2.chunks if c.index[0] == i] + lps = {c.inputs[0].inputs[0].op.key for c in cs} + assert len(lps) == 1 + proxy_keys.add(lps.pop()) + rps = {c.inputs[1].inputs[0].op.key for c in cs} + assert len(rps) == 1 + proxy_keys.add(rps.pop()) + assert len(proxy_keys) == df2.chunk_shape[0] + 1 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_series_with_align_map(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + + s1 = df1.iloc[4] + s2 = df1[3] + + s3 = func_opts.func(s1, s2) + + s1, s2, s3 = tile(s1, s2, s3) + + assert s3.shape == (np.nan,) + + s1_index_min_max = [[3, True, 7, True], [8, True, 12, True]] + s2_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + s1_index_min_max, True, s2_index_min_max, True + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits) + + assert s3.chunk_shape == (7,) + for c in s3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side (series) + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_col_idx, left_col_inner_idx = left_index_idx_to_original_idx[idx[0]] + expect_s1_input = s1.cix[(left_col_idx,)].data + assert c.inputs[0].inputs[0] is expect_s1_input + left_index_min_max = left_index_splits[left_col_idx][left_col_inner_idx] + assert c.inputs[0].op.index_min == left_index_min_max[0] + assert c.inputs[0].op.index_min_close == left_index_min_max[1] + assert c.inputs[0].op.index_max == left_index_min_max[2] + assert 
c.inputs[0].op.index_max_close == left_index_min_max[3] + assert isinstance( + c.inputs[0].index_value.to_pandas(), type(data1.iloc[4].index) + ) + expect_left_index = filter_index_value( + expect_s1_input.index_value, left_index_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[0].index_value.to_pandas(), expect_left_index.to_pandas() + ) + + # test the right side (series) + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + expect_s2_input = s2.cix[(right_row_idx,)].data + assert c.inputs[1].inputs[0] is expect_s2_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data1[3].index)) + expect_right_index = filter_index_value( + expect_s2_input.index_value, right_index_min_max, store_data=True + ) + pd.testing.assert_index_equal( + c.inputs[1].index_value.to_pandas(), expect_right_index.to_pandas() + ) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_series_identical(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(10) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + s1 = from_pandas_series(data1[1], chunk_size=5) + s2 = from_pandas_series(data1[3], chunk_size=5) + + s3 = func_opts.func(s1, s2) + + s1, s2, s3 = tile(s1, s2, s3) + + assert s3.shape == (10,) + assert s3.index_value.key == s1.index_value.key + assert s3.index_value.key == s2.index_value.key + + assert s3.chunk_shape == (2,) + for c in s3.chunks: + assert isinstance(c.op, func_opts.op) + assert c.op.output_types[0] == OutputType.series + assert len(c.inputs) == 2 + assert c.shape == (5,) + assert c.index_value.key == s1.cix[c.index].index_value.key + assert c.index_value.key == s2.cix[c.index].index_value.key + + # test the left side + assert isinstance(c.inputs[0].op, SeriesDataSource) + assert c.inputs[0] is s1.cix[c.index].data + # test the right side + assert isinstance(c.inputs[1].op, SeriesDataSource) + assert c.inputs[1] is s2.cix[c.index].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_series_with_shuffle(func_name, func_opts): + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 9, 3, 2, 1, 5, 8, 6, 7, 10], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + s1 = from_pandas_series(data1.iloc[4], chunk_size=5) + s2 = from_pandas_series(data1[10], chunk_size=6) + + s3 = func_opts.func(s1, s2) + + # test s3's index + assert s3.shape == (np.nan,) + assert s3.index_value.key != s1.index_value.key + assert s3.index_value.key != s2.index_value.key + pd.testing.assert_index_equal( + s3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + s1, s2, s3 = tile(s1, s2, s3) + + assert s3.chunk_shape == (2,) + for c in s3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + assert c.inputs[0].op.output_types[0] == 
OutputType.series + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[0].inputs[0].inputs, s1.chunks + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (j,) + assert ci.op.index_shuffle_size + assert ci.inputs[0] is ic.data + + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + assert c.inputs[1].op.output_types[0] == OutputType.series + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + for j, ci, ic in zip( + itertools.count(0), c.inputs[1].inputs[0].inputs, s2.chunks + ): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (j,) + assert ci.op.index_shuffle_size + assert ci.inputs[0] is ic.data + + # make sure shuffle proxies' key are different + proxy_keys = set() + for c in s3.chunks: + proxy_keys.add(c.inputs[0].inputs[0].op.key) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + assert len(proxy_keys) == 2 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_identical_index_and_columns(func_name, func_opts): + data1 = pd.DataFrame(np.random.rand(10, 10), columns=np.arange(3, 13)) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10), columns=np.arange(3, 13)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=5) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.RangeIndex) + pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.RangeIndex(0, 10)) + assert df3.index_value.key == df1.index_value.key + assert df3.index_value.key == df2.index_value.key + assert df3.shape == (10, 10) # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (2, 2) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + assert c.shape == (5, 5) + assert c.index_value.key == df1.cix[c.index].index_value.key + assert c.index_value.key == df2.cix[c.index].index_value.key + assert c.columns_value.key == df1.cix[c.index].columns_value.key + assert c.columns_value.key == df2.cix[c.index].columns_value.key + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df1.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), df2.cix[c.index].columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c.dtypes.index, df1.cix[c.index].columns_value.to_pandas() + ) + + # test the left side + assert c.inputs[0] is df1.cix[c.index].data + # test the right side + assert c.inputs[1] is df2.cix[c.index].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_one_shuffle(func_name, func_opts): + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + 
index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + data1_index_min_max, True, data2_index_min_max, False + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx( + right_index_splits, False + ) + + assert df3.chunk_shape == (7, 2) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[0].inputs[0].inputs + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]] + left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx] + ics = [ic for ic in df1.chunks if ic.index[0] == left_row_idx] + for j, ci, ic in zip(itertools.count(0), c.inputs[0].inputs[0].inputs, ics): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (idx[0], j) + assert ci.op.index_min == left_index_min_max[0] + assert ci.op.index_min_close == left_index_min_max[1] + assert ci.op.index_max == left_index_min_max[2] + assert ci.op.index_max_close == left_index_min_max[3] + assert isinstance(ci.index_value.to_pandas(), type(data1.index)) + assert ci.op.column_shuffle_size + shuffle_segments = ci.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ci.inputs[0] is ic.data + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[1].inputs[0].inputs + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) 
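+ # trace the reduce chunk back through the shuffle proxy to df2's original row chunk, then check each map chunk's index range and hashed column segments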
+ right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + ics = [ic for ic in df2.chunks if ic.index[0] == right_row_idx] + for j, ci, ic in zip(itertools.count(0), c.inputs[1].inputs[0].inputs, ics): + assert isinstance(ci.op, DataFrameIndexAlign) + assert ci.op.stage == OperandStage.map + assert ci.index == (idx[0], j) + assert ci.op.index_min == right_index_min_max[0] + assert ci.op.index_min_close == right_index_min_max[1] + assert ci.op.index_max == right_index_min_max[2] + assert ci.op.index_max_close == right_index_min_max[3] + assert ci.op.column_shuffle_size + shuffle_segments = ci.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ci.inputs[0] is ic.data + + # make sure shuffle proxies' key are different + proxy_keys = set() + for i in range(df3.chunk_shape[0]): + cs = [c for c in df3.chunks if c.index[0] == i] + lps = {c.inputs[0].inputs[0].op.key for c in cs} + assert len(lps) == 1 + proxy_keys.add(lps.pop()) + rps = {c.inputs[1].inputs[0].op.key for c in cs} + assert len(rps) == 1 + proxy_keys.add(rps.pop()) + assert len(proxy_keys) == 2 * df3.chunk_shape[0] + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_all_shuffle(func_name, func_opts): + # no axis is monotonic + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (2, 2) + proxy_keys = set() + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[0].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[0].inputs[0].op.key) + for ic, ci in zip(c.inputs[0].inputs[0].inputs, df1.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + 
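+ # both axes are shuffled here, so each map chunk hashes its rows and columns into 2 buckets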
assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 2 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + # test right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]] + for ic in c.inputs[1].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + for ic, ci in zip(c.inputs[1].inputs[0].inputs, df2.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 2 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 2) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + + assert len(proxy_keys) == 2 + + data4 = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + data4 = to_boolean_if_needed(func_opts.func_name, data4) + df4 = from_pandas(data4, chunk_size=3) + + data5 = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + data5 = to_boolean_if_needed(func_opts.func_name, data5) + df5 = from_pandas(data5, chunk_size=3) + + df6 = func_opts.func(df4, df5) + + # test df6's index and columns + pd.testing.assert_index_equal( + df6.columns_value.to_pandas(), func_opts.func(data4, data5).columns + ) + assert isinstance(df6.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df6.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df6.index_value.key != df4.index_value.key + assert df6.index_value.key != df5.index_value.key + assert df6.shape[1] == 20 # columns is recorded, so we can get it + + df4, df5, df6 = tile(df4, df5, df6) + + assert df6.chunk_shape == (4, 4) + proxy_keys = set() + for c in df6.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 4)[c.index[1]] + for ic in c.inputs[0].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert 
isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[0].inputs[0].op.key) + for ic, ci in zip(c.inputs[0].inputs[0].inputs, df4.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 4 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 4 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 4) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + # test right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + hash_dtypes(ic.inputs[0].op.data.dtypes, 4)[c.index[1]] + for ic in c.inputs[1].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + for ic, ci in zip(c.inputs[1].inputs[0].inputs, df5.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 4 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size == 4 + assert ic.columns_value is not None + shuffle_segments = ic.op.column_shuffle_segments + expected_shuffle_segments = hash_dtypes(ci.data.dtypes, 4) + assert len(shuffle_segments) == len(expected_shuffle_segments) + for ss, ess in zip(shuffle_segments, expected_shuffle_segments): + pd.testing.assert_series_equal(ss, ess) + assert ic.inputs[0] is ci.data + + assert len(proxy_keys) == 2 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle_and_with_one_chunk(func_name, func_opts): + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)] + 
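+ # data2's index descends from 11 to 2, giving row-chunk ranges [6, 11] and [2, 5] (listed ascending below)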
data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)] + + left_index_splits, right_index_splits = split_monotonic_index_min_max( + data1_index_min_max, True, data2_index_min_max, False + ) + + left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits) + right_index_idx_to_original_idx = build_split_idx_to_origin_idx( + right_index_splits, False + ) + + assert df3.chunk_shape == (7, 1) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test shape + idx = c.index + # test the left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.map + left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]] + expect_df1_input = df1.cix[left_row_idx, 0].data + assert c.inputs[0].inputs[0] is expect_df1_input + left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx] + assert c.inputs[0].op.index_min == left_index_min_max[0] + assert c.inputs[0].op.index_min_close == left_index_min_max[1] + assert c.inputs[0].op.index_max == left_index_min_max[2] + assert c.inputs[0].op.index_max_close == left_index_min_max[3] + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert c.inputs[0].op.column_min == expect_df1_input.columns_value.min_val + assert ( + c.inputs[0].op.column_min_close + == expect_df1_input.columns_value.min_val_close + ) + assert c.inputs[0].op.column_max == expect_df1_input.columns_value.max_val + assert ( + c.inputs[0].op.column_max_close + == expect_df1_input.columns_value.max_val_close + ) + expect_left_columns = expect_df1_input.columns_value + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), expect_left_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[0].dtypes.index, expect_left_columns.to_pandas() + ) + # test the right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.map + right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]] + expect_df2_input = df2.cix[right_row_idx, 0].data + assert c.inputs[1].inputs[0] is expect_df2_input + right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx] + assert c.inputs[1].op.index_min == right_index_min_max[0] + assert c.inputs[1].op.index_min_close == right_index_min_max[1] + assert c.inputs[1].op.index_max == right_index_min_max[2] + assert c.inputs[1].op.index_max_close == right_index_min_max[3] + assert isinstance(c.inputs[1].index_value.to_pandas(), type(data2.index)) + assert c.inputs[1].op.column_min == expect_df2_input.columns_value.min_val + assert ( + c.inputs[1].op.column_min_close + == expect_df2_input.columns_value.min_val_close + ) + assert c.inputs[1].op.column_max == expect_df2_input.columns_value.max_val + assert ( + c.inputs[1].op.column_max_close + == expect_df2_input.columns_value.max_val_close + ) + expect_right_columns = expect_df2_input.columns_value + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), expect_right_columns.to_pandas() + ) + pd.testing.assert_index_equal( + c.inputs[1].dtypes.index, expect_right_columns.to_pandas() + ) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_both_one_chunk(func_name, func_opts): + # no axis is monotonic, but 1 chunk for all axes + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = 
to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=10) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=10) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (1, 1) + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test the left side + assert c.inputs[0] is df1.chunks[0].data + # test the right side + assert c.inputs[1] is df2.chunks[0].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_shuffle_and_one_chunk(func_name, func_opts): + # no axis is monotonic + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + # test df3's index and columns + pd.testing.assert_index_equal( + df3.columns_value.to_pandas(), func_opts.func(data1, data2).columns + ) + assert isinstance(df3.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df3.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df3.index_value.key != df1.index_value.key + assert df3.index_value.key != df2.index_value.key + assert df3.shape[1] == 12 # columns is recorded, so we can get it + + df1, df2, df3 = tile(df1, df2, df3) + + assert df3.chunk_shape == (2, 1) + proxy_keys = set() + for c in df3.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test left side + assert isinstance(c.inputs[0].op, DataFrameIndexAlign) + assert c.inputs[0].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + ic.inputs[0].op.data.dtypes + for ic in c.inputs[0].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[0].columns_value.to_pandas(), c.inputs[0].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[0].inputs[0].op.key) + for ic, ci in zip(c.inputs[0].inputs[0].inputs, df1.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_min == ci.columns_value.min_val + assert ic.op.column_min_close == ci.columns_value.min_val_close + assert ic.op.column_max == 
ci.columns_value.max_val + assert ic.op.column_max_close == ci.columns_value.max_val_close + assert ic.op.column_shuffle_size is None + assert ic.columns_value is not None + assert ic.inputs[0] is ci.data + # test right side + assert isinstance(c.inputs[1].op, DataFrameIndexAlign) + assert c.inputs[1].op.stage == OperandStage.reduce + expect_dtypes = pd.concat( + [ + ic.inputs[0].op.data.dtypes + for ic in c.inputs[1].inputs[0].inputs + if ic.index[0] == 0 + ] + ) + pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes) + pd.testing.assert_index_equal( + c.inputs[1].columns_value.to_pandas(), c.inputs[1].dtypes.index + ) + assert isinstance(c.inputs[0].index_value.to_pandas(), type(data1.index)) + assert isinstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy) + proxy_keys.add(c.inputs[1].inputs[0].op.key) + for ic, ci in zip(c.inputs[1].inputs[0].inputs, df2.chunks): + assert isinstance(ic.op, DataFrameIndexAlign) + assert ic.op.stage == OperandStage.map + assert ic.op.index_shuffle_size == 2 + assert isinstance(ic.index_value.to_pandas(), type(data1.index)) + assert ic.op.column_shuffle_size is None + assert ic.op.column_min == ci.columns_value.min_val + assert ic.op.column_min_close == ci.columns_value.min_val_close + assert ic.op.column_max == ci.columns_value.max_val + assert ic.op.column_max_close == ci.columns_value.max_val_close + assert ic.op.column_shuffle_size is None + assert ic.columns_value is not None + assert ic.inputs[0] is ci.data + + assert len(proxy_keys) == 2 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_on_same_dataframe(func_name, func_opts): + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + data = to_boolean_if_needed(func_opts.func_name, data) + df = from_pandas(data, chunk_size=3) + df2 = func_opts.func(df, df) + + # test df2's index and columns + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), func_opts.func(data, data).columns + ) + assert isinstance(df2.index_value.value, IndexValue.Int64Index) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert df2.index_value.key == df.index_value.key + assert df2.columns_value.key == df.columns_value.key + assert df2.shape[1] == 10 + + df, df2 = tile(df, df2) + + assert df2.chunk_shape == df.chunk_shape + for c in df2.chunks: + assert isinstance(c.op, func_opts.op) + assert len(c.inputs) == 2 + # test the left side + assert c.inputs[0] is df.cix[c.index].data + # test the right side + assert c.inputs[1] is df.cix[c.index].data + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_scalar(func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # bitwise logical operators doesn\'t support floating point scalars + return + + data = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + df = from_pandas(data, chunk_size=5) + # test operator with scalar + result = func_opts.func(df, 1) + result2 = getattr(df, func_opts.func_name)(1) + + # test reverse operator with scalar + result3 = getattr(df, func_opts.rfunc_name)(1) + result4 = func_opts.func(df, 1) + result5 = func_opts.func(1, df) + + expected = func_opts.func(data, 2) + pd.testing.assert_series_equal(result.dtypes, expected.dtypes) + + pd.testing.assert_index_equal(result.columns_value.to_pandas(), data.columns) + assert 
isinstance(result.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result2.columns_value.to_pandas(), data.columns) + assert isinstance(result2.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result3.columns_value.to_pandas(), data.columns) + assert isinstance(result3.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result4.columns_value.to_pandas(), data.columns) + assert isinstance(result4.index_value.value, IndexValue.Int64Index) + + pd.testing.assert_index_equal(result5.columns_value.to_pandas(), data.columns) + assert isinstance(result5.index_value.value, IndexValue.Int64Index) + + if "builtin_function_or_method" not in str(type(func_opts.func)): + # skip NotImplemented test for comparison function + return + + # test NotImplemented, use other's rfunc instead + class TestRFunc: + pass + + setattr(TestRFunc, f"__{func_opts.rfunc_name}__", lambda *_: 1) + other = TestRFunc() + ret = func_opts.func(df, other) + assert ret == 1 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series_and_scalar(func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # bitwise logical operators doesn\'t support floating point scalars + return + + data = pd.Series(range(10), index=[1, 3, 4, 2, 9, 10, 33, 23, 999, 123]) + s1 = from_pandas_series(data, chunk_size=3) + r = getattr(s1, func_opts.func_name)(456) + s1, r = tile(s1, r) + + assert r.index_value.key == s1.index_value.key + assert r.chunk_shape == s1.chunk_shape + assert r.dtype == getattr(data, func_opts.func_name)(456).dtype + + for cr in r.chunks: + cs = s1.cix[cr.index] + assert cr.index_value.key == cs.index_value.key + assert isinstance(cr.op, func_opts.op) + assert len(cr.inputs) == 1 + assert isinstance(cr.inputs[0].op, SeriesDataSource) + assert cr.op.rhs == 456 + + if "builtin_function_or_method" not in str(type(func_opts.func)): + # skip rfunc test for comparison function + return + + s1 = from_pandas_series(data, chunk_size=3) + r = getattr(s1, func_opts.rfunc_name)(789) + s1, r = tile(s1, r) + + assert r.index_value.key == s1.index_value.key + assert r.chunk_shape == s1.chunk_shape + + for cr in r.chunks: + cs = s1.cix[cr.index] + assert cr.index_value.key == cs.index_value.key + assert isinstance(cr.op, func_opts.op) + assert len(cr.inputs) == 1 + assert isinstance(cr.inputs[0].op, SeriesDataSource) + assert cr.op.lhs == 789 + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_check_inputs(func_name, func_opts): + data = pd.DataFrame(np.random.rand(10, 3)) + data = to_boolean_if_needed(func_opts.func_name, data) + df = from_pandas(data) + + with pytest.raises(ValueError): + _ = df + np.random.rand(5, 3) + + with pytest.raises(ValueError): + _ = df + np.random.rand(10) + + with pytest.raises(ValueError): + _ = df + np.random.rand(10, 3, 2) + + data = pd.Series(np.random.rand(10)) + series = from_pandas_series(data) + + with pytest.raises(ValueError): + _ = series + np.random.rand(5, 3) + + with pytest.raises(ValueError): + _ = series + np.random.rand(5) + + +def test_abs(): + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + df1 = from_pandas(data1, chunk_size=(5, 10)) + + df2 = df1.abs() + + # test df2's index and columns + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df1.columns_value.to_pandas() + ) + assert isinstance(df2.index_value.value, 
IndexValue.Int64Index) + assert df2.shape == (10, 10) + + df1, df2 = tile(df1, df2) + + assert df2.chunk_shape == (2, 1) + for c2, c1 in zip(df2.chunks, df1.chunks): + assert isinstance(c2.op, DataFrameAbs) + assert len(c2.inputs) == 1 + # compare with input chunks + assert c2.index == c1.index + pd.testing.assert_index_equal( + c2.columns_value.to_pandas(), c1.columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c2.index_value.to_pandas(), c1.index_value.to_pandas() + ) + + +def test_not(): + data1 = pd.DataFrame( + np.random.rand(10, 10) > 0.5, + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + df1 = from_pandas(data1, chunk_size=(5, 10)) + + df2 = ~df1 + + # test df2's index and columns + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df1.columns_value.to_pandas() + ) + assert isinstance(df2.index_value.value, IndexValue.Int64Index) + assert df2.shape == (10, 10) + + df1, df2 = tile(df1, df2) + + assert df2.chunk_shape == (2, 1) + for c2, c1 in zip(df2.chunks, df1.chunks): + assert isinstance(c2.op, DataFrameNot) + assert len(c2.inputs) == 1 + # compare with input chunks + assert c2.index == c1.index + pd.testing.assert_index_equal( + c2.columns_value.to_pandas(), c1.columns_value.to_pandas() + ) + pd.testing.assert_index_equal( + c2.index_value.to_pandas(), c1.index_value.to_pandas() + ) + + +def test_arithmetic_lazy_chunk_meta(): + df = dataframe_from_tensor(mt.random.rand(10, 3, chunk_size=3)) + df2 = df + 1 + df2 = tile(df2) + + chunk = df2.chunks[0].data + assert chunk._FIELDS["_dtypes"].get(chunk) is None + pd.testing.assert_series_equal(chunk.dtypes, df.dtypes) + assert chunk._FIELDS["_dtypes"].get(chunk) is not None + assert chunk._FIELDS["_index_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.index_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_index_value"].get(chunk) is not None + assert chunk._FIELDS["_columns_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.columns_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_columns_value"].get(chunk) is not None + + +def test_datetime_arithmetic(): + data1 = ( + pd.Series([pd.Timedelta(days=d) for d in range(10)]) + datetime.datetime.now() + ) + s1 = from_pandas_series(data1) + + assert (s1 + pd.Timedelta(days=10)).dtype == (data1 + pd.Timedelta(days=10)).dtype + assert (s1 + datetime.timedelta(days=10)).dtype == ( + data1 + datetime.timedelta(days=10) + ).dtype + assert (s1 - pd.Timestamp.now()).dtype == (data1 - pd.Timestamp.now()).dtype + assert (s1 - datetime.datetime.now()).dtype == ( + data1 - datetime.datetime.now() + ).dtype diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic_execution.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic_execution.py new file mode 100644 index 000000000..d4b45c2be --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_arithmetic_execution.py @@ -0,0 +1,917 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from dataclasses import dataclass +from functools import partial +from typing import Callable, Union + +import numpy as np +import pandas as pd +import pytest + +from .... import tensor as mt +from ....tensor.datasource import array as from_array +from ....utils import dataslots +from ... import to_datetime +from ...datasource.dataframe import from_pandas +from ...datasource.series import from_pandas as from_pandas_series +from ..tests.test_arithmetic import comp_func + + +@dataslots +@dataclass +class FunctionOptions: + func: Callable + func_name: str + rfunc_name: str + + +binary_functions = dict( + add=FunctionOptions(func=operator.add, func_name="add", rfunc_name="radd"), + equal=FunctionOptions(func=comp_func("eq", "eq"), func_name="eq", rfunc_name="eq"), + logical_and=FunctionOptions( + func=operator.and_, func_name="__and__", rfunc_name="__rand__" + ), +) + + +def sort_dataframe( + df: Union[pd.DataFrame, pd.Series], index: bool = True, columns: bool = True +): + if index: + df.sort_index(inplace=True) + if columns and isinstance(df, pd.DataFrame): + df.sort_index(axis=1, inplace=True) + return df + + +def to_boolean_if_needed(func_name, value, split_value=0.5): + if func_name in ["__and__", "__or__", "__xor__"]: + return value > split_value + else: + return value + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle_execution(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. + return + + # all the axes are monotonic + # data1 with index split into [0...4], [5...9], + # columns [3...7], [8...12] + data1 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13) + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + # columns [4...9], [10, 13] + data2 = pd.DataFrame( + np.random.rand(10, 10), index=np.arange(11, 1, -1), columns=np.arange(4, 14) + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_one_shuffle_execution(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_all_shuffle_execution(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. + return + + # no axis is monotonic + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_both_with_one_chunk(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=10) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=10) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(expected, result) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=10) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=10) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_without_shuffle_and_with_one_chunk(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(10, 5)) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(10, 6)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_shuffle_and_with_one_chunk(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # pandas fails to compute some expected values due to `na`. 
+ return + + # only 1 axis is monotonic + # data1 with index split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + df1 = from_pandas(data1, chunk_size=(10, 5)) + # data2 with index split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + df2 = from_pandas(data2, chunk_size=(10, 6)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # only 1 axis is monotonic + # data1 with columns split into [0...4], [5...9], + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + columns=np.arange(10), + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=(5, 10)) + # data2 with columns split into [6...11], [2, 5], + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + columns=np.arange(11, 1, -1), + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=(6, 10)) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_same_index(setup, func_name, func_opts): + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(0, 2, size=(10,)), + columns=["c" + str(i) for i in range(10)], + ) + data = to_boolean_if_needed(func_opts.func_name, data) + df = from_pandas(data, chunk_size=3) + df2 = func_opts.func(df, df) + + expected = func_opts.func(data, data) + result = df2.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + series = from_pandas_series(data.iloc[0], chunk_size=3) + df3 = func_opts.func(df, series) + + expected = func_opts.func(data, data.iloc[0]) + result = df3.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + series = from_pandas_series(data.iloc[:, 0], chunk_size=3) + df4 = getattr(df, func_opts.func_name)(series, axis=0) + + if func_opts.func_name not in ["__and__", "__or__", "__xor__"]: + expected = getattr(data, func_opts.func_name)(data.iloc[:, 0], axis=0) + result = df4.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_chained(setup, func_name, func_opts): + data1 = pd.DataFrame(np.random.rand(10, 10)) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + data4 = pd.DataFrame(np.random.rand(10, 10)) + data4 = to_boolean_if_needed(func_opts.func_name, data4) + df4 = from_pandas(data4, chunk_size=6) + + df5 = func_opts.func(df3, df4) + + result = df5.execute().fetch() + expected = func_opts.func(func_opts.func(data1, data2), data4) + + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_rfunc(setup, func_name, func_opts): + data1 = pd.DataFrame(np.random.rand(10, 10)) + data1 = 
to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + df3 = getattr(df1, func_opts.rfunc_name)(df2) + result = df3.execute().fetch() + expected = func_opts.func(data2, data1) + pd.testing.assert_frame_equal(expected, result) + + data3 = pd.DataFrame(np.random.rand(10, 10)) + data3 = to_boolean_if_needed(func_opts.func_name, data3) + df4 = from_pandas(data3, chunk_size=5) + df5 = getattr(df4, func_opts.rfunc_name)(1) + # todo check dtypes when pandas reverts its behavior on broadcasting + check_dtypes = func_opts.func_name not in ("__and__", "__or__", "__xor__") + result = df5.execute().fetch(extra_config=dict(check_dtypes=check_dtypes)) + expected2 = func_opts.func(1, data3) + pd.testing.assert_frame_equal(expected2, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_multi_forms(setup, func_name, func_opts): + # test multiple forms + # such as self+other, func_opts.add(other), add(self,other) + data1 = pd.DataFrame(np.random.rand(10, 10)) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 10)) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + expected = func_opts.func(data1, data2) + result = func_opts.func(df1, df2).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result = func_opts.func(df1, df2).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result = getattr(df1, func_opts.func_name)(df2).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result = getattr(df1, func_opts.rfunc_name)(df2).execute().fetch() + pd.testing.assert_frame_equal(func_opts.func(data2, data1), result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_scalar(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators don't support floating point scalars + return + + # test dataframe and scalar + pdf = pd.DataFrame(np.random.rand(10, 10)) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=2) + expected = func_opts.func(pdf, 1) + result = func_opts.func(df, 1).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + result2 = func_opts.func(df, 1).execute().fetch() + pd.testing.assert_frame_equal(expected, result2) + result3 = getattr(df, func_opts.func_name)(1).execute().fetch() + pd.testing.assert_frame_equal(expected, result3) + + # test scalar and dataframe + result4 = func_opts.func(df, 1).execute().fetch() + pd.testing.assert_frame_equal(expected, result4) + + expected2 = func_opts.func(1, pdf) + result5 = func_opts.func(1, df).execute().fetch() + pd.testing.assert_frame_equal(expected2, result5) + + result6 = getattr(df, func_opts.rfunc_name)(1).execute().fetch() + pd.testing.assert_frame_equal(expected2, result6) + + # test pandas series and dataframe + pdf2 = pd.DataFrame(np.random.rand(10, 10)) + expected = func_opts.func(pdf2, pdf) + result = func_opts.func(pdf2, df).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_shuffle_on_string_index(setup, func_name, func_opts): + if 
func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # FIXME bitwise logical operators behave differently with pandas when index is not aligned. + return + + # no axis is monotonic, and the index values are strings. + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=[str(x) for x in [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]], + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=5) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=[str(x) for x in [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]], + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + df2 = from_pandas(data2, chunk_size=6) + + df3 = func_opts.func(df1, df2) + + expected = func_opts.func(data1, data2) + result = df3.execute().fetch() + + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_dataframe_and_series(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # pandas fails to compute some expected values due to `na`. + return + + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + data2 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(11, 1, -1), + columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], + ) + data2 = to_boolean_if_needed(func_opts.func_name, data2) + + s1 = from_pandas_series(data2[1], chunk_size=(6,)) + + # operate on single-column dataframe and series + df1 = from_pandas(data1[[1]], chunk_size=(5, 5)) + r1 = getattr(df1, func_opts.func_name)(s1, axis="index") + + expected = getattr(data1[[1]], func_opts.func_name)(data2[1], axis="index") + result = r1.execute().fetch() + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # operate on dataframe and series without shuffle + df2 = from_pandas(data1, chunk_size=(5, 5)) + r2 = getattr(df2, func_opts.func_name)(s1, axis="index") + + expected = getattr(data1, func_opts.func_name)(data2[1], axis="index") + result = r2.execute().fetch() + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # operate on dataframe and series with shuffle + df3 = from_pandas(data1, chunk_size=(5, 5)) + r3 = getattr(df3, func_opts.func_name)(s1, axis="columns") + + expected = getattr(data1, func_opts.func_name)(data2[1], axis="columns") + result = r3.execute().fetch() + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test both one chunk, axis=0 + pdf = pd.DataFrame({"ca": [1, 3, 2], "cb": [360, 180, 2]}, index=[1, 2, 3]) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=0).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=0) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test different number of chunks, axis=0 + pdf = pd.DataFrame({"ca": [1, 3, 2], "cb": [360, 180, 2]}, index=[1, 2, 3]) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, 
func_opts.func_name)(mars_series, axis=0).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=0) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test with row shuffle, axis=0 + pdf = pd.DataFrame({"ca": [1, 3, 2], "cb": [360, 180, 2]}, index=[2, 1, 3]) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[3, 1, 2]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=0).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=0).reindex([3, 1, 2]) + # modify the order of rows + result = result.reindex(index=[3, 1, 2]) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test both one chunk, axis=1 + pdf = pd.DataFrame( + {1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]}, index=["ra", "rb", "rc"] + ) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=1) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test different number of chunks, axis=1 + pdf = pd.DataFrame( + {1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]}, index=["ra", "rb", "rc"] + ) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[1, 2, 3]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=1) + pd.testing.assert_frame_equal(sort_dataframe(expected), sort_dataframe(result)) + + # test with row shuffle, axis=1 + pdf = pd.DataFrame( + {1: [1, 3, 2], 3: [1, 2, 3], 2: [360, 180, 2]}, index=["ra", "rb", "rc"] + ) + pdf = to_boolean_if_needed(func_opts.func_name, pdf) + df = from_pandas(pdf, chunk_size=1) + series = pd.Series([0, 1, 2], index=[3, 1, 2]) + mars_series = from_pandas_series(series) + result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch() + expected = getattr(pdf, func_opts.func_name)(series, axis=1) + # modify the order of columns + result = result[[1, 2, 3]] + pd.testing.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_series(setup, func_name, func_opts): + # only one chunk + s1 = pd.Series(np.arange(10) + 1) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(np.arange(10) + 1) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=10), from_pandas_series(s2, chunk_size=10) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(expected, result) + + # same index + s1 = pd.Series(np.arange(10) + 1) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(np.arange(10) + 1) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=4), from_pandas_series(s2, chunk_size=6) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(expected, result) + + # no shuffle + s1 = pd.Series(np.arange(10) + 1, index=range(10)) + s1 = 
to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(np.arange(10) + 1, index=range(10, 0, -1)) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=4), from_pandas_series(s2, chunk_size=6) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(expected, result) + + # shuffle + data = (np.arange(10) + 1).astype(np.int64, copy=False) + s1 = pd.Series(data, index=np.random.permutation(range(10))) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + s2 = pd.Series(data, index=np.random.permutation(range(10, 0, -1))) + s2 = to_boolean_if_needed(func_opts.func_name, s2) + r = func_opts.func( + from_pandas_series(s1, chunk_size=4), from_pandas_series(s2, chunk_size=6) + ) + result = r.execute().fetch() + expected = func_opts.func(s1, s2) + pd.testing.assert_series_equal(sort_dataframe(expected), sort_dataframe(result)) + + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # bitwise logical operators don't support floating point scalars + return + + # operate with scalar + s1 = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10))) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + r = func_opts.func(from_pandas_series(s1, chunk_size=4), 4) + result = r.execute().fetch() + expected = func_opts.func(s1, 4) + pd.testing.assert_series_equal(expected, result) + + # reverse with scalar + s1 = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10))) + s1 = to_boolean_if_needed(func_opts.func_name, s1) + r = func_opts.func(4, from_pandas_series(s1, chunk_size=4)) + result = r.execute().fetch() + expected = func_opts.func(4, s1) + pd.testing.assert_series_equal(expected, result) + + +@pytest.mark.skipif( + pd.__version__ < "1.2.0", reason="skip due to the incompatibilities." +) +@pytest.mark.parametrize("func_name, func_opts", binary_functions.items()) +def test_with_plain_value(setup, func_name, func_opts): + if func_opts.func_name in ["__and__", "__or__", "__xor__"]: + # skip tests for bitwise logical operators on plain value. 
+ return + + data1 = pd.DataFrame( + np.random.rand(10, 10), + index=np.arange(10), + columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=6) + s1 = df1[2] + + r = getattr(df1, func_opts.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], axis=0) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(df1, func_opts.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10), axis=0) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(s1, func_opts.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + pd.testing.assert_series_equal(expected, result) + + r = getattr(s1, func_opts.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + pd.testing.assert_series_equal(expected, result) + + # specify index, not the default range index + data1 = pd.DataFrame( + np.random.rand(10, 7), index=np.arange(5, 15), columns=[4, 1, 3, 2, 5, 6, 7] + ) + data1 = to_boolean_if_needed(func_opts.func_name, data1) + df1 = from_pandas(data1, chunk_size=6) + s1 = df1[2] + + r = getattr(df1, func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0 + ) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(df1, func_opts.func_name)( + from_array(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])), axis=0 + ) + result = r.execute().fetch() + expected = getattr(data1, func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0 + ) + pd.testing.assert_frame_equal(expected, result) + + r = getattr(s1, func_opts.func_name)(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + ) + pd.testing.assert_series_equal(expected, result) + + r = getattr(s1, func_opts.func_name)( + from_array(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) + ) + result = r.execute().fetch() + expected = getattr(data1[2], func_opts.func_name)( + np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + ) + pd.testing.assert_series_equal(expected, result) + + +def test_abs(setup): + data1 = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10))) + df1 = from_pandas(data1, chunk_size=5) + + result = df1.abs().execute().fetch() + expected = data1.abs() + pd.testing.assert_frame_equal(expected, result) + + result = abs(df1).execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + +def test_not(setup): + data1 = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)) > 0) + df1 = from_pandas(data1, chunk_size=5) + + result = ~df1.execute().fetch() + expected = ~data1 + pd.testing.assert_frame_equal(expected, result) + + +def test_negative(setup): + data1 = pd.DataFrame(np.random.randint(low=0, high=100, size=(10, 10))) + df1 = from_pandas(data1, chunk_size=5) + + result = -df1.execute().fetch() + expected = -data1 + pd.testing.assert_frame_equal(expected, result) + + +def test_ufunc(setup): + df_raw = pd.DataFrame( 
+ np.random.uniform(size=(10, 10)), index=pd.RangeIndex(9, -1, -1) + ) + df = from_pandas(df_raw, chunk_size=5) + + series_raw = pd.Series(np.random.uniform(size=10), index=pd.RangeIndex(9, -1, -1)) + series = from_pandas_series(series_raw, chunk_size=5) + + ufuncs = [ + [np.abs, mt.abs], + [np.log, mt.log], + [np.log2, mt.log2], + [np.log10, mt.log10], + [np.sin, mt.sin], + [np.cos, mt.cos], + [np.tan, mt.tan], + [np.sinh, mt.sinh], + [np.cosh, mt.cosh], + [np.tanh, mt.tanh], + [np.arcsin, mt.arcsin], + [np.arccos, mt.arccos], + [np.arctan, mt.arctan], + [np.arcsinh, mt.arcsinh], + [np.arccosh, mt.arccosh], + [np.arctanh, mt.arctanh], + [np.radians, mt.radians], + [np.degrees, mt.degrees], + [np.ceil, mt.ceil], + [np.floor, mt.floor], + [partial(np.around, decimals=2), partial(mt.around, decimals=2)], + [np.exp, mt.exp], + [np.exp2, mt.exp2], + [np.expm1, mt.expm1], + [np.sqrt, mt.sqrt], + [np.isnan, mt.isnan], + [np.isfinite, mt.isfinite], + [np.isinf, mt.isinf], + [np.negative, mt.negative], + ] + + for raw, data in [(df_raw, df), (series_raw, series)]: + for npf, mtf in ufuncs: + r = mtf(data) + + result = r.execute().fetch() + expected = npf(raw) + + if isinstance(raw, pd.DataFrame): + pd.testing.assert_frame_equal(result, expected) + else: + pd.testing.assert_series_equal(result, expected) + + # test numpy ufunc + r = npf(data) + + result = r.execute().fetch() + + if isinstance(raw, pd.DataFrame): + pd.testing.assert_frame_equal(result, expected) + else: + pd.testing.assert_series_equal(result, expected) + + +def test_date_time_bin(setup): + rs = np.random.RandomState(0) + df_raw = pd.DataFrame( + { + "a": rs.randint(1000, size=10), + "b": rs.rand(10), + "c": [pd.Timestamp(rs.randint(1604000000, 1604481373)) for _ in range(10)], + }, + index=pd.RangeIndex(9, -1, -1), + ) + df = from_pandas(df_raw, chunk_size=5) + r = (df["c"] > to_datetime("2000-01-01")) & (df["c"] < to_datetime("2021-01-01")) + + result = r.execute().fetch() + expected = (df_raw["c"] > pd.to_datetime("2000-01-01")) & ( + df_raw["c"] < pd.to_datetime("2021-01-01") + ) + pd.testing.assert_series_equal(result, expected) + + +def test_series_and_tensor(setup): + rs = np.random.RandomState(0) + s_raw = pd.Series(rs.rand(10)) < 0.5 + a_raw = rs.rand(10) < 0.5 + + series = from_pandas_series(s_raw, chunk_size=5) + t = mt.tensor(a_raw, chunk_size=5) + + r = t | series + result = r.execute().fetch() + expected = a_raw | s_raw + pd.testing.assert_series_equal(result, expected) diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_comparison.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_comparison.py new file mode 100644 index 000000000..c92491bcd --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_comparison.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
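+
+# The tests below cover the element-wise comparison operators (eq, ne, lt,
+# gt, le, ge) on DataFrame and Series. Unlike the arithmetic operators,
+# comparisons require identically-labeled operands: a DataFrame whose index
+# or columns differ from the other operand is expected to raise ValueError.
+# Comparisons against a datetime scalar and between period-typed Series are
+# exercised as well.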
+ +import operator +from datetime import datetime + +import numpy as np +import pandas as pd +import pytest + +from ....core import enter_mode +from ...initializer import DataFrame, Series + + +def test_comp(setup): + raw_df1 = pd.DataFrame(np.random.rand(4, 3)) + raw_df2 = pd.DataFrame(np.random.rand(4, 3)) + df1 = DataFrame(raw_df1) + df2 = DataFrame(raw_df2) + + with enter_mode(build=True): + assert not df1.data == df2.data + assert df1.data == df1.data + + for op in [ + operator.eq, + operator.ne, + operator.lt, + operator.gt, + operator.le, + operator.ge, + ]: + eq_df = op(df1, df2) + pd.testing.assert_index_equal( + eq_df.index_value.to_pandas(), df1.index_value.to_pandas() + ) + eq_df = op(raw_df1, df2) + pd.testing.assert_index_equal( + eq_df.index_value.to_pandas(), df1.index_value.to_pandas() + ) + + # index not identical + df3 = DataFrame(pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4])) + with pytest.raises(ValueError): + op(df1, df3) + + # columns not identical + df4 = DataFrame(pd.DataFrame(np.random.rand(4, 3), columns=["a", "b", "c"])) + with pytest.raises(ValueError): + op(df1, df4) + + # test datetime + df = DataFrame(pd.DataFrame(pd.date_range("20130101", periods=6))) + for op in [ + operator.eq, + operator.ne, + operator.lt, + operator.gt, + operator.le, + operator.ge, + ]: + r_df = op(df, datetime(2013, 1, 2)) + pd.testing.assert_index_equal( + r_df.index_value.to_pandas(), df.index_value.to_pandas() + ) + + # test period type + raw = pd.period_range("2000-01-01", periods=10, freq="D") + raw_series = pd.Series(raw) + series = Series(raw, chunk_size=5) + r = series >= series[1] + pd.testing.assert_series_equal(r.to_pandas(), raw_series >= raw_series[1]) diff --git a/python/xorbits/_mars/dataframe/arithmetic/tests/test_dot.py b/python/xorbits/_mars/dataframe/arithmetic/tests/test_dot.py new file mode 100644 index 000000000..7993efcf4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/tests/test_dot.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ... 
import DataFrame, Series + + +def test_dot_execution(setup): + df1_raw = pd.DataFrame(np.random.rand(4, 7)) + df2_raw = pd.DataFrame(np.random.rand(7, 5), columns=list("efghi")) + s1_raw = pd.Series(np.random.rand(7)) + s2_raw = pd.Series(np.random.rand(7)) + + df1 = DataFrame(df1_raw, chunk_size=(3, 2)) + df2 = DataFrame(df2_raw, chunk_size=(3, 4)) + + # df.dot(df) + r = df1.dot(df2) + result = r.execute().fetch() + expected = df1_raw.dot(df2_raw) + pd.testing.assert_frame_equal(result, expected) + + # test @ + r = df1 @ df2 + result = r.execute().fetch() + expected = df1_raw @ df2_raw + pd.testing.assert_frame_equal(result, expected) + + # test reversed @ + r = df1_raw @ df2 + result = r.execute().fetch() + expected = df1_raw @ df2_raw + pd.testing.assert_frame_equal(result, expected) + + series1 = Series(s1_raw, chunk_size=5) + + # df.dot(series) + r = df1.dot(series1) + result = r.execute().fetch() + expected = df1_raw.dot(s1_raw) + pd.testing.assert_series_equal(result, expected) + + # df.dot(2d_array) + r = df1.dot(df2_raw.to_numpy()) + result = r.execute().fetch() + expected = df1_raw.dot(df2_raw.to_numpy()) + pd.testing.assert_frame_equal(result, expected) + + # df.dot(1d_array) + r = df1.dot(s1_raw.to_numpy()) + result = r.execute().fetch() + expected = df1_raw.dot(s1_raw.to_numpy()) + pd.testing.assert_series_equal(result, expected) + + series2 = Series(s2_raw, chunk_size=4) + + # series.dot(series) + r = series1.dot(series2) + result = r.execute().fetch() + expected = s1_raw.dot(s2_raw) + assert pytest.approx(result) == expected + + # series.dot(df) + r = series1.dot(df2) + result = r.execute().fetch() + expected = s1_raw.dot(df2_raw) + pd.testing.assert_series_equal(result, expected) + + # series.dot(2d_array) + r = series1.dot(df2_raw.to_numpy()) + result = r.execute().fetch() + expected = s1_raw.dot(df2_raw.to_numpy()) + np.testing.assert_almost_equal(result, expected) + + # series.dot(1d_array) + r = series1.dot(s2_raw.to_numpy()) + result = r.execute().fetch() + expected = s1_raw.dot(s2_raw.to_numpy()) + assert pytest.approx(result) == expected diff --git a/python/xorbits/_mars/dataframe/arithmetic/truediv.py b/python/xorbits/_mars/dataframe/arithmetic/truediv.py new file mode 100644 index 000000000..7bf2ecc09 --- /dev/null +++ b/python/xorbits/_mars/dataframe/arithmetic/truediv.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +from ... 
import opcodes as OperandDef +from ...utils import classproperty +from .core import DataFrameBinopUfunc +from .docstring import bin_arithmetic_doc + + +class DataFrameTrueDiv(DataFrameBinopUfunc): + _op_type_ = OperandDef.DIV + + _func_name = "truediv" + _rfunc_name = "rtruediv" + + @classproperty + def _operator(self): + return operator.truediv + + @classproperty + def tensor_op_type(self): + from ...tensor.arithmetic import TensorTrueDiv + + return TensorTrueDiv + + +_truediv_example = """ +>>> a.truediv(b, fill_value=0).execute() +a 1.0 +b inf +c inf +d 0.0 +e NaN +dtype: float64 +""" + + +@bin_arithmetic_doc("Floating division", equiv="/", series_example=_truediv_example) +def truediv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameTrueDiv( + axis=axis, level=level, fill_value=fill_value, lhs=df, rhs=other + ) + return op(df, other) + + +@bin_arithmetic_doc("Floating division", equiv="/", series_example=_truediv_example) +def rtruediv(df, other, axis="columns", level=None, fill_value=None): + op = DataFrameTrueDiv( + axis=axis, level=level, fill_value=fill_value, lhs=other, rhs=df + ) + return op.rcall(df, other) diff --git a/python/xorbits/_mars/dataframe/arrays.py b/python/xorbits/_mars/dataframe/arrays.py new file mode 100644 index 000000000..e0d13cc7e --- /dev/null +++ b/python/xorbits/_mars/dataframe/arrays.py @@ -0,0 +1,864 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
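+
+# This module implements pandas extension dtypes/arrays backed by pyarrow:
+# ArrowStringDtype/ArrowStringArray for string data and
+# ArrowListDtype/ArrowListArray for list data. When pyarrow is unavailable,
+# or ``options.dataframe.arrow_array.pandas_only`` is set, the arrays fall
+# back to a plain numpy ndarray (outside kernel mode) so that dtype
+# inference still works.
+#
+# A minimal usage sketch (illustrative only; assumes pyarrow is installed):
+#
+#     >>> import pandas as pd
+#     >>> s = pd.Series(["a", "b", None], dtype=ArrowStringDtype())
+#     >>> s.dtype.name
+#     'Arrow[string]'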
+ +import itertools +import operator +import re +from copy import copy as copy_obj +from numbers import Integral +from typing import Sequence, Type + +import numpy as np +import pandas as pd +from pandas._libs import lib +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, + register_extension_dtype, +) +from pandas.api.indexers import check_array_indexer +from pandas.api.types import ( + is_array_like, + is_list_like, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.arrays import StringArray as StringArrayBase +from pandas.compat import set_function_name +from pandas.core import ops +from pandas.core.algorithms import take + +try: + from pandas._libs.arrays import NDArrayBacked +except ImportError: + NDArrayBacked = None +try: + import pyarrow as pa + + pa_null = pa.NULL +except ImportError: # pragma: no cover + pa = None + pa_null = None +try: + import pyarrow.compute as pc +except ImportError: # pragma: no cover + pc = None + +from ..config import options +from ..core import is_kernel_mode +from ..utils import pd_release_version, tokenize + +_use_bool_any_all = pd_release_version[:2] >= (1, 3) +_use_extension_index = pd_release_version[:2] >= (1, 4) +_object_engine_for_string_array = pd_release_version[:2] >= (1, 5) + +if _object_engine_for_string_array: + StringArrayBase = type(StringArrayBase)( + "StringArrayBase", StringArrayBase.__bases__, dict(StringArrayBase.__dict__) + ) + + +class ArrowDtype(ExtensionDtype): + @property + def arrow_type(self): # pragma: no cover + raise NotImplementedError + + def __from_arrow__(self, array): + return self.construct_array_type()(array) + + +@register_extension_dtype +class ArrowStringDtype(ArrowDtype): + """ + Extension dtype for arrow string data. + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + In particular, ArrowStringDtype.na_value may change to no longer be + ``numpy.nan``. 
+ + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> import mars.dataframe as md + >>> md.ArrowStringDtype() + ArrowStringDtype + """ + + type = str + kind = "U" + name = "Arrow[string]" + na_value = pa_null + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls}' from '{string}'") + + @classmethod + def construct_array_type(cls) -> "Type[ArrowStringArray]": + return ArrowStringArray + + @property + def arrow_type(self): + return pa.string() + + +@register_extension_dtype +class ArrowStringDtypeAlias(ArrowStringDtype): + name = "arrow_string" # register an alias name for compatibility + + +class ArrowListDtypeType(type): + """ + the type of ArrowListDtype, this metaclass determines subclass ability + """ + + pass + + +class ArrowListDtype(ArrowDtype): + _metadata = ("_value_type",) + + def __init__(self, dtype): + if isinstance(dtype, type(self)): + dtype = dtype.value_type + if pa and isinstance(dtype, pa.DataType): + dtype = dtype.to_pandas_dtype() + + dtype = pandas_dtype(dtype) + if is_string_dtype(dtype) and not isinstance(dtype, ArrowStringDtype): + # convert string dtype to arrow string dtype + dtype = ArrowStringDtype() + + self._value_type = dtype + + @property + def value_type(self): + return self._value_type + + @property + def kind(self): + return "O" + + @property + def type(self): + return ArrowListDtypeType + + @property + def name(self): + return f"Arrow[List[{self.value_type.name}]]" + + @property + def arrow_type(self): + if isinstance(self._value_type, ArrowDtype): + arrow_subdtype = self._value_type.arrow_type + else: + arrow_subdtype = pa.from_numpy_dtype(self._value_type) + return pa.list_(arrow_subdtype) + + def __repr__(self) -> str: + return self.name + + @classmethod + def construct_array_type(cls) -> "Type[ArrowListArray]": + return ArrowListArray + + @classmethod + def construct_from_string(cls, string): + msg = f"Cannot construct a 'ArrowListDtype' from '{string}'" + xpr = re.compile(r"Arrow\[List\[(?P<value_type>[^,]*)\]\]$") + m = xpr.match(string) + if m: + value_type = m.groupdict()["value_type"] + return ArrowListDtype(value_type) + else: + raise TypeError(msg) + + @classmethod + def is_dtype(cls, dtype) -> bool: + dtype = getattr(dtype, "dtype", dtype) + if isinstance(dtype, str): + try: + cls.construct_from_string(dtype) + except TypeError: + return False + else: + return True + else: + return isinstance(dtype, cls) + + def __hash__(self): + return super().__hash__() + + def __eq__(self, other): + if not isinstance(other, ArrowListDtype): + return False + + value_type = self._value_type + other_value_type = other._value_type + + try: + return value_type == other_value_type + except TypeError: + # cannot compare numpy dtype and extension dtype + return other_value_type == value_type + + +class ArrowArray(ExtensionArray): + _arrow_type = None + + def __init__(self, values, dtype: ArrowDtype = None, copy=False): + pandas_only = self._pandas_only() + + if pa is not None and not pandas_only: + self._init_by_arrow(values, dtype=dtype, copy=copy) + elif not is_kernel_mode(): + # not in kernel mode, allow to use numpy handle data + # just for infer dtypes purpose + self._init_by_numpy(values, dtype=dtype, copy=copy) + else: + raise ImportError("Cannot create ArrowArray when `pyarrow` not installed") + + # for test purpose + self._force_use_pandas = pandas_only + + def _init_by_arrow(self, values, dtype: ArrowDtype = None, copy=False): + 
if isinstance(values, (pd.Index, pd.Series)): + # for pandas Index and Series, + # convert to PandasArray + values = values.array + + if isinstance(values, type(self)): + arrow_array = values._arrow_array + elif isinstance(values, ExtensionArray): + # if come from pandas object like index, + # convert to pandas StringArray first, + # validation will be done in construct + arrow_array = pa.chunked_array([pa.array(values, from_pandas=True)]) + elif isinstance(values, pa.ChunkedArray): + arrow_array = values + elif isinstance(values, pa.Array): + arrow_array = pa.chunked_array([values]) + elif len(values) == 0: # pragma: no cover + arrow_array = pa.chunked_array([pa.array([], type=dtype.arrow_type)]) + else: + arrow_array = pa.chunked_array([pa.array(values, type=dtype.arrow_type)]) + + if copy: + arrow_array = copy_obj(arrow_array) + + self._use_arrow = True + self._arrow_array = arrow_array + + if NDArrayBacked is not None and isinstance(self, NDArrayBacked): + NDArrayBacked.__init__(self, np.array([]), dtype) + else: + self._dtype = dtype + + def _init_by_numpy(self, values, dtype: ArrowDtype = None, copy=False): + self._use_arrow = False + + ndarray = np.array(values, copy=copy) + if NDArrayBacked is not None and isinstance(self, NDArrayBacked): + NDArrayBacked.__init__(self, ndarray, dtype) + else: + self._dtype = dtype + self._ndarray = np.array(values, copy=copy) + + @classmethod + def _pandas_only(cls): + return options.dataframe.arrow_array.pandas_only + + def __repr__(self): + return f"{type(self).__name__}({repr(self._array)})" + + @property + def _array(self): + return self._arrow_array if self._use_arrow else self._ndarray + + @property + def dtype(self) -> "Type[ArrowDtype]": + return self._dtype + + @property + def nbytes(self) -> int: + if self._use_arrow: + return sum( + x.size + for chunk in self._arrow_array.chunks + for x in chunk.buffers() + if x is not None + ) + else: + return self._ndarray.nbytes + + @property + def shape(self): + if self._use_arrow: + return (self._arrow_array.length(),) + else: + return self._ndarray.shape + + def memory_usage(self, deep=True) -> int: + if self._use_arrow: + return self.nbytes + else: + return pd.Series(self._ndarray).memory_usage(index=False, deep=deep) + + @classmethod + def _to_arrow_array(cls, scalars): + return pa.array(scalars) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + if pa is None or cls._pandas_only(): + # pyarrow not installed, just return numpy + ret = np.empty(len(scalars), dtype=object) + ret[:] = scalars + return cls(ret) + + if pa_null is not None and isinstance(scalars, type(pa_null)): + scalars = [] + elif not hasattr(scalars, "dtype"): + ret = np.empty(len(scalars), dtype=object) + for i, s in enumerate(scalars): + ret[i] = s + scalars = ret + elif isinstance(scalars, cls): + if copy: + scalars = scalars.copy() + return scalars + arrow_array = pa.chunked_array([cls._to_arrow_array(scalars)]) + return cls(arrow_array, dtype=dtype, copy=copy) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @staticmethod + def _can_process_slice_via_arrow(slc): + if not isinstance(slc, slice): + return False + if slc.step is not None and slc.step != 1: + return False + if slc.start is not None and not isinstance( + slc.start, Integral + ): # pragma: no cover + return False + if slc.stop is not None and not isinstance( + slc.stop, Integral + ): # pragma: no cover + return False + return True + + 
def _values_for_factorize(self): + arr = self.to_numpy() + mask = self.isna() + arr[mask] = -1 + return arr, -1 + + def _values_for_argsort(self): + return self.to_numpy() + + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + + @staticmethod + def _process_pos(pos, length, is_start): + if pos is None: + return 0 if is_start else length + return pos + length if pos < 0 else pos + + @classmethod + def _post_scalar_getitem(cls, lst): + return lst.to_pandas()[0] + + def __getitem__(self, item): + cls = type(self) + + if pa is None or self._force_use_pandas: + # pyarrow not installed + result = self._ndarray[item] + if pd.api.types.is_scalar(item): + return result + else: + return type(self)(result) + + has_take = hasattr(self._arrow_array, "take") + if not self._force_use_pandas and has_take: + if pd.api.types.is_scalar(item): + item = item + len(self) if item < 0 else item + return self._post_scalar_getitem(self._arrow_array.take([item])) + elif self._can_process_slice_via_arrow(item): + length = len(self) + start, stop = item.start, item.stop + start = self._process_pos(start, length, True) + stop = self._process_pos(stop, length, False) + return cls( + self._arrow_array.slice(offset=start, length=stop - start), + dtype=self._dtype, + ) + elif hasattr(item, "dtype") and np.issubdtype(item.dtype, np.bool_): + return cls( + self._arrow_array.filter(pa.array(item, from_pandas=True)), + dtype=self._dtype, + ) + elif hasattr(item, "dtype"): + length = len(self) + item = np.where(item < 0, item + length, item) + return cls(self._arrow_array.take(item), dtype=self._dtype) + + array = np.asarray(self._arrow_array.to_pandas()) + return cls(array[item], dtype=self._dtype) + + @classmethod + def _concat_same_type(cls, to_concat: Sequence["ArrowArray"]) -> "ArrowArray": + if pa is None or cls._pandas_only(): + # pyarrow not installed + return cls(np.concatenate([x._array for x in to_concat])) + + chunks = list( + itertools.chain.from_iterable(x._arrow_array.chunks for x in to_concat) + ) + if len(chunks) == 0: + chunks = [pa.array([], type=to_concat[0].dtype.arrow_type)] + return cls(pa.chunked_array(chunks)) + + def __len__(self): + return len(self._array) + + def __array__(self, dtype=None): + return self.to_numpy(dtype=dtype) + + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + if self._use_arrow: + array = np.asarray(self._arrow_array.to_pandas()) + else: + array = self._ndarray + if copy or na_value is not lib.no_default: + array = array.copy() + if na_value is not lib.no_default: + array[self.isna()] = na_value + return array + + @classmethod + def _array_fillna(cls, array, value): + return array.fillna(value) + + def fillna(self, value=None, method=None, limit=None): + cls = type(self) + + if pa is None or self._force_use_pandas: + # pyarrow not installed + return cls( + pd.Series(self.to_numpy()).fillna( + value=value, method=method, limit=limit + ) + ) + + chunks = [] + for chunk_array in self._arrow_array.chunks: + array = chunk_array.to_pandas() + if method is None: + result_array = self._array_fillna(array, value) + else: + result_array = array.fillna(value=value, method=method, limit=limit) + chunks.append(pa.array(result_array, from_pandas=True)) + return cls(pa.chunked_array(chunks), dtype=self._dtype) + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if isinstance(dtype, ArrowStringDtype): + if copy: + return self.copy() + return self + + if pa is None or self._force_use_pandas: + # pyarrow not installed + if 
isinstance(dtype, ArrowDtype): + dtype = dtype.type + return type(self)(pd.Series(self.to_numpy()).astype(dtype, copy=copy)) + + # try to slice 1 record to get the result dtype + test_array = self._arrow_array.slice(0, 1).to_pandas() + test_result_array = test_array.astype(dtype).array + if _use_extension_index: + test_result_type = type(test_array.astype(dtype).values) + if test_result_type is np.ndarray: + test_result_type = np.array + else: + test_result_type = type(test_result_array) + + result_array = test_result_type( + np.full( + self.shape, + test_result_array.dtype.na_value, + dtype=np.asarray(test_result_array).dtype, + ) + ) + + start = 0 + # use chunks to do astype + for chunk_array in self._arrow_array.chunks: + result_array[start : start + len(chunk_array)] = ( + chunk_array.to_pandas().astype(dtype).array + ) + start += len(chunk_array) + return result_array + + def isna(self): + if ( + not self._force_use_pandas + and self._use_arrow + and hasattr(self._arrow_array, "is_null") + ): + return self._arrow_array.is_null().to_pandas().to_numpy() + elif self._use_arrow: + return pd.isna(self._arrow_array.to_pandas()).to_numpy() + else: + return pd.isna(self._ndarray) + + def take(self, indices, allow_fill=False, fill_value=None): + if ( + allow_fill is False or (allow_fill and fill_value is self.dtype.na_value) + ) and len(self) > 0: + return type(self)(self[indices], dtype=self._dtype) + + if self._use_arrow: + array = self._arrow_array.to_pandas().to_numpy() + else: + array = self._ndarray + + replace = False + if allow_fill and (fill_value is None or fill_value == self._dtype.na_value): + fill_value = self.dtype.na_value + replace = True + + result = take(array, indices, fill_value=fill_value, allow_fill=allow_fill) + del array + if replace and pa is not None: + # pyarrow cannot recognize pa.NULL + result[result == self.dtype.na_value] = None + return type(self)(result, dtype=self._dtype) + + def copy(self): + if self._use_arrow: + return type(self)(copy_obj(self._arrow_array)) + else: + return type(self)(self._ndarray.copy()) + + def unique(self): + if self._force_use_pandas or not self._use_arrow or not hasattr(pc, "unique"): + return type(self)(np.unique(self.to_numpy()), dtype=self._dtype) + return type(self)(pc.unique(self._arrow_array), dtype=self._dtype) + + def value_counts(self, dropna=False): + if self._use_arrow: + series = self._arrow_array.to_pandas() + else: + series = pd.Series(self._ndarray) + return type(self)(series.value_counts(dropna=dropna), dtype=self._dtype) + + if _use_bool_any_all: + + def any(self, axis=0, out=None): + return self.to_numpy().astype(bool).any(axis=axis, out=out) + + def all(self, axis=0, out=None): + return self.to_numpy().astype(bool).all(axis=axis, out=out) + + else: + + def any(self, axis=0, out=None): + return self.to_numpy().any(axis=axis, out=out) + + def all(self, axis=0, out=None): + return self.to_numpy().all(axis=axis, out=out) + + def __mars_tokenize__(self): + if self._use_arrow: + return tokenize( + [ + memoryview(x) + for chunk in self._arrow_array.chunks + for x in chunk.buffers() + if x is not None + ] + ) + else: + return self._ndarray + + +class ArrowStringArray(ArrowArray, StringArrayBase): + def __init__(self, values, dtype=None, copy=False): + if dtype is not None: + assert isinstance(dtype, ArrowStringDtype) + ArrowArray.__init__(self, values, ArrowStringDtype(), copy=copy) + + @classmethod + def from_scalars(cls, values): + if pa is None or cls._pandas_only(): + return cls._from_sequence(values) + else: + 
arrow_array = pa.chunked_array([cls._to_arrow_array(values)]) + return cls(arrow_array) + + @classmethod + def _to_arrow_array(cls, scalars): + return pa.array(scalars).cast(pa.string()) + + def __setitem__(self, key, value): + if isinstance(value, (pd.Index, pd.Series)): + value = value.to_numpy() + if isinstance(value, type(self)): + value = value.to_numpy() + + key = check_array_indexer(self, key) + scalar_key = is_scalar(key) + scalar_value = is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + # validate new items + if scalar_value: + if pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError( + f"Cannot set non-string value '{value}' into a ArrowStringArray." + ) + else: + if not is_array_like(value): + value = np.asarray(value, dtype=object) + if len(value) and not lib.is_string_array(value, skipna=True): + raise ValueError("Must provide strings.") + + if self._use_arrow: + string_array = np.asarray(self._arrow_array.to_pandas()) + string_array[key] = value + self._arrow_array = pa.chunked_array([pa.array(string_array)]) + else: + self._ndarray[key] = value + + # Override parent because we have different return types. + @classmethod + def _create_arithmetic_method(cls, op): + # Note: this handles both arithmetic and comparison methods. + def method(self, other): + is_arithmetic = True if op.__name__ in ops.ARITHMETIC_BINOPS else False + pandas_only = cls._pandas_only() + + is_other_array = False + if not is_scalar(other): + is_other_array = True + other = np.asarray(other) + + self_is_na = self.isna() + other_is_na = pd.isna(other) + mask = self_is_na | other_is_na + + if pa is None or pandas_only: + if is_arithmetic: + ret = np.empty(self.shape, dtype=object) + else: + ret = np.zeros(self.shape, dtype=bool) + valid = ~mask + arr = ( + self._arrow_array.to_pandas().to_numpy() + if self._use_arrow + else self._ndarray + ) + o = other[valid] if is_other_array else other + ret[valid] = op(arr[valid], o) + if is_arithmetic: + return ArrowStringArray(ret) + else: + return pd.arrays.BooleanArray(ret, mask) + + chunks = [] + mask_chunks = [] + start = 0 + for chunk_array in self._arrow_array.chunks: + chunk_array = np.asarray(chunk_array.to_pandas()) + end = start + len(chunk_array) + chunk_mask = mask[start:end] + chunk_valid = ~chunk_mask + + if is_arithmetic: + result = np.empty(chunk_array.shape, dtype=object) + else: + result = np.zeros(chunk_array.shape, dtype=bool) + + chunk_other = other + if is_other_array: + chunk_other = other[start:end] + chunk_other = chunk_other[chunk_valid] + + # calculate only for both not None + result[chunk_valid] = op(chunk_array[chunk_valid], chunk_other) + + if is_arithmetic: + chunks.append(pa.array(result, type=pa.string(), from_pandas=True)) + else: + chunks.append(result) + mask_chunks.append(chunk_mask) + + if is_arithmetic: + return ArrowStringArray(pa.chunked_array(chunks)) + else: + return pd.arrays.BooleanArray( + np.concatenate(chunks), np.concatenate(mask_chunks) + ) + + return set_function_name(method, f"__{op.__name__}__", cls) + + def shift(self, periods: int = 1, fill_value: object = None) -> "ArrowStringArray": + return ExtensionArray.shift(self, periods=periods, fill_value=fill_value) + + @classmethod + def _add_arithmetic_ops(cls): + cls.__add__ = cls._create_arithmetic_method(operator.add) + cls.__radd__ = cls._create_arithmetic_method(ops.radd) + + cls.__mul__ = cls._create_arithmetic_method(operator.mul) + cls.__rmul__ = 
cls._create_arithmetic_method(ops.rmul) + + @classmethod + def _add_comparison_ops(cls): + cls.__eq__ = cls._create_comparison_method(operator.eq) + cls.__ne__ = cls._create_comparison_method(operator.ne) + cls.__lt__ = cls._create_comparison_method(operator.lt) + cls.__gt__ = cls._create_comparison_method(operator.gt) + cls.__le__ = cls._create_comparison_method(operator.le) + cls.__ge__ = cls._create_comparison_method(operator.ge) + + _create_comparison_method = _create_arithmetic_method + + +ArrowStringArray._add_arithmetic_ops() +ArrowStringArray._add_comparison_ops() + + +class ArrowListArray(ArrowArray): + def __init__(self, values, dtype: ArrowListDtype = None, copy=False): + if dtype is None: + if isinstance(values, type(self)): + dtype = values.dtype + elif pa is not None: + if isinstance(values, pa.Array): + dtype = ArrowListDtype(values.type.value_type) + elif isinstance(values, pa.ChunkedArray): + dtype = ArrowListDtype(values.type.value_type) + else: + values = pa.array(values) + if values.type == pa.null(): + dtype = ArrowListDtype(pa.string()) + else: + dtype = ArrowListDtype(values.type.value_type) + else: + value_type = np.asarray(values[0]).dtype + dtype = ArrowListDtype(value_type) + + super().__init__(values, dtype=dtype, copy=copy) + + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + if self._use_arrow: + s = self._arrow_array.to_pandas() + else: + s = pd.Series(self._ndarray) + s = s.map(lambda x: x.tolist() if hasattr(x, "tolist") else x) + if copy or na_value is not lib.no_default: + s = s.copy() + if na_value is not lib.no_default: + s[self.isna()] = na_value + return np.asarray(s) + + @classmethod + def _post_scalar_getitem(cls, lst): + return lst[0].as_py() + + def __setitem__(self, key, value): + if isinstance(value, (pd.Index, pd.Series)): + value = value.to_numpy() + + key = check_array_indexer(self, key) + scalar_key = is_scalar(key) + + # validate new items + if scalar_key: + if pd.isna(value): + value = None + elif not is_list_like(value): + raise ValueError("Must provide list.") + + if self._use_arrow: + array = np.asarray(self._arrow_array.to_pandas()) + array[key] = value + self._arrow_array = pa.chunked_array( + [pa.array(array, type=self.dtype.arrow_type)] + ) + else: + self._ndarray[key] = value + + @classmethod + def _array_fillna(cls, series, value): + # cannot fillna directly, because value is a list-like object + return series.apply(lambda x: x if is_list_like(x) or not pd.isna(x) else value) + + def astype(self, dtype, copy=True): + msg = f"cannot astype from {self.dtype} to {dtype}" + dtype = pandas_dtype(dtype) + if isinstance(dtype, ArrowListDtype): + if self.dtype == dtype: + if copy: + return self.copy() + return self + else: + if self._use_arrow: + try: + arrow_array = self._arrow_array.cast(dtype.arrow_type) + return ArrowListArray(arrow_array) + except (NotImplementedError, pa.ArrowInvalid): + raise TypeError(msg) + else: + + def f(x): + return pd.Series(x).astype(dtype.value_type.type).tolist() + + try: + arr = pd.Series(self._ndarray) + ret = arr.map(f).to_numpy() + return ArrowStringArray(ret) + except ValueError: + raise TypeError(msg) + + try: + return super().astype(dtype, copy=copy) + except ValueError: + raise TypeError(msg) diff --git a/python/xorbits/_mars/dataframe/base/__init__.py b/python/xorbits/_mars/dataframe/base/__init__.py new file mode 100644 index 000000000..fb7119733 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/__init__.py @@ -0,0 +1,151 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .apply import df_apply, series_apply +from .astype import astype, index_astype +from .cartesian_chunk import cartesian_chunk +from .check_monotonic import ( + check_monotonic, + is_monotonic, + is_monotonic_decreasing, + is_monotonic_increasing, +) +from .cut import cut +from .describe import describe +from .diff import df_diff, series_diff +from .drop import df_drop, df_pop, index_drop, series_drop +from .drop_duplicates import ( + df_drop_duplicates, + index_drop_duplicates, + series_drop_duplicates, +) +from .duplicated import df_duplicated, index_duplicated, series_duplicated +from .eval import df_eval, df_query +from .explode import df_explode, series_explode +from .isin import df_isin, series_isin +from .map import index_map, series_map +from .map_chunk import map_chunk +from .melt import melt +from .memory_usage import df_memory_usage, index_memory_usage, series_memory_usage +from .pct_change import pct_change +from .qcut import qcut +from .rebalance import rebalance +from .rechunk import rechunk +from .select_dtypes import select_dtypes +from .shift import shift, tshift +from .stack import stack +from .to_cpu import to_cpu +from .to_gpu import to_gpu +from .transform import df_transform, series_transform +from .transpose import transpose +from .value_counts import value_counts + + +def _install(): + from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE + from .accessor import CachedAccessor, DatetimeAccessor, StringAccessor + from .datetimes import _datetime_method_to_handlers + from .standardize_range_index import ChunkStandardizeRangeIndex + from .string_ import _string_method_to_handlers + + for t in DATAFRAME_TYPE: + setattr(t, "to_gpu", to_gpu) + setattr(t, "to_cpu", to_cpu) + setattr(t, "rechunk", rechunk) + setattr(t, "describe", describe) + setattr(t, "apply", df_apply) + setattr(t, "transform", df_transform) + setattr(t, "isin", df_isin) + setattr(t, "shift", shift) + setattr(t, "tshift", tshift) + setattr(t, "diff", df_diff) + setattr(t, "astype", astype) + setattr(t, "drop", df_drop) + setattr(t, "pop", df_pop) + setattr( + t, "__delitem__", lambda df, items: df_drop(df, items, axis=1, inplace=True) + ) + setattr(t, "drop_duplicates", df_drop_duplicates) + setattr(t, "duplicated", df_duplicated) + setattr(t, "melt", melt) + setattr(t, "memory_usage", df_memory_usage) + setattr(t, "select_dtypes", select_dtypes) + setattr(t, "map_chunk", map_chunk) + setattr(t, "cartesian_chunk", cartesian_chunk) + setattr(t, "rebalance", rebalance) + setattr(t, "stack", stack) + setattr(t, "explode", df_explode) + setattr(t, "eval", df_eval) + setattr(t, "query", df_query) + setattr(t, "pct_change", pct_change) + setattr(t, "transpose", transpose) + + for t in SERIES_TYPE: + setattr(t, "to_gpu", to_gpu) + setattr(t, "to_cpu", to_cpu) + setattr(t, "rechunk", rechunk) + setattr(t, "map", series_map) + setattr(t, "describe", describe) + setattr(t, "apply", series_apply) + setattr(t, "transform", series_transform) + setattr(t, "isin", 
series_isin) + setattr(t, "shift", shift) + setattr(t, "tshift", tshift) + setattr(t, "diff", series_diff) + setattr(t, "value_counts", value_counts) + setattr(t, "astype", astype) + setattr(t, "drop", series_drop) + setattr(t, "drop_duplicates", series_drop_duplicates) + setattr(t, "duplicated", series_duplicated) + setattr(t, "memory_usage", series_memory_usage) + setattr(t, "map_chunk", map_chunk) + setattr(t, "cartesian_chunk", cartesian_chunk) + setattr(t, "rebalance", rebalance) + setattr(t, "explode", series_explode) + setattr(t, "check_monotonic", check_monotonic) + setattr(t, "is_monotonic", property(fget=is_monotonic)) + setattr(t, "is_monotonic_increasing", property(fget=is_monotonic_increasing)) + setattr(t, "is_monotonic_decreasing", property(fget=is_monotonic_decreasing)) + setattr(t, "pct_change", pct_change) + + for t in INDEX_TYPE: + setattr(t, "map", index_map) + setattr(t, "rechunk", rechunk) + setattr(t, "rebalance", rebalance) + setattr(t, "drop", index_drop) + setattr(t, "drop_duplicates", index_drop_duplicates) + setattr(t, "duplicated", index_duplicated) + setattr(t, "memory_usage", index_memory_usage) + setattr(t, "astype", index_astype) + setattr(t, "value_counts", value_counts) + setattr(t, "check_monotonic", check_monotonic) + setattr(t, "is_monotonic", property(fget=is_monotonic)) + setattr(t, "is_monotonic_increasing", property(fget=is_monotonic_increasing)) + setattr(t, "is_monotonic_decreasing", property(fget=is_monotonic_decreasing)) + + for method in _string_method_to_handlers: + if not hasattr(StringAccessor, method): + StringAccessor._register(method) + + for method in _datetime_method_to_handlers: + if not hasattr(DatetimeAccessor, method): + DatetimeAccessor._register(method) + + for series in SERIES_TYPE: + series.str = CachedAccessor("str", StringAccessor) + series.dt = CachedAccessor("dt", DatetimeAccessor) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/base/_duplicate.py b/python/xorbits/_mars/dataframe/base/_duplicate.py new file mode 100644 index 000000000..32869ddf3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/_duplicate.py @@ -0,0 +1,412 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
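+
+# ``DuplicateOperand`` defined below is the shared map-reduce base for the
+# ``drop_duplicates``/``duplicated`` operands. Its ``tile`` method picks one of
+# three strategies: ``tree`` (combine chunks through a reduction tree),
+# ``subset_tree`` (deduplicate a small projection of the ``subset`` columns
+# first, then map the surviving row indexes back onto the full chunks) and
+# ``shuffle`` (hash-shuffle rows across reducers, then put the survivors back
+# on their original chunks). ``auto`` chooses ``subset_tree`` when the subset
+# columns are estimated to fit within ``options.chunk_store_limit`` and falls
+# back to ``tree`` otherwise.
+#
+# A minimal usage sketch, assuming the ``method`` keyword exposed by the
+# ``drop_duplicates`` wrapper built on top of this operand:
+#
+#   >>> import mars.dataframe as md
+#   >>> df = md.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
+#   >>> df.drop_duplicates(subset=['a'], method='auto').execute()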
+ +import inspect + +import numpy as np +import pandas as pd +from pandas.api.types import is_list_like + +from ...config import options +from ...core import OutputType, recursive_tile +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import AnyField, Int32Field, KeyField, StringField +from ...utils import ceildiv, has_unknown_shape, lazy_import +from ..initializer import DataFrame as asdataframe +from ..operands import DataFrameOperandMixin, DataFrameShuffleProxy + +cudf = lazy_import("cudf") + + +class DuplicateOperand(MapReduceOperand, DataFrameOperandMixin): + _input = KeyField("input") + _subset = AnyField("subset") + _keep = AnyField("keep") + _method = StringField("method") + + # subset chunk, used for method 'subset_tree' + _subset_chunk = KeyField("subset_chunk") + # shuffle phase, used in shuffle method + _shuffle_size = Int32Field("shuffle_size") + + @property + def input(self): + return self._input + + @property + def subset(self): + return self._subset + + @property + def keep(self): + return self._keep + + @property + def method(self): + return self._method + + @property + def subset_chunk(self): + return self._subset_chunk + + @property + def shuffle_size(self): + return self._shuffle_size + + @classmethod + def _get_shape(cls, input_shape, op): # pragma: no cover + raise NotImplementedError + + @classmethod + def _gen_tileable_params( + cls, op: "DuplicateOperand", input_params + ): # pragma: no cover + raise NotImplementedError + + @classmethod + def _gen_chunk_params(cls, op: "DuplicateOperand", input_chunk): # pragma: no cover + raise NotImplementedError + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if self._subset_chunk is not None: + self._subset_chunk = self._inputs[1] + + @classmethod + def _tile_one_chunk(cls, op: "DuplicateOperand"): + inp = op.input + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_op._method = None + in_chunk = inp.chunks[0] + chunk_params = cls._gen_chunk_params(chunk_op, in_chunk) + chunk = chunk_op.new_chunk([in_chunk], kws=[chunk_params]) + + params = out.params + params["chunks"] = [chunk] + params["nsplits"] = tuple((s,) for s in chunk.shape) + new_op = op.copy() + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def _get_map_output_types(cls, input_chunk, method: str): + raise NotImplementedError + + @classmethod + def _gen_map_chunks(cls, op: "DuplicateOperand", inp, method, **kw): + chunks = inp.chunks + map_chunks = [] + for c in chunks: + chunk_op = op.copy().reset_key() + chunk_op._output_types = cls._get_map_output_types(c, method) + chunk_op._method = method + chunk_op.stage = OperandStage.map + for k, v in kw.items(): + setattr(chunk_op, k, v) + chunk_params = cls._gen_chunk_params(chunk_op, c) + map_chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + return map_chunks + + @classmethod + def _gen_tree_chunks(cls, op: "DuplicateOperand", inp, method): + from ..merge import DataFrameConcat + + out = op.outputs[0] + combine_size = options.combine_size + new_chunks = cls._gen_map_chunks(op, inp, method) + while len(new_chunks) > 1: + out_chunk_size = ceildiv(len(new_chunks), combine_size) + out_chunks = [] + for i in range(out_chunk_size): + in_chunks = new_chunks[i * combine_size : (i + 1) * combine_size] + s = sum(c.shape[0] for c in in_chunks) + if in_chunks[0].ndim == 2: + kw = dict( + dtypes=in_chunks[0].dtypes, + index_value=in_chunks[0].index_value, + 
columns_value=in_chunks[0].columns_value, + shape=(s, in_chunks[0].shape[1]), + index=(i, 0), + ) + else: + kw = dict( + dtype=in_chunks[0].dtype, + index_value=in_chunks[0].index_value, + name=in_chunks[0].name, + shape=(s,), + index=(i,), + ) + concat_chunk = DataFrameConcat( + output_types=in_chunks[0].op.output_types + ).new_chunk(in_chunks, **kw) + chunk_op = op.copy().reset_key() + chunk_op._method = method + chunk_op.stage = ( + OperandStage.combine if out_chunk_size > 1 else OperandStage.agg + ) + if out_chunk_size > 1 and method == "tree": + # for tree, chunks except last one should be dataframes, + chunk_op._output_types = ( + concat_chunk.op.output_types + if out_chunk_size > 1 + else out.op.output_types + ) + elif method == "subset_tree": + # `subset_tree` will tile chunks that are always dataframes + chunk_op._output_types = [OutputType.dataframe] + params = cls._gen_chunk_params(chunk_op, concat_chunk) + if out.ndim == 1 and out_chunk_size == 1: + params["name"] = out.name + out_chunks.append(chunk_op.new_chunk([concat_chunk], kws=[params])) + new_chunks = out_chunks + + return new_chunks + + @classmethod + def _tile_tree(cls, op: "DuplicateOperand", inp): + out = op.outputs[0] + + params = out.params + params["chunks"] = chunks = cls._gen_tree_chunks(op, inp, "tree") + params["nsplits"] = tuple((s,) for s in chunks[0].shape) + new_op = op.copy() + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def _tile_subset_tree(cls, op: "DuplicateOperand", inp): + # subset is available for DataFrame only + inp = asdataframe(inp) + out = op.outputs[0] + subset = op.subset + if subset is None: + subset = inp.dtypes.index.tolist() + # select subset first + subset_df = yield from recursive_tile(inp[subset]) + # tree aggregate subset + subset_chunk = cls._gen_tree_chunks(op, subset_df, "subset_tree")[0] + + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._method = "subset_tree" + chunk_op._subset_chunk = subset_chunk + chunk_params = cls._gen_chunk_params(chunk_op, c) + out_chunks.append(chunk_op.new_chunk([c, subset_chunk], kws=[chunk_params])) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + splits = tuple(c.shape[0] for c in out_chunks) + if out.ndim == 2: + params["nsplits"] = (splits, inp.nsplits[1]) + else: + params["nsplits"] = (splits,) + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def _tile_shuffle(cls, op: "DuplicateOperand", inp): + out = op.outputs[0] + + map_chunks = cls._gen_map_chunks( + op, inp, "shuffle", _shuffle_size=inp.chunk_shape[0] + ) + proxy_chunk = DataFrameShuffleProxy( + output_types=map_chunks[0].op.output_types + ).new_chunk(map_chunks, shape=()) + reduce_chunks = [] + for i in range(len(map_chunks)): + reduce_op = op.copy().reset_key() + reduce_op._method = "shuffle" + reduce_op.stage = OperandStage.reduce + reduce_op.reducer_phase = "drop_duplicates" + reduce_op.n_reducers = len(map_chunks) + reduce_op.reducer_ordinal = i + reduce_op._shuffle_size = inp.chunk_shape[0] + reduce_op._output_types = op.output_types + reduce_chunk_params = map_chunks[0].params + reduce_chunk_params["index"] = (i,) + reduce_chunk_params["index"][1:] + reduce_chunk_params["is_mapper"] = True + reduce_chunks.append( + reduce_op.new_chunk([proxy_chunk], kws=[reduce_chunk_params]) + ) + + put_back_proxy_chunk = DataFrameShuffleProxy( + output_types=map_chunks[0].op.output_types + ).new_chunk(reduce_chunks, shape=()) + put_back_chunks = [] + for i in range(len(map_chunks)): + 
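+ # put-back pass: a second shuffle redistributes the deduplicated rows so
+ # that output chunk ``i`` corresponds to input chunk ``i`` again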
put_back_op = op.copy().reset_key() + put_back_op._method = "shuffle" + put_back_op.stage = OperandStage.reduce + put_back_op.reducer_phase = "put_back" + put_back_op.reducer_index = (i,) + put_back_op.n_reducers = len(map_chunks) + put_back_op.reducer_ordinal = i + if out.ndim == 2: + put_back_chunk_params = map_chunks[i].params + else: + put_back_chunk_params = out.params.copy() + map_chunk_params = map_chunks[i].params + put_back_chunk_params["index_value"] = map_chunk_params["index_value"] + put_back_chunk_params["index"] = map_chunk_params["index"][:1] + if out.ndim == 1: + put_back_chunk_params["index"] = (i,) + else: + put_back_chunk_params["index"] = (i,) + put_back_chunk_params["index"][ + 1: + ] + put_back_chunk_params["shape"] = cls._get_shape( + map_chunks[i].op.input.shape, op + ) + put_back_chunks.append( + put_back_op.new_chunk( + [put_back_proxy_chunk], kws=[put_back_chunk_params] + ) + ) + + new_op = op.copy() + params = out.params + params["chunks"] = put_back_chunks + split = tuple(c.shape[0] for c in put_back_chunks) + if out.ndim == 2: + params["nsplits"] = (split, inp.nsplits[1]) + else: + params["nsplits"] = (split,) + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def tile(cls, op: "DuplicateOperand"): + inp = op.input + + if len(inp.chunks) == 1: + # one chunk + return cls._tile_one_chunk(op) + + if inp.ndim == 2 and inp.chunk_shape[1] > 1: + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + + default_tile = cls._tile_tree + + if op.method == "auto": + # if method == 'auto', pick appropriate method + if np.isnan(inp.shape[0]) or op.subset is None: + # if any unknown shape exist, + # choose merge method + return default_tile(op, inp) + + # check subset data to see if it's small enough + subset_dtypes = inp.dtypes[op.subset] + memory_usage = 0.0 + for s_dtype in subset_dtypes: + if s_dtype.kind == "O" or not hasattr(s_dtype, "itemsize"): + # object, just use default tile + return default_tile(op, inp) + else: + memory_usage += s_dtype.itemsize * inp.shape[0] + if memory_usage <= options.chunk_store_limit: + # if subset is small enough, use method 'subset_tree' + r = yield from cls._tile_subset_tree(op, inp) + return r + else: + return default_tile(op, inp) + elif op.method == "subset_tree": + r = yield from cls._tile_subset_tree(op, inp) + return r + elif op.method == "tree": + return cls._tile_tree(op, inp) + else: + assert op.method == "shuffle" + ret = cls._tile_shuffle(op, inp) + if inspect.isgenerator(ret): + return (yield from ret) + else: + return ret + + @classmethod + def _drop_duplicates(cls, inp, op, subset=None, keep=None, ignore_index=None): + if ignore_index is None: + ignore_index = op.ignore_index + if subset is None: + subset = op.subset + if keep is None: + keep = op.keep + if inp.ndim == 2: + try: + return inp.drop_duplicates( + subset=subset, keep=keep, ignore_index=ignore_index + ) + except TypeError: + # no ignore_index for pandas < 1.0 + ret = inp.drop_duplicates(subset=subset, keep=keep) + if ignore_index: + ret.reset_index(drop=True, inplace=True) + return ret + else: + return inp.drop_duplicates(keep=keep) + + @classmethod + def _get_xdf(cls, x): + if cudf is None: + return pd + elif isinstance(x, (pd.Index, pd.Series, pd.DataFrame)): # pragma: no cover + return pd + else: # pragma: no cover + return cudf + + @classmethod + def _execute_subset_tree_map(cls, ctx, op): + out = op.outputs[0] + idx = out.index[0] + inp = ctx[op.input.key] + xdf = cls._get_xdf(inp) + + # 
index would be (chunk_index, i) + index = xdf.MultiIndex.from_arrays( + [np.full(inp.shape[0], idx), np.arange(inp.shape[0])], + names=["_chunk_index_", "_i_"], + ) + inp = inp.set_index(index) + ctx[out.key] = cls._drop_duplicates(inp, op, ignore_index=False) + + @classmethod + def _execute_subset_tree_combine(cls, ctx, op): + inp = ctx[op.input.key] + ctx[op.outputs[0].key] = cls._drop_duplicates(inp, op, ignore_index=False) + + @classmethod + def _execute_subset_tree_agg(cls, ctx, op): + inp = ctx[op.input.key] + ret = cls._drop_duplicates(inp, op, ignore_index=False) + ret = ret.index.to_frame() + ret.reset_index(drop=True, inplace=True) + ctx[op.outputs[0].key] = ret + + +def validate_subset(df, subset): + if subset is None: + return subset + if not is_list_like(subset): + subset = [subset] + else: + subset = list(subset) + + for s in subset: + if s not in df.dtypes: + raise KeyError(pd.Index([s])) + + return subset diff --git a/python/xorbits/_mars/dataframe/base/accessor.py b/python/xorbits/_mars/dataframe/base/accessor.py new file mode 100644 index 000000000..fb9b88c39 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/accessor.py @@ -0,0 +1,276 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import wraps +from typing import Iterable + +import pandas as pd +from pandas.api.types import ( + is_datetime64_dtype, + is_datetime64tz_dtype, + is_period_dtype, + is_timedelta64_dtype, +) + +from ...utils import adapt_mars_docstring +from .datetimes import SeriesDatetimeMethod, _datetime_method_to_handlers +from .string_ import SeriesStringMethod, _string_method_to_handlers + + +class StringAccessor: + """ + Vectorized string functions for Series and Index. + NAs stay NA unless handled otherwise by a particular method. + Patterned after Python's string methods, with some inspiration from + R's stringr package. + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(["A_Str_Series"]) + >>> s.execute() + 0 A_Str_Series + dtype: object + >>> s.str.split("_").execute() + 0 [A, Str, Series] + dtype: object + >>> s.str.replace("_", "").execute() + 0 AStrSeries + dtype: object + """ + + def __init__(self, series): + self._series = series + + @classmethod + def _gen_func(cls, method): + @wraps(getattr(pd.Series.str, method)) + def _inner(self, *args, **kwargs): + op = SeriesStringMethod( + method=method, method_args=args, method_kwargs=kwargs + ) + return op(self._series) + + _inner.__doc__ = adapt_mars_docstring(getattr(pd.Series.str, method).__doc__) + return _inner + + def __getitem__(self, item): + return self._gen_func("__getitem__")(self, item) + + def __dir__(self) -> Iterable[str]: + s = set(super().__dir__()) + s.update(_string_method_to_handlers.keys()) + return list(s) + + @classmethod + def _register(cls, method): + setattr(cls, method, cls._gen_func(method)) + + def split(self, pat=None, n=-1, expand=False): + r""" + Split strings around given separator/delimiter. 
+ + Splits the string in the Series/Index from the beginning, + at the specified delimiter string. Equivalent to :meth:`str.split`. + + Parameters + ---------- + pat : str, optional + String or regular expression to split on. + If not specified, split on whitespace. + n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the splitted strings into separate columns. + + * If ``True``, return DataFrame/MultiIndex expanding dimensionality. + * If ``False``, return Series/Index, containing lists of strings. + + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + + See Also + -------- + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, + starting from the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> s = md.Series(["this is a regular sentence", + >>> "https://docs.python.org/3/tutorial/index.html", + >>> np.nan]) + >>> s.execute() + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html + 2 NaN + dtype: object + + In the default setting, the string is split by whitespace. + + >>> s.str.split().execute() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. + + >>> s.str.rsplit().execute() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2).execute() + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + >>> s.str.rsplit(n=2).execute() + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat = "/").execute() + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN + dtype: object + + When using ``expand=True``, the split elements will expand out into + separate columns. If NaN is present, it is propagated throughout + the columns during the split. + + >>> s.str.split(expand=True).execute() + 0 1 2 3 + 0 this is a regular + 1 https://docs.python.org/3/tutorial/index.html None None None + 2 NaN NaN NaN NaN \ + 4 + 0 sentence + 1 None + 2 NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. 
+ + >>> s.str.rsplit("/", n=1, expand=True).execute() + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN + + Remember to escape special characters when explicitly using regular + expressions. + + >>> s = pd.Series(["1+1=2"]) + >>> s.str.split(r"\+|=", expand=True).execute() + 0 1 2 + 0 1 1 2 + """ + return self._gen_func("split")(self, pat=pat, n=n, expand=expand) + + def rsplit(self, pat=None, n=-1, expand=False): + return self._gen_func("rsplit")(self, pat=pat, n=n, expand=expand) + + def cat(self, others=None, sep=None, na_rep=None, join="left"): + return self._gen_func("cat")( + self, others=others, sep=sep, na_rep=na_rep, join=join + ) + + rsplit.__doc__ = adapt_mars_docstring(pd.Series.str.rsplit.__doc__) + cat.__doc__ = adapt_mars_docstring(pd.Series.str.cat.__doc__) + + +class DatetimeAccessor: + def __init__(self, series): + if ( + not is_datetime64_dtype(series.dtype) + and not is_datetime64tz_dtype(series.dtype) + and not is_timedelta64_dtype(series.dtype) + and not is_period_dtype(series.dtype) + ): + raise AttributeError("Can only use .dt accessor with datetimelike values") + self._series = series + + @classmethod + def _gen_func(cls, method, is_property): + @wraps(getattr(pd.Series.dt, method)) + def _inner(self, *args, **kwargs): + op = SeriesDatetimeMethod( + method=method, + is_property=is_property, + method_args=args, + method_kwargs=kwargs, + ) + return op(self._series) + + _inner.__doc__ = adapt_mars_docstring(getattr(pd.Series.dt, method).__doc__) + return _inner + + @classmethod + def _register(cls, method): + is_property = not callable(getattr(pd.Series.dt, method)) + func = cls._gen_func(method, is_property) + if is_property: + func = property(func) + setattr(cls, method, func) + + def __dir__(self) -> Iterable[str]: + s = set(super().__dir__()) + s.update(_datetime_method_to_handlers.keys()) + return list(s) + + +class CachedAccessor: + def __init__(self, name: str, accessor) -> None: + self._name = name + self._accessor = accessor + + def __get__(self, obj, cls): + if obj is None: + # we're accessing the attribute of the class, i.e., Dataset.geo + return self._accessor + if self._name not in obj._accessors: + obj._accessors[self._name] = self._accessor(obj) + return obj._accessors[self._name] diff --git a/python/xorbits/_mars/dataframe/base/apply.py b/python/xorbits/_mars/dataframe/base/apply.py new file mode 100644 index 000000000..d8db860f7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/apply.py @@ -0,0 +1,942 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core import OutputType, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FunctionField, + StringField, + TupleField, +) +from ...utils import enter_current_session, get_func_token, quiet_stdio, tokenize +from ..arrays import ArrowArray +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_df, + build_empty_df, + build_empty_series, + build_series, + clean_up_func, + make_dtype, + make_dtypes, + parse_index, + restore_func, + validate_axis, + validate_output_types, +) + + +class ApplyOperandLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + token_values = super()._get_logic_key_token_values() + [ + self._axis, + self._convert_dtype, + self._raw, + self._result_type, + self._elementwise, + ] + if self.func: + return token_values + [get_func_token(self.func)] + else: # pragma: no cover + return token_values + + +class ApplyOperand( + DataFrameOperand, DataFrameOperandMixin, ApplyOperandLogicKeyGeneratorMixin +): + _op_type_ = opcodes.APPLY + + _func = FunctionField("func") + _axis = AnyField("axis") + _convert_dtype = BoolField("convert_dtype") + _raw = BoolField("raw") + _result_type = StringField("result_type") + _elementwise = BoolField("elementwise") + _logic_key = StringField("logic_key") + _func_key = AnyField("func_key") + _need_clean_up_func = BoolField("need_clean_up_func") + _args = TupleField("args") + _kwds = DictField("kwds") + + def __init__( + self, + func=None, + axis=None, + convert_dtype=None, + raw=None, + result_type=None, + args=None, + kwds=None, + output_type=None, + elementwise=None, + logic_key=None, + func_key=None, + need_clean_up_func=False, + **kw, + ): + if output_type: + kw["_output_types"] = [output_type] + super().__init__( + _func=func, + _axis=axis, + _convert_dtype=convert_dtype, + _raw=raw, + _result_type=result_type, + _args=args, + _kwds=kwds, + _elementwise=elementwise, + _logic_key=logic_key, + _func_key=func_key, + _need_clean_up_func=need_clean_up_func, + **kw, + ) + + def _update_key(self): + values = [v for v in self._values_ if v is not self.func] + [ + get_func_token(self.func) + ] + self._obj_set("_key", tokenize(type(self).__name__, *values)) + return self + + @property + def func(self): + return self._func + + @func.setter + def func(self, func): + self._func = func + + @property + def axis(self): + return self._axis + + @property + def convert_dtype(self): + return self._convert_dtype + + @property + def raw(self): + return self._raw + + @property + def result_type(self): + return self._result_type + + @property + def elementwise(self): + return self._elementwise + + @property + def logic_key(self): + return self._logic_key + + @logic_key.setter + def logic_key(self, logic_key): + self._logic_key = logic_key + + @property + def func_key(self): + return self._func_key + + @func_key.setter + def func_key(self, func_key): + self._func_key = func_key + + @property + def need_clean_up_func(self): + return self._need_clean_up_func + + @need_clean_up_func.setter + def need_clean_up_func(self, need_clean_up_func: bool): + self._need_clean_up_func = need_clean_up_func + + @property + def args(self): + return getattr(self, "_args", None) or () + + @property + def kwds(self): + return getattr(self, "_kwds", None) or dict() + + @classmethod + 
@redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + restore_func(ctx, op) + input_data = ctx[op.inputs[0].key] + out = op.outputs[0] + if len(input_data) == 0: + if op.output_types[0] == OutputType.dataframe: + ctx[out.key] = build_empty_df(out.dtypes) + else: + ctx[out.key] = build_empty_series(out.dtype, name=out.name) + return + + if isinstance(input_data, pd.DataFrame): + result = input_data.apply( + op.func, + axis=op.axis, + raw=op.raw, + result_type=op.result_type, + args=op.args, + **op.kwds, + ) + else: + try: + result = input_data.apply( + op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds + ) + except TypeError: + if isinstance(input_data.values, ArrowArray): + input_data = pd.Series( + input_data.to_numpy(), + name=input_data.name, + index=input_data.index, + ) + result = input_data.apply( + op.func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds + ) + else: # pragma: no cover + raise + ctx[out.key] = result + + @classmethod + def _tile_df(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + axis = op.axis + elementwise = op.elementwise + + if not elementwise and in_df.chunk_shape[axis] > 1: + chunk_size = ( + in_df.shape[axis], + max(1, options.chunk_store_limit // in_df.shape[axis]), + ) + if axis == 1: + chunk_size = chunk_size[::-1] + in_df = yield from recursive_tile(in_df.rechunk(chunk_size)) + + chunks = [] + if op.output_types and op.output_types[0] == OutputType.df_or_series: + for c in in_df.chunks: + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + chunks.append(new_op.new_chunk([c], collapse_axis=axis, index=c.index)) + new_nsplits = None + elif out_df.ndim == 2: + for c in in_df.chunks: + if elementwise: + new_shape = c.shape + new_index_value, new_columns_value = c.index_value, c.columns_value + else: + new_shape = [np.nan, np.nan] + new_shape[1 - axis] = c.shape[1 - axis] + if axis == 0: + new_index_value = out_df.index_value + new_columns_value = c.columns_value + else: + new_index_value = c.index_value + new_columns_value = out_df.columns_value + + if op.axis == 0: + new_dtypes = out_df.dtypes[c.dtypes.keys()] + else: + new_dtypes = out_df.dtypes + + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + chunks.append( + new_op.new_chunk( + [c], + shape=tuple(new_shape), + index=c.index, + dtypes=new_dtypes, + index_value=new_index_value, + columns_value=new_columns_value, + ) + ) + + new_nsplits = list(in_df.nsplits) + if not elementwise: + new_nsplits[axis] = (np.nan,) * len(new_nsplits[axis]) + else: + for c in in_df.chunks: + shape_len = c.shape[1 - axis] + new_index_value = c.index_value if axis == 1 else c.columns_value + new_index = (c.index[1 - axis],) + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + chunks.append( + new_op.new_chunk( + [c], + shape=(shape_len,), + index=new_index, + dtype=out_df.dtype, + index_value=new_index_value, + ) + ) + new_nsplits = (in_df.nsplits[1 - axis],) + + new_op = op.copy() + kw = out_df.params.copy() + if isinstance(new_nsplits, list): + new_nsplits = tuple(new_nsplits) + kw.update(dict(chunks=chunks, nsplits=new_nsplits)) + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def _tile_series(cls, op): + in_series = op.inputs[0] + out_series = op.outputs[0] + output_type = op.output_types[0] if op.output_types else None + + chunks = [] + for c in in_series.chunks: + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + if output_type == OutputType.df_or_series: + 
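+ # dtype/shape were not inferred (``df_or_series`` output), so emit a
+ # chunk without metadata and resolve DataFrame vs. Series at execution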
chunks.append(new_op.new_chunk([c], collapse_axis=None, index=c.index)) + continue + kw = c.params.copy() + if out_series.ndim == 1: + kw["dtype"] = out_series.dtype + else: + kw["index"] = (c.index[0], 0) + kw["shape"] = (c.shape[0], out_series.shape[1]) + kw["dtypes"] = out_series.dtypes + kw["columns_value"] = out_series.columns_value + chunks.append(new_op.new_chunk([c], **kw)) + + new_op = op.copy() + kw = out_series.params.copy() + if output_type == OutputType.df_or_series: + kw.update(dict(chunks=chunks, nsplits=None)) + else: + kw.update(dict(chunks=chunks, nsplits=in_series.nsplits)) + if output_type != OutputType.df_or_series and out_series.ndim == 2: + kw["nsplits"] = (in_series.nsplits[0], (out_series.shape[1],)) + kw["columns_value"] = out_series.columns_value + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def tile(cls, op): + clean_up_func(op) + if op.inputs[0].ndim == 2: + return (yield from cls._tile_df(op)) + else: + return cls._tile_series(op) + + def _infer_df_func_returns(self, df, dtypes, dtype=None, name=None, index=None): + if isinstance(self._func, np.ufunc): + output_type = OutputType.dataframe + new_dtypes = None + index_value = "inherit" + new_elementwise = True + else: + if self.output_types is not None and ( + dtypes is not None or dtype is not None + ): + ret_dtypes = dtypes if dtypes is not None else (name, dtype) + ret_index_value = parse_index(index) if index is not None else None + self._elementwise = False + return ret_dtypes, ret_index_value + + output_type = new_dtypes = index_value = None + new_elementwise = False + + try: + empty_df = build_df(df, size=2) + with np.errstate(all="ignore"), quiet_stdio(): + infer_df = empty_df.apply( + self._func, + axis=self._axis, + raw=self._raw, + result_type=self._result_type, + args=self.args, + **self.kwds, + ) + if index_value is None: + if infer_df.index is empty_df.index: + index_value = "inherit" + else: + index_value = parse_index(pd.RangeIndex(-1)) + + if isinstance(infer_df, pd.DataFrame): + output_type = output_type or OutputType.dataframe + new_dtypes = new_dtypes or infer_df.dtypes + else: + output_type = output_type or OutputType.series + new_dtypes = (name or infer_df.name, dtype or infer_df.dtype) + new_elementwise = False if new_elementwise is None else new_elementwise + except: # noqa: E722 # nosec + pass + + self.output_types = ( + [output_type] if not self.output_types else self.output_types + ) + dtypes = new_dtypes if dtypes is None else dtypes + index_value = index_value if index is None else parse_index(index) + self._elementwise = ( + new_elementwise if self._elementwise is None else self._elementwise + ) + return dtypes, index_value + + def _call_df_or_series(self, df): + return self.new_df_or_series([df]) + + def _call_dataframe(self, df, dtypes=None, dtype=None, name=None, index=None): + # for backward compatibility + dtype = dtype if dtype is not None else dtypes + dtypes, index_value = self._infer_df_func_returns( + df, dtypes, dtype=dtype, name=name, index=index + ) + if index_value is None: + index_value = parse_index(None, (df.key, df.index_value.key)) + for arg, desc in zip((self.output_types, dtypes), ("output_types", "dtypes")): + if arg is None: + raise TypeError( + f"Cannot determine {desc} by calculating with enumerate data, " + "please specify it as arguments" + ) + + if index_value == "inherit": + index_value = df.index_value + + if self._elementwise: + shape = df.shape + elif self.output_types[0] == OutputType.dataframe: + shape = [np.nan, np.nan] + 
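+ # apply collapses the dimension along ``axis`` to an unknown length (NaN);
+ # the opposite dimension keeps the input DataFrame's extent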
shape[1 - self.axis] = df.shape[1 - self.axis] + shape = tuple(shape) + else: + shape = (df.shape[1 - self.axis],) + + if self.output_types[0] == OutputType.dataframe: + if self.axis == 0: + return self.new_dataframe( + [df], + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + return self.new_dataframe( + [df], + shape=shape, + dtypes=dtypes, + index_value=df.index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + name, dtype = dtypes + return self.new_series( + [df], shape=shape, name=name, dtype=dtype, index_value=index_value + ) + + def _call_series(self, series, dtypes=None, dtype=None, name=None, index=None): + # for backward compatibility + dtype = dtype if dtype is not None else dtypes + if self._convert_dtype: + if self.output_types is not None and ( + dtypes is not None or dtype is not None + ): + infer_series = test_series = None + else: + test_series = build_series(series, size=2, name=series.name) + try: + with np.errstate(all="ignore"), quiet_stdio(): + infer_series = test_series.apply( + self._func, args=self.args, **self.kwds + ) + except: # noqa: E722 # nosec # pylint: disable=bare-except + infer_series = None + + output_type = self._output_types[0] + + if index is not None: + index_value = parse_index(index) + elif infer_series is not None: + if infer_series.index is test_series.index: + index_value = series.index_value + else: # pragma: no cover + index_value = parse_index(infer_series.index) + else: + index_value = parse_index(None, series) + + if output_type == OutputType.dataframe: + if dtypes is None: + if infer_series is not None and infer_series.ndim == 2: + dtypes = infer_series.dtypes + else: + raise TypeError( + "Cannot determine dtypes, " + "please specify `dtypes` as argument" + ) + columns_value = parse_index(dtypes.index, store_data=True) + + return self.new_dataframe( + [series], + shape=(series.shape[0], len(dtypes)), + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + else: + if ( + dtype is None + and infer_series is not None + and infer_series.ndim == 1 + ): + dtype = infer_series.dtype + else: + dtype = dtype if dtype is not None else np.dtype(object) + if infer_series is not None and infer_series.ndim == 1: + name = name or infer_series.name + return self.new_series( + [series], + dtype=dtype, + shape=series.shape, + index_value=index_value, + name=name, + ) + else: + dtype = dtype if dtype is not None else np.dtype("object") + return self.new_series( + [series], + dtype=dtype, + shape=series.shape, + index_value=series.index_value, + name=name, + ) + + def __call__(self, df_or_series, dtypes=None, dtype=None, name=None, index=None): + axis = getattr(self, "axis", None) or 0 + dtypes = make_dtypes(dtypes) + dtype = make_dtype(dtype) + self._axis = validate_axis(axis, df_or_series) + + if self.output_types and self.output_types[0] == OutputType.df_or_series: + return self._call_df_or_series(df_or_series) + + if df_or_series.op.output_types[0] == OutputType.dataframe: + return self._call_dataframe( + df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index + ) + else: + return self._call_series( + df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index + ) + + +def df_apply( + df, + func, + axis=0, + raw=False, + result_type=None, + args=(), + dtypes=None, + dtype=None, + name=None, + output_type=None, + index=None, + elementwise=None, + skip_infer=False, + **kwds, +): + """ + Apply a function along 
an axis of the DataFrame. + + Objects passed to the function are Series objects whose index is + either the DataFrame's index (``axis=0``) or the DataFrame's columns + (``axis=1``). By default (``result_type=None``), the final return type + is inferred from the return type of the applied function. Otherwise, + it depends on the `result_type` argument. + + Parameters + ---------- + func : function + Function to apply to each column or row. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + raw : bool, default False + Determines if row or column is passed as a Series or ndarray object: + + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray objects + instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + result_type : {'expand', 'reduce', 'broadcast', None}, default None + These only act when ``axis=1`` (columns): + + * 'expand' : list-like results will be turned into columns. + * 'reduce' : returns a Series if possible rather than expanding + list-like results. This is the opposite of 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the DataFrame, the original index and columns will be + retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. + + output_type : {'dataframe', 'series'}, default None + Specify type of returned object. See `Notes` for more details. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + index : Index, default None + Specify index of returned object. See `Notes` for more details. + + elementwise : bool, default False + Specify whether ``func`` is an elementwise function: + + * ``False`` : The function is not elementwise. Mars will try + concatenating chunks in rows (when ``axis=0``) or in columns + (when ``axis=1``) and then apply ``func`` onto the concatenated + chunk. The concatenation step can cause extra latency. + * ``True`` : The function is elementwise. Mars will apply + ``func`` to original chunks. This will not introduce extra + concatenation step and reduce overhead. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + args : tuple + Positional arguments to pass to `func` in addition to the + array/series. + + **kwds + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + Series or DataFrame + Result of applying ``func`` along the given axis of the + DataFrame. + + See Also + -------- + DataFrame.applymap: For elementwise operations. + DataFrame.aggregate: Only perform aggregating type operations. + DataFrame.transform: Only perform transforming type operations. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock DataFrame, and the apply call may + fail. When this happens, you need to specify the type of apply call + (DataFrame or Series) in output_type. 
+ + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. + + Examples + -------- + >>> import numpy as np + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df.execute() + A B + 0 4 9 + 1 4 9 + 2 4 9 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0).execute() + A 12 + B 27 + dtype: int64 + + >>> df.apply(np.sum, axis=1).execute() + 0 13 + 1 13 + 2 13 + dtype: int64 + + Returning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1).execute() + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + dtype: object + + Passing ``result_type='expand'`` will expand list-like results + to columns of a Dataframe + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand').execute() + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: md.Series([1, 2], index=['foo', 'bar']), axis=1).execute() + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned by the function, + and broadcast it along the axis. The resulting column names will + be the originals. + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast').execute() + A B + 0 1 2 + 1 1 2 + 2 1 2 + """ + if isinstance(func, (list, dict)): + return df.aggregate(func, axis) + + output_types = kwds.pop("output_types", None) + object_type = kwds.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else None + if skip_infer and output_type is None: + output_type = OutputType.df_or_series + + # calling member function + if isinstance(func, str): + func = getattr(df, func) + sig = inspect.getfullargspec(func) + if "axis" in sig.args: + kwds["axis"] = axis + return func(*args, **kwds) + + op = ApplyOperand( + func=func, + axis=axis, + raw=raw, + result_type=result_type, + args=args, + kwds=kwds, + output_type=output_type, + elementwise=elementwise, + ) + return op(df, dtypes=dtypes, dtype=dtype, name=name, index=index) + + +def series_apply( + series, + func, + convert_dtype=True, + output_type=None, + args=(), + dtypes=None, + dtype=None, + name=None, + index=None, + skip_infer=False, + **kwds, +): + """ + Invoke function on values of Series. + + Can be ufunc (a NumPy function that applies to the entire Series) + or a Python function that only works on single values. + + Parameters + ---------- + func : function + Python function or NumPy ufunc to apply. + + convert_dtype : bool, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object. + + output_type : {'dataframe', 'series'}, default None + Specify type of returned object. See `Notes` for more details. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + index : Index, default None + Specify index of returned object. 
See `Notes` for more details. + + args : tuple + Positional arguments passed to func after the series value. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + **kwds + Additional keyword arguments passed to func. + + Returns + ------- + Series or DataFrame + If func returns a Series object the result will be a DataFrame. + + See Also + -------- + Series.map: For element-wise operations. + Series.agg: Only perform aggregating type operations. + Series.transform: Only perform transforming type operations. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock Series, and the apply call may fail. + When this happens, you need to specify the type of apply call + (DataFrame or Series) in output_type. + + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. + + Examples + -------- + Create a series with typical summer temperatures for each city. + + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series([20, 21, 12], + ... index=['London', 'New York', 'Helsinki']) + >>> s.execute() + London 20 + New York 21 + Helsinki 12 + dtype: int64 + + Square the values by defining a function and passing it as an + argument to ``apply()``. + + >>> def square(x): + ... return x ** 2 + >>> s.apply(square).execute() + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Square the values by passing an anonymous function as an + argument to ``apply()``. + + >>> s.apply(lambda x: x ** 2).execute() + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Define a custom function that needs additional positional + arguments and pass these additional arguments using the + ``args`` keyword. + + >>> def subtract_custom_value(x, custom_value): + ... return x - custom_value + + >>> s.apply(subtract_custom_value, args=(5,)).execute() + London 15 + New York 16 + Helsinki 7 + dtype: int64 + + Define a custom function that takes keyword arguments + and pass these arguments to ``apply``. + + >>> def add_custom_values(x, **kwargs): + ... for month in kwargs: + ... x += kwargs[month] + ... 
return x + + >>> s.apply(add_custom_values, june=30, july=20, august=25).execute() + London 95 + New York 96 + Helsinki 87 + dtype: int64 + """ + if isinstance(func, (list, dict)): + return series.aggregate(func) + + # calling member function + if isinstance(func, str): + func_body = getattr(series, func, None) + if func_body is not None: + return func_body(*args, **kwds) + func_str = func + func = getattr(np, func_str, None) + if func is None: + raise AttributeError( + f"'{func_str!r}' is not a valid function " + f"for '{type(series).__name__}' object" + ) + + if skip_infer and output_type is None: + output_type = OutputType.df_or_series + + output_types = kwds.pop("output_types", None) + object_type = kwds.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else OutputType.series + + op = ApplyOperand( + func=func, + convert_dtype=convert_dtype, + args=args, + kwds=kwds, + output_type=output_type, + ) + return op(series, dtypes=dtypes, dtype=dtype, name=name, index=index) diff --git a/python/xorbits/_mars/dataframe/base/astype.py b/python/xorbits/_mars/dataframe/base/astype.py new file mode 100644 index 000000000..cd73be813 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/astype.py @@ -0,0 +1,416 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +from pandas.api.types import CategoricalDtype + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, ListField, StringField +from ...tensor.base import sort +from ...utils import pd_release_version +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index + +_need_astype_contiguous = pd_release_version == (1, 3, 0) + + +class DataFrameAstype(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.ASTYPE + + _dtype_values = AnyField("dtype_values") + _errors = StringField("errors") + _category_cols = ListField("category_cols") + + def __init__( + self, + dtype_values=None, + errors=None, + category_cols=None, + output_types=None, + **kw + ): + super().__init__( + _dtype_values=dtype_values, + _errors=errors, + _category_cols=category_cols, + _output_types=output_types, + **kw + ) + + @property + def dtype_values(self): + return self._dtype_values + + @property + def errors(self): + return self._errors + + @property + def category_cols(self): + return self._category_cols + + @classmethod + def _tile_one_chunk(cls, op): + c = op.inputs[0].chunks[0] + chunk_op = op.copy().reset_key() + chunk_params = op.outputs[0].params.copy() + chunk_params["index"] = c.index + out_chunks = [chunk_op.new_chunk([c], **chunk_params)] + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + nsplits=op.inputs[0].nsplits, + chunks=out_chunks, + **op.outputs[0].params.copy() + ) + + @classmethod + def _tile_series_index(cls, op): + in_series = op.inputs[0] + out = op.outputs[0] + + unique_chunk = None + if op.dtype_values == "category" and isinstance(op.dtype_values, str): + unique_chunk = (yield from recursive_tile(sort(in_series.unique()))).chunks[ + 0 + ] + + chunks = [] + for c in in_series.chunks: + chunk_op = op.copy().reset_key() + params = c.params.copy() + params["dtype"] = out.dtype + if unique_chunk is not None: + chunk_op._category_cols = [in_series.name] + new_chunk = chunk_op.new_chunk([c, unique_chunk], **params) + else: + new_chunk = chunk_op.new_chunk([c], **params) + chunks.append(new_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, nsplits=in_series.nsplits, chunks=chunks, **out.params.copy() + ) + + @classmethod + def _tile_dataframe(cls, op): + in_df = op.inputs[0] + out = op.outputs[0] + cum_nsplits = np.cumsum((0,) + in_df.nsplits[1]) + out_chunks = [] + + if op.dtype_values == "category": + # all columns need unique values + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + params = c.params.copy() + dtypes = out.dtypes[ + cum_nsplits[c.index[1]] : cum_nsplits[c.index[1] + 1] + ] + params["dtypes"] = dtypes + chunk_op._category_cols = list(c.columns_value.to_pandas()) + unique_chunks = [] + for col in c.columns_value.to_pandas(): + unique = yield from recursive_tile(sort(in_df[col].unique())) + unique_chunks.append(unique.chunks[0]) + new_chunk = chunk_op.new_chunk([c] + unique_chunks, **params) + out_chunks.append(new_chunk) + elif ( + isinstance(op.dtype_values, dict) and "category" in op.dtype_values.values() + ): + # some columns' types are category + category_cols = [ + c + for c, v in op.dtype_values.items() + if isinstance(v, str) and v == "category" + ] + unique_chunks = dict() + for col in category_cols: + unique = yield from recursive_tile(sort(in_df[col].unique())) + unique_chunks[col] = unique.chunks[0] + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + params = 
c.params.copy() + dtypes = out.dtypes[ + cum_nsplits[c.index[1]] : cum_nsplits[c.index[1] + 1] + ] + params["dtypes"] = dtypes + chunk_category_cols = [] + chunk_unique_chunks = [] + for col in c.columns_value.to_pandas(): + if col in category_cols: + chunk_category_cols.append(col) + chunk_unique_chunks.append(unique_chunks[col]) + chunk_op._category_cols = chunk_category_cols + new_chunk = chunk_op.new_chunk([c] + chunk_unique_chunks, **params) + out_chunks.append(new_chunk) + else: + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + params = c.params.copy() + dtypes = out.dtypes[ + cum_nsplits[c.index[1]] : cum_nsplits[c.index[1] + 1] + ] + params["dtypes"] = dtypes + new_chunk = chunk_op.new_chunk([c], **params) + out_chunks.append(new_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, nsplits=in_df.nsplits, chunks=out_chunks, **out.params.copy() + ) + + @classmethod + def tile(cls, op): + if len(op.inputs[0].chunks) == 1: + return cls._tile_one_chunk(op) + elif isinstance(op.inputs[0], DATAFRAME_TYPE): + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series_index(op)) + + @classmethod + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + if not isinstance(op.dtype_values, dict): + if op.category_cols is not None: + uniques = [ctx[c.key] for c in op.inputs[1:]] + dtype = dict( + (col, CategoricalDtype(unique_values)) + for col, unique_values in zip(op.category_cols, uniques) + ) + ctx[op.outputs[0].key] = in_data.astype(dtype, errors=op.errors) + + elif isinstance(in_data, pd.Index): + ctx[op.outputs[0].key] = in_data.astype(op.dtype_values) + else: + if _need_astype_contiguous and not in_data.values.flags.contiguous: + # astype changes the data order in pandas==1.3.0, see pandas#42396 + in_data = in_data.copy() + ctx[op.outputs[0].key] = in_data.astype( + op.dtype_values, errors=op.errors + ) + else: + selected_dtype = dict( + (k, v) for k, v in op.dtype_values.items() if k in in_data.columns + ) + if op.category_cols is not None: + uniques = [ctx[c.key] for c in op.inputs[1:]] + for col, unique_values in zip(op.category_cols, uniques): + selected_dtype[col] = CategoricalDtype(unique_values) + ctx[op.outputs[0].key] = in_data.astype(selected_dtype, errors=op.errors) + + def __call__(self, df): + if isinstance(df, DATAFRAME_TYPE): + empty_df = build_empty_df(df.dtypes) + new_df = empty_df.astype(self.dtype_values, errors=self.errors) + dtypes = [] + for dt, new_dt in zip(df.dtypes, new_df.dtypes): + if new_dt != dt and isinstance(new_dt, CategoricalDtype): + dtypes.append(CategoricalDtype()) + else: + dtypes.append(new_dt) + dtypes = pd.Series(dtypes, index=new_df.dtypes.index) + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + else: + empty_series = build_empty_series(df.dtype) + new_series = empty_series.astype(self.dtype_values, errors=self.errors) + if new_series.dtype != df.dtype: + dtype = ( + CategoricalDtype() + if isinstance(new_series.dtype, CategoricalDtype) + else new_series.dtype + ) + else: # pragma: no cover + dtype = df.dtype + + if isinstance(df, SERIES_TYPE): + return self.new_series( + [df], + shape=df.shape, + dtype=dtype, + name=df.name, + index_value=df.index_value, + ) + else: + new_index = df.index_value.to_pandas().astype(self.dtype_values) + new_index_value = parse_index( + new_index, store_data=df.index_value.has_value() + ) + return self.new_index( + [df], + shape=df.shape, + dtype=dtype, 
+ name=df.name, + index_value=new_index_value, + ) + + +def astype(df, dtype, copy=True, errors="raise"): + """ + Cast a pandas object to a specified dtype ``dtype``. + + Parameters + ---------- + dtype : data type, or dict of column name -> data type + Use a numpy.dtype or Python type to cast entire pandas object to + the same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. + copy : bool, default True + Return a copy when ``copy=True`` (be very careful setting + ``copy=False`` as changes to values then may propagate to other + pandas objects). + errors : {'raise', 'ignore'}, default 'raise' + Control raising of exceptions on invalid data for provided dtype. + + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object. + + Returns + ------- + casted : same type as caller + + See Also + -------- + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. + numpy.ndarray.astype : Cast a numpy array to a specified type. + + Examples + -------- + Create a DataFrame: + + >>> import mars.dataframe as md + >>> df = md.DataFrame(pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})) + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + Cast all columns to int32: + + >>> df.astype('int32').dtypes + col1 int32 + col2 int32 + dtype: object + + Cast col1 to int32 using a dictionary: + + >>> df.astype({'col1': 'int32'}).dtypes + col1 int32 + col2 int64 + dtype: object + + Create a series: + + >>> ser = md.Series(pd.Series([1, 2], dtype='int32')) + >>> ser.execute() + 0 1 + 1 2 + dtype: int32 + >>> ser.astype('int64').execute() + 0 1 + 1 2 + dtype: int64 + + Convert to categorical type: + + >>> ser.astype('category').execute() + 0 1 + 1 2 + dtype: category + Categories (2, int64): [1, 2] + + Convert to ordered categorical type with custom ordering: + + >>> cat_dtype = pd.api.types.CategoricalDtype( + ... categories=[2, 1], ordered=True) + >>> ser.astype(cat_dtype).execute() + 0 1 + 1 2 + dtype: category + Categories (2, int64): [2 < 1] + + Note that using ``copy=False`` and changing data on a new + pandas object may propagate changes: + + >>> s1 = md.Series(pd.Series([1, 2])) + >>> s2 = s1.astype('int64', copy=False) + >>> s1.execute() # note that s1[0] has changed too + 0 1 + 1 2 + dtype: int64 + """ + if isinstance(dtype, dict): + keys = list(dtype.keys()) + if isinstance(df, SERIES_TYPE): + if len(keys) != 1 or keys[0] != df.name: + raise KeyError( + "Only the Series name can be used for the key in Series dtype mappings." + ) + else: + dtype = list(dtype.values())[0] + else: + for k in keys: + columns = df.columns_value.to_pandas() + if k not in columns: + raise KeyError( + "Only a column name can be used for the key in a dtype mappings argument." + ) + op = DataFrameAstype(dtype_values=dtype, errors=errors) + r = op(df) + if not copy: + df.data = r.data + return df + else: + return r + + +def index_astype(ix, dtype, copy=True): + """ + Create an Index with values cast to dtypes. + + The class of a new Index is determined by dtype. When conversion is + impossible, a ValueError exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + Note that any signed integer `dtype` is treated as ``'int64'``, + and any unsigned integer `dtype` is treated as ``'uint64'``, + regardless of the size. 
+ copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype. + """ + return astype(ix, dtype, copy=copy) diff --git a/python/xorbits/_mars/dataframe/base/bloom_filter.py b/python/xorbits/_mars/dataframe/base/bloom_filter.py new file mode 100644 index 000000000..b3fccc8c4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/bloom_filter.py @@ -0,0 +1,283 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, List, Union + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...core.context import Context +from ...lib.bloom_filter import BloomFilter +from ...serialization.serializables import ( + AnyField, + Float64Field, + Int64Field, + StringField, +) +from ...typing import TileableType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameBloomFilter(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_BLOOM_FILTER + + left_on = AnyField("left_on") + right_on = AnyField("right_on") + on = AnyField("on") + # for build + max_elements = Int64Field("max_elements") + error_rate = Float64Field("error_rate") + combine_size = Int64Field("combine_size") + # chunk + execution_stage = StringField("execution_stage", default=None) + + def __init__(self, execution_stage=None, **kwargs): + if execution_stage in ["build", "union"]: + output_types = [OutputType.object] + else: + output_types = [OutputType.dataframe] + kwargs["_output_types"] = output_types + super().__init__(execution_stage=execution_stage, **kwargs) + + def __call__(self, df1: TileableType, df2: TileableType): + return self.new_tileable([df1, df2], **df1.params) + + @classmethod + def tile(cls, op: "DataFrameBloomFilter"): + df1, df2 = op.inputs + # use df2's chunks to build bloom filter + chunks = [] + for c in df2.chunks: + build_op = DataFrameBloomFilter( + on=op.right_on, + max_elements=op.max_elements, + error_rate=op.error_rate, + execution_stage="build", + ) + chunks.append(build_op.new_chunk(inputs=[c])) + + # union all chunk filters + combine_size = op.combine_size + while len(chunks) > combine_size: + new_chunks = [] + for i in range(0, len(chunks), combine_size): + chks = chunks[i : i + combine_size] + if len(chks) == 1: + chk = chks[0] + else: + union_op = DataFrameBloomFilter(execution_stage="union") + for j, c in enumerate(chks): + c._index = (j, 0) + chk = union_op.new_chunk(chks) + new_chunks.append(chk) + chunks = new_chunks + if len(chunks) > 1: + union_op = DataFrameBloomFilter(execution_stage="union") + filter_chunk = union_op.new_chunk(chunks) + else: + filter_chunk = chunks[0] + + filter_chunk.is_broadcaster = True + 
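The union step above combines the per-chunk filters in groups of `combine_size` rather than all at once. An illustrative plain-Python sketch of that combine pattern (not Mars code; `tree_union` is a made-up helper and sets stand in for Bloom filters):

def tree_union(filters, combine_size, union):
    # repeatedly union groups of at most `combine_size` filters until one remains
    while len(filters) > 1:
        filters = [
            union(filters[i : i + combine_size])
            for i in range(0, len(filters), combine_size)
        ]
    return filters[0]

merged = tree_union([{1}, {2}, {3}, {4}, {5}], 2, lambda group: set().union(*group))
assert merged == {1, 2, 3, 4, 5}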
# filter df1 + out_chunks = [] + for chunk in df1.chunks: + filter_op = DataFrameBloomFilter(on=op.left_on, execution_stage="filter") + params = chunk.params.copy() + params["shape"] = (np.nan, chunk.shape[1]) + params["index_value"] = parse_index(pd.RangeIndex(-1)) + out_chunks.append(filter_op.new_chunk([chunk, filter_chunk], **params)) + + new_op = op.copy() + df1_params = df1.params.copy() + df1_params["chunks"] = out_chunks + df1_params["nsplits"] = ((np.nan,) * len(out_chunks), df1.nsplits[1]) + return new_op.new_dataframes(op.inputs, **df1_params) + + @classmethod + def _get_value(cls, value: Any): + # value could be an element or a series, as BloomFilter + # doesn't accept series, convert to list here + if isinstance(value, pd.Series): + return value.tolist() + else: + return value + + @classmethod + def _filter_on_index(cls, on: Union[str, List, None], data: pd.DataFrame): + if on is None: + return True + elif isinstance(on, str): + return on not in data.columns + else: + assert isinstance(on, list) + return any(c not in data.columns for c in on) + + @classmethod + def _build_index_filter(cls, in_data: pd.DataFrame, op: "DataFrameBloomFilter"): + if isinstance(in_data.index, pd.MultiIndex): + index = in_data.index.get_level_values(op.on) + else: + index = in_data.index + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + index.map(lambda v: bloom_filter.add(cls._get_value(v))) + return bloom_filter + + @classmethod + def _build_series_filter(cls, in_data: pd.Series, op: "DataFrameBloomFilter"): + try: + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].map(lambda v: bloom_filter.add(cls._get_value(v))) + except TypeError: + # has unhashable data, convert to str + in_data = in_data.astype(str) + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].map(lambda v: bloom_filter.add(cls._get_value(v))) + return bloom_filter + + @classmethod + def _build_dataframe_filter(cls, in_data: pd.DataFrame, op: "DataFrameBloomFilter"): + try: + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].apply(lambda v: bloom_filter.add(cls._get_value(v)), axis=1) + except TypeError: + # has unhashable data, convert to str + in_data = in_data.astype(cls._convert_to_hashable_dtypes(in_data.dtypes)) + bloom_filter = BloomFilter( + max_elements=op.max_elements, error_rate=op.error_rate + ) + in_data[op.on].apply(lambda v: bloom_filter.add(cls._get_value(v)), axis=1) + return bloom_filter + + @classmethod + def _convert_to_hashable_dtypes(cls, dtypes: pd.Series): + dtypes = dict( + (name, dtype) if np.issubdtype(dtype, int) else (name, str) + for name, dtype in dtypes.iteritems() + ) + return dtypes + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "DataFrameBloomFilter"): + if op.execution_stage == "build": + on = op.on + in_data = ctx[op.inputs[0].key] + if cls._filter_on_index(on, in_data): + bloom_filter = cls._build_index_filter(in_data, op) + elif isinstance(on, str): + bloom_filter = cls._build_series_filter(in_data, op) + else: + bloom_filter = cls._build_dataframe_filter(in_data, op) + ctx[op.outputs[0].key] = bloom_filter + elif op.execution_stage == "union": + # union bloom filters + filters = [ctx[inp.key] for inp in op.inputs] + out = filters[0] + for f in filters[1:]: + out.union(f) + ctx[op.outputs[0].key] = out + elif op.execution_stage == "filter": + on = op.on + in_data = 
ctx[op.inputs[0].key] + bloom_filter = ctx[op.inputs[1].key] + if cls._filter_on_index(on, in_data): + if isinstance(in_data.index, pd.MultiIndex): + idx = in_data.index.names.index(on) + ctx[op.outputs[0].key] = in_data[ + in_data.index.map(lambda x: x[idx] in bloom_filter) + ] + else: + ctx[op.outputs[0].key] = in_data[ + in_data.index.map(lambda x: x in bloom_filter) + ] + else: + row_func = lambda row: cls._get_value(row) in bloom_filter + if isinstance(on, str): + # series + try: + filtered = in_data[in_data[on].map(row_func)] + except TypeError: + converted_data = in_data.astype(str) + filtered = in_data[converted_data[on].map(row_func)] + ctx[op.outputs[0].key] = filtered + else: + # dataframe + try: + filtered = in_data[in_data[on].apply(row_func, axis=1)] + except TypeError: + converted_data = in_data.astype( + cls._convert_to_hashable_dtypes(in_data.dtypes) + ) + filtered = in_data[converted_data[on].apply(row_func, axis=1)] + ctx[op.outputs[0].key] = filtered + + else: # pragma: no cover + raise ValueError(f"Unknown execution stage: {op.execution_stage}") + + +def filter_by_bloom_filter( + df1: TileableType, + df2: TileableType, + left_on: Union[str, List], + right_on: Union[str, List], + max_elements: int = 10000, + error_rate: float = 0.1, + combine_size: int = None, +): + """ + Use bloom filter to filter DataFrame. + + Parameters + ---------- + df1: DataFrame. + DataFrame to be filtered. + df2: DataFrame. + Dataframe to build filter. + left_on: str or list. + Column(s) selected on df1. + right_on: str or list. + Column(s) selected on df2. + max_elements: int + How many elements you expect the filter to hold. + error_rate: float + error_rate defines accuracy. + combine_size: int + Combine size. + + Returns + ------- + DataFrame + Filtered df1. + """ + if combine_size is None: + combine_size = options.combine_size + op = DataFrameBloomFilter( + left_on=left_on, + right_on=right_on, + max_elements=max_elements, + error_rate=error_rate, + combine_size=combine_size, + ) + return op(df1, df2) diff --git a/python/xorbits/_mars/dataframe/base/cartesian_chunk.py b/python/xorbits/_mars/dataframe/base/cartesian_chunk.py new file mode 100644 index 000000000..fb7171d63 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/cartesian_chunk.py @@ -0,0 +1,277 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
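`filter_by_bloom_filter` above is an approximate pre-filter: rows of `df1` whose key may occur in `df2` are kept, and false positives are possible, so it narrows a later join rather than replacing it. A plain-pandas sketch of the semantics (not Mars code; a Python set stands in for the Bloom filter, which would additionally admit occasional false positives):

import pandas as pd

df1 = pd.DataFrame({"key": [1, 2, 3, 4], "v": list("abcd")})
df2 = pd.DataFrame({"key": [2, 4, 6]})

maybe_in_df2 = set(df2["key"])                        # stand-in for the filter built from df2
filtered = df1[df1["key"].map(lambda k: k in maybe_in_df2)]
print(filtered)                                       # keeps the rows with key 2 and 4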
import opcodes +from ...core import recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import ( + DictField, + FunctionField, + KeyField, + TupleField, +) +from ...utils import enter_current_session, has_unknown_shape, quiet_stdio +from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType +from ..utils import ( + build_df, + build_empty_df, + build_series, + parse_index, + validate_output_types, +) + + +class DataFrameCartesianChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.CARTESIAN_CHUNK + + _left = KeyField("left") + _right = KeyField("right") + _func = FunctionField("func") + _args = TupleField("args") + _kwargs = DictField("kwargs") + + def __init__( + self, + left=None, + right=None, + func=None, + args=None, + kwargs=None, + output_types=None, + **kw + ): + super().__init__( + _left=left, + _right=right, + _func=func, + _args=args, + _kwargs=kwargs, + _output_types=output_types, + **kw + ) + if self.memory_scale is None: + self.memory_scale = 2.0 + + @property + def left(self): + return self._left + + @property + def right(self): + return self._right + + @property + def func(self): + return self._func + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return self._kwargs + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._left = self._inputs[0] + self._right = self._inputs[1] + + @staticmethod + def _build_test_obj(obj): + return ( + build_df(obj, size=2) + if obj.ndim == 2 + else build_series(obj, size=2, name=obj.name) + ) + + def __call__(self, left, right, index=None, dtypes=None): + test_left = self._build_test_obj(left) + test_right = self._build_test_obj(right) + output_type = self._output_types[0] if self._output_types else None + + if output_type == OutputType.df_or_series: + return self.new_df_or_series([left, right]) + + # try run to infer meta + try: + with np.errstate(all="ignore"), quiet_stdio(): + obj = self._func(test_left, test_right, *self._args, **self._kwargs) + except: # noqa: E722 # nosec # pylint: disable=bare-except + if output_type == OutputType.series: + obj = pd.Series([], dtype=np.dtype(object)) + elif output_type == OutputType.dataframe and dtypes is not None: + obj = build_empty_df(dtypes) + else: + raise TypeError( + "Cannot determine `output_type`, " + "you have to specify it as `dataframe` or `series`, " + "for dataframe, `dtypes` is required as well " + "if output_type='dataframe'" + ) + + if getattr(obj, "ndim", 0) == 1 or output_type == OutputType.series: + shape = self._kwargs.pop("shape", (np.nan,)) + if index is None: + index = obj.index + index_value = parse_index( + index, left, right, self._func, self._args, self._kwargs + ) + return self.new_series( + [left, right], + dtype=obj.dtype, + shape=shape, + index_value=index_value, + name=obj.name, + ) + else: + dtypes = dtypes if dtypes is not None else obj.dtypes + # dataframe + shape = (np.nan, len(dtypes)) + columns_value = parse_index(dtypes.index, store_data=True) + if index is None: + index = obj.index + index_value = parse_index( + index, left, right, self._func, self._args, self._kwargs + ) + return self.new_dataframe( + [left, right], + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameCartesianChunk"): + left = op.left + right = op.right + out = op.outputs[0] + out_type = op.output_types[0] + + if left.ndim == 2 and left.chunk_shape[1] > 1: + if 
has_unknown_shape(left): + yield + # if left is a DataFrame, make sure 1 chunk on axis columns + left = yield from recursive_tile(left.rechunk({1: left.shape[1]})) + if right.ndim == 2 and right.chunk_shape[1] > 1: + if has_unknown_shape(right): + yield + # if right is a DataFrame, make sure 1 chunk on axis columns + right = yield from recursive_tile(right.rechunk({1: right.shape[1]})) + + out_chunks = [] + if out_type == OutputType.dataframe: + nsplits = [[], [out.shape[1]]] + elif out_type == OutputType.series: + nsplits = [[]] + else: + # DataFrameOrSeries + nsplits = None + i = 0 + for left_chunk in left.chunks: + for right_chunk in right.chunks: + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + if out_type == OutputType.df_or_series: + out_chunks.append( + chunk_op.new_chunk( + [left_chunk, right_chunk], index=(i, 0), collapse_axis=1 + ) + ) + elif out_type == OutputType.dataframe: + shape = (np.nan, out.shape[1]) + index_value = parse_index( + out.index_value.to_pandas(), + left_chunk, + right_chunk, + op.func, + op.args, + op.kwargs, + ) + out_chunk = chunk_op.new_chunk( + [left_chunk, right_chunk], + shape=shape, + index_value=index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + index=(i, 0), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + else: + shape = (np.nan,) + index_value = parse_index( + out.index_value.to_pandas(), + left_chunk, + right_chunk, + op.func, + op.args, + op.kwargs, + ) + out_chunk = chunk_op.new_chunk( + [left_chunk, right_chunk], + shape=shape, + index_value=index_value, + dtype=out.dtype, + name=out.name, + index=(i,), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + i += 1 + + params = out.params + params["nsplits"] = tuple(tuple(ns) for ns in nsplits) if nsplits else nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameCartesianChunk"): + left, right = ctx[op.left.key], ctx[op.right.key] + ctx[op.outputs[0].key] = op.func(left, right, *op.args, **(op.kwargs or dict())) + + +def cartesian_chunk(left, right, func, skip_infer=False, args=(), **kwargs): + output_type = kwargs.pop("output_type", None) + output_types = kwargs.pop("output_types", None) + object_type = kwargs.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else None + if output_type: + output_types = [output_type] + elif skip_infer: + output_types = [OutputType.df_or_series] + index = kwargs.pop("index", None) + dtypes = kwargs.pop("dtypes", None) + memory_scale = kwargs.pop("memory_scale", None) + + op = DataFrameCartesianChunk( + left=left, + right=right, + func=func, + args=args, + kwargs=kwargs, + output_types=output_types, + memory_scale=memory_scale, + ) + return op(left, right, index=index, dtypes=dtypes) diff --git a/python/xorbits/_mars/dataframe/base/check_monotonic.py b/python/xorbits/_mars/dataframe/base/check_monotonic.py new file mode 100644 index 000000000..2d76daf96 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/check_monotonic.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
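For reference, the `cartesian_chunk` operator defined just above applies the user function to every (left chunk, right chunk) pair and stacks the per-pair results. A plain-pandas sketch of that behaviour (not Mars code, ignoring index and meta handling; the cross merge assumes pandas >= 1.2):

import pandas as pd

left_chunks = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3]})]
right_chunks = [pd.DataFrame({"b": [10]}), pd.DataFrame({"b": [20, 30]})]

def func(lhs, rhs):
    # illustrative pair-wise computation: cross product of the two pieces
    return lhs.merge(rhs, how="cross")

result = pd.concat(
    [func(l, r) for l in left_chunks for r in right_chunks], ignore_index=True
)
print(result)    # 3 left rows x 3 right rows -> 9 rows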
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField +from ...tensor.core import TensorOrder +from ...tensor.merge import TensorConcatenate +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class DataFrameCheckMonotonic(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.CHECK_MONOTONIC + + # 0 - increasing, 1 - decreasing + _decreasing = BoolField("decreasing") + _strict = BoolField("strict") + + def __init__(self, decreasing=None, strict=None, output_types=None, **kw): + super().__init__( + _decreasing=decreasing, _strict=strict, _output_types=output_types, **kw + ) + + @property + def decreasing(self): + return self._decreasing + + @property + def strict(self): + return self._strict + + def __call__(self, df_obj): + self._output_types = [OutputType.scalar] + return self.new_tileable([df_obj], shape=(), dtype=np.dtype(bool)) + + @classmethod + def tile(cls, op: "DataFrameCheckMonotonic"): + map_chunks = [] + for c in op.inputs[0].chunks: + new_op = DataFrameCheckMonotonic( + decreasing=op.decreasing, + strict=op.strict, + stage=OperandStage.map, + output_types=[OutputType.series], + order=TensorOrder.C_ORDER, + ) + map_chunks.append(new_op.new_chunk([c], shape=(2,), dtype=np.dtype(bool))) + + concat_op = TensorConcatenate(axis=0, dtype=np.dtype(bool)) + concat_r_chunk = concat_op.new_chunk( + map_chunks, + shape=(len(map_chunks),), + index=(0, 0), + order=TensorOrder.C_ORDER, + ) + + new_op = DataFrameCheckMonotonic( + decreasing=op.decreasing, + strict=op.strict, + stage=OperandStage.reduce, + output_types=[OutputType.scalar], + order=TensorOrder.C_ORDER, + ) + r_chunk = new_op.new_chunk( + [concat_r_chunk], shape=(), order=TensorOrder.C_ORDER, dtype=np.dtype(bool) + ) + + new_op = op.copy().reset_key() + params = op.outputs[0].params + params["chunks"] = [r_chunk] + params["nsplits"] = () + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameCheckMonotonic"): + in_data = ctx[op.inputs[0].key] + if op.stage == OperandStage.map: + is_mono = ( + in_data.is_monotonic_increasing + if not op.decreasing + else in_data.is_monotonic_decreasing + ) + if op.strict and is_mono: + is_mono = in_data.is_unique + + if isinstance(in_data, pd.Index): + edge_array = np.array([in_data[0], in_data[-1]]) + else: + edge_array = np.array([in_data.iloc[0], in_data.iloc[-1]]) + + ctx[op.outputs[0].key] = ( + np.array([is_mono]), + edge_array, + ) + else: + in_series = pd.Series(in_data[1]) + is_edge_mono = ( + in_series.is_monotonic_increasing + if not op.decreasing + else in_series.is_monotonic_decreasing + ) + if op.strict and is_edge_mono: + is_edge_mono = in_series.is_unique + ctx[op.outputs[0].key] = in_data[0].all() and is_edge_mono + + +def check_monotonic(series_or_index, decreasing=False, strict=False): + """ + Check if values in the object are monotonic increasing + or decreasing. 
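The check above is two-staged: each chunk reports whether it is locally monotonic together with its first and last values, and the reduce step only has to verify that those chunk edges are themselves monotonic (and unique when `strict`). A small pandas sketch of that idea (plain pandas, not Mars code; `chunks` stands in for the per-chunk data):

import pandas as pd

chunks = [pd.Series([1, 2, 3]), pd.Series([3, 5, 8]), pd.Series([9, 9, 10])]

local_ok = [c.is_monotonic_increasing for c in chunks]
edges = pd.Series([v for c in chunks for v in (c.iloc[0], c.iloc[-1])])

assert all(local_ok) and edges.is_monotonic_increasing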
+ + Parameters + ---------- + decreasing : bool + If True, check if values are monotonic decreasing, + otherwise check if values are monotonic increasing + strict : bool + If True, values need to be unique to get a positive + result + + Returns + ------- + Scalar + """ + op = DataFrameCheckMonotonic(decreasing=decreasing, strict=strict) + return op(series_or_index) + + +def is_monotonic(series_or_index): + """ + Return boolean scalar if values in the object are + monotonic_increasing. + + Returns + ------- + Scalar + """ + return check_monotonic(series_or_index, decreasing=False, strict=False) + + +is_monotonic_increasing = is_monotonic + + +def is_monotonic_decreasing(series_or_index): + """ + Return boolean scalar if values in the object are + monotonic_decreasing. + + Returns + ------- + Scalar + """ + return check_monotonic(series_or_index, decreasing=True, strict=False) diff --git a/python/xorbits/_mars/dataframe/base/core.py b/python/xorbits/_mars/dataframe/base/core.py new file mode 100644 index 000000000..6a5f38c41 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/core.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...serialization.serializables import KeyField +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class DataFrameDeviceConversionBase(DataFrameOperand, DataFrameOperandMixin): + _input = KeyField("input") + + @property + def input(self): + return self._input + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + + def __call__(self, obj): + if isinstance(obj, DATAFRAME_TYPE): + return self.new_dataframe( + [obj], + shape=obj.shape, + dtypes=obj.dtypes, + index_value=obj.index_value, + columns_value=obj.columns_value, + ) + else: + assert isinstance(obj, SERIES_TYPE) + return self.new_series( + [obj], + shape=obj.shape, + dtype=obj.dtype, + index_value=obj.index_value, + name=obj.name, + ) + + @classmethod + def tile(cls, op): + # Isolate ops on cpu from subsequent ops on gpu + yield + out_chunks = [] + for c in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk([c], **c.params) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + out = op.outputs[0] + return new_op.new_tileables( + op.inputs, chunks=out_chunks, nsplits=op.inputs[0].nsplits, **out.params + ) diff --git a/python/xorbits/_mars/dataframe/base/cut.py b/python/xorbits/_mars/dataframe/base/cut.py new file mode 100644 index 000000000..cb8405d35 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/cut.py @@ -0,0 +1,607 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from numbers import Integral + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, ExecutableTuple, OutputType, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import ( + AnyField, + BoolField, + Int32Field, + KeyField, + StringField, +) +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ...utils import has_unknown_shape +from ..core import INDEX_TYPE, SERIES_TYPE +from ..datasource.index import from_pandas as asindex +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameCut(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.CUT + + _input = KeyField("input") + _bins = AnyField("bins") + _right = BoolField("right") + _labels = AnyField("labels") + _retbins = BoolField("retbins") + _precision = Int32Field("precision") + _include_lowest = BoolField("include_lowest") + _duplicates = StringField("duplicates") + _ordered = BoolField("ordered") + + def __init__( + self, + bins=None, + right=None, + labels=None, + retbins=None, + precision=None, + include_lowest=None, + duplicates=None, + ordered=None, + **kw + ): + super().__init__( + _bins=bins, + _right=right, + _labels=labels, + _retbins=retbins, + _precision=precision, + _include_lowest=include_lowest, + _duplicates=duplicates, + _ordered=ordered, + **kw + ) + + @property + def input(self): + return self._input + + @property + def bins(self): + return self._bins + + @property + def right(self): + return self._right + + @property + def labels(self): + return self._labels + + @property + def retbins(self): + return self._retbins + + @property + def precision(self): + return self._precision + + @property + def include_lowest(self): + return self._include_lowest + + @property + def duplicates(self): + return self._duplicates + + @property + def ordered(self): + return self._ordered + + @property + def output_limit(self): + return 1 if not self._retbins else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._bins, ENTITY_TYPE): + self._bins = next(inputs_iter) + if isinstance(self._labels, ENTITY_TYPE): + self._labels = next(inputs_iter) + + def __call__(self, x): + if isinstance(x, pd.Series): + x = asseries(x) + elif not isinstance(x, ENTITY_TYPE): + x = astensor(x) + if x.ndim != 1: + raise ValueError("Input array must be 1 dimensional") + if x.size == 0: + raise ValueError("Cannot cut empty array") + + inputs = [x] + if self._labels is not None and not isinstance( + self._labels, (bool, ENTITY_TYPE) + ): + self._labels = np.asarray(self._labels) + + # infer dtype + x_empty = ( + pd.Series([1], dtype=x.dtype) + if isinstance(x, SERIES_TYPE) + else np.asarray([1], dtype=x.dtype) + ) + if isinstance(self._bins, INDEX_TYPE): + bins = self._bins.index_value.to_pandas() + inputs.append(self._bins) + bins_unknown = True + 
elif isinstance(self._bins, ENTITY_TYPE): + bins = np.asarray([2], dtype=self._bins.dtype) + inputs.append(self._bins) + bins_unknown = True + else: + bins = self._bins + bins_unknown = isinstance(self._bins, Integral) + if isinstance(self._labels, ENTITY_TYPE): + bins_unknown = True + labels = None + inputs.append(self._labels) + else: + if self._labels is False or not bins_unknown: + labels = self._labels + else: + labels = None + ret = pd.cut( + x_empty, + bins, + right=self._right, + labels=labels, + retbins=True, + include_lowest=self._include_lowest, + duplicates=self._duplicates, + ) + + kws = [] + output_types = [] + if bins_unknown and isinstance(ret[0].dtype, pd.CategoricalDtype): + # inaccurate dtype, just create an empty one + out_dtype = pd.CategoricalDtype() + else: + out_dtype = ret[0].dtype + if isinstance(ret[0], pd.Series): + output_types.append(OutputType.series) + kws.append( + { + "dtype": out_dtype, + "shape": x.shape, + "index_value": x.index_value, + "name": x.name, + } + ) + elif isinstance(ret[0], np.ndarray): + output_types.append(OutputType.tensor) + kws.append( + {"dtype": out_dtype, "shape": x.shape, "order": TensorOrder.C_ORDER} + ) + else: + assert isinstance(ret[0], pd.Categorical) + output_types.append(OutputType.categorical) + kws.append( + { + "dtype": out_dtype, + "shape": x.shape, + "categories_value": parse_index( + out_dtype.categories, store_data=True + ), + } + ) + + if self._retbins: + if isinstance(self._bins, (pd.IntervalIndex, INDEX_TYPE)): + output_types.append(OutputType.index) + kws.append( + { + "dtype": self._bins.dtype, + "shape": self._bins.shape, + "index_value": self._bins.index_value + if isinstance(self._bins, INDEX_TYPE) + else parse_index(self._bins, store_data=False), + "name": self._bins.name, + } + ) + else: + output_types.append(OutputType.tensor) + kws.append( + { + "dtype": ret[1].dtype, + "shape": ret[1].shape if ret[1].size > 0 else (np.nan,), + "order": TensorOrder.C_ORDER, + } + ) + + self.output_types = output_types + return ExecutableTuple(self.new_tileables(inputs, kws=kws)) + + @classmethod + def tile(cls, op): + if isinstance(op.bins, ENTITY_TYPE): + # check op.bins chunk shapes + if has_unknown_shape(op.bins): + yield + bins = yield from recursive_tile(op.bins.rechunk(op.bins.shape)) + else: + bins = op.bins + + if isinstance(op.labels, ENTITY_TYPE): + # check op.labels chunk shapes + if has_unknown_shape(op.labels): + yield + labels = yield from recursive_tile(op.labels.rechunk(op.labels.shape)) + else: + labels = op.labels + + if isinstance(op.bins, Integral): + input_min, input_max = yield from recursive_tile( + op.input.min(), op.input.max() + ) + input_min_chunk = input_min.chunks[0] + input_max_chunk = input_max.chunks[0] + + # let input min and max execute first + min_max_chunks = [input_min_chunk, input_max_chunk] + yield min_max_chunks + [c for inp in op.inputs for c in inp.chunks] + + ctx = get_context() + keys = [input_min_chunk.key, input_max_chunk.key] + # get min and max of x + min_val, max_val = ctx.get_chunks_result(keys) + # calculate bins + if np.isinf(min_val) or np.isinf(max_val): + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + elif min_val == max_val: # adjust end points before binning + min_val -= 0.001 * abs(min_val) if min_val != 0 else 0.001 + max_val += 0.001 * abs(max_val) if max_val != 0 else 0.001 + bins = np.linspace(min_val, max_val, bins + 1, endpoint=True) + else: # adjust end points before binning + bins = np.linspace(min_val, max_val, 
bins + 1, endpoint=True) + adj = (max_val - min_val) * 0.001 # 0.1% of the range + if op.right: + bins[0] -= adj + else: + bins[-1] += adj + + outs = op.outputs + + out_chunks = [] + for c in op.input.chunks: + chunk_op = op.copy().reset_key() + chunk_inputs = [c] + chunk_op._bins = bins + # do not return bins always for chunk + chunk_op._retbins = False + if isinstance(bins, ENTITY_TYPE): + chunk_inputs.append(bins.chunks[0]) + chunk_op._labels = labels + if isinstance(labels, ENTITY_TYPE): + chunk_inputs.append(labels.chunks[0]) + + chunk_kws = [] + if isinstance(outs[0], SERIES_TYPE): + chunk_kws.append( + { + "dtype": outs[0].dtype, + "shape": c.shape, + "index_value": c.index_value, + "name": c.name, + "index": c.index, + } + ) + elif isinstance(outs[0], TENSOR_TYPE): + chunk_kws.append( + { + "dtype": outs[0].dtype, + "shape": c.shape, + "order": TensorOrder.C_ORDER, + "index": c.index, + } + ) + else: + chunk_kws.append( + { + "dtype": outs[0].dtype, + "shape": c.shape, + "categories_value": outs[0].categories_value, + "index": c.index, + } + ) + + out_chunks.append(chunk_op.new_chunk(chunk_inputs, kws=chunk_kws)) + + kws = [] + out_kw = outs[0].params + out_kw["chunks"] = out_chunks + out_kw["nsplits"] = op.input.nsplits + kws.append(out_kw) + if len(outs) == 2: + bins_kw = outs[1].params + bins_kw["chunks"] = bins_chunks = [] + if isinstance(bins, ENTITY_TYPE): + bins_chunks.append(bins.chunks[0]) + else: + if op.duplicates == "drop": + if isinstance(bins, (np.ndarray, list, tuple)): + bins = np.unique(bins) + else: + bins = bins.unique() + bins = bins.astype(outs[1].dtype, copy=False) + convert = ( + astensor if not isinstance(bins, pd.IntervalIndex) else asindex + ) + converted = yield from recursive_tile( + convert(bins, chunk_size=len(bins)) + ) + bins_chunks.append(converted.chunks[0]) + bins_kw["nsplits"] = ((len(bins),),) + kws.append(bins_kw) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + x = ctx[op.input.key] + bins = ctx[op.bins.key] if isinstance(op.bins, ENTITY_TYPE) else op.bins + labels = ctx[op.labels.key] if isinstance(op.labels, ENTITY_TYPE) else op.labels + + if pd.__version__ >= "1.1.0": + cut = partial( + pd.cut, + right=op.right, + retbins=op.retbins, + precision=op.precision, + include_lowest=op.include_lowest, + duplicates=op.duplicates, + ordered=op.ordered, + ) + else: + cut = partial( + pd.cut, + right=op.right, + retbins=op.retbins, + precision=op.precision, + include_lowest=op.include_lowest, + duplicates=op.duplicates, + ) + try: + ret = cut(x, bins, labels=labels) + except ValueError: + # fail due to buffer source array is read-only + ret = cut(x.copy(), bins, labels=labels) + if op.retbins: # pragma: no cover + ctx[op.outputs[0].key] = ret[0] + ctx[op.outputs[1].key] = ret[1] + else: + ctx[op.outputs[0].key] = ret + + +def cut( + x, + bins, + right: bool = True, + labels=None, + retbins: bool = False, + precision: int = 3, + include_lowest: bool = False, + duplicates: str = "raise", + ordered: bool = True, +): + """ + Bin values into discrete intervals. + + Use `cut` when you need to segment and sort data values into bins. This + function is also useful for going from a continuous variable to a + categorical variable. For example, `cut` could convert ages to groups of + age ranges. Supports binning into an equal number of bins, or a + pre-specified array of bins. + + Parameters + ---------- + x : array-like + The input array to be binned. Must be 1-dimensional. 
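When `bins` is an integer, the tile logic above converts it into explicit edges the same way pandas does: equal-width edges over [min, max], with the outermost edge nudged by 0.1% of the range so the extreme value still lands inside a bin. A numpy sketch of that computation (not Mars code):

import numpy as np

values = np.array([1, 7, 5, 4, 6, 3])
nbins, right = 3, True

mn, mx = values.min(), values.max()
edges = np.linspace(mn, mx, nbins + 1, endpoint=True)
adj = (mx - mn) * 0.001                 # 0.1% of the range
if right:
    edges[0] -= adj                     # min still falls into the first, left-open bin
else:
    edges[-1] += adj
print(edges)                            # approximately [0.994, 3., 5., 7.]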
+ bins : int, sequence of scalars, or IntervalIndex + The criteria to bin by. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. + + right : bool, default True + Indicates whether `bins` includes the rightmost edge or not. If + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` + indicate (1,2], (2,3], (3,4]. This argument is ignored when + `bins` is an IntervalIndex. + labels : array or False, default None + Specifies the labels for the returned bins. Must be the same length as + the resulting bins. If False, returns only integer indicators of the + bins. This affects the type of the output container (see below). + This argument is ignored when `bins` is an IntervalIndex. If True, + raises an error. + retbins : bool, default False + Whether to return the bins or not. Useful when bins is provided + as a scalar. + precision : int, default 3 + The precision at which to store and display the bins labels. + include_lowest : bool, default False + Whether the first interval should be left-inclusive or not. + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + ordered : bool, default True + Whether the labels are ordered or not. Applies to returned types + Categorical and Series (with Categorical dtype). If True, the resulting + categorical will be ordered. If False, the resulting categorical will be + unordered (labels must be provided). + + Returns + ------- + out : Categorical, Series, or Tensor + An array-like object representing the respective bin for each value + of `x`. The type depends on the value of `labels`. + + * True (default) : returns a Series for Series `x` or a + Categorical for all other inputs. The values stored within + are Interval dtype. + + * sequence of scalars : returns a Series for Series `x` or a + Categorical for all other inputs. The values stored within + are whatever the type in the sequence is. + + * False : returns a tensor of integers. + + bins : Tensor or IntervalIndex. + The computed or specified bins. Only returned when `retbins=True`. + For scalar or sequence `bins`, this is a tensor with the computed + bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For + an IntervalIndex `bins`, this is equal to `bins`. + + See Also + -------- + qcut : Discretize variable into equal-sized buckets based on rank + or based on sample quantiles. + Categorical : Array type for storing data that come from a + fixed set of values. + Series : One-dimensional array with axis labels (including time series). + IntervalIndex : Immutable Index implementing an ordered, sliceable set. + + Notes + ----- + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting Series or Categorical object. + + Examples + -------- + Discretize into three equal-sized bins. + + >>> import mars.tensor as mt + >>> import mars.dataframe as md + + >>> md.cut(mt.array([1, 7, 5, 4, 6, 3]), 3).execute() + ... # doctest: +ELLIPSIS + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... 
+ + >>> md.cut(mt.array([1, 7, 5, 4, 6, 3]), 3, retbins=True).execute() + ... # doctest: +ELLIPSIS + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + array([0.994, 3. , 5. , 7. ])) + + Discovers the same bins, but assign them specific labels. Notice that + the returned Categorical's categories are `labels` and is ordered. + + >>> md.cut(mt.array([1, 7, 5, 4, 6, 3]), + ... 3, labels=["bad", "medium", "good"]).execute() + [bad, good, medium, medium, good, bad] + Categories (3, object): [bad < medium < good] + + ordered=False will result in unordered categories when labels are passed. This parameter + can be used to allow non-unique labels: + + >>> md.cut(np.array([1, 7, 5, 4, 6, 3]), 3, + ... labels=["B", "A", "B"], ordered=False).execute() + ['B', 'B', 'A', 'A', 'B', 'B'] + Categories (2, object): ['A', 'B'] + + ``labels=False`` implies you just want the bins back. + + >>> md.cut([0, 1, 1, 2], bins=4, labels=False).execute() + array([0, 1, 1, 3]) + + Passing a Series as an input returns a Series with categorical dtype: + + >>> s = md.Series(mt.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> md.cut(s, 3).execute() + ... # doctest: +ELLIPSIS + a (1.992, 4.667] + b (1.992, 4.667] + c (4.667, 7.333] + d (7.333, 10.0] + e (7.333, 10.0] + dtype: category + Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... + + Passing a Series as an input returns a Series with mapping value. + It is used to map numerically to intervals based on bins. + + >>> s = md.Series(mt.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> md.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False).execute() + ... # doctest: +ELLIPSIS + (a 0.0 + b 1.0 + c 2.0 + d 3.0 + e NaN + dtype: float64, array([0, 2, 4, 6, 8, 10])) + + Use `drop` optional when bins is not unique + + >>> md.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, + ... right=False, duplicates='drop').execute() + ... # doctest: +ELLIPSIS + (a 0.0 + b 1.0 + c 2.0 + d 3.0 + e NaN + dtype: float64, array([0, 2, 4, 6, 10])) + + Passing an IntervalIndex for `bins` results in those categories exactly. + Notice that values not covered by the IntervalIndex are set to NaN. 0 + is to the left of the first bin (which is closed on the right), and 1.5 + falls between two bins. + + >>> bins = md.Index(pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])) + >>> md.cut([0, 0.5, 1.5, 2.5, 4.5], bins).execute() + [NaN, (0, 1], NaN, (2, 3], (4, 5]] + Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] + """ + + if isinstance(bins, Integral) and bins < 1: + raise ValueError("`bins` should be a positive integer") + + op = DataFrameCut( + bins=bins, + right=right, + labels=labels, + retbins=retbins, + precision=precision, + include_lowest=include_lowest, + duplicates=duplicates, + ordered=ordered, + ) + ret = op(x) + if not retbins: + return ret[0] + else: + return ret diff --git a/python/xorbits/_mars/dataframe/base/datetimes.py b/python/xorbits/_mars/dataframe/base/datetimes.py new file mode 100644 index 000000000..8d7375eae --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/datetimes.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import ( + BoolField, + DictField, + KeyField, + StringField, + TupleField, +) +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_series + + +class SeriesDatetimeMethod(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATETIME_METHOD + + _input = KeyField("input") + _method = StringField("method") + _method_args = TupleField("method_args") + _method_kwargs = DictField("method_kwargs") + _is_property = BoolField("is_property") + + def __init__( + self, + method=None, + method_args=None, + method_kwargs=None, + is_property=None, + output_types=None, + **kw + ): + super().__init__( + _method=method, + _method_args=method_args, + _method_kwargs=method_kwargs, + _is_property=is_property, + _output_types=output_types, + **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def method(self): + return self._method + + @property + def method_args(self): + return self._method_args + + @property + def method_kwargs(self): + return self._method_kwargs + + @property + def is_property(self): + return self._is_property + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, inp): + return _datetime_method_to_handlers[self._method].call(self, inp) + + @classmethod + def tile(cls, op): + return _datetime_method_to_handlers[op.method].tile(op) + + @classmethod + def execute(cls, ctx, op): + return _datetime_method_to_handlers[op.method].execute(ctx, op) + + +class SeriesDatetimeMethodBaseHandler: + @classmethod + def call(cls, op, inp): + empty_series = build_empty_series(inp.dtype) + if op.is_property: + test_obj = getattr(empty_series.dt, op.method) + else: + test_obj = getattr(empty_series.dt, op.method)( + *op.method_args, **op.method_kwargs + ) + dtype = test_obj.dtype + return op.new_series( + [inp], + shape=inp.shape, + dtype=dtype, + index_value=inp.index_value, + name=inp.name, + ) + + @classmethod + def tile(cls, op): + out = op.outputs[0] + + out_chunks = [] + for series_chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [series_chunk], + shape=series_chunk.shape, + dtype=out.dtype, + index=series_chunk.index, + index_value=series_chunk.index_value, + name=series_chunk.name, + ) + ) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = op.input.nsplits + new_op = op.copy() + return new_op.new_tileables([op.input], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + try: + out = getattr(inp.dt, op.method) + except ValueError: + # fail due to buffer read-only + out = getattr(inp.copy().dt, op.method) + if not op.is_property: + out = out(*op.method_args, **op.method_kwargs) + ctx[op.outputs[0].key] = out + + +_datetime_method_to_handlers = {} +for method in dir(pd.Series.dt): + if not method.startswith("_"): + 
_datetime_method_to_handlers[method] = SeriesDatetimeMethodBaseHandler diff --git a/python/xorbits/_mars/dataframe/base/describe.py b/python/xorbits/_mars/dataframe/base/describe.py new file mode 100644 index 000000000..769026bfe --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/describe.py @@ -0,0 +1,266 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ... import tensor as mt +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField +from ...utils import has_unknown_shape, lazy_import +from ..core import SERIES_TYPE +from ..initializer import DataFrame, Series +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index + +cudf = lazy_import("cudf") + + +class DataFrameDescribe(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DESCRIBE + + _input = KeyField("input") + _percentiles = ListField("percentiles", FieldTypes.float64) + _include = AnyField("include") + _exclude = AnyField("exclude") + + def __init__( + self, percentiles=None, include=None, exclude=None, output_types=None, **kw + ): + super().__init__( + _percentiles=percentiles, + _include=include, + _exclude=exclude, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def percentiles(self): + return self._percentiles + + @property + def include(self): + return self._include + + @property + def exclude(self): + return self._exclude + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.agg: + self._input = self._inputs[0] + + def __call__(self, df_or_series): + if isinstance(df_or_series, SERIES_TYPE): + if not np.issubdtype(df_or_series.dtype, np.number): + raise NotImplementedError("non-numeric type is not supported for now") + test_series = pd.Series([], dtype=df_or_series.dtype).describe( + percentiles=self._percentiles, + include=self._include, + exclude=self._exclude, + ) + return self.new_series( + [df_or_series], + shape=(len(test_series),), + dtype=test_series.dtype, + index_value=parse_index(test_series.index, store_data=True), + ) + else: + test_inp_df = build_empty_df(df_or_series.dtypes) + test_df = test_inp_df.describe( + percentiles=self._percentiles, + include=self._include, + exclude=self._exclude, + ) + if len(self.percentiles) == 0: + # specify percentiles=False + # Note: unlike pandas that False is illegal value for percentiles, + # Mars DataFrame allows user to specify percentiles=False + # to skip computation about percentiles + test_df.drop(["50%"], axis=0, inplace=True) + for dtype in test_df.dtypes: + if not np.issubdtype(dtype, np.number): + raise NotImplementedError( + "non-numeric type is not supported for now" + ) + return self.new_dataframe( + [df_or_series], + shape=test_df.shape, + 
dtypes=test_df.dtypes, + index_value=parse_index(test_df.index, store_data=True), + columns_value=parse_index(test_df.columns, store_data=True), + ) + + @classmethod + def tile(cls, op): + inp = op.input + + if len(inp.chunks) == 1: + return cls._tile_one_chunk(op) + + if isinstance(inp, SERIES_TYPE): + result = yield from cls._tile_series(op) + else: + result = yield from cls._tile_dataframe(op) + return result + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_params = out.params.copy() + chunk_params["index"] = (0,) * out.ndim + out_chunk = chunk_op.new_chunk([op.input.chunks[0]], kws=[chunk_params]) + + new_op = op.copy() + params = out.params.copy() + params["chunks"] = [out_chunk] + params["nsplits"] = tuple((s,) for s in out.shape) + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_series(cls, op): + series = Series(op.input) + out = op.outputs[0] + index = out.index_value.to_pandas() + # ['count', 'mean', 'std', 'min', {percentiles}, 'max'] + names = index.tolist() + + values = [None] * 6 + for i, agg in enumerate(names[:4]): + values[i] = mt.atleast_1d(getattr(series, agg)()) + values[-1] = mt.atleast_1d(getattr(series, names[-1])()) + values[4] = series.quantile(op.percentiles).to_tensor() + + t = mt.concatenate(values).rechunk(len(names)) + ret = Series(t, index=index, name=series.name) + ret = yield from recursive_tile(ret) + return [ret] + + @classmethod + def _tile_dataframe(cls, op): + df = DataFrame(op.input) + out = op.outputs[0] + dtypes = out.dtypes + columns = dtypes.index.tolist() + + if df.chunk_shape[1] > 1: + df = df.rechunk({1: df.shape[1]}) + + # check dtypes if selected all fields + # to reduce graph scale + if df.dtypes.index.tolist() != columns: + df = df[columns] + + # calculate percentiles + percentiles = None + if len(op.percentiles) > 0: + if has_unknown_shape(*op.inputs): + yield + percentiles = yield from recursive_tile(df.quantile(op.percentiles)) + + # perform aggregation together + aggregation = yield from recursive_tile( + df.agg(["count", "mean", "std", "min", "max"]) + ) + + chunk_op = DataFrameDescribe( + output_types=op.output_types, + stage=OperandStage.agg, + percentiles=op.percentiles, + ) + chunk_params = out.params.copy() + chunk_params["index"] = (0, 0) + in_chunks = aggregation.chunks + if percentiles is not None: + in_chunks += percentiles.chunks + out_chunk = chunk_op.new_chunk(in_chunks, kws=[chunk_params]) + + new_op = op.copy() + params = out.params.copy() + params["chunks"] = [out_chunk] + params["nsplits"] = tuple((s,) for s in out.shape) + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + out = op.outputs[0] + if op.stage is None: # 1 chunk + df_or_series = ctx[op.input.key] + + ctx[out.key] = df_or_series.describe( + percentiles=op.percentiles, include=op.include, exclude=op.exclude + ) + else: + assert op.stage == OperandStage.agg + + inputs = [ctx[inp.key] for inp in op.inputs] + xdf = ( + pd + if isinstance(inputs[0], (pd.DataFrame, pd.Series, pd.Index)) + or cudf is None + else cudf + ) + + if len(inputs) == 1: + df = inputs[0] + else: + assert len(inputs) > 1 + aggregations = inputs[0] + percentiles = xdf.concat(inputs[1:], axis=0) + df = xdf.concat( + [aggregations.iloc[:-1], percentiles, aggregations.iloc[-1:]], + axis=0, + ) + # ['count', 'mean', 'std', 'min', {percentiles}, 'max'] + df.index = out.index_value.to_pandas() + ctx[out.key] = df + + +def describe(df_or_series, 
percentiles=None, include=None, exclude=None): + if percentiles is False: + percentiles = [] + elif percentiles is None: + percentiles = [0.25, 0.5, 0.75] + else: + percentiles = list(percentiles) + if percentiles is not None: + for p in percentiles: + if p < 0 or p > 1: + raise ValueError( + "percentiles should all be in the interval [0, 1]. " + "Try [{0:.3f}] instead.".format(p / 100) + ) + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) + percentiles = np.asarray(percentiles) + + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + percentiles = unique_pcts.tolist() + + op = DataFrameDescribe(percentiles=percentiles, include=include, exclude=exclude) + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/diff.py b/python/xorbits/_mars/dataframe/base/diff.py new file mode 100644 index 000000000..a63994812 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/diff.py @@ -0,0 +1,304 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, Int8Field, Int64Field +from ..core import DATAFRAME_TYPE, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, validate_axis +from .shift import DataFrameShift + + +class DataFrameDiff(DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.DIFF + + _periods = Int64Field("periods") + _axis = Int8Field("axis") + + _bool_columns = AnyField("bool_columns") + + @property + def periods(self): + return self._periods + + @property + def axis(self): + return self._axis + + @property + def bool_columns(self): + return self._bool_columns + + def __init__(self, periods=None, axis=None, bool_columns=None, **kw): + super().__init__(_periods=periods, _axis=axis, _bool_columns=bool_columns, **kw) + + def __call__(self, df_or_series): + params = df_or_series.params.copy() + + if isinstance(df_or_series, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + mock_obj = build_empty_df(df_or_series.dtypes) + params["dtypes"] = mock_obj.diff().dtypes + else: + self.output_types = [OutputType.series] + mock_obj = build_empty_series(df_or_series.dtype, name=df_or_series.name) + params["dtype"] = mock_obj.diff().dtype + + return self.new_tileable([df_or_series], **params) + + @classmethod + def tile(cls, op): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + axis = op.axis or 0 + + if in_obj.chunk_shape[axis] > 1: + shifted = yield from recursive_tile( + DataFrameShift(periods=op.periods, axis=axis)(in_obj) + ) + shift_chunks = shifted.chunks + else: + shift_chunks = itertools.repeat(None) + + chunks = [] + bool_columns_dict = dict() + for in_chunk, shift_chunk in zip(in_obj.chunks, shift_chunks): + params = 
in_chunk.params.copy() + if in_chunk.ndim == 2: + params["dtypes"] = out_obj.dtypes[in_chunk.dtypes.index] + try: + bool_columns = bool_columns_dict[in_chunk.index[1]] + except KeyError: + bool_columns = bool_columns_dict[in_chunk.index[1]] = [ + col + for col, dt in in_chunk.dtypes.items() + if dt == np.dtype(bool) + ] + else: + params["dtype"] = out_obj.dtype + bool_columns = in_chunk.dtype == np.dtype(bool) + + new_op = op.copy().reset_key() + new_op._bool_columns = bool_columns + + if shift_chunk is None: + chunks.append(new_op.new_chunk([in_chunk], **params)) + else: + chunks.append(new_op.new_chunk([in_chunk, shift_chunk], **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [in_obj], chunks=chunks, nsplits=in_obj.nsplits, **out_obj.params + ) + + @classmethod + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + if len(op.inputs) == 1: + if in_data.ndim == 2: + try: + ctx[op.outputs[0].key] = in_data.diff( + periods=op.periods, axis=op.axis + ) + except ValueError: + ctx[op.outputs[0].key] = in_data.copy().diff( + periods=op.periods, axis=op.axis + ) + else: + ctx[op.outputs[0].key] = in_data.diff(periods=op.periods) + else: + in_shift = ctx[op.inputs[1].key] + result = in_data - in_shift + if op.bool_columns: + if in_data.ndim == 2: + result.replace( + {c: {1: True, -1: True, 0: False} for c in op.bool_columns}, + inplace=True, + ) + else: + result.replace({1: True, -1: True, 0: False}, inplace=True) + ctx[op.outputs[0].key] = result + + +def df_diff(df, periods=1, axis=0): + """ + First discrete difference of element. + Calculates the difference of a DataFrame element compared with another + element in the DataFrame (default is the element in the same column + of the previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative + values. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Take difference over rows (0) or columns (1). + + Returns + ------- + DataFrame + + See Also + -------- + Series.diff : First discrete difference for a Series. + DataFrame.pct_change : Percent change over given number of periods. + DataFrame.shift : Shift index by desired number of periods with an + optional time freq. + + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + + Examples + -------- + Difference with previous row + + >>> import mars.dataframe as md + >>> df = md.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 'c': [1, 4, 9, 16, 25, 36]}) + >>> df.execute() + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 + + >>> df.diff().execute() + a b c + 0 NaN NaN NaN + 1 1.0 0.0 3.0 + 2 1.0 1.0 5.0 + 3 1.0 1.0 7.0 + 4 1.0 2.0 9.0 + 5 1.0 3.0 11.0 + + Difference with previous column + + >>> df.diff(axis=1).execute() + a b c + 0 NaN 0.0 0.0 + 1 NaN -1.0 3.0 + 2 NaN -1.0 7.0 + 3 NaN -1.0 13.0 + 4 NaN 0.0 20.0 + 5 NaN 2.0 28.0 + + Difference with 3rd previous row + + >>> df.diff(periods=3).execute() + a b c + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 3.0 2.0 15.0 + 4 3.0 4.0 21.0 + 5 3.0 6.0 27.0 + + Difference with following row + + >>> df.diff(periods=-1).execute() + a b c + 0 -1.0 0.0 -3.0 + 1 -1.0 -1.0 -5.0 + 2 -1.0 -1.0 -7.0 + 3 -1.0 -2.0 -9.0 + 4 -1.0 -3.0 -11.0 + 5 NaN NaN NaN + """ + axis = validate_axis(axis, df) + op = DataFrameDiff(periods=periods, axis=axis) + return op(df) + + +def series_diff(series, periods=1): + """ + First discrete difference of element. 
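The boolean handling in DataFrameDiff.execute above leans on the fact that, for booleans, "the difference is non-zero" is the same predicate as xor, which is what the docstrings promise. A small pandas check of that equivalence (plain pandas, not Mars code; `prev` stands in for the values produced by the shift step):

import pandas as pd

cur = pd.Series([True, True, False, True])
prev = pd.Series([False, True, True, False])   # stand-in for the shifted values

sub = (cur.astype("int64") - prev.astype("int64")).replace({1: True, -1: True, 0: False})
xor = cur ^ prev
assert list(sub) == list(xor)                  # non-zero difference <=> values differ <=> xor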
+ Calculates the difference of a Series element compared with another + element in the Series (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative + values. + + Returns + ------- + Series + First differences of the Series. + + See Also + -------- + Series.pct_change : + Percent change over given number of periods. + Series.shift : + Shift index by desired number of periods with an optional time freq. + DataFrame.diff : + First discrete difference of object. + + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + + Examples + -------- + + Difference with previous row + + >>> import mars.dataframe as md + >>> s = md.Series([1, 1, 2, 3, 5, 8]) + >>> s.diff().execute() + 0 NaN + 1 0.0 + 2 1.0 + 3 1.0 + 4 2.0 + 5 3.0 + dtype: float64 + + Difference with 3rd previous row + + >>> s.diff(periods=3).execute() + 0 NaN + 1 NaN + 2 NaN + 3 2.0 + 4 4.0 + 5 6.0 + dtype: float64 + + Difference with following row + + >>> s.diff(periods=-1).execute() + 0 0.0 + 1 -1.0 + 2 -1.0 + 3 -2.0 + 4 -3.0 + 5 NaN + dtype: float64 + """ + op = DataFrameDiff(periods=periods) + return op(series) diff --git a/python/xorbits/_mars/dataframe/base/drop.py b/python/xorbits/_mars/dataframe/base/drop.py new file mode 100644 index 000000000..d7cbaaedd --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/drop.py @@ -0,0 +1,545 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from collections import OrderedDict + +import numpy as np + +from ... 
import opcodes +from ...core import CHUNK_TYPE, Chunk, Entity, OutputType, recursive_tile +from ...serialization.serializables import AnyField, StringField +from ..core import DATAFRAME_TYPE, INDEX_CHUNK_TYPE, SERIES_TYPE, IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + + +class DataFrameDrop(DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.DATAFRAME_DROP + + _index = AnyField("index") + _columns = AnyField("columns") + _level = AnyField("level") + _errors = StringField("errors") + + def __init__(self, index=None, columns=None, level=None, errors=None, **kw): + super().__init__( + _index=index, _columns=columns, _level=level, _errors=errors, **kw + ) + + @property + def index(self): + return self._index + + @property + def columns(self): + return self._columns + + @property + def level(self): + return self._level + + @property + def errors(self): + return self._errors + + def _filter_dtypes(self, dtypes, ignore_errors=False): + if self._columns: + return dtypes.drop( + index=self._columns, + level=self._level, + errors="ignore" if ignore_errors else self._errors, + ) + else: + return dtypes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if len(self._inputs) > 1: + self._index = next(inputs_iter) + + def __call__(self, df_or_series): + params = df_or_series.params.copy() + shape_list = list(df_or_series.shape) + + if self._index is not None: + if isinstance(df_or_series.index_value.value, IndexValue.RangeIndex): + params["index_value"] = parse_index( + None, (df_or_series.key, df_or_series.index_value.key) + ) + shape_list[0] = np.nan + + if isinstance(df_or_series, DATAFRAME_TYPE): + new_dtypes = self._filter_dtypes(df_or_series.dtypes) + params["columns_value"] = parse_index(new_dtypes.index, store_data=True) + params["dtypes"] = new_dtypes + shape_list[1] = len(new_dtypes) + self.output_types = [OutputType.dataframe] + elif isinstance(df_or_series, SERIES_TYPE): + self.output_types = [OutputType.series] + else: + self.output_types = [OutputType.index] + + params["shape"] = tuple(shape_list) + + inputs = [df_or_series] + if isinstance(self._index, Entity): + inputs.append(self._index) + return self.new_tileable(inputs, **params) + + @classmethod + def tile(cls, op: "DataFrameDrop"): + inp = op.inputs[0] + out = op.outputs[0] + if len(op.inputs) > 1: + rechunked = yield from recursive_tile( + op.index.rechunk({0: (op.index.shape[0],)}) + ) + index_chunk = rechunked.chunks[0] + else: + index_chunk = op.index + + col_to_args = OrderedDict() + chunks = [] + for c in inp.chunks: + params = c.params.copy() + if isinstance(inp, DATAFRAME_TYPE): + new_dtypes, new_col_id = col_to_args.get(c.index[1], (None, None)) + + if new_dtypes is None: + new_col_id = len(col_to_args) + new_dtypes = op._filter_dtypes(c.dtypes, ignore_errors=True) + if len(new_dtypes) == 0: + continue + col_to_args[c.index[1]] = (new_dtypes, new_col_id) + + params.update( + dict( + dtypes=new_dtypes, + index=(c.index[0], new_col_id), + index_value=c.index_value, + columns_value=parse_index(new_dtypes.index, store_data=True), + ) + ) + if op.index is not None: + params.update( + dict( + shape=(np.nan, len(new_dtypes)), + index_value=parse_index(None, (c.key, c.index_value.key)), + ) + ) + else: + params["shape"] = (c.shape[0], len(new_dtypes)) + elif op.index is not None: + params.update( + dict( + shape=(np.nan,), + index_value=parse_index(None, (c.key, c.index_value.key)), + ) + 
) + + chunk_inputs = [c] + if isinstance(index_chunk, Chunk): + chunk_inputs.append(index_chunk) + + new_op = op.copy().reset_key() + new_op._index = index_chunk + chunks.append(new_op.new_chunk(chunk_inputs, **params)) + + new_op = op.copy().reset_key() + params = out.params.copy() + if op.index is not None: + nsplits_list = [(np.nan,) * inp.chunk_shape[0]] + else: + nsplits_list = [inp.nsplits[0]] + if isinstance(inp, DATAFRAME_TYPE): + nsplits_list.append(tuple(len(dt) for dt, _ in col_to_args.values())) + params.update(dict(chunks=chunks, nsplits=tuple(nsplits_list))) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameDrop"): + inp = op.inputs[0] + if isinstance(op.index, CHUNK_TYPE): + index_val = ctx[op.index.key] + else: + index_val = op.index + + if isinstance(inp, INDEX_CHUNK_TYPE): + ctx[op.outputs[0].key] = ctx[inp.key].drop(index_val, errors="ignore") + else: + ctx[op.outputs[0].key] = ctx[inp.key].drop( + index=index_val, columns=op.columns, level=op.level, errors="ignore" + ) + + +def _drop( + df_or_series, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", +): + axis = validate_axis(axis, df_or_series) + if labels is not None: + if axis == 0: + index = labels + else: + columns = labels + + if index is not None and errors == "raise": + warnings.warn("Errors will not raise for non-existing indices") + if isinstance(columns, Entity): + raise NotImplementedError("Columns cannot be Mars objects") + + op = DataFrameDrop(index=index, columns=columns, level=level, errors=errors) + df = op(df_or_series) + if inplace: + df_or_series.data = df.data + else: + return df + + +def df_drop( + df, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", +): + """ + Drop specified labels from rows or columns. + + Remove rows or columns by specifying label names and corresponding + axis, or by specifying directly index or column names. When using a + multi-index, labels on different levels can be removed by specifying + the level. + + Parameters + ---------- + labels : single label or list-like + Index or column labels to drop. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Whether to drop labels from the index (0 or 'index') or + columns (1 or 'columns'). + index : single label or list-like + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). + columns : single label or list-like + Alternative to specifying axis (``labels, axis=1`` + is equivalent to ``columns=labels``). + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If True, do operation inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. Note that errors for missing indices will not raise. + + Returns + ------- + DataFrame + DataFrame without the removed index or column labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis. + + See Also + -------- + DataFrame.loc : Label-location based indexer for selection by label. + DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. + DataFrame.drop_duplicates : Return DataFrame with duplicate rows + removed, optionally only considering certain columns. + Series.drop : Return Series with specified index labels removed. 
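Notes
-----
As stated for ``errors`` above, labels that are missing from the index are
skipped (with a warning) rather than raising ``KeyError``. A minimal sketch of
that behaviour, assuming only the API documented here and an active mars
session:

>>> import mars.dataframe as md
>>> md.DataFrame({'A': [1, 2, 3]}).drop(index=[5]).execute()
   A
0  1
1  2
2  3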
+ + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> import mars.dataframe as md + >>> df = md.DataFrame(np.arange(12).reshape(3, 4), + ... columns=['A', 'B', 'C', 'D']) + >>> df.execute() + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + Drop columns + + >>> df.drop(['B', 'C'], axis=1).execute() + A D + 0 0 3 + 1 4 7 + 2 8 11 + + >>> df.drop(columns=['B', 'C']).execute() + A D + 0 0 3 + 1 4 7 + 2 8 11 + + Drop a row by index + + >>> df.drop([0, 1]).execute() + A B C D + 2 8 9 10 11 + + Drop columns and/or rows of MultiIndex DataFrame + + >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = md.DataFrame(index=midx, columns=['big', 'small'], + ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], + ... [250, 150], [1.5, 0.8], [320, 250], + ... [1, 0.8], [0.3, 0.2]]) + >>> df.execute() + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + length 1.5 1.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + length 1.5 0.8 + falcon speed 320.0 250.0 + weight 1.0 0.8 + length 0.3 0.2 + + >>> df.drop(index='cow', columns='small').execute() + big + lama speed 45.0 + weight 200.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + + >>> df.drop(index='length', level=1).execute() + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + falcon speed 320.0 250.0 + weight 1.0 0.8 + """ + return _drop( + df, + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + +def df_pop(df, item): + """ + Return item and drop from frame. Raise KeyError if not found. + + Parameters + ---------- + item : str + Label of column to be popped. + + Returns + ------- + Series + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=('name', 'class', 'max_speed')) + >>> df.execute() + name class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + >>> df.pop('class').execute() + 0 bird + 1 bird + 2 mammal + 3 mammal + Name: class, dtype: object + + >>> df.execute() + name max_speed + 0 falcon 389.0 + 1 parrot 24.0 + 2 lion 80.5 + 3 monkey NaN + """ + series = df.data[item] + df_drop(df, item, axis=1, inplace=True) + return series + + +def series_drop( + series, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", +): + """ + Return Series with specified index labels removed. + + Remove elements of a Series based on specifying the index labels. + When using a multi-index, labels on different levels can be removed + by specifying the level. + + Parameters + ---------- + labels : single label or list-like + Index labels to drop. + axis : 0, default 0 + Redundant for application on Series. + index : single label or list-like + Redundant for application on Series, but 'index' can be used instead + of 'labels'. + + .. versionadded:: 0.21.0 + columns : single label or list-like + No change is made to the Series; use 'index' or 'labels' instead. + + .. versionadded:: 0.21.0 + level : int or level name, optional + For MultiIndex, level for which the labels will be removed. + inplace : bool, default False + If True, do operation inplace and return None. 
+ errors : {'ignore', 'raise'}, default 'raise' + Note that this argument is kept only for compatibility, and errors + will not raise even if ``errors=='raise'``. + + Returns + ------- + Series + Series with specified index labels removed. + + Raises + ------ + KeyError + If none of the labels are found in the index. + + See Also + -------- + Series.reindex : Return only specified index labels of Series. + Series.dropna : Return series without null values. + Series.drop_duplicates : Return Series with duplicate values removed. + DataFrame.drop : Drop specified labels from rows or columns. + + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> import mars.dataframe as md + >>> s = md.Series(data=np.arange(3), index=['A', 'B', 'C']) + >>> s.execute() + A 0 + B 1 + C 2 + dtype: int64 + + Drop labels B en C + + >>> s.drop(labels=['B', 'C']).execute() + A 0 + dtype: int64 + + Drop 2nd level label in MultiIndex Series + + >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> s = md.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], + ... index=midx) + >>> s.execute() + lama speed 45.0 + weight 200.0 + length 1.2 + cow speed 30.0 + weight 250.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + dtype: float64 + + >>> s.drop(labels='weight', level=1).execute() + lama speed 45.0 + length 1.2 + cow speed 30.0 + length 1.5 + falcon speed 320.0 + length 0.3 + dtype: float64 + """ + return _drop( + series, + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + +def index_drop(index, labels, errors="raise"): + """ + Make new Index with passed list of labels deleted. + + Parameters + ---------- + labels : array-like + errors : {'ignore', 'raise'}, default 'raise' + Note that this argument is kept only for compatibility, and errors + will not raise even if ``errors=='raise'``. + + Returns + ------- + dropped : Index + + Raises + ------ + KeyError + If not all of the labels are found in the selected axis + """ + return _drop(index, labels=labels, errors=errors) diff --git a/python/xorbits/_mars/dataframe/base/drop_duplicates.py b/python/xorbits/_mars/dataframe/base/drop_duplicates.py new file mode 100644 index 000000000..596fd8c8a --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/drop_duplicates.py @@ -0,0 +1,421 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField +from ...utils import calc_nsplits, lazy_import +from ..operands import OutputType +from ..utils import ( + gen_unknown_index_value, + hash_dataframe_on, + parse_index, + standardize_range_index, +) +from ._duplicate import DuplicateOperand, validate_subset + +cudf = lazy_import("cudf") + + +class DataFrameDropDuplicates(DuplicateOperand): + _op_type_ = opcodes.DROP_DUPLICATES + + _ignore_index = BoolField("ignore_index") + + def __init__( + self, + subset=None, + keep=None, + ignore_index=None, + output_types=None, + method=None, + subset_chunk=None, + shuffle_size=None, + **kw + ): + super().__init__( + _subset=subset, + _keep=keep, + _ignore_index=ignore_index, + _output_types=output_types, + _method=method, + _subset_chunk=subset_chunk, + _shuffle_size=shuffle_size, + **kw + ) + + @property + def ignore_index(self): + return self._ignore_index + + @classmethod + def _get_shape(cls, input_shape, op): + shape = (np.nan,) + input_shape[1:] + if op.output_types[0] == OutputType.dataframe and len(shape) == 1: + shape += (3,) + return shape + + @classmethod + def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params): + params = input_params.copy() + if op.ignore_index: + params["index_value"] = parse_index(pd.RangeIndex(-1)) + else: + params["index_value"] = gen_unknown_index_value( + input_params["index_value"], op.keep, op.subset, type(op).__name__ + ) + params["shape"] = cls._get_shape(input_params["shape"], op) + return params + + def __call__(self, inp, inplace=False): + self._output_types = inp.op.output_types + params = self._gen_tileable_params(self, inp.params) + + ret = self.new_tileable([inp], kws=[params]) + if inplace: + inp.data = ret.data + return ret + + @classmethod + def _gen_chunk_params(cls, op: "DataFrameDropDuplicates", input_chunk): + input_params = input_chunk.params + inp = op.inputs[0] + chunk_params = input_params.copy() + chunk_params["index"] = input_chunk.index[:1] + (0,) * (inp.ndim - 1) + chunk_params["shape"] = cls._get_shape(input_params["shape"], op) + chunk_params["index_value"] = gen_unknown_index_value( + input_params["index_value"], input_chunk + ) + if inp.ndim == 2: + chunk_params["columns_value"] = inp.columns_value + chunk_params["dtypes"] = inp.dtypes + else: + chunk_params["name"] = inp.name + chunk_params["dtype"] = inp.dtype + return chunk_params + + @classmethod + def _get_map_output_types(cls, input_chunk, method: str): + if method == "subset_tree": + return [OutputType.dataframe] + else: + return input_chunk.op.output_types + + @classmethod + def _tile_shuffle(cls, op: "DataFrameDropDuplicates", inp): + tiled = super()._tile_shuffle(op, inp)[0] + put_back_chunks = tiled.chunks + if op.ignore_index: + yield put_back_chunks + put_back_chunks = standardize_range_index(put_back_chunks) + new_op = op.copy() + params = tiled.params + params["nsplits"] = calc_nsplits({c.index: c.shape for c in put_back_chunks}) + params["chunks"] = put_back_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _execute_chunk(cls, ctx, op): + inp = ctx[op.input.key] + ctx[op.outputs[0].key] = cls._drop_duplicates(inp, op) + + @classmethod + def _execute_subset_tree_post(cls, ctx, op): + inp = ctx[op.input.key] + out = op.outputs[0] + idx = op.outputs[0].index[0] + subset = ctx[op.subset_chunk.key] + selected = subset[subset["_chunk_index_"] == idx]["_i_"] + ret = inp.iloc[selected] + if op.ignore_index: + 
prev_size = (subset["_chunk_index_"] < out.index[0]).sum() + ret.index = pd.RangeIndex(prev_size, prev_size + len(ret)) + ctx[op.outputs[0].key] = ret + + @classmethod + def _execute_shuffle_map(cls, ctx, op): + out = op.outputs[0] + shuffle_size = op.shuffle_size + subset = op.subset + + inp = ctx[op.input.key] + dropped = cls._drop_duplicates(inp, op) + if dropped.ndim == 1: + dropped = dropped.to_frame() + subset = dropped.columns.tolist() + else: + if subset is None: + subset = dropped.columns.tolist() + dropped["_chunk_index_"] = out.index[0] + dropped["_i_"] = np.arange(dropped.shape[0]) + hashed = hash_dataframe_on(dropped, subset, shuffle_size) + for i, data in enumerate(hashed): + reducer_idx = (i,) + out.index[1:] + ctx[out.key, reducer_idx] = dropped.iloc[data] + + @classmethod + def _execute_shuffle_reduce(cls, ctx, op: "DataFrameDropDuplicates"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + dropped = cls._drop_duplicates( + inp, + op, + subset=[c for c in inp.columns if c not in ("_chunk_index_", "_i_")], + keep=op.keep, + ignore_index=op.ignore_index, + ) + for i in range(op.shuffle_size): + filtered = dropped[dropped["_chunk_index_"] == i] + del filtered["_chunk_index_"] + ctx[out.key, (i,)] = filtered + + @classmethod + def _execute_shuffle_put_back(cls, ctx, op: "DataFrameDropDuplicates"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + inp.sort_values("_i_", inplace=True) + del inp["_i_"] + + if out.op.output_types[0] == OutputType.index: + assert inp.shape[1] == 1 + ret = xdf.Index(inp.iloc[:, 0]) + elif out.op.output_types[0] == OutputType.series: + assert inp.shape[1] == 1 + ret = inp.iloc[:, 0] + ret.name = out.name + else: + ret = inp + + if op.ignore_index: + ret.reset_index(drop=True, inplace=True) + ctx[out.key] = ret + + @classmethod + def execute(cls, ctx, op): + if op.method is None: + # one chunk + cls._execute_chunk(ctx, op) + elif op.method == "tree": + # tree + cls._execute_chunk(ctx, op) + elif op.method == "subset_tree": + # subset tree + if op.stage == OperandStage.map: + cls._execute_subset_tree_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_subset_tree_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_subset_tree_agg(ctx, op) + else: + # post + cls._execute_subset_tree_post(ctx, op) + else: + assert op.method == "shuffle" + if op.stage == OperandStage.map: + cls._execute_shuffle_map(ctx, op) + elif op.reducer_phase == "drop_duplicates": + cls._execute_shuffle_reduce(ctx, op) + else: + assert op.reducer_phase == "put_back" + cls._execute_shuffle_put_back(ctx, op) + + +def df_drop_duplicates( + df, subset=None, keep="first", inplace=False, ignore_index=False, method="auto" +): + """ + Return DataFrame with duplicate rows removed. + + Considering certain columns is optional. Indexes, including time indexes + are ignored. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + inplace : bool, default False + Whether to drop duplicates in place or to return a copy. 
+ ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + DataFrame with duplicates removed or None if ``inplace=True``. + """ + if method not in ("auto", "tree", "subset_tree", "shuffle", None): + raise ValueError( + "method could only be one of " + "'auto', 'tree', 'subset_tree', 'shuffle' or None" + ) + subset = validate_subset(df, subset) + op = DataFrameDropDuplicates( + subset=subset, keep=keep, ignore_index=ignore_index, method=method + ) + return op(df, inplace=inplace) + + +def series_drop_duplicates(series, keep="first", inplace=False, method="auto"): + """ + Return Series with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + Method to handle dropping duplicates: + + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + inplace : bool, default ``False`` + If ``True``, performs operation inplace and returns None. + + Returns + ------- + Series + Series with duplicates dropped. + + See Also + -------- + Index.drop_duplicates : Equivalent method on Index. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Series.duplicated : Related method on Series, indicating duplicate + Series values. + + Examples + -------- + Generate a Series with duplicated entries. + + >>> import mars.dataframe as md + >>> s = md.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], + ... name='animal') + >>> s.execute() + 0 lama + 1 cow + 2 lama + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + With the 'keep' parameter, the selection behaviour of duplicated values + can be changed. The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> s.drop_duplicates().execute() + 0 lama + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + + The value 'last' for parameter 'keep' keeps the last occurrence for + each set of duplicated entries. + + >>> s.drop_duplicates(keep='last').execute() + 1 cow + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + The value ``False`` for parameter 'keep' discards all sets of + duplicated entries. Setting the value of 'inplace' to ``True`` performs + the operation inplace and returns ``None``. + + >>> s.drop_duplicates(keep=False, inplace=True) + >>> s.execute() + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + """ + if method not in ("auto", "tree", "shuffle", None): + raise ValueError( + "method could only be one of 'auto', 'tree', 'shuffle' or None" + ) + op = DataFrameDropDuplicates(keep=keep, method=method) + return op(series, inplace=inplace) + + +def index_drop_duplicates(index, keep="first", method="auto"): + """ + Return Index with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + Returns + ------- + deduplicated : Index + + See Also + -------- + Series.drop_duplicates : Equivalent method on Series. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Index.duplicated : Related method on Index, indicating duplicate + Index values. + + Examples + -------- + Generate an pandas.Index with duplicate values. 
+ + >>> import mars.dataframe as md + + >>> idx = md.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + + The `keep` parameter controls which duplicate values are removed. + The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> idx.drop_duplicates(keep='first').execute() + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last').execute() + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. + + >>> idx.drop_duplicates(keep=False).execute() + Index(['cow', 'beetle', 'hippo'], dtype='object') + """ + if method not in ("auto", "tree", "shuffle", None): + raise ValueError( + "method could only be one of 'auto', 'tree', 'shuffle' or None" + ) + op = DataFrameDropDuplicates(keep=keep, method=method) + return op(index) diff --git a/python/xorbits/_mars/dataframe/base/duplicated.py b/python/xorbits/_mars/dataframe/base/duplicated.py new file mode 100644 index 000000000..bb4db862f --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/duplicated.py @@ -0,0 +1,533 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...core import OutputType +from ...core.operand import OperandStage +from ..utils import gen_unknown_index_value, hash_dataframe_on +from ._duplicate import DuplicateOperand, validate_subset + + +class DataFrameDuplicated(DuplicateOperand): + _op_type_ = opcodes.DUPLICATED + + def __init__( + self, + subset=None, + keep=None, + output_types=None, + method=None, + subset_chunk=None, + shuffle_size=None, + **kw + ): + super().__init__( + _subset=subset, + _keep=keep, + _output_types=output_types, + _method=method, + _subset_chunk=subset_chunk, + _shuffle_size=shuffle_size, + **kw + ) + + @classmethod + def _get_shape(cls, input_shape, op): + return (input_shape[0],) + + @classmethod + def _gen_tileable_params(cls, op: "DataFrameDuplicated", input_params): + # duplicated() always returns a Series + return { + "shape": cls._get_shape(input_params["shape"], op), + "index_value": input_params["index_value"], + "dtype": np.dtype(bool), + "name": input_params.get("name"), + } + + def __call__(self, inp, inplace=False): + self._output_types = [OutputType.series] + params = self._gen_tileable_params(self, inp.params) + + return self.new_tileable([inp], kws=[params]) + + @classmethod + def _get_map_output_types(cls, input_chunk, method: str): + if method in ("tree", "subset_tree"): + return [OutputType.dataframe] + else: + return input_chunk.op.output_types + + @classmethod + def _gen_chunk_params_default(cls, op: "DataFrameDuplicated", input_chunk): + return { + "shape": cls._get_shape(input_chunk.shape, op), + "index_value": input_chunk.index_value, + "dtype": np.dtype(bool), + "name": input_chunk.name if input_chunk.ndim == 1 else None, + "index": (input_chunk.index[0],), + } + + @classmethod + def _get_intermediate_shape(cls, input_shape): + if len(input_shape) > 1: + s = input_shape[1:] + else: + s = (2,) + return (np.nan,) + s + + @classmethod + def _gen_intermediate_chunk_params(cls, op: "DataFrameDuplicated", input_chunk): + inp = op.input + chunk_params = dict() + chunk_params["shape"] = shape = cls._get_intermediate_shape(input_chunk.shape) + chunk_params["index"] = input_chunk.index[:1] + (0,) * (len(shape) - 1) + chunk_params["index_value"] = gen_unknown_index_value( + input_chunk.index_value, input_chunk + ) + if inp.ndim == 2 and len(shape) == 2: + chunk_params["columns_value"] = input_chunk.columns_value + chunk_params["dtypes"] = input_chunk.dtypes + return chunk_params + + @classmethod + def _gen_chunk_params(cls, op: "DataFrameDuplicated", input_chunk): + is_terminal_chunk = False + if op.method is None: + # one chunk + is_terminal_chunk = True + elif op.method == "subset_tree" and op.stage is None: + is_terminal_chunk = True + elif op.method == "tree" and op.stage == OperandStage.agg: + is_terminal_chunk = True + elif op.method == "shuffle" and op.reducer_phase == "put_back": + is_terminal_chunk = True + + if is_terminal_chunk: + return cls._gen_chunk_params_default(op, input_chunk) + else: + return cls._gen_intermediate_chunk_params(op, input_chunk) + + @classmethod + def _duplicated(cls, inp, op, subset=None, keep=None): + if keep is None: + keep = op.keep + if inp.ndim == 2: + if subset is None: + subset = op.subset + return inp.duplicated(subset=subset, keep=keep) + else: + return inp.duplicated(keep=keep) + + @classmethod + def _execute_chunk(cls, ctx, op): + inp = ctx[op.input.key] + ctx[op.outputs[0].key] = cls._duplicated(inp, op) + + @classmethod + def _execute_tree_map(cls, ctx, op): + inp = ctx[op.input.key] + xdf = cls._get_xdf(inp) + if op.subset 
is not None: + result = inp[op.subset].copy() + else: + result = inp.copy() + duplicated = cls._duplicated(inp, op) + if not duplicated.name: + duplicated.name = "_duplicated_" + result.iloc[duplicated.values] = None + result = xdf.concat([result, duplicated], axis=1) + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_tree_combine(cls, ctx, op): + inp = ctx[op.input.key] + result = inp.copy() + duplicated_filter = ~inp.iloc[:, -1] + duplicates = inp.loc[duplicated_filter] + dup_on_duplicated = cls._duplicated(duplicates, op) + result.iloc[duplicated_filter.to_numpy().nonzero()[0], -1] = dup_on_duplicated + duplicated = result.iloc[:, -1] + result.iloc[duplicated.values, :-1] = None + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_tree_agg(cls, ctx, op): + inp = ctx[op.input.key] + result = inp.iloc[:, -1].copy() + duplicates = inp[~inp.iloc[:, -1]] + dup_on_duplicated = cls._duplicated(duplicates, op) + result[~inp.iloc[:, -1]] = dup_on_duplicated + expect_name = op.outputs[0].name + if result.name != expect_name: + result.name = expect_name + result = result.astype(bool) + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_subset_tree_post(cls, ctx, op): + inp = ctx[op.input.key] + idx = op.outputs[0].index[0] + subset = ctx[op.subset_chunk.key] + selected = subset[subset["_chunk_index_"] == idx]["_i_"] + + xdf = cls._get_xdf(inp) + duplicated = np.ones(len(inp), dtype=bool) + duplicated[selected] = False + + ctx[op.outputs[0].key] = xdf.Series(duplicated, index=inp.index) + + @classmethod + def _execute_shuffle_map(cls, ctx, op): + out = op.outputs[0] + shuffle_size = op.shuffle_size + subset = op.subset + + inp = ctx[op.input.key] + if subset is not None: + result = inp[subset].copy() + else: + result = inp.copy() + if result.ndim == 1: + name = result.name + result = result.to_frame() + if name is None: + result.columns = ["_duplicated_"] + subset = result.columns.tolist() + else: + if subset is None: + subset = result.columns.tolist() + if len(subset) == 1: + result.columns = subset = ["_duplicated_"] + result["_chunk_index_"] = out.index[0] + result["_i_"] = np.arange(result.shape[0]) + hashed = hash_dataframe_on(result, subset, shuffle_size) + for i, data in enumerate(hashed): + reducer_idx = (i,) + out.index[1:] + ctx[out.key, reducer_idx] = result.iloc[data] + + @classmethod + def _execute_shuffle_reduce(cls, ctx, op: "DataFrameDuplicated"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + subset = [c for c in inp.columns if c not in ("_chunk_index_", "_i_")] + duplicated = cls._duplicated(inp, op, subset=subset) + result = xdf.concat([duplicated, inp[["_chunk_index_", "_i_"]]], axis=1) + for i in range(op.shuffle_size): + filtered = result[result["_chunk_index_"] == i] + del filtered["_chunk_index_"] + if len(subset) > 1 or subset[0] == "_duplicated_": + filtered.columns = ["_duplicated_"] + filtered.columns[1:].tolist() + else: + filtered.columns = [subset[0]] + filtered.columns[1:].tolist() + ctx[out.key, (i,)] = filtered + + @classmethod + def _execute_shuffle_put_back(cls, ctx, op: "DataFrameDuplicated"): + inputs = list(op.iter_mapper_data(ctx)) + + xdf = cls._get_xdf(inputs[0]) + inp = xdf.concat(inputs) + inp.sort_values("_i_", inplace=True) + del inp["_i_"] + duplicated = inp.iloc[:, 0] + if duplicated.name == "_duplicated_": + duplicated.name = None + ctx[op.outputs[0].key] = duplicated + + @classmethod + def execute(cls, ctx, op: 
"DataFrameDuplicated"): + if op.method is None: + # one chunk + cls._execute_chunk(ctx, op) + elif op.method == "tree": + # tree + if op.stage == OperandStage.map: + cls._execute_tree_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_tree_combine(ctx, op) + else: + assert op.stage == OperandStage.agg + cls._execute_tree_agg(ctx, op) + elif op.method == "subset_tree": + # subset tree + if op.stage == OperandStage.map: + cls._execute_subset_tree_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_subset_tree_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_subset_tree_agg(ctx, op) + else: + # post + cls._execute_subset_tree_post(ctx, op) + else: + assert op.method == "shuffle" + if op.stage == OperandStage.map: + cls._execute_shuffle_map(ctx, op) + elif op.reducer_phase == "drop_duplicates": + cls._execute_shuffle_reduce(ctx, op) + else: + assert op.reducer_phase == "put_back" + cls._execute_shuffle_put_back(ctx, op) + + +def df_duplicated(df, subset=None, keep="first", method="auto"): + """ + Return boolean Series denoting duplicate rows. + + Considering certain columns is optional. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to mark. + + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + Series + Boolean series for each duplicated rows. + + See Also + -------- + Index.duplicated : Equivalent method on index. + Series.duplicated : Equivalent method on Series. + Series.drop_duplicates : Remove duplicate values from Series. + DataFrame.drop_duplicates : Remove duplicate values from DataFrame. + + Examples + -------- + Consider dataset containing ramen rating. + + >>> import mars.dataframe as md + + >>> df = md.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df.execute() + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, for each set of duplicated values, the first occurrence + is set on False and all others on True. + + >>> df.duplicated().execute() + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True. + + >>> df.duplicated(keep='last').execute() + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + By setting ``keep`` on False, all duplicates are True. + + >>> df.duplicated(keep=False).execute() + 0 True + 1 True + 2 False + 3 False + 4 False + dtype: bool + + To find duplicates on specific column(s), use ``subset``. 
+ + >>> df.duplicated(subset=['brand']).execute() + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool + """ + + if method not in ("auto", "tree", "subset_tree", "shuffle", None): + raise ValueError( + "method could only be one of " + "'auto', 'tree', 'subset_tree', 'shuffle' or None" + ) + subset = validate_subset(df, subset) + op = DataFrameDuplicated(subset=subset, keep=keep, method=method) + return op(df) + + +def series_duplicated(series, keep="first", method="auto"): + """ + Indicate duplicate Series values. + + Duplicated values are indicated as ``True`` values in the resulting + Series. Either all duplicates, all except the first or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + Method to handle dropping duplicates: + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + Series + Series indicating whether each value has occurred in the + preceding values. + + See Also + -------- + Index.duplicated : Equivalent method on pandas.Index. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Series.drop_duplicates : Remove duplicate values from Series. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set on False and all others on True: + + >>> import mars.dataframe as md + + >>> animals = md.Series(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> animals.duplicated().execute() + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + which is equivalent to + + >>> animals.duplicated(keep='first').execute() + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> animals.duplicated(keep='last').execute() + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + By setting keep on ``False``, all duplicates are True: + + >>> animals.duplicated(keep=False).execute() + 0 True + 1 False + 2 True + 3 False + 4 True + dtype: bool + """ + if method not in ("auto", "tree", "shuffle", None): + raise ValueError( + "method could only be one of 'auto', 'tree', 'shuffle' or None" + ) + op = DataFrameDuplicated(keep=keep, method=method) + return op(series) + + +def index_duplicated(index, keep="first"): + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + Tensor + + See Also + -------- + Series.duplicated : Equivalent method on pandas.Series. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Index.drop_duplicates : Remove duplicate values from Index. 
+ + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> import mars.dataframe as md + + >>> idx = md.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated().execute() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first').execute() + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last').execute() + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False).execute() + array([ True, False, True, False, True]) + """ + return index.to_series().duplicated(keep=keep).to_tensor() diff --git a/python/xorbits/_mars/dataframe/base/eval.py b/python/xorbits/_mars/dataframe/base/eval.py new file mode 100644 index 000000000..dc7172d1c --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/eval.py @@ -0,0 +1,836 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ast +import binascii +import operator +import sys +import textwrap +import tokenize +from collections import OrderedDict +from functools import reduce +from io import StringIO + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import ENTITY_TYPE, OutputType, get_output_types, recursive_tile +from ...serialization.serializables import BoolField, DictField, StringField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +LOCAL_TAG = "_local_var_" +BACKTICK_TAG = "_backtick_var_" + + +def _tokenize_str(reader): + token_generator = tokenize.generate_tokens(reader) + + def _iter_backtick_string(gen, line, back_start): + for _, tokval, start, _, _ in gen: + if tokval == "`": + return ( + BACKTICK_TAG + + binascii.b2a_hex( + line[back_start[1] + 1 : start[1]].encode() + ).decode() + ) + else: + raise SyntaxError(f"backtick quote at {back_start} does not match") + + for toknum, tokval, start, _, line in token_generator: + if toknum == tokenize.OP: + if tokval == "@": + tokval = LOCAL_TAG + if tokval == "&": + toknum = tokenize.NAME + tokval = "and" + elif tokval == "|": + toknum = tokenize.NAME + tokval = "or" + elif tokval == "`": + yield tokenize.NAME, _iter_backtick_string(token_generator, line, start) + continue + yield toknum, tokval + + +class CollectionVisitor(ast.NodeVisitor): + _op_handlers = { + ast.Add: operator.add, + ast.Sub: operator.sub, + ast.Mult: operator.mul, + ast.Div: operator.truediv, + ast.FloorDiv: operator.floordiv, + ast.mod: operator.mod, + ast.Pow: operator.pow, + ast.Eq: operator.eq, + ast.NotEq: operator.ne, + ast.Lt: operator.lt, + ast.LtE: operator.le, + ast.Gt: operator.gt, + ast.GtE: operator.ge, + ast.In: lambda x, y: y.isin(x), + ast.NotIn: lambda x, y: ~y.isin(x), + ast.UAdd: operator.pos, + ast.USub: operator.neg, + ast.Invert: operator.invert, + ast.And: operator.and_, + ast.Or: operator.or_, + } + + def __init__(self, resolvers, target, env): + self.env = env + self.target = target + self.resolvers = resolvers + + self.referenced_vars = set() + self.assigned = False + self.entity_subscribe = False + + def _preparse(self, expr): + reader = StringIO(expr).readline + return tokenize.untokenize(list(_tokenize_str(reader))) + + def eval(self, expr, rewrite=True): + if rewrite: + expr = self._preparse(expr) + node = ast.fix_missing_locations(ast.parse(expr)) + return self.visit(node) + + def get_named_object(self, obj_name): + for resolver in self.resolvers: + try: + return resolver[obj_name] + except (IndexError, KeyError): + continue + if obj_name in self.env: + self.referenced_vars.add(obj_name) + return self.env[obj_name] + raise KeyError(f"name {obj_name} is not defined") + + def visit(self, node): + if isinstance(node, ENTITY_TYPE): + return node + node_name = node.__class__.__name__ + method = "visit_" + node_name + try: + visitor = getattr(self, method) + except AttributeError: + raise SyntaxError( + "Query string contains unsupported syntax: {}".format(node_name) + ) + return visitor(node) + + def visit_Module(self, node): + if self.target is None and len(node.body) != 1: + raise SyntaxError("Only a single expression is allowed") + result = None + for expr in node.body: + result = self.visit(expr) + return result + + def visit_Expr(self, node): + return self.visit(node.value) + + def visit_BinOp(self, node): + left = self.visit(node.left) + right = self.visit(node.right) + return self._op_handlers[type(node.op)](left, right) + + def visit_Call(self, node): + func = self.visit(node.func) + args = [self.visit(n) for n in node.args] + kwargs = OrderedDict([(kw.arg, self.visit(kw.value)) for kw in node.keywords]) + return func(*args, **kwargs) + + def visit_Compare(self, node): + ops = node.ops + comps = 
node.comparators + + if len(comps) == 1: + binop = ast.BinOp(op=ops[0], left=node.left, right=comps[0]) + return self.visit(binop) + + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = ast.Compare(comparators=[comp], left=left, ops=[op]) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def visit_BoolOp(self, node): + def func(lhs, rhs): + binop = ast.BinOp(op=node.op, left=lhs, right=rhs) + return self.visit(binop) + + return reduce(func, node.values) + + def visit_UnaryOp(self, node): + op = self.visit(node.operand) + return self._op_handlers[type(node.op)](op) + + def visit_Name(self, node): + if node.id.startswith(LOCAL_TAG): + local_name = node.id.replace(LOCAL_TAG, "") + self.referenced_vars.add(local_name) + return self.env[local_name] + if node.id.startswith(BACKTICK_TAG): + local_name = binascii.a2b_hex( + node.id.replace(BACKTICK_TAG, "").encode() + ).decode() + return self.get_named_object(local_name) + return self.get_named_object(node.id) + + def visit_NameConstant(self, node): # pragma: no cover + return node.value + + def visit_Num(self, node): # pragma: no cover + return node.n + + def visit_Str(self, node): # pragma: no cover + return node.s + + def visit_Constant(self, node): + return node.value + + def visit_List(self, node): + return [self.visit(e) for e in node.elts] + + def visit_Assign(self, node): + if self.target is None: + raise ValueError("Target not specified for assignment") + if isinstance(node.targets[0], ast.Tuple): + raise ValueError("Does not support assigning to multiple objects") + + target = node.targets[0].id + value = self.visit(node.value) + self.target[target] = value + self.assigned = True + + visit_Tuple = visit_List + + def visit_Attribute(self, node): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + resolved = self.visit(value) + return getattr(resolved, attr) + + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + + def visit_Subscript(self, node): + value = self.visit(node.value) + sub = self.visit(node.slice) + if isinstance(value, ENTITY_TYPE): + self.entity_subscribe = True + return value[sub] + + def visit_Index(self, node): + return self.visit(node.value) + + def visit_Slice(self, node): + lower = node.lower + if lower is not None: + lower = self.visit(lower) + upper = node.upper + if upper is not None: + upper = self.visit(upper) + step = node.step + if step is not None: + step = self.visit(step) + + return slice(lower, upper, step) + + +class DataFrameEval(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.DATAFRAME_EVAL + + _expr = StringField("expr") + _parser = StringField("parser") + _engine = StringField("engine") + _variables = DictField("variables") + _self_target = BoolField("self_target") + _is_query = BoolField("is_query") + + def __init__( + self, + expr=None, + parser=None, + engine=None, + variables=None, + self_target=None, + is_query=None, + **kw, + ): + super().__init__( + _expr=expr, + _parser=parser, + _engine=engine, + _variables=variables, + _self_target=self_target, + _is_query=is_query, + **kw, + ) + + @property + def expr(self): + return self._expr + + @property + def parser(self): + return self._parser + + @property + def engine(self): + return self._engine + + @property + def variables(self): + return self._variables + + @property + def self_target(self): + return self._self_target + + @property + def is_query(self): + return self._is_query + + def 
__call__(self, df, output_type, shape, dtypes): + self._output_types = [output_type] + params = df.params + new_index_value = ( + df.index_value if not np.isnan(shape[0]) else parse_index(pd.RangeIndex(-1)) + ) + if output_type == OutputType.dataframe: + params.update( + dict( + dtypes=dtypes, + shape=shape, + columns_value=parse_index(dtypes.index, store_data=True), + index_value=new_index_value, + ) + ) + else: + name, dtype = dtypes + params = dict( + name=name, + dtype=dtype, + shape=shape, + index_value=new_index_value, + ) + return self.new_tileable([df], **params) + + def convert_to_query(self, df, output_type, shape, dtypes): + new_op = self.copy().reset_key() + new_op._is_query = True + new_op._self_target = False + return new_op(df, output_type, shape, dtypes) + + @classmethod + def tile(cls, op: "DataFrameEval"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + if in_df.ndim == 2: + if in_df.chunk_shape[1] > 1: + in_df = yield from recursive_tile(in_df.rechunk({1: in_df.shape[1]})) + + chunks = [] + for c in in_df.chunks: + if out_df.ndim == 2: + new_shape = ( + np.nan if np.isnan(out_df.shape[0]) else c.shape[0], + out_df.shape[1], + ) + params = dict( + dtypes=out_df.dtypes, + shape=new_shape, + columns_value=parse_index(out_df.dtypes.index, store_data=True), + index_value=c.index_value, + index=c.index, + ) + else: + new_shape = (np.nan if np.isnan(out_df.shape[0]) else c.shape[0],) + params = dict( + name=out_df.name, + dtype=out_df.dtype, + shape=new_shape, + index_value=c.index_value, + index=(c.index[0],), + ) + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + params = out_df.params + + new_nsplits = [in_df.nsplits[0], (out_df.shape[-1],)] + if np.isnan(out_df.shape[0]): + new_nsplits[0] = (np.nan,) * len(in_df.nsplits[0]) + if out_df.ndim == 1: + new_nsplits = new_nsplits[:1] + + params.update( + dict( + chunks=chunks, + nsplits=tuple(new_nsplits), + ) + ) + return new_op.new_tileables([in_df], **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameEval"): + in_data = ctx[op.inputs[0].key] + + if op.self_target: + in_data = in_data.copy() + + if op.is_query: + val = in_data.query( + op.expr, parser=op.parser, engine=op.engine, local_dict=op.variables + ) + else: + val = in_data.eval( + op.expr, parser=op.parser, engine=op.engine, local_dict=op.variables + ) + ctx[op.outputs[0].key] = val + + +def mars_eval( + expr, + parser="mars", + engine=None, + local_dict=None, + global_dict=None, + resolvers=(), + level=0, + target=None, + inplace=False, +): + """ + + Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + Parameters + ---------- + expr : str + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. 
+ global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + target : object, optional, default None + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace : bool, default False + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns + ------- + ndarray, numeric scalar, DataFrame, Series + + Raises + ------ + ValueError + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. + - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. 
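The tokenizer in this module also rewrites the ``@`` prefix (see ``LOCAL_TAG``
above), so local Python variables can be referenced inside the expression.
A minimal sketch, assuming an interactive mars session where ``factor`` is
visible in the calling frame:

>>> import mars.dataframe as md
>>> df = md.DataFrame({"age": [10, 20]})
>>> factor = 3
>>> md.eval("df.age * @factor").execute()
0    30
1    60
Name: age, dtype: int64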
+ + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df.execute() + animal age + 0 dog 10 + 1 pig 20 + + We can add a new column using ``pd.eval``: + + >>> md.eval("double_age = df.age * 2", target=df).execute() + animal age double_age + 0 dog 10 20 + 1 pig 20 40 + """ + if not isinstance(expr, str): + raise TypeError("expr must be a string") + + expr = textwrap.dedent(expr) + + try: + frame = sys._getframe(level + 1) + local_dict = local_dict or dict() + local_dict.update(frame.f_locals) + global_dict = global_dict or dict() + global_dict.update(frame.f_globals) + finally: + del frame + + env = dict() + env.update(global_dict) + env.update(local_dict) + + ref_frames = set(resolvers) | set([target] if target is not None else []) + self_target = len(resolvers) > 0 and resolvers[0] is target + + if target is not None and not inplace: + target = target.copy() + + visitor = CollectionVisitor(resolvers, target, env) + result = visitor.eval(expr) + result = result if result is not None else target + has_var_frame = any( + isinstance(env[k], ENTITY_TYPE) for k in visitor.referenced_vars + ) + if len(ref_frames) != 1 or visitor.entity_subscribe or has_var_frame: + if parser != "mars": + raise NotImplementedError("Does not support parser names other than mars") + if engine is not None: + raise NotImplementedError("Does not support specifying engine names") + return result + else: + parser = "pandas" if parser == "mars" else parser + referenced_env = {k: env[k] for k in visitor.referenced_vars} + op = DataFrameEval( + expr, + parser=parser, + engine=engine, + variables=referenced_env, + self_target=visitor.assigned and self_target, + is_query=False, + ) + output_type = get_output_types(result)[0] + dtypes = result.dtypes if result.ndim == 2 else (result.name, result.dtype) + return op(resolvers[0], output_type, result.shape, dtypes) + + +def df_eval(df, expr, inplace=False, **kwargs): + """ + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, or pandas object + The result of the evaluation. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.assign : Can evaluate an expression or function to create new + values for a column. + eval : Evaluate a Python expression as a string using various + backends. + + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + `. 
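A non-assigning expression evaluates to an ordinary mars Series, so it can be
used directly as a boolean mask; a minimal sketch (assuming an active mars
session):

>>> import mars.dataframe as md
>>> df = md.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
>>> df[df.eval('A > B')].execute()
   A  B
4  5  2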
+ + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df.execute() + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B').execute() + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B').execute() + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df.execute() + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df.execute() + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval(''' + ... C = A + B + ... D = A - B + ... ''').execute() + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + level = kwargs.pop("level", None) or 0 + kwargs["inplace"] = inplace + val = mars_eval(expr, resolvers=(df,), target=df, level=level + 1, **kwargs) + if not inplace: + return val + + +def df_query(df, expr, inplace=False, **kwargs): + """ + Query the columns of a DataFrame with a boolean expression. + + Parameters + ---------- + expr : str + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that contain spaces or operators by + surrounding them in backticks. This way you can also escape + names that start with a digit, or those that are a Python keyword. + Basically when it is not valid Python identifier. See notes down + for more details. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + inplace : bool + Whether the query should modify the data in place or return + a modified copy. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. + + Returns + ------- + DataFrame + DataFrame resulting from the provided query expression. + + See Also + -------- + eval : Evaluate a string describing operations on + DataFrame columns. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`eval` function to + evaluate the passed query. + + The :meth:`~pandas.DataFrame.query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. 
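A small plain-pandas sketch of the parser note above (the column names are illustrative): with the default parser, `and`/`or` act like the element-wise `&`/`|`, while `parser='python'` keeps ordinary Python semantics, so the bitwise operators are written out explicitly.

import pandas as pd

df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)})

# Default parser: the boolean keywords behave like their bitwise cousins.
r1 = df.query("A > 2 and B > 4")
# Python parser: spell the element-wise operators out.
r2 = df.query("(A > 2) & (B > 4)", parser="python")
assert r1.equals(r2)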
+ + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for the frame index; you can also + use the name of the index to identify it in a query. Please note that + Python keywords may not be used as identifiers. + + For further details and examples see the ``query`` documentation in + :ref:`indexing `. + + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) + >>> df.execute() + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.query('A > B').execute() + A B C C + 4 5 2 6 + + The previous expression is equivalent to + + >>> df[df.A > df.B].execute() + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`').execute() + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']].execute() + A B C C + 0 1 10 10 + """ + level = kwargs.pop("level", None) or 0 + predicate = mars_eval(expr, resolvers=(df,), level=level + 1, **kwargs) + result = df[predicate] + + if isinstance(predicate.op, DataFrameEval): + output_type = get_output_types(result)[0] + dtypes = result.dtypes if result.ndim == 2 else (result.name, result.dtype) + result = predicate.op.convert_to_query(df, output_type, result.shape, dtypes) + + if inplace: + df.data = result.data + else: + return result diff --git a/python/xorbits/_mars/dataframe/base/explode.py b/python/xorbits/_mars/dataframe/base/explode.py new file mode 100644 index 000000000..e8bacf35f --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/explode.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField +from ...utils import calc_nsplits +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, standardize_range_index + + +class DataFrameExplode(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.EXPLODE + + _column = AnyField("column") + _ignore_index = BoolField("ignore_field") + + def __init__(self, column=None, ignore_index=None, output_types=None, **kw): + super().__init__( + _column=column, _ignore_index=ignore_index, _output_types=output_types, **kw + ) + + @property + def column(self): + return self._column + + @property + def ignore_index(self): + return self._ignore_index + + def _rewrite_params(self, in_obj): + params = in_obj.params.copy() + new_shape = list(in_obj.shape) + new_shape[0] = np.nan + params["shape"] = tuple(new_shape) + + if self.ignore_index: + params["index_value"] = parse_index( + pd.RangeIndex(-1), (in_obj.key, in_obj.index_value.key) + ) + else: + params["index_value"] = parse_index( + None, (in_obj.key, in_obj.index_value.key) + ) + return params + + def __call__(self, df_or_series): + return self.new_tileable([df_or_series], **self._rewrite_params(df_or_series)) + + @classmethod + def tile(cls, op: "DataFrameExplode"): + in_obj = op.inputs[0] + + if in_obj.ndim == 2 and in_obj.chunk_shape[1] > 1: + # make sure data's second dimension has only 1 chunk + in_obj = yield from recursive_tile(in_obj.rechunk({1: in_obj.shape[1]})) + + chunks = [] + for chunk in in_obj.chunks: + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([chunk], **op._rewrite_params(chunk))) + + if op.ignore_index: + yield chunks + chunks = standardize_range_index(chunks) + + new_op = op.copy().reset_key() + out_params = op.outputs[0].params.copy() + out_params["chunks"] = chunks + out_params["nsplits"] = calc_nsplits({c.index: c.shape for c in chunks}) + return new_op.new_tileable([in_obj], kws=[out_params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameExplode"): + in_data = ctx[op.inputs[0].key] + if in_data.ndim == 2: + ctx[op.outputs[0].key] = in_data.explode(op.column) + else: + ctx[op.outputs[0].key] = in_data.explode() + + +def df_explode(df, column, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index values. + + Parameters + ---------- + column : str or tuple + Column to explode. + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + + Raises + ------ + ValueError : + if columns of the frame are not unique. + + See Also + -------- + DataFrame.unstack : Pivot a level of the (necessarily hierarchical) + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + Series.explode : Explode a DataFrame from list-like columns to long format. 
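The tile method earlier in this file explodes each row chunk independently; a minimal plain-pandas sketch of why that is sound (the data is illustrative): splitting a frame into row blocks, exploding each block, and concatenating the pieces gives the same result as exploding the whole frame.

import pandas as pd

df = pd.DataFrame({"A": [[1, 2, 3], "foo", [], [3, 4]], "B": 1})

# Explode two row blocks separately, as the per-chunk execute does ...
parts = [df.iloc[:2].explode("A"), df.iloc[2:].explode("A")]
# ... and the concatenated result matches exploding the whole frame.
assert pd.concat(parts).equals(df.explode("A"))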
+ + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + >>> df.execute() + A B + 0 [1, 2, 3] 1 + 1 foo 1 + 2 [] 1 + 3 [3, 4] 1 + + >>> df.explode('A').execute() + A B + 0 1 1 + 0 2 1 + 0 3 1 + 1 foo 1 + 2 NaN 1 + 3 3 1 + 3 4 1 + """ + op = DataFrameExplode( + column=column, ignore_index=ignore_index, output_types=[OutputType.dataframe] + ) + return op(df) + + +def series_explode(series, ignore_index=False): + """ + Transform each element of a list-like to a row. + + Parameters + ---------- + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series + Exploded lists to rows; index will be duplicated for these rows. + + See Also + -------- + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex + to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s.execute() + 0 [1, 2, 3] + 1 foo + 2 [] + 3 [3, 4] + dtype: object + + >>> s.explode().execute() + 0 1 + 0 2 + 0 3 + 1 foo + 2 NaN + 3 3 + 3 4 + dtype: object + """ + op = DataFrameExplode(ignore_index=ignore_index, output_types=[OutputType.series]) + return op(series) diff --git a/python/xorbits/_mars/dataframe/base/get_dummies.py b/python/xorbits/_mars/dataframe/base/get_dummies.py new file mode 100644 index 000000000..b81da2cb8 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/get_dummies.py @@ -0,0 +1,360 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
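A short plain-pandas sketch of the `ignore_index` option documented above (the keyword needs a recent pandas; the data is illustrative): exploded rows are relabeled 0..n-1 instead of repeating the source labels, which is why the tiled version re-numbers chunks with `standardize_range_index`.

import pandas as pd

df = pd.DataFrame({"A": [[1, 2, 3], "foo", [], [3, 4]], "B": 1})

# Without ignore_index the source labels repeat; with it the result is 0..n-1.
print(df.explode("A").index.tolist())                     # [0, 0, 0, 1, 2, 3, 3]
print(df.explode("A", ignore_index=True).index.tolist())  # [0, 1, 2, 3, 4, 5, 6]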
+ +import numpy as np +import pandas as pd + +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField, ListField, StringField +from ..core import SERIES_TYPE +from ..datasource.dataframe import from_pandas as from_pandas_df +from ..datasource.series import from_pandas as from_pandas_series +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..reduction.unique import unique +from ..utils import gen_unknown_index_value + +_encoding_dtype_kind = ["O", "S", "U"] + + +class DataFrameGetDummies(DataFrameOperand, DataFrameOperandMixin): + prefix = AnyField("prefix") + prefix_sep = StringField("prefix_sep") + dummy_na = BoolField("dummy_na") + columns = ListField("columns") + sparse = BoolField("sparse") + drop_first = BoolField("drop_first") + dtype = AnyField("dtype") + + def __init__( + self, + prefix=None, + prefix_sep=None, + dummy_na=None, + columns=None, + sparse=None, + drop_first=None, + dtype=None, + **kws, + ): + super().__init__( + prefix=prefix, + prefix_sep=prefix_sep, + dummy_na=dummy_na, + columns=columns, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + **kws, + ) + self.output_types = [OutputType.dataframe] + + @classmethod + def tile(cls, op): + inp = op.inputs[0] + out = op.outputs[0] + if len(inp.chunks) == 1: + chunk_op = op.copy().reset_key() + chunk_param = out.params + chunk_param["index"] = (0, 0) + chunk = chunk_op.new_chunk(inp.chunks, kws=[chunk_param]) + new_op = op.copy().reset_key() + param = out.params + param["chunks"] = [chunk] + param["nsplits"] = ((np.nan,), (np.nan,)) + return new_op.new_dataframe(op.inputs, kws=[param]) + elif isinstance(inp, SERIES_TYPE): + unique_inp = yield from recursive_tile(unique(inp)) + chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_param = out.params + chunk_param["index_value"] = gen_unknown_index_value(c.index_value) + chunk_param["index"] = (c.index[0], 0) + chunk = chunk_op.new_chunk([c] + unique_inp.chunks, kws=[chunk_param]) + chunks.append(chunk) + + new_op = op.copy().reset_key() + param = out.params + param["chunks"] = chunks + param["nsplits"] = (tuple([np.nan] * inp.chunk_shape[0]), (np.nan,)) + return new_op.new_dataframe(op.inputs, kws=[param]) + else: + if op.columns: + encoding_columns = op.columns + else: + encoding_columns = [] + for idx, dtype in enumerate(inp.dtypes.values): + if dtype.kind in _encoding_dtype_kind: + column = inp.dtypes.index[idx] + encoding_columns.append(column) + # reindex, make encoding columns in the end of dataframe, to keep pace with pandas.get_dummies + total_columns = list(inp.columns.to_pandas().array) + for col in encoding_columns: + total_columns.remove(col) + total_columns.extend(encoding_columns) + inp = yield from recursive_tile(inp[total_columns]) + + unique_chunks = dict() + for col in encoding_columns: + unique_chunks[col] = yield from recursive_tile(unique(inp[col])) + + chunks = [] + prefix = op.prefix + column_to_prefix = dict() + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op.columns = [] + if isinstance(chunk_op.prefix, list): + chunk_op.prefix = [] + chunk_param = c.params + chunk_param["shape"] = (np.nan, np.nan) + chunk_columns = c.dtypes.index + inp_chunk = [c] + for chunk_column in chunk_columns: + if chunk_column in encoding_columns: + chunk_op.columns.append(chunk_column) + inp_chunk.extend(unique_chunks[chunk_column].chunks) + if isinstance(prefix, list): + if chunk_column in 
column_to_prefix.keys(): + chunk_op.prefix.append(column_to_prefix[chunk_column]) + else: + column_to_prefix[chunk_column] = prefix[0] + chunk_op.prefix.append(prefix[0]) + prefix = prefix[1:] + chunk = chunk_op.new_chunk(inp_chunk, kws=[chunk_param]) + chunks.append(chunk) + + new_op = op.copy() + kw = out.params.copy() + kw["chunks"] = chunks + kw["nsplits"] = ( + tuple([np.nan] * inp.chunk_shape[0]), + tuple([np.nan] * inp.chunk_shape[1]), + ) + return new_op.new_dataframe(op.inputs, kws=[kw]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.inputs[0].key] + result_length = inp.shape[0] + unique_inputs = [] + for unique_input in op.inputs[1:]: + unique_inputs.append(ctx[unique_input.key].tolist()) + + if unique_inputs: + if isinstance(inp, pd.Series): + extra_series = pd.Series(unique_inputs[0]) + inp = pd.concat([inp, extra_series]) + else: + # make all unique_input's length the same, then get a dataframe + max_length = len(max(unique_inputs, key=len)) + unique_inputs = [ + unique_list + [unique_list[0]] * (max_length - len(unique_list)) + for unique_list in unique_inputs + ] + extra_dataframe = pd.DataFrame(dict(zip(op.columns, unique_inputs))) + + # add the columns that need not to encode, to concat extra_dataframe and inp + total_columns = list(inp.columns.array) + for col in op.columns: + total_columns.remove(col) + remain_columns = total_columns + not_encode_columns = [] + if len(remain_columns) > 0: + for col in remain_columns: + not_encode_columns.append([inp[col].iloc[0]] * max_length) + not_encode_dataframe = pd.DataFrame( + dict(zip(remain_columns, not_encode_columns)) + ) + + extra_dataframe = pd.concat( + [not_encode_dataframe, extra_dataframe], axis=1 + ) + inp = pd.concat([inp, extra_dataframe], axis=0) + + result = pd.get_dummies( + inp, + op.prefix, + op.prefix_sep, + op.dummy_na, + op.columns, + op.sparse, + op.drop_first, + op.dtype, + ) + ctx[op.outputs[0].key] = result.iloc[:result_length] + + def __call__(self, data): + if isinstance(data, (list, tuple)): + data = asseries(data) + elif isinstance(data, pd.Series): + data = from_pandas_series(data) + elif isinstance(data, pd.DataFrame): + data = from_pandas_df(data) + + if self.prefix is not None: + if isinstance(self.prefix, list): + if self.columns is not None: + encoding_col_num = len(self.columns) + else: + encoding_col_num = 0 + for dtype in data.dtypes.values: + if dtype.kind in _encoding_dtype_kind: + encoding_col_num += 1 + prefix_num = len(self.prefix) + if prefix_num != encoding_col_num: + raise ValueError( + f"Length of 'prefix' ({prefix_num}) did not match " + + f"the length of the columns being encoded ({encoding_col_num})" + ) + elif isinstance(self.prefix, dict): + if self.columns is not None: + encoding_col_num = len(self.columns) + prefix_num = len(self.prefix) + if prefix_num != encoding_col_num: + raise ValueError( + f"Length of 'prefix' ({prefix_num}) did not match " + + f"the length of the columns being encoded ({encoding_col_num})" + ) + columns = self.prefix.keys() + for columns_columnname, prefix_columnname in zip( + columns, list(self.columns) + ): + if columns_columnname != prefix_columnname: + raise KeyError(f"{columns_columnname}") + else: + self.columns = list(self.prefix.keys()) + # Convert prefix from dict to list, to simplify tile work + self.prefix = list(self.prefix.values()) + + return self.new_dataframe( + [data], + shape=(np.nan, np.nan), + dtypes=None, + index_value=data.index_value, + columns_value=None, + ) + + +def get_dummies( + data, + prefix=None, + 
prefix_sep="_", + dummy_na=False, + columns=None, + sparse=False, + drop_first=False, + dtype=None, +): + """ + Convert categorical variable into dummy/indicator variables. + + Parameters + ---------- + data : array-like, Series, or DataFrame + Data of which to get dummy indicators. + prefix : str, list of str, or dict of str, default None + String to append DataFrame column names. + Pass a list with length equal to the number of columns + when calling get_dummies on a DataFrame. Alternatively, `prefix` + can be a dictionary mapping column names to prefixes. + prefix_sep : str, default '_' + If appending prefix, separator/delimiter to use. Or pass a + list or dictionary as with `prefix`. + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. + columns : list-like, default None + Column names in the DataFrame to be encoded. + If `columns` is None then all the columns with + `object` or `category` dtype will be converted. + sparse : bool, default False + Whether the dummy-encoded columns should be backed by + a :class:`SparseArray` (True) or a regular NumPy array (False). + drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + dtype : dtype, default np.uint8 + Data type for new columns. Only a single dtype is allowed. + + Returns + ------- + DataFrame + Dummy-coded data. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(list('abca')) + + >>> md.get_dummies(s).execute() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> s1 = ['a', 'b', np.nan] + + >>> md.get_dummies(s1).execute() + a b + 0 1 0 + 1 0 1 + 2 0 0 + + >>> md.get_dummies(s1, dummy_na=True).execute() + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + + >>> df = md.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], + ... 'C': [1, 2, 3]}) + + >>> md.get_dummies(df, prefix=['col1', 'col2']).execute() + C col1_a col1_b col2_a col2_b col2_c + 0 1 1 0 0 1 0 + 1 2 0 1 1 0 0 + 2 3 1 0 0 0 1 + + >>> md.get_dummies(pd.Series(list('abcaa'))).execute() + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + 4 1 0 0 + + >>> md.get_dummies(pd.Series(list('abcaa')), drop_first=True).execute() + b c + 0 0 0 + 1 1 0 + 2 0 1 + 3 0 0 + 4 0 0 + + >>> md.get_dummies(pd.Series(list('abc')), dtype=float).execute() + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + if columns is not None and not isinstance(columns, list): + raise TypeError("Input must be a list-like for parameter `columns`") + + op = DataFrameGetDummies( + prefix, prefix_sep, dummy_na, columns, sparse, drop_first, dtype + ) + + return op(data) diff --git a/python/xorbits/_mars/dataframe/base/isin.py b/python/xorbits/_mars/dataframe/base/isin.py new file mode 100644 index 000000000..d6737dff6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/isin.py @@ -0,0 +1,401 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
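A minimal plain-pandas sketch of the chunk-level trick used by `DataFrameGetDummies.execute` above (the column and category names are illustrative): each chunk is padded with the globally collected unique values so every chunk emits the same dummy columns, and the padding rows are sliced off afterwards.

import pandas as pd

chunk = pd.DataFrame({"A": ["a", "b"]})  # one row chunk of a larger frame
all_uniques = ["a", "b", "c"]            # uniques gathered across all chunks

# Pad the chunk with every category so its dummy columns line up with the
# other chunks, then drop the padding rows again.
padded = pd.concat([chunk, pd.DataFrame({"A": all_uniques})], axis=0)
dummies = pd.get_dummies(padded).iloc[: len(chunk)]
print(dummies.columns.tolist())  # ['A_a', 'A_b', 'A_c'] even though 'c' is absent here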
+ +import itertools + +import numpy as np +import pandas as pd +from pandas.api.types import is_list_like + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE +from ...serialization.serializables import AnyField, KeyField +from ...tensor.core import TENSOR_TYPE +from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from .drop_duplicates import DataFrameDropDuplicates + + +class DataFrameIsin(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.ISIN + + input = KeyField("input") + values = AnyField("values") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self.input = next(inputs_iter) + if len(self._inputs) > 1: + if isinstance(self.values, dict): + new_values = dict() + for k, v in self.values.items(): + if isinstance(v, ENTITY_TYPE): + new_values[k] = next(inputs_iter) + else: + new_values[k] = v + self.values = new_values + else: + self.values = self._inputs[1] + + def __call__(self, elements): + inputs = [elements] + if isinstance(self.values, ENTITY_TYPE): + inputs.append(self.values) + elif isinstance(self.values, dict): + for v in self.values.values(): + if isinstance(v, ENTITY_TYPE): + inputs.append(v) + + if elements.ndim == 1: + return self.new_series( + inputs, + shape=elements.shape, + dtype=np.dtype("bool"), + index_value=elements.index_value, + name=elements.name, + ) + else: + dtypes = pd.Series( + [np.dtype(bool) for _ in elements.dtypes], index=elements.dtypes.index + ) + return self.new_dataframe( + inputs, + shape=elements.shape, + index_value=elements.index_value, + columns_value=elements.columns_value, + dtypes=dtypes, + ) + + @classmethod + def _tile_entity_values(cls, op): + from ...core.context import get_context + from ...tensor.base.unique import TensorUnique + from ..arithmetic.bitwise_or import tree_dataframe_or + from ..utils import auto_merge_chunks + + in_elements = op.input + out_elements = op.outputs[0] + # values contains mars objects + chunks_list = [] + in_chunks = in_elements.chunks + if any(len(t.chunks) > 4 for t in op.inputs): + # yield and merge value chunks to reduce graph nodes + yield_chunks = [c for c in in_chunks] + unique_values = [] + for value in op.inputs[1:]: + if len(value.chunks) >= len(in_chunks) * 2: + # when value chunks is much more than in_chunks, + # we call drop_duplicates to reduce the amount of data. 
+ if isinstance(value, TENSOR_TYPE): + chunks = [ + TensorUnique( + return_index=False, + return_inverse=False, + return_counts=False, + ).new_chunk( + [c], index=c.index, shape=(np.nan,), dtype=c.dtype + ) + for c in value.chunks + ] + unique_values.append( + TensorUnique( + return_index=False, + return_inverse=False, + return_counts=False, + ).new_tensor( + [value], + chunks=chunks, + nsplits=((np.nan,) * len(chunks),), + shape=(np.nan,), + dtype=value.dtype, + ) + ) + yield_chunks += chunks + else: + # is series + chunks = [ + DataFrameDropDuplicates( + keep="first", + ignore_index=False, + method="tree", + output_types=[OutputType.series], + ).new_chunk( + [c], + index=c.index, + index_value=c.index_value, + name=c.name, + dtype=c.dtype, + shape=(np.nan,), + ) + for c in value.chunks + ] + unique_values.append( + DataFrameDropDuplicates( + keep="first", + ignore_index=False, + method="tree", + output_types=[OutputType.series], + ).new_series( + [value], + chunks=chunks, + nsplits=((np.nan,) * len(chunks),), + index_value=value.index_value, + dtype=value.dtype, + shape=(np.nan,), + ) + ) + yield_chunks += chunks + else: + yield_chunks += value.chunks + unique_values.append(value) + yield yield_chunks + in_elements = auto_merge_chunks(get_context(), op.input) + in_chunks = in_elements.chunks + for value in unique_values: + if isinstance(value, SERIES_TYPE): + merged = auto_merge_chunks(get_context(), value) + chunks_list.append(merged.chunks) + elif isinstance(value, ENTITY_TYPE): + chunks_list.append(value.chunks) + else: + for value in op.inputs[1:]: + if isinstance(value, ENTITY_TYPE): + chunks_list.append(value.chunks) + + out_chunks = [] + for in_chunk in in_chunks: + isin_chunks = [] + for value_chunks in itertools.product(*chunks_list): + input_chunks = [in_chunk] + list(value_chunks) + isin_chunks.append(cls._new_chunk(op, in_chunk, input_chunks)) + out_chunk = tree_dataframe_or(*isin_chunks, index=in_chunk.index) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_elements.params + params["nsplits"] = in_elements.nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + in_elements = op.input + out_elements = op.outputs[0] + + if len(op.inputs) > 1: + return (yield from cls._tile_entity_values(op)) + + out_chunks = [] + for chunk in in_elements.chunks: + out_chunk = cls._new_chunk(op, chunk, [chunk]) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_elements.params + params["nsplits"] = in_elements.nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _new_chunk(cls, op, chunk, input_chunks): + out_elements = op.outputs[0] + chunk_op = op.copy().reset_key() + if out_elements.ndim == 1: + out_chunk = chunk_op.new_chunk( + input_chunks, + shape=chunk.shape, + dtype=out_elements.dtype, + index_value=chunk.index_value, + name=out_elements.name, + index=chunk.index, + ) + else: + chunk_dtypes = pd.Series( + [np.dtype(bool) for _ in chunk.dtypes], index=chunk.dtypes.index + ) + out_chunk = chunk_op.new_chunk( + input_chunks, + shape=chunk.shape, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + dtypes=chunk_dtypes, + index=chunk.index, + ) + return out_chunk + + @classmethod + def execute(cls, ctx, op): + inputs_iter = iter(op.inputs) + elements = ctx[next(inputs_iter).key] + + if isinstance(op.values, dict): + values = dict() + for k, v in op.values.items(): + if isinstance(v, 
ENTITY_TYPE): + values[k] = ctx[next(inputs_iter).key] + else: + values[k] = v + else: + if isinstance(op.values, ENTITY_TYPE): + values = ctx[next(inputs_iter).key] + else: + values = op.values + + try: + ctx[op.outputs[0].key] = elements.isin(values) + except ValueError: + # buffer read-only + ctx[op.outputs[0].key] = elements.copy().isin(values.copy()) + + +def series_isin(elements, values): + """ + Whether elements in Series are contained in `values`. + + Return a boolean Series showing whether each element in the Series + matches an element in the passed sequence of `values` exactly. + + Parameters + ---------- + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. + + Returns + ------- + Series + Series of booleans indicating if each element is in values. + + Raises + ------ + TypeError + * If `values` is a string + + See Also + -------- + DataFrame.isin : Equivalent method on DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 'hippo'], name='animal') + >>> s.isin(['cow', 'lama']).execute() + 0 True + 1 True + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + + Passing a single string as ``s.isin('lama')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['lama']).execute() + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + """ + if is_list_like(values): + values = list(values) + elif not isinstance(values, (SERIES_TYPE, TENSOR_TYPE, INDEX_TYPE)): + raise TypeError( + "only list-like objects are allowed to be passed to isin(), " + f"you passed a [{type(values)}]" + ) + op = DataFrameIsin(values=values) + return op(elements) + + +def df_isin(df, values): + """ + Whether each element in the DataFrame is contained in values. + + Parameters + ---------- + values : iterable, Series, DataFrame or dict + The result will only be true at a location if all the + labels match. If `values` is a Series, that's the index. If + `values` is a dict, the keys must be the column names, + which must match. If `values` is a DataFrame, + then both the index and column labels must match. + + Returns + ------- + DataFrame + DataFrame of booleans showing whether each element in the DataFrame + is contained in values. + + See Also + -------- + DataFrame.eq: Equality test for DataFrame. + Series.isin: Equivalent method on Series. + Series.str.contains: Test if pattern or regex is contained within a + string of a Series or Index. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, + ... index=['falcon', 'dog']) + >>> df.execute() + num_legs num_wings + falcon 2 2 + dog 4 0 + + When ``values`` is a list check whether every value in the DataFrame + is present in the list (which animals have 0 or 2 legs or wings) + + >>> df.isin([0, 2]).execute() + num_legs num_wings + falcon True True + dog False True + + When ``values`` is a dict, we can pass values to check for each + column separately: + + >>> df.isin({'num_wings': [0, 3]}).execute() + num_legs num_wings + falcon False False + dog False True + + When ``values`` is a Series or DataFrame the index and column must + match. Note that 'falcon' does not match based on the number of legs + in df2. + + >>> other = md.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]}, + ... 
index=['spider', 'falcon']) + >>> df.isin(other).execute() + num_legs num_wings + falcon True True + dog False False + """ + if is_list_like(values) and not isinstance(values, dict): + values = list(values) + elif not isinstance( + values, (SERIES_TYPE, DATAFRAME_TYPE, TENSOR_TYPE, INDEX_TYPE, dict) + ): + raise TypeError( + "only list-like objects or dict are allowed to be passed to isin(), " + f"you passed a [{type(values)}]" + ) + op = DataFrameIsin(values=values) + return op(df) diff --git a/python/xorbits/_mars/dataframe/base/map.py b/python/xorbits/_mars/dataframe/base/map.py new file mode 100644 index 000000000..f1b583f53 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/map.py @@ -0,0 +1,308 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from collections.abc import MutableMapping + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import AnyField, KeyField, StringField +from ...utils import enter_current_session, has_unknown_shape, quiet_stdio +from ..core import SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_series + + +class DataFrameMap(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.MAP + + _input = KeyField("input") + _arg = AnyField("arg") + _na_action = StringField("na_action") + + def __init__( + self, arg=None, na_action=None, output_types=None, memory_scale=None, **kw + ): + super().__init__( + _arg=arg, + _na_action=na_action, + _output_types=output_types, + _memory_scale=memory_scale, + **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def arg(self): + return self._arg + + @property + def na_action(self): + return self._na_action + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(inputs) == 2: + self._arg = self._inputs[1] + + def __call__(self, series, dtype, skip_infer=False): + if dtype is None and not skip_infer: + inferred_dtype = None + if callable(self._arg): + # arg is a function, try to inspect the signature + sig = inspect.signature(self._arg) + return_type = sig.return_annotation + if return_type is not inspect._empty: + inferred_dtype = np.dtype(return_type) + else: + try: + with quiet_stdio(): + # try to infer dtype by calling the function + inferred_dtype = ( + build_series(series) + .map(self._arg, na_action=self._na_action) + .dtype + ) + except: # noqa: E722 # nosec + pass + else: + if isinstance(self._arg, MutableMapping): + inferred_dtype = pd.Series(self._arg).dtype + else: + inferred_dtype = self._arg.dtype + if inferred_dtype is not None and np.issubdtype(inferred_dtype, np.number): + if np.issubdtype(inferred_dtype, np.inexact): + # for the inexact e.g. 
float + # we can make the decision, + # but for int, due to the nan which may occur, + # we cannot infer the dtype + dtype = inferred_dtype + else: + dtype = inferred_dtype + + if dtype is None: + if not skip_infer: + raise ValueError( + "cannot infer dtype, it needs to be specified manually for `map`" + ) + else: + dtype = np.int64 if dtype is int else dtype + dtype = np.dtype(dtype) + + inputs = [series] + if isinstance(self._arg, SERIES_TYPE): + inputs.append(self._arg) + + if isinstance(series, SERIES_TYPE): + return self.new_series( + inputs, + shape=series.shape, + dtype=dtype, + index_value=series.index_value, + name=series.name, + ) + else: + return self.new_index( + inputs, + shape=series.shape, + dtype=dtype, + index_value=series.index_value, + name=series.name, + ) + + @classmethod + def tile(cls, op): + in_series = op.input + out_series = op.outputs[0] + + arg = op.arg + if len(op.inputs) == 2: + # make sure arg has known shape when it's a md.Series + if has_unknown_shape(op.arg): + yield + arg = yield from recursive_tile(op.arg.rechunk(op.arg.shape)) + + out_chunks = [] + for chunk in in_series.chunks: + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + chunk_inputs = [chunk] + if len(op.inputs) == 2: + chunk_inputs.append(arg.chunks[0]) + out_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=chunk.shape, + dtype=out_series.dtype, + index_value=chunk.index_value, + name=out_series.name, + index=chunk.index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_series.params + params["chunks"] = out_chunks + params["nsplits"] = in_series.nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + series = ctx[op.inputs[0].key] + out = op.outputs[0] + if len(op.inputs) == 2: + arg = ctx[op.inputs[1].key] + else: + arg = op.arg + + ret = series.map(arg, na_action=op.na_action) + if ret.dtype != out.dtype: + ret = ret.astype(out.dtype) + ctx[out.key] = ret + + +def series_map( + series, arg, na_action=None, dtype=None, memory_scale=None, skip_infer=False +): + """ + Map values of Series according to input correspondence. + + Used for substituting each value in a Series with another value, + that may be derived from a function, a ``dict`` or + a :class:`Series`. + + Parameters + ---------- + arg : function, collections.abc.Mapping subclass or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to the + mapping correspondence. + dtype : np.dtype, default None + Specify return type of the function. Must be specified when + we cannot decide the return type of the function. + memory_scale : float + Specify the scale of memory uses in the function versus + input size. + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified + + Returns + ------- + Series + Same index as caller. + + See Also + -------- + Series.apply : For applying more complex functions on a Series. + DataFrame.apply : Apply a function row-/column-wise. + DataFrame.applymap : Apply a function elementwise on a whole DataFrame. + + Notes + ----- + When ``arg`` is a dictionary, values in Series that are not in the + dictionary (as keys) are converted to ``NaN``. However, if the + dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. + provides a method for default values), then this default is used + rather than ``NaN``. 
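A tiny plain-pandas sketch of the `__missing__` note above (the class name and values are illustrative): keys absent from a plain dict map to NaN, but a dict subclass that defines `__missing__` supplies its default instead.

import pandas as pd

class WithDefault(dict):
    def __missing__(self, key):
        return "unknown"

s = pd.Series(["cat", "dog", "rabbit"])
print(s.map({"cat": "kitten"}).tolist())               # ['kitten', nan, nan]
print(s.map(WithDefault({"cat": "kitten"})).tolist())  # ['kitten', 'unknown', 'unknown']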
+ + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series(['cat', 'dog', mt.nan, 'rabbit']) + >>> s.execute() + 0 cat + 1 dog + 2 NaN + 3 rabbit + dtype: object + + ``map`` accepts a ``dict`` or a ``Series``. Values that are not found + in the ``dict`` are converted to ``NaN``, unless the dict has a default + value (e.g. ``defaultdict``): + + >>> s.map({'cat': 'kitten', 'dog': 'puppy'}).execute() + 0 kitten + 1 puppy + 2 NaN + 3 NaN + dtype: object + + It also accepts a function: + + >>> s.map('I am a {}'.format).execute() + 0 I am a cat + 1 I am a dog + 2 I am a nan + 3 I am a rabbit + dtype: object + + To avoid applying the function to missing values (and keep them as + ``NaN``) ``na_action='ignore'`` can be used: + + >>> s.map('I am a {}'.format, na_action='ignore').execute() + 0 I am a cat + 1 I am a dog + 2 NaN + 3 I am a rabbit + dtype: object + """ + op = DataFrameMap(arg=arg, na_action=na_action, memory_scale=memory_scale) + return op(series, dtype=dtype, skip_infer=skip_infer) + + +def index_map( + idx, mapper, na_action=None, dtype=None, memory_scale=None, skip_infer=False +): + """ + Map values using input correspondence (a dict, Series, or function). + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. + dtype : np.dtype, default None + Specify return type of the function. Must be specified when + we cannot decide the return type of the function. + memory_scale : float + Specify the scale of memory uses in the function versus + input size. + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified + + + Returns + ------- + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ + op = DataFrameMap(arg=mapper, na_action=na_action, memory_scale=memory_scale) + return op(idx, dtype=dtype, skip_infer=skip_infer) diff --git a/python/xorbits/_mars/dataframe/base/map_chunk.py b/python/xorbits/_mars/dataframe/base/map_chunk.py new file mode 100644 index 000000000..eff7185da --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/map_chunk.py @@ -0,0 +1,433 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import CHUNK_TYPE, ENTITY_TYPE, get_output_types, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FunctionField, + KeyField, + StringField, + TupleField, +) +from ...utils import ( + enter_current_session, + find_objects, + has_unknown_shape, + quiet_stdio, + replace_objects, +) +from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType +from ..utils import ( + build_df, + build_empty_df, + build_empty_series, + build_series, + clean_up_func, + parse_index, + restore_func, + validate_output_types, +) + + +class DataFrameMapChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.MAP_CHUNK + + _input = KeyField("input") + _func = FunctionField("func") + _args = TupleField("args") + _kwargs = DictField("kwargs") + _with_chunk_index = BoolField("with_chunk_index") + _logic_key = StringField("logic_key") + _func_key = AnyField("func_key") + _need_clean_up_func = BoolField("need_clean_up_func") + + def __init__( + self, + input=None, + func=None, + args=None, + kwargs=None, + output_types=None, + with_chunk_index=None, + logic_key=None, + func_key=None, + need_clean_up_func=False, + **kw, + ): + super().__init__( + _input=input, + _func=func, + _args=args, + _kwargs=kwargs, + _output_types=output_types, + _with_chunk_index=with_chunk_index, + _logic_key=logic_key, + _func_key=func_key, + _need_clean_up_func=need_clean_up_func, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def func(self): + return self._func + + @func.setter + def func(self, func): + self._func = func + + @property + def logic_key(self): + return self._logic_key + + @logic_key.setter + def logic_key(self, logic_key): + self._logic_key = logic_key + + @property + def func_key(self): + return self._func_key + + @func_key.setter + def func_key(self, func_key): + self._func_key = func_key + + @property + def need_clean_up_func(self): + return self._need_clean_up_func + + @need_clean_up_func.setter + def need_clean_up_func(self, need_clean_up_func: bool): + self._need_clean_up_func = need_clean_up_func + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return self._kwargs + + @property + def with_chunk_index(self): + return self._with_chunk_index + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + old_inputs = find_objects(self._args, ENTITY_TYPE) + find_objects( + self._kwargs, ENTITY_TYPE + ) + mapping = {o: n for o, n in zip(old_inputs, self._inputs[1:])} + self._args = replace_objects(self._args, mapping) + self._kwargs = replace_objects(self._kwargs, mapping) + self._input = self._inputs[0] + + def _infer_attrs_by_call(self, df_or_series): + test_obj = ( + build_df(df_or_series, size=2) + if df_or_series.ndim == 2 + else build_series(df_or_series, size=2, name=df_or_series.name) + ) + kwargs = self.kwargs or dict() + if self.with_chunk_index: + kwargs["chunk_index"] = (0,) * df_or_series.ndim + with np.errstate(all="ignore"), quiet_stdio(): + obj = self._func(test_obj, *self._args, **kwargs) + + if obj.ndim == 2: + output_type = OutputType.dataframe + dtypes = obj.dtypes + if obj.shape == test_obj.shape: + shape = (df_or_series.shape[0], len(dtypes)) + else: # pragma: no cover + shape = (np.nan, len(dtypes)) + else: + output_type = OutputType.series + dtypes = pd.Series([obj.dtype], name=obj.name) + if obj.shape == test_obj.shape: + shape = df_or_series.shape + else: + shape = 
(np.nan,) + + index_value = parse_index( + obj.index, df_or_series, self._func, self._args, self._kwargs + ) + return { + "output_type": output_type, + "index_value": index_value, + "shape": shape, + "dtypes": dtypes, + } + + def __call__(self, df_or_series, index=None, dtypes=None): + output_type = ( + self.output_types[0] + if self.output_types + else get_output_types(df_or_series)[0] + ) + shape = self._kwargs.pop("shape", None) + + if output_type == OutputType.df_or_series: + return self.new_df_or_series([df_or_series]) + elif dtypes is not None: + index = index if index is not None else pd.RangeIndex(-1) + index_value = parse_index( + index, df_or_series, self._func, self._args, self._kwargs + ) + if shape is None: # pragma: no branch + shape = ( + (np.nan,) + if output_type == OutputType.series + else (np.nan, len(dtypes)) + ) + else: + # try run to infer meta + try: + attrs = self._infer_attrs_by_call(df_or_series) + output_type = attrs["output_type"] + index_value = attrs["index_value"] + shape = attrs["shape"] + dtypes = attrs["dtypes"] + except: # noqa: E722 # nosec + raise TypeError( + "Cannot determine `output_type`, " + "you have to specify it as `dataframe` or `series`, " + "for dataframe, `dtypes` is required as well " + "if output_type='dataframe'" + ) + + inputs = ( + [df_or_series] + + find_objects(self.args, ENTITY_TYPE) + + find_objects(self.kwargs, ENTITY_TYPE) + ) + if output_type == OutputType.series: + return self.new_series( + inputs, + dtype=dtypes.iloc[0], + shape=shape, + index_value=index_value, + name=dtypes.name, + ) + else: + # dataframe + columns_value = parse_index(dtypes.index, store_data=True) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameMapChunk"): + clean_up_func(op) + inp = op.input + out = op.outputs[0] + out_type = op.output_types[0] + + if inp.ndim == 2 and inp.chunk_shape[1] > 1: + if has_unknown_shape(inp): + yield + # if input is a DataFrame, make sure 1 chunk on axis columns + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + arg_input_chunks = [] + for other_inp in op.inputs[1:]: + other_inp = yield from recursive_tile(other_inp.rechunk(other_inp.shape)) + arg_input_chunks.append(other_inp.chunks[0]) + + out_chunks = [] + if out_type == OutputType.dataframe: + nsplits = [[], [out.shape[1]]] + pd_out_index = out.index_value.to_pandas() + elif out_type == OutputType.series: + nsplits = [[]] + pd_out_index = out.index_value.to_pandas() + else: + # DataFrameOrSeries + nsplits = None + pd_out_index = None + for chunk in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + if out_type == OutputType.df_or_series: + if inp.ndim == 2: + collapse_axis = 1 + else: + collapse_axis = None + out_chunks.append( + chunk_op.new_chunk( + [chunk], index=chunk.index, collapse_axis=collapse_axis + ) + ) + elif out_type == OutputType.dataframe: + if np.isnan(out.shape[0]): + shape = (np.nan, out.shape[1]) + else: + shape = (chunk.shape[0], out.shape[1]) + index_value = parse_index(pd_out_index, chunk, op.key) + out_chunk = chunk_op.new_chunk( + [chunk] + arg_input_chunks, + shape=shape, + dtypes=out.dtypes, + index_value=index_value, + columns_value=out.columns_value, + index=(chunk.index[0], 0), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + else: + if np.isnan(out.shape[0]): + shape = (np.nan,) + else: + shape = (chunk.shape[0],) + index_value = 
parse_index(pd_out_index, chunk, op.key) + out_chunk = chunk_op.new_chunk( + [chunk] + arg_input_chunks, + shape=shape, + index_value=index_value, + name=out.name, + dtype=out.dtype, + index=(chunk.index[0],), + ) + out_chunks.append(out_chunk) + nsplits[0].append(out_chunk.shape[0]) + + params = out.params + params["nsplits"] = tuple(tuple(ns) for ns in nsplits) if nsplits else nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameMapChunk"): + restore_func(ctx, op) + inp = ctx[op.input.key] + out = op.outputs[0] + if len(inp) == 0: + if op.output_types[0] == OutputType.dataframe: + ctx[out.key] = build_empty_df(out.dtypes) + elif op.output_types[0] == OutputType.series: + ctx[out.key] = build_empty_series(out.dtype, name=out.name) + else: + raise ValueError(f"Chunk can not be empty except for dataframe/series.") + return + + kwargs = op.kwargs or dict() + if op.with_chunk_index: + kwargs["chunk_index"] = out.index + args = op.args or tuple() + chunks = find_objects(args, CHUNK_TYPE) + find_objects(kwargs, CHUNK_TYPE) + mapping = {chunk: ctx[chunk.key] for chunk in chunks} + args = replace_objects(args, mapping) + kwargs = replace_objects(kwargs, mapping) + ctx[out.key] = op.func(inp, *args, **kwargs) + + +def map_chunk(df_or_series, func, args=(), kwargs=None, skip_infer=False, **kw): + """ + Apply function to each chunk. + + Parameters + ---------- + func : function + Function to apply to each chunk. + args : tuple + Positional arguments to pass to func in addition to the array/series. + kwargs: Dict + Additional keyword arguments to pass as keywords arguments to func. + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + Returns + ------- + Series or DataFrame + Result of applying ``func`` to each chunk of the DataFrame or Series. + + See Also + -------- + DataFrame.apply : Perform any type of operations. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df.execute() + A B + 0 4 9 + 1 4 9 + 2 4 9 + + Output type including Series or DataFrame will be auto inferred. + + >>> df.map_chunk(lambda c: c['A'] + c['B']).execute() + 0 13 + 1 13 + 2 13 + dtype: int64 + + You can specify ``output_type`` by yourself if auto infer failed. + + >>> import pandas as pd + >>> import numpy as np + >>> df['c'] = ['s1', 's2', 's3'] + >>> df.map_chunk(lambda c: pd.concat([c['A'], c['c'].str.slice(1).astype(int)], axis=1)).execute() + Traceback (most recent call last): + TypeError: Cannot determine `output_type`, you have to specify it as `dataframe` or `series`... 
+ >>> df.map_chunk(lambda c: pd.concat([c['A'], c['c'].str.slice(1).astype(int)], axis=1), + >>> output_type='dataframe', dtypes=pd.Series([np.dtype(object), np.dtype(int)])).execute() + A c + 0 4 1 + 1 4 2 + 2 4 3 + """ + output_type = kw.pop("output_type", None) + output_types = kw.pop("output_types", None) + object_type = kw.pop("object_type", None) + output_types = validate_output_types( + output_type=output_type, output_types=output_types, object_type=object_type + ) + output_type = output_types[0] if output_types else None + if output_type: + output_types = [output_type] + elif skip_infer: + output_types = [OutputType.df_or_series] + index = kw.pop("index", None) + dtypes = kw.pop("dtypes", None) + with_chunk_index = kw.pop("with_chunk_index", False) + if kw: # pragma: no cover + raise TypeError(f"Unknown kwargs: {kw}") + + op = DataFrameMapChunk( + input=df_or_series, + func=func, + args=args, + kwargs=kwargs or {}, + output_types=output_types, + with_chunk_index=with_chunk_index, + ) + return op(df_or_series, index=index, dtypes=dtypes) diff --git a/python/xorbits/_mars/dataframe/base/melt.py b/python/xorbits/_mars/dataframe/base/melt.py new file mode 100644 index 000000000..21eba0d9d --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/melt.py @@ -0,0 +1,247 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, StringField +from ...utils import calc_nsplits +from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType +from ..utils import build_empty_df, parse_index, standardize_range_index + + +class DataFrameMelt(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.MELT + + _id_vars = AnyField("id_vars") + _value_vars = AnyField("value_vars") + _var_name = StringField("var_name") + _value_name = StringField("value_name") + _col_level = AnyField("col_level") + + def __init__( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name=None, + col_level=None, + **kw + ): + super().__init__( + _id_vars=id_vars, + _value_vars=value_vars, + _var_name=var_name, + _value_name=value_name, + _col_level=col_level, + **kw + ) + + @property + def id_vars(self): + return self._id_vars + + @property + def value_vars(self): + return self._value_vars + + @property + def var_name(self): + return self._var_name + + @property + def value_name(self): + return self._value_name + + @property + def col_level(self): + return self._col_level + + def __call__(self, df): + empty_result = build_empty_df(df.dtypes).melt( + id_vars=self.id_vars, + value_vars=self.value_vars, + var_name=self.var_name, + value_name=self.value_name, + col_level=self.col_level, + ) + self._output_types = [OutputType.dataframe] + return self.new_tileable( + [df], + shape=(np.nan, len(empty_result.columns)), + dtypes=empty_result.dtypes, + index_value=parse_index(pd.RangeIndex(-1), df.key, df.index_value.key), + columns_value=parse_index(empty_result.columns, store_data=True), + ) + + @classmethod + def tile(cls, op: "DataFrameMelt"): + inp = op.inputs[0] + out = op.outputs[0] + + inp = yield from recursive_tile(inp.rechunk({1: (inp.shape[1],)})) + + chunks = [] + for c in inp.chunks: + new_op = op.copy().reset_key() + chunks.append( + new_op.new_chunk( + [c], + index=c.index, + shape=(np.nan, out.shape[1]), + dtypes=out.dtypes, + index_value=parse_index( + pd.RangeIndex(-1), c.key, c.index_value.key + ), + columns_value=out.columns_value, + ) + ) + + yield chunks + chunks = standardize_range_index(chunks) + new_op = op.copy().reset_key() + return new_op.new_tileables( + [inp], + chunks=chunks, + nsplits=calc_nsplits({c.index: c.shape for c in chunks}), + **out.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameMelt"): + in_data = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = in_data.melt( + id_vars=op.id_vars, + value_vars=op.value_vars, + var_name=op.var_name, + value_name=op.value_name, + col_level=op.col_level, + ) + + +def melt( + frame, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, +): + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + .. versionadded:: 0.20.0 + + Parameters + ---------- + id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar + Name to use for the 'variable' column. 
If None it uses + ``frame.columns.name`` or 'variable'. + value_name : scalar, default 'value' + Name to use for the 'value' column. + col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. + + Returns + ------- + DataFrame + Unpivoted DataFrame. + + See Also + -------- + melt + pivot_table + DataFrame.pivot + Series.explode + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, + ... 'B': {0: 1, 1: 3, 2: 5}, + ... 'C': {0: 2, 1: 4, 2: 6}}) + >>> df.execute() + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> df.melt(id_vars=['A'], value_vars=['B']).execute() + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> df.melt(id_vars=['A'], value_vars=['B', 'C']).execute() + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 + + The names of 'variable' and 'value' columns can be customized: + + >>> df.melt(id_vars=['A'], value_vars=['B'], + ... var_name='myVarname', value_name='myValname').execute() + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 + + If you have multi-index columns: + + >>> df = md.DataFrame({('A', 'D'): {0: 'a', 1: 'b', 2: 'c'}, + ... ('B', 'E'): {0: 1, 1: 3, 2: 5}, + ... ('C', 'F'): {0: 2, 1: 4, 2: 6}}) + >>> df.execute() + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> df.melt(col_level=0, id_vars=['A'], value_vars=['B']).execute() + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')]).execute() + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + """ + op = DataFrameMelt( + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ) + return op(frame) diff --git a/python/xorbits/_mars/dataframe/base/memory_usage.py b/python/xorbits/_mars/dataframe/base/memory_usage.py new file mode 100644 index 000000000..75de25932 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/memory_usage.py @@ -0,0 +1,501 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from functools import reduce + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int64Field +from ...utils import ceildiv, lazy_import +from ..core import IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +cudf = lazy_import("cudf") + + +class DataFrameMemoryUsage(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.MEMORY_USAGE + + # raw arguments of memory_usage method + _index = BoolField("index") + _deep = BoolField("deep") + + # size of range index, when set, the value will be prepended to the result series + # if the input is a dataframe, or added to the result when the input is a series + _range_index_size = Int64Field("range_index_size") + + def __init__(self, index=None, deep=None, range_index_size=None, **kw): + super().__init__( + _index=index, _deep=deep, _range_index_size=range_index_size, **kw + ) + + @property + def index(self) -> bool: + return self._index + + @index.setter + def index(self, value: bool): + self._index = value + + @property + def deep(self) -> bool: + return self._deep + + @property + def range_index_size(self) -> int: + return self._range_index_size + + @range_index_size.setter + def range_index_size(self, value: int): + self._range_index_size = value + + def _adapt_index(self, input_index, index=0): + """ + When ``index=True`` is passed, an extra column will be prepended to the result series + Thus we need to update the index of initial chunk for returned dataframe chunks + """ + if not self.index or index != 0: + return input_index + idx_data = input_index.to_pandas().insert(0, "Index") + return parse_index(idx_data, store_data=True) + + def _adapt_nsplits(self, input_nsplit): + """ + When ``index=True`` is passed, the size of returned series is one element larger + than the number of columns, which affects ``nsplits``. 
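+
+ A worked example (derived directly from the logic below): if the input
+ ``nsplits`` are ``((10,), (2, 3))``, the returned value is ``((3, 3),)``
+ when ``index=True`` and ``((2, 3),)`` otherwise.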
+ """ + if not self.index: + return (input_nsplit[-1],) + nsplits_list = list(input_nsplit[-1]) + nsplits_list[0] += 1 + return (tuple(nsplits_list),) + + def __call__(self, df_or_series): + """ + Return output object of memory_usage() call + """ + if df_or_series.ndim == 1: + # the input data is a series, a Scalar will be returned + return self.new_scalar([df_or_series], dtype=np.dtype(np.int_)) + else: + # the input data is a DataFrame, a Scalar will be returned + # calculate shape of returning series given ``op.index`` + new_shape = ( + (df_or_series.shape[-1] + 1,) + if self.index + else (df_or_series.shape[-1],) + ) + return self.new_series( + [df_or_series], + index_value=self._adapt_index(df_or_series.columns_value), + shape=new_shape, + dtype=np.dtype(np.int_), + ) + + @classmethod + def _tile_single(cls, op: "DataFrameMemoryUsage"): + """ + Tile when input data has only one chunk on rows + """ + df_or_series = op.inputs[0] + output = op.outputs[0] + + chunks = [] + for c in df_or_series.chunks: + new_op = op.copy().reset_key() + if c.ndim == 1: + # Tile for series + chunks.append( + new_op.new_chunk([c], index=c.index, dtype=output.dtype, shape=()) + ) + else: + # tile for dataframes + # only calculate with index=True on the initial chunk + new_op.index = op.index and c.index[-1] == 0 + + # calculate shape of returning chunk given ``op.index`` + new_shape = ( + (c.shape[-1] + 1,) + if c.index[-1] == 0 and op.index + else (c.shape[-1],) + ) + chunks.append( + new_op.new_chunk( + [c], + shape=new_shape, + dtype=output.dtype, + index=(c.index[-1],), + index_value=op._adapt_index(c.columns_value, c.index[-1]), + ) + ) + + new_op = op.copy().reset_key() + # return objects with chunks and nsplits (if needed) + if df_or_series.ndim == 1: + return new_op.new_scalar([df_or_series], dtype=output.dtype, chunks=chunks) + else: + return new_op.new_series( + [df_or_series], + shape=output.shape, + dtype=output.dtype, + index_value=output.index_value, + chunks=chunks, + nsplits=op._adapt_nsplits(df_or_series.nsplits), + ) + + @classmethod + def _tile_dataframe(cls, op: "DataFrameMemoryUsage"): + """ + Tile dataframes using tree reduction + """ + df = op.inputs[0] + output = op.outputs[0] + is_range_index = isinstance(df.index_value.value, IndexValue.RangeIndex) + + # produce map chunks + # allocate matrix of chunks + chunks_to_reduce = np.empty(shape=df.chunk_shape, dtype=object) + for c in df.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + + if op.index and is_range_index: + # when the index is ``pd.RangeIndex``, the size should be included + # after all computations are done + new_op.index = False + else: + # when the chunk is not the first chunk in the row, index size is not needed + new_op.index = op.index and c.index[-1] == 0 + + new_shape = ( + (c.shape[-1] + 1,) if c.index[-1] == 0 and op.index else (c.shape[-1],) + ) + + chunks_to_reduce[c.index] = new_op.new_chunk( + [c], + index=(c.index[-1],), + dtype=output.dtype, + shape=new_shape, + index_value=op._adapt_index(c.columns_value, c.index[-1]), + ) + + # reduce chunks using tree reduction + combine_size = options.combine_size + while chunks_to_reduce.shape[0] > 1: + # allocate matrix of chunks + new_chunks_to_reduce = np.empty( + ( + ceildiv(chunks_to_reduce.shape[0], combine_size), + chunks_to_reduce.shape[1], + ), + dtype=object, + ) + for idx in range(0, chunks_to_reduce.shape[0], combine_size): + for idx2 in range(chunks_to_reduce.shape[1]): + new_op = op.copy().reset_key() + new_op.stage = 
OperandStage.reduce + chunks = list(chunks_to_reduce[idx : idx + combine_size, idx2]) + + new_chunks_to_reduce[idx // combine_size, idx2] = new_op.new_chunk( + chunks, + index=(idx2,), + dtype=output.dtype, + shape=chunks[0].shape, + index_value=chunks[0].index_value, + ) + + chunks_to_reduce = new_chunks_to_reduce + + # handle RangeIndex at final outputs + if op.index and is_range_index: + chunks_to_reduce[ + 0, 0 + ].op.range_index_size = df.index_value.to_pandas().memory_usage() + + # return series with chunks and nsplits + new_op = op.copy().reset_key() + return new_op.new_series( + [df], + dtype=output.dtype, + shape=output.shape, + index_value=output.index_value, + chunks=list(chunks_to_reduce[0, :]), + nsplits=op._adapt_nsplits(df.nsplits), + ) + + @classmethod + def _tile_series(cls, op: "DataFrameMemoryUsage"): + """ + Tile series using tree reduction + """ + series = op.inputs[0] + output = op.outputs[0] + is_range_index = isinstance(series.index_value.value, IndexValue.RangeIndex) + + chunks_to_reduce = [] + for c in series.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + + # when the index is ``pd.RangeIndex``, the size should be included + # after all computations are done + new_op.index = op.index and not is_range_index + + chunks_to_reduce.append( + new_op.new_chunk([c], index=c.index, dtype=output.dtype, shape=()) + ) + + # reduce chunks using tree reduction + combine_size = options.combine_size + while len(chunks_to_reduce) > 1: + new_chunks_to_reduce = [] + for idx in range(0, len(chunks_to_reduce), combine_size): + new_op = op.copy().reset_key() + new_op.stage = OperandStage.reduce + + new_chunks_to_reduce.append( + new_op.new_chunk( + chunks_to_reduce[idx : idx + combine_size], + shape=(), + index=(0,), + dtype=output.dtype, + ) + ) + + chunks_to_reduce = new_chunks_to_reduce + + # handle RangeIndex at final outputs + if op.index and is_range_index: + chunks_to_reduce[ + 0 + ].op.range_index_size = series.index_value.to_pandas().memory_usage() + + # return series with chunks + new_op = op.copy().reset_key() + return new_op.new_scalar([series], dtype=output.dtype, chunks=chunks_to_reduce) + + @classmethod + def tile(cls, op: "DataFrameMemoryUsage"): + df_or_series = op.inputs[0] + if ( + df_or_series.chunk_shape[0] == 1 + ): # only one chunk in row, no aggregation needed + return cls._tile_single(op) + elif df_or_series.ndim == 1: # series + return cls._tile_series(op) + else: # dataframe + return cls._tile_dataframe(op) + + @classmethod + def execute(cls, ctx, op: "DataFrameMemoryUsage"): + in_data = ctx[op.inputs[0].key] + # choose correct dataframe library + xdf = cudf if op.gpu else pd + + if op.stage == OperandStage.reduce: + result = reduce(operator.add, (ctx[c.key] for c in op.inputs)) + if op.range_index_size is not None: + if hasattr(in_data, "ndim"): + # dataframe input: prepend index size column + prepend_series = xdf.Series( + [op.range_index_size], index=["Index"], dtype=result.dtype + ) + result = xdf.concat([prepend_series, result]) + else: + # series input: add index size to the output + result += op.range_index_size + ctx[op.outputs[0].key] = result + elif isinstance(in_data, xdf.Index): + ctx[op.outputs[0].key] = in_data.memory_usage(deep=op.deep) + else: + ctx[op.outputs[0].key] = in_data.memory_usage(index=op.index, deep=op.deep) + + +def df_memory_usage(df, index=True, deep=False): + """ + Return the memory usage of each column in bytes. 
+ + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. + + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True``, the memory usage of + the index is the first item in the output. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned values. + + Returns + ------- + Series + A Series whose index is the original column names and whose values + is the memory usage of each column in bytes. + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of an + ndarray. + Series.memory_usage : Bytes consumed by a Series. + Categorical : Memory-efficient array for string values with + many repeated values. + DataFrame.info : Concise summary of a DataFrame. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] + >>> data = dict([(t, mt.ones(shape=5000).astype(t)) + ... for t in dtypes]) + >>> df = md.DataFrame(data) + >>> df.head().execute() + int64 float64 complex128 object bool + 0 1 1.0 1.000000+0.000000j 1 True + 1 1 1.0 1.000000+0.000000j 1 True + 2 1 1.0 1.000000+0.000000j 1 True + 3 1 1.0 1.000000+0.000000j 1 True + 4 1 1.0 1.000000+0.000000j 1 True + + >>> df.memory_usage().execute() + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + >>> df.memory_usage(index=False).execute() + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + The memory footprint of `object` dtype columns is ignored by default: + + >>> df.memory_usage(deep=True).execute() + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 160000 + bool 5000 + dtype: int64 + + Use a Categorical for efficient storage of an object-dtype column with + many repeated values. + + >>> df['object'].astype('category').memory_usage(deep=True).execute() + 5216 + """ + op = DataFrameMemoryUsage(index=index, deep=deep) + return op(df) + + +def series_memory_usage(series, index=True, deep=False): + """ + Return the memory usage of the Series. + + The memory usage can optionally include the contribution of + the index and of elements of `object` dtype. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the Series index. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned value. + + Returns + ------- + int + Bytes of memory consumed. + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. + DataFrame.memory_usage : Bytes consumed by a DataFrame. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(range(3)) + >>> s.memory_usage().execute() + 152 + + Not including the index gives the size of the rest of the data, which + is necessarily smaller: + + >>> s.memory_usage(index=False).execute() + 24 + + The memory footprint of `object` values is ignored by default: + + >>> s = md.Series(["a", "b"]) + >>> s.values.execute() + array(['a', 'b'], dtype=object) + + >>> s.memory_usage().execute() + 144 + + >>> s.memory_usage(deep=True).execute() + 260 + """ + op = DataFrameMemoryUsage(index=index, deep=deep) + return op(series) + + +def index_memory_usage(index, deep=False): + """ + Memory usage of the values. + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption. + + Returns + ------- + bytes used + + See Also + -------- + numpy.ndarray.nbytes + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + """ + op = DataFrameMemoryUsage(index=False, deep=deep) + return op(index) diff --git a/python/xorbits/_mars/dataframe/base/pct_change.py b/python/xorbits/_mars/dataframe/base/pct_change.py new file mode 100644 index 000000000..ec960adb7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/pct_change.py @@ -0,0 +1,150 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import validate_axis + + +def pct_change( + df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs +): + """ + Percentage change between the current and a prior element. + + Computes the percentage change from the immediately previous row by + default. This is useful in comparing the percentage of change in a time + series of elements. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'pad' + How to handle NAs before computing percent changes. + limit : int, default None + The number of consecutive NAs to fill before stopping. + freq : DateOffset, timedelta, or str, optional + Increment to use from time series API (e.g. 'M' or BDay()). + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift` or `Series.shift`. + + Returns + ------- + chg : Series or DataFrame + The same type as the calling object. + + See Also + -------- + Series.diff : Compute the difference of two elements in a Series. + DataFrame.diff : Compute the difference of two elements in a DataFrame. + Series.shift : Shift the index by some number of periods. + DataFrame.shift : Shift the index by some number of periods. 
+ + Examples + -------- + **Series** + + >>> import mars.dataframe as md + + >>> s = md.Series([90, 91, 85]) + >>> s.execute() + 0 90 + 1 91 + 2 85 + dtype: int64 + + >>> s.pct_change().execute() + 0 NaN + 1 0.011111 + 2 -0.065934 + dtype: float64 + + >>> s.pct_change(periods=2).execute() + 0 NaN + 1 NaN + 2 -0.055556 + dtype: float64 + + See the percentage change in a Series where filling NAs with last + valid observation forward to next valid. + + >>> s = md.Series([90, 91, None, 85]) + >>> s.execute() + 0 90.0 + 1 91.0 + 2 NaN + 3 85.0 + dtype: float64 + + >>> s.pct_change(fill_method='ffill').execute() + 0 NaN + 1 0.011111 + 2 0.000000 + 3 -0.065934 + dtype: float64 + + **DataFrame** + + Percentage change in French franc, Deutsche Mark, and Italian lira from + 1980-01-01 to 1980-03-01. + + >>> df = md.DataFrame({ + ... 'FR': [4.0405, 4.0963, 4.3149], + ... 'GR': [1.7246, 1.7482, 1.8519], + ... 'IT': [804.74, 810.01, 860.13]}, + ... index=['1980-01-01', '1980-02-01', '1980-03-01']) + >>> df.execute() + FR GR IT + 1980-01-01 4.0405 1.7246 804.74 + 1980-02-01 4.0963 1.7482 810.01 + 1980-03-01 4.3149 1.8519 860.13 + + >>> df.pct_change().execute() + FR GR IT + 1980-01-01 NaN NaN NaN + 1980-02-01 0.013810 0.013684 0.006549 + 1980-03-01 0.053365 0.059318 0.061876 + + Percentage of change in GOOG and APPL stock volume. Shows computing + the percentage change between columns. + + >>> df = md.DataFrame({ + ... '2016': [1769950, 30586265], + ... '2015': [1500923, 40912316], + ... '2014': [1371819, 41403351]}, + ... index=['GOOG', 'APPL']) + >>> df.execute() + 2016 2015 2014 + GOOG 1769950 1500923 1371819 + APPL 30586265 40912316 41403351 + + >>> df.pct_change(axis='columns').execute() + 2016 2015 2014 + GOOG NaN -0.151997 -0.086016 + APPL NaN 0.337604 0.012002 + """ + + axis = validate_axis(kwargs.pop("axis", 0)) + if fill_method is None: + data = df_or_series + else: + data = df_or_series.fillna(method=fill_method, axis=axis, limit=limit) + + rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 + if freq is not None: + # Shift method is implemented differently when freq is not None + # We want to restore the original index + rs = rs.loc[~rs.index.duplicated()] + rs = rs.reindex_like(data) + return rs diff --git a/python/xorbits/_mars/dataframe/base/qcut.py b/python/xorbits/_mars/dataframe/base/qcut.py new file mode 100644 index 000000000..11435409e --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/qcut.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +from pandas.api.types import is_integer + +from ...core import ENTITY_TYPE +from ...tensor import tensor as astensor +from ...tensor.statistics.percentile import percentile +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..initializer import DataFrame, Series +from .cut import cut + + +def qcut(x, q, labels=None, retbins=False, precision=3, duplicate="raise"): + """ + Quantile-based discretization function. 
+ + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. + + Parameters + ---------- + x : 1d tensor or Series + q : int or list-like of float + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels : array or False, default None + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. If True, raises an error. + retbins : bool, optional + Whether to return the (bins, labels) or not. Can be useful if bins + is given as a scalar. + precision : int, optional + The precision at which to store and display the bins labels. + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + + Returns + ------- + out : Categorical or Series or tensor of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. + bins : tensor of floats + Returned only if `retbins` is True. + + Notes + ----- + Out of bounds values will be NA in the resulting Categorical object + + Examples + -------- + >>> import mars.dataframe as md + >>> md.qcut(range(5), 4).execute() + ... # doctest: +ELLIPSIS + [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ... + + >>> md.qcut(range(5), 3, labels=["good", "medium", "bad"]).execute() + ... # doctest: +SKIP + [good, good, medium, bad, bad] + Categories (3, object): [good < medium < bad] + + >>> md.qcut(range(5), 4, labels=False).execute() + array([0, 0, 1, 2, 3]) + """ + if is_integer(q): + q = np.linspace(0, 1, q + 1) + + if isinstance(x, (DATAFRAME_TYPE, SERIES_TYPE, pd.DataFrame, pd.Series)): + x = DataFrame(x) if x.ndim == 2 else Series(x) + bins = x.quantile(q) + else: + x = astensor(x) + if isinstance(q, ENTITY_TYPE): + q = q * 100 + else: + q = [iq * 100 for iq in q] + bins = percentile(x, q) + + return cut( + x, + bins, + labels=labels, + retbins=retbins, + precision=precision, + include_lowest=True, + duplicates=duplicate, + ) diff --git a/python/xorbits/_mars/dataframe/base/rebalance.py b/python/xorbits/_mars/dataframe/base/rebalance.py new file mode 100644 index 000000000..2a2316a32 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/rebalance.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes +from ...serialization.serializables import Float64Field, Int64Field, KeyField +from ...tensor.base.rebalance import RebalanceMixin +from ..core import INDEX_TYPE +from ..initializer import DataFrame as asdataframe +from ..initializer import Index as asindex +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import validate_axis + + +class DataFrameRebalance(RebalanceMixin, DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.REBALANCE + + _input = KeyField("input") + _factor = Float64Field("factor") + _axis = Int64Field("axis") + _num_partitions = Int64Field("num_partitions") + + def __init__( + self, + input=None, + factor=None, + axis=None, # pylint: disable=redefined-builtin + num_partitions=None, + output_types=None, + **kw + ): + super().__init__( + _input=input, + _factor=factor, + _axis=axis, + _num_partitions=num_partitions, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def factor(self): + return self._factor + + @property + def axis(self): + return self._axis + + @property + def num_partitions(self): + return self._num_partitions + + def _get_input_object(self): + in_obj = self.input + if isinstance(in_obj, INDEX_TYPE): + convert = asindex + else: + convert = asdataframe if in_obj.ndim == 2 else asseries + return convert(in_obj) + + +def rebalance( + df_or_series, factor=None, axis=0, num_partitions=None, reassign_worker=True +): + """ + Make Data more balanced across entire cluster. + + Parameters + ---------- + factor : float + Specified so that number of chunks after balance is + total CPU count of cluster * factor. + axis : int + The axis to rebalance. + num_partitions : int + Specified so the number of chunks are at most + num_partitions. + reassign_worker : bool + If True, workers will be reassigned. + + Returns + ------- + Series or DataFrame + Result of DataFrame or Series after rebalanced. + """ + axis = validate_axis(axis, df_or_series) + if num_partitions is None: + factor = factor if factor is not None else 1.2 + + op = DataFrameRebalance( + input=df_or_series, + factor=factor, + axis=axis, + num_partitions=num_partitions, + reassign_worker=reassign_worker, + ) + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/rechunk.py b/python/xorbits/_mars/dataframe/base/rechunk.py new file mode 100644 index 000000000..a2f7c9642 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/rechunk.py @@ -0,0 +1,206 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField +from ...tensor.rechunk.core import chunk_size_type, gen_rechunk_infos, get_nsplits +from ...typing import TileableType +from ...utils import has_unknown_shape +from ..initializer import DataFrame as asdataframe +from ..initializer import Index as asindex +from ..initializer import Series as asseries +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import indexing_index_value, merge_index_value + + +class DataFrameRechunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.RECHUNK + + chunk_size = AnyField("chunk_size") + + def __call__(self, x): + if isinstance(x, DATAFRAME_TYPE): + return self.new_dataframe( + [x], + shape=x.shape, + dtypes=x.dtypes, + columns_value=x.columns_value, + index_value=x.index_value, + ) + else: + self.output_types = x.op.output_types + f = ( + self.new_series + if self.output_types[0] == OutputType.series + else self.new_index + ) + return f( + [x], + shape=x.shape, + dtype=x.dtype, + index_value=x.index_value, + name=x.name, + ) + + @classmethod + def tile(cls, op: "DataFrameRechunk"): + from ..indexing.iloc import ( + DataFrameIlocGetItem, + IndexIlocGetItem, + SeriesIlocGetItem, + ) + from ..merge.concat import DataFrameConcat + + if has_unknown_shape(*op.inputs): + yield + + out = op.outputs[0] + inp = op.inputs[0] + if inp.ndim == 2: + inp = asdataframe(inp) + elif inp.op.output_types[0] == OutputType.series: + inp = asseries(inp) + else: + inp = asindex(inp) + chunk_size = _get_chunk_size(inp, op.chunk_size) + if chunk_size == inp.nsplits: + return [inp] + + rechunk_infos = gen_rechunk_infos(inp, chunk_size) + out_chunks = [] + for rechunk_info in rechunk_infos: + chunk_index = rechunk_info.out_index + shape = rechunk_info.shape + inp_chunks = rechunk_info.input_chunks + inp_chunk_slices = rechunk_info.input_slices + inp_slice_chunks = [] + for inp_chunk, inp_chunk_slice in zip(inp_chunks, inp_chunk_slices): + if all(slc == slice(None) for slc in inp_chunk_slice): + inp_slice_chunks.append(inp_chunk) + else: + index_value = indexing_index_value( + inp_chunk.index_value, inp_chunk_slice[0], rechunk=True + ) + if inp_chunk.ndim == 1: + # Series or Index + slc_chunk_op_type = ( + SeriesIlocGetItem + if op.output_types[0] == OutputType.series + else IndexIlocGetItem + ) + slc_chunk = slc_chunk_op_type( + indexes=inp_chunk_slice, + output_types=op.output_types, + sparse=inp_chunk.op.sparse, + ).new_chunk( + [inp_chunk], + index_value=index_value, + dtype=inp_chunk.dtype, + name=inp_chunk.name, + index=inp_chunk.index, + ) + else: + # DataFrame + columns_value = indexing_index_value( + inp_chunk.columns_value, + inp_chunk_slice[1], + store_data=True, + rechunk=True, + ) + dtypes = inp_chunk.dtypes.iloc[inp_chunk_slice[1]] + slc_chunk = DataFrameIlocGetItem( + indexes=inp_chunk_slice, + output_types=[OutputType.dataframe], + sparse=inp_chunk.op.sparse, + ).new_chunk( + [inp_chunk], + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + index=inp_chunk.index, + ) + inp_slice_chunks.append(slc_chunk) + + chunk_shape = rechunk_info.input_chunk_shape + inp_chunks_arr = np.empty(chunk_shape, dtype=object) + inp_chunks_arr.ravel()[:] = inp_slice_chunks + params = dict(index=chunk_index, shape=shape) + if inp_chunks_arr.ndim == 1: + params["index_value"] = merge_index_value( + {i: c.index_value for i, c in enumerate(inp_chunks_arr)} + ) + params["dtype"] = 
inp_slice_chunks[0].dtype + params["name"] = inp_slice_chunks[0].name + else: + params["index_value"] = merge_index_value( + {i: c.index_value for i, c in enumerate(inp_chunks_arr[:, 0])} + ) + params["columns_value"] = merge_index_value( + {i: c.columns_value for i, c in enumerate(inp_chunks_arr[0])}, + store_data=True, + ) + params["dtypes"] = pd.concat([c.dtypes for c in inp_chunks_arr[0]]) + if len(inp_slice_chunks) == 1: + c = inp_slice_chunks[0] + cc = c.op.copy().new_chunk(c.op.inputs, kws=[params]) + out_chunks.append(cc) + else: + out_chunk = DataFrameConcat( + output_types=[out.op.output_types[0]] + ).new_chunk(inp_slice_chunks, kws=[params]) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["nsplits"] = chunk_size + params["chunks"] = out_chunks + df_or_series = new_op.new_tileable(op.inputs, kws=[params]) + + if op.reassign_worker: + for c in df_or_series.chunks: + c.op.reassign_worker = True + + return [df_or_series] + + +def _get_chunk_size( + a: TileableType, chunk_size: chunk_size_type +) -> Tuple[Tuple[int], ...]: + if isinstance(a, DATAFRAME_TYPE): + itemsize = max(getattr(dt, "itemsize", 8) for dt in a.dtypes) + else: + itemsize = a.dtype.itemsize + return get_nsplits(a, chunk_size, itemsize) + + +def rechunk(a: TileableType, chunk_size: chunk_size_type, reassign_worker=False): + if not any(pd.isna(s) for s in a.shape) and not a.is_coarse(): + if not has_unknown_shape(a): + # do client check only when no unknown shape, + # real nsplits will be recalculated inside `tile` + chunk_size = _get_chunk_size(a, chunk_size) + if chunk_size == a.nsplits: + return a + + op = DataFrameRechunk( + chunk_size=chunk_size, + reassign_worker=reassign_worker, + ) + return op(a) diff --git a/python/xorbits/_mars/dataframe/base/select_dtypes.py b/python/xorbits/_mars/dataframe/base/select_dtypes.py new file mode 100644 index 000000000..00b00c2ac --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/select_dtypes.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import build_empty_df + + +def select_dtypes(df, include=None, exclude=None): + """ + Return a subset of the DataFrame's columns based on the column dtypes. + + Parameters + ---------- + include, exclude : scalar or list-like + A selection of dtypes or strings to be included/excluded. At least + one of these parameters must be supplied. + + Returns + ------- + DataFrame + The subset of the frame including the dtypes in ``include`` and + excluding the dtypes in ``exclude``. + + Raises + ------ + ValueError + * If both of ``include`` and ``exclude`` are empty + * If ``include`` and ``exclude`` have overlapping elements + * If any kind of string dtype is passed in. + + See Also + -------- + DataFrame.dtypes: Return Series with the data type of each column. 
+ + Notes + ----- + * To select all *numeric* types, use ``np.number`` or ``'number'`` + * To select strings you must use the ``object`` dtype, but note that + this will return *all* object dtype columns + * See the `numpy dtype hierarchy + `__ + * To select datetimes, use ``np.datetime64``, ``'datetime'`` or + ``'datetime64'`` + * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or + ``'timedelta64'`` + * To select Pandas categorical dtypes, use ``'category'`` + * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in + 0.20.0) or ``'datetime64[ns, tz]'`` + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'a': [1, 2] * 3, + ... 'b': [True, False] * 3, + ... 'c': [1.0, 2.0] * 3}) + >>> df.execute() + a b c + 0 1 True 1.0 + 1 2 False 2.0 + 2 1 True 1.0 + 3 2 False 2.0 + 4 1 True 1.0 + 5 2 False 2.0 + + >>> df.select_dtypes(include='bool').execute() + b + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + + >>> df.select_dtypes(include=['float64']).execute() + c + 0 1.0 + 1 2.0 + 2 1.0 + 3 2.0 + 4 1.0 + 5 2.0 + + >>> df.select_dtypes(exclude=['int64']).execute() + b c + 0 True 1.0 + 1 False 2.0 + 2 True 1.0 + 3 False 2.0 + 4 True 1.0 + 5 False 2.0 + """ + test_df = build_empty_df(df.dtypes) + test_df = test_df.select_dtypes(include=include, exclude=exclude) + return df[test_df.dtypes.index.tolist()] diff --git a/python/xorbits/_mars/dataframe/base/shift.py b/python/xorbits/_mars/dataframe/base/shift.py new file mode 100644 index 000000000..75a4d0a04 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/shift.py @@ -0,0 +1,510 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField, Int8Field, Int64Field, KeyField +from ...utils import has_unknown_shape, no_default, pd_release_version +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, parse_index, validate_axis + +_need_consolidate = pd.__version__ in ("1.1.0", "1.3.0", "1.3.1") +_enable_no_default = pd_release_version[:2] > (1, 1) +_with_column_freq_bug = (1, 2, 0) <= pd_release_version < (1, 4, 3) + + +class DataFrameShift(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.SHIFT + + _input = KeyField("input") + _periods = Int64Field("periods") + _freq = AnyField("freq") + _axis = Int8Field("axis") + _fill_value = AnyField("fill_value") + + def __init__(self, periods=None, freq=None, axis=None, fill_value=None, **kw): + super().__init__( + _periods=periods, _freq=freq, _axis=axis, _fill_value=fill_value, **kw + ) + + @property + def input(self): + return self._input + + @property + def periods(self): + return self._periods + + @property + def freq(self): + return self._freq + + @property + def axis(self): + return self._axis + + @property + def fill_value(self): + return self._fill_value + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def _call_dataframe(self, df): + test_df = build_df(df) + result_df = test_df.shift( + periods=self._periods, + freq=self._freq, + axis=self._axis, + fill_value=self._fill_value, + ) + + if self._freq is None: + # shift data + index_value = df.index_value + columns_value = df.columns_value + else: + # shift index + if self._axis == 0: + index_value = self._get_index_value( + df.index_value, self._periods, self._freq + ) + columns_value = df.columns_value + else: + columns_value = parse_index(result_df.dtypes.index, store_data=True) + index_value = df.index_value + + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=result_df.dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_series(self, series): + test_series = build_series(series) + result_series = test_series.shift( + periods=self._periods, + freq=self._freq, + axis=self._axis, + fill_value=self._fill_value, + ) + + index_value = series.index_value + if self._freq is not None: + # shift index + index_value = self._get_index_value(index_value, self._periods, self._freq) + + return self.new_series( + [series], + shape=series.shape, + index_value=index_value, + dtype=result_series.dtype, + name=series.name, + ) + + def __call__(self, df_or_series): + if df_or_series.op.output_types[0] == OutputType.dataframe: + self.output_types = [OutputType.dataframe] + return self._call_dataframe(df_or_series) + else: + assert df_or_series.op.output_types[0] == OutputType.series + self.output_types = [OutputType.series] + return self._call_series(df_or_series) + + @staticmethod + def _get_index_value(input_index_value, periods, freq): + if ( + not input_index_value.has_value() + and input_index_value.min_val is not None + and input_index_value.max_val is not None + and freq is not None + and input_index_value.is_monotonic_increasing_or_decreasing + ): + pd_index = pd.Index( + [input_index_value.min_val, input_index_value.max_val] + ).shift(periods=periods, freq=freq) + index_value = parse_index(pd_index) + index_value.value._min_val_close = input_index_value.min_val_close + index_value.value._max_val_close = input_index_value.max_val_close + return index_value + else: 
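+ # Fallback: the index bounds are unknown or no freq is given, so the
+ # shifted index cannot be precomputed; keep the existing pandas index.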
+ pd_index = input_index_value.to_pandas() + return parse_index(pd_index, periods, freq) + + @classmethod + def _tile_dataframe(cls, op): + from ..indexing.iloc import DataFrameIlocGetItem + from ..merge.concat import DataFrameConcat + + inp = op.input + out = op.outputs[0] + axis = op.axis + + out_chunks = [] + if op.freq is not None: + cum_nsplit = [0] + np.cumsum(inp.nsplits[axis]).tolist() + # shift index + for c in inp.chunks: + chunk_op = op.copy().reset_key() + i = c.index[axis] + start, end = cum_nsplit[i], cum_nsplit[i + 1] + if axis == 0: + index_value = cls._get_index_value( + c.index_value, op.periods, op.freq + ) + columns_value = c.columns_value + dtypes = c.dtypes + else: + dtypes = out.dtypes.iloc[start:end] + columns_value = parse_index(dtypes.index, store_data=True) + index_value = c.index_value + out_chunk = chunk_op.new_chunk( + [c], + index=c.index, + shape=c.shape, + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + out_chunks.append(out_chunk) + else: + if np.isnan(np.sum(inp.nsplits[axis])): # pragma: no cover + yield + + # shift data + inc = op.periods > 0 + cum_nsplit = [0] + np.cumsum(inp.nsplits[axis]).tolist() + for j in range(inp.chunk_shape[1 - axis]): + for i in range(inp.chunk_shape[axis]): + index = [None, None] + index[axis] = i + index[1 - axis] = j + index = tuple(index) + + start, end = cum_nsplit[i], cum_nsplit[i + 1] + + c = inp.cix[index] + to_concats = [c] + left = abs(op.periods) + prev_i = i - 1 if inc else i + 1 + while left > 0 and 0 <= prev_i < inp.chunk_shape[axis]: + prev_index = [None, None] + prev_index[axis] = prev_i + prev_index[1 - axis] = j + prev_index = tuple(prev_index) + + prev_chunk = inp.cix[prev_index] + size = min(prev_chunk.shape[axis], left) + left -= size + prev_i = prev_i - 1 if inc else prev_i + 1 + + if size == prev_chunk.shape[axis]: + to_concat = prev_chunk + else: + slcs = [slice(None)] * 2 + slc = slice(-size, None) if inc else slice(size) + slcs[axis] = slc + slc_op = DataFrameIlocGetItem(indexes=slcs) + to_concat = slc_op.new_chunk([prev_chunk]) + + if inc: + to_concats.insert(0, to_concat) + else: + to_concats.append(to_concat) + + if len(to_concats) == 1: + to_shift_chunk = to_concats[0] + else: + concat_op = DataFrameConcat( + axis=axis, output_types=[OutputType.dataframe] + ) + to_shift_chunk = concat_op.new_chunk(to_concats) + + chunk_op = op.copy().reset_key() + if axis == 1: + dtypes = out.dtypes.iloc[start:end] + columns_value = parse_index(dtypes.index, store_data=True) + index_value = c.index_value + else: + dtypes = c.dtypes + columns_value = c.columns_value + index_value = cls._get_index_value( + c.index_value, op.periods, op.freq + ) + + out_chunk = chunk_op.new_chunk( + [to_shift_chunk], + index=index, + shape=c.shape, + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = inp.nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_series(cls, op): + from ..indexing.iloc import SeriesIlocGetItem + from ..merge import DataFrameConcat + + if has_unknown_shape(*op.inputs): + yield + + inp = op.input + out = op.outputs[0] + + out_chunks = [] + + for i, c in enumerate(inp.chunks): + chunk_op = op.copy().reset_key() + + if op.freq is not None: + # shift index + index_value = cls._get_index_value(c.index_value, op.periods, op.freq) + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + 
index_value=index_value, + name=c.name, + dtype=out.dtype, + index=c.index, + ) + else: + inc = op.periods > 0 + prev_i = i - 1 if inc else i + 1 + + to_concats = [c] + left = abs(op.periods) + while left > 0 and 0 <= prev_i < inp.chunk_shape[0]: + prev_chunk = inp.cix[prev_i,] + size = min(left, prev_chunk.shape[0]) + left -= size + prev_i = prev_i - 1 if inc else prev_i + 1 + + if size == prev_chunk.shape[0]: + to_concat = prev_chunk + else: + slc = slice(-size, None) if inc else slice(size) + slc_op = SeriesIlocGetItem(indexes=[slc]) + to_concat = slc_op.new_chunk([prev_chunk]) + + if inc: + to_concats.insert(0, to_concat) + else: + to_concats.append(to_concat) + + if len(to_concats) == 1: + to_concat = to_concats[0] + else: + concat_op = DataFrameConcat(output_types=[OutputType.series]) + to_concat = concat_op.new_chunk(to_concats) + + out_chunk = chunk_op.new_chunk( + [to_concat], + index=(i,), + shape=c.shape, + index_value=c.index_value, + dtype=out.dtype, + name=out.name, + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = inp.nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + if op.output_types[0] == OutputType.dataframe: + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + @classmethod + def execute(cls, ctx, op): + axis = op.axis + periods = op.periods + + obj = ctx[op.input.key] + out = op.outputs[0] + + if ( + _need_consolidate + and isinstance(obj, (pd.Series, pd.DataFrame)) + and len(obj._data.blocks) > 1 + ): + # if #internal blocks > 1, shift will create wrong result in pandas 1.1.0 + # see https://github.com/pandas-dev/pandas/issues/35488 + # if shifting merged dataframe slices, shift will raise TypeError in pandas 1.3.0 + # see https://github.com/pandas-dev/pandas/issues/42401 + # thus we force to do consolidate + obj._data._consolidate_inplace() + + result = obj.shift( + periods=periods, freq=op.freq, axis=axis, fill_value=op.fill_value + ) + if result.shape != out.shape: + slc = [slice(None)] * obj.ndim + if periods > 0: + slc[axis] = slice(-out.shape[axis], None) + else: + slc[axis] = slice(out.shape[axis]) + + result = result.iloc[tuple(slc)] + assert result.shape == out.shape, (result.shape, out.shape) + + ctx[out.key] = result + + +def shift(df_or_series, periods=1, freq=None, axis=0, fill_value=None): + """ + Shift index by desired number of periods with an optional time `freq`. + + When `freq` is not passed, shift the index without realigning the data. + If `freq` is passed (in this case, the index must be date or datetime, + or it will raise a `NotImplementedError`), the index will be + increased using the periods and the `freq`. + + Parameters + ---------- + periods : int + Number of periods to shift. Can be positive or negative. + freq : DateOffset, tseries.offsets, timedelta, or str, optional + Offset to use from the tseries module or time rule (e.g. 'EOM'). + If `freq` is specified then the index values are shifted but the + data is not realigned. That is, use `freq` if you would like to + extend the index when shifting and preserve the original data. + axis : {0 or 'index', 1 or 'columns', None}, default None + Shift direction. + fill_value : object, optional + The scalar value to use for newly introduced missing values. + the default depends on the dtype of `self`. + For numeric data, ``np.nan`` is used. + For datetime, timedelta, or period data, etc. :attr:`NaT` is used. 
+ For extension dtypes, ``self.dtype.na_value`` is used. + + Returns + ------- + DataFrame or Series + Copy of input object, shifted. + + See Also + -------- + Index.shift : Shift values of Index. + DatetimeIndex.shift : Shift values of DatetimeIndex. + PeriodIndex.shift : Shift values of PeriodIndex. + tshift : Shift the time index, using the index's frequency if + available. + + Examples + -------- + >>> import mars.dataframe as md + + >>> df = md.DataFrame({'Col1': [10, 20, 15, 30, 45], + ... 'Col2': [13, 23, 18, 33, 48], + ... 'Col3': [17, 27, 22, 37, 52]}) + + >>> df.shift(periods=3).execute() + Col1 Col2 Col3 + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 10.0 13.0 17.0 + 4 20.0 23.0 27.0 + + >>> df.shift(periods=1, axis='columns').execute() + Col1 Col2 Col3 + 0 NaN 10.0 13.0 + 1 NaN 20.0 23.0 + 2 NaN 15.0 18.0 + 3 NaN 30.0 33.0 + 4 NaN 45.0 48.0 + + >>> df.shift(periods=3, fill_value=0).execute() + Col1 Col2 Col3 + 0 0 0 0 + 1 0 0 0 + 2 0 0 0 + 3 10 13 17 + 4 20 23 27 + """ + axis = validate_axis(axis, df_or_series) + if periods == 0: + return df_or_series.copy() + if fill_value is no_default: # pragma: no cover + if not _enable_no_default or ( + _with_column_freq_bug and axis == 1 and freq is not None + ): + # pandas shift shows different behavior for axis=1 when freq is specified, + # see https://github.com/pandas-dev/pandas/issues/47039 for details. + fill_value = None + op = DataFrameShift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) + return op(df_or_series) + + +def tshift(df_or_series, periods: int = 1, freq=None, axis=0): + """ + Shift the time index, using the index's frequency if available. + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative. + freq : DateOffset, timedelta, or str, default None + Increment to use from the tseries module + or time rule expressed as a string (e.g. 'EOM'). + axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 + Corresponds to the axis that contains the Index. + + Returns + ------- + shifted : Series/DataFrame + + Notes + ----- + If freq is not specified then tries to use the freq or inferred_freq + attributes of the index. If neither of those attributes exist, a + ValueError is thrown + """ + axis = validate_axis(axis, df_or_series) + index = ( + df_or_series.index_value.to_pandas() + if axis == 0 + else df_or_series.columns_value.to_pandas() + ) + + if freq is None: + freq = getattr(index, "freq", None) + + if freq is None: # pragma: no cover + freq = getattr(index, "inferred_freq", None) + + if freq is None: + raise ValueError("Freq was not given and was not set in the index") + + return shift(df_or_series, periods=periods, freq=freq, axis=axis) diff --git a/python/xorbits/_mars/dataframe/base/stack.py b/python/xorbits/_mars/dataframe/base/stack.py new file mode 100644 index 000000000..7edda8a67 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/stack.py @@ -0,0 +1,312 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
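+
+# Implementation sketch (summary of the tile logic below): the input is first
+# rechunked to a single chunk along the column axis, then ``stack`` is applied
+# to each row chunk independently; output chunk sizes are unknown (NaN) when
+# ``dropna=True`` since dropped rows cannot be counted ahead of execution.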
+ +from typing import List, Union + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, BoolField, KeyField +from ...utils import has_unknown_shape +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, parse_index + + +class DataFrameStack(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.STACK + + _input_df = KeyField("input_df") + _level = AnyField("level") + _dropna = BoolField("dropna") + + def __init__(self, input_df=None, level=None, dropna=None, **kw): + super().__init__(_input_df=input_df, _level=level, _dropna=dropna, **kw) + + @property + def input_df(self): + return self._input_df + + @property + def level(self): + return self._level + + @property + def dropna(self): + return self._dropna + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input_df = self._inputs[0] + + @classmethod + def _calc_size(cls, size: int, level: Union[List, int], dtypes: pd.Series): + index = dtypes.index + + if not isinstance(index, pd.MultiIndex): + return size * len(index) + + if isinstance(level, int): + level = [level] + return size * np.prod([index.levshape[lev] for lev in level]).item() + + def __call__(self, input_df): + test_df = build_df(input_df) + test_df = test_df.stack(level=self._level, dropna=self._dropna) + if self._dropna: + size = np.nan + else: + size = self._calc_size(input_df.shape[0], self._level, input_df.dtypes) + if test_df.ndim == 1: + shape = (size,) + return self.new_series( + [input_df], + shape=shape, + dtype=test_df.dtype, + index_value=parse_index(test_df.index, input_df), + name=test_df.name, + ) + else: + shape = (size, test_df.shape[1]) + return self.new_dataframe( + [input_df], + shape=shape, + dtypes=test_df.dtypes, + index_value=parse_index(test_df.index, input_df), + columns_value=parse_index(test_df.columns, store_data=True), + ) + + @classmethod + def tile(cls, op: "DataFrameStack"): + input_df = op.input_df + out = op.outputs[0] + out_index = out.index_value.to_pandas() + + if input_df.chunk_shape[1] > 1: + # rechunk into 1 chunk on axis 1 + if has_unknown_shape(input_df): + yield + input_df = yield from recursive_tile( + input_df.rechunk({1: input_df.shape[1]}) + ) + + out_chunks = [] + for c in input_df.chunks: + chunk_op = op.copy().reset_key() + if op.dropna: + size = np.nan + else: + size = cls._calc_size(c.shape[0], op.level, c.dtypes) + if out.ndim == 1: + kw = { + "shape": (size,), + "index": (c.index[0],), + "dtype": out.dtype, + "index_value": parse_index(out_index, c), + "name": out.name, + } + else: + kw = { + "shape": (size, out.shape[1]), + "index": (c.index[0], 0), + "dtypes": out.dtypes, + "index_value": parse_index(out_index, c), + "columns_value": out.columns_value, + } + out_chunk = chunk_op.new_chunk([c], **kw) + out_chunks.append(out_chunk) + + params = out.params + if out.ndim == 1: + params["nsplits"] = (tuple(out_c.shape[0] for out_c in out_chunks),) + else: + params["nsplits"] = ( + tuple(out_c.shape[0] for out_c in out_chunks), + (out.shape[1],), + ) + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameStack"): + inp: pd.DataFrame = ctx[op.input_df.key] + ctx[op.outputs[0].key] = inp.stack(level=op.level, dropna=op.dropna) + + +def stack(df, level=-1, dropna=True): + """ + Stack the prescribed level(s) from columns to index. 
+ + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index + axis, defined as one index or label, or a list of indices + or labels. + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with + missing values. Stacking a column level onto the index + axis can create combinations of index and column values + that are missing from the original dataframe. See Examples + section. + + Returns + ------- + DataFrame or Series + Stacked dataframe or series. + + See Also + -------- + DataFrame.unstack : Unstack prescribed level(s) from index axis + onto column axis. + DataFrame.pivot : Reshape dataframe from long format to wide + format. + DataFrame.pivot_table : Create a spreadsheet-style pivot table + as a DataFrame. + + Notes + ----- + The function is named by analogy with a collection of books + being reorganized from being side by side on a horizontal + position (the columns of the dataframe) to being stacked + vertically on top of each other (in the index of the + dataframe). + + Examples + -------- + **Single level columns** + + >>> import mars.dataframe as md + >>> df_single_level_cols = md.DataFrame([[0, 1], [2, 3]], + ... index=['cat', 'dog'], + ... columns=['weight', 'height']) + + Stacking a dataframe with a single level column axis returns a Series: + + >>> df_single_level_cols.execute() + weight height + cat 0 1 + dog 2 3 + >>> df_single_level_cols.stack().execute() + cat weight 0 + height 1 + dog weight 2 + height 3 + dtype: int64 + + **Multi level columns: simple case** + + >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('weight', 'pounds')]) + >>> df_multi_level_cols1 = md.DataFrame([[1, 2], [2, 4]], + ... index=['cat', 'dog'], + ... columns=multicol1) + + Stacking a dataframe with a multi-level column axis: + + >>> df_multi_level_cols1.execute() + weight + kg pounds + cat 1 2 + dog 2 4 + >>> df_multi_level_cols1.stack().execute() + weight + cat kg 1 + pounds 2 + dog kg 2 + pounds 4 + + **Missing values** + + >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('height', 'm')]) + >>> df_multi_level_cols2 = md.DataFrame([[1.0, 2.0], [3.0, 4.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + It is common to have missing values when stacking a dataframe + with multi-level columns, as the stacked dataframe typically + has more values than the original dataframe. 
Missing values + are filled with NaNs: + + >>> df_multi_level_cols2.execute() + weight height + kg m + cat 1.0 2.0 + dog 3.0 4.0 + >>> df_multi_level_cols2.stack().execute() + height weight + cat kg NaN 1.0 + m 2.0 NaN + dog kg NaN 3.0 + m 4.0 NaN + + **Prescribing the level(s) to be stacked** + + The first parameter controls which level or levels are stacked: + + >>> df_multi_level_cols2.stack(0).execute() + kg m + cat height NaN 2.0 + weight 1.0 NaN + dog height NaN 4.0 + weight 3.0 NaN + >>> df_multi_level_cols2.stack([0, 1]).execute() + cat height m 2.0 + weight kg 1.0 + dog height m 4.0 + weight kg 3.0 + dtype: float64 + + **Dropping missing values** + + >>> df_multi_level_cols3 = md.DataFrame([[None, 1.0], [2.0, 3.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + Note that rows where all values are missing are dropped by + default but this behaviour can be controlled via the dropna + keyword parameter: + + >>> df_multi_level_cols3.execute() + weight height + kg m + cat NaN 1.0 + dog 2.0 3.0 + >>> df_multi_level_cols3.stack(dropna=False).execute() + height weight + cat kg NaN NaN + m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + >>> df_multi_level_cols3.stack(dropna=True).execute() + height weight + cat m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + """ + op = DataFrameStack(input_df=df, level=level, dropna=dropna) + return op(df) diff --git a/python/xorbits/_mars/dataframe/base/standardize_range_index.py b/python/xorbits/_mars/dataframe/base/standardize_range_index.py new file mode 100644 index 000000000..d253e0c76 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/standardize_range_index.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, Int32Field, ListField +from ...utils import lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin + +cudf = lazy_import("cudf") + + +class ChunkStandardizeRangeIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.STANDARDIZE_RANGE_INDEX + + axis = Int32Field("axis") + prev_shapes = ListField("prev_shapes", FieldTypes.tuple) + + @classmethod + def execute(cls, ctx, op: "ChunkStandardizeRangeIndex"): + xdf = cudf if op.gpu else pd + in_data = ctx[op.inputs[0].key].copy() + index_start = sum([shape[op.axis] for shape in op.prev_shapes]) + if op.axis == 0: + in_data.index = xdf.RangeIndex(index_start, index_start + len(in_data)) + else: + in_data.columns = xdf.RangeIndex( + index_start, index_start + in_data.shape[1] + ) + ctx[op.outputs[0].key] = in_data diff --git a/python/xorbits/_mars/dataframe/base/string_.py b/python/xorbits/_mars/dataframe/base/string_.py new file mode 100644 index 000000000..f5dac2c11 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/string_.py @@ -0,0 +1,418 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...serialization.serializables import DictField, KeyField, StringField, TupleField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE +from ...utils import has_unknown_shape +from ..align import align_series_series +from ..core import SERIES_TYPE +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_series, infer_index_value, parse_index + + +class SeriesStringMethod(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.STRING_METHOD + + _input = KeyField("input") + _method = StringField("method") + _method_args = TupleField("method_args") + _method_kwargs = DictField("method_kwargs") + + def __init__( + self, method=None, method_args=None, method_kwargs=None, output_types=None, **kw + ): + super().__init__( + _method=method, + _method_args=method_args, + _method_kwargs=method_kwargs, + _output_types=output_types, + **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def method(self): + return self._method + + @property + def method_args(self): + return self._method_args + + @property + def method_kwargs(self): + return self._method_kwargs + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) == 2: + # for method cat + self._method_kwargs["others"] = self._inputs[1] + + def __call__(self, inp): + return _string_method_to_handlers[self._method].call(self, inp) + + @classmethod + def tile(cls, op): + tiled = _string_method_to_handlers[op.method].tile(op) + if inspect.isgenerator(tiled): + return (yield from tiled) + else: + return tiled + + @classmethod + def execute(cls, ctx, op): + return _string_method_to_handlers[op.method].execute(ctx, op) + + +class SeriesStringMethodBaseHandler: + @classmethod + def call(cls, op, inp): + empty_series = build_empty_series(inp.dtype) + dtype = getattr(empty_series.str, op.method)( + *op.method_args, **op.method_kwargs + ).dtype + return op.new_series( + [inp], + shape=inp.shape, + dtype=dtype, + index_value=inp.index_value, + name=inp.name, + ) + + @classmethod + def tile(cls, op): + out = op.outputs[0] + out_chunks = [] + for series_chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [series_chunk], + shape=series_chunk.shape, + dtype=out.dtype, + index=series_chunk.index, + index_value=series_chunk.index_value, + name=series_chunk.name, + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = op.input.nsplits + new_op = op.copy() + return new_op.new_tileables([op.input], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + 
ctx[op.outputs[0].key] = getattr(inp.str, op.method)(
+            *op.method_args, **op.method_kwargs
+        )
+
+
+class SeriesStringSplitHandler(SeriesStringMethodBaseHandler):
+    @classmethod
+    def call(cls, op, inp):
+        method_kwargs = op.method_kwargs
+        if method_kwargs.get("expand", False) is False:
+            return super().call(op, inp)
+        n = method_kwargs.get("n", -1)
+        # expand=True with n == -1 is not supported
+        if n == -1:  # pragma: no cover
+            raise NotImplementedError("`n` needs to be specified when expand=True")
+
+        op.output_types = [OutputType.dataframe]
+        columns = pd.RangeIndex(n + 1)
+        columns_value = parse_index(columns, store_data=True)
+        dtypes = pd.Series([inp.dtype] * len(columns), index=columns)
+        return op.new_dataframe(
+            [inp],
+            shape=(inp.shape[0], len(columns)),
+            dtypes=dtypes,
+            columns_value=columns_value,
+            index_value=inp.index_value,
+        )
+
+    @classmethod
+    def tile(cls, op):
+        out = op.outputs[0]
+
+        if out.op.output_types[0] == OutputType.series:
+            return super().tile(op)
+
+        out_chunks = []
+        columns = out.columns_value.to_pandas()
+        for series_chunk in op.input.chunks:
+            chunk_op = op.copy().reset_key()
+            out_chunk = chunk_op.new_chunk(
+                [series_chunk],
+                shape=(series_chunk.shape[0], len(columns)),
+                index=(series_chunk.index[0], 0),
+                dtypes=out.dtypes,
+                index_value=series_chunk.index_value,
+                columns_value=out.columns_value,
+            )
+            out_chunks.append(out_chunk)
+
+        params = out.params
+        params["chunks"] = out_chunks
+        params["nsplits"] = (op.input.nsplits[0], (len(columns),))
+        new_op = op.copy()
+        return new_op.new_tileables([op.input], kws=[params])
+
+    @classmethod
+    def execute(cls, ctx, op):
+        inp = ctx[op.input.key]
+        out = op.outputs[0]
+        result = getattr(inp.str, op.method)(*op.method_args, **op.method_kwargs)
+        if result.ndim == 2 and result.shape[1] < out.shape[1]:
+            for i in range(result.shape[1], out.shape[1]):
+                result[i] = None
+        ctx[op.outputs[0].key] = result
+
+
+class SeriesStringCatHandler(SeriesStringMethodBaseHandler):
+    CAT_TYPE_ERROR = (
+        "others must be Series, Index, DataFrame, "
+        "Tensor, np.ndarray or list-like "
+        "(either containing only strings or "
+        "containing only objects of "
+        "type Series/Index/Tensor/np.ndarray[1-dim])"
+    )
+    CAT_LEN_ERROR = (
+        "If `others` contains arrays or lists (or other list-likes without an index), "
+        "these must all be of the same length as the calling Series/Index."
+ ) + + @classmethod + def call(cls, op, inp): + method_kwargs = op.method_kwargs + others = method_kwargs.get("others") + + if others is None: + from ..reduction import build_str_concat_object + + return build_str_concat_object( + inp, + sep=op.method_kwargs.get("sep"), + na_rep=op.method_kwargs.get("na_rep"), + ) + elif isinstance(others, (tuple, list, np.ndarray, TENSOR_TYPE)): + others = astensor(others, dtype=object) + if others.ndim != 1: + raise TypeError(cls.CAT_TYPE_ERROR) + if ( + not np.isnan(inp.shape[0]) + and not np.isnan(others.shape[0]) + and inp.shape[0] != others.shape[0] + ): + raise ValueError(cls.CAT_LEN_ERROR) + inputs = [inp] + if isinstance(others, TENSOR_TYPE): + inputs.append(others) + return op.new_series( + inputs, + shape=inp.shape, + dtype=inp.dtype, + index_value=inp.index_value, + name=inp.name, + ) + elif isinstance(others, (pd.Series, SERIES_TYPE)): + others = asseries(others) + if op.method_kwargs.get("join") != "outer": # pragma: no cover + raise NotImplementedError("only outer join supported for now") + return op.new_series( + [inp, others], + shape=inp.shape, + dtype=inp.dtype, + index_value=infer_index_value(inp.index_value, others.index_value), + name=inp.name, + ) + elif isinstance(others, str) and op.method_kwargs.get("sep") is None: + raise ValueError("Did you mean to supply a `sep` keyword?") + else: + raise TypeError(cls.CAT_TYPE_ERROR) + + @classmethod + def tile(cls, op): + inp = op.input + out = op.outputs[0] + + # aggregation concat resulting in scalars is redirected + assert out.ndim != 0 + + if isinstance(op.inputs[1], TENSOR_TYPE): + if has_unknown_shape(*op.inputs): + yield + # rechunk others as input + others = yield from recursive_tile(op.inputs[1].rechunk(op.input.nsplits)) + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._method_kwargs = op.method_kwargs.copy() + out_chunk = chunk_op.new_chunk( + [c, others.cix[c.index]], + dtype=c.dtype, + index=c.index, + shape=c.shape, + index_value=c.index_value, + name=c.name, + ) + out_chunks.append(out_chunk) + new_op = op.copy() + params = out.params + params["nsplits"] = inp.nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + elif isinstance(op.inputs[1], SERIES_TYPE): + # both series + out_chunks = [] + nsplits, _, left_chunks, right_chunks = align_series_series(*op.inputs) + for left_chunk, right_chunk in zip(left_chunks, right_chunks): + chunk_op = op.copy().reset_key() + chunk_op._method_kwargs = op.method_kwargs.copy() + params = left_chunk.params + params["name"] = out.name + out_chunk = chunk_op.new_chunk([left_chunk, right_chunk], **params) + out_chunks.append(out_chunk) + new_op = op.copy() + params = out.params + params["nsplits"] = nsplits + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[inp.key] for inp in op.inputs] + method_kwargs = op.method_kwargs + + # aggregation concat is redirected and `others` is always defined + assert len(inputs) > 1 + + method_kwargs["others"] = inputs[1] + ctx[op.outputs[0].key] = inputs[0].str.cat(**method_kwargs) + + +class SeriesStringExtractHandler(SeriesStringMethodBaseHandler): + @classmethod + def call(cls, op, inp): + empty_series = build_empty_series( + inp.dtype, index=inp.index_value.to_pandas()[:0] + ) + test_df = getattr(empty_series.str, op.method)( + *op.method_args, **op.method_kwargs + ) + if test_df.ndim == 1: + return op.new_series( + [inp], + 
shape=inp.shape, + dtype=test_df.dtype, + index_value=inp.index_value, + name=inp.name, + ) + else: + op.output_types = [OutputType.dataframe] + if op.method == "extractall": + index_value = parse_index(test_df.index, inp) + shape = (np.nan, test_df.shape[1]) + else: + index_value = inp.index_value + shape = (inp.shape[0], test_df.shape[1]) + return op.new_dataframe( + [inp], + shape=shape, + dtypes=test_df.dtypes, + index_value=index_value, + columns_value=parse_index(test_df.columns, store_data=True), + ) + + @classmethod + def tile(cls, op): + out = op.outputs[0] + out_chunks = [] + for series_chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + if out.ndim == 1: + out_chunk = chunk_op.new_chunk( + [series_chunk], + shape=series_chunk.shape, + index=series_chunk.index, + dtype=out.dtype, + index_value=series_chunk.index_value, + name=out.name, + ) + else: + if op.method == "extract": + index_value = series_chunk.index_value + shape = (series_chunk.shape[0], out.shape[1]) + else: + index_value = parse_index( + out.index_value.to_pandas()[:0], series_chunk + ) + shape = (np.nan, out.shape[1]) + out_chunk = chunk_op.new_chunk( + [series_chunk], + shape=shape, + index=(series_chunk.index[0], 0), + dtypes=out.dtypes, + index_value=index_value, + columns_value=out.columns_value, + ) + out_chunks.append(out_chunk) + + out = op.outputs[0] + params = out.params + params["chunks"] = out_chunks + if out.ndim == 1: + params["nsplits"] = op.input.nsplits + elif op.method == "extract": + params["nsplits"] = (op.input.nsplits[0], (out.shape[1],)) + else: + params["nsplits"] = ((np.nan,) * len(op.input.nsplits[0]), (out.shape[1],)) + new_op = op.copy() + return new_op.new_tileables([op.input], kws=[params]) + + +_string_method_to_handlers = {} +_not_implements = ["get_dummies"] +# start to register handlers for string methods +# register special methods first +_string_method_to_handlers["split"] = SeriesStringSplitHandler +_string_method_to_handlers["rsplit"] = SeriesStringSplitHandler +_string_method_to_handlers["cat"] = SeriesStringCatHandler +_string_method_to_handlers["extract"] = SeriesStringExtractHandler +_string_method_to_handlers["extractall"] = SeriesStringExtractHandler +# then come to the normal methods +for method in dir(pd.Series.str): + if method.startswith("_") and method != "__getitem__": + continue + if method in _not_implements: + continue + if method in _string_method_to_handlers: + continue + _string_method_to_handlers[method] = SeriesStringMethodBaseHandler diff --git a/python/xorbits/_mars/dataframe/base/tests/__init__.py b/python/xorbits/_mars/dataframe/base/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py new file mode 100644 index 000000000..7c2aeed8d --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/test_apply_execution.py @@ -0,0 +1,284 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pandas as pd + +import pytest + +from .... import dataframe as md +from ....dataframe.core import DataFrame, DATAFRAME_OR_SERIES_TYPE +from ....dataframe.fetch.core import DataFrameFetch + + +def test_dataframe_apply_execution(setup): + df = pd.DataFrame({"col": [1, 2, 3, 4]}) + mdf = md.DataFrame(df) + + apply_func = lambda x: 20 if x[0] else 10 + with pytest.raises(TypeError): + mdf.apply(apply_func) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (4,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (1,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=0)) + + apply_func = lambda x: x + 1 + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 1) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 1) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + apply_func = lambda x: sum(x) + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (4,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (1,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=0)) + + df = pd.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8]}) + mdf = md.DataFrame(df) + apply_func = lambda x: sum(x) / len(x) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert 
res.data_params["dtype"] == "float64" + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (4,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert res.data_params["dtype"] == "float64" + assert not ("dtypes" in res.data_params) + assert res.data_params["shape"] == (2,) + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=0)) + + apply_func = lambda x: pd.Series([1, 2]) + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["shape"] == (2, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["shape"] == (4, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + apply_func = lambda x: [1, 2] + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert res.data_params["shape"] == (2, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "series" + assert res.data_params["shape"] == (4,) + assert res.data_params["dtype"] == "object" + pd.testing.assert_series_equal(res.fetch(), df.apply(apply_func, axis=1)) + + apply_func = lambda x: pd.Series([1, 2, 3.0], index=["c1", "c2", "c3"]) + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert res.data_params["shape"] == (3, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert res.data_params["shape"] == (4, 3) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + apply_func = lambda x: [1, 2, 3] + res = mdf.apply( + apply_func, output_type="df_or_series", axis=1, result_type="expand" + ).execute() + expected = df.apply(apply_func, axis=1, result_type="expand") + pd.testing.assert_frame_equal(res.fetch(), expected) + + res = mdf.apply( + apply_func, output_type="df_or_series", axis=1, result_type="reduce" + ).execute() + expected = df.apply(apply_func, axis=1, result_type="reduce") + pd.testing.assert_series_equal(res.fetch(), expected) + + apply_func = lambda x: [1, 2] + res = mdf.apply( + apply_func, output_type="df_or_series", axis=1, result_type="broadcast" + ).execute() + expected = df.apply(apply_func, axis=1, result_type="broadcast") + pd.testing.assert_frame_equal(res.fetch(), expected) + + +def test_apply_with_skip_infer(setup): + df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": list("abcd")}) + mdf = md.DataFrame(df, chunk_size=2) + + def apply_func(series): + if series[1] not in "abcd": + # make it fail when inferring + raise TypeError + else: + return 1 + + with pytest.raises(TypeError): + mdf.apply(apply_func, axis=1) + + res = mdf.apply(apply_func, axis=1, skip_infer=True).execute() + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + pd.testing.assert_series_equal(res.fetch(), pd.Series([1] * 4)) + + s = pd.Series([1, 2, 3, 4]) + ms = 
md.Series(s, chunk_size=2) + + apply_func = lambda x: pd.Series([1, 2]) + res = ms.apply(apply_func, skip_infer=True).execute() + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + pd.testing.assert_frame_equal(res.fetch(), pd.DataFrame([[1, 2]] * 4)) + + +def test_series_apply_execution(setup): + s = pd.Series([1, 2, 3, 4]) + ms = md.Series(s) + + apply_func = lambda x: x + 1 + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "series" + assert res.data_params["shape"] == (4,) + assert res.data_params["dtype"] == "int64" + pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func)) + + apply_func = lambda x: [1, 2] + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "series" + assert res.data_params["shape"] == (4,) + assert res.data_params["dtype"] == "object" + pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func)) + + apply_func = lambda x: pd.Series([1, 2, 3]) + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["shape"] == (4, 3) + pd.testing.assert_frame_equal(res.fetch(), s.apply(apply_func)) + + def subtract_custom_value(x, custom_value): + return x - custom_value + + apply_func = subtract_custom_value + res = ms.apply( + apply_func, args=(5,), convert_dtype=False, output_type="df_or_series" + ).execute() + assert res.data_params["dtype"] == "object" + pd.testing.assert_series_equal( + res.fetch(), s.apply(apply_func, args=(5,), convert_dtype=False) + ) + + res = ms.apply( + apply_func, args=(5,), convert_dtype=True, output_type="df_or_series" + ).execute() + assert res.dtype == "int64" + assert res.shape == (4,) + with pytest.raises(AttributeError): + _ = res.dtypes + pd.testing.assert_series_equal( + res.fetch(), s.apply(apply_func, args=(5,), convert_dtype=True) + ) + + +def test_apply_execution_with_multi_chunks(setup): + df = pd.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8]}) + mdf = md.DataFrame(df, chunk_size=5) + apply_func = np.sqrt + + res = mdf.apply(apply_func, output_type="df_or_series", axis=0).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["dtypes"]["c1"] == np.dtype("float") + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=0)) + + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).execute() + assert res.data_type == "dataframe" + assert "dtypes" in res.data_params + assert res.data_params["dtypes"]["c2"] == np.dtype("float") + assert not ("dtype" in res.data_params) + assert res.data_params["shape"] == (4, 2) + pd.testing.assert_frame_equal(res.fetch(), df.apply(apply_func, axis=1)) + + s = pd.Series([1, 2, 3, 4]) + ms = md.Series(s, chunk_size=4) + + res = ms.apply(apply_func, output_type="df_or_series").execute() + assert res.data_type == "series" + assert "dtype" in res.data_params + assert res.data_params["dtype"] == "float64" + pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func)) + + +def test_apply_ensure_data(setup): + df = pd.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8]}) + mdf = md.DataFrame(df, chunk_size=3) + apply_func = np.sqrt + + r = mdf.apply(apply_func, output_type="df_or_series") + res = r.ensure_data() + assert isinstance(res, DataFrame) + assert isinstance(res.op, DataFrameFetch) + pd.testing.assert_frame_equal(res.execute().fetch(), 
df.apply(apply_func)) + pd.testing.assert_frame_equal((res + 1).execute().fetch(), df.apply(apply_func) + 1) + pd.testing.assert_frame_equal((res * 3).execute().fetch(), df.apply(apply_func) * 3) + + r = res.groupby("c1").max() + expected = df.apply(apply_func).groupby("c1").max() + pd.testing.assert_frame_equal(r.execute().fetch(), expected) + + apply_func = np.mean + res = mdf.apply(apply_func, output_type="df_or_series", axis=1).ensure_data() + expected = df.apply(apply_func, axis=1) + pd.testing.assert_series_equal(res.execute().fetch(), expected) + + res = res.to_frame(name="foo").groupby("foo")[["foo"]].max().execute() + expected = expected.to_frame(name="foo").groupby("foo")[["foo"]].max() + pd.testing.assert_frame_equal(res.fetch(), expected) diff --git a/python/xorbits/_mars/dataframe/base/tests/test_base.py b/python/xorbits/_mars/dataframe/base/tests/test_base.py new file mode 100644 index 000000000..aa6eae4fe --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/test_base.py @@ -0,0 +1,1106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Union + +import numpy as np +import pandas as pd +import pytest + +from .... import opcodes +from ....config import option_context, options +from ....core import OutputType, Tileable, tile +from ....core.graph import ( + ChunkGraphBuilder, + TileableGraph, + TileableGraphBuilder, + TileContext, +) +from ....core.operand import OperandStage +from ....tensor.core import TENSOR_TYPE +from ... import cut +from ... import eval as mars_eval +from ... import get_dummies, to_numeric +from ...core import ( + CATEGORICAL_CHUNK_TYPE, + CATEGORICAL_TYPE, + DATAFRAME_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + DataFrameData, + SeriesData, +) +from ...datasource.dataframe import from_pandas as from_pandas_df +from ...datasource.index import from_pandas as from_pandas_index +from ...datasource.series import from_pandas as from_pandas_series +from .. 
import astype, to_cpu, to_gpu + + +def _get_df_after_tile( + tileables: List[Tileable], +) -> List[Union[DataFrameData, SeriesData]]: + graph = TileableGraph(tileables) + next(TileableGraphBuilder(graph).build()) + context = TileContext() + chunk_graph_builder = ChunkGraphBuilder( + graph, fuse_enabled=False, tile_context=context + ) + chunk_graph_builder = chunk_graph_builder.build() + for _ in chunk_graph_builder: + pass + return [context[df] for df in tileables] + + +def test_to_gpu(): + # test dataframe + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + df = from_pandas_df(data) + cdf = to_gpu(df) + + assert df.index_value == cdf.index_value + assert df.columns_value == cdf.columns_value + assert cdf.op.gpu is True + pd.testing.assert_series_equal(df.dtypes, cdf.dtypes) + + df, cdf = _get_df_after_tile([df.data, cdf.data]) + + assert df.nsplits == cdf.nsplits + assert df.chunks[0].index_value == cdf.chunks[0].index_value + assert df.chunks[0].columns_value == cdf.chunks[0].columns_value + assert cdf.chunks[0].op.gpu is True + pd.testing.assert_series_equal(df.chunks[0].dtypes, cdf.chunks[0].dtypes) + + assert cdf is to_gpu(cdf) + + # test series + sdata = data.iloc[:, 0] + series = from_pandas_series(sdata) + cseries = to_gpu(series) + + assert series.index_value == cseries.index_value + assert cseries.op.gpu is True + + series, cseries = _get_df_after_tile([series.data, cseries.data]) + + assert series.nsplits == cseries.nsplits + assert series.chunks[0].index_value == cseries.chunks[0].index_value + assert cseries.chunks[0].op.gpu is True + + assert cseries is to_gpu(cseries) + + +def test_to_cpu(): + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + df = from_pandas_df(data) + cdf = to_gpu(df) + df2 = to_cpu(cdf) + + assert df.index_value == df2.index_value + assert df.columns_value == df2.columns_value + assert df2.op.gpu is False + pd.testing.assert_series_equal(df.dtypes, df2.dtypes) + + df, df2 = _get_df_after_tile([df.data, df2.data]) + + assert df.nsplits == df2.nsplits + assert df.chunks[0].index_value == df2.chunks[0].index_value + assert df.chunks[0].columns_value == df2.chunks[0].columns_value + assert df2.chunks[0].op.gpu is False + pd.testing.assert_series_equal(df.chunks[0].dtypes, df2.chunks[0].dtypes) + + assert df2 is to_cpu(df2) + + +def test_rechunk(): + from ...merge.concat import DataFrameConcat + + raw = pd.DataFrame(np.random.rand(10, 10)) + df = from_pandas_df(raw, chunk_size=3) + df2 = tile(df.rechunk(4)) + + assert df2.shape == (10, 10) + assert len(df2.chunks) == 9 + + assert df2.chunks[0].shape == (4, 4) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(4) + ) + pd.testing.assert_index_equal( + df2.chunks[0].columns_value.to_pandas(), pd.RangeIndex(4) + ) + pd.testing.assert_series_equal(df2.chunks[0].dtypes, raw.dtypes[:4]) + + assert df2.chunks[2].shape == (4, 2) + pd.testing.assert_index_equal( + df2.chunks[2].index_value.to_pandas(), pd.RangeIndex(4) + ) + pd.testing.assert_index_equal( + df2.chunks[2].columns_value.to_pandas(), pd.RangeIndex(8, 10) + ) + pd.testing.assert_series_equal(df2.chunks[2].dtypes, raw.dtypes[-2:]) + + assert df2.chunks[-1].shape == (2, 2) + pd.testing.assert_index_equal( + df2.chunks[-1].index_value.to_pandas(), pd.RangeIndex(8, 10) + ) + pd.testing.assert_index_equal( + 
df2.chunks[-1].columns_value.to_pandas(), pd.RangeIndex(8, 10) + ) + pd.testing.assert_series_equal(df2.chunks[-1].dtypes, raw.dtypes[-2:]) + + for c in df2.chunks: + assert c.shape[1] == len(c.dtypes) + assert len(c.columns_value.to_pandas()) == len(c.dtypes) + + columns = [np.random.bytes(10) for _ in range(10)] + index = np.random.randint(-100, 100, size=(4,)) + raw = pd.DataFrame(np.random.rand(4, 10), index=index, columns=columns) + df = from_pandas_df(raw, chunk_size=3) + df2 = tile(df.rechunk(6)) + + assert df2.shape == (4, 10) + assert len(df2.chunks) == 2 + + assert df2.chunks[0].shape == (4, 6) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), df.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.chunks[0].columns_value.to_pandas(), pd.Index(columns[:6]) + ) + pd.testing.assert_series_equal(df2.chunks[0].dtypes, raw.dtypes[:6]) + + assert df2.chunks[1].shape == (4, 4) + pd.testing.assert_index_equal( + df2.chunks[1].index_value.to_pandas(), df.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.chunks[1].columns_value.to_pandas(), pd.Index(columns[6:]) + ) + pd.testing.assert_series_equal(df2.chunks[1].dtypes, raw.dtypes[-4:]) + + for c in df2.chunks: + assert c.shape[1] == len(c.dtypes) + assert len(c.columns_value.to_pandas()) == len(c.dtypes) + + # test Series rechunk + series = from_pandas_series(pd.Series(np.random.rand(10)), chunk_size=3) + series2 = tile(series.rechunk(4)) + + assert series2.shape == (10,) + assert len(series2.chunks) == 3 + pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10)) + + assert series2.chunk_shape == (3,) + assert series2.nsplits == ((4, 4, 2),) + assert series2.chunks[0].shape == (4,) + pd.testing.assert_index_equal( + series2.chunks[0].index_value.to_pandas(), pd.RangeIndex(4) + ) + assert series2.chunks[1].shape == (4,) + pd.testing.assert_index_equal( + series2.chunks[1].index_value.to_pandas(), pd.RangeIndex(4, 8) + ) + assert series2.chunks[2].shape == (2,) + pd.testing.assert_index_equal( + series2.chunks[2].index_value.to_pandas(), pd.RangeIndex(8, 10) + ) + + series2 = tile(series.rechunk(1)) + + assert series2.shape == (10,) + assert len(series2.chunks) == 10 + pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10)) + assert not any(isinstance(c.op, DataFrameConcat) for c in series2.chunks) + + assert series2.chunk_shape == (10,) + assert series2.nsplits == ((1,) * 10,) + assert series2.chunks[0].shape == (1,) + pd.testing.assert_index_equal( + series2.chunks[0].index_value.to_pandas(), pd.RangeIndex(1) + ) + + # no need to rechunk + series2 = tile(series.rechunk(3)) + series = tile(series) + assert series2.chunk_shape == series.chunk_shape + assert series2.nsplits == series.nsplits + + +def test_dataframe_apply(): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + df = from_pandas_df(df_raw, chunk_size=5) + + def df_func_with_err(v): + assert len(v) > 2 + return v.sort_values() + + def df_series_func_with_err(v): + assert len(v) > 2 + return 0 + + with pytest.raises(TypeError): + df.apply(df_func_with_err) + + r = df.apply(df_func_with_err, output_type="dataframe", dtypes=df_raw.dtypes) + assert r.shape == (np.nan, df.shape[-1]) + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.dataframe + assert r.op.elementwise is False + + 
r = df.apply( + df_series_func_with_err, output_type="series", dtype=object, name="output" + ) + assert r.dtype == np.dtype("O") + assert r.shape == (df.shape[-1],) + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.op.elementwise is False + + r = df.apply("ffill") + assert r.op._op_type_ == opcodes.FILL_NA + + r = tile(df.apply(np.sqrt)) + assert all(v == np.dtype("float64") for v in r.dtypes) is True + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.dataframe + assert r.op.elementwise is True + + r = tile(df.apply(lambda x: pd.Series([1, 2]))) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, df.shape[1]) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(np.sum, axis="index")) + assert np.dtype("int64") == r.dtype + assert r.shape == (df.shape[1],) + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[0],) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(np.sum, axis="columns")) + assert np.dtype("int64") == r.dtype + assert r.shape == (df.shape[0],) + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[1],) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: [1, 2], axis=1, result_type="expand")) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: list(range(10)), axis=1, result_type="reduce")) + assert np.dtype("object") == r.dtype + assert r.shape == (df.shape[0],) + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[1],) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert r.op.elementwise is False + + r = tile(df.apply(lambda x: list(range(10)), axis=1, result_type="broadcast")) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + assert 
r.op.elementwise is False + finally: + options.chunk_store_limit = old_chunk_store_limit + + raw = pd.DataFrame({"a": [np.array([1, 2, 3]), np.array([4, 5, 6])]}) + df = from_pandas_df(raw) + df2 = df.apply( + lambda x: x["a"].astype(pd.Series), + axis=1, + output_type="dataframe", + dtypes=pd.Series([np.dtype(float)] * 3), + ) + assert df2.ndim == 2 + + +def test_series_apply(): + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = from_pandas_series(s_raw, chunk_size=5) + + r = tile(series.apply("add", args=(1,))) + assert r.op._op_type_ == opcodes.ADD + + r = tile(series.apply(np.sqrt)) + assert np.dtype("float64") == r.dtype + assert r.shape == series.shape + assert r.index_value is series.index_value + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + + r = tile(series.apply("sqrt")) + assert np.dtype("float64") == r.dtype + assert r.shape == series.shape + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + + r = tile(series.apply(lambda x: [x, x + 1], convert_dtype=False)) + assert np.dtype("object") == r.dtype + assert r.shape == series.shape + assert r.op._op_type_ == opcodes.APPLY + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + + s_raw2 = pd.Series([np.array([1, 2, 3]), np.array([4, 5, 6])]) + series = from_pandas_series(s_raw2) + + r = series.apply(np.sum) + assert r.dtype == np.dtype(object) + + r = series.apply(lambda x: pd.Series([1]), output_type="dataframe") + expected = s_raw2.apply(lambda x: pd.Series([1])) + pd.testing.assert_series_equal(r.dtypes, expected.dtypes) + + dtypes = pd.Series([np.dtype(float)] * 3) + r = series.apply(pd.Series, output_type="dataframe", dtypes=dtypes) + assert r.ndim == 2 + pd.testing.assert_series_equal(r.dtypes, dtypes) + assert r.shape == (2, 3) + + def apply_with_error(_): + raise ValueError + + r = series.apply(apply_with_error, output_type="dataframe", dtypes=dtypes) + assert r.ndim == 2 + + r = series.apply( + pd.Series, output_type="dataframe", dtypes=dtypes, index=pd.RangeIndex(2) + ) + assert r.ndim == 2 + pd.testing.assert_series_equal(r.dtypes, dtypes) + assert r.shape == (2, 3) + + with pytest.raises(AttributeError, match="abc"): + series.apply("abc") + + with pytest.raises(TypeError): + # dtypes not provided + series.apply(lambda x: x.tolist(), output_type="dataframe") + + +def test_transform(): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = from_pandas_df(df_raw, chunk_size=5) + + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + series = from_pandas_series(s_raw, chunk_size=5) + + def rename_fn(f, new_name): + f.__name__ = new_name + return f + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + # DATAFRAME CASES + + # test transform with infer failure + def transform_df_with_err(v): + assert len(v) > 2 + return v.sort_values() + + with pytest.raises(TypeError): + df.transform(transform_df_with_err) + + r = tile(df.transform(transform_df_with_err, dtypes=df_raw.dtypes)) + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert 
r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 20 // df.shape[0]) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + # test transform scenarios on data frames + r = tile(df.transform(lambda x: list(range(len(x))))) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 20 // df.shape[0]) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(lambda x: list(range(len(x))), axis=1)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == df.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (20 // df.shape[1], df.shape[1]) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(["cumsum", "cummax", lambda x: x + 1])) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], df.shape[1] * 3) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 20 // df.shape[0] * 3) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile( + df.transform( + {"A": "cumsum", "D": ["cumsum", "cummax"], "F": lambda x: x + 1} + ) + ) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], 4) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (df.shape[0], 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + # test agg scenarios on series + r = tile(df.transform(lambda x: x.iloc[:-1], _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, df.shape[1]) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (df.shape[0], np.nan) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (2, np.nan) + assert r.chunks[0].inputs[0].shape[1] == df_raw.shape[1] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + fn_list = [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ] + r = tile(df.transform(fn_list, _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, df.shape[1] * 2) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 2) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert 
r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + r = tile(df.transform(lambda x: x.sum(), _call_agg=True)) + assert r.dtype == np.dtype("int64") + assert r.shape == (df.shape[1],) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (20 // df.shape[0],) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + fn_dict = { + "A": rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + "D": [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ], + "F": lambda x: x.iloc[:-1].reset_index(drop=True), + } + r = tile(df.transform(fn_dict, _call_agg=True)) + assert all(v == np.dtype("int64") for v in r.dtypes) is True + assert r.shape == (np.nan, 4) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert r.chunks[0].shape == (np.nan, 1) + assert r.chunks[0].inputs[0].shape[0] == df_raw.shape[0] + assert r.chunks[0].inputs[0].op._op_type_ == opcodes.CONCATENATE + + # SERIES CASES + # test transform scenarios on series + r = tile(series.transform(lambda x: x + 1)) + assert np.dtype("int64") == r.dtype + assert r.shape == series.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert r.chunks[0].shape == (5,) + assert r.chunks[0].inputs[0].shape == (5,) + finally: + options.chunk_store_limit = old_chunk_store_limit + + +def test_string_method(): + s = pd.Series(["a", "b", "c"], name="s") + series = from_pandas_series(s, chunk_size=2) + + with pytest.raises(AttributeError): + _ = series.str.non_exist + + r = series.str.contains("c") + assert r.dtype == np.bool_ + assert r.name == s.name + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + assert r.shape == s.shape + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.bool_ + assert c.name == s.name + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + assert c.shape == (2,) if i == 0 else (1,) + + r = series.str.split(",", expand=True, n=1) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (3, 2) + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(2)) + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i, 0) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(2)) + assert c.shape == (2, 2) if i == 0 else (1, 2) + + with pytest.raises(TypeError): + _ = series.str.cat([["1", "2"]]) + + with pytest.raises(ValueError): + _ = series.str.cat(["1", "2"]) + + with pytest.raises(ValueError): + _ = series.str.cat(",") + + with pytest.raises(TypeError): + _ = series.str.cat({"1", "2", "3"}) + + r = series.str.cat(sep=",") + assert r.op.output_types[0] == OutputType.scalar + assert r.dtype == s.dtype + + r = tile(r) + assert len(r.chunks) == 1 + assert r.chunks[0].op.output_types[0] == OutputType.scalar + assert r.chunks[0].dtype == s.dtype + + r = series.str.extract(r"[ab](\d)", expand=False) + assert r.op.output_types[0] == OutputType.series + assert r.dtype == s.dtype + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == s.dtype + assert 
c.name == s.name + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + assert c.shape == (2,) if i == 0 else (1,) + + r = series.str.extract(r"[ab](\d)", expand=True) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (3, 1) + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(1)) + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i, 0) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(1)) + assert c.shape == (2, 1) if i == 0 else (1, 1) + + assert "lstrip" in dir(series.str) + + +def test_datetime_method(): + s = pd.Series( + [pd.Timestamp("2020-1-1"), pd.Timestamp("2020-2-1"), pd.Timestamp("2020-3-1")], + name="ss", + ) + series = from_pandas_series(s, chunk_size=2) + + r = series.dt.year + assert r.dtype == s.dt.year.dtype + pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index) + assert r.shape == s.shape + assert r.op.output_types[0] == OutputType.series + assert r.name == s.dt.year.name + + r = tile(r) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == s.dt.year.dtype + assert c.op.output_types[0] == OutputType.series + assert r.name == s.dt.year.name + pd.testing.assert_index_equal( + c.index_value.to_pandas(), s.index[i * 2 : (i + 1) * 2] + ) + assert c.shape == (2,) if i == 0 else (1,) + + with pytest.raises(AttributeError): + _ = from_pandas_series(pd.Series([1])).dt + with pytest.raises(AttributeError): + _ = series.dt.non_exist + + assert "ceil" in dir(series.dt) + + +def test_series_isin(): + # one chunk in multiple chunks + a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=10) + b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=2) + + r = tile(a.isin(b)) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.dtype("bool") + assert c.shape == (10,) + assert len(c.op.inputs) == 2 + assert c.op.output_types[0] == OutputType.series + assert c.op.inputs[0].index == (i,) + assert c.op.inputs[0].shape == (10,) + assert c.op.inputs[1].index == (0,) + assert c.op.inputs[1].shape == (10,) + + # multiple chunk in one chunks + a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=5) + b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=4) + + r = tile(a.isin(b)) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.dtype("bool") + assert c.shape == (5,) + assert len(c.op.inputs) == 2 + assert c.op.output_types[0] == OutputType.series + assert c.op.inputs[0].index == (i,) + assert c.op.inputs[0].shape == (5,) + assert c.op.inputs[1].index == (0,) + assert c.op.inputs[1].shape == (4,) + + # multiple chunk in multiple chunks + a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=5) + b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=2) + + r = tile(a.isin(b)) + for i, c in enumerate(r.chunks): + assert c.index == (i,) + assert c.dtype == np.dtype("bool") + assert c.shape == (5,) + assert len(c.op.inputs) == 2 + assert c.op.output_types[0] == OutputType.series + assert c.op.inputs[0].index == (i,) + assert c.op.inputs[0].shape == (5,) + assert c.op.inputs[1].index == (i,) + assert c.op.inputs[1].shape == (5,) + + with pytest.raises(TypeError): + _ = a.isin("sth") + + with pytest.raises(TypeError): + _ = 
a.to_frame().isin("sth") + + +def test_cut(): + s = from_pandas_series(pd.Series([1.0, 2.0, 3.0, 4.0]), chunk_size=2) + + with pytest.raises(ValueError): + _ = cut(s, -1) + + with pytest.raises(ValueError): + _ = cut([[1, 2], [3, 4]], 3) + + with pytest.raises(ValueError): + _ = cut([], 3) + + r, b = cut(s, [1.5, 2.5], retbins=True) + assert isinstance(r, SERIES_TYPE) + assert isinstance(b, TENSOR_TYPE) + + r = tile(r) + + assert len(r.chunks) == 2 + for c in r.chunks: + assert isinstance(c, SERIES_CHUNK_TYPE) + assert c.shape == (2,) + + r = cut(s.to_tensor(), [1.5, 2.5]) + assert isinstance(r, CATEGORICAL_TYPE) + assert len(r) == len(s) + assert "Categorical" in repr(r) + + r = tile(r) + + assert len(r.chunks) == 2 + for c in r.chunks: + assert isinstance(c, CATEGORICAL_CHUNK_TYPE) + assert c.shape == (2,) + assert c.ndim == 1 + + r = cut([0, 1, 1, 2], bins=4, labels=False) + assert isinstance(r, TENSOR_TYPE) + e = pd.cut([0, 1, 1, 2], bins=4, labels=False) + assert r.dtype == e.dtype + + +def test_transpose(): + s = pd.DataFrame({"a": [1, 2, 3], "b": ["5", "-6", "7"], "c": [1, 2, 3]}) + df = from_pandas_df(s, chunk_size=2) + + r = tile(df.transpose()) + assert len(r.chunks) == 4 + assert isinstance(r, DATAFRAME_TYPE) + + r = tile(df.T) + assert len(r.chunks) == 4 + assert isinstance(r, DATAFRAME_TYPE) + + +def test_to_numeric(): + raw = pd.DataFrame({"a": [1.0, 2, 3, -3]}) + df = from_pandas_df(raw, chunk_size=2) + + with pytest.raises(ValueError): + _ = to_numeric(df) + + with pytest.raises(ValueError): + _ = to_numeric([["1.0", 1]]) + + with pytest.raises(ValueError): + _ = to_numeric([]) + + s = from_pandas_series(pd.Series(["1.0", "2.0", 1, -2]), chunk_size=2) + r = tile(to_numeric(s)) + assert len(r.chunks) == 2 + assert isinstance(r, SERIES_TYPE) + + r = tile(to_numeric(["1.0", "2.0", 1, -2])) + assert isinstance(r, TENSOR_TYPE) + + +def test_astype(): + s = from_pandas_series(pd.Series([1, 2, 1, 2], name="a"), chunk_size=2) + with pytest.raises(KeyError): + astype(s, {"b": "str"}) + + df = from_pandas_df( + pd.DataFrame({"a": [1, 2, 1, 2], "b": ["a", "b", "a", "b"]}), chunk_size=2 + ) + + with pytest.raises(KeyError): + astype(df, {"c": "str", "a": "str"}) + + +def test_get_dummies(): + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + df = from_pandas_df(raw, chunk_size=2) + + with pytest.raises(TypeError): + _ = get_dummies(df, columns="a") + + with pytest.raises(ValueError): + _ = get_dummies(df, prefix=["col1"]) + + with pytest.raises(ValueError): + _ = get_dummies(df, columns=["a"], prefix={"a": "col1", "c": "col2"}) + + with pytest.raises(KeyError): + _ = get_dummies(df, columns=["a", "b"], prefix={"a": "col1", "c": "col2"}) + + r = get_dummies(df) + assert isinstance(r, DATAFRAME_TYPE) + + +def test_drop(): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=8) + + with pytest.raises(KeyError): + df.drop(columns=["c9"]) + with pytest.raises(NotImplementedError): + df.drop(columns=from_pandas_series(pd.Series(["c9"]))) + + r = df.drop(columns=["c1"]) + pd.testing.assert_index_equal(r.index_value.to_pandas(), raw.index) + + tiled = tile(r) + start = 0 + for c in tiled.chunks: + raw_index = raw.index[start : start + c.shape[0]] + start += c.shape[0] + pd.testing.assert_index_equal(raw_index, c.index_value.to_pandas()) + + df = from_pandas_df(raw, 
chunk_size=3) + + columns = ["c2", "c4", "c5", "c6"] + index = [3, 6, 7] + r = df.drop(columns=columns, index=index) + assert isinstance(r, DATAFRAME_TYPE) + + # test series drop + raw = pd.Series(rs.randint(1000, size=(20,))) + series = from_pandas_series(raw, chunk_size=3) + + r = series.drop(index=index) + assert isinstance(r, SERIES_TYPE) + + # test index drop + ser = pd.Series(range(20)) + rs.shuffle(ser) + raw = pd.Index(ser) + + idx = from_pandas_index(raw) + + r = idx.drop(index) + assert isinstance(r, INDEX_TYPE) + + +def test_drop_duplicates(): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)] + ) + raw["c7"] = [f"s{j}" for j in range(20)] + + df = from_pandas_df(raw, chunk_size=10) + with pytest.raises(ValueError): + df.drop_duplicates(method="unknown") + with pytest.raises(KeyError): + df.drop_duplicates(subset="c8") + + # test auto method selection + assert tile(df.drop_duplicates()).chunks[0].op.method == "tree" + # subset size less than chunk_store_limit + assert ( + tile(df.drop_duplicates(subset=["c1", "c3"])).chunks[0].op.method + == "subset_tree" + ) + with option_context({"chunk_store_limit": 5}): + # subset size greater than chunk_store_limit + assert ( + tile(df.drop_duplicates(subset=["c1", "c3"])).chunks[0].op.method == "tree" + ) + assert tile(df.drop_duplicates(subset=["c1", "c7"])).chunks[0].op.method == "tree" + assert tile(df["c7"].drop_duplicates()).chunks[0].op.method == "tree" + + s = df["c7"] + with pytest.raises(ValueError): + s.drop_duplicates(method="unknown") + + +def test_memory_usage(): + dtypes = ["int64", "float64", "complex128", "object", "bool"] + data = dict([(t, np.ones(shape=500).astype(t)) for t in dtypes]) + raw = pd.DataFrame(data) + + df = from_pandas_df(raw, chunk_size=(500, 2)) + r = tile(df.memory_usage()) + + assert isinstance(r, SERIES_TYPE) + assert r.shape == (6,) + assert len(r.chunks) == 3 + assert r.chunks[0].op.stage is None + + df = from_pandas_df(raw, chunk_size=(100, 3)) + r = tile(df.memory_usage(index=True)) + + assert isinstance(r, SERIES_TYPE) + assert r.shape == (6,) + assert len(r.chunks) == 2 + assert r.chunks[0].op.stage == OperandStage.reduce + + r = tile(df.memory_usage(index=False)) + + assert isinstance(r, SERIES_TYPE) + assert r.shape == (5,) + assert len(r.chunks) == 2 + assert r.chunks[0].op.stage == OperandStage.reduce + + raw = pd.Series(np.ones(shape=500).astype("object"), name="s") + + series = from_pandas_series(raw) + r = tile(series.memory_usage()) + + assert isinstance(r, TENSOR_TYPE) + assert r.shape == () + assert len(r.chunks) == 1 + assert r.chunks[0].op.stage is None + + series = from_pandas_series(raw, chunk_size=100) + r = tile(series.memory_usage()) + + assert isinstance(r, TENSOR_TYPE) + assert r.shape == () + assert len(r.chunks) == 1 + assert r.chunks[0].op.stage == OperandStage.reduce + + +def test_shift(): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), + columns=["col" + str(i + 1) for i in range(8)], + index=pd.date_range("2021-1-1", periods=10), + ) + df = from_pandas_df(raw, chunk_size=5) + + df2 = df.shift(1) + df2 = tile(df2) + + for c in df2.chunks: + pd.testing.assert_index_equal(c.dtypes.index, c.columns_value.to_pandas()) + + df2 = df.shift(1, freq="D") + df2 = tile(df2) + + for c in df2.chunks: + pd.testing.assert_index_equal(c.dtypes.index, c.columns_value.to_pandas()) + + +def test_eval_query(): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": 
rs.rand(100), "b": rs.rand(100), "c c": rs.rand(100)}) + df = from_pandas_df(raw, chunk_size=(10, 2)) + + with pytest.raises(NotImplementedError): + mars_eval("df.a * 2", engine="numexpr") + with pytest.raises(NotImplementedError): + mars_eval("df.a * 2", parser="pandas") + with pytest.raises(TypeError): + df.eval(df) + with pytest.raises(SyntaxError): + df.query( + """ + a + b + a + `c c` + """ + ) + with pytest.raises(SyntaxError): + df.eval( + """ + def a(): + return v + a() + """ + ) + with pytest.raises(SyntaxError): + df.eval("a + `c") + with pytest.raises(KeyError): + df.eval("a + c") + with pytest.raises(ValueError): + df.eval("p, q = a + c") + with pytest.raises(ValueError): + df.query("p = a + c") + + +def test_empty(): + # for DataFrame + assert from_pandas_df(pd.DataFrame()).empty == pd.DataFrame().empty + assert from_pandas_df(pd.DataFrame({})).empty == pd.DataFrame({}).empty + assert ( + from_pandas_df(pd.DataFrame({"a": []})).empty == pd.DataFrame({"a": []}).empty + ) + assert ( + from_pandas_df(pd.DataFrame({"a": [1]})).empty == pd.DataFrame({"a": [1]}).empty + ) + assert ( + from_pandas_df(pd.DataFrame({"a": [1], "b": [2]})).empty + == pd.DataFrame({"a": [1], "b": [2]}).empty + ) + assert ( + from_pandas_df(pd.DataFrame(np.empty(shape=(4, 0)))).empty + == pd.DataFrame(np.empty(shape=(4, 0))).empty + ) + + # for Series + assert from_pandas_series(pd.Series()).empty == pd.Series().empty + assert from_pandas_series(pd.Series({})).empty == pd.Series({}).empty + assert from_pandas_series(pd.Series({"a": []})).empty == pd.Series({"a": []}).empty + assert ( + from_pandas_series(pd.Series({"a": [1]})).empty == pd.Series({"a": [1]}).empty + ) + + # Maybe fail due to lazy evaluation + with pytest.raises(ValueError): + a = from_pandas_df(pd.DataFrame(np.random.rand(10, 2))) + assert a[a > 0].empty + with pytest.raises(ValueError): + a = from_pandas_series(pd.Series(np.random.rand(10))) + assert a[a > 0].empty diff --git a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py new file mode 100644 index 000000000..ae94cfec2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py @@ -0,0 +1,2470 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from collections import OrderedDict + +import mars +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from ....config import option_context, options +from ....dataframe import DataFrame, Series +from ....tensor import arange, tensor +from ....tensor.random import rand +from ....tests.core import require_cudf +from ....utils import lazy_import, no_default, pd_release_version +from ... import cut +from ... import eval as mars_eval +from ... 
import get_dummies, qcut +from ...core import DATAFRAME_OR_SERIES_TYPE +from ...datasource.dataframe import from_pandas as from_pandas_df +from ...datasource.index import from_pandas as from_pandas_index +from ...datasource.series import from_pandas as from_pandas_series +from .. import to_cpu, to_gpu +from ..bloom_filter import filter_by_bloom_filter +from ..rebalance import DataFrameRebalance +from ..shift import _enable_no_default, _with_column_freq_bug +from ..to_numeric import to_numeric + +pytestmark = pytest.mark.pd_compat + +cudf = lazy_import("cudf") + +_explode_with_ignore_index = pd_release_version[:2] >= (1, 1) +_interval_range_closed_arg = pd_release_version[:2] >= (1, 5) + + +@require_cudf +def test_to_gpu_execution(setup_gpu): + pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) + df = from_pandas_df(pdf, chunk_size=(13, 21)) + cdf = to_gpu(df) + + res = cdf.execute().fetch() + assert isinstance(res, cudf.DataFrame) + pd.testing.assert_frame_equal(res.to_pandas(), pdf) + + pseries = pdf.iloc[:, 0] + series = from_pandas_series(pseries) + cseries = series.to_gpu() + + res = cseries.execute().fetch() + assert isinstance(res, cudf.Series) + pd.testing.assert_series_equal(res.to_pandas(), pseries) + + +@require_cudf +def test_to_cpu_execution(setup_gpu): + pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) + df = from_pandas_df(pdf, chunk_size=(13, 21)) + cdf = to_gpu(df) + df2 = to_cpu(cdf) + + res = df2.execute().fetch() + assert isinstance(res, pd.DataFrame) + pd.testing.assert_frame_equal(res, pdf) + + pseries = pdf.iloc[:, 0] + series = from_pandas_series(pseries, chunk_size=(13, 21)) + cseries = to_gpu(series) + series2 = to_cpu(cseries) + + res = series2.execute().fetch() + assert isinstance(res, pd.Series) + pd.testing.assert_series_equal(res, pseries) + + +def test_rechunk_execution(setup): + ns = np.random.RandomState(0) + df = pd.DataFrame(ns.rand(100, 10), columns=["a" + str(i) for i in range(10)]) + + # test rechunk after sort + mdf = DataFrame(df, chunk_size=10) + result = mdf.sort_values("a0").rechunk(chunk_size=10).execute().fetch() + expected = df.sort_values("a0") + pd.testing.assert_frame_equal(result, expected) + + data = pd.DataFrame(np.random.rand(8, 10)) + df = from_pandas_df(pd.DataFrame(data), chunk_size=3) + df2 = df.rechunk((3, 4)) + res = df2.execute().fetch() + pd.testing.assert_frame_equal(data, res) + + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + df = from_pandas_df(data) + df2 = df.rechunk(5) + res = df2.execute().fetch() + pd.testing.assert_frame_equal(data, res) + + # test Series rechunk execution. 
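+ # rechunk only re-partitions the tiled chunks; fetching should reproduce the original pandas object for any valid chunk_size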
+ data = pd.Series(np.random.rand(10)) + series = from_pandas_series(data) + series2 = series.rechunk(3) + res = series2.execute().fetch() + pd.testing.assert_series_equal(data, res) + + series2 = series.rechunk(1) + res = series2.execute().fetch() + pd.testing.assert_series_equal(data, res) + + # test index rechunk execution + data = pd.Index(np.random.rand(10)) + index = from_pandas_index(data) + index2 = index.rechunk(3) + res = index2.execute().fetch() + pd.testing.assert_index_equal(data, res) + + index2 = index.rechunk(1) + res = index2.execute().fetch() + pd.testing.assert_index_equal(data, res) + + # test rechunk on mixed typed columns + data = pd.DataFrame({0: [1, 2], 1: [3, 4], "a": [5, 6]}) + df = from_pandas_df(data) + df = df.rechunk((2, 2)).rechunk({1: 3}) + res = df.execute().fetch() + pd.testing.assert_frame_equal(data, res) + + +def test_series_map_execution(setup): + raw = pd.Series(np.arange(10)) + s = from_pandas_series(raw, chunk_size=7) + + with pytest.raises(ValueError): + # cannot infer dtype, the inferred is int, + # but actually it is float + # just due to nan + s.map({5: 10}) + + r = s.map({5: 10}, dtype=float) + result = r.execute().fetch() + expected = raw.map({5: 10}) + pd.testing.assert_series_equal(result, expected) + + # use skip_infer when infer failed + r = s.map({5: 10}, skip_infer=True) + assert r.dtype is None + result = r.execute().fetch() + assert np.issubdtype(r.dtype, np.dtype("float")) + expected = raw.map({5: 10}) + pd.testing.assert_series_equal(result, expected) + + r = s.map({i: 10 + i for i in range(7)}, dtype=float) + result = r.execute().fetch() + expected = raw.map({i: 10 + i for i in range(7)}) + pd.testing.assert_series_equal(result, expected) + + r = s.map({5: 10}, dtype=float, na_action="ignore") + result = r.execute().fetch() + expected = raw.map({5: 10}, na_action="ignore") + pd.testing.assert_series_equal(result, expected) + + # dtype can be inferred + r = s.map({5: 10.0}) + result = r.execute().fetch() + expected = raw.map({5: 10.0}) + pd.testing.assert_series_equal(result, expected) + + r = s.map(lambda x: x + 1, dtype=int) + result = r.execute().fetch() + expected = raw.map(lambda x: x + 1) + pd.testing.assert_series_equal(result, expected) + + def f(x: int) -> float: + return x + 1.0 + + # dtype can be inferred for function + r = s.map(f) + result = r.execute().fetch() + expected = raw.map(lambda x: x + 1.0) + pd.testing.assert_series_equal(result, expected) + + def f(x: int): + return x + 1.0 + + # dtype can be inferred for function + r = s.map(f) + result = r.execute().fetch() + expected = raw.map(lambda x: x + 1.0) + pd.testing.assert_series_equal(result, expected) + + # test arg is a md.Series + raw2 = pd.Series([10], index=[5]) + s2 = from_pandas_series(raw2) + + r = s.map(s2, dtype=float) + result = r.execute().fetch() + expected = raw.map(raw2) + pd.testing.assert_series_equal(result, expected) + + # test arg is a md.Series, and dtype can be inferred + raw2 = pd.Series([10.0], index=[5]) + s2 = from_pandas_series(raw2) + + r = s.map(s2) + result = r.execute().fetch() + expected = raw.map(raw2) + pd.testing.assert_series_equal(result, expected) + + # test str + raw = pd.Series(["a", "b", "c", "d"]) + s = from_pandas_series(raw, chunk_size=2) + + r = s.map({"c": "e"}) + result = r.execute().fetch() + expected = raw.map({"c": "e"}) + pd.testing.assert_series_equal(result, expected) + + # test map index + raw = pd.Index(np.random.rand(7)) + idx = from_pandas_index(pd.Index(raw), chunk_size=2) + r = idx.map(f) + result = 
r.execute().fetch() + expected = raw.map(lambda x: x + 1.0) + pd.testing.assert_index_equal(result, expected) + + +def test_describe_execution(setup): + s_raw = pd.Series(np.random.rand(10)) + + # test one chunk + series = from_pandas_series(s_raw, chunk_size=10) + + r = series.describe() + result = r.execute().fetch() + expected = s_raw.describe() + pd.testing.assert_series_equal(result, expected) + + r = series.describe(percentiles=[]) + result = r.execute().fetch() + expected = s_raw.describe(percentiles=[]) + pd.testing.assert_series_equal(result, expected) + + # test multi chunks + series = from_pandas_series(s_raw, chunk_size=3) + + r = series.describe() + result = r.execute().fetch() + expected = s_raw.describe() + pd.testing.assert_series_equal(result, expected) + + r = series.describe(percentiles=[]) + result = r.execute().fetch() + expected = s_raw.describe(percentiles=[]) + pd.testing.assert_series_equal(result, expected) + + rs = np.random.RandomState(5) + df_raw = pd.DataFrame(rs.rand(10, 4), columns=list("abcd")) + df_raw["e"] = rs.randint(100, size=10) + + # test one chunk + df = from_pandas_df(df_raw, chunk_size=10) + + r = df.describe() + result = r.execute().fetch() + expected = df_raw.describe() + pd.testing.assert_frame_equal(result, expected) + + r = series.describe(percentiles=[], include=np.float64) + result = r.execute().fetch() + expected = s_raw.describe(percentiles=[], include=np.float64) + pd.testing.assert_series_equal(result, expected) + + # test multi chunks + df = from_pandas_df(df_raw, chunk_size=3) + + r = df.describe() + result = r.execute().fetch() + expected = df_raw.describe() + pd.testing.assert_frame_equal(result, expected) + + r = df.describe(percentiles=[], include=np.float64) + result = r.execute().fetch() + expected = df_raw.describe(percentiles=[], include=np.float64) + pd.testing.assert_frame_equal(result, expected) + + # test skip percentiles + r = df.describe(percentiles=False, include=np.float64) + result = r.execute().fetch() + expected = df_raw.describe(percentiles=[], include=np.float64) + expected.drop(["50%"], axis=0, inplace=True) + pd.testing.assert_frame_equal(result, expected) + + with pytest.raises(ValueError): + df.describe(percentiles=[1.1]) + + with pytest.raises(ValueError): + # duplicated values + df.describe(percentiles=[0.3, 0.5, 0.3]) + + # test input dataframe which has unknown shape + df = from_pandas_df(df_raw, chunk_size=3) + df2 = df[df["a"] < 0.5] + r = df2.describe() + + result = r.execute().fetch() + expected = df_raw[df_raw["a"] < 0.5].describe() + pd.testing.assert_frame_equal(result, expected) + + +def test_data_frame_apply_execute(setup): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + df = from_pandas_df(df_raw, chunk_size=5) + + r = df.apply("ffill") + result = r.execute().fetch() + expected = df_raw.apply("ffill") + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(["sum", "max"]) + result = r.execute().fetch() + expected = df_raw.apply(["sum", "max"]) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(["sum", "max"], axis=1) + result = r.execute().fetch() + expected = df_raw.apply(["sum", "max"], axis=1) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(np.sqrt) + result = r.execute().fetch() + expected = df_raw.apply(np.sqrt) + pd.testing.assert_frame_equal(result, expected) + + r = 
df.apply(lambda x: pd.Series([1, 2])) + result = r.execute().fetch() + expected = df_raw.apply(lambda x: pd.Series([1, 2])) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(np.sum, axis="index") + result = r.execute().fetch() + expected = df_raw.apply(np.sum, axis="index") + pd.testing.assert_series_equal(result, expected) + + r = df.apply(np.sum, axis="columns") + result = r.execute().fetch() + expected = df_raw.apply(np.sum, axis="columns") + pd.testing.assert_series_equal(result, expected) + + r = df.apply(lambda x: [1, 2], axis=1) + result = r.execute().fetch() + expected = df_raw.apply(lambda x: [1, 2], axis=1) + pd.testing.assert_series_equal(result, expected) + + r = df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1) + result = r.execute().fetch() + expected = df_raw.apply( + lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1 + ) + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + result = r.execute().fetch() + expected = df_raw.apply(lambda x: [1, 2], axis=1, result_type="expand") + pd.testing.assert_frame_equal(result, expected) + + r = df.apply(lambda x: list(range(10)), axis=1, result_type="reduce") + result = r.execute().fetch() + expected = df_raw.apply(lambda x: list(range(10)), axis=1, result_type="reduce") + pd.testing.assert_series_equal(result, expected) + + r = df.apply(lambda x: list(range(10)), axis=1, result_type="broadcast") + result = r.execute().fetch() + expected = df_raw.apply( + lambda x: list(range(10)), axis=1, result_type="broadcast" + ) + pd.testing.assert_frame_equal(result, expected) + finally: + options.chunk_store_limit = old_chunk_store_limit + + +def test_data_frame_apply_closure_execute(setup): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = from_pandas_df(df_raw, chunk_size=5) + + x = pd.Series([i for i in range(10**4)]) + y = pd.Series([i for i in range(10**4)]) + + def closure(z): + return pd.concat([x, y], ignore_index=True) + + r = df.apply(closure, axis=1) + result = r.execute().fetch() + expected = df_raw.apply(closure, axis=1) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +def test_data_frame_apply_callable_execute(setup, multiplier): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = from_pandas_df(df_raw, chunk_size=5) + + class callable_df: + __slots__ = "x", "__dict__" + + def __init__(self, multiplier: int = 1): + self.x = pd.Series([i for i in range(10**multiplier)]) + self.y = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, pdf): + return pd.concat([self.x, self.y], ignore_index=True) + + cdf_large = callable_df(multiplier=multiplier) + r = df.apply(cdf_large, axis=1) + result = r.execute().fetch() + expected = df_raw.apply(cdf_large, axis=1) + pd.testing.assert_frame_equal(result, expected) + + +def test_series_apply_execute(setup): + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = from_pandas_series(s_raw, chunk_size=5) + + r = series.apply("add", args=(1,)) + result = r.execute().fetch() + expected = s_raw.apply("add", args=(1,)) + pd.testing.assert_series_equal(result, expected) + + r = series.apply(["sum", "max"]) + result = r.execute().fetch() + expected = s_raw.apply(["sum", "max"]) + 
pd.testing.assert_series_equal(result, expected) + + r = series.apply(np.sqrt) + result = r.execute().fetch() + expected = s_raw.apply(np.sqrt) + pd.testing.assert_series_equal(result, expected) + + r = series.apply("sqrt") + result = r.execute().fetch() + expected = s_raw.apply("sqrt") + pd.testing.assert_series_equal(result, expected) + + r = series.apply(lambda x: [x, x + 1], convert_dtype=False) + result = r.execute().fetch() + expected = s_raw.apply(lambda x: [x, x + 1], convert_dtype=False) + pd.testing.assert_series_equal(result, expected) + + s_raw2 = pd.Series([np.array([1, 2, 3]), np.array([4, 5, 6])]) + series = from_pandas_series(s_raw2) + + dtypes = pd.Series([np.dtype(float)] * 3) + r = series.apply(pd.Series, output_type="dataframe", dtypes=dtypes) + result = r.execute().fetch() + expected = s_raw2.apply(pd.Series) + pd.testing.assert_frame_equal(result, expected) + + +def test_series_apply_closure_execute(setup): + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = from_pandas_series(s_raw, chunk_size=5) + + x, y = 1, 2 + + def closure(z): + return [z + x, z + y] + + r = series.apply(closure, convert_dtype=False) + result = r.execute().fetch() + expected = s_raw.apply(closure, convert_dtype=False) + pd.testing.assert_series_equal(result, expected) + + class callable_series: + __slots__ = "x", "__dict__" + + def __init__(self): + self.x = 1 + self.y = 2 + + def __call__(self, z): + return [z + self.x, z + self.y] + + cs = callable_series() + r = series.apply(cs, convert_dtype=False) + result = r.execute().fetch() + expected = s_raw.apply(cs, convert_dtype=False) + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_apply_with_arrow_dtype_execution(setup): + df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + df = from_pandas_df(df1) + df["b"] = df["b"].astype("Arrow[string]") + + r = df.apply(lambda row: str(row[0]) + row[1], axis=1) + result = r.execute().fetch() + expected = df1.apply(lambda row: str(row[0]) + row[1], axis=1) + pd.testing.assert_series_equal(result, expected) + + s1 = df1["b"] + s = from_pandas_series(s1) + s = s.astype("arrow_string") + + r = s.apply(lambda x: x + "_suffix") + result = r.execute().fetch() + expected = s1.apply(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + + +def test_transform_execute(setup): + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + + idx_vals = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idx_vals) + + def rename_fn(f, new_name): + f.__name__ = new_name + return f + + old_chunk_store_limit = options.chunk_store_limit + try: + options.chunk_store_limit = 20 + + # DATAFRAME CASES + df = from_pandas_df(df_raw, chunk_size=5) + + # test transform scenarios on data frames + def f(s): + if s[2] > 0: + return s + else: + return pd.Series([s[2]] * len(s)) + + with pytest.raises(TypeError): + df.transform(f) + r = df.transform(f, skip_infer=True) + result = r.execute().fetch() + expected = df_raw.transform(f) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: list(range(len(x)))) + result = r.execute().fetch() + expected = df_raw.transform(lambda x: list(range(len(x)))) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: list(range(len(x))), axis=1) + result = 
r.execute().fetch() + expected = df_raw.transform(lambda x: list(range(len(x))), axis=1) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(["cumsum", "cummax", lambda x: x + 1]) + result = r.execute().fetch() + expected = df_raw.transform(["cumsum", "cummax", lambda x: x + 1]) + pd.testing.assert_frame_equal(result, expected) + + fn_dict = OrderedDict( + [ + ("A", "cumsum"), + ("D", ["cumsum", "cummax"]), + ("F", lambda x: x + 1), + ] + ) + r = df.transform(fn_dict) + result = r.execute().fetch() + expected = df_raw.transform(fn_dict) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: x.iloc[:-1], _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(lambda x: x.iloc[:-1]) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(lambda x: x.iloc[:-1], axis=1) + pd.testing.assert_frame_equal(result, expected) + + fn_list = [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ] + r = df.transform(fn_list, _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(fn_list) + pd.testing.assert_frame_equal(result, expected) + + r = df.transform(lambda x: x.sum(), _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(lambda x: x.sum()) + pd.testing.assert_series_equal(result, expected) + + fn_dict = OrderedDict( + [ + ("A", rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1")), + ( + "D", + [ + rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), "f1"), + lambda x: x.iloc[:-1].reset_index(drop=True), + ], + ), + ("F", lambda x: x.iloc[:-1].reset_index(drop=True)), + ] + ) + r = df.transform(fn_dict, _call_agg=True) + result = r.execute().fetch() + expected = df_raw.agg(fn_dict) + pd.testing.assert_frame_equal(result, expected) + + # SERIES CASES + series = from_pandas_series(s_raw, chunk_size=5) + + # test transform scenarios on series + r = series.transform(lambda x: x + 1) + result = r.execute().fetch() + expected = s_raw.transform(lambda x: x + 1) + pd.testing.assert_series_equal(result, expected) + + r = series.transform(["cumsum", lambda x: x + 1]) + result = r.execute().fetch() + expected = s_raw.transform(["cumsum", lambda x: x + 1]) + pd.testing.assert_frame_equal(result, expected) + + # test transform on string dtype + df_raw = pd.DataFrame({"col1": ["str"] * 10, "col2": ["string"] * 10}) + df = from_pandas_df(df_raw, chunk_size=3) + + r = df["col1"].transform(lambda x: x + "_suffix") + result = r.execute().fetch() + expected = df_raw["col1"].transform(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + + r = df.transform(lambda x: x + "_suffix") + result = r.execute().fetch() + expected = df_raw.transform(lambda x: x + "_suffix") + pd.testing.assert_frame_equal(result, expected) + + r = df["col2"].transform(lambda x: x + "_suffix", dtype=np.dtype("str")) + result = r.execute().fetch() + expected = df_raw["col2"].transform(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + finally: + options.chunk_store_limit = old_chunk_store_limit + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_transform_with_arrow_dtype_execution(setup): + raw = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + df = from_pandas_df(raw) + df["b"] = df["b"].astype("Arrow[string]") + + r = df.transform({"b": lambda x: x + "_suffix"}) + result = 
r.execute().fetch() + result["b"] = result["b"].to_numpy() + expected = raw.transform({"b": lambda x: x + "_suffix"}) + pd.testing.assert_frame_equal(result, expected) + + s1 = raw["b"] + s = from_pandas_series(s1) + s = s.astype("arrow_string") + + r = s.transform(lambda x: x + "_suffix") + result = r.execute().fetch() + result = pd.Series(result.to_numpy(), name=result.name, index=result.index) + expected = s1.transform(lambda x: x + "_suffix") + pd.testing.assert_series_equal(result, expected) + + +def test_string_method_execution(setup): + s = pd.Series(["s1,s2", "ef,", "dd", np.nan]) + s2 = pd.concat([s, s, s]) + + series = from_pandas_series(s, chunk_size=2) + series2 = from_pandas_series(s2, chunk_size=2) + + # test getitem + r = series.str[:3] + result = r.execute().fetch() + expected = s.str[:3] + pd.testing.assert_series_equal(result, expected) + + # test split, expand=False + r = series.str.split(",", n=2) + result = r.execute().fetch() + expected = s.str.split(",", n=2) + pd.testing.assert_series_equal(result, expected) + + # test split, expand=True + r = series.str.split(",", expand=True, n=1) + result = r.execute().fetch() + expected = s.str.split(",", expand=True, n=1) + pd.testing.assert_frame_equal(result, expected) + + # test rsplit + r = series.str.rsplit(",", expand=True, n=1) + result = r.execute().fetch() + expected = s.str.rsplit(",", expand=True, n=1) + pd.testing.assert_frame_equal(result, expected) + + # test cat all data + r = series2.str.cat(sep="/", na_rep="e") + result = r.execute().fetch() + expected = s2.str.cat(sep="/", na_rep="e") + assert result == expected + + # test cat list + r = series.str.cat(["a", "b", np.nan, "c"]) + result = r.execute().fetch() + expected = s.str.cat(["a", "b", np.nan, "c"]) + pd.testing.assert_series_equal(result, expected) + + # test cat series + r = series.str.cat(series.str.capitalize(), join="outer") + result = r.execute().fetch() + expected = s.str.cat(s.str.capitalize(), join="outer") + pd.testing.assert_series_equal(result, expected) + + # test extractall + r = series.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") + result = r.execute().fetch() + expected = s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") + pd.testing.assert_frame_equal(result, expected) + + # test extract, expand=False + r = series.str.extract(r"[ab](\d)", expand=False) + result = r.execute().fetch() + expected = s.str.extract(r"[ab](\d)", expand=False) + pd.testing.assert_series_equal(result, expected) + + # test extract, expand=True + r = series.str.extract(r"[ab](\d)", expand=True) + result = r.execute().fetch() + expected = s.str.extract(r"[ab](\d)", expand=True) + pd.testing.assert_frame_equal(result, expected) + + +def test_datetime_method_execution(setup): + # test datetime + s = pd.Series([pd.Timestamp("2020-1-1"), pd.Timestamp("2020-2-1"), np.nan]) + series = from_pandas_series(s, chunk_size=2) + + r = series.dt.year + result = r.execute().fetch() + expected = s.dt.year + pd.testing.assert_series_equal(result, expected) + + r = series.dt.strftime("%m-%d-%Y") + result = r.execute().fetch() + expected = s.dt.strftime("%m-%d-%Y") + pd.testing.assert_series_equal(result, expected) + + # test timedelta + s = pd.Series([pd.Timedelta("1 days"), pd.Timedelta("3 days"), np.nan]) + series = from_pandas_series(s, chunk_size=2) + + r = series.dt.days + result = r.execute().fetch() + expected = s.dt.days + pd.testing.assert_series_equal(result, expected) + + +def test_isin_execution(setup): + # one chunk in multiple chunks + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = 
pd.Series([2, 1, 9, 3]) + sa = from_pandas_series(a, chunk_size=10) + sb = from_pandas_series(b, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + # multiple chunk in one chunks + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3]) + sa = from_pandas_series(a, chunk_size=2) + sb = from_pandas_series(b, chunk_size=4) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + # multiple chunk in multiple chunks + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3] * 2) + sa = from_pandas_series(a, chunk_size=2) + sb = from_pandas_series(b, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3] * 3) + sa = from_pandas_series(a, chunk_size=5) + sb = from_pandas_series(b, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = pd.Series([2, 1, 9, 3]) + sa = from_pandas_series(a, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = np.array([2, 1, 9, 3] * 5) + sa = from_pandas_series(a, chunk_size=5) + sb = tensor(b, chunk_size=4) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + b = {2, 1, 9, 3} # set + sa = from_pandas_series(a, chunk_size=2) + + result = sa.isin(sb).execute().fetch() + expected = a.isin(b) + pd.testing.assert_series_equal(result, expected) + + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.randint(1000, size=(10, 3))) + df = from_pandas_df(raw, chunk_size=(5, 2)) + + # set + b = {2, 1, raw[1][0]} + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin(b) + pd.testing.assert_frame_equal(result, expected) + + # mars object + b = tensor([2, 1, raw[1][0]] * 2, chunk_size=2) + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin([2, 1, raw[1][0]]) + pd.testing.assert_frame_equal(result, expected) + + # mars object and trigger iterative tiling + raw = pd.DataFrame(rs.randint(1000, size=(10, 3))) + df = from_pandas_df(raw, chunk_size=(5, 2)) + + b = from_pandas_series(pd.Series([raw[1][0]] + list(range(9))), chunk_size=2) + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin([2, 1, raw[1][0]]) + pd.testing.assert_frame_equal(result, expected) + + # dict + b = {1: tensor([2, 1, raw[1][0]], chunk_size=2), 2: [3, 10]} + r = df.isin(b) + result = r.execute().fetch() + expected = raw.isin({1: [2, 1, raw[1][0]], 2: [3, 10]}) + pd.testing.assert_frame_equal(result, expected) + + +def test_cut_execution(setup): + session = setup + + rs = np.random.RandomState(0) + raw = rs.random(15) * 1000 + s = pd.Series(raw, index=[f"i{i}" for i in range(15)]) + bins = [10, 100, 500] + if _interval_range_closed_arg: + ii = pd.interval_range(10, 500, 3, closed="right") + else: + ii = pd.interval_range(10, 500, 3) + labels = ["a", "b"] + + t = tensor(raw, chunk_size=4) + series = from_pandas_series(s, chunk_size=4) + iii = from_pandas_index(ii, chunk_size=2) + + # cut on Series + r = cut(series, bins) + result = r.execute().fetch() + 
pd.testing.assert_series_equal(result, pd.cut(s, bins)) + + r, b = cut(series, bins, retbins=True) + r_result = r.execute().fetch() + b_result = b.execute().fetch() + r_expected, b_expected = pd.cut(s, bins, retbins=True) + pd.testing.assert_series_equal(r_result, r_expected) + np.testing.assert_array_equal(b_result, b_expected) + + # cut on tensor + r = cut(t, bins) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins) + assert len(result) == len(expected) + for r, e in zip(result, expected): + np.testing.assert_equal(r, e) + + # one chunk + r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True) + result = r.execute().fetch() + pd.testing.assert_series_equal( + result, pd.cut(s, bins, right=False, include_lowest=True) + ) + + # test labels + r = cut(t, bins, labels=labels) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins, labels=labels) + assert len(result) == len(expected) + for r, e in zip(result, expected): + np.testing.assert_equal(r, e) + + r = cut(t, bins, labels=False) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins, labels=False) + np.testing.assert_array_equal(result, expected) + + # test labels which is tensor + labels_t = tensor(["a", "b"], chunk_size=1) + r = cut(raw, bins, labels=labels_t, include_lowest=True) + # result and expected is array whose dtype is CategoricalDtype + result = r.execute().fetch() + expected = pd.cut(raw, bins, labels=labels, include_lowest=True) + assert len(result) == len(expected) + for r, e in zip(result, expected): + np.testing.assert_equal(r, e) + + # test labels=False + r, b = cut(raw, ii, labels=False, retbins=True) + # result and expected is array whose dtype is CategoricalDtype + r_result, b_result = session.fetch(*session.execute(r, b)) + r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True) + for r, e in zip(r_result, r_expected): + np.testing.assert_equal(r, e) + pd.testing.assert_index_equal(b_result, b_expected) + + # test bins which is md.IntervalIndex + r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True) + r_result = r.execute().fetch() + b_result = b.execute().fetch() + r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True) + pd.testing.assert_series_equal(r_result, r_expected) + pd.testing.assert_index_equal(b_result, b_expected) + + # test duplicates + bins2 = [0, 2, 4, 6, 10, 10] + r, b = cut(s, bins2, labels=False, retbins=True, right=False, duplicates="drop") + r_result = r.execute().fetch() + b_result = b.execute().fetch() + r_expected, b_expected = pd.cut( + s, bins2, labels=False, retbins=True, right=False, duplicates="drop" + ) + pd.testing.assert_series_equal(r_result, r_expected) + np.testing.assert_array_equal(b_result, b_expected) + + # test ordered + if pd.__version__ >= "1.1.0": + bins3 = [10, 100, 500] + r = cut(s, bins3, labels=labels, ordered=False) + r_result = r.execute().fetch() + r_expected = pd.cut(s, bins3, labels=labels, ordered=False) + pd.testing.assert_series_equal(r_result, r_expected) + + # test integer bins + r = cut(series, 3) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.cut(s, 3)) + + r, b = cut(series, 3, right=False, retbins=True) + r_result, b_result = session.fetch(*session.execute(r, b)) + r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True) + 
pd.testing.assert_series_equal(r_result, r_expected) + np.testing.assert_array_equal(b_result, b_expected) + + # test min max same + s2 = pd.Series([1.1] * 15) + r = cut(s2, 3) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.cut(s2, 3)) + + # test inf exist + s3 = s2.copy() + s3[-1] = np.inf + with pytest.raises(ValueError): + cut(s3, 3).execute() + + +def test_transpose_execution(setup): + raw = pd.DataFrame( + {"a": ["1", "2", "3"], "b": ["5", "-6", "7"], "c": ["1", "2", "3"]} + ) + + # test 1 chunk + df = from_pandas_df(raw) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + # test multi chunks + df = from_pandas_df(raw, chunk_size=2) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + df = from_pandas_df(raw, chunk_size=2) + result = df.T.execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + # dtypes are varied + raw = pd.DataFrame({"a": [1.1, 2.2, 3.3], "b": [5, -6, 7], "c": [1, 2, 3]}) + + df = from_pandas_df(raw, chunk_size=2) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + raw = pd.DataFrame({"a": [1.1, 2.2, 3.3], "b": ["5", "-6", "7"]}) + + df = from_pandas_df(raw, chunk_size=2) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + # Transposing from results of other operands + raw = pd.DataFrame(np.arange(0, 100).reshape(10, 10)) + df = DataFrame(arange(0, 100, chunk_size=5).reshape(10, 10)) + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + df = DataFrame(rand(100, 100, chunk_size=10)) + raw = df.to_pandas() + result = df.transpose().execute().fetch() + pd.testing.assert_frame_equal(result, raw.transpose()) + + +def test_get_dummies_execution(setup): + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + # test 1 chunk + df = from_pandas_df(raw) + r = get_dummies(df) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + # test multi chunks + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + # test prefix and prefix_sep + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, prefix=["col1", "col2"], prefix_sep="_") + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.get_dummies(raw, prefix=["col1", "col2"], prefix_sep="_"), + ) + + r = get_dummies(df, prefix={"b": "col1", "d": "col2"}, prefix_sep="_") + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.get_dummies(raw, prefix={"b": "col1", "d": "col2"}, prefix_sep="_"), + ) + + # test dummy_na + raw = pd.Series(["a", "b", "c", np.nan]) + df = from_pandas_series(raw) + r = get_dummies(df, dummy_na=False) + pd.testing.assert_frame_equal( + r.execute().fetch(), pd.get_dummies(raw, dummy_na=False) + ) + + # test columns + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, columns=["c"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), pd.get_dummies(raw, columns=["c"]) + ) + + r = get_dummies(df, columns=["c", "d"], prefix=["col1", "col2"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.get_dummies(raw, columns=["c", "d"], prefix=["col1", "col2"]), + ) + + # 
test drop_first + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, drop_first=True) + pd.testing.assert_frame_equal( + r.execute().fetch(), pd.get_dummies(raw, drop_first=True) + ) + + # test dtype + df = from_pandas_df(raw, chunk_size=2) + r = get_dummies(df, dtype=float) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw, dtype=float)) + + # test series + raw = pd.Series([3, 4, 1, 2]) + series = from_pandas_series(raw, chunk_size=2) + r = get_dummies(series) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + # test other variable + raw = [3, 4, 1, 2] + r = get_dummies(raw) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + raw = pd.Series([3, 4, 2, 1]) + r = get_dummies(raw) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + raw = pd.DataFrame( + { + "a": [1.1, 2.1, 3.1], + "b": ["5", "-6", "-7"], + "c": [1, 2, 3], + "d": ["2", "3", "4"], + } + ) + r = get_dummies(raw) + pd.testing.assert_frame_equal(r.execute().fetch(), pd.get_dummies(raw)) + + +def test_to_numeric_execution(setup): + rs = np.random.RandomState(0) + s = pd.Series(rs.randint(5, size=100)) + s[rs.randint(100)] = np.nan + + # test 1 chunk + series = from_pandas_series(s) + + r = to_numeric(series) + pd.testing.assert_series_equal(r.execute().fetch(), pd.to_numeric(s)) + + # test multi chunks + series = from_pandas_series(s, chunk_size=20) + + r = to_numeric(series) + pd.testing.assert_series_equal(r.execute().fetch(), pd.to_numeric(s)) + + # test object dtype + s = pd.Series(["1.0", 2, -3, "2.0"]) + series = from_pandas_series(s) + + r = to_numeric(series) + pd.testing.assert_series_equal(r.execute().fetch(), pd.to_numeric(s)) + + # test errors and downcast + s = pd.Series(["appple", 2, -3, "2.0"]) + series = from_pandas_series(s) + + r = to_numeric(series, errors="ignore", downcast="signed") + pd.testing.assert_series_equal( + r.execute().fetch(), pd.to_numeric(s, errors="ignore", downcast="signed") + ) + + # test list data + l = ["1.0", 2, -3, "2.0"] + + r = to_numeric(l) + np.testing.assert_array_equal(r.execute().fetch(), pd.to_numeric(l)) + + +def test_q_cut_execution(setup): + rs = np.random.RandomState(0) + raw = rs.random(15) * 1000 + s = pd.Series(raw, index=[f"i{i}" for i in range(15)]) + + series = from_pandas_series(s) + r = qcut(series, 3) + result = r.execute().fetch() + expected = pd.qcut(s, 3) + pd.testing.assert_series_equal(result, expected) + + r = qcut(s, 3) + result = r.execute().fetch() + expected = pd.qcut(s, 3) + pd.testing.assert_series_equal(result, expected) + + series = from_pandas_series(s) + r = qcut(series, [0.3, 0.5, 0.7]) + result = r.execute().fetch() + expected = pd.qcut(s, [0.3, 0.5, 0.7]) + pd.testing.assert_series_equal(result, expected) + + r = qcut(range(5), 3) + result = r.execute().fetch() + expected = pd.qcut(range(5), 3) + assert isinstance(result, type(expected)) + pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + + r = qcut(range(5), [0.2, 0.5]) + result = r.execute().fetch() + expected = pd.qcut(range(5), [0.2, 0.5]) + assert isinstance(result, type(expected)) + pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + + r = qcut(range(5), tensor([0.2, 0.5])) + result = r.execute().fetch() + expected = pd.qcut(range(5), [0.2, 0.5]) + assert isinstance(result, type(expected)) + pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + + +def test_shift_execution(setup): + fill_value_default = no_default + if not 
_enable_no_default or _with_column_freq_bug: + fill_value_default = None + + # test dataframe + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), columns=["col" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=5) + + for periods in (2, -2, 6, -6): + for axis in (0, 1): + for fill_value in (fill_value_default, 0, 1.0): + r = df.shift(periods=periods, axis=axis, fill_value=fill_value) + + try: + result = r.execute().fetch() + expected = raw.shift( + periods=periods, axis=axis, fill_value=fill_value + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, axis: {axis}, fill_value: {fill_value}" + ) from e + + raw2 = raw.copy() + raw2.index = pd.date_range("2020-1-1", periods=10) + raw2.columns = pd.date_range("2020-3-1", periods=8) + + df2 = from_pandas_df(raw2, chunk_size=5) + + # test freq not None + for periods in (2, -2): + for axis in (0, 1): + for fill_value in (fill_value_default, 0, 1.0): + r = df2.shift( + periods=periods, freq="D", axis=axis, fill_value=fill_value + ) + + try: + result = r.execute().fetch() + expected = raw2.shift( + periods=periods, freq="D", axis=axis, fill_value=fill_value + ) + pd.testing.assert_frame_equal(result, expected) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, axis: {axis}, fill_value: {fill_value}" + ) from e + + # test tshift + r = df2.tshift(periods=1) + result = r.execute().fetch() + expected = raw2.tshift(periods=1) + pd.testing.assert_frame_equal(result, expected) + + with pytest.raises(ValueError): + _ = df.tshift(periods=1) + + # test series + s = raw.iloc[:, 0] + + series = from_pandas_series(s, chunk_size=5) + for periods in (0, 2, -2, 6, -6): + for fill_value in (fill_value_default, 0, 1.0): + r = series.shift(periods=periods, fill_value=fill_value) + + try: + result = r.execute().fetch() + expected = s.shift(periods=periods, fill_value=fill_value) + pd.testing.assert_series_equal(result, expected) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, fill_value: {fill_value}" + ) from e + + s2 = raw2.iloc[:, 0] + + # test freq not None + series2 = from_pandas_series(s2, chunk_size=5) + for periods in (2, -2): + for fill_value in (fill_value_default, 0, 1.0): + r = series2.shift(periods=periods, freq="D", fill_value=fill_value) + + try: + result = r.execute().fetch() + expected = s2.shift(periods=periods, freq="D", fill_value=fill_value) + pd.testing.assert_series_equal(result, expected) + except AssertionError as e: # pragma: no cover + raise AssertionError( + f"Failed when periods: {periods}, fill_value: {fill_value}" + ) from e + + +def test_diff_execution(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), columns=["col" + str(i + 1) for i in range(8)] + ) + + raw1 = raw.copy() + raw1["col4"] = raw1["col4"] < 400 + + r = from_pandas_df(raw1, chunk_size=(10, 5)).diff(-1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw1.diff(-1)) + + r = from_pandas_df(raw1, chunk_size=5).diff(-1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw1.diff(-1)) + + r = from_pandas_df(raw, chunk_size=(5, 8)).diff(1, axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.diff(1, axis=1)) + + r = from_pandas_df(raw, chunk_size=5).diff(1, axis=1) + pd.testing.assert_frame_equal( + 
r.execute().fetch(), raw.diff(1, axis=1), check_dtype=False + ) + + # test series + s = raw.iloc[:, 0] + s1 = s.copy() < 400 + + r = from_pandas_series(s, chunk_size=10).diff(-1) + pd.testing.assert_series_equal(r.execute().fetch(), s.diff(-1)) + + r = from_pandas_series(s, chunk_size=5).diff(-1) + pd.testing.assert_series_equal(r.execute().fetch(), s.diff(-1)) + + r = from_pandas_series(s1, chunk_size=5).diff(1) + pd.testing.assert_series_equal(r.execute().fetch(), s1.diff(1)) + + +def test_value_counts_execution(setup): + rs = np.random.RandomState(0) + s = pd.Series(rs.randint(5, size=100), name="s") + s[rs.randint(100)] = np.nan + + # test 1 chunk + series = from_pandas_series(s, chunk_size=100) + + r = series.value_counts() + pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts()) + + r = series.value_counts(bins=5, normalize=True) + pd.testing.assert_series_equal( + r.execute().fetch(), s.value_counts(bins=5, normalize=True) + ) + + # test multi chunks + series = from_pandas_series(s, chunk_size=30) + + r = series.value_counts(method="tree") + pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts()) + + r = series.value_counts(method="tree", normalize=True) + pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts(normalize=True)) + + # test bins and normalize + r = series.value_counts(method="tree", bins=5, normalize=True) + pd.testing.assert_series_equal( + r.execute().fetch(), s.value_counts(bins=5, normalize=True) + ) + + +def test_astype(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + # single chunk + df = from_pandas_df(raw) + r = df.astype("int32") + + result = r.execute().fetch() + expected = raw.astype("int32") + pd.testing.assert_frame_equal(expected, result) + + # multiply chunks + df = from_pandas_df(raw, chunk_size=6) + r = df.astype("int32") + + result = r.execute().fetch() + expected = raw.astype("int32") + pd.testing.assert_frame_equal(expected, result) + + # dict type + df = from_pandas_df(raw, chunk_size=5) + r = df.astype({"c1": "int32", "c2": "float", "c8": "str"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "int32", "c2": "float", "c8": "str"}) + pd.testing.assert_frame_equal(expected, result) + + # test arrow_string dtype + df = from_pandas_df(raw, chunk_size=8) + r = df.astype({"c1": "arrow_string"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "arrow_string"}) + pd.testing.assert_frame_equal(expected, result) + + # test series + s = pd.Series(rs.randint(5, size=20)) + series = from_pandas_series(s) + r = series.astype("int32") + + result = r.execute().fetch() + expected = s.astype("int32") + pd.testing.assert_series_equal(result, expected) + + series = from_pandas_series(s, chunk_size=6) + r = series.astype("arrow_string") + + result = r.execute().fetch() + expected = s.astype("arrow_string") + pd.testing.assert_series_equal(result, expected) + + # test index + raw = pd.Index(rs.randint(5, size=20)) + mix = from_pandas_index(raw) + r = mix.astype("int32") + + result = r.execute().fetch() + expected = raw.astype("int32") + pd.testing.assert_index_equal(result, expected) + + # multiply chunks + series = from_pandas_series(s, chunk_size=6) + r = series.astype("str") + + result = r.execute().fetch() + expected = s.astype("str") + pd.testing.assert_series_equal(result, expected) + + # test category + raw = pd.DataFrame( + rs.randint(3, size=(20, 8)), columns=["c" + str(i + 1) for i in 
range(8)] + ) + + df = from_pandas_df(raw) + r = df.astype("category") + + result = r.execute().fetch() + expected = raw.astype("category") + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw) + r = df.astype({"c1": "category", "c8": "int32", "c4": "str"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "category", "c8": "int32", "c4": "str"}) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=5) + r = df.astype("category") + + result = r.execute().fetch() + expected = raw.astype("category") + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=3) + r = df.astype({"c1": "category", "c8": "int32", "c4": "str"}) + + result = r.execute().fetch() + expected = raw.astype({"c1": "category", "c8": "int32", "c4": "str"}) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=6) + r = df.astype( + { + "c1": "category", + "c5": "float", + "c2": "int32", + "c7": pd.CategoricalDtype([1, 3, 4, 2]), + "c4": pd.CategoricalDtype([1, 3, 2]), + } + ) + result = r.execute().fetch() + expected = raw.astype( + { + "c1": "category", + "c5": "float", + "c2": "int32", + "c7": pd.CategoricalDtype([1, 3, 4, 2]), + "c4": pd.CategoricalDtype([1, 3, 2]), + } + ) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=8) + r = df.astype({"c2": "category"}) + result = r.execute().fetch() + expected = raw.astype({"c2": "category"}) + pd.testing.assert_frame_equal(expected, result) + + # test series category + raw = pd.Series(np.random.choice(["a", "b", "c"], size=(10,))) + series = from_pandas_series(raw, chunk_size=4) + result = series.astype("category").execute().fetch() + expected = raw.astype("category") + pd.testing.assert_series_equal(expected, result) + + series = from_pandas_series(raw, chunk_size=3) + result = ( + series.astype(pd.CategoricalDtype(["a", "c", "b"]), copy=False) + .execute() + .fetch() + ) + expected = raw.astype(pd.CategoricalDtype(["a", "c", "b"]), copy=False) + pd.testing.assert_series_equal(expected, result) + + series = from_pandas_series(raw, chunk_size=6) + result = series.astype(pd.CategoricalDtype(["a", "c", "b", "d"])).execute().fetch() + expected = raw.astype(pd.CategoricalDtype(["a", "c", "b", "d"])) + pd.testing.assert_series_equal(expected, result) + + +def test_drop(setup): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=3) + + columns = ["c2", "c4", "c5", "c6"] + index = [3, 6, 7] + r = df.drop(columns=columns, index=index) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.drop(columns=columns, index=index) + ) + + idx_series = from_pandas_series(pd.Series(index)) + r = df.drop(idx_series) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.drop(pd.Series(index))) + + df.drop(columns, axis=1, inplace=True) + pd.testing.assert_frame_equal(df.execute().fetch(), raw.drop(columns, axis=1)) + + del df["c3"] + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.drop(columns + ["c3"], axis=1) + ) + + ps = df.pop("c8") + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.drop(columns + ["c3", "c8"], axis=1) + ) + pd.testing.assert_series_equal(ps.execute().fetch(), raw["c8"]) + + # test series drop + raw = pd.Series(rs.randint(1000, size=(20,))) + + series = from_pandas_series(raw, chunk_size=3) + + r = series.drop(index=index) 
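+ # drop() builds the result lazily; execute().fetch() below materializes it for comparison with pandas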
+ pd.testing.assert_series_equal(r.execute().fetch(), raw.drop(index=index)) + + # test index drop + ser = pd.Series(range(20)) + rs.shuffle(ser) + raw = pd.Index(ser) + + idx = from_pandas_index(raw) + + r = idx.drop(index) + pd.testing.assert_index_equal(r.execute().fetch(), raw.drop(index)) + + +def test_melt(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 8)), columns=["c" + str(i + 1) for i in range(8)] + ) + + df = from_pandas_df(raw, chunk_size=3) + + r = df.melt(id_vars=["c1"], value_vars=["c2", "c4"]) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "variable"]).reset_index(drop=True), + raw.melt(id_vars=["c1"], value_vars=["c2", "c4"]) + .sort_values(["c1", "variable"]) + .reset_index(drop=True), + ) + + +def test_drop_duplicates(setup): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 5)), + columns=["c" + str(i + 1) for i in range(5)], + index=["i" + str(j) for j in range(20)], + ) + duplicate_lines = rs.randint(1000, size=5) + for i in [1, 3, 10, 11, 15]: + raw.iloc[i] = duplicate_lines + + with option_context({"combine_size": 2}): + # test dataframe + for chunk_size in [(8, 3), (20, 5)]: + df = from_pandas_df(raw, chunk_size=chunk_size) + if chunk_size[0] < len(raw): + methods = ["tree", "subset_tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for subset in [None, "c1", ["c1", "c2"]]: + for keep in ["first", "last", False]: + for ignore_index in [True, False]: + try: + r = df.drop_duplicates( + method=method, + subset=subset, + keep=keep, + ignore_index=ignore_index, + ) + result = r.execute().fetch() + try: + expected = raw.drop_duplicates( + subset=subset, + keep=keep, + ignore_index=ignore_index, + ) + except TypeError: + # ignore_index is supported in pandas 1.0 + expected = raw.drop_duplicates( + subset=subset, keep=keep + ) + if ignore_index: + expected.reset_index(drop=True, inplace=True) + + pd.testing.assert_frame_equal(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, subset={subset}, " + f"keep={keep}, ignore_index={ignore_index}" + ) from e + + # test series and index + s = raw["c3"] + ind = pd.Index(s) + + for tp, obj in [("series", s), ("index", ind)]: + for chunk_size in [8, 20]: + to_m = from_pandas_series if tp == "series" else from_pandas_index + mobj = to_m(obj, chunk_size=chunk_size) + if chunk_size < len(obj): + methods = ["tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for keep in ["first", "last", False]: + try: + r = mobj.drop_duplicates(method=method, keep=keep) + result = r.execute().fetch() + expected = obj.drop_duplicates(keep=keep) + + cmp = ( + pd.testing.assert_series_equal + if tp == "series" + else pd.testing.assert_index_equal + ) + cmp(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, keep={keep}" + ) from e + + # test inplace + series = from_pandas_series(s, chunk_size=11) + series.drop_duplicates(inplace=True) + result = series.execute().fetch() + expected = s.drop_duplicates() + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["tree", "shuffle"]) +def test_series_drop_duplicates(setup, method): + raw = pd.Series(np.random.randint(5, size=50)) + s = Series(raw, chunk_size=10) + r = s.drop_duplicates(method=method).execute() + result = r.execute().fetch() + 
pd.testing.assert_series_equal(result, raw.drop_duplicates()) + + +def test_duplicated(setup): + # test dataframe drop + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(20, 5)), + columns=["c" + str(i + 1) for i in range(5)], + index=["i" + str(j) for j in range(20)], + ) + duplicate_lines = rs.randint(1000, size=5) + for i in [1, 3, 10, 11, 15]: + raw.iloc[i] = duplicate_lines + + with option_context({"combine_size": 2}): + # test dataframe + for chunk_size in [(8, 3), (20, 5)]: + df = from_pandas_df(raw, chunk_size=chunk_size) + if chunk_size[0] < len(raw): + methods = ["tree", "subset_tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for subset in [None, "c1", ["c1", "c2"]]: + for keep in ["first", "last", False]: + try: + r = df.duplicated(method=method, subset=subset, keep=keep) + result = r.execute().fetch() + expected = raw.duplicated(subset=subset, keep=keep) + pd.testing.assert_series_equal(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, subset={subset}, " + f"keep={keep}" + ) from e + + # test series + s = raw["c3"] + + for tp, obj in [("series", s)]: + for chunk_size in [8, 20]: + to_m = from_pandas_series if tp == "series" else from_pandas_index + mobj = to_m(obj, chunk_size=chunk_size) + if chunk_size < len(obj): + methods = ["tree", "shuffle"] + else: + # 1 chunk + methods = [None] + for method in methods: + for keep in ["first", "last", False]: + try: + r = mobj.duplicated(method=method, keep=keep) + result = r.execute().fetch() + expected = obj.duplicated(keep=keep) + + cmp = ( + pd.testing.assert_series_equal + if tp == "series" + else pd.testing.assert_index_equal + ) + cmp(result, expected) + except Exception as e: # pragma: no cover + raise AssertionError( + f"failed when method={method}, keep={keep}" + ) from e + + +@pytest.mark.parametrize("method", ["tree", "shuffle"]) +def test_series_duplicated(setup, method): + raw = pd.Series(np.random.randint(5, size=50)) + s = Series(raw, chunk_size=10) + r = s.duplicated(method=method).execute() + result = r.execute().fetch() + pd.testing.assert_series_equal(result, raw.duplicated()) + + +def test_memory_usage_execution(setup): + dtypes = ["int64", "float64", "complex128", "object", "bool"] + data = dict([(t, np.ones(shape=500).astype(t)) for t in dtypes]) + raw = pd.DataFrame(data) + + df = from_pandas_df(raw, chunk_size=(500, 2)) + r = df.memory_usage(index=False) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=False)) + + df = from_pandas_df(raw, chunk_size=(500, 2)) + r = df.memory_usage(index=True) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=True)) + + df = from_pandas_df(raw, chunk_size=(100, 3)) + r = df.memory_usage(index=False) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=False)) + + r = df.memory_usage(index=True) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=True)) + + raw = pd.DataFrame(data, index=np.arange(500).astype("object")) + + df = from_pandas_df(raw, chunk_size=(100, 3)) + r = df.memory_usage(index=True) + pd.testing.assert_series_equal(r.execute().fetch(), raw.memory_usage(index=True)) + + raw = pd.Series(np.ones(shape=500).astype("object"), name="s") + + series = from_pandas_series(raw) + r = series.memory_usage(index=True) + assert r.execute().fetch() == raw.memory_usage(index=True) + + series = from_pandas_series(raw, chunk_size=100) + r = 
series.memory_usage(index=False) + assert r.execute().fetch() == raw.memory_usage(index=False) + + series = from_pandas_series(raw, chunk_size=100) + r = series.memory_usage(index=True) + assert r.execute().fetch() == raw.memory_usage(index=True) + + raw = pd.Series( + np.ones(shape=500).astype("object"), + index=np.arange(500).astype("object"), + name="s", + ) + + series = from_pandas_series(raw, chunk_size=100) + r = series.memory_usage(index=True) + assert r.execute().fetch() == raw.memory_usage(index=True) + + raw = pd.Index(np.arange(500), name="s") + + index = from_pandas_index(raw) + r = index.memory_usage() + assert r.execute().fetch() == raw.memory_usage() + + index = from_pandas_index(raw, chunk_size=100) + r = index.memory_usage() + assert r.execute().fetch() == raw.memory_usage() + + +def test_select_dtypes_execution(setup): + raw = pd.DataFrame({"a": np.random.rand(10), "b": np.random.randint(10, size=10)}) + + df = from_pandas_df(raw, chunk_size=5) + r = df.select_dtypes(include=["float64"]) + + result = r.execute().fetch() + expected = raw.select_dtypes(include=["float64"]) + pd.testing.assert_frame_equal(result, expected) + + +def test_map_chunk_execution(setup): + raw = pd.DataFrame(np.random.rand(10, 5), columns=[f"col{i}" for i in range(5)]) + + df = from_pandas_df(raw, chunk_size=(5, 3)) + + def f1(pdf): + return pdf + 1 + + r = df.map_chunk(f1) + + result = r.execute().fetch() + expected = raw + 1 + pd.testing.assert_frame_equal(result, expected) + + raw_s = raw["col1"] + series = from_pandas_series(raw_s, chunk_size=5) + + r = series.map_chunk(f1) + + result = r.execute().fetch() + expected = raw_s + 1 + pd.testing.assert_series_equal(result, expected) + + def f2(pdf): + return pdf.sum(axis=1) + + df = from_pandas_df(raw, chunk_size=5) + r = df.map_chunk(f2, output_type="series") + + result = r.execute().fetch() + expected = raw.sum(axis=1) + pd.testing.assert_series_equal(result, expected) + + raw = pd.DataFrame({"a": [f"s{i}" for i in range(10)], "b": np.arange(10)}) + + df = from_pandas_df(raw, chunk_size=5) + + def f3(pdf): + return pdf["a"].str.slice(1).astype(int) + pdf["b"] + + with pytest.raises(TypeError): + r = df.map_chunk(f3) + _ = r.execute().fetch() + + r = df.map_chunk(f3, output_type="series", dtypes=pd.Series([np.int64])) + result = r.execute(extra_config={"check_dtypes": False}).fetch() + expected = f3(raw) + pd.testing.assert_series_equal(result, expected) + + def f4(pdf): + ret = pd.DataFrame(columns=["a", "b"]) + ret["a"] = pdf["a"].str.slice(1).astype(int) + ret["b"] = pdf["b"] + return ret + + with pytest.raises(TypeError): + r = df.map_chunk(f4, output_type="dataframe") + _ = r.execute().fetch() + + r = df.map_chunk( + f4, + output_type="dataframe", + dtypes=pd.Series([np.dtype(int), raw["b"].dtype], index=["a", "b"]), + ) + result = r.execute().fetch() + expected = f4(raw) + pd.testing.assert_frame_equal(result, expected) + + raw2 = pd.DataFrame({"a": [np.array([1, 2, 3]), np.array([4, 5, 6])]}) + df2 = from_pandas_df(raw2) + dtypes = pd.Series([np.dtype(float)] * 3) + r = df2.map_chunk( + lambda x: x["a"].apply(pd.Series), output_type="dataframe", dtypes=dtypes + ) + assert r.shape == (np.nan, 3) + pd.testing.assert_series_equal(r.dtypes, dtypes) + result = r.execute().fetch() + expected = raw2.apply(lambda x: x["a"], axis=1, result_type="expand") + pd.testing.assert_frame_equal(result, expected) + + raw = pd.DataFrame(np.random.rand(10, 5), columns=[f"col{i}" for i in range(5)]) + + df = from_pandas_df(raw, chunk_size=(5, 3)) + + def f5(pdf, 
chunk_index): + return pdf + 1 + chunk_index[0] + + r = df.map_chunk(f5, with_chunk_index=True) + + result = r.execute().fetch() + expected = (raw + 1).add(np.arange(10) // 5, axis=0) + pd.testing.assert_frame_equal(result, expected) + + raw_s = raw["col1"] + series = from_pandas_series(raw_s, chunk_size=5) + + r = series.map_chunk(f5, with_chunk_index=True) + + result = r.execute().fetch() + expected = raw_s + 1 + np.arange(10) // 5 + pd.testing.assert_series_equal(result, expected) + + # test args or kwargs with mars objects + df = from_pandas_df(raw, chunk_size=5) + + def f6(df, mars_df): + return df + mars_df.sum() + + df_arg = from_pandas_df(raw, chunk_size=6) + r = df.map_chunk(f6, args=(df_arg,), output_type="dataframe", dtypes=df.dtypes) + expected = raw + raw.sum() + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + df = from_pandas_df(raw, chunk_size=5) + df_arg = from_pandas_df(raw, chunk_size=6) + r = df.map_chunk( + f6, kwargs=dict(mars_df=df_arg), output_type="dataframe", dtypes=df.dtypes + ) + expected = raw + raw.sum() + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + def f7(s): + return s.to_json() + + with pytest.raises(TypeError): + series.map_chunk(f7) + + +def test_map_chunk_with_df_or_series_output(setup): + raw = pd.DataFrame(np.random.rand(10, 5), columns=[f"col{i}" for i in range(5)]) + + df = from_pandas_df(raw, chunk_size=(5, 3)) + + def f1(pdf): + return pdf.iloc[2, :2] + + with pytest.raises(TypeError): + df.map_chunk(f1) + + for kwargs in [dict(output_type="df_or_series"), dict(skip_infer=True)]: + res = df.map_chunk(f1, **kwargs) + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "series" + assert res.dtype == np.dtype("float") + assert not ("dtypes" in res.data_params) + assert res.shape == (4,) + pd.testing.assert_series_equal( + res.fetch(), pd.concat([raw.iloc[2, :2], raw.iloc[7, :2]]) + ) + + def f2(pdf): + return pdf.iloc[[0, 2], :2] + + with pytest.raises(TypeError): + df.map_chunk(f2) + + res = df.map_chunk(f2, output_type="df_or_series") + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "dataframe" + pd.testing.assert_series_equal(res.dtypes, raw.dtypes[:2]) + assert not ("dtype" in res.data_params) + assert res.shape == (4, 2) + pd.testing.assert_frame_equal( + res.fetch(), + raw.iloc[[0, 2, 5, 7], :2], + ) + + +def test_map_chunk_closure_execute(setup): + raw = pd.DataFrame( + np.random.randint(10**3, size=(10, 5)), columns=[f"col{i}" for i in range(5)] + ) + + df = from_pandas_df(raw, chunk_size=5) + num = 1 + dic = {i: -i for i in range(10**3)} + + def f1(pdf): + return pdf + num + + r = df.map_chunk(f1) + + result = r.execute().fetch() + expected = raw + num + pd.testing.assert_frame_equal(result, expected) + + def f2(pdf): + ret = pd.DataFrame(columns=["col1", "col2"]) + ret["col1"] = pdf["col1"].apply(lambda x: dic.get(x, 0)) + ret["col2"] = pdf["col2"] + return ret + + r = df.map_chunk(f2, output_type="dataframe") + + result = r.execute().fetch() + expected = f2(raw) + pd.testing.assert_frame_equal(result, expected) + + class callable_df: + def __init__(self, multiplier: int = 1): + self.dic = {i: -i for i in range(10**multiplier)} + + def __call__(self, pdf): + ret = pd.DataFrame(columns=["col1", "col2"]) + ret["col1"] = pdf["col1"].apply(lambda x: self.dic.get(x, 0)) + ret["col2"] = pdf["col2"] + return ret + + cdf = callable_df(multiplier=4) + r = df.map_chunk(cdf, 
output_type="dataframe") + + result = r.execute().fetch() + expected = cdf(raw) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.ray_dag +def test_cartesian_chunk_execution(setup): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame({"a": rs.randint(3, size=10), "b": rs.rand(10)}) + raw2 = pd.DataFrame( + {"c": rs.randint(3, size=10), "d": rs.rand(10), "e": rs.rand(10)} + ) + df1 = from_pandas_df(raw1, chunk_size=(5, 1)) + df2 = from_pandas_df(raw2, chunk_size=(5, 1)) + + def f(c1, c2): + c1, c2 = c1.copy(), c2.copy() + c1["x"] = 1 + c2["x"] = 1 + r = c1.merge(c2, on="x") + r = r[(r["b"] > r["d"]) & (r["b"] < r["e"])] + return r[["a", "c"]] + + rr = df1.cartesian_chunk(df2, f) + + result = rr.execute().fetch() + expected = f(raw1, raw2) + pd.testing.assert_frame_equal( + result.sort_values(by=["a", "c"]).reset_index(drop=True), + expected.sort_values(by=["a", "c"]).reset_index(drop=True), + ) + + def f2(c1, c2): + r = f(c1, c2) + return r["a"] + r["c"] + + rr = df1.cartesian_chunk(df2, f2) + + result = rr.execute().fetch() + expected = f2(raw1, raw2) + pd.testing.assert_series_equal( + result.sort_values().reset_index(drop=True), + expected.sort_values().reset_index(drop=True), + ) + + # size_res = setup.executor.execute_dataframe(rr, mock=True)[0][0] + # assert size_res > 0 + + def f3(c1, c2): + cr = pd.DataFrame() + cr["a"] = c1.str.slice(1).astype(np.int64) + cr["x"] = 1 + cr2 = pd.DataFrame() + cr2["b"] = c2.str.slice(1).astype(np.int64) + cr2["x"] = 1 + return cr.merge(cr2, on="x")[["a", "b"]] + + s_raw = pd.Series([f"s{i}" for i in range(10)]) + series = from_pandas_series(s_raw, chunk_size=5) + + rr = series.cartesian_chunk( + series, + f3, + output_type="dataframe", + dtypes=pd.Series([np.dtype(np.int64)] * 2, index=["a", "b"]), + ) + + result = rr.execute().fetch() + expected = f3(s_raw, s_raw) + pd.testing.assert_frame_equal( + result.sort_values(by=["a", "b"]).reset_index(drop=True), + expected.sort_values(by=["a", "b"]).reset_index(drop=True), + ) + + with pytest.raises(TypeError): + _ = series.cartesian_chunk(series, f3) + + def f4(c1, c2): + r = f3(c1, c2) + return r["a"] + r["b"] + + rr = series.cartesian_chunk( + series, f4, output_type="series", dtypes=np.dtype(np.int64) + ) + + result = rr.execute().fetch() + expected = f4(s_raw, s_raw) + pd.testing.assert_series_equal( + result.sort_values().reset_index(drop=True), + expected.sort_values().reset_index(drop=True), + ) + + +def test_cartesian_chunk_with_df_or_series(setup): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame({"a": range(10), "b": rs.rand(10)}) + raw2 = pd.DataFrame( + {"c": rs.randint(3, size=10), "d": rs.rand(10), "e": rs.rand(10)} + ) + df1 = from_pandas_df(raw1, chunk_size=(5, 1)) + df2 = from_pandas_df(raw2, chunk_size=(5, 1)) + + def f1(c1, c2): + return c1.iloc[[2, 4], :] + + with pytest.raises(TypeError): + df1.cartesian_chunk(df2, f1) + + for kwargs in [dict(output_type="df_or_series"), dict(skip_infer=True)]: + res = df1.cartesian_chunk(df2, f1, **kwargs) + + assert isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "dataframe" + assert not ("dtype" in res.data_params) + assert res.shape == (8, 2) + pd.testing.assert_series_equal(res.dtypes, raw1.dtypes) + pd.testing.assert_frame_equal( + res.fetch(), raw1.iloc[[2, 4] * 2 + [7, 9] * 2, :] + ) + + def f2(c1, c2): + return c1.iloc[2, :] + + with pytest.raises(TypeError): + df1.cartesian_chunk(df2, f2) + + res = df1.cartesian_chunk(df2, f2, output_type="df_or_series") + + assert 
isinstance(res, DATAFRAME_OR_SERIES_TYPE) + res = res.execute() + assert res.data_type == "series" + assert not ("dtypes" in res.data_params) + assert res.shape == (8,) + pd.testing.assert_series_equal( + res.fetch(), + pd.concat([raw1.iloc[2, :], raw1.iloc[2, :], raw1.iloc[7, :], raw1.iloc[7, :]]), + ) + + +def test_rebalance_execution(setup): + raw = pd.DataFrame(np.random.rand(10, 3), columns=list("abc")) + df = from_pandas_df(raw) + + def _expect_count(n): + def _tile_rebalance(op): + tileable = yield from op.tile(op) + assert len(tileable.chunks) == n + return tileable + + return _tile_rebalance + + r = df.rebalance(num_partitions=3) + extra_config = {"operand_tile_handlers": {DataFrameRebalance: _expect_count(3)}} + result = r.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(result, raw) + + r = df.rebalance(factor=0.5) + extra_config = {"operand_tile_handlers": {DataFrameRebalance: _expect_count(1)}} + result = r.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(result, raw) + + # test worker has two cores + r = df.rebalance() + extra_config = {"operand_tile_handlers": {DataFrameRebalance: _expect_count(2)}} + result = r.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(result, raw) + + +def test_stack_execution(setup): + raw = pd.DataFrame( + np.random.rand(10, 3), columns=list("abc"), index=[f"s{i}" for i in range(10)] + ) + for loc in [(5, 1), (8, 2), (1, 0)]: + raw.iloc[loc] = np.nan + df = from_pandas_df(raw, chunk_size=(5, 2)) + + for dropna in (True, False): + r = df.stack(dropna=dropna) + result = r.execute().fetch() + expected = raw.stack(dropna=dropna) + pd.testing.assert_series_equal(result, expected) + + cols = pd.MultiIndex.from_tuples([("c1", "cc1"), ("c1", "cc2"), ("c2", "cc3")]) + raw2 = raw.copy() + raw2.columns = cols + df = from_pandas_df(raw2, chunk_size=(5, 2)) + + for level in [-1, 0, [0, 1]]: + for dropna in (True, False): + r = df.stack(level=level, dropna=dropna) + result = r.execute().fetch() + expected = raw2.stack(level=level, dropna=dropna) + assert_method = ( + pd.testing.assert_series_equal + if expected.ndim == 1 + else pd.testing.assert_frame_equal + ) + assert_method(result, expected) + + +@pytest.mark.parametrize( + "ignore_index", [False, True] if _explode_with_ignore_index else [False] +) +def test_explode_execution(setup, ignore_index): + explode_kw = {"ignore_index": True} if ignore_index else {} + + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": [np.random.rand(random.randint(1, 10)) for _ in range(10)], + "c": np.random.rand(10), + "d": np.random.rand(10), + } + ) + df = from_pandas_df(raw, chunk_size=(4, 2)) + r = df.explode("b", ignore_index=ignore_index) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.explode("b", **explode_kw)) + + series = from_pandas_series(raw.b, chunk_size=4) + r = series.explode(ignore_index=ignore_index) + pd.testing.assert_series_equal(r.execute().fetch(), raw.b.explode(**explode_kw)) + + +def test_eval_query_execution(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": rs.rand(100), "b": rs.rand(100), "c c": rs.rand(100)}) + df = from_pandas_df(raw, chunk_size=(10, 2)) + + r = mars_eval('c = df.a * 2 + df["c c"]', target=df) + pd.testing.assert_frame_equal( + r.execute().fetch(), + pd.eval('c = raw.a * 2 + raw["c c"]', engine="python", target=raw), + ) + + r = df.eval("a + b") + pd.testing.assert_series_equal(r.execute().fetch(), raw.eval("a + b")) + + _val = 5.0 # noqa: F841 + _val_array = [1, 2, 3] # 
noqa: F841 + expr = """ + e = -a + b + 1 + f = b + `c c` + @_val + @_val_array[-1] + """ + r = df.eval(expr) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.eval(expr)) + + copied_df = df.copy() + copied_df.eval("c = a + b", inplace=True) + pd.testing.assert_frame_equal(copied_df.execute().fetch(), raw.eval("c = a + b")) + + expr = "a > b | a < `c c`" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "a > b & ~(a < `c c`)" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "a < b < `c c`" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "a < 0.5 and a != 0.1 and b != 0.2" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + expr = "(a < 0.5 or a > 0.7) and (b != 0.1 or `c c` > 0.2)" + r = df.query(expr) + pd.testing.assert_frame_equal( + r.execute(extra_config={"check_index_value": False}).fetch(), raw.query(expr) + ) + + copied_df = df.copy() + copied_df.query("a < b", inplace=True) + pd.testing.assert_frame_equal( + copied_df.execute(extra_config={"check_index_value": False}).fetch(), + raw.query("a < b"), + ) + + +def test_check_monotonic_execution(setup): + idx_value = pd.Index(list(range(1000))) + + idx_increase = from_pandas_index(idx_value, chunk_size=100) + assert idx_increase.is_monotonic_increasing.execute().fetch() is True + assert idx_increase.is_monotonic_decreasing.execute().fetch() is False + + idx_decrease = from_pandas_index(idx_value[::-1], chunk_size=100) + assert idx_decrease.is_monotonic_increasing.execute().fetch() is False + assert idx_decrease.is_monotonic_decreasing.execute().fetch() is True + + idx_mixed = from_pandas_index( + pd.Index(list(range(500)) + list(range(500))), chunk_size=100 + ) + assert idx_mixed.is_monotonic_increasing.execute().fetch() is False + assert idx_mixed.is_monotonic_decreasing.execute().fetch() is False + + ser_mixed = from_pandas_series( + pd.Series(list(range(500)) + list(range(499, 999))), chunk_size=100 + ) + assert ser_mixed.is_monotonic_increasing.execute().fetch() is True + assert ser_mixed.is_monotonic_decreasing.execute().fetch() is False + + +def test_pct_change_execution(setup): + # test dataframe + rs = np.random.RandomState(0) + raw = pd.DataFrame( + rs.randint(1000, size=(10, 8)), + columns=["col" + str(i + 1) for i in range(8)], + index=pd.date_range("2021-1-1", periods=10), + ) + + df = from_pandas_df(raw, chunk_size=5) + r = df.pct_change() + result = r.execute().fetch() + expected = raw.pct_change() + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=5) + r = df.pct_change(fill_method=None) + result = r.execute().fetch() + expected = raw.pct_change(fill_method=None) + pd.testing.assert_frame_equal(expected, result) + + df = from_pandas_df(raw, chunk_size=5) + r = df.pct_change(freq="D") + result = r.execute().fetch() + expected = raw.pct_change(freq="D") + pd.testing.assert_frame_equal(expected, result) + + +def test_bloom_filter(setup): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame( + {"col1": rs.randint(0, 100, size=(100,)), "col2": rs.random(100)} + ) + raw2 = pd.DataFrame( + {"col1": rs.randint(0, 10, size=(100,)), "col2": rs.random(100)} + ) + + df1 = from_pandas_df(raw1, 
chunk_size=10) + df2 = from_pandas_df(raw2, chunk_size=20) + + filtered = filter_by_bloom_filter(df1, df2, "col1", "col1") + r1, r2, filtered_r = mars.fetch(mars.execute(df1, df2, filtered)) + assert r1.shape[0] > filtered_r.shape[0] + assert len(filtered_r[filtered_r["col1"] > 10]) < 10 + + pd.testing.assert_frame_equal(r1, raw1) + pd.testing.assert_frame_equal(r2, raw2) + pd.testing.assert_frame_equal( + filtered_r[filtered_r["col1"] <= 10], raw1[raw1["col1"] <= 10] + ) diff --git a/python/xorbits/_mars/dataframe/base/to_cpu.py b/python/xorbits/_mars/dataframe/base/to_cpu.py new file mode 100644 index 000000000..4b2e79406 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/to_cpu.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from .core import DataFrameDeviceConversionBase + + +class DataFrameToCPU(DataFrameDeviceConversionBase): + _op_type_ = OperandDef.TO_CPU + + def __init__(self, dtypes=None, output_types=None, **kw): + super().__init__(_dtypes=dtypes, _output_types=output_types, **kw) + if self.gpu or self.gpu is None: + self.gpu = False + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = ctx[op.inputs[0].key].to_pandas() + + +def to_cpu(df_or_series): + if df_or_series.op.gpu is False: + # if op.gpu is None, means unknown + return df_or_series + + op = DataFrameToCPU() + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/to_gpu.py b/python/xorbits/_mars/dataframe/base/to_gpu.py new file mode 100644 index 000000000..16e111f96 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/to_gpu.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from .core import DataFrameDeviceConversionBase + + +class DataFrameToGPU(DataFrameDeviceConversionBase): + _op_type_ = OperandDef.TO_GPU + + def __init__(self, dtypes=None, output_types=None, **kw): + super().__init__(_dtypes=dtypes, _output_types=output_types, **kw) + if not self.gpu: + self.gpu = True + + @classmethod + def execute(cls, ctx, op): + import cudf + + out_df = op.outputs[0] + if out_df.ndim == 2: + ctx[out_df.key] = cudf.DataFrame.from_pandas(ctx[op.inputs[0].key]) + else: + ctx[out_df.key] = cudf.Series.from_pandas(ctx[op.inputs[0].key]) + + +def to_gpu(df_or_series): + if df_or_series.op.gpu: + return df_or_series + + op = DataFrameToGPU() + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/base/to_numeric.py b/python/xorbits/_mars/dataframe/base/to_numeric.py new file mode 100644 index 000000000..762913bd1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/to_numeric.py @@ -0,0 +1,220 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pandas as pd + +from ...core import ENTITY_TYPE, OutputType +from ...serialization.serializables import StringField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ..core import SERIES_TYPE +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class DataFrameToNumeric(DataFrameOperand, DataFrameOperandMixin): + errors = StringField("errors") + downcast = StringField("downcast") + + def __init__(self, errors="raise", downcast=None, **kw): + super().__init__(errors=errors, downcast=downcast, **kw) + + def __call__(self, arg): + if isinstance(arg, pd.Series): + arg = asseries(arg) + elif not isinstance(arg, ENTITY_TYPE): + arg = astensor(arg) + if arg.ndim != 1: + raise ValueError("Input array must be 1 dimensional") + if arg.size == 0: + raise ValueError("Input array can not be empty") + + if isinstance(arg, asseries): + series = arg + self.output_types = [OutputType.series] + return self.new_series( + [series], + shape=series.shape, + name=series.name, + index_value=series.index_value, + dtype=series.dtype, + ) + else: + tensor = arg + self.output_types = [OutputType.tensor] + dtype = tensor.dtype + if dtype.kind == "U": + dtype = np.dtype(object) + return self.new_tileables([tensor], shape=tensor.shape, dtype=dtype)[0] + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + for in_chunk in in_df.chunks: + out_op = op.copy().reset_key() + chunk_kws = [] + if isinstance(out_df, SERIES_TYPE): + chunk_kws.append( + { + "dtype": out_df.dtype, + "shape": in_chunk.shape, + "index": in_chunk.index, + "index_value": in_chunk.index_value, + "name": in_chunk.name, + } + ) + elif isinstance(out_df, TENSOR_TYPE): + chunk_kws.append( + { + "dtype": out_df.dtype, + "shape": in_chunk.shape, + "order": TensorOrder.C_ORDER, + "index": in_chunk.index, + } + ) + 
out_chunks.append(out_op.new_chunk([in_chunk], kws=chunk_kws)) + + new_op = op.copy() + kw = out_df.params + kw["nsplits"] = in_df.nsplits + kw["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def execute(cls, ctx, op): + input_data = ctx[op.inputs[0].key] + errors_ = op.errors + downcast_ = op.downcast + ctx[op.outputs[0].key] = pd.to_numeric( + input_data, errors=errors_, downcast=downcast_ + ) + + +def to_numeric(arg, errors="raise", downcast=None): + """ + Convert argument to a numeric type. + + The default return dtype is `float64` or `int64` + depending on the data supplied. Use the `downcast` parameter + to obtain other dtypes. + + Please note that precision loss may occur if really large numbers + are passed in. Due to the internal limitations of `ndarray`, if + numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) + or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are + passed in, it is very likely they will be converted to float so that + they can stored in an `ndarray`. These warnings apply similarly to + `Series` since it internally leverages `ndarray`. + + Parameters + ---------- + arg : scalar, list, tuple, 1-d array, or Series + Argument to be converted. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaN. + - If 'ignore', then invalid parsing will return the input. + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + + Returns + ------- + ret + Numeric if parsing succeeded. + Return type depends on input. Series if Series, otherwise Tensor. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + numpy.ndarray.astype : Cast a numpy array to a specified type. + DataFrame.convert_dtypes : Convert dtypes. 
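# Hedged sketch complementing the Series examples below: per the `__call__`
# above, a plain 1-d list (or tensor) input is passed through `astensor` and the
# result comes back as a Mars tensor rather than a Series. Assumes mars is
# importable as in the examples below and a default local session is available.
import mars.dataframe as md

t = md.to_numeric(["1", "2", "-3.5"])
print(t.execute().fetch())  # -> array([ 1. ,  2. , -3.5])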
+ + Examples + -------- + Take separate series and convert to numeric, coercing when told to + + >>> s = md.Series(['1.0', '2', -3]) + >>> md.to_numeric(s).execute() + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> md.to_numeric(s, downcast='float').execute() + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> md.to_numeric(s, downcast='signed').execute() + 0 1 + 1 2 + 2 -3 + dtype: int8 + >>> s = md.Series(['apple', '1.0', '2', -3]) + >>> md.to_numeric(s, errors='ignore').execute() + 0 apple + 1 1.0 + 2 2 + 3 -3 + dtype: object + >>> md.to_numeric(s, errors='coerce').execute() + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 + + Downcasting of nullable integer and floating dtypes is supported: + + >>> s = md.Series([1, 2, 3], dtype="int64") + >>> md.to_numeric(s, downcast="integer").execute() + 0 1 + 1 2 + 2 3 + dtype: int8 + >>> s = md.Series([1.0, 2.1, 3.0], dtype="float64") + >>> md.to_numeric(s, downcast="float").execute() + 0 1.0 + 1 2.1 + 2 3.0 + dtype: float32 + """ + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("invalid error value specified") + if downcast not in (None, "integer", "signed", "unsigned", "float"): + raise ValueError("invalid downcasting method provided") + + op = DataFrameToNumeric(errors=errors, downcast=downcast) + return op(arg) diff --git a/python/xorbits/_mars/dataframe/base/transform.py b/python/xorbits/_mars/dataframe/base/transform.py new file mode 100644 index 000000000..e72bca0cb --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/transform.py @@ -0,0 +1,533 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core import OutputType, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import AnyField, BoolField, DictField, TupleField +from ...utils import enter_current_session, pd_release_version, quiet_stdio +from ..core import DATAFRAME_CHUNK_TYPE, DATAFRAME_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_df, + build_series, + filter_dtypes_by_index, + make_dtypes, + parse_index, + validate_axis, +) + +_with_convert_dtype = pd_release_version < (1, 2, 0) + + +class TransformOperand(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.TRANSFORM + + _func = AnyField("func") + _axis = AnyField("axis") + _convert_dtype = BoolField("convert_dtype") + _args = TupleField("args") + _kwds = DictField("kwds") + + _call_agg = BoolField("call_agg") + + def __init__( + self, + func=None, + axis=None, + convert_dtype=None, + args=None, + kwds=None, + call_agg=None, + output_types=None, + memory_scale=None, + **kw + ): + super().__init__( + _func=func, + _axis=axis, + _convert_dtype=convert_dtype, + _args=args, + _kwds=kwds, + _call_agg=call_agg, + _output_types=output_types, + _memory_scale=memory_scale, + **kw + ) + + @property + def func(self): + return self._func + + @property + def convert_dtype(self): + return self._convert_dtype + + @property + def axis(self): + return self._axis + + @property + def args(self): + return getattr(self, "_args", None) or () + + @property + def kwds(self): + return getattr(self, "_kwds", None) or dict() + + @property + def call_agg(self): + return self._call_agg + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if op.call_agg: + result = in_data.agg(op.func, axis=op.axis, *op.args, **op.kwds) + else: + result = in_data.transform(op.func, axis=op.axis, *op.args, **op.kwds) + + if isinstance(out_chunk, DATAFRAME_CHUNK_TYPE): + if out_chunk.dtypes is not None: + result.columns = out_chunk.dtypes.index + ctx[op.outputs[0].key] = result + + @classmethod + def tile(cls, op: "TransformOperand"): + in_df = op.inputs[0] + out_df = op.outputs[0] + axis = op.axis + + if isinstance(in_df, DATAFRAME_TYPE): + if in_df.chunk_shape[axis] > 1: + chunk_size = ( + in_df.shape[axis], + max(1, options.chunk_store_limit // in_df.shape[axis]), + ) + if axis == 1: + chunk_size = chunk_size[::-1] + in_df = yield from recursive_tile(in_df.rechunk(chunk_size)) + elif isinstance(op.func, str) or ( + isinstance(op.func, list) and any(isinstance(e, str) for e in op.func) + ): + # builtin cols handles whole columns, thus merge is needed + if in_df.chunk_shape[0] > 1: + in_df = yield from recursive_tile(in_df.rechunk((in_df.shape[axis],))) + + chunks = [] + axis_index_map = dict() + col_sizes = [] + for c in in_df.chunks: + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + params = c.params.copy() + + if out_df.ndim == 2: + if out_df.dtypes is None: + new_dtypes = None + new_shape = [c.shape[0], np.nan] + new_index = c.index + new_columns_value = None + if c.index[0] == 0: + col_sizes.append(np.nan) + elif isinstance(c, DATAFRAME_CHUNK_TYPE): + columns = c.columns_value.to_pandas() + new_dtypes = filter_dtypes_by_index(out_df.dtypes, columns) + + if len(new_dtypes) == 0: + continue + if c.index[0] == 0: + col_sizes.append(len(new_dtypes)) + + new_index = list(c.index) + try: + new_index[1 - op.axis] = 
axis_index_map[c.index[1 - op.axis]] + except KeyError: + new_index[1 - op.axis] = axis_index_map[ + c.index[1 - op.axis] + ] = len(axis_index_map) + + if isinstance(op.func, dict): + new_op._func = dict( + (k, v) for k, v in op.func.items() if k in new_dtypes + ) + + new_shape = list(c.shape) + new_shape[1] = len(new_dtypes) + + if op.call_agg: + new_shape[op.axis] = np.nan + params["index_value"] = parse_index( + None, c.key, c.index_value.key + ) + new_columns_value = parse_index(new_dtypes.index) + else: + new_dtypes = out_df.dtypes + new_index = c.index + (0,) + new_shape = [c.shape[0], len(new_dtypes)] + if op.call_agg: + new_shape[0] = np.nan + if c.index[0] == 0: + col_sizes.append(len(new_dtypes)) + new_columns_value = out_df.columns_value + params.update( + dict( + dtypes=new_dtypes, + shape=tuple(new_shape), + index=tuple(new_index), + columns_value=new_columns_value, + ) + ) + else: + params["dtype"] = out_df.dtype + if isinstance(in_df, DATAFRAME_TYPE): + params.pop("columns_value", None) + params["index_value"] = out_df.index_value + params["shape"] = (c.shape[1 - op.axis],) + params["index"] = (c.index[1 - op.axis],) + chunks.append(new_op.new_chunk([c], **params)) + + if out_df.ndim == 2: + new_nsplits = [in_df.nsplits[0], tuple(col_sizes)] + if op.call_agg: + new_nsplits[op.axis] = (np.nan,) + elif op.call_agg: + if isinstance(in_df, DATAFRAME_TYPE): + new_nsplits = (in_df.nsplits[1],) + else: + new_nsplits = ((np.nan,),) + else: + new_nsplits = in_df.nsplits + + new_op = op.copy() + kw = out_df.params.copy() + kw.update(dict(chunks=chunks, nsplits=tuple(new_nsplits))) + return new_op.new_tileables(op.inputs, **kw) + + def _infer_df_func_returns(self, df, dtypes): + if self.output_types[0] == OutputType.dataframe: + test_df = build_df(df, fill_value=1, size=2) + try: + with np.errstate(all="ignore"), quiet_stdio(): + if self.call_agg: + infer_df = test_df.agg( + self._func, axis=self._axis, *self.args, **self.kwds + ) + else: + infer_df = test_df.transform( + self._func, axis=self._axis, *self.args, **self.kwds + ) + except: # noqa: E722 + infer_df = None + else: + test_df = build_series(df, size=2, name=df.name) + try: + with np.errstate(all="ignore"), quiet_stdio(): + if self.call_agg: + infer_df = test_df.agg(self._func, args=self.args, **self.kwds) + else: + if not _with_convert_dtype: + infer_df = test_df.transform( + self._func, *self.args, **self.kwds + ) + else: # pragma: no cover + infer_df = test_df.transform( + self._func, + convert_dtype=self.convert_dtype, + args=self.args, + **self.kwds + ) + except: # noqa: E722 + infer_df = None + + if infer_df is None and dtypes is None: + raise TypeError( + "Failed to infer dtype, please specify dtypes as arguments." 
+ ) + + if infer_df is None: + is_df = self.output_types[0] == OutputType.dataframe + else: + is_df = isinstance(infer_df, pd.DataFrame) + + if is_df: + new_dtypes = make_dtypes(dtypes) if dtypes is not None else infer_df.dtypes + self.output_types = [OutputType.dataframe] + else: + new_dtypes = ( + dtypes if dtypes is not None else (infer_df.name, infer_df.dtype) + ) + self.output_types = [OutputType.series] + + return new_dtypes + + def __call__(self, df, dtypes=None, index=None, skip_infer=None): + axis = getattr(self, "axis", None) or 0 + self._axis = validate_axis(axis, df) + + if not skip_infer: + dtypes = self._infer_df_func_returns(df, dtypes) + + if self.output_types[0] == OutputType.dataframe: + new_shape = list(df.shape) + new_index_value = df.index_value + if len(new_shape) == 1: + new_shape.append(len(dtypes) if dtypes is not None else np.nan) + else: + new_shape[1] = len(dtypes) if dtypes is not None else np.nan + + if self.call_agg: + new_shape[self.axis] = np.nan + new_index_value = parse_index(None, (df.key, df.index_value.key)) + if dtypes is None: + columns_value = None + else: + columns_value = parse_index(dtypes.index, store_data=True) + return self.new_dataframe( + [df], + shape=tuple(new_shape), + dtypes=dtypes, + index_value=new_index_value, + columns_value=columns_value, + ) + else: + if dtypes is not None: + name, dtype = dtypes + else: + name, dtype = None, None + + if isinstance(df, DATAFRAME_TYPE): + new_shape = (df.shape[1 - axis],) + new_index_value = [df.columns_value, df.index_value][axis] + else: + new_shape = (np.nan,) if self.call_agg else df.shape + new_index_value = df.index_value + + return self.new_series( + [df], + shape=new_shape, + name=name, + dtype=dtype, + index_value=new_index_value, + ) + + +def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs): + """ + Call ``func`` on self producing a DataFrame with transformed values. + + Produced DataFrame will have same axis length as self. + + Parameters + ---------- + func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. + axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index': apply function to each column. + If 1 or 'columns': apply function to each row. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + DataFrame + A DataFrame that must have the same length as self. + + Raises + ------ + ValueError : If the returned DataFrame has a different length than self. + + See Also + -------- + DataFrame.agg : Only perform aggregating type operations. + DataFrame.apply : Invoke function on a DataFrame. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock DataFrame and the apply call may + fail. When this happens, you need to specify a list or a pandas + Series as ``dtypes`` of output DataFrame. 
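# Hedged sketch of the fallback described in the Notes above: when dtype
# inference against the mock DataFrame fails, pass `dtypes` explicitly. The
# column names and the lambda are illustrative only; assumes mars is importable
# as below with a default local session.
import numpy as np
import pandas as pd
import mars.dataframe as md

mdf = md.DataFrame(pd.DataFrame({"a": ["x1", "x2"], "b": [10, 20]}))
r = mdf.transform(
    # fails on the all-1 mock frame used for inference, so dtypes are supplied
    lambda col: col.str.slice(1).astype(int) if col.name == "a" else col,
    dtypes=pd.Series([np.dtype(int), np.dtype(int)], index=["a", "b"]),
)
print(r.execute().fetch())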
+ + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(3), 'B': range(1, 4)}) + >>> df.execute() + A B + 0 0 1 + 1 1 2 + 2 2 3 + >>> df.transform(lambda x: x + 1).execute() + A B + 0 1 2 + 1 2 3 + 2 3 4 + + Even though the resulting DataFrame must have the same length as the + input DataFrame, it is possible to provide several input functions: + + >>> s = md.Series(range(3)) + >>> s.execute() + 0 0 + 1 1 + 2 2 + dtype: int64 + >>> s.transform([mt.sqrt, mt.exp]).execute() + sqrt exp + 0 0.000000 1.000000 + 1 1.000000 2.718282 + 2 1.414214 7.389056 + """ + op = TransformOperand( + func=func, + axis=axis, + args=args, + kwds=kwargs, + output_types=[OutputType.dataframe], + call_agg=kwargs.pop("_call_agg", False), + ) + return op(df, dtypes=dtypes, skip_infer=skip_infer) + + +def series_transform( + series, + func, + convert_dtype=True, + axis=0, + *args, + skip_infer=False, + dtype=None, + **kwargs +): + """ + Call ``func`` on self producing a Series with transformed values. + + Produced Series will have same axis length as self. + + Parameters + ---------- + func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a Series or when passed to Series.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. + axis : {0 or 'index'} + Parameter needed for compatibility with DataFrame. + + dtype : numpy.dtype, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + Series + A Series that must have the same length as self. + + Raises + ------ + ValueError : If the returned Series has a different length than self. + + See Also + -------- + Series.agg : Only perform aggregating type operations. + Series.apply : Invoke function on a Series. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock Series, and the transform call may + fail. When this happens, you need to specify ``dtype`` of output + Series. 
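# Hedged sketch matching the Notes above: for a Series, the output dtype can be
# supplied via `dtype` when inference against the mock Series would fail. Names
# and values are illustrative only; assumes mars is importable as below with a
# default local session.
import numpy as np
import pandas as pd
import mars.dataframe as md

ms = md.Series(pd.Series(["a1", "a2", "a3"]))
r = ms.transform(lambda x: x.str.slice(1).astype(int), dtype=np.dtype(int))
print(r.execute().fetch())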
+ + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': range(3), 'B': range(1, 4)}) + >>> df.execute() + A B + 0 0 1 + 1 1 2 + 2 2 3 + >>> df.transform(lambda x: x + 1).execute() + A B + 0 1 2 + 1 2 3 + 2 3 4 + + Even though the resulting Series must have the same length as the + input Series, it is possible to provide several input functions: + + >>> s = md.Series(range(3)) + >>> s.execute() + 0 0 + 1 1 + 2 2 + dtype: int64 + >>> s.transform([mt.sqrt, mt.exp]).execute() + sqrt exp + 0 0.000000 1.000000 + 1 1.000000 2.718282 + 2 1.414214 7.389056 + """ + op = TransformOperand( + func=func, + axis=axis, + convert_dtype=convert_dtype, + args=args, + kwds=kwargs, + output_types=[OutputType.series], + call_agg=kwargs.pop("_call_agg", False), + ) + dtypes = (series.name, dtype) if dtype is not None else None + return op(series, dtypes=dtypes, skip_infer=skip_infer) diff --git a/python/xorbits/_mars/dataframe/base/transpose.py b/python/xorbits/_mars/dataframe/base/transpose.py new file mode 100644 index 000000000..229dc0dff --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/transpose.py @@ -0,0 +1,169 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameTranspose(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.TRANSPOSE + + def __init__(self, **kw): + super().__init__(**kw) + self.output_types = [OutputType.dataframe] + + def __call__(self, args): + arg = args[0] + new_shape = arg.shape[::-1] + columns_value = arg.index_value + index_value = parse_index(arg.dtypes.index) + return self.new_dataframe( + [arg], + shape=new_shape, + dtypes=None, + columns_value=columns_value, + index_value=index_value, + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + for c in op.inputs[0].chunks: + chunk_op = op.copy().reset_key() + chunk_shape = tuple(s if np.isnan(s) else int(s) for s in c.shape[::-1]) + chunk_idx = c.index[::-1] + index_value = parse_index(c.dtypes.index) + columns_value = c.index_value + out_chunk = chunk_op.new_chunk( + [c], + shape=chunk_shape, + index=chunk_idx, + index_value=index_value, + columns_value=columns_value, + dtypes=None, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = op.inputs[0].nsplits[::-1] + params = op.outputs[0].params + return new_op.new_dataframe( + op.inputs, chunks=out_chunks, nsplits=nsplits, **params + ) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = inp.transpose() + + +def transpose(*args): + """ + Transpose index and columns. + + Reflect the DataFrame over its main diagonal by writing rows as columns + and vice-versa. The property :attr:`.T` is an accessor to the method + :meth:`transpose`. 
+ + Parameters + ---------- + *args : tuple, optional + Accepted for compatibility with NumPy. + + Returns + ------- + DataFrame + The transposed DataFrame. + + See Also + -------- + numpy.transpose : Permute the dimensions of a given array. + + Notes + ----- + Transposing a DataFrame with mixed dtypes will result in a homogeneous + DataFrame with the `object` dtype. + + Examples + -------- + **Square DataFrame with homogeneous dtype** + + >>> import mars.dataframe as md + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> df1 = md.DataFrame(data=d1).execute() + >>> df1 + col1 col2 + 0 1 3 + 1 2 4 + + >>> df1_transposed = df1.T.execute() # or df1.transpose().execute() + >>> df1_transposed + 0 1 + col1 1 2 + col2 3 4 + + When the dtype is homogeneous in the original DataFrame, we get a + transposed DataFrame with the same dtype: + + >>> df1.dtypes + col1 int64 + col2 int64 + dtype: object + + >>> df1_transposed.dtypes + 0 int64 + 1 int64 + dtype: object + + **Non-square DataFrame with mixed dtypes** + + >>> d2 = {'name': ['Alice', 'Bob'], + ... 'score': [9.5, 8], + ... 'employed': [False, True], + ... 'kids': [0, 0]} + >>> df2 = md.DataFrame(data=d2).execute() + >>> df2 + name score employed kids + 0 Alice 9.5 False 0 + 1 Bob 8.0 True 0 + + >>> df2_transposed = df2.T.execute() # or df2.transpose().execute() + >>> df2_transposed + 0 1 + name Alice Bob + score 9.5 8.0 + employed False True + kids 0 0 + + When the DataFrame has mixed dtypes, we get a transposed DataFrame with + the `object` dtype: + + >>> df2.dtypes + name object + score float64 + employed bool + kids int64 + dtype: object + + >>> df2_transposed.dtypes + 0 object + 1 object + dtype: object + """ + op = DataFrameTranspose() + return op(args) diff --git a/python/xorbits/_mars/dataframe/base/value_counts.py b/python/xorbits/_mars/dataframe/base/value_counts.py new file mode 100644 index 000000000..260257c36 --- /dev/null +++ b/python/xorbits/_mars/dataframe/base/value_counts.py @@ -0,0 +1,294 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int64Field, KeyField, StringField +from ...utils import has_unknown_shape, pd_release_version +from ..core import Series +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_series, parse_index + +_keep_original_order = pd_release_version >= (1, 3, 0) + + +class DataFrameValueCounts(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.VALUE_COUNTS + + input = KeyField("input") + normalize = BoolField("normalize") + sort = BoolField("sort") + ascending = BoolField("ascending") + bins = Int64Field("bins") + dropna = BoolField("dropna") + method = StringField("method") + convert_index_to_interval = BoolField("convert_index_to_interval", default=None) + nrows = Int64Field("nrows", default=None) + + def __init__(self, **kw): + super().__init__(**kw) + self.output_types = [OutputType.series] + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input = self._inputs[0] + + def __call__(self, inp): + test_series = build_series(inp).value_counts(normalize=self.normalize) + if self.bins is not None: + from .cut import cut + + # cut + try: + inp = cut(inp, self.bins, include_lowest=True) + except TypeError: # pragma: no cover + raise TypeError("bins argument only works with numeric data.") + + self.bins = None + self.convert_index_to_interval = True + return self.new_series( + [inp], + shape=(np.nan,), + index_value=parse_index(pd.CategoricalIndex([]), inp, store_data=False), + name=inp.name, + dtype=test_series.dtype, + ) + else: + return self.new_series( + [inp], + shape=(np.nan,), + index_value=parse_index(test_series.index, store_data=False), + name=inp.name, + dtype=test_series.dtype, + ) + + @classmethod + def tile(cls, op: "DataFrameValueCounts"): + inp = op.input + out = op.outputs[0] + + if len(inp.chunks) == 1: + chunk_op = op.copy().reset_key() + chunk_param = out.params + chunk_param["index"] = (0,) + chunk = chunk_op.new_chunk(inp.chunks, kws=[chunk_param]) + + new_op = op.copy() + param = out.params + param["chunks"] = [chunk] + param["nsplits"] = ((np.nan,),) + return new_op.new_seriess(op.inputs, kws=[param]) + + inp = Series(inp) + + if op.dropna: + inp = inp.dropna() + + inp = inp.groupby(inp, sort=not _keep_original_order).count(method=op.method) + + if op.normalize: + if op.convert_index_to_interval: + if has_unknown_shape(op.input): + yield + inp = inp.truediv(op.input.shape[0], axis=0) + else: + inp = inp.truediv(inp.sum(), axis=0) + + if op.sort: + inp = inp.sort_values( + ascending=op.ascending, + kind="mergesort" if _keep_original_order else "quicksort", + ) + + if op.nrows: + # set to sort_values + inp.op.nrows = op.nrows + elif op.nrows: + inp = inp.iloc[: op.nrows] + + ret = yield from recursive_tile(inp) + + chunks = [] + for c in ret.chunks: + chunk_op = DataFrameValueCounts( + convert_index_to_interval=op.convert_index_to_interval, + stage=OperandStage.map, + ) + chunk_params = c.params + if op.convert_index_to_interval: + # convert index to IntervalDtype + chunk_params["index_value"] = parse_index( + pd.IntervalIndex([]), c, store_data=False + ) + chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + + new_op = op.copy() + params = out.params + params["chunks"] = chunks + params["nsplits"] = ret.nsplits + return new_op.new_seriess(out.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameValueCounts"): + if op.stage 
!= OperandStage.map: + in_data = ctx[op.input.key] + if op.convert_index_to_interval: + result = in_data.value_counts( + normalize=False, + sort=op.sort, + ascending=op.ascending, + bins=op.bins, + dropna=op.dropna, + ) + if op.normalize: + result /= in_data.shape[0] + else: + try: + result = in_data.value_counts( + normalize=op.normalize, + sort=op.sort, + ascending=op.ascending, + bins=op.bins, + dropna=op.dropna, + ) + except ValueError: + in_data = in_data.copy() + result = in_data.value_counts( + normalize=op.normalize, + sort=op.sort, + ascending=op.ascending, + bins=op.bins, + dropna=op.dropna, + ) + else: + result = ctx[op.input.key] + # set index name to None to keep consistency with pandas + result.index.name = None + if op.convert_index_to_interval: + # convert CategoricalDtype which generated in `cut` + # to IntervalDtype + result.index = result.index.astype("interval") + if op.nrows: + result = result.head(op.nrows) + ctx[op.outputs[0].key] = result + + +def value_counts( + series, + normalize=False, + sort=True, + ascending=False, + bins=None, + dropna=True, + method="auto", +): + """ + Return a Series containing counts of unique values. + + The resulting object will be in descending order so that the + first element is the most frequently-occurring element. + Excludes NA values by default. + + Parameters + ---------- + normalize : bool, default False + If True then the object returned will contain the relative + frequencies of the unique values. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + bins : int, optional + Rather than count values, group them into half-open bins, + a convenience for ``pd.cut``, only works with numeric data. + dropna : bool, default True + Don't include counts of NaN. + method : str, default 'auto' + 'auto', 'shuffle', or 'tree', 'tree' method provide + a better performance, while 'shuffle' is recommended + if aggregated result is very large, 'auto' will use + 'shuffle' method in distributed mode and use 'tree' + in local mode. + + Returns + ------- + Series + + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.count: Number of non-NA elements in a DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> import mars.tensor as mt + + >>> s = md.Series([3, 1, 2, 3, 4, mt.nan]) + >>> s.value_counts().execute() + 3.0 2 + 4.0 1 + 2.0 1 + 1.0 1 + dtype: int64 + + With `normalize` set to `True`, returns the relative frequency by + dividing all values by the sum of values. + + >>> s = md.Series([3, 1, 2, 3, 4, mt.nan]) + >>> s.value_counts(normalize=True).execute() + 3.0 0.4 + 4.0 0.2 + 2.0 0.2 + 1.0 0.2 + dtype: float64 + + **bins** + + Bins can be useful for going from a continuous variable to a + categorical variable; instead of counting unique + apparitions of values, divide the index in the specified + number of half-open bins. + + >>> s.value_counts(bins=3).execute() + (2.0, 3.0] 2 + (0.996, 2.0] 2 + (3.0, 4.0] 1 + dtype: int64 + + **dropna** + + With `dropna` set to `False` we can also see NaN index values. 
+ + >>> s.value_counts(dropna=False).execute() + 3.0 2 + NaN 1 + 4.0 1 + 2.0 1 + 1.0 1 + dtype: int64 + """ + op = DataFrameValueCounts( + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + method=method, + ) + return op(series) diff --git a/python/xorbits/_mars/dataframe/contrib/__init__.py b/python/xorbits/_mars/dataframe/contrib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/__init__.py b/python/xorbits/_mars/dataframe/contrib/raydataset/__init__.py new file mode 100644 index 000000000..7bab2ce69 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dataset import get_chunk_refs, to_ray_dataset +from .mldataset import ChunkRefBatch, to_ray_mldataset diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/dataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/dataset.py new file mode 100644 index 000000000..5e900bf53 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/dataset.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
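+
+# A minimal usage sketch for this module (the names below are illustrative
+# only; a Mars-on-Ray session is assumed to be initialized already):
+#
+#     import mars.dataframe as md
+#     mdf = md.DataFrame({"a": [1, 2, 3]}).execute()
+#     ray_ds = to_ray_dataset(mdf, num_shards=2)  # a ray.data.Dataset
+#     ray_ds.show()
+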
+import operator +from functools import reduce +from typing import Dict, List + +from ....utils import lazy_import +from .mldataset import _rechunk_if_needed + +ray = lazy_import("ray") +# Ray Datasets is available in early preview at ray.data with Ray 1.6+ +# (and ray.experimental.data in Ray 1.5) +ray_dataset = lazy_import("ray.data", rename="ray_dataset") + + +def to_ray_dataset(df, num_shards: int = None): + """Create a Ray Dataset from Mars DataFrame + + Args: + df (mars.dataframe.Dataframe): the Mars DataFrame + num_shards (int, optional): the number of shards that will be created + for the Ray Dataset. Defaults to None. + If num_shards is None, chunks will be grouped by nodes where they lie. + Otherwise, chunks will be grouped by their order in DataFrame. + + Returns: + a Ray Dataset + """ + df = _rechunk_if_needed(df, num_shards) + # chunk_addr_refs is fetched directly rather than in batches + # during `fetch` procedure, it'll be checked that df has been executed + # items in chunk_addr_refs are ordered by positions in df + # while adjacent chunks may belong to different addrs, i.e. + # chunk1 for addr1, + # chunk2 & chunk3 for addr2, + # chunk4 for addr1 + chunk_refs: List["ray.ObjectRef"] = get_chunk_refs(df) + dataset = ray_dataset.from_pandas_refs(chunk_refs) + # Hold mars dataframe to avoid mars dataframe and ray object gc. + dataset.dataframe = df + + def __getstate__(): + state = dataset.__dict__.copy() + state.pop("dataframe", None) + return state + + # `dataframe` is not serializable by ray. + dataset.__getstate__ = __getstate__ + return dataset + + +def get_chunk_refs(df): + fetched_infos: Dict[str, List] = df.fetch_infos(["object_refs"]) + object_refs = reduce(operator.concat, fetched_infos["object_refs"]) + return object_refs diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/mldataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/mldataset.py new file mode 100644 index 000000000..6384c3116 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/mldataset.py @@ -0,0 +1,137 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from typing import Dict, Iterable, List, Tuple + +import numpy as np +import pandas as pd + +from ....utils import lazy_import + +ray = lazy_import("ray") +parallel_it = lazy_import("ray.util.iter", rename="parallel_it") +ml_dataset = lazy_import("ray.util.data", rename="ml_dataset") + + +class ChunkRefBatch: + def __init__(self, shard_id: int, obj_refs: "ray.ObjectRef"): + """Iterable batch holding a list of ray.ObjectRefs. 
+ + Args: + shard_id (int): id of the shard + prefix (str): prefix name of the batch + obj_refs (List[ray.ObjectRefs]): list of ray.ObjectRefs + """ + self._shard_id = shard_id + self._obj_refs = obj_refs + + @property + def shard_id(self) -> int: + return self._shard_id + + def __iter__(self) -> Iterable[pd.DataFrame]: + """Returns the item_generator required from ParallelIteratorWorker.""" + for obj_ref in self._obj_refs: + yield ray.get(obj_ref) + + +def _group_chunk_refs( + chunk_addr_refs: List[Tuple[Tuple, "ray.ObjectRef"]], num_shards: int +): + """Group fetched ray.ObjectRefs into a dict for later use. + + Args: + chunk_addr_refs (List[Tuple[Tuple, ray.ObjectRef]]): a list of tuples of + band & ray.ObjectRef of each chunk. + num_shards (int): the number of shards that will be created for the MLDataset. + + Returns: + Dict[str, List[ray.ObjectRef]]: a dict that defines which group of ray.ObjectRefs will + be in an ChunkRefBatch. + """ + group_to_obj_refs = defaultdict(list) + if not num_shards: + for addr, obj_ref in chunk_addr_refs: + group_to_obj_refs[addr].append(obj_ref) + else: + splits = np.array_split([ref for _, ref in chunk_addr_refs], num_shards) + for idx, split in enumerate(splits): + group_to_obj_refs["group-" + str(idx)] = list(split) + return group_to_obj_refs + + +def _rechunk_if_needed(df, num_shards: int = None): + try: + if num_shards: + assert isinstance(num_shards, int) and num_shards > 0 + df = df.rebalance(axis=0, num_partitions=num_shards) + df = df.rechunk({1: df.shape[1]}) + df = df.reset_index(drop=True) + return df.execute() + except Exception as e: # pragma: no cover + raise Exception(f"rechunk failed df.shape {df.shape}") from e + + +def to_ray_mldataset(df, num_shards: int = None): + """Create a MLDataset from Mars DataFrame + + Args: + df (mars.dataframe.Dataframe): the Mars DataFrame + num_shards (int, optional): the number of shards that will be created + for the MLDataset. Defaults to None. + If num_shards is None, chunks will be grouped by nodes where they lie. + Otherwise, chunks will be grouped by their order in DataFrame. + + Returns: + a MLDataset + """ + df = _rechunk_if_needed(df, num_shards) + # chunk_addr_refs is fetched directly rather than in batches + # during `fetch` procedure, it'll be checked that df has been executed + # items in chunk_addr_refs are ordered by positions in df + # while adjacent chunks may belong to different addrs, i.e. + # chunk1 for addr1, + # chunk2 & chunk3 for addr2, + # chunk4 for addr1 + fetched_infos: Dict[str, List] = df.fetch_infos(fields=["bands", "object_refs"]) + chunk_addr_refs: List[Tuple[Tuple, "ray.ObjectRef"]] = [] + for bands, object_refs in zip(fetched_infos["bands"], fetched_infos["object_refs"]): + chunk_addr_ref = ( + (bands[0], object_refs[0]) if bands else ("ray_dag_0", object_refs[0]) + ) + chunk_addr_refs.append(chunk_addr_ref) + + group_to_obj_refs: Dict[str, List[ray.ObjectRef]] = _group_chunk_refs( + chunk_addr_refs, num_shards + ) + + record_batches = [] + for rank, obj_refs in enumerate(group_to_obj_refs.values()): + record_batches.append(ChunkRefBatch(shard_id=rank, obj_refs=obj_refs)) + worker_cls = ray.remote(num_cpus=0)(parallel_it.ParallelIteratorWorker) + actors = [worker_cls.remote(g, False) for g in record_batches] + it = parallel_it.from_actors(actors, "from_mars") + dataset = ml_dataset.from_parallel_iter(it, need_convert=False, batch_size=0) + # Hold mars dataframe to avoid mars dataframe and ray object gc. 
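+    # Without this back-reference the Mars DataFrame could be garbage
+    # collected, which would in turn release the Ray ObjectRefs that the
+    # MLDataset shards still point to.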
+ dataset.dataframe = df + + def __getstate__(): + state = dataset.__dict__.copy() + state.pop("dataframe", None) + return state + + # `dataframe` is not serializable by ray. + dataset.__getstate__ = __getstate__ + return dataset diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/tests/__init__.py b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_mldataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_mldataset.py new file mode 100644 index 000000000..14d612e0c --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_mldataset.py @@ -0,0 +1,150 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pandas as pd +import pytest + +from ..... 
import dataframe as md +from .....conftest import MARS_CI_BACKEND +from .....deploy.oscar.ray import new_cluster +from .....deploy.oscar.session import new_session +from .....tests.core import require_ray +from .....utils import lazy_import +from ....contrib import raydataset as mdd +from ....utils import ray_deprecate_ml_dataset + +ray = lazy_import("ray") +ml_dataset = lazy_import("ray.util.data", rename="ml_dataset") + +try: + import xgboost_ray +except ImportError: # pragma: no cover + xgboost_ray = None +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + + +@pytest.fixture +async def create_cluster(request): + client = await new_cluster( + supervisor_mem=256 * 1024**2, + worker_num=2, + worker_cpu=1, + worker_mem=256 * 1024**2, + backend=MARS_CI_BACKEND, + ) + async with client: + yield client + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +async def test_dataset_related_classes(ray_start_regular_shared): + from ..mldataset import ChunkRefBatch + + # in order to pass checks + value1 = np.random.rand(10, 10) + value2 = np.random.rand(10, 10) + df1 = pd.DataFrame(value1) + df2 = pd.DataFrame(value2) + if ray: + obj_ref1, obj_ref2 = ray.put(df1), ray.put(df2) + batch = ChunkRefBatch(shard_id=0, obj_refs=[obj_ref1, obj_ref2]) + assert batch.shard_id == 0 + # the first data in batch + batch = iter(batch) + pd.testing.assert_frame_equal(next(batch), df1) + pd.testing.assert_frame_equal(next(batch), df2) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("chunk_size_and_num_shards", [[5, 5], [5, 4], [None, None]]) +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +async def test_convert_to_ray_mldataset( + ray_start_regular_shared, create_cluster, chunk_size_and_num_shards +): + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + value = np.random.rand(10, 10) + chunk_size, num_shards = chunk_size_and_num_shards + df: md.DataFrame = md.DataFrame(value, chunk_size=chunk_size) + df.execute() + + ds = mdd.to_ray_mldataset(df, num_shards=num_shards) + assert isinstance(ds, ml_dataset.MLDataset) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +async def test_mars_with_xgboost(ray_start_regular_shared, create_cluster): + from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, predict, train + + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True) + df: md.DataFrame = md.concat( + [md.DataFrame(train_x), md.DataFrame(train_y)], axis=1 + ) + df.execute() + + num_shards = 4 + ds = mdd.to_ray_mldataset(df, num_shards) + assert isinstance(ds, ml_dataset.MLDataset) + + import gc + + gc.collect() # Ensure MLDataset does hold mars dataframe to avoid gc. 
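+        # The explicit collection above is deliberate: if `ds` did not keep a
+        # reference to the Mars DataFrame, the backing ObjectRefs could be
+        # released here and the training below would fail.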
+ + # train + train_set = RayDMatrix(ds, "target") + evals_result = {} + bst = train( + { + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + }, + train_set, + evals_result=evals_result, + evals=[(train_set, "train")], + verbose_eval=False, + ray_params=RayParams( + num_actors=num_shards, cpus_per_actor=1 # Number of remote actors + ), + ) + bst.save_model("model.xgb") + assert os.path.exists("model.xgb") + os.remove("model.xgb") + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) + predict(bst, train_set, ray_params=RayParams(num_actors=2)) diff --git a/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_raydataset.py b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_raydataset.py new file mode 100644 index 000000000..7d2b4b6b4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/contrib/raydataset/tests/test_raydataset.py @@ -0,0 +1,188 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from .....conftest import MARS_CI_BACKEND +from .....deploy.oscar.ray import new_cluster +from .....deploy.oscar.session import new_session +from .....tests.core import require_ray +from .....utils import lazy_import +from ....contrib import raydataset as mdd + +ray = lazy_import("ray") +# Ray Datasets is available in early preview at ray.data with Ray 1.6+ +# (and ray.experimental.data in Ray 1.5) +ray_dataset = lazy_import("ray.data", rename="ray_dataset") +xgboost_ray = lazy_import("xgboost_ray") +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + + +@pytest.fixture +async def create_cluster(request): + client = await new_cluster( + supervisor_mem=256 * 1024**2, + worker_num=2, + worker_cpu=1, + worker_mem=256 * 1024**2, + backend=MARS_CI_BACKEND, + ) + async with client: + yield client + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("chunk_size_and_num_shards", [[3, 3], [3, 2], [None, None]]) +async def test_convert_to_ray_dataset( + ray_start_regular_shared, create_cluster, chunk_size_and_num_shards +): + assert create_cluster.session + session = new_session(address=create_cluster.address, default=True) + with session: + value = np.random.rand(10, 10) + chunk_size, num_shards = chunk_size_and_num_shards + # ray dataset needs str columns + df: md.DataFrame = md.DataFrame( + value, + chunk_size=chunk_size, + columns=[f"c{i}" for i in range(value.shape[1])], + ) + df.execute() + + ds = mdd.to_ray_dataset(df, num_shards=num_shards) + assert isinstance(ds, ray_dataset.Dataset) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +async def test_mars_with_xgboost(ray_start_regular_shared, create_cluster): + from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, train + + assert create_cluster.session + session = 
new_session(address=create_cluster.address, backend="ray") + with session: + train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True) + pd_df = pd.concat([train_x, train_y], axis=1) + df: md.DataFrame = md.DataFrame(pd_df) + df.execute() + + num_shards = 4 + ds = md.to_ray_dataset(df, num_shards=num_shards) + assert isinstance(ds, ray_dataset.Dataset) + + # train + train_set = RayDMatrix(ds, "target") + evals_result = {} + bst = train( + { + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + }, + train_set, + evals_result=evals_result, + evals=[(train_set, "train")], + verbose_eval=False, + ray_params=RayParams( + num_actors=num_shards, cpus_per_actor=1 # Number of remote actors + ), + ) + bst.save_model("model.xgb") + assert os.path.exists("model.xgb") + os.remove("model.xgb") + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(sklearn is None, reason="sklearn not installed") +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +async def test_mars_with_xgboost_sklearn_clf(ray_start_regular_shared, create_cluster): + from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, RayXGBClassifier + + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + train_x, train_y = load_breast_cancer(return_X_y=True, as_frame=True) + df: md.DataFrame = md.concat( + [md.DataFrame(train_x), md.DataFrame(train_y)], axis=1 + ) + df.execute() + columns = list(df.columns.to_pandas()) + print(f"Columns {columns}, pandas columns {train_x.columns}") + assert columns[:-1] == list(train_x.columns) + num_shards = 4 + ds = md.to_ray_dataset(df, num_shards) + assert isinstance(ds, ray_dataset.Dataset) + print(f"Columns {columns}, dataset columns {train_x.columns}") + assert columns == ds.schema().names + import gc + + gc.collect() # Ensure MLDataset does hold mars dataframe to avoid gc. + ray_params = RayParams(num_actors=2, cpus_per_actor=1) + clf = RayXGBClassifier( + ray_params=ray_params, + random_state=42, + use_label_encoder=False, + num_class=2, + ) + # train + clf.fit(RayDMatrix(ds, "target"), y=None, ray_params=ray_params) + clf.predict(RayDMatrix(ds, "target")) + # Enable it when https://github.com/ray-project/xgboost_ray/issues/177 got fixed + # pred = clf.predict(train_x) + # print("predicted values: ", pred) + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.skipif(sklearn is None, reason="sklearn not installed") +@pytest.mark.skipif(xgboost_ray is None, reason="xgboost_ray not installed") +async def test_mars_with_xgboost_sklearn_reg(ray_start_regular_shared, create_cluster): + from sklearn.datasets import make_regression + from xgboost_ray import RayDMatrix, RayParams, RayXGBRegressor + + assert create_cluster.session + session = new_session(address=create_cluster.address, backend="ray") + with session: + np_X, np_y = make_regression(n_samples=1_0000, n_features=10) + columns = [f"c{i}" for i in range(np_X.shape[1])] + X, y = md.DataFrame(np_X, columns=columns), md.DataFrame({"target": np_y}) + df: md.DataFrame = md.concat([md.DataFrame(X), md.DataFrame(y)], axis=1) + df.execute() + + num_shards = 4 + ds = md.to_ray_dataset(df, num_shards) + assert isinstance(ds, ray_dataset.Dataset) + + import gc + + gc.collect() # Ensure Dataset does hold mars dataframe to avoid gc. 
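+        # RayXGBRegressor exposes the scikit-learn estimator interface
+        # (fit/predict); here it consumes the shards of the Ray Dataset
+        # through RayDMatrix rather than an in-memory matrix.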
+ ray_params = RayParams(num_actors=2, cpus_per_actor=1) + reg = RayXGBRegressor(ray_params=ray_params, random_state=42) + # train + reg.fit(RayDMatrix(ds, "target"), y=None, ray_params=ray_params) + reg.predict(RayDMatrix(ds, "target")) + reg.predict(pd.DataFrame(np_X, columns=columns)) diff --git a/python/xorbits/_mars/dataframe/core.py b/python/xorbits/_mars/dataframe/core.py new file mode 100644 index 000000000..92cea24b3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/core.py @@ -0,0 +1,3264 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator +import weakref +from collections.abc import Iterable +from io import StringIO +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd + +from ..core import ( + ENTITY_TYPE, + Chunk, + ChunkData, + HasShapeTileable, + HasShapeTileableData, + OutputType, + Tileable, + _ExecuteAndFetchMixin, + is_build_mode, + register_output_types, +) +from ..core.entity.utils import refresh_tileable_shape +from ..deploy.oscar.session import get_default_session +from ..lib.groupby_wrapper import GroupByWrapper +from ..serialization.serializables import ( + AnyField, + BoolField, + DataTypeField, + DictField, + FieldTypes, + Int8Field, + Int32Field, + IntervalArrayField, + ListField, + NDArrayField, + OneOfField, + ReferenceField, + Serializable, + SeriesField, + SliceField, + StringField, + TupleField, +) +from ..tensor import statistics +from ..utils import ( + calc_nsplits, + ceildiv, + estimate_pandas_size, + on_deserialize_shape, + on_serialize_numpy_type, + on_serialize_shape, + tokenize, +) +from .utils import ReprSeries, fetch_corner_data, merge_index_value, parse_index + + +class IndexValue(Serializable): + """ + Meta class for index, held by IndexData, SeriesData and DataFrameData + """ + + __slots__ = () + + class IndexBase(Serializable): + _key = StringField("key") # to identify if the index is the same + _is_monotonic_increasing = BoolField("is_monotonic_increasing") + _is_monotonic_decreasing = BoolField("is_monotonic_decreasing") + _is_unique = BoolField("is_unique") + _max_val = AnyField("max_val", on_serialize=on_serialize_numpy_type) + _max_val_close = BoolField("max_val_close") + _min_val = AnyField("min_val", on_serialize=on_serialize_numpy_type) + _min_val_close = BoolField("min_val_close") + + @property + def is_monotonic_increasing(self): + return self._is_monotonic_increasing + + @property + def is_monotonic_decreasing(self): + return self._is_monotonic_decreasing + + @property + def is_unique(self): + return self._is_unique + + @property + def min_val(self): + return self._min_val + + @property + def min_val_close(self): + return self._min_val_close + + @property + def max_val(self): + return self._max_val + + @property + def max_val_close(self): + return self._max_val_close + + @property + def key(self): + return self._key + + @property + def inferred_type(self): + return None + + def to_pandas(self): + kw = { + field.tag: 
getattr(self, attr, None) + for attr, field in self._FIELDS.items() + if attr not in super(type(self), self)._FIELDS + } + kw = {k: v for k, v in kw.items() if v is not None} + if kw.get("data") is None: + kw["data"] = [] + + pd_initializer = getattr(self, "_pd_initializer", None) + if pd_initializer is None: + pd_initializer = getattr(pd, type(self).__name__) + return pd_initializer(**kw) + + class Index(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + class RangeIndex(IndexBase): + _name = AnyField("name") + _slice = SliceField("slice") + _dtype = DataTypeField("dtype") + + @property + def slice(self): + return self._slice + + @property + def dtype(self): + return getattr(self, "_dtype", np.dtype(np.intc)) + + def to_pandas(self): + slc = self._slice + return pd.RangeIndex( + slc.start, slc.stop, slc.step, name=getattr(self, "_name", None) + ) + + class CategoricalIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _categories = AnyField("categories") + _ordered = BoolField("ordered") + + @property + def inferred_type(self): + return "categorical" + + class IntervalIndex(IndexBase): + _name = AnyField("name") + _data = IntervalArrayField("data") + _closed = StringField("closed") + + @property + def inferred_type(self): + return "interval" + + class DatetimeIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _freq = AnyField("freq") + _start = AnyField("start") + _periods = AnyField("periods") + _end = AnyField("end") + _closed = AnyField("closed") + _tz = AnyField("tz") + _ambiguous = AnyField("ambiguous") + _dayfirst = BoolField("dayfirst") + _yearfirst = BoolField("yearfirst") + + @property + def inferred_type(self): + return "datetime64" + + @property + def freq(self): + return getattr(self, "_freq", None) + + class TimedeltaIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _unit = AnyField("unit") + _freq = AnyField("freq") + _start = AnyField("start") + _periods = AnyField("periods") + _end = AnyField("end") + _closed = AnyField("closed") + + @property + def inferred_type(self): + return "timedelta64" + + class PeriodIndex(IndexBase): + _name = AnyField("name") + _data = NDArrayField("data") + _freq = AnyField("freq") + _start = AnyField("start") + _periods = AnyField("periods") + _end = AnyField("end") + _year = AnyField("year") + _month = AnyField("month") + _quarter = AnyField("quarter") + _day = AnyField("day") + _hour = AnyField("hour") + _minute = AnyField("minute") + _second = AnyField("second") + _tz = AnyField("tz") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "period" + + class Int64Index(IndexBase): + _pd_initializer = pd.Index + + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "integer" + + class UInt64Index(IndexBase): + _pd_initializer = pd.Index + + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "integer" + + class Float64Index(IndexBase): + _pd_initializer = pd.Index + + _name = AnyField("name") + _data = NDArrayField("data") + _dtype = DataTypeField("dtype") + + @property + def inferred_type(self): + return "floating" + + class MultiIndex(IndexBase): + _names = ListField("names", on_serialize=list) + _dtypes = ListField("dtypes", on_serialize=list) + _data = NDArrayField("data") + _sortorder = 
Int32Field("sortorder") + + @property + def inferred_type(self): + return "mixed" + + @property + def names(self) -> list: + return self._names + + def to_pandas(self): + data = getattr(self, "_data", None) + sortorder = getattr(self, "_sortorder", None) + + def _build_empty_array(dtype): + try: + return np.array([], dtype=dtype) + except TypeError: # pragma: no cover + return pd.array([], dtype=dtype) + + if data is None: + return pd.MultiIndex.from_arrays( + [_build_empty_array(dtype) for dtype in self._dtypes], + sortorder=sortorder, + names=self._names, + ) + return pd.MultiIndex.from_tuples( + [tuple(d) for d in data], sortorder=sortorder, names=self._names + ) + + _index_value = OneOfField( + "index_value", + index=Index, + range_index=RangeIndex, + categorical_index=CategoricalIndex, + interval_index=IntervalIndex, + datetime_index=DatetimeIndex, + timedelta_index=TimedeltaIndex, + period_index=PeriodIndex, + int64_index=Int64Index, + uint64_index=UInt64Index, + float64_index=Float64Index, + multi_index=MultiIndex, + ) + + def __mars_tokenize__(self): + # return object for tokenize + v = self._index_value + return v._key + + @property + def value(self): + return self._index_value + + @property + def key(self): + return self._index_value.key + + @property + def is_monotonic_increasing(self): + return self._index_value.is_monotonic_increasing + + @property + def is_monotonic_decreasing(self): + return self._index_value.is_monotonic_decreasing + + @property + def is_monotonic_increasing_or_decreasing(self): + return self.is_monotonic_increasing or self.is_monotonic_decreasing + + @property + def is_unique(self): + return self._index_value.is_unique + + @property + def min_val(self): + return self._index_value.min_val + + @property + def min_val_close(self): + return self._index_value.min_val_close + + @property + def max_val(self): + return self._index_value.max_val + + @property + def max_val_close(self): + return self._index_value.max_val_close + + @property + def min_max(self): + return ( + self._index_value.min_val, + self._index_value.min_val_close, + self._index_value.max_val, + self._index_value.max_val_close, + ) + + @property + def name(self): + return getattr(self._index_value, "_name", None) + + @property + def inferred_type(self): + return self._index_value.inferred_type + + def has_value(self): + if isinstance(self._index_value, self.RangeIndex): + if np.isnan(self._index_value.max_val): + return False + else: + return True + elif getattr(self._index_value, "_data", None) is not None: + return True + return False + + def to_pandas(self): + return self._index_value.to_pandas() + + +class DtypesValue(Serializable): + """ + Meta class for dtypes. 
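+
+    ``value`` holds a pandas Series mapping column labels to dtypes, and
+    ``key`` is a token (``tokenize(value)`` by default) used to cheaply
+    compare dtype collections.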
+ """ + + __slots__ = () + + _key = StringField("key") + _value = SeriesField("value") + + def __init__(self, key=None, value=None, **kw): + super().__init__(_key=key, _value=value, **kw) + if self._key is None: + self._key = tokenize(self._value) + + @property + def key(self): + return self._key + + @property + def value(self): + return self._value + + +def refresh_index_value(tileable: ENTITY_TYPE): + index_to_index_values = dict() + for chunk in tileable.chunks: + if chunk.ndim == 1: + index_to_index_values[chunk.index] = chunk.index_value + elif chunk.index[1] == 0: + index_to_index_values[chunk.index] = chunk.index_value + index_value = merge_index_value(index_to_index_values, store_data=False) + # keep key as original index_value's + if tileable.index_value is not None: + index_value._index_value._key = tileable.index_value.key + tileable._index_value = index_value + + +def refresh_dtypes(tileable: ENTITY_TYPE): + all_dtypes = [c.dtypes_value.value for c in tileable.chunks if c.index[0] == 0] + dtypes = pd.concat(all_dtypes) + tileable._dtypes = dtypes + columns_values = parse_index(dtypes.index, store_data=True) + tileable._columns_value = columns_values + tileable._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes) + + +_tileable_key_property = "_tileable_key" +_tileable_dtypes_property = "_tileable_dtypes" +_tileable_index_value_property = "_tileable_index_value" +_tileable_columns_value_property = "_tileable_columns_value" +_nsplits_property = "_tileable_nsplits" +_lazy_chunk_meta_properties = ( + _tileable_key_property, + _tileable_dtypes_property, + _tileable_index_value_property, + _tileable_columns_value_property, + _nsplits_property, +) + + +class LazyMetaChunkData(ChunkData): + __slots__ = _lazy_chunk_meta_properties + + def _set_tileable_meta( + self, + tileable_key: str = None, + nsplits: Tuple[Tuple[int, ...]] = None, + index_value: IndexValue = None, + columns_value: IndexValue = None, + dtypes: pd.Series = None, + ): + setattr(self, _tileable_key_property, tileable_key) + setattr(self, _nsplits_property, nsplits) + setattr(self, _tileable_index_value_property, index_value) + setattr(self, _tileable_columns_value_property, columns_value) + setattr(self, _tileable_dtypes_property, dtypes) + + +def is_chunk_meta_lazy(chunk: ChunkData) -> bool: + chunk = chunk.data if hasattr(chunk, "data") else chunk + return isinstance(chunk, LazyMetaChunkData) and hasattr( + chunk, _tileable_key_property + ) + + +@functools.lru_cache(maxsize=128) +def _get_cum_nsplit(nsplit: Tuple[int]) -> List[int]: + return [0] + np.cumsum(nsplit).tolist() + + +def _calc_axis_slice(nsplit: Tuple[int], index: int) -> slice: + if not isinstance(nsplit, tuple): + nsplit = tuple(nsplit) + cum_nsplit = _get_cum_nsplit(nsplit) + return slice(cum_nsplit[index], cum_nsplit[index + 1]) + + +class ChunkDtypesField(SeriesField): + _tileable_key_index_to_dtypes = dict() + + @staticmethod + def _gen_chunk_dtypes(instance: Chunk, index: int) -> Optional[pd.Series]: + # dtypes of tileable + try: + tileable_key = getattr(instance, _tileable_key_property) + except AttributeError: + return + cache = ChunkDtypesField._tileable_key_index_to_dtypes + try: + return cache[tileable_key, index] + except KeyError: + tileable_dtypes = getattr(instance, _tileable_dtypes_property) + # nsplits of tileable + nsplits = getattr(instance, _nsplits_property)[1] + # calc slice + slc = _calc_axis_slice(nsplits, index) + dtypes = tileable_dtypes.iloc[slc] + cache[tileable_key, index] = dtypes + return dtypes + + def 
__get__(self, instance, owner=None): + if not issubclass(owner, LazyMetaChunkData): # pragma: no cover + return super().__get__(instance, owner) + + try: + value = self.get(instance, owner) + if value is not None: + return value + except AttributeError: # pragma: no cover + pass + + if instance.index is None: + return super().__get__(instance, owner) + + # get dtypes lazily + index = instance.index[1] + dtypes = self._gen_chunk_dtypes(instance, index) + # cache dtypes + self.set(instance, dtypes) + return dtypes + + +class ChunkIndexValueField(ReferenceField): + _tileable_key_index_to_index_value = dict() + + @staticmethod + def _gen_chunk_index_value(instance: Chunk, index: int) -> Optional[IndexValue]: + # index_value of tileable + try: + tileable_key = getattr(instance, _tileable_key_property) + except AttributeError: + return + cache = ChunkIndexValueField._tileable_key_index_to_index_value + try: + return cache[tileable_key, index] + except KeyError: + tileable_index_value = getattr(instance, _tileable_index_value_property) + # nsplits of tileable + nsplit = getattr(instance, _nsplits_property)[0] + # calc slice + slc = _calc_axis_slice(nsplit, index) + pd_index = tileable_index_value.to_pandas() + if np.isnan(slc.stop - slc.start): + chunk_pd_index = pd_index[:0] + else: + chunk_pd_index = pd_index[slc] + index_value = parse_index( + chunk_pd_index, + key=f"{tileable_index_value.key}_index_{index}_{slc.start}_{slc.stop}", + ) + cache[tileable_key, index] = index_value + return index_value + + def __get__(self, instance, owner=None): + if not issubclass(owner, LazyMetaChunkData): # pragma: no cover + return super().__get__(instance, owner) + + try: + value = self.get(instance, owner) + if value is not None: + return value + except AttributeError: # pragma: no cover + pass + + if instance.index is None: + return super().__get__(instance, owner) + + # get index_value lazily + index = instance.index[0] + index_value = self._gen_chunk_index_value(instance, index) + # cache index_value + self.set(instance, index_value) + return index_value + + +class ChunkColumnsValueField(ReferenceField): + _tileable_key_index_to_index_value = dict() + + @staticmethod + def _gen_chunk_columns_value(instance: Chunk, index: int) -> Optional[IndexValue]: + # columns_value of tileable + try: + tileable_key = getattr(instance, _tileable_key_property) + except AttributeError: + return + cache = ChunkColumnsValueField._tileable_key_index_to_index_value + try: + return cache[tileable_key, index] + except KeyError: + tileable_columns_value = getattr(instance, _tileable_columns_value_property) + # nsplits of tileable + nsplit = getattr(instance, _nsplits_property)[1] + # calc slice + slc = _calc_axis_slice(nsplit, index) + pd_index = tileable_columns_value.to_pandas() + chunk_pd_index = ( + pd_index[:0] if np.isnan(slc.stop - slc.start) else pd_index[slc] + ) + columns_value = parse_index(chunk_pd_index, store_data=True) + cache[tileable_key, index] = columns_value + return columns_value + + def __get__(self, instance, owner=None): + if not issubclass(owner, LazyMetaChunkData): # pragma: no cover + return super().__get__(instance, owner) + + try: + value = self.get(instance, owner) + if value is not None: + return value + except AttributeError: # pragma: no cover + pass + + if instance.index is None: + return super().__get__(instance, owner) + + # get columns_value lazily + index = instance.index[1] + columns_value = self._gen_chunk_columns_value(instance, index) + # cache columns_value + self.set(instance, 
columns_value) + return columns_value + + +class IndexChunkData(ChunkData): + __slots__ = () + type_name = "Index" + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _index_value = ReferenceField("index_value", IndexValue) + + def __init__( + self, + op=None, + shape=None, + index=None, + dtype=None, + name=None, + index_value=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtype=dtype, + _name=name, + _index_value=index_value, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "index": self.index, + "index_value": self.index_value, + "name": self.name, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: pd.Index) -> Dict[str, Any]: + return { + "shape": data.shape, + "dtype": data.dtype, + "index_value": parse_index(data, store_data=False), + "name": data.name, + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtype(self): + return self._dtype + + @property + def name(self): + return self._name + + @property + def index_value(self): + return self._index_value + + +class IndexChunk(Chunk): + __slots__ = () + _allow_data_type_ = (IndexChunkData,) + type_name = "Index" + + +def _on_deserialize_index_value(index_value): + if index_value is None: + return + try: + getattr(index_value, "value") + return index_value + except AttributeError: + return + + +class _ToPandasMixin(_ExecuteAndFetchMixin): + __slots__ = () + + def to_pandas(self, session=None, **kw): + return self._execute_and_fetch(session=session, **kw) + + +class _BatchedFetcher: + __slots__ = () + + def _iter(self, batch_size=None, session=None, **kw): + from .indexing.iloc import iloc + + if batch_size is not None: + size = self.shape[0] + n_batch = ceildiv(size, batch_size) + + if n_batch > 1: + for i in range(n_batch): + batch_data = iloc(self)[batch_size * i : batch_size * (i + 1)] + yield batch_data._fetch(session=session, **kw) + else: + yield self._fetch(session=session, **kw) + else: + # if batch_size is not specified, use first batch to estimate + # batch_size. 
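+            # Heuristic: fetch the first 1000 rows, estimate their in-memory
+            # size with estimate_pandas_size, then size the remaining batches
+            # so each one holds roughly `default_batch_bytes` (50 MB) of data.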
+ default_batch_bytes = 50 * 1024**2 + first_batch = 1000 + size = self.shape[0] + + if size >= first_batch: + batch_data = iloc(self)[:first_batch] + first_batch_data = batch_data._fetch(session=session, **kw) + yield first_batch_data + data_size = estimate_pandas_size(first_batch_data) + batch_size = int(default_batch_bytes / data_size * first_batch) + n_batch = ceildiv(size - 1000, batch_size) + for i in range(n_batch): + batch_data = iloc(self)[ + first_batch + + batch_size * i : first_batch + + batch_size * (i + 1) + ] + yield batch_data._fetch(session=session, **kw) + else: + yield self._fetch(session=session, **kw) + + def iterbatch(self, batch_size=None, session=None, **kw): + # trigger execution + self.execute(session=session, **kw) + return self._iter(batch_size=batch_size, session=session) + + def fetch(self, session=None, **kw): + from .indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + + batch_size = kw.pop("batch_size", None) + if isinstance(self.op, (DataFrameIlocGetItem, SeriesIlocGetItem)): + # see GH#1871 + # already iloc, do not trigger batch fetch + return self._fetch(session=session, **kw) + else: + batches = list(self._iter(batch_size=batch_size, session=session, **kw)) + return pd.concat(batches) if len(batches) > 1 else batches[0] + + def fetch_infos(self, fields=None, session=None, **kw): + return self._fetch_infos(fields=fields, session=session, **kw) + + +class IndexData(HasShapeTileableData, _ToPandasMixin): + __slots__ = () + type_name = "Index" + + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _names = AnyField("names") + _index_value = ReferenceField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _chunks = ListField( + "chunks", + FieldTypes.reference(IndexChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [IndexChunk(it) for it in x] if x is not None else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtype=None, + name=None, + names=None, + index_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtype=dtype, + _name=name, + _names=names, + _index_value=index_value, + _chunks=chunks, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtype": self.dtype, + "name": self.name, + "index_value": self.index_value, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + refresh_index_value(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + if self._name is None: + self._name = self.chunks[0].name + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return f"Index IndexValue: + 
return self._index_value + + @property + def inferred_type(self): + return self._index_value.inferred_type + + def to_tensor(self, dtype=None, extract_multi_index=False): + from ..tensor.datasource.from_dataframe import from_index + + return from_index(self, dtype=dtype, extract_multi_index=extract_multi_index) + + +class Index(HasShapeTileable, _ToPandasMixin): + __slots__ = "_df_or_series", "_parent_key", "_axis" + _allow_data_type_ = (IndexData,) + type_name = "Index" + + def __new__(cls, data: Union[pd.Index, IndexData] = None, **_): + if data is not None and not isinstance(data, pd.Index): + # create corresponding Index class + # according to type of index_value + clz = globals()[type(data.index_value.value).__name__] + else: + clz = cls + return object.__new__(clz) + + def __len__(self): + return len(self._data) + + def __mars_tensor__(self, dtype=None, order="K"): + return self._data.__mars_tensor__(dtype=dtype, order=order) + + def _get_df_or_series(self): + obj = getattr(self, "_df_or_series", None) + if obj is not None: + return obj() + return None + + def _set_df_or_series(self, df_or_series, axis): + self._df_or_series = weakref.ref(df_or_series) + self._parent_key = df_or_series.key + self._axis = axis + + @property + def T(self): + """Return the transpose, which is by definition self.""" + return self + + @property + def name(self): + return self._data.name + + @name.setter + def name(self, value): + df_or_series = self._get_df_or_series() + if df_or_series is not None and df_or_series.key == self._parent_key: + df_or_series.rename_axis(value, axis=self._axis, inplace=True) + self.data = df_or_series.axes[self._axis].data + else: + self.rename(value, inplace=True) + + @property + def names(self): + return self._data.names + + @names.setter + def names(self, value): + df_or_series = self._get_df_or_series() + if df_or_series is not None: + df_or_series.rename_axis(value, axis=self._axis, inplace=True) + self.data = df_or_series.axes[self._axis].data + else: + self.rename(value, inplace=True) + + @property + def values(self): + return self.to_tensor() + + def to_frame(self, index: bool = True, name=None): + """ + Create a DataFrame with a column containing the Index. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original Index. + + name : object, default None + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame().execute() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False).execute() + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo').execute() + zoo + 0 Ant + 1 Bear + 2 Cow + """ + from . 
import dataframe_from_tensor + + if isinstance(self.index_value.value, IndexValue.MultiIndex): + old_names = self.index_value.value.names + + if ( + name is not None + and not isinstance(name, Iterable) + or isinstance(name, str) + ): + raise TypeError("'name' must be a list / sequence of column names.") + + name = list(name if name is not None else old_names) + if len(name) != len(old_names): + raise ValueError( + "'name' should have same length as number of levels on index." + ) + + columns = [ + old or new or idx for idx, (old, new) in enumerate(zip(old_names, name)) + ] + else: + columns = [name or self.name or 0] + index_ = self if index else None + return dataframe_from_tensor( + self._data._to_mars_tensor(self, extract_multi_index=True), + index=index_, + columns=columns, + ) + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys. + + Useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Dame of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + The dtype will be based on the type of the Index values. + """ + from . import series_from_index + + return series_from_index(self, index=index, name=name) + + +class RangeIndex(Index): + __slots__ = () + + +class CategoricalIndex(Index): + __slots__ = () + + +class IntervalIndex(Index): + __slots__ = () + + +class DatetimeIndex(Index): + __slots__ = () + + +class TimedeltaIndex(Index): + __slots__ = () + + +class PeriodIndex(Index): + __slots__ = () + + +class Int64Index(Index): + __slots__ = () + + +class UInt64Index(Index): + __slots__ = () + + +class Float64Index(Index): + __slots__ = () + + +class MultiIndex(Index): + __slots__ = () + + +class BaseSeriesChunkData(LazyMetaChunkData): + __slots__ = () + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _index_value = ChunkIndexValueField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + + def __init__( + self, + op=None, + shape=None, + index=None, + dtype=None, + name=None, + index_value=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtype=dtype, + _name=name, + _index_value=index_value, + **kw, + ) + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "index": self.index, + "index_value": self.index_value, + "name": self.name, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: pd.Series) -> Dict[str, Any]: + return { + "shape": 
data.shape, + "dtype": data.dtype, + "index_value": parse_index(data.index, store_data=False), + "name": data.name, + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtype(self): + return self._dtype + + @property + def name(self): + return self._name + + @property + def index_value(self): + return self._index_value + + +class SeriesChunkData(BaseSeriesChunkData): + type_name = "Series" + + +class SeriesChunk(Chunk): + __slots__ = () + _allow_data_type_ = (SeriesChunkData,) + type_name = "Series" + + +class BaseSeriesData(HasShapeTileableData, _ToPandasMixin): + __slots__ = "_cache", "_accessors" + + # optional field + _dtype = DataTypeField("dtype") + _name = AnyField("name") + _index_value = ReferenceField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _chunks = ListField( + "chunks", + FieldTypes.reference(SeriesChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [SeriesChunk(it) for it in x] if x is not None else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtype=None, + name=None, + index_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtype=dtype, + _name=name, + _index_value=index_value, + _chunks=chunks, + **kw, + ) + self._accessors = dict() + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtype": self.dtype, + "name": self.name, + "index_value": self.index_value, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + name = params.pop("name", None) + if name is not None: + self._name = name + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + refresh_index_value(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + if self._name is None: + self._name = self.chunks[0].name + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return ( + f"{self.type_name} " + ) + else: + return f"{self.type_name}(op={type(self._op).__name__})" + else: + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + + buf = StringIO() + max_rows = pd.get_option("display.max_rows") + corner_max_rows = ( + max_rows if self.shape[0] <= max_rows else corner_data.shape[0] - 1 + ) # make sure max_rows < corner_data + + with pd.option_context("display.max_rows", corner_max_rows): + if self.shape[0] <= max_rows: + corner_series = corner_data + else: + corner_series = ReprSeries(corner_data, self.shape) + buf.write(repr(corner_series) if representation else str(corner_series)) + + return buf.getvalue() + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=False) + + @property + def 
dtype(self): + return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None) + + @property + def name(self): + return self._name + + @property + def index_value(self): + return self._index_value + + @property + def index(self): + from .datasource.index import from_tileable + + return from_tileable(self) + + @property + def axes(self): + return [self.index] + + @property + def empty(self): + shape = getattr(self, "_shape") + if np.any(np.isnan(shape)): + raise ValueError("Tileable object must be executed first") + return shape == (0,) + + def to_tensor(self, dtype=None): + from ..tensor.datasource.from_dataframe import from_series + + return from_series(self, dtype=dtype) + + @staticmethod + def from_tensor(in_tensor, index=None, name=None): + from .datasource.from_tensor import series_from_tensor + + return series_from_tensor(in_tensor, index=index, name=name) + + +class SeriesData(_BatchedFetcher, BaseSeriesData): + type_name = "Series" + + def __mars_tensor__(self, dtype=None, order="K"): + tensor = self.to_tensor() + dtype = dtype if dtype is not None else tensor.dtype + return tensor.astype(dtype=dtype, order=order, copy=False) + + def iteritems(self, batch_size=10000, session=None): + for batch_data in self.iterbatch(batch_size=batch_size, session=session): + yield from getattr(batch_data, "iteritems")() + + items = iteritems + + def to_dict(self, into=dict, batch_size=10000, session=None): + fetch_kwargs = dict(batch_size=batch_size) + return self.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict( + into=into + ) + + +class Series(HasShapeTileable, _ToPandasMixin): + __slots__ = ("_cache",) + _allow_data_type_ = (SeriesData,) + type_name = "Series" + + def to_tensor(self, dtype=None): + return self._data.to_tensor(dtype=dtype) + + def from_tensor(self, in_tensor, index=None, name=None): + return self._data.from_tensor(in_tensor, index=index, name=name) + + @property + def T(self): + """Return the transpose, which is by definition self.""" + return self + + @property + def ndim(self): + """ + Return an int representing the number of axes / array dimensions. + + Return 1 if Series. Otherwise return 2 if DataFrame. + + See Also + -------- + ndarray.ndim : Number of array dimensions. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.ndim.execute() + 1 + + >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.ndim.execute() + 2 + """ + return super().ndim + + @property + def index(self): + """ + The index (axis labels) of the Series. + """ + idx = self._data.index + idx._set_df_or_series(self, 0) + return idx + + @index.setter + def index(self, new_index): + self.set_axis(new_index, axis=0, inplace=True) + + @property + def name(self): + return self._data.name + + @name.setter + def name(self, val): + from .indexing.rename import DataFrameRename + + op = DataFrameRename(new_name=val, output_types=[OutputType.series]) + new_series = op(self) + self.data = new_series.data + + @property + def dtype(self): + """ + Return the dtype object of the underlying data. + """ + return self._data.dtype + + def copy(self, deep=True): # pylint: disable=arguments-differ + """ + Make a copy of this object's indices and data. + + When ``deep=True`` (default), a new object will be created with a + copy of the calling object's data and indices. Modifications to + the data or indices of the copy will not be reflected in the + original object (see notes below). 
+ + When ``deep=False``, a new object will be created without copying + the calling object's data or index (only references to the data + and index are copied). Any changes to the data of the original + will be reflected in the shallow copy (and vice versa). + + Parameters + ---------- + deep : bool, default True + Make a deep copy, including a copy of the data and the indices. + With ``deep=False`` neither the indices nor the data are copied. + + Returns + ------- + copy : Series or DataFrame + Object type matches caller. + """ + if deep: + return super().copy() + else: + return super()._view() + + def __len__(self): + return len(self._data) + + def __mars_tensor__(self, dtype=None, order="K"): + return self._data.__mars_tensor__(dtype=dtype, order=order) + + def keys(self): + """ + Return alias for index. + + Returns + ------- + Index + Index of the Series. + """ + return self.index + + @property + def values(self): + return self.to_tensor() + + def iteritems(self, batch_size=10000, session=None): + """ + Lazily iterate over (index, value) tuples. + + This method returns an iterable tuple (index, value). This is + convenient if you want to create a lazy iterator. + + Returns + ------- + iterable + Iterable of tuples containing the (index, value) pairs from a + Series. + + See Also + -------- + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(['A', 'B', 'C']) + >>> for index, value in s.items(): + ... print(f"Index : {index}, Value : {value}") + Index : 0, Value : A + Index : 1, Value : B + Index : 2, Value : C + """ + return self._data.iteritems(batch_size=batch_size, session=session) + + items = iteritems + + def to_dict(self, into=dict, batch_size=10000, session=None): + """ + Convert Series to {label -> value} dict or dict-like object. + + Parameters + ---------- + into : class, default dict + The collections.abc.Mapping subclass to use as the return + object. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + Returns + ------- + collections.abc.Mapping + Key-value representation of Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4]) + >>> s.to_dict() + {0: 1, 1: 2, 2: 3, 3: 4} + >>> from collections import OrderedDict, defaultdict + >>> s.to_dict(OrderedDict) + OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) + >>> dd = defaultdict(list) + >>> s.to_dict(dd) + defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) + """ + return self._data.to_dict(into=into, batch_size=batch_size, session=session) + + def to_frame(self, name=None): + """ + Convert Series to DataFrame. + + Parameters + ---------- + name : object, default None + The passed name should substitute for the series name (if it has + one). + + Returns + ------- + DataFrame + DataFrame representation of Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series(["a", "b", "c"], name="vals") + >>> s.to_frame().execute() + vals + 0 a + 1 b + 2 c + """ + from . import dataframe_from_tensor + + name = name or self.name or 0 + return dataframe_from_tensor(self, columns=[name]) + + def between(self, left, right, inclusive="both"): + """ + Return boolean Series equivalent to left <= series <= right. 
+ This function returns a boolean vector containing `True` wherever the + corresponding Series element is between the boundary values `left` and + `right`. NA values are treated as `False`. + + Parameters + ---------- + left : scalar or list-like + Left boundary. + right : scalar or list-like + Right boundary. + inclusive : {"both", "neither", "left", "right"} + Include boundaries. Whether to set each bound as closed or open. + + Returns + ------- + Series + Series representing whether each element is between left and + right (inclusive). + + See Also + -------- + Series.gt : Greater than of series and other. + Series.lt : Less than of series and other. + + Notes + ----- + This function is equivalent to ``(left <= ser) & (ser <= right)`` + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([2, 0, 4, 8, np.nan]) + + Boundary values are included by default: + + >>> s.between(1, 4).execute() + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + With `inclusive` set to ``"neither"`` boundary values are excluded: + + >>> s.between(1, 4, inclusive="neither").execute() + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + `left` and `right` can be any scalar value: + + >>> s = md.Series(['Alice', 'Bob', 'Carol', 'Eve']) + >>> s.between('Anna', 'Daniel').execute() + 0 False + 1 True + 2 True + 3 False + dtype: bool + """ + if isinstance(inclusive, bool): # pragma: no cover + # for pandas < 1.3.0 + if inclusive: + inclusive = "both" + else: + inclusive = "neither" + if inclusive == "both": + lmask = self >= left + rmask = self <= right + elif inclusive == "left": + lmask = self >= left + rmask = self < right + elif inclusive == "right": + lmask = self > left + rmask = self <= right + elif inclusive == "neither": + lmask = self > left + rmask = self < right + else: + raise ValueError( + "Inclusive has to be either string of 'both'," + "'left', 'right', or 'neither'." + ) + + return lmask & rmask + + def median( + self, axis=None, skipna=True, out=None, overwrite_input=False, keepdims=False + ): + """ + Return the median of the values over the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis or axes along which the medians are computed. The default + is to compute the median along a flattened version of the tensor. + A sequence of axes is supported since version 1.9.0. + skipna : bool, optional, default True + Exclude NA/null values when computing the result. + out : Tensor, default None + Output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, default False + Just for compatibility with Numpy, would not take effect. + keepdims : bool, default False + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `arr`. + + Returns + ------- + median : scalar + Return the median of the values over the requested axis. + + See Also + -------- + tensor.mean, tensor.percentile + + Notes + ----- + Given a vector ``V`` of length ``N``, the median of ``V`` is the + middle value of a sorted copy of ``V``, ``V_sorted`` - i + e., ``V_sorted[(N-1)/2]``, when ``N`` is odd, and the average of the + two middle values of ``V_sorted`` when ``N`` is even. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> a = md.Series([10, 7, 4, 3, 2, 1]) + >>> a.median().execute() + 2.0 + >>> mt.median(a).execute() + 3.5 + >>> a = md.Series([10, 7, 4, None, 2, 1]) + >>> a.median().execute() + 4.0 + >>> a.median(skipna=False).execute() + nan + """ + if skipna: + return statistics.median( + self.dropna(), + axis=None, + out=None, + overwrite_input=False, + keepdims=False, + ) + else: + return statistics.median( + self, axis=None, out=None, overwrite_input=False, keepdims=False + ) + + +class BaseDataFrameChunkData(LazyMetaChunkData): + __slots__ = ("_dtypes_value",) + _no_copy_attrs_ = ChunkData._no_copy_attrs_ | {"_dtypes", "_columns_value"} + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional fields + _dtypes = ChunkDtypesField("dtypes") + _index_value = ChunkIndexValueField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _columns_value = ChunkColumnsValueField("columns_value", IndexValue) + + def __init__( + self, + op=None, + shape=None, + index=None, + dtypes=None, + index_value=None, + columns_value=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtypes=dtypes, + _index_value=index_value, + _columns_value=columns_value, + **kw, + ) + self._dtypes_value = None + + def __on_deserialize__(self): + super(BaseDataFrameChunkData, self).__on_deserialize__() + self._dtypes_value = None + + def __len__(self): + return self.shape[0] + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtypes": self.dtypes, + "dtypes_value": self.dtypes_value, + "index": self.index, + "index_value": self.index_value, + "columns_value": self.columns_value, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + dtypes = params.pop("dtypes", None) + if dtypes is not None: + self._dtypes = dtypes + columns_value = params.pop("columns_value", None) + if columns_value is not None: + self._columns_value = columns_value + dtypes_value = params.pop("dtypes_value", None) + if dtypes_value is not None: + if dtypes is None: + self._dtypes = dtypes_value.value + if columns_value is None: + self._columns_value = parse_index(self._dtypes.index, store_data=True) + self._dtypes_value = dtypes_value + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: pd.DataFrame) -> Dict[str, Any]: + parse_index(data.index, store_data=False) + return { + "shape": data.shape, + "index_value": parse_index(data.index, store_data=False), + "dtypes_value": DtypesValue(key=tokenize(data.dtypes), value=data.dtypes), + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtypes(self): + dt = getattr(self, "_dtypes", None) + if dt is not None: + return dt + return getattr(self.op, "dtypes", None) + + @property + def dtypes_value(self): + if self._dtypes_value is not None: + return self._dtypes_value + # 
TODO(qinxuye): when creating Dataframe, + # dtypes_value instead of dtypes later must be passed into + dtypes = self.dtypes + if dtypes is not None: + self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes) + return self._dtypes_value + + @property + def index_value(self): + return self._index_value + + @property + def columns_value(self): + return self._columns_value + + +class DataFrameChunkData(BaseDataFrameChunkData): + type_name = "DataFrame" + + +class DataFrameChunk(Chunk): + __slots__ = () + _allow_data_type_ = (DataFrameChunkData,) + type_name = "DataFrame" + + def __len__(self): + return len(self._data) + + +class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin): + __slots__ = "_accessors", "_dtypes_value", "_dtypes_dict" + + # optional fields + _dtypes = SeriesField("dtypes") + _index_value = ReferenceField( + "index_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _columns_value = ReferenceField("columns_value", IndexValue) + _chunks = ListField( + "chunks", + FieldTypes.reference(DataFrameChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [DataFrameChunk(it) for it in x] + if x is not None + else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtypes=None, + index_value=None, + columns_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtypes=dtypes, + _index_value=index_value, + _columns_value=columns_value, + _chunks=chunks, + **kw, + ) + self._accessors = dict() + self._dtypes_value = None + self._dtypes_dict = None + + def __on_deserialize__(self): + super().__on_deserialize__() + self._accessors = dict() + self._dtypes_value = None + self._dtypes_dict = None + + def _get_params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtypes": self.dtypes, + "index_value": self.index_value, + "columns_value": self.columns_value, + "dtypes_value": self.dtypes_value, + } + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + index_value = params.pop("index_value", None) + if index_value is not None: + self._index_value = index_value + dtypes = params.pop("dtypes", None) + if dtypes is not None: + self._dtypes = dtypes + columns_value = params.pop("columns_value", None) + if columns_value is not None: + self._columns_value = columns_value + dtypes_value = params.pop("dtypes_value", None) + if dtypes_value is not None: + if dtypes is None: + self._dtypes = dtypes_value.value + if columns_value is None: + self._columns_value = parse_index(self._dtypes.index, store_data=True) + self._dtypes_value = dtypes_value + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + params = property(_get_params, _set_params) + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + refresh_index_value(self) + refresh_dtypes(self) + + @property + def dtypes(self): + dt = getattr(self, "_dtypes", None) + if dt is not None: + return dt + return getattr(self.op, "dtypes", None) + + @property + def dtypes_value(self): + if self._dtypes_value is not None: + return self._dtypes_value + # TODO(qinxuye): when creating Dataframe, + # dtypes_value instead of dtypes later must be passed into + dtypes = self.dtypes + if dtypes is not None: + 
self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes) + return self._dtypes_value + + @property + def index_value(self): + return self._index_value + + @property + def columns_value(self): + return self._columns_value + + @property + def empty(self): + shape = getattr(self, "_shape") + if np.any(np.isnan(shape)): + raise ValueError("Tileable object must be executed first") + return 0 in shape + + def to_tensor(self, dtype=None): + from ..tensor.datasource.from_dataframe import from_dataframe + + return from_dataframe(self, dtype=dtype) + + @staticmethod + def from_tensor(in_tensor, index=None, columns=None): + from .datasource.from_tensor import dataframe_from_tensor + + return dataframe_from_tensor(in_tensor, index=index, columns=columns) + + @staticmethod + def from_records(records, **kw): + from .datasource.from_records import from_records + + return from_records(records, **kw) + + @property + def index(self): + from .datasource.index import from_tileable + + return from_tileable(self) + + @property + def columns(self): + from .datasource.index import from_pandas as from_pandas_index + + return from_pandas_index(self.dtypes.index, store_data=True) + + @property + def axes(self): + return [self.index, self.columns] + + def _get_dtypes_dict(self): + if self._dtypes_dict is None: + self._dtypes_dict = d = dict() + for k, v in self.dtypes.items(): + try: + obj_list = d[k] + except KeyError: + obj_list = d[k] = [] + obj_list.append(v) + return self._dtypes_dict + + def _get_dtypes_by_columns(self, columns: list): + dtypes_dict = self._get_dtypes_dict() + return functools.reduce(operator.add, (dtypes_dict[c] for c in columns), []) + + def _get_columns_by_columns(self, columns: list): + dtypes_dict = self._get_dtypes_dict() + return functools.reduce( + operator.add, ([c] * len(dtypes_dict[c]) for c in columns), [] + ) + + +class DataFrameData(_BatchedFetcher, BaseDataFrameData): + type_name = "DataFrame" + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return ( + f"{self.type_name} " + ) + else: + return f"{self.type_name}(op={type(self._op).__name__})" + else: + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + + buf = StringIO() + max_rows = pd.get_option("display.max_rows") + + if self.shape[0] <= max_rows: + buf.write(repr(corner_data) if representation else str(corner_data)) + else: + # remember we cannot directly call repr(df), + # because the [... rows x ... 
columns] may show wrong rows + with pd.option_context( + "display.show_dimensions", + False, + "display.max_rows", + corner_data.shape[0] - 1, + ): + if representation: + s = repr(corner_data) + else: + s = str(corner_data) + buf.write(s) + if pd.get_option("display.show_dimensions"): + n_rows, n_cols = self.shape + buf.write(f"\n\n[{n_rows} rows x {n_cols} columns]") + + return buf.getvalue() + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=True) + + def __mars_tensor__(self, dtype=None, order="K"): + return self.to_tensor().astype(dtype=dtype, order=order, copy=False) + + def _repr_html_(self): + if len(self._executed_sessions) == 0: + # not executed before, fall back to normal repr + raise NotImplementedError + + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + + buf = StringIO() + max_rows = pd.get_option("display.max_rows") + if self.shape[0] <= max_rows: + buf.write(corner_data._repr_html_()) + else: + with pd.option_context( + "display.show_dimensions", + False, + "display.max_rows", + corner_data.shape[0] - 1, + ): + buf.write(corner_data._repr_html_().rstrip().rstrip("")) + if pd.get_option("display.show_dimensions"): + n_rows, n_cols = self.shape + buf.write(f"

<p>{n_rows} rows × {n_cols} columns</p>

\n") + buf.write("") + + return buf.getvalue() + + def items(self): + for col_name in self.dtypes.index: + yield col_name, self[col_name] + + iteritems = items + + def iterrows(self, batch_size=1000, session=None): + for batch_data in self.iterbatch(batch_size=batch_size, session=session): + yield from getattr(batch_data, "iterrows")() + + def itertuples(self, index=True, name="Pandas", batch_size=1000, session=None): + for batch_data in self.iterbatch(batch_size=batch_size, session=session): + yield from getattr(batch_data, "itertuples")(index=index, name=name) + + def _need_execution(self): + if self._dtypes is None: + return True + return False + + +class DataFrame(HasShapeTileable, _ToPandasMixin): + __slots__ = ("_cache",) + _allow_data_type_ = (DataFrameData,) + type_name = "DataFrame" + + def __len__(self): + return len(self._data) + + def to_tensor(self): + return self._data.to_tensor() + + def from_tensor(self, in_tensor, index=None, columns=None): + return self._data.from_tensor(in_tensor, index=index, columns=columns) + + def from_records(self, records, **kw): + return self._data.from_records(records, **kw) + + def __mars_tensor__(self, dtype=None, order="K"): + return self._data.__mars_tensor__(dtype=dtype, order=order) + + def __getattr__(self, key): + try: + return getattr(self._data, key) + except AttributeError: + if key in self.dtypes: + return self[key] + else: + raise + + def __dir__(self): + result = list(super().__dir__()) + return sorted( + result + + [k for k in self.dtypes.index if isinstance(k, str) and k.isidentifier()] + ) + + @property + def T(self): + return self.transpose() + + @property + def ndim(self): + """ + Return an int representing the number of axes / array dimensions. + + Return 1 if Series. Otherwise return 2 if DataFrame. + + See Also + -------- + ndarray.ndim : Number of array dimensions. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.ndim.execute() + 1 + + >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.ndim.execute() + 2 + """ + return super().ndim + + @property + def index(self): + idx = self._data.index + idx._set_df_or_series(self, 0) + return idx + + @index.setter + def index(self, new_index): + self.set_axis(new_index, axis=0, inplace=True) + + @property + def columns(self): + col = self._data.columns + col._set_df_or_series(self, 1) + return col + + @columns.setter + def columns(self, new_columns): + self.set_axis(new_columns, axis=1, inplace=True) + + def keys(self): + """ + Get the 'info axis' (see Indexing for more). + + This is index for Series, columns for DataFrame. + + Returns + ------- + Index + Info axis. + """ + return self.columns + + @property + def values(self): + return self.to_tensor() + + @property + def dtypes(self): + """ + Return the dtypes in the DataFrame. + + This returns a Series with the data type of each column. + The result's index is the original DataFrame's columns. Columns + with mixed types are stored with the ``object`` dtype. See + :ref:`the User Guide ` for more. + + Returns + ------- + pandas.Series + The data type of each column. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'float': [1.0], + ... 'int': [1], + ... 'datetime': [md.Timestamp('20180310')], + ... 
'string': ['foo']}) + >>> df.dtypes + float float64 + int int64 + datetime datetime64[ns] + string object + dtype: object + """ + return self._data.dtypes + + def iterrows(self, batch_size=1000, session=None): + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + + it : generator + A generator that iterates over the rows of the frame. + + See Also + -------- + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). For example, + + >>> import mars.dataframe as md + >>> df = md.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row['int'].dtype) + float64 + >>> print(df['int'].dtype) + int64 + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns namedtuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. + """ + return self._data.iterrows(batch_size=batch_size, session=session) + + def itertuples(self, index=True, name="Pandas", batch_size=1000, session=None): + """ + Iterate over DataFrame rows as namedtuples. + + Parameters + ---------- + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Pandas" + The name of the returned namedtuples or None to return regular + tuples. + + Returns + ------- + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) + pairs. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + On python versions < 3.7 regular tuples are returned for DataFrames + with a large number of columns (>254). + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, + ... index=['dog', 'hawk']) + >>> df.execute() + num_legs num_wings + dog 4 0 + hawk 2 2 + >>> for row in df.itertuples(): + ... print(row) + ... + Pandas(Index='dog', num_legs=4, num_wings=0) + Pandas(Index='hawk', num_legs=2, num_wings=2) + + By setting the `index` parameter to False we can remove the index + as the first element of the tuple: + + >>> for row in df.itertuples(index=False): + ... print(row) + ... + Pandas(num_legs=4, num_wings=0) + Pandas(num_legs=2, num_wings=2) + + With the `name` parameter set we set a custom name for the yielded + namedtuples: + + >>> for row in df.itertuples(name='Animal'): + ... print(row) + ... 
+ Animal(Index='dog', num_legs=4, num_wings=0) + Animal(Index='hawk', num_legs=2, num_wings=2) + """ + return self._data.itertuples( + batch_size=batch_size, session=session, index=index, name=name + ) + + def assign(self, **kwargs): + """ + Assign new columns to a DataFrame. + Returns a new object with all original columns in addition to new ones. + Existing columns that are re-assigned will be overwritten. + + Parameters + ---------- + **kwargs : dict of {str: callable or Series} + The column names are keywords. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. The callable must not + change input DataFrame (though pandas doesn't check it). + If the values are not callable, (e.g. a Series, scalar, or array), + they are simply assigned. + + Returns + ------- + DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. + + Notes + ----- + Assigning multiple columns within the same ``assign`` is possible. + Later items in 'kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'temp_c': [17.0, 25.0]}, + ... index=['Portland', 'Berkeley']) + >>> df.execute() + temp_c + Portland 17.0 + Berkeley 25.0 + + Where the value is a callable, evaluated on `df`: + + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32).execute() + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + Alternatively, the same behavior can be achieved by directly + referencing an existing Series or sequence: + + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32).execute() + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: + + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, + ... 
temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9).execute() + temp_c temp_f temp_k + Portland 17.0 62.6 290.15 + Berkeley 25.0 77.0 298.15 + """ + + def apply_if_callable(maybe_callable, obj, **kwargs): + if callable(maybe_callable): + return maybe_callable(obj, **kwargs) + + return maybe_callable + + data = self.copy() + + for k, v in kwargs.items(): + data[k] = apply_if_callable(v, data) + return data + + +class DataFrameGroupByChunkData(BaseDataFrameChunkData): + type_name = "DataFrameGroupBy" + + _key_dtypes = SeriesField("key_dtypes") + _selection = AnyField("selection") + + @property + def key_dtypes(self): + return self._key_dtypes + + @property + def selection(self): + return self._selection + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p.update(dict(key_dtypes=self.key_dtypes, selection=self.selection)) + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + selection = params.pop("selection", None) + if selection is not None: + self._selection = selection + super()._set_params(params) + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: GroupByWrapper) -> Dict[str, Any]: + params = super().get_params_from_data(data.obj) + if data.selection: + dtypes = params["dtypes_value"].value[data.selection] + params["dtypes_value"] = DtypesValue(value=dtypes) + params["shape"] = data.shape + return params + + def __init__(self, key_dtypes=None, selection=None, **kw): + super().__init__(_key_dtypes=key_dtypes, _selection=selection, **kw) + + +class DataFrameGroupByChunk(Chunk): + __slots__ = () + _allow_data_type_ = (DataFrameGroupByChunkData,) + type_name = "DataFrameGroupBy" + + def __len__(self): + return len(self._data) + + +class SeriesGroupByChunkData(BaseSeriesChunkData): + type_name = "SeriesGroupBy" + + _key_dtypes = AnyField("key_dtypes") + + @property + def key_dtypes(self): + return self._key_dtypes + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p["key_dtypes"] = self.key_dtypes + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + super()._set_params(new_params) + + params = property(_get_params, _set_params) + + @classmethod + def get_params_from_data(cls, data: GroupByWrapper): + series_name = data.selection or data.obj.name + if hasattr(data.obj, "dtype"): + dtype = data.obj.dtype + else: + dtype = data.obj.dtypes[series_name] + + return { + "shape": (data.obj.shape[0],), + "dtype": dtype, + "index_value": parse_index(data.obj.index, store_data=False), + "name": series_name, + } + + def __init__(self, key_dtypes=None, **kw): + super().__init__(_key_dtypes=key_dtypes, **kw) + + +class SeriesGroupByChunk(Chunk): + __slots__ = () + _allow_data_type_ = (SeriesGroupByChunkData,) + type_name = "SeriesGroupBy" + + def __len__(self): + return len(self._data) + + +class DataFrameGroupByData(BaseDataFrameData): + type_name = "DataFrameGroupBy" + + _key_dtypes = SeriesField("key_dtypes") + _selection = AnyField("selection") + _chunks = ListField( + "chunks", + FieldTypes.reference(DataFrameGroupByChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [DataFrameGroupByChunk(it) for it in x] + if x is not None + else x, + ) + + @property 
+ def key_dtypes(self): + return self._key_dtypes + + @property + def selection(self): + return self._selection + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p.update(dict(key_dtypes=self.key_dtypes, selection=self.selection)) + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + selection = params.pop("selection", None) + if selection is not None: + self._selection = selection + super()._set_params(params) + + params = property(_get_params, _set_params) + + def __init__(self, key_dtypes=None, selection=None, **kw): + super().__init__(_key_dtypes=key_dtypes, _selection=selection, **kw) + + def _equal(self, o): + # FIXME We need to implemented a true `==` operator for DataFrameGroupby + if is_build_mode(): + return self is o + else: + return self == o + + +class SeriesGroupByData(BaseSeriesData): + type_name = "SeriesGroupBy" + + _key_dtypes = AnyField("key_dtypes") + _chunks = ListField( + "chunks", + FieldTypes.reference(SeriesGroupByChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [SeriesGroupByChunk(it) for it in x] + if x is not None + else x, + ) + + @property + def key_dtypes(self): + return self._key_dtypes + + def _get_params(self) -> Dict[str, Any]: + p = super()._get_params() + p["key_dtypes"] = self.key_dtypes + return p + + def _set_params(self, new_params: Dict[str, Any]): + params = new_params.copy() + key_dtypes = params.pop("key_dtypes", None) + if key_dtypes is not None: + self._key_dtypes = key_dtypes + super()._set_params(params) + + params = property(_get_params, _set_params) + + def __init__(self, key_dtypes=None, **kw): + super().__init__(_key_dtypes=key_dtypes, **kw) + + def _equal(self, o): + # FIXME We need to implemented a true `==` operator for DataFrameGroupby + if is_build_mode(): + return self is o + else: + return self == o + + +class GroupBy(Tileable, _ToPandasMixin): + __slots__ = () + + +class DataFrameGroupBy(GroupBy): + __slots__ = () + _allow_data_type_ = (DataFrameGroupByData,) + type_name = "DataFrameGroupBy" + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. + return super().__hash__() + + def __getattr__(self, item): + try: + return super().__getattr__(item) + except AttributeError: + if item in self.dtypes: + return self[item] + else: + raise + + def __dir__(self): + result = list(super().__dir__()) + return sorted( + result + + [k for k in self.dtypes.index if isinstance(k, str) and k.isidentifier()] + ) + + +class SeriesGroupBy(GroupBy): + __slots__ = () + _allow_data_type_ = (SeriesGroupByData,) + type_name = "SeriesGroupBy" + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. 
+ return super().__hash__() + + +class CategoricalChunkData(ChunkData): + __slots__ = () + type_name = "Categorical" + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + # optional field + _dtype = DataTypeField("dtype") + _categories_value = ReferenceField( + "categories_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + + def __init__( + self, op=None, shape=None, index=None, dtype=None, categories_value=None, **kw + ): + super().__init__( + _op=op, + _shape=shape, + _index=index, + _dtype=dtype, + _categories_value=categories_value, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "index": self.index, + "categories_value": self.categories_value, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + categories_value = params.pop("categories_value", None) + if categories_value is not None: + self._categories_value = categories_value + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: pd.Categorical) -> Dict[str, Any]: + return { + "shape": data.shape, + "dtype": data.dtype, + "categories_value": parse_index(data.categories, store_data=True), + } + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def dtype(self): + return self._dtype + + @property + def categories_value(self): + return self._categories_value + + +class CategoricalChunk(Chunk): + __slots__ = () + _allow_data_type_ = (CategoricalChunkData,) + type_name = "Categorical" + + +class CategoricalData(HasShapeTileableData, _ToPandasMixin): + __slots__ = ("_cache",) + type_name = "Categorical" + + # optional field + _dtype = DataTypeField("dtype") + _categories_value = ReferenceField( + "categories_value", IndexValue, on_deserialize=_on_deserialize_index_value + ) + _chunks = ListField( + "chunks", + FieldTypes.reference(CategoricalChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [CategoricalChunk(it) for it in x] + if x is not None + else x, + ) + + def __init__( + self, + op=None, + shape=None, + nsplits=None, + dtype=None, + categories_value=None, + chunks=None, + **kw, + ): + super().__init__( + _op=op, + _shape=shape, + _nsplits=nsplits, + _dtype=dtype, + _categories_value=categories_value, + _chunks=chunks, + **kw, + ) + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new tileable object + return { + "shape": self.shape, + "dtype": self.dtype, + "categories_value": self.categories_value, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + categories_value = params.pop("categories_value", None) + if categories_value is not None: + self._categories_value = categories_value 
+ if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + # refresh params when chunks updated + refresh_tileable_shape(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + if self._categories_value is None: + categories = [] + for chunk in self.chunks: + categories.extend(chunk.categories_value.to_pandas()) + self._categories_value = parse_index( + pd.Categorical(categories).categories, store_data=True + ) + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return f"{self.type_name} " + else: + return f"{self.type_name}(op={type(self.op).__name__})" + else: + data = self.fetch(session=self._executed_sessions[-1]) + return repr(data) if repr(data) else str(data) + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=True) + + def _equal(self, o): + # FIXME We need to implemented a true `==` operator for DataFrameGroupby + if is_build_mode(): + return self is o + else: # pragma: no cover + return self == o + + @property + def dtype(self): + return getattr(self, "_dtype", None) or self.op.dtype + + @property + def categories_value(self): + return self._categories_value + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. + return super().__hash__() + + +class Categorical(HasShapeTileable, _ToPandasMixin): + __slots__ = () + _allow_data_type_ = (CategoricalData,) + type_name = "Categorical" + + def __len__(self): + return len(self._data) + + def __eq__(self, other): + return self._equal(other) + + def __hash__(self): + # NB: we have customized __eq__ explicitly, thus we need define __hash__ explicitly as well. 
+ return super().__hash__() + + +class DataFrameOrSeriesChunkData(ChunkData): + __slots__ = () + type_name = "DataFrameOrSeries" + + _collapse_axis = Int8Field("collapse_axis") + _data_type = StringField("data_type") + _data_params = DictField("data_params") + + def __init__( + self, + op=None, + index=None, + collapse_axis=None, + data_type=None, + data_params=None, + **kw, + ): + self._collapse_axis = collapse_axis + self._index = index + self._data_type = data_type + self._data_params = data_params or dict() + super().__init__(_op=op, **kw) + + def __getattr__(self, item): + if item in self._data_params: + return self._data_params[item] + raise AttributeError(f"'{type(self)}' object has no attribute '{item}'") + + @property + def ndim(self) -> int: + return (self._data_type == "dataframe") + 1 + + @property + def params(self) -> Dict[str, Any]: + return { + "collapse_axis": self._collapse_axis, + "index": self._index, + "data_type": self._data_type, + "data_params": self._data_params, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + self._data_type = new_params.get("data_type") + if self._collapse_axis is not None and self._data_type == "series": + self._index = (self._index[1 - self._collapse_axis],) + if self._collapse_axis is None and self._data_type == "dataframe": + self._index = (self._index[0], 0) + data_params = new_params["data_params"] + if self._data_type == "dataframe": + data_params["dtypes"] = data_params["dtypes_value"].value + data_params["columns_value"] = parse_index( + data_params["dtypes_value"].value.index, store_data=True + ) + self._data_params = {k: v for k, v in data_params.items()} + + @classmethod + def get_params_from_data(cls, data: Any) -> Dict[str, Any]: + if isinstance(data, pd.DataFrame): + return { + "data_type": "dataframe", + "data_params": DataFrameChunkData.get_params_from_data(data), + } + else: + return { + "data_type": "series", + "data_params": SeriesChunkData.get_params_from_data(data), + } + + +class DataFrameOrSeriesChunk(Chunk): + __slots__ = () + _allow_data_type_ = (DataFrameOrSeriesChunkData,) + type_name = "DataFrameOrSeries" + + +class DataFrameOrSeriesData(HasShapeTileableData, _ToPandasMixin): + __slots__ = () + _chunks = ListField( + "chunks", + FieldTypes.reference(DataFrameOrSeriesChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [DataFrameOrSeriesChunk(it) for it in x] + if x is not None + else x, + ) + + _data_type = StringField("data_type") + _data_params = DictField("data_params") + + def __init__( + self, + op=None, + chunks=None, + data_type=None, + data_params=None, + **kw, + ): + self._data_type = data_type + self._data_params = data_params or dict() + super().__init__( + _op=op, + _chunks=chunks, + **kw, + ) + + def __getattr__(self, item): + if item in self._data_params: + return self._data_params[item] + raise AttributeError(f"'{type(self)}' object has no attribute '{item}'") + + @property + def shape(self): + return self._data_params.get("shape", None) + + @property + def nsplits(self): + return self._data_params.get("nsplits", None) + + @property + def data_type(self): + return self._data_type + + @property + def data_params(self): + return self._data_params + + @property + def params(self) -> Dict[str, Any]: + return {"data_type": self._data_type, "data_params": self._data_params} + + @params.setter + def params(self, new_params: Dict[str, Any]): + # After execution, create DataFrameFetch, and the data + # corresponding to the 
original key is still DataFrameOrSeries type, + # so when restoring DataFrameOrSeries type, + # there is no "data_type" field in params. + if "data_type" not in new_params: + if "dtype" in new_params: + self._data_type = "series" + else: + self._data_type = "dataframe" + self._data_params = new_params.copy() + else: + self._data_type = new_params.get("data_type") + self._data_params = { + k: v for k, v in new_params.get("data_params", {}).items() + } + + def refresh_params(self): + index_to_index_values = dict() + for chunk in self.chunks: + if chunk.ndim == 1: + index_to_index_values[chunk.index] = chunk.index_value + elif chunk.index[1] == 0: + index_to_index_values[chunk.index] = chunk.index_value + index_value = merge_index_value(index_to_index_values, store_data=False) + nsplits = calc_nsplits({c.index: c.shape for c in self.chunks}) + shape = tuple(sum(ns) for ns in nsplits) + + data_params = dict() + data_params["nsplits"] = nsplits + data_params["shape"] = shape + data_params["index_value"] = index_value + + self._data_type = self._chunks[0]._data_type + if self.data_type == "dataframe": + all_dtypes = [c.dtypes_value.value for c in self.chunks if c.index[0] == 0] + dtypes = pd.concat(all_dtypes) + data_params["dtypes"] = dtypes + columns_values = parse_index(dtypes.index, store_data=True) + data_params["columns_value"] = columns_values + data_params["dtypes_value"] = DtypesValue( + key=tokenize(dtypes), value=dtypes + ) + else: + data_params["dtype"] = self.chunks[0].dtype + data_params["name"] = self.chunks[0].name + self._data_params.update(data_params) + + def ensure_data(self): + from .fetch.core import DataFrameFetch + + self.execute() + default_sess = get_default_session() + self._detach_session(default_sess._session) + + fetch_tileable = default_sess._session._tileable_to_fetch[self] + new = DataFrameFetch( + output_types=[getattr(OutputType, self.data_type)] + ).new_tileable( + [], + _key=self.key, + chunks=fetch_tileable.chunks, + nsplits=fetch_tileable.nsplits, + **self.data_params, + ) + new._attach_session(default_sess._session) + return new + + +class DataFrameOrSeries(HasShapeTileable, _ToPandasMixin): + __slots__ = () + _allow_data_type_ = (DataFrameOrSeriesData,) + type_name = "DataFrameOrSeries" + + +INDEX_TYPE = (Index, IndexData) +INDEX_CHUNK_TYPE = (IndexChunk, IndexChunkData) +SERIES_TYPE = (Series, SeriesData) +SERIES_CHUNK_TYPE = (SeriesChunk, SeriesChunkData) +DATAFRAME_OR_SERIES_TYPE = (DataFrameOrSeries, DataFrameOrSeriesData) +DATAFRAME_OR_SERIES_CHUNK_TYPE = (DataFrameOrSeriesChunk, DataFrameOrSeriesChunkData) +DATAFRAME_TYPE = (DataFrame, DataFrameData) +DATAFRAME_CHUNK_TYPE = (DataFrameChunk, DataFrameChunkData) +DATAFRAME_GROUPBY_TYPE = (DataFrameGroupBy, DataFrameGroupByData) +DATAFRAME_GROUPBY_CHUNK_TYPE = (DataFrameGroupByChunk, DataFrameGroupByChunkData) +SERIES_GROUPBY_TYPE = (SeriesGroupBy, SeriesGroupByData) +SERIES_GROUPBY_CHUNK_TYPE = (SeriesGroupByChunk, SeriesGroupByChunkData) +GROUPBY_TYPE = (GroupBy,) + DATAFRAME_GROUPBY_TYPE + SERIES_GROUPBY_TYPE +GROUPBY_CHUNK_TYPE = DATAFRAME_GROUPBY_CHUNK_TYPE + SERIES_GROUPBY_CHUNK_TYPE +CATEGORICAL_TYPE = (Categorical, CategoricalData) +CATEGORICAL_CHUNK_TYPE = (CategoricalChunk, CategoricalChunkData) +TILEABLE_TYPE = ( + INDEX_TYPE + SERIES_TYPE + DATAFRAME_TYPE + GROUPBY_TYPE + CATEGORICAL_TYPE +) +CHUNK_TYPE = ( + INDEX_CHUNK_TYPE + + SERIES_CHUNK_TYPE + + DATAFRAME_CHUNK_TYPE + + GROUPBY_CHUNK_TYPE + + CATEGORICAL_CHUNK_TYPE +) + +register_output_types(OutputType.dataframe, DATAFRAME_TYPE, 
DATAFRAME_CHUNK_TYPE) +register_output_types(OutputType.series, SERIES_TYPE, SERIES_CHUNK_TYPE) +register_output_types( + OutputType.df_or_series, DATAFRAME_OR_SERIES_TYPE, DATAFRAME_OR_SERIES_CHUNK_TYPE +) +register_output_types(OutputType.index, INDEX_TYPE, INDEX_CHUNK_TYPE) +register_output_types(OutputType.categorical, CATEGORICAL_TYPE, CATEGORICAL_CHUNK_TYPE) +register_output_types( + OutputType.dataframe_groupby, DATAFRAME_GROUPBY_TYPE, DATAFRAME_GROUPBY_CHUNK_TYPE +) +register_output_types( + OutputType.series_groupby, SERIES_GROUPBY_TYPE, SERIES_GROUPBY_CHUNK_TYPE +) diff --git a/python/xorbits/_mars/dataframe/datasource/__init__.py b/python/xorbits/_mars/dataframe/datasource/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/datasource/core.py b/python/xorbits/_mars/dataframe/datasource/core.py new file mode 100644 index 000000000..005efc3e7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/core.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import uuid +from typing import List, Optional, Union + +import numpy as np + +from ...config import options +from ...core import recursive_tile +from ...core.context import Context, get_context +from ...oscar import ActorNotExist +from ...serialization.serializables import Int64Field, StringField +from ...typing import OperandType, TileableType +from ...utils import parse_readable_size +from ..core import IndexValue, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import merge_index_value + + +class HeadOptimizedDataSource(DataFrameOperand, DataFrameOperandMixin): + __slots__ = () + # Data source op that optimized for head, + # First, it will try to trigger first_chunk.head() and raise TilesError, + # When iterative tiling is triggered, + # check if the first_chunk.head() meets requirements. 
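+    # In other words, this is a data source op optimized for `head`. An outer
+    # optimization rule (not shown here) may push a row limit into `nrows`, as in
+    # something like `md.read_csv("data.csv").head(5)` -- `read_csv` is only an
+    # illustration of such a source, not defined in this module. `_tile_head`
+    # below then yields the first chunk so it is executed eagerly; if that chunk
+    # already holds exactly `nrows` rows it becomes the whole result, otherwise
+    # `nrows` is cleared on the chunks and tiling falls back to `iloc[:nrows]`
+    # through `recursive_tile`.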
+ nrows = Int64Field("nrows", default=None) + + @property + def first_chunk(self): + return getattr(self, "_first_chunk", None) + + @classmethod + def _tile(cls, op): # pragma: no cover + raise NotImplementedError + + @classmethod + def _tile_head(cls, op: "HeadOptimizedDataSource"): + tileds = cls._tile(op) + chunks = tileds[0].chunks + + # execute first chunk + yield chunks[:1] + + chunk_shape = chunks[0].shape + if chunk_shape[0] == op.nrows: + # the first chunk has enough data + tileds[0]._nsplits = tuple((s,) for s in chunk_shape) + chunks[0]._shape = chunk_shape + tileds[0]._chunks = chunks[:1] + tileds[0]._shape = chunk_shape + else: + for chunk in tileds[0].chunks: + chunk.op.nrows = None + # otherwise + tiled = yield from recursive_tile(tileds[0].iloc[: op.nrows]) + tileds = [tiled] + return tileds + + @classmethod + def tile(cls, op: "HeadOptimizedDataSource"): + if op.nrows is not None: + return (yield from cls._tile_head(op)) + else: + return cls._tile(op) + + +class ColumnPruneSupportedDataSourceMixin(DataFrameOperandMixin): + __slots__ = () + + def get_columns(self): # pragma: no cover + raise NotImplementedError + + def set_pruned_columns(self, columns, *, keep_order=None): # pragma: no cover + raise NotImplementedError + + +class _IncrementalIndexRecorder: + _done: List[Optional[asyncio.Event]] + _chunk_sizes: List[Optional[int]] + + def __init__(self, n_chunk: int): + self._n_chunk = n_chunk + self._done = [asyncio.Event() for _ in range(n_chunk)] + self._chunk_sizes = [None] * n_chunk + self._waiters = set() + + def _can_destroy(self): + return all(e.is_set() for e in self._done) and not self._waiters + + def add_waiter(self, i: int): + self._waiters.add(i) + + async def wait(self, i: int): + if i == 0: + return 0, self._can_destroy() + self._waiters.add(i) + try: + await asyncio.gather(*(e.wait() for e in self._done[:i])) + finally: + self._waiters.remove(i) + # all chunk finished and no waiters + return sum(self._chunk_sizes[:i]), self._can_destroy() + + async def finish(self, i: int, size: int): + self._chunk_sizes[i] = size + self._done[i].set() + + +class IncrementalIndexDatasource(HeadOptimizedDataSource): + __slots__ = () + + incremental_index_recorder_name = StringField("incremental_index_recorder_name") + + +class IncrementalIndexDataSourceMixin(DataFrameOperandMixin): + __slots__ = () + + @classmethod + def post_tile(cls, op: OperandType, results: List[TileableType]): + if ( + op.incremental_index + and results is not None + and isinstance(results[0].index_value.value, IndexValue.RangeIndex) + ): + result = results[0] + chunks = [] + for chunk in result.chunks: + if not isinstance(chunk.op, cls): + # some chunks are merged, get the inputs + chunks.extend(chunk.inputs) + else: + chunks.append(chunk) + for chunk in chunks: + chunk.op.priority = -chunk.index[0] + n_chunk = len(chunks) + ctx = get_context() + if ctx: + name = str(uuid.uuid4()) + ctx.create_remote_object(name, _IncrementalIndexRecorder, n_chunk) + for chunk in chunks: + chunk.op.incremental_index_recorder_name = name + + @classmethod + def pre_execute(cls, ctx: Union[dict, Context], op: OperandType): + out = op.outputs[0] + if ( + op.incremental_index + and isinstance(out.index_value.value, IndexValue.RangeIndex) + and getattr(op, "incremental_index_recorder_name", None) + ): + index = out.index[0] + recorder_name = op.incremental_index_recorder_name + recorder = ctx.get_remote_object(recorder_name) + recorder.add_waiter(index) + + @classmethod + def post_execute(cls, ctx: Union[dict, Context], op: 
OperandType): + out = op.outputs[0] + result = ctx[out.key] + if ( + op.incremental_index + and isinstance(out.index_value.value, IndexValue.RangeIndex) + and getattr(op, "incremental_index_recorder_name", None) + ): + recorder_name = op.incremental_index_recorder_name + recorder = ctx.get_remote_object(recorder_name) + index = out.index[0] + recorder.finish(index, len(result)) + # wait for previous chunks to finish, then update index + size, can_destroy = recorder.wait(index) + result.index += size + if can_destroy: + try: + ctx.destroy_remote_object(recorder_name) + except ActorNotExist: + pass + + +def merge_small_files( + df: TileableType, + n_sample_file: int = 10, + merged_file_size: Union[int, float, str] = None, +) -> TileableType: + from ..merge import DataFrameConcat + + if len(df.chunks) < n_sample_file: + # if number of chunks is small(less than `n_sample_file`, + # skip this process + return df + + if merged_file_size is not None: + merged_file_size = parse_readable_size(merged_file_size)[0] + else: + # Estimated size is relatively large than the real one, + # so we double the merged size + merged_file_size = options.chunk_store_limit * 2 + # sample files whose size equals `n_sample_file` + sampled_chunks = np.random.choice(df.chunks, n_sample_file) + max_chunk_size = 0 + ctx = dict() + for sampled_chunk in sampled_chunks: + sampled_chunk.op.estimate_size(ctx, sampled_chunk.op) + size = ctx[sampled_chunk.key][0] + max_chunk_size = max(max_chunk_size, size) + to_merge_size = merged_file_size // max_chunk_size + if to_merge_size < 2: + return df + # merge files + n_new_chunks = np.ceil(len(df.chunks) / to_merge_size) + new_chunks = [] + new_nsplit = [] + for i, chunks in enumerate(np.array_split(df.chunks, n_new_chunks)): + chunk_size = sum(c.shape[0] for c in chunks) + kw = dict( + dtypes=chunks[0].dtypes, + index_value=merge_index_value({c.index: c.index_value for c in chunks}), + columns_value=chunks[0].columns_value, + shape=(chunk_size, chunks[0].shape[1]), + index=(i, 0), + ) + new_chunk = DataFrameConcat(output_types=[OutputType.dataframe]).new_chunk( + chunks.tolist(), **kw + ) + new_chunks.append(new_chunk) + new_nsplit.append(chunk_size) + new_op = df.op.copy() + params = df.params.copy() + params["chunks"] = new_chunks + params["nsplits"] = (tuple(new_nsplit), df.nsplits[1]) + return new_op.new_dataframe(df.op.inputs, kws=[params]) diff --git a/python/xorbits/_mars/dataframe/datasource/dataframe.py b/python/xorbits/_mars/dataframe/datasource/dataframe.py new file mode 100644 index 000000000..6a2e6c524 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/dataframe.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import DataFrameField, SeriesField +from ...tensor.utils import get_chunk_slices +from ...utils import estimate_pandas_size +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import decide_dataframe_chunk_sizes, is_cudf, parse_index + + +class DataFrameDataSource(DataFrameOperand, DataFrameOperandMixin): + """ + Represents data from pandas DataFrame + """ + + _op_type_ = OperandDef.DATAFRAME_DATA_SOURCE + + data = DataFrameField("data") + dtypes = SeriesField("dtypes") + + def __init__(self, data=None, dtypes=None, gpu=None, **kw): + if dtypes is None and data is not None: + dtypes = data.dtypes + if gpu is None and is_cudf(data): # pragma: no cover + gpu = True + super().__init__( + data=data, + dtypes=dtypes, + gpu=gpu, + _output_types=[OutputType.dataframe], + **kw + ) + + def __call__(self, shape, chunk_size=None): + return self.new_dataframe( + None, + shape, + dtypes=self.dtypes, + index_value=parse_index(self.data.index), + columns_value=parse_index(self.data.columns, store_data=True), + raw_chunk_size=chunk_size, + ) + + @classmethod + def tile(cls, op: "DataFrameDataSource"): + df = op.outputs[0] + raw_df = op.data + + # estimate column memory usage instead of calling df.memory_usage(deep=True) + memory_usage = pd.Series( + {c: estimate_pandas_size(s) for c, s in raw_df.items()} + ) + chunk_size = df.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_dataframe_chunk_sizes(df.shape, chunk_size, memory_usage) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + index_values = dict() + column_values = dict() + for chunk_shape, chunk_idx in zip( + itertools.product(*chunk_size), itertools.product(*chunk_size_idxes) + ): + chunk_op = op.copy().reset_key() + slc = get_chunk_slices(chunk_size, chunk_idx) + i_slc, j_slc = slc + if j_slc == slice(0, df.shape[1]): + # optimize full slice, it's way more faster + j_slc = slice(None) + chunk_op.data = raw_df.iloc[i_slc, j_slc] + chunk_op.dtypes = chunk_op.data.dtypes + i, j = chunk_idx + if i in index_values: + index_value = index_values[i] + else: + index_value = index_values[i] = parse_index(chunk_op.data.index) + if j in column_values: + column_value = column_values[j] + else: + column_value = column_values[j] = parse_index( + chunk_op.data.columns, store_data=True + ) + out_chunk = chunk_op.new_chunk( + None, + shape=chunk_shape, + index=chunk_idx, + index_value=index_value, + columns_value=column_value, + dtypes=chunk_op.data.dtypes, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + None, + df.shape, + dtypes=op.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=out_chunks, + nsplits=chunk_size, + **df.extra_params + ) + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = op.data + + +def from_pandas(data, chunk_size=None, gpu=None, sparse=False): + op = DataFrameDataSource(data=data, gpu=gpu, sparse=sparse) + return op(data.shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/dataframe/datasource/date_range.py b/python/xorbits/_mars/dataframe/datasource/date_range.py new file mode 100644 index 000000000..7e952ace8 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/date_range.py @@ -0,0 +1,601 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from datetime import date, datetime, time + +import numpy as np +import pandas as pd +from pandas import NaT, Timestamp +from pandas._libs.tslibs import timezones +from pandas.tseries.frequencies import to_offset +from pandas.tseries.offsets import Tick + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField, Int64Field, StringField +from ...tensor.utils import decide_chunk_sizes +from ...utils import no_default, pd_release_version +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +try: + from pandas._libs.tslib import normalize_date +except ImportError: # pragma: no cover + + def normalize_date(dt): # from pandas/_libs/tslibs/conversion.pyx + if isinstance(dt, datetime): + if isinstance(dt, pd.Timestamp): + return dt.replace( + hour=0, minute=0, second=0, microsecond=0, nanosecond=0 + ) + else: + return dt.replace(hour=0, minute=0, second=0, microsecond=0) + elif isinstance(dt, date): + return datetime(dt.year, dt.month, dt.day) + else: + raise TypeError(f"Unrecognized type: {type(dt)}") + + +_date_range_use_inclusive = pd_release_version[:2] >= (1, 4) + + +# adapted from pandas.core.arrays.datetimes.generate_range +def generate_range_count( + start=None, end=None, periods=None, offset=None +): # pragma: no cover + offset = to_offset(offset) + + start = Timestamp(start) + start = start if start is not NaT else None + end = Timestamp(end) + end = end if end is not NaT else None + + if start and not offset.is_on_offset(start): + start = offset.rollforward(start) + + elif end and not offset.is_on_offset(end): + end = offset.rollback(end) + + if periods is None and end < start and offset.n >= 0: + end = None + periods = 0 + + if end is None: + end = start + (periods - 1) * offset + + if start is None: + start = end - (periods - 1) * offset + + cur = start + count = 0 + if offset.n >= 0: + while cur <= end: + count += 1 + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + try: + next_date = offset._apply(cur) + except AttributeError: + next_date = cur + offset + if next_date <= cur: + raise ValueError(f"Offset {offset} did not increment date") + cur = next_date + else: + while cur >= end: + count += 1 + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + try: + next_date = offset._apply(cur) + except AttributeError: + next_date = cur + offset + if next_date >= cur: + raise ValueError(f"Offset {offset} did not decrement date") + cur = next_date + return count + + +class DataFrameDateRange(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATE_RANGE + + start = AnyField("start") + end = AnyField("end") + periods = Int64Field("periods") + freq = 
AnyField("freq") + tz = AnyField("tz") + normalize = BoolField("normalize") + name = StringField("name") + inclusive = StringField("inclusive") + + def __init__( + self, + output_types=None, + **kw, + ): + super().__init__(_output_types=output_types, **kw) + if self.output_types is None: + self.output_types = [OutputType.index] + if getattr(self, "inclusive", None) is None: + self.inclusive = "both" + + def __call__(self, shape, chunk_size=None): + dtype = pd.Index([self.start]).dtype + index_value = parse_index( + pd.Index([], dtype=dtype), self.start, self.end, self.periods, self.tz + ) + # gen index value info + index_value.value._min_val = self.start + index_value.value._min_val_close = True + index_value.value._max_val = self.end + index_value.value._max_val_close = True + index_value.value._is_unique = True + index_value.value._is_monotonic_increasing = True + index_value.value._freq = self.freq + return self.new_index( + None, + shape=shape, + dtype=dtype, + index_value=index_value, + name=self.name, + raw_chunk_size=chunk_size, + freq=self.freq, + ) + + @classmethod + def tile(cls, op: "DataFrameDateRange"): + out = op.outputs[0] + start = op.start + end = op.end + freq = op.freq + periods = op.periods + inclusive = op.inclusive + + chunk_length = out.extra_params.raw_chunk_size or options.chunk_size + chunk_length = decide_chunk_sizes(out.shape, chunk_length, out.dtype.itemsize)[ + 0 + ] + + if inclusive in ("neither", "right"): + # if left not close, add one more for the first chunk + chunk_length = (chunk_length[0] + 1,) + chunk_length[1:] + + if freq is None: + if periods > 1: + freq = (end - op.start) / (periods - 1) + else: + freq = end - start + + out_chunks = [] + cum_nsplit = [0] + np.cumsum(chunk_length).tolist() + for i, chunk_size in enumerate(chunk_length): + chunk_op = op.copy().reset_key() + chunk_op.periods = chunk_size + + if i > 0 or inclusive not in ("neither", "right"): + # for chunks in the middle, all sides are inclusive + chunk_op.inclusive = "both" + elif 0 == i and inclusive == "neither": + chunk_op.inclusive = "right" + + chunk_i_start = cum_nsplit[i] + if chunk_i_start > 0: + chunk_start = chunk_op.start = start + freq * chunk_i_start + else: + chunk_start = chunk_op.start = start + chunk_end = chunk_op.end = chunk_start + (chunk_size - 1) * freq + + # gen chunk index_value + chunk_index_value = parse_index(out.index_value.to_pandas(), i, out) + chunk_index_value.value._min_val = chunk_start + chunk_index_value.value._min_val_close = True + chunk_index_value.value._max_val = chunk_end + chunk_index_value.value._max_val_close = True + chunk_index_value.value._is_unique = True + chunk_index_value.value._is_monotonic_increasing = True + + size = ( + chunk_size - 1 + if i == 0 and inclusive in ("neither", "right") + else chunk_size + ) + out_chunk = chunk_op.new_chunk( + None, + shape=(size,), + index=(i,), + dtype=out.dtype, + index_value=chunk_index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = (tuple(c.shape[0] for c in out_chunks),) + return new_op.new_indexes(None, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameDateRange"): + start, end, periods = op.start, op.end, op.periods + freq = op.freq + if freq is not None: + end = None + kw = dict( + start=start, + end=end, + periods=periods, + freq=freq, + tz=op.tz, + normalize=op.normalize, + name=op.name, + inclusive=op.inclusive, + ) + if not _date_range_use_inclusive: + 
closed = kw.pop("inclusive") + assert closed != "neither" + kw["closed"] = None if closed == "both" else closed + ctx[op.outputs[0].key] = pd.date_range(**kw) + + +_midnight = time(0, 0) + + +def _maybe_normalize_endpoints(start, end, normalize): # pragma: no cover + _normalized = True + + if start is not None: + if normalize: + start = normalize_date(start) + _normalized = True + else: + _normalized = _normalized and start.time() == _midnight + + if end is not None: + if normalize: + end = normalize_date(end) + _normalized = True + else: + _normalized = _normalized and end.time() == _midnight + + return start, end, _normalized + + +def _infer_tz_from_endpoints(start, end, tz): # pragma: no cover + """ + If a timezone is not explicitly given via `tz`, see if one can + be inferred from the `start` and `end` endpoints. If more than one + of these inputs provides a timezone, require that they all agree. + + Parameters + ---------- + start : Timestamp + end : Timestamp + tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if start and end timezones do not agree + """ + try: + inferred_tz = timezones.infer_tzinfo(start, end) + except AssertionError: + # infer_tzinfo raises AssertionError if passed mismatched timezones + raise TypeError( + "Start and end cannot both be tz-aware with different timezones" + ) + + inferred_tz = timezones.maybe_get_tz(inferred_tz) + tz = timezones.maybe_get_tz(tz) + + if tz is not None and inferred_tz is not None: + if not timezones.tz_compare(inferred_tz, tz): + raise AssertionError("Inferred time zone not equal to passed time zone") + + elif inferred_tz is not None: + tz = inferred_tz + + return tz + + +def _maybe_localize_point( + ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent +): # pragma: no cover + """ + Localize a start or end Timestamp to the timezone of the corresponding + start or end Timestamp + + Parameters + ---------- + ts : start or end Timestamp to potentially localize + is_none : argument that should be None + is_not_none : argument that should not be None + freq : Tick, DateOffset, or None + tz : str, timezone object or None + ambiguous: str, localization behavior for ambiguous times + nonexistent: str, localization behavior for nonexistent times + + Returns + ------- + ts : Timestamp + """ + # Make sure start and end are timezone localized if: + # 1) freq = a Timedelta-like frequency (Tick) + # 2) freq = None i.e. generating a linspaced range + if is_none is None and is_not_none is not None: + # Note: We can't ambiguous='infer' a singular ambiguous time; however, + # we have historically defaulted ambiguous=False + ambiguous = ambiguous if ambiguous != "infer" else False + localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None} + if isinstance(freq, Tick) or freq is None: + localize_args["tz"] = tz + ts = ts.tz_localize(**localize_args) + return ts + + +def date_range( + start=None, + end=None, + periods=None, + freq=None, + tz=None, + normalize=False, + name=None, + closed=no_default, + inclusive=None, + chunk_size=None, + **kwargs, +): + """ + Return a fixed frequency DatetimeIndex. + + Parameters + ---------- + start : str or datetime-like, optional + Left bound for generating dates. + end : str or datetime-like, optional + Right bound for generating dates. + periods : int, optional + Number of periods to generate. + freq : str or DateOffset, default 'D' + Frequency strings can have multiples, e.g. '5H'. See + :ref:`here ` for a list of + frequency aliases. 
+ tz : str or tzinfo, optional + Time zone name for returning localized DatetimeIndex, for example + 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is + timezone-naive. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + name : str, default None + Name of the resulting DatetimeIndex. + inclusive : {“both”, “neither”, “left”, “right”}, default “both” + Include boundaries; Whether to set each bound as closed or open. + **kwargs + For compatibility. Has no effect on the result. + + Returns + ------- + rng : DatetimeIndex + + See Also + -------- + DatetimeIndex : An immutable container for datetimes. + timedelta_range : Return a fixed frequency TimedeltaIndex. + period_range : Return a fixed frequency PeriodIndex. + interval_range : Return a fixed frequency IntervalIndex. + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. If ``freq`` is omitted, the resulting + ``DatetimeIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end`` (closed on both sides). + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + **Specifying the values** + + The next four examples generate the same `DatetimeIndex`, but vary + the combination of `start`, `end` and `periods`. + + Specify `start` and `end`, with the default daily frequency. + >>> import mars.dataframe as md + + >>> md.date_range(start='1/1/2018', end='1/08/2018').execute() + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `start` and `periods`, the number of periods (days). + + >>> md.date_range(start='1/1/2018', periods=8).execute() + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `end` and `periods`, the number of periods (days). + + >>> md.date_range(end='1/1/2018', periods=8).execute() + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + Specify `start`, `end`, and `periods`; the frequency is generated + automatically (linearly spaced). + + >>> md.date_range(start='2018-04-24', end='2018-04-27', periods=3).execute() + DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) + + **Other Parameters** + + Changed the `freq` (frequency) to ``'M'`` (month end frequency). + + >>> md.date_range(start='1/1/2018', periods=5, freq='M').execute() + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq='M') + + Multiples are allowed + + >>> md.date_range(start='1/1/2018', periods=5, freq='3M').execute() + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + `freq` can also be specified as an Offset object. + + >>> md.date_range(start='1/1/2018', periods=5, freq=md.offsets.MonthEnd(3)).execute() + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + Specify `tz` to set the timezone. 
+ + >>> md.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo').execute() + DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', + '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', + '2018-01-05 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq='D') + + `inclusive` controls whether to include `start` and `end` that are on the + boundary. The default, "both", includes boundary points on either end. + + >>> md.date_range(start='2017-01-01', end='2017-01-04', inclusive='both').execute() + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. + + >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='left').execute() + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], + dtype='datetime64[ns]', freq='D') + + Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, + and similarly inclusive='neither' will exclude both `start` and `end`. + + >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='right').execute() + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + .. note:: + Pandas 1.4.0 or later is required to use ``inclusive='neither'``. + Otherwise an error may be raised. + """ + # validate periods + if isinstance(periods, (float, np.floating)): + periods = int(periods) + if periods is not None and not isinstance(periods, (int, np.integer)): + raise TypeError(f"periods must be a number, got {periods}") + + if freq is None and any(arg is None for arg in [periods, start, end]): + freq = "D" + if sum(arg is not None for arg in [start, end, periods, freq]) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) + freq = to_offset(freq) + + if _date_range_use_inclusive and closed is not no_default: + warnings.warn( + "Argument `closed` is deprecated in favor of `inclusive`.", FutureWarning + ) + elif closed is no_default: + closed = None + + if inclusive is None and closed is not no_default: + inclusive = closed + + if start is not None: + start = pd.Timestamp(start) + + if end is not None: + end = pd.Timestamp(end) + + if start is pd.NaT or end is pd.NaT: + raise ValueError("Neither `start` nor `end` can be NaT") + + start, end, _ = _maybe_normalize_endpoints(start, end, normalize) + tz = _infer_tz_from_endpoints(start, end, tz) + + if start is None and end is not None: + # start is None and end is not None + # adjust end first + end = pd.date_range(end=end, periods=1, freq=freq)[0] + if inclusive == "neither": + end -= freq + size = periods + start = end - (periods - 1) * freq + if inclusive in ("neither", "left"): + size -= 1 + elif inclusive == "right": + # when start is None, closed == 'left' would not take effect + # thus just ignore + inclusive = "both" + elif end is None: + # end is None + # adjust start first + start = pd.date_range(start=start, periods=1, freq=freq)[0] + size = periods + end = start + (periods - 1) * freq + if inclusive in ("neither", "right"): + size -= 1 + elif inclusive == "left": + # when end is None, closed == 'left' would not take effect + # thus just ignore + inclusive = "both" + else: + if periods is None: + periods = size = generate_range_count(start, end, periods, freq) + else: + size = periods + if inclusive in ("left", "right"): + size -= 1 + elif inclusive == "neither": + size -= 2 + + shape = (size,) + op = DataFrameDateRange( + 
start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + name=name, + inclusive=inclusive, + **kwargs, + ) + return op(shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/dataframe/datasource/from_index.py b/python/xorbits/_mars/dataframe/datasource/from_index.py new file mode 100644 index 000000000..8c91b3c89 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_index.py @@ -0,0 +1,99 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, KeyField +from ..initializer import Index +from ..operands import DataFrameOperand, DataFrameOperandMixin + + +class SeriesFromIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.SERIES_FROM_INDEX + + input_ = KeyField("input_") + index = KeyField("index") + name = AnyField("name", default=None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input_ = self._inputs[0] + if len(self._inputs) > 1: + self.index = self._inputs[1] + + def __call__(self, index, new_index=None, name=None): + inputs = [index] + index_value = index.index_value + if new_index is not None: + inputs.append(new_index) + index_value = new_index.index_value + return self.new_series( + inputs, + shape=index.shape, + dtype=index.dtype, + index_value=index_value, + name=name, + ) + + @classmethod + def tile(cls, op: "SeriesFromIndex"): + inp = op.input_ + out = op.outputs[0] + index = op.index + + if index is not None: + index = yield from recursive_tile(op.index.rechunk({0: inp.nsplits[0]})) + + chunks = [] + for i, c in enumerate(inp.chunks): + chunk_op = op.copy().reset_key() + chunk_inputs = [c] + chunk_index_value = c.index_value + if index is not None: + index_chunk = index.chunks[i] + chunk_index_value = index_chunk.index_value + chunk_inputs.append(index_chunk) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=c.shape, + dtype=c.dtype, + index_value=chunk_index_value, + name=out.name, + index=c.index, + ) + chunks.append(chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = chunks + params["nsplits"] = inp.nsplits + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + out = op.outputs[0] + inp = ctx[op.input_.key] + index = None + if op.index is not None: + index = ctx[op.index.key] + name = op.name or out.name + ctx[out.key] = inp.to_series(index=index, name=name) + + +def series_from_index(ind, index=None, name=None): + name = name or ind.name or 0 + if index is not None: + index = Index(index) + op = SeriesFromIndex(input_=ind, index=index, name=name) + return op(ind, new_index=index, name=name) diff --git a/python/xorbits/_mars/dataframe/datasource/from_records.py b/python/xorbits/_mars/dataframe/datasource/from_records.py new file mode 100644 index 000000000..990ef5165 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_records.py @@ -0,0 +1,164 @@ +# 
Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+from ... import opcodes as OperandDef
+from ...core import OutputType
+from ...serialization.serializables import BoolField, Int32Field, ListField
+from ...tensor.core import TENSOR_TYPE
+from ..operands import DataFrameOperand, DataFrameOperandMixin
+from ..utils import parse_index
+
+
+class DataFrameFromRecords(DataFrameOperand, DataFrameOperandMixin):
+    _op_type_ = OperandDef.DATAFRAME_FROM_RECORDS
+
+    columns = ListField("columns", default=None)
+    exclude = ListField("exclude", default=None)
+    coerce_float = BoolField("coerce_float", default=False)
+    nrows = Int32Field("nrows", default=None)
+
+    def __init__(self, index=None, columns=None, **kw):
+        if index is not None or columns is not None:
+            raise NotImplementedError("Specifying index value is not supported for now")
+        super().__init__(columns=columns, _output_types=[OutputType.dataframe], **kw)
+
+    def __call__(self, data):
+        if self.nrows is None:
+            nrows = data.shape[0]
+        else:
+            nrows = self.nrows
+        index_value = parse_index(pd.RangeIndex(start=0, stop=nrows))
+        dtypes = pd.Series(dict((k, np.dtype(v)) for k, v in data.dtype.descr))
+        columns_value = parse_index(pd.Index(data.dtype.names), store_data=True)
+        return self.new_dataframe(
+            [data],
+            (data.shape[0], len(data.dtype.names)),
+            dtypes=dtypes,
+            index_value=index_value,
+            columns_value=columns_value,
+        )
+
+    @classmethod
+    def tile(cls, op):
+        df = op.outputs[0]
+        tensor = op.inputs[0]
+
+        nsplit_acc = np.cumsum(tensor.nsplits[0])
+        out_chunks = []
+        for chunk in tensor.chunks:
+            begin_index = nsplit_acc[chunk.index[0]] - chunk.shape[0]
+            end_index = nsplit_acc[chunk.index[0]]
+            chunk_index_value = parse_index(
+                pd.RangeIndex(start=begin_index, stop=end_index)
+            )
+
+            # Here the `new_chunk` is tricky:
+            #
+            # We can construct tensors that have identical chunks, for example from
+            # `mt.ones(...)`: after tiling, chunks of the same shape (but at different
+            # positions) in `mt.ones` are indeed the same chunk (they share one key)!
+            #
+            # Thus, when we construct a dataframe from such a tensor, we get dataframe
+            # chunks that only differ in `index_value`. However, the `index_value` field
+            # is not used to calculate the chunk key of the dataframe chunk, so
+            # `new_chunk` would generate the same key for chunks that are actually
+            # different (they have different `index_value`s).
+            #
+            # Here, we construct each new chunk with unique `_extra_params` to make
+            # `new_chunk` work as expected.
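+            # A hypothetical illustration of the collision: the two chunks of
+            # `mt.ones((4,), chunk_size=2)` are tiled into tensor chunks that share
+            # one key, so without the distinct `begin_index`/`end_index` recorded in
+            # `extra_params` below, the dataframe chunks built from them would
+            # collide as well.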
+ chunk_op = op.copy().reset_key() + chunk_op.extra_params["begin_index"] = begin_index + chunk_op.extra_params["end_index"] = end_index + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(chunk.shape[0], df.shape[1]), + index=(chunk.index[0], 0), + dtypes=df.dtypes, + index_value=chunk_index_value, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + [tensor], + df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=out_chunks, + nsplits=[tensor.nsplits[0], [df.shape[1]]], + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + ctx[chunk.key] = pd.DataFrame.from_records( + ctx[op.inputs[0].key], + index=chunk.index_value.to_pandas(), + columns=chunk.columns_value.to_pandas(), + exclude=op.exclude, + coerce_float=op.coerce_float, + nrows=op.nrows, + ) + + +def from_records( + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None, + gpu=None, + sparse=False, + **kw +): + if isinstance(data, np.ndarray): + from .dataframe import from_pandas + + return from_pandas( + pd.DataFrame.from_records( + data, + index=index, + exclude=exclude, + columns=columns, + coerce_float=coerce_float, + nrows=nrows, + ), + **kw + ) + elif isinstance(data, TENSOR_TYPE): + if data.dtype.names is None: + raise TypeError("Not a tensor with structured dtype {0}", data.dtype) + if data.ndim != 1: + raise ValueError( + "Not a tensor with non 1-D structured dtype {0}", data.shape + ) + + op = DataFrameFromRecords( + index=None, + exclude=exclude, + columns=columns, + coerce_float=coerce_float, + nrows=nrows, + gpu=gpu, + sparse=sparse, + **kw + ) + return op(data) + else: + raise TypeError("Not support create DataFrame from {0}", type(data)) diff --git a/python/xorbits/_mars/dataframe/datasource/from_tensor.py b/python/xorbits/_mars/dataframe/datasource/from_tensor.py new file mode 100644 index 000000000..49b5a58dd --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_tensor.py @@ -0,0 +1,754 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from typing import Any, Dict, List, Union + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...core.context import Context +from ...serialization.serializables import AnyField, KeyField +from ...tensor.core import Tensor +from ...tensor.datasource import tensor as astensor +from ...tensor.utils import unify_chunks +from ...typing import EntityType, TileableType +from ...utils import has_unknown_shape +from ..core import INDEX_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameFromTensor(DataFrameOperand, DataFrameOperandMixin): + """ + Represents data from mars tensor + """ + + _op_type_ = OperandDef.DATAFRAME_FROM_TENSOR + + input = AnyField("input") + index = AnyField("index") + columns = AnyField("columns") + + def __init__(self, *args, **kwargs): + kwargs["_output_types"] = [OutputType.dataframe] + super().__init__(*args, **kwargs) + + def _set_inputs(self, inputs: List[EntityType]): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + if self.input is not None: + if not isinstance(self.input, dict): + self.input = next(inputs_iter) + else: + # check each value for input + new_input = OrderedDict() + for k, v in self.input.items(): + if isinstance(v, ENTITY_TYPE): + new_input[k] = next(inputs_iter) + else: + new_input[k] = v + self.input = new_input + + if isinstance(self.index, ENTITY_TYPE): + self.index = next(inputs_iter) + + def __call__( + self, + input_tensor: Tensor, + index: Union[TileableType, pd.Index], + columns: pd.Index, + dtypes: pd.Series, + ): + if isinstance(input_tensor, dict): + return self._call_input_1d_tileables(input_tensor, index, columns, dtypes) + elif input_tensor is not None: + return self._call_input_tensor(input_tensor, index, columns, dtypes) + else: + return self._call_tensor_none(index, columns, dtypes) + + def _process_index( + self, index: Union[TileableType, pd.Index], inputs: List[EntityType] + ): + if not isinstance(index, pd.Index): + if isinstance(index, INDEX_TYPE): + index_value = index.index_value + inputs.append(index) + elif isinstance(index, ENTITY_TYPE): + index = astensor(index) + if index.ndim != 1: + raise ValueError(f"index should be 1-d, got {index.ndim}-d") + index_value = parse_index( + pd.Index([], dtype=index.dtype), index, type(self).__name__ + ) + inputs.append(index) + else: + index = pd.Index(index) + index_value = parse_index(index) + else: + index_value = parse_index(index) + return index_value + + def _call_input_1d_tileables( + self, + input_1d_tileables: Dict[Any, TileableType], + index: Union[TileableType, pd.Index], + columns: pd.Index, + dtypes: pd.Series, + ): + tileables = [] + shape = None + for tileable in input_1d_tileables.values(): + tileable_shape = astensor(tileable).shape + if len(tileable_shape) > 0: + if shape is None: + shape = tileable_shape + elif shape != tileable_shape: + raise ValueError("input 1-d tensors should have same shape") + + if isinstance(tileable, ENTITY_TYPE): + tileables.append(tileable) + + if index is not None: + tileable_size = tileables[0].shape[0] + if hasattr(index, "shape"): + index_size = index.shape[0] + else: + index_size = len(index) + if ( + not pd.isna(tileable_size) + and not pd.isna(index_size) + and tileable_size != index_size + ): + raise ValueError( + f"index {index} should have the same shape " + f"with tensor: {tileable_size}" + ) + index_value = self._process_index(index, tileables) + else: + self.index = index = pd.RangeIndex(0, tileables[0].shape[0]) + index_value = 
parse_index(index) + + if columns is not None: + if len(input_1d_tileables) != len(columns): + raise ValueError( + f"columns {columns} should have size {len(input_1d_tileables)}" + ) + if not isinstance(columns, pd.Index): + if isinstance(columns, ENTITY_TYPE): + raise NotImplementedError("The columns value cannot be a tileable") + columns = pd.Index(columns) + columns_value = parse_index(columns, store_data=True) + else: + columns_value = parse_index( + pd.RangeIndex(0, len(input_1d_tileables)), store_data=True + ) + + shape = (shape[0], len(input_1d_tileables)) + return self.new_dataframe( + tileables, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_input_tensor( + self, + input_tensor: Tensor, + index: Union[TileableType, pd.Index], + columns: pd.Index, + dtypes: pd.Series, + ): + if input_tensor.ndim not in {1, 2}: + raise ValueError("Must pass 1-d or 2-d input") + inputs = [input_tensor] + + if index is not None: + if input_tensor.shape[0] != len(index): + raise ValueError( + f"index {index} should have the same shape with tensor: {input_tensor.shape[0]}" + ) + index_value = self._process_index(index, inputs) + elif isinstance(input_tensor, SERIES_TYPE): + index_value = input_tensor.index_value + else: + stop = input_tensor.shape[0] + stop = -1 if np.isnan(stop) else stop + index = self.index = pd.RangeIndex(start=0, stop=stop) + index_value = parse_index(index) + + if columns is not None: + if not ( + input_tensor.ndim == 1 + and len(columns) == 1 + or input_tensor.shape[1] == len(columns) + ): + raise ValueError( + f"columns {columns} should have the same shape with tensor: {input_tensor.shape[1]}" + ) + if not isinstance(columns, pd.Index): + if isinstance(columns, ENTITY_TYPE): + raise NotImplementedError("The columns value cannot be a tileable") + columns = pd.Index(columns) + columns_value = parse_index(columns, store_data=True) + else: + if input_tensor.ndim == 1: + # convert to 1-d DataFrame + columns_value = parse_index( + pd.RangeIndex(start=0, stop=1), store_data=True + ) + else: + columns_value = parse_index( + pd.RangeIndex(start=0, stop=input_tensor.shape[1]), store_data=True + ) + + if input_tensor.ndim == 1: + shape = (input_tensor.shape[0], 1) + else: + shape = input_tensor.shape + + return self.new_dataframe( + inputs, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_tensor_none( + self, index: Union[TileableType, pd.Index], columns: pd.Index, dtypes: pd.Series + ): + inputs = [] + shape = [] + if index is not None: + index_value = self._process_index(index, inputs) + shape.append(index.shape[0]) + else: + index = self.index = pd.Index([], dtype=object) + index_value = parse_index(index) + shape.append(0) + + if columns is not None: + if not isinstance(columns, pd.Index): + if isinstance(columns, ENTITY_TYPE): + raise NotImplementedError("The columns value cannot be a tileable") + columns = pd.Index(columns) + columns_value = parse_index(columns, store_data=True) + shape.append(columns.shape[0]) + else: + columns_value = parse_index(pd.Index([], dtype=object), store_data=True) + shape.append(0) + + return self.new_dataframe( + inputs, + shape=tuple(shape), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameFromTensor"): + if isinstance(op.input, dict): + return (yield from cls._tile_input_1d_tileables(op)) + elif op.input is not None: + return (yield from cls._tile_input_tensor(op)) + 
else: + return cls._tile_tensor_none(op) + + @classmethod + def _tile_input_1d_tileables(cls, op: "DataFrameFromTensor"): + # make sure all tensor have known chunk shapes + if has_unknown_shape(*op.inputs): + yield + + out_df = op.outputs[0] + in_tensors = op.inputs + in_tensors = yield from unify_chunks(*in_tensors) + nsplit = in_tensors[0].nsplits[0] + + cum_sizes = [0] + np.cumsum(nsplit).tolist() + out_chunks = [] + for i in range(in_tensors[0].chunk_shape[0]): + chunk_op = op.copy().reset_key() + new_input = OrderedDict() + for k, v in op.input.items(): + if not isinstance(v, ENTITY_TYPE): + try: + new_input[k] = v[cum_sizes[i] : cum_sizes[i + 1]] + except TypeError: + # scalar + new_input[k] = v + else: + # do not need to do slice, + # will be done in set_inputs + new_input[k] = v + chunk_op.input = new_input + columns_value = out_df.columns_value + dtypes = out_df.dtypes + chunk_index = (i, 0) + if isinstance(op.index, INDEX_TYPE): + index_value = in_tensors[-1].chunks[i].index_value + elif isinstance(op.index, pd.Index): + chunk_op.index = pd_index = op.index[cum_sizes[i] : cum_sizes[i + 1]] + index_value = parse_index(pd_index, store_data=True) + else: + assert op.index is not None + index_chunk = in_tensors[-1].cix[i,] + index_value = parse_index( + pd.Index([], dtype=index_chunk.dtype), + index_chunk, + type(chunk_op).__name__, + ) + shape = (nsplit[i], len(out_df.dtypes)) + out_chunk = chunk_op.new_chunk( + [t.cix[(i,)] for t in in_tensors], + shape=shape, + index=chunk_index, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + out_chunks.append(out_chunk) + + nsplits = (nsplit, (len(out_df.dtypes),)) + new_op = op.copy() + return new_op.new_dataframes( + out_df.inputs, + out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def _tile_input_tensor(cls, op: "DataFrameFromTensor"): + out_df = op.outputs[0] + in_tensor = op.input + out_chunks = [] + if out_df.index_value.has_value() and has_unknown_shape(in_tensor): + yield + + nsplits = in_tensor.nsplits + + if op.index is not None and hasattr(op.index, "key"): + # rechunk index if it's a tensor + if has_unknown_shape(*op.inputs): + yield + index_tensor = yield from recursive_tile(op.index.rechunk([nsplits[0]])) + else: + index_tensor = None + + # nsplits + if in_tensor.ndim == 1: + out_nsplits = in_tensor.nsplits + ((1,),) + else: + out_nsplits = in_tensor.nsplits + + cum_nsplits = [[0] + np.cumsum(ns).tolist() for ns in out_nsplits] + for in_chunk in in_tensor.chunks: + out_op = op.copy().reset_key() + chunk_inputs = [in_chunk] + if in_chunk.ndim == 1: + i = in_chunk.index[0] + chunk_index = (i, 0) + chunk_shape = (in_chunk.shape[0], 1) + else: + i, j = in_chunk.index + chunk_index = in_chunk.index + chunk_shape = in_chunk.shape + + if op.columns is not None: + column_nsplit = cum_nsplits[1] + j = chunk_index[1] + out_op.columns = op.columns[column_nsplit[j] : column_nsplit[j + 1]] + + if isinstance(op.index, INDEX_TYPE): + index_chunk = index_tensor.chunks[i] + chunk_inputs.append(index_chunk) + elif isinstance(op.index, pd.Index): + index_nsplit = cum_nsplits[0] + if op.index.size > 0: + out_op.index = op.index[index_nsplit[i] : index_nsplit[i + 1]] + elif index_tensor is not None: + index_chunk = index_tensor.cix[i] + chunk_inputs.append(index_chunk) + + out_chunk = out_op.new_chunk( + chunk_inputs, shape=chunk_shape, index=chunk_index + ) + out_chunk._set_tileable_meta( + 
tileable_key=out_df.key, + nsplits=out_nsplits, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = out_nsplits + return new_op.new_dataframes(out_df.inputs, kws=[params]) + + @classmethod + def _tile_tensor_none(cls, op: "DataFrameFromTensor"): + out_df = op.outputs[0] + + out_chunks = [] + assert isinstance(op.index, INDEX_TYPE) + # tile as index + for index_chunk in op.index.chunks: + index_value = index_chunk.index_value + + chunk_shape = (index_chunk.shape[0], out_df.shape[1]) + chunk_index = (index_chunk.index[0], 0) + + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [index_chunk], + shape=chunk_shape, + index=chunk_index, + index_value=index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_df.params.copy() + params["nsplits"] = (op.index.nsplits[0], (out_df.shape[1],)) + params["chunks"] = out_chunks + return new_op.new_dataframes(out_df.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "DataFrameFromTensor"): + chunk = op.outputs[0] + + if isinstance(op.input, dict): + d = OrderedDict() + for k, v in op.input.items(): + if hasattr(v, "key"): + d[k] = ctx[v.key] + else: + d[k] = v + if op.index is not None and hasattr(op.index, "key"): + index_data = ctx[op.index.key] + else: + index_data = op.index + ctx[chunk.key] = pd.DataFrame(d, index=index_data, columns=op.columns) + elif op.input is not None: + tensor_data = ctx[op.inputs[0].key] + if isinstance(tensor_data, pd.Series): + ctx[chunk.key] = tensor_data.to_frame(name=chunk.dtypes.index[0]) + else: + if op.index is not None and hasattr(op.index, "key"): + # index is a tensor + index_data = ctx[op.inputs[1].key] + else: + index_data = op.index + if isinstance(index_data, pd.RangeIndex) and len(index_data) == 0: + index_data = None + ctx[chunk.key] = pd.DataFrame( + tensor_data, + index=index_data, + columns=op.columns, + ) + else: + index_data = ctx[op.index.key] + ctx[chunk.key] = pd.DataFrame(index=index_data, columns=op.columns) + + +def dataframe_from_tensor( + tensor: Tensor, + index: Union[TileableType, pd.Index] = None, + columns: Union[pd.Index, list] = None, + gpu: bool = None, + sparse: bool = False, +): + if tensor is not None: + if tensor.ndim > 2 or tensor.ndim <= 0: + raise TypeError( + f"Not support create DataFrame from {tensor.ndim} dims tensor" + ) + try: + col_num = tensor.shape[1] + except IndexError: + col_num = 1 + gpu = tensor.op.gpu if gpu is None else gpu + dtypes = pd.Series([tensor.dtype] * col_num, index=columns) + if columns is None: + columns = dtypes.index + else: + gpu = None + if columns is not None: + dtypes = pd.Series([], index=columns) + else: + dtypes = pd.Series([], index=pd.Index([], dtype=object)) + if index is not None and not isinstance(index, ENTITY_TYPE): + index = pd.Index(index) + op = DataFrameFromTensor( + input=tensor, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + return op(tensor, index, columns, dtypes) + + +def dataframe_from_1d_tileables( + d: Dict[Any, TileableType], + index: Union[TileableType, pd.Index, list] = None, + columns: Union[pd.Index, list] = None, + gpu: bool = None, + sparse: bool = False, +): + data = dict() + for k, v in d.items(): + if isinstance(v, (list, tuple)) and any( + isinstance(sv, ENTITY_TYPE) for sv in 
v + ): + data[k] = astensor(v) + else: + data[k] = v + d = data + if columns is not None: + tileables = [d.get(c) for c in columns] + else: + columns = list(d.keys()) + tileables = list(d.values()) + + gpu = ( + next((t.op.gpu for t in tileables if hasattr(t, "op")), False) + if gpu is None + else gpu + ) + dtypes = pd.Series( + [t.dtype if hasattr(t, "dtype") else pd.Series(t).dtype for t in tileables], + index=columns, + ) + if index is not None and not isinstance(index, ENTITY_TYPE): + index = pd.Index(index) + op = DataFrameFromTensor( + input=d, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + return op(d, index, columns, dtypes) + + +class SeriesFromTensor(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.SERIES_FROM_TENSOR + + input = KeyField("input") + index = AnyField("index") + + def _set_inputs(self, inputs: List[EntityType]): + super()._set_inputs(inputs) + if self.input is not None: + self.input = self.inputs[0] + if self.index is not None and hasattr(self.index, "key"): + self.index = self.inputs[-1] + + @classmethod + def tile(cls, op: "SeriesFromTensor"): + if op.index is None or not hasattr(op.index, "key"): + # check all inputs to make sure no unknown chunk shape + if has_unknown_shape(*op.inputs): + yield + + if op.input is None: + return cls._tile_tensor_none(op) + + out_series = op.outputs[0] + in_tensor = op.inputs[0] + nsplits = in_tensor.nsplits + + index_tensor = series_index = None + if op.index is not None: + if hasattr(op.index, "key"): + index_tensor = yield from recursive_tile(op.index.rechunk([nsplits[0]])) + else: + series_index = op.index + + index_start = 0 + out_chunks = [] + for in_chunk in in_tensor.chunks: + new_op = op.copy().reset_key() + new_op.extra_params["index_start"] = index_start + chunk_inputs = [in_chunk] + if index_tensor is not None: + index_chunk = index_tensor.cix[in_chunk.index] + chunk_inputs.append(index_chunk) + if isinstance(op.index, INDEX_TYPE): + index_value = index_chunk.index_value + else: + index_value = parse_index( + pd.Index([], dtype=in_chunk.dtype), + index_chunk, + type(new_op).__name__, + ) + else: + chunk_pd_index = series_index[ + index_start : index_start + in_chunk.shape[0] + ] + index_value = parse_index(chunk_pd_index) + new_op.index = chunk_pd_index + index_start += in_chunk.shape[0] + out_chunk = new_op.new_chunk( + chunk_inputs, + shape=in_chunk.shape, + index=in_chunk.index, + index_value=index_value, + name=out_series.name, + dtype=out_series.dtype, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + shape=out_series.shape, + dtype=out_series.dtype, + index_value=out_series.index_value, + name=out_series.name, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def _tile_tensor_none(cls, op: "SeriesFromTensor"): + out_series = op.outputs[0] + + out_chunks = [] + assert isinstance(op.index, INDEX_TYPE) + # tile as index + for index_chunk in op.index.chunks: + index_value = index_chunk.index_value + + chunk_shape = (index_chunk.shape[0],) + chunk_index = (index_chunk.index[0],) + + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [index_chunk], + shape=chunk_shape, + index=chunk_index, + index_value=index_value, + dtype=out_series.dtype, + name=out_series.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_series.params.copy() + params["nsplits"] = (op.index.nsplits[0],) + params["chunks"] = out_chunks + return new_op.new_tileables(out_series.inputs, kws=[params]) + 
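+    # Note: execute() below materializes a single chunk. It looks up the tensor
+    # chunk data (and the index chunk data, when the index is itself a tileable)
+    # from the execution context and builds the corresponding pandas Series; an
+    # empty RangeIndex is treated as "index not specified".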
+ @classmethod + def execute(cls, ctx: Union[dict, Context], op: "SeriesFromTensor"): + chunk = op.outputs[0] + if op.input is not None: + tensor_data = ctx[op.input.key] + else: + tensor_data = None + + if op.index is not None and hasattr(op.index, "key"): + index_data = ctx[op.index.key] + else: + index_data = op.index + if ( + tensor_data is not None + and isinstance(index_data, pd.RangeIndex) + and len(index_data) == 0 + ): + # index not specified + index_data = None + + ctx[chunk.key] = pd.Series( + tensor_data, index=index_data, name=chunk.name, dtype=chunk.dtype + ) + + def __call__( + self, + input_tensor: Tensor, + index: Union[TileableType, pd.Index], + dtype: np.dtype, + name: Any, + ): + inputs = [input_tensor] if input_tensor is not None else [] + if index is not None: + if not isinstance(index, pd.Index): + if isinstance(index, INDEX_TYPE): + self.index = index + index_value = index.index_value + inputs.append(index) + elif isinstance(index, ENTITY_TYPE): + self.index = index + index = astensor(index) + if index.ndim != 1: + raise ValueError(f"index should be 1-d, got {index.ndim}-d") + index_value = parse_index( + pd.Index([], dtype=index.dtype), index, type(self).__name__ + ) + inputs.append(index) + else: + self.index = index = pd.Index(index) + index_value = parse_index(index) + else: + self.index = index + index_value = parse_index(index) + elif input_tensor is not None: + if pd.isna(input_tensor.shape[0]): + pd_index = pd.RangeIndex(-1) + else: + pd_index = pd.RangeIndex(start=0, stop=input_tensor.shape[0]) + index_value = parse_index(pd_index) + self.index = pd_index + else: + self.index = index = pd.Index([], dtype=object) + index_value = parse_index(index) + + if input_tensor is not None: + shape = input_tensor.shape + elif index is not None: + shape = index.shape + else: + shape = (0,) + + return self.new_series( + inputs, shape=shape, dtype=dtype, index_value=index_value, name=name + ) + + +def series_from_tensor( + tensor: Tensor, + index: Union[TileableType, pd.Index, list] = None, + name: Any = None, + dtype: np.dtype = None, + gpu: bool = None, + sparse: bool = False, +): + if tensor is not None: + if tensor.ndim > 1 or tensor.ndim <= 0: + raise TypeError(f"Not support create Series from {tensor.ndim} dims tensor") + gpu = tensor.op.gpu if gpu is None else gpu + dtype = dtype or tensor.dtype + else: + gpu = None + dtype = dtype or np.dtype(float) + op = SeriesFromTensor(input=tensor, gpu=gpu, sparse=sparse) + return op(tensor, index, dtype, name) diff --git a/python/xorbits/_mars/dataframe/datasource/from_vineyard.py b/python/xorbits/_mars/dataframe/datasource/from_vineyard.py new file mode 100644 index 000000000..4714948d5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/from_vineyard.py @@ -0,0 +1,261 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...core.context import get_context +from ...serialization.serializables import Int32Field, StringField +from ...tensor.datasource.from_vineyard import resolve_vineyard_socket +from ...utils import calc_nsplits, has_unknown_shape, lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +vineyard = lazy_import("vineyard") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +class DataFrameFromVineyard(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_FROM_VINEYARD_CHUNK + + # generated columns for metadata + generated_columns = ["id", "worker_address", "dtypes", "shape", "index", "columns"] + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID in vineyard + object_id = StringField("object_id") + + # a dummy attr to make sure ops have different keys + operator_index = Int32Field("operator_index") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, + object_id=object_id, + _output_types=[OutputType.dataframe], + **kw + ) + + def check_inputs(self, inputs): + # no inputs + if inputs and len(inputs) > 0: + raise ValueError("DataFrame data source has no inputs") + + def _new_chunks(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_tileables(inputs, kws=kws, **kw) + + def __call__(self, shape, dtypes=None, index_value=None, columns_value=None): + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op): + ctx = get_context() + workers = ctx.get_worker_addresses() + + out_chunks = [] + dtypes = pd.Series( + [np.dtype("O")] * len(cls.generated_columns), index=cls.generated_columns + ) + for index, worker in enumerate(workers): + chunk_op = op.copy().reset_key() + chunk_op.expect_worker = worker + chunk_op.operator_index = index + out_chunk = chunk_op.new_chunk( + [], + dtypes=dtypes, + shape=(1, len(cls.generated_columns)), + index=(index, 0), + index_value=parse_index(pd.RangeIndex(0, 1)), + columns_value=parse_index(pd.Index(cls.generated_columns)), + ) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=(np.nan, np.nan), + dtypes=dtypes, + chunks=out_chunks, + nsplits=((np.nan,), (np.nan,)), + # use the same value as `read_csv` + index_value=parse_index(pd.RangeIndex(0, 1)), + columns_value=parse_index(pd.Index(cls.generated_columns)), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + meta = client.get_meta(vineyard.ObjectID(op.object_id)) + chunks, dtypes = [], None + for idx in range(meta["partitions_-size"]): + chunk_meta = meta["partitions_-%d" % idx] + columns = pd.Index(vy_data_utils.from_json(chunk_meta["columns_"])) + shape = (np.nan, len(columns)) + if not chunk_meta.islocal: + continue + if dtypes is None: + dtypes = [] + for idx in 
range(len(columns)): + column_meta = chunk_meta["__values_-value-%d" % idx] + dtype = vy_data_utils.normalize_dtype( + column_meta["value_type_"], + column_meta.get("value_type_meta_", None), + ) + dtypes.append(dtype) + dtypes = pd.Series(dtypes, index=columns) + chunk_index = ( + chunk_meta["partition_index_row_"], + chunk_meta["partition_index_column_"], + ) + # chunk: (chunk_id, worker_address, dtype, shape, index, columns) + chunks.append( + ( + repr(chunk_meta.id), + ctx.worker_address, + dtypes, + shape, + chunk_index, + columns, + ) + ) + + ctx[op.outputs[0].key] = pd.DataFrame(chunks, columns=cls.generated_columns) + + +class DataFrameFromVineyardChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.TENSOR_FROM_VINEYARD_CHUNK + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID of chunk in vineyard + object_id = StringField("object_id") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, object_id=object_id, **kw) + + def __call__(self, meta): + return self.new_dataframe([meta]) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + ctx = get_context() + + in_chunk_keys = [chunk.key for chunk in op.inputs[0].chunks] + out_chunks = [] + chunk_map = dict() + dtypes, columns = None, None + for chunk, infos in zip( + op.inputs[0].chunks, ctx.get_chunks_result(in_chunk_keys) + ): + for _, info in infos.iterrows(): + chunk_op = op.copy().reset_key() + chunk_op.object_id = info["id"] + chunk_op.expect_worker = info["worker_address"] + dtypes = info["dtypes"] + columns = info["columns"] + shape = info["shape"] + chunk_index = info["index"] + chunk_map[chunk_index] = info["shape"] + out_chunk = chunk_op.new_chunk( + [chunk], + shape=shape, + index=chunk_index, + dtypes=dtypes, + index_value=parse_index(pd.RangeIndex(0, -1)), + columns_value=parse_index(pd.Index(columns)), + ) + out_chunks.append(out_chunk) + + nsplits = calc_nsplits(chunk_map) + shape = [np.sum(nsplit) for nsplit in nsplits] + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=shape, + dtypes=dtypes, + chunks=out_chunks, + nsplits=nsplits, + index_value=parse_index(pd.RangeIndex(0, -1)), + columns_value=parse_index(pd.Index(columns)), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + client = vineyard.connect(socket) + ctx[op.outputs[0].key] = client.get(vineyard.ObjectID(op.object_id)) + + +def from_vineyard(df, vineyard_socket=None): + if vineyard is not None and isinstance(df, vineyard.Object): # pragma: no cover + if "vineyard::GlobalDataFrame" not in df.typename: + raise TypeError( + "The input dataframe %r is not a vineyard' GlobalDataFrame" % df + ) + object_id = df.id + else: + object_id = df + if vineyard is not None and isinstance(object_id, vineyard.ObjectID): + object_id = repr(object_id) + metaop = DataFrameFromVineyard( + vineyard_socket=vineyard_socket, + object_id=object_id, + dtype=np.dtype("byte"), + gpu=None, + ) + meta = metaop( + shape=(np.nan,), + dtypes=pd.Series([]), + index_value=parse_index(pd.Index([])), + columns_value=parse_index(pd.Index([])), + ) + op = DataFrameFromVineyardChunk( + vineyard_socket=vineyard_socket, object_id=object_id, gpu=None + ) + return op(meta) diff --git a/python/xorbits/_mars/dataframe/datasource/index.py 
b/python/xorbits/_mars/dataframe/datasource/index.py new file mode 100644 index 000000000..37da1be45 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/index.py @@ -0,0 +1,260 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import BoolField, DataTypeField, IndexField +from ...tensor.utils import get_chunk_slices +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import decide_series_chunk_size, is_cudf, parse_index + + +class IndexDataSource(DataFrameOperand, DataFrameOperandMixin): + """ + Represent data from pandas Index + """ + + _op_type_ = OperandDef.INDEX_DATA_SOURCE + + data = IndexField("data") + dtype = DataTypeField("dtype") + store_data = BoolField("store_data") + + def __init__(self, data=None, dtype=None, gpu=None, store_data=None, **kw): + if dtype is None and data is not None: + dtype = data.dtype + if gpu is None and is_cudf(data): # pragma: no cover + gpu = True + super().__init__( + data=data, + dtype=dtype, + gpu=gpu, + store_data=store_data, + _output_types=[OutputType.index], + **kw + ) + + def __call__(self, shape=None, chunk_size=None, inp=None, name=None, names=None): + if inp is None: + # create from pandas Index + name = name if name is not None else self.data.name + names = names if names is not None else self.data.names + return self.new_index( + None, + shape=shape, + dtype=self.dtype, + index_value=parse_index(self.data, store_data=self.store_data), + name=name, + names=names, + raw_chunk_size=chunk_size, + ) + elif hasattr(inp, "index_value"): + # get index from Mars DataFrame, Series or Index + name = name if name is not None else inp.index_value.name + names = names if names is not None else [name] + if inp.index_value.has_value(): + self.data = data = inp.index_value.to_pandas() + return self.new_index( + None, + shape=(inp.shape[0],), + dtype=data.dtype, + index_value=parse_index(data, store_data=self.store_data), + name=name, + names=names, + raw_chunk_size=chunk_size, + ) + else: + if self.dtype is None: + self.dtype = inp.index_value.to_pandas().dtype + return self.new_index( + [inp], + shape=(inp.shape[0],), + dtype=self.dtype, + index_value=inp.index_value, + name=name, + names=names, + ) + else: + if inp.ndim != 1: + raise ValueError("Index data must be 1-dimensional") + # get index from tensor + dtype = inp.dtype if self.dtype is None else self.dtype + pd_index = pd.Index([], dtype=dtype) + if self.dtype is None: + self.dtype = pd_index.dtype + return self.new_index( + [inp], + shape=inp.shape, + dtype=self.dtype, + index_value=parse_index(pd_index, inp, store_data=self.store_data), + name=name, + names=names, + ) + + @classmethod + def _tile_from_pandas(cls, op): + index = op.outputs[0] + raw_index = op.data + + memory_usage = raw_index.memory_usage(deep=True) + chunk_size = 
index.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_series_chunk_size(index.shape, chunk_size, memory_usage) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + for chunk_index, chunk_shape in zip( + itertools.product(*chunk_size_idxes), itertools.product(*chunk_size) + ): + chunk_op = op.copy().reset_key() + slc = get_chunk_slices(chunk_size, chunk_index) + if is_cudf(raw_index): # pragma: no cover + chunk_op.data = chunk_data = raw_index[slc[0]] + else: + chunk_op.data = chunk_data = raw_index[slc] + out_chunk = chunk_op.new_chunk( + None, + shape=chunk_shape, + dtype=index.dtype, + index=chunk_index, + name=index.name, + index_value=parse_index(chunk_data, store_data=op.store_data), + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_indexes( + None, + index.shape, + dtype=index.dtype, + index_value=index.index_value, + name=index.name, + chunks=out_chunks, + nsplits=chunk_size, + ) + + @classmethod + def _tile_from_dataframe(cls, op): + inp = op.inputs[0] + out = op.outputs[0] + + out_chunks = [] + if inp.ndim == 1: + # series, index + for c in inp.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + dtype=out.dtype, + index=c.index, + index_value=c.index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + nsplits = inp.nsplits + else: + # DataFrame + nsplit = inp.nsplits[1] + axis_1_index = np.argmin(nsplit).item() + for i in range(inp.chunk_shape[0]): + chunk_index = (i, axis_1_index) + c = inp.cix[chunk_index] + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0],), + dtype=out.dtype, + index=(i,), + index_value=c.index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + nsplits = (inp.nsplits[0],) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile_from_tensor(cls, op): + inp = op.inputs[0] + out = op.outputs[0] + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + index_value = parse_index( + out.index_value.to_pandas(), c, store_data=op.store_data + ) + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + dtype=out.dtype, + index=c.index, + index_value=index_value, + name=out.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = inp.nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + if not op.inputs: + # from pandas + return cls._tile_from_pandas(op) + elif hasattr(op.inputs[0], "index_value"): + # from DataFrame or Series + return cls._tile_from_dataframe(op) + else: + # from tensor + return cls._tile_from_tensor(op) + + @classmethod + def execute(cls, ctx, op): + if not op.inputs: + # from pandas + ctx[op.outputs[0].key] = op.data + else: + out = op.outputs[0] + inp = ctx[op.inputs[0].key] + dtype = out.dtype if out.dtype != object else None + if hasattr(inp, "index"): + # DataFrame, Series + ctx[out.key] = pd.Index(inp.index, dtype=dtype, name=out.name) + else: + ctx[out.key] = pd.Index(inp, dtype=dtype, name=out.name) + + +def from_pandas(data, chunk_size=None, gpu=None, sparse=False, store_data=False): + op = IndexDataSource( + data=data, gpu=gpu, sparse=sparse, dtype=data.dtype, store_data=store_data + ) + return op(shape=data.shape, chunk_size=chunk_size) + + +def 
from_tileable(tileable, dtype=None, name=None, names=None): + op = IndexDataSource(gpu=tileable.op.gpu, sparse=tileable.issparse(), dtype=dtype) + return op(inp=tileable, name=name, names=names) diff --git a/python/xorbits/_mars/dataframe/datasource/read_csv.py b/python/xorbits/_mars/dataframe/datasource/read_csv.py new file mode 100644 index 000000000..594a7c273 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_csv.py @@ -0,0 +1,760 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from io import BytesIO +from urllib.parse import urlparse + +import numpy as np +import pandas as pd + +try: + from pyarrow import NativeFile +except ImportError: # pragma: no cover + NativeFile = None + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...lib.filesystem import file_size, get_fs, glob, open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...utils import FixedSizeFileObject, lazy_import, parse_readable_size +from ..arrays import ArrowStringDtype +from ..utils import build_empty_df, contain_arrow_dtype, parse_index, to_arrow_dtypes +from .core import ( + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, + merge_small_files, +) + +cudf = lazy_import("cudf") + + +def _find_delimiter(f, block_size=2**16): + delimiter = b"\n" + if f.tell() == 0: + return 0 + while True: + b = f.read(block_size) + if not b: + return f.tell() + elif delimiter in b: + return f.tell() - len(b) + b.index(delimiter) + 1 + + +def _find_hdfs_start_end(f, offset, size): + # As pyarrow doesn't support `readline` operation (https://github.com/apache/arrow/issues/3838), + # we need to find the start and end of file block manually. + + # Be careful with HdfsFile's seek, it doesn't allow seek beyond EOF. 
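+ # The two seek-and-scan passes below align the byte range to line boundaries: + # seek to `offset` (clamped to the file size) and advance just past the next + # newline to find `start`, then repeat from `offset + size` to find `end`, so + # each chunk parses only complete lines and adjacent chunks neither overlap + # nor skip records.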
+ loc = min(offset, f.size()) + f.seek(loc) + start = _find_delimiter(f) + loc = min(offset + size, f.size()) + f.seek(loc) + end = _find_delimiter(f) + return start, end + + +def _find_chunk_start_end(f, offset, size): + if NativeFile is not None and isinstance(f, NativeFile): + return _find_hdfs_start_end(f, offset, size) + f.seek(offset) + if f.tell() == 0: + start = 0 + else: + f.readline() + start = f.tell() + f.seek(offset + size) + f.readline() + end = f.tell() + return start, end + + +class DataFrameReadCSV( + IncrementalIndexDatasource, + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDataSourceMixin, +): + _op_type_ = OperandDef.READ_CSV + + path = AnyField("path") + names = ListField("names") + sep = StringField("sep") + header = AnyField("header") + index_col = Int32Field("index_col") + compression = StringField("compression") + usecols = AnyField("usecols") + offset = Int64Field("offset") + size = Int64Field("size") + incremental_index = BoolField("incremental_index") + use_arrow_dtype = BoolField("use_arrow_dtype") + keep_usecols_order = BoolField("keep_usecols_order", default=None) + storage_options = DictField("storage_options") + merge_small_files = BoolField("merge_small_files") + merge_small_file_options = DictField("merge_small_file_options") + + def get_columns(self): + return self.usecols + + def set_pruned_columns(self, columns, *, keep_order=None): + self.usecols = columns + self.keep_usecols_order = keep_order + + @classmethod + def _tile_compressed(cls, op): + # Compression does not support break into small parts + df = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_op.offset = 0 + chunk_op.size = file_size(op.path, storage_options=op.storage_options) + shape = df.shape + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(0, 0), + index_value=df.index_value, + columns_value=df.columns_value, + dtypes=df.dtypes, + ) + new_op = op.copy() + nsplits = ((np.nan,), (df.shape[1],)) + return new_op.new_dataframes( + None, + df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=[new_chunk], + nsplits=nsplits, + ) + + @classmethod + def _tile(cls, op: "DataFrameReadCSV"): + if op.compression: + return cls._tile_compressed(op) + + df = op.outputs[0] + chunk_bytes = df.extra_params.chunk_bytes + chunk_bytes = int(parse_readable_size(chunk_bytes)[0]) + + dtypes = df.dtypes + if ( + op.use_arrow_dtype is None + and not op.gpu + and options.dataframe.use_arrow_dtype + ): # pragma: no cover + # check if use_arrow_dtype set on the server side + dtypes = to_arrow_dtypes(df.dtypes) + + path_prefix = "" + if isinstance(op.path, (tuple, list)): + paths = op.path + elif get_fs(op.path, op.storage_options).isdir(op.path): + parsed_path = urlparse(op.path) + if parsed_path.scheme.lower() == "hdfs": + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + paths = get_fs(op.path, op.storage_options).ls(op.path) + else: + paths = glob( + op.path.rstrip("/") + "/*", storage_options=op.storage_options + ) + else: + paths = glob(op.path, storage_options=op.storage_options) + + out_chunks = [] + index_num = 0 + for path in paths: + path = path_prefix + path + total_bytes = file_size(path, storage_options=op.storage_options) + offset = 0 + for _ in range(int(np.ceil(total_bytes * 1.0 / chunk_bytes))): + chunk_op = op.copy().reset_key() + chunk_op.path = path + chunk_op.offset = offset + chunk_op.size = min(chunk_bytes, total_bytes - offset) + shape = (np.nan, len(dtypes)) + index_value = 
parse_index(df.index_value.to_pandas(), path, index_num) + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(index_num, 0), + index_value=index_value, + columns_value=df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + index_num += 1 + offset += chunk_bytes + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (df.shape[1],)) + df = new_op.new_dataframe( + None, + df.shape, + dtypes=dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + if op.merge_small_files: + df = merge_small_files(df, **(op.merge_small_file_options or dict())) + return [df] + + @classmethod + def _pandas_read_csv(cls, f, op): + csv_kwargs = op.extra_params.copy() + out_df = op.outputs[0] + start, end = _find_chunk_start_end(f, op.offset, op.size) + f.seek(start) + b = FixedSizeFileObject(f, end - start) + if hasattr(out_df, "dtypes"): + dtypes = out_df.dtypes + else: + # Output will be a Series in some optimize rules. + dtypes = pd.Series([out_df.dtype], index=[out_df.name]) + if end == start: + # the last chunk may be empty + df = build_empty_df(dtypes) + if op.keep_usecols_order and not isinstance(op.usecols, list): + # convert to Series, if usecols is a scalar + df = df[op.usecols] + else: + if start == 0: + # The first chunk contains header + # As we specify names and dtype, we need to skip header rows + csv_kwargs["header"] = op.header + if op.usecols: + usecols = op.usecols if isinstance(op.usecols, list) else [op.usecols] + else: + usecols = op.usecols + if contain_arrow_dtype(dtypes): + # when keep_default_na is True which is default, + # will replace null value with np.nan, + # which will cause failure when converting to arrow string array + csv_kwargs["keep_default_na"] = False + csv_kwargs["dtype"] = cls._select_arrow_dtype(dtypes) + df = pd.read_csv( + b, + sep=op.sep, + names=op.names, + index_col=op.index_col, + usecols=usecols, + nrows=op.nrows, + **csv_kwargs, + ) + if op.keep_usecols_order: + df = df[op.usecols] + return df + + @classmethod + def _cudf_read_csv(cls, op): # pragma: no cover + if op.usecols: + usecols = op.usecols if isinstance(op.usecols, list) else [op.usecols] + else: + usecols = op.usecols + csv_kwargs = op.extra_params + if op.offset == 0: + df = cudf.read_csv( + op.path, + byte_range=(op.offset, op.size), + sep=op.sep, + usecols=usecols, + **csv_kwargs, + ) + else: + df = cudf.read_csv( + op.path, + byte_range=(op.offset, op.size), + sep=op.sep, + names=op.names, + usecols=usecols, + nrows=op.nrows, + **csv_kwargs, + ) + + if op.keep_usecols_order: + df = df[op.usecols] + return df + + @classmethod + def _contains_arrow_dtype(cls, dtypes): + return any(isinstance(dtype, ArrowStringDtype) for dtype in dtypes) + + @classmethod + def _select_arrow_dtype(cls, dtypes): + return dict( + (c, dtype) + for c, dtype in dtypes.items() + if isinstance(dtype, ArrowStringDtype) + ) + + @classmethod + def execute(cls, ctx, op): + xdf = cudf if op.gpu else pd + out_df = op.outputs[0] + csv_kwargs = op.extra_params.copy() + + with open_file( + op.path, compression=op.compression, storage_options=op.storage_options + ) as f: + if op.compression is not None: + # As we specify names and dtype, we need to skip header rows + csv_kwargs["header"] = op.header + dtypes = op.outputs[0].dtypes + if contain_arrow_dtype(dtypes): + # when keep_default_na is True which is default, + # will replace null value with np.nan, + # which will cause failure when converting to arrow string array + 
csv_kwargs["keep_default_na"] = False + csv_kwargs["dtype"] = cls._select_arrow_dtype(dtypes) + df = xdf.read_csv( + f, + sep=op.sep, + names=op.names, + index_col=op.index_col, + usecols=op.usecols, + nrows=op.nrows, + **csv_kwargs, + ) + if op.keep_usecols_order: + df = df[op.usecols] + else: + df = cls._cudf_read_csv(op) if op.gpu else cls._pandas_read_csv(f, op) + ctx[out_df.key] = df + + def estimate_size(cls, ctx, op): + phy_size = op.size * (op.memory_scale or 1) + ctx[op.outputs[0].key] = (phy_size, phy_size * 2) + + def __call__( + self, index_value=None, columns_value=None, dtypes=None, chunk_bytes=None + ): + self._output_types = [OutputType.dataframe] + shape = (np.nan, len(dtypes)) + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + chunk_bytes=chunk_bytes, + ) + + +def read_csv( + path, + names=None, + sep: str = ",", + index_col=None, + compression=None, + header="infer", + dtype=None, + usecols=None, + nrows=None, + chunk_bytes="64M", + gpu=None, + head_bytes="100k", + head_lines=None, + incremental_index: bool = True, + use_arrow_dtype: bool = None, + storage_options: dict = None, + memory_scale: int = None, + merge_small_files: bool = True, + merge_small_file_options: dict = None, + **kwargs, +): + r""" + Read a comma-separated values (csv) file into DataFrame. + Also supports optionally iterating or breaking of the file + into chunks. + + Parameters + ---------- + path : str + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv, + you can also read from external resources using a URL like: + hdfs://localhost:8020/test.csv. + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + By file-like object, we refer to objects with a ``read()`` method, such as + a file handler (e.g. via builtin ``open`` function) or ``StringIO``. + sep : str, default ',' + Delimiter to use. If sep is None, the C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator by Python's builtin sniffer + tool, ``csv.Sniffer``. In addition, separators longer than 1 character and + different from ``'\s+'`` will be interpreted as regular expressions and + will also force the use of the Python parsing engine. Note that regex + delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. + delimiter : str, default ``None`` + Alias for sep. + header : int, list of int, default 'infer' + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. + names : array-like, optional + List of column names to use. 
If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + index_col : int, str, sequence of int / str, or False, default ``None`` + Column(s) to use as the row labels of the ``DataFrame``, either given as + string name or column index. If a sequence of int / str is given, a + MultiIndex is used. + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g. when you have a malformed file with delimiters at + the end of each line. + usecols : list-like or callable, optional + Return a subset of the columns. If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). For example, a valid list-like + `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a DataFrame from ``data`` with element order preserved use + ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns + in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to True. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. + squeeze : bool, default False + If the parsed data only contains one column then return a Series. + prefix : str, optional + Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... + mangle_dupe_cols : bool, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. + dtype : Type name or dict of column -> type, optional + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32, + 'c': 'Int64'} + Use `str` or `object` together with suitable `na_values` settings + to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + engine : {'c', 'python'}, optional + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete. + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels. + true_values : list, optional + Values to consider as True. + false_values : list, optional + Values to consider as False. + skipinitialspace : bool, default False + Skip spaces after delimiter. + skiprows : list-like, int or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) + at the start of the file. + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. + skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with engine='c'). + nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. 
+ na_values : scalar, str, list-like, or dict, optional + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted as + NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', + '1.#IND', '1.#QNAN', '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', + 'nan', 'null'. + keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. + na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file. + verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. + skip_blank_lines : bool, default True + If True, skip over blank lines rather than interpreting as NaN values. + parse_dates : bool or list of int or names or list of lists or dict, default False + The behavior is as follows: + * boolean. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call + result 'foo' + If a column or index cannot be represented as an array of datetimes, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an object data type. For + non-standard datetime parsing, use ``pd.to_datetime`` after + ``pd.read_csv``. To parse an index or column with a mixture of timezones, + specify ``date_parser`` to be a partially-applied + :func:`pandas.to_datetime` with ``utc=True``. See + :ref:`io.csv.mixed_timezones` for more. + Note: A fast-path exists for iso8601-formatted dates. + infer_datetime_format : bool, default False + If True and `parse_dates` is enabled, pandas will attempt to infer the + format of the datetime strings in the columns, and if it can be inferred, + switch to a faster method of parsing them. In some cases this can increase + the parsing speed by 5-10x. + keep_date_col : bool, default False + If True and `parse_dates` specifies combining multiple columns then + keep the original columns. + date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. 
Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. + dayfirst : bool, default False + DD/MM format dates, international and European format. + cache_dates : bool, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + .. versionadded:: 0.25.0 + iterator : bool, default False + Return TextFileReader object for iteration or getting chunks with + ``get_chunk()``. + chunksize : int, optional + Return TextFileReader object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and + `filepath_or_buffer` is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + decompression). If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. + thousands : str, optional + Thousands separator. + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European data). + lineterminator : str (length 1), optional + Character to break file into lines. Only valid with C parser. + quotechar : str (length 1), optional + The character used to denote the start and end of a quoted item. Quoted + items can include the delimiter and it will be ignored. + quoting : int or csv.QUOTE_* instance, default 0 + Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of + QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). + doublequote : bool, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. + escapechar : str (length 1), optional + One-character string used to escape other characters. + comment : str, optional + Indicates remainder of line should not be parsed. If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter `header` but not by + `skiprows`. For example, if ``comment='#'``, parsing + ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being + treated as the header. + encoding : str, optional + Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python + standard encodings + `_ . + dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. + error_bad_lines : bool, default True + Lines with too many fields (e.g. 
a csv line with too many commas) will by + default cause an exception to be raised, and no DataFrame will be returned. + If False, then these "bad lines" will dropped from the DataFrame that is + returned. + warn_bad_lines : bool, default True + If error_bad_lines is False, and warn_bad_lines is True, a warning for each + "bad line" will be output. + delim_whitespace : bool, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``' '``) will be + used as the sep. Equivalent to setting ``sep='\s+'``. If this option + is set to True, nothing should be passed in for the ``delimiter`` + parameter. + low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set False, or specify the type with the `dtype` parameter. + Note that the entire file is read into a single DataFrame regardless, + use the `chunksize` or `iterator` parameter to return the data in chunks. + (Only valid with C parser). + float_precision : str, optional + Specifies which converter the C engine should use for floating-point + values. The options are `None` for the ordinary converter, + `high` for the high-precision converter, and `round_trip` for the + round-trip converter. + chunk_bytes: int, float or str, optional + Number of chunk bytes. + gpu: bool, default False + If read into cudf DataFrame. + head_bytes: int, float or str, optional + Number of bytes to use in the head of file, mainly for data inference. + head_lines: int, optional + Number of lines to use in the head of file, mainly for data inference. + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + storage_options: dict, optional + Options for storage connection. + merge_small_files: bool, default True + Merge small files whose size is small. + merge_small_file_options: dict + Options for merging small files + + Returns + ------- + DataFrame + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + to_csv : Write DataFrame to a comma-separated values (csv) file. 
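As a rough illustration of the byte-range planning that ``DataFrameReadCSV._tile`` performs above (the sketch below is not part of this module, the file size is hypothetical, and newline alignment is left to ``_find_chunk_start_end`` at execution time):

import numpy as np

def plan_csv_chunks(total_bytes: int, chunk_bytes: int):
    # One chunk per chunk_bytes-sized byte range, as in DataFrameReadCSV._tile;
    # the final chunk simply receives whatever bytes remain.
    n_chunks = int(np.ceil(total_bytes / chunk_bytes))
    return [
        (offset, min(chunk_bytes, total_bytes - offset))
        for offset in range(0, n_chunks * chunk_bytes, chunk_bytes)
    ]

# A hypothetical 200 MiB CSV read with the default chunk_bytes="64M" yields
# four (offset, size) ranges; each range is later snapped to line boundaries
# so that no record is split across chunks.
print(plan_csv_chunks(200 * 1024**2, 64 * 1024**2))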
+ + Examples + -------- + >>> import mars.dataframe as md + >>> from mars.lib.filesystem.oss import build_oss_path + >>> md.read_csv('data.csv') # doctest: +SKIP + >>> # read from HDFS + >>> md.read_csv('hdfs://localhost:8020/test.csv') # doctest: +SKIP + >>> # read from OSS + >>> auth_path = build_oss_path(file_path, access_key_id, access_key_secret, end_point) + >>> md.read_csv(auth_path) + """ + # infer dtypes and columns + if isinstance(path, (list, tuple)): + file_path = path[0] + elif get_fs(path, storage_options).isdir(path): + parsed_path = urlparse(path) + if parsed_path.scheme.lower() == "hdfs": + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + file_path = path_prefix + get_fs(path, storage_options).ls(path)[0] + else: + file_path = glob(path.rstrip("/") + "/*", storage_options)[0] + else: + file_path = glob(path, storage_options)[0] + + with open_file( + file_path, compression=compression, storage_options=storage_options + ) as f: + if head_lines is not None: + b = b"".join([f.readline() for _ in range(head_lines)]) + else: + head_bytes = int(parse_readable_size(head_bytes)[0]) + head_start, head_end = _find_chunk_start_end(f, 0, head_bytes) + f.seek(head_start) + b = f.read(head_end - head_start) + mini_df = pd.read_csv( + BytesIO(b), + sep=sep, + index_col=index_col, + dtype=dtype, + names=names, + header=header, + ) + if header == "infer" and names is not None: + # ignore header as we always specify names + header = None + else: + # replace header if we specify names or header + header = 0 + if names is None: + names = list(mini_df.columns) + if usecols: + usecols = usecols if isinstance(usecols, list) else [usecols] + col_index = sorted(mini_df.columns.get_indexer(usecols)) + mini_df = mini_df.iloc[:, col_index] + + if isinstance(mini_df.index, pd.RangeIndex): + index_value = parse_index(pd.RangeIndex(-1)) + else: + index_value = parse_index(mini_df.index) + columns_value = parse_index(mini_df.columns, store_data=True) + if index_col and not isinstance(index_col, int): + index_col = list(mini_df.columns).index(index_col) + op = DataFrameReadCSV( + path=path, + names=names, + sep=sep, + header=header, + index_col=index_col, + usecols=usecols, + compression=compression, + gpu=gpu, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + storage_options=storage_options, + memory_scale=memory_scale, + merge_small_files=merge_small_files, + merge_small_file_options=merge_small_file_options, + **kwargs, + ) + chunk_bytes = chunk_bytes or options.chunk_store_limit + dtypes = mini_df.dtypes + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if not gpu and use_arrow_dtype: + dtypes = to_arrow_dtypes(dtypes, test_df=mini_df) + ret = op( + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + chunk_bytes=chunk_bytes, + ) + if nrows is not None: + return ret.head(nrows) + return ret diff --git a/python/xorbits/_mars/dataframe/datasource/read_parquet.py b/python/xorbits/_mars/dataframe/datasource/read_parquet.py new file mode 100644 index 000000000..86c3dc63a --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_parquet.py @@ -0,0 +1,695 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict +from urllib.parse import urlparse + +import numpy as np +import pandas as pd + +try: + import pyarrow as pa + import pyarrow.parquet as pq +except ImportError: + pa = None + +try: + import fastparquet +except ImportError: + fastparquet = None + +from ... import opcodes as OperandDef +from ...config import options +from ...lib.filesystem import FileSystem, file_size, get_fs, glob, open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...utils import is_object_dtype, lazy_import +from ..arrays import ArrowStringDtype +from ..operands import OutputType +from ..utils import contain_arrow_dtype, parse_index, to_arrow_dtypes +from .core import ( + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, + merge_small_files, +) + +PARQUET_MEMORY_SCALE = 15 +STRING_FIELD_OVERHEAD = 50 +cudf = lazy_import("cudf") + + +def check_engine(engine): + if engine == "auto": + if pa is not None: + return "pyarrow" + elif fastparquet is not None: # pragma: no cover + return "fastparquet" + else: # pragma: no cover + raise RuntimeError("Please install either pyarrow or fastparquet.") + elif engine == "pyarrow": + if pa is None: # pragma: no cover + raise RuntimeError("Please install pyarrow first.") + return engine + elif engine == "fastparquet": + if fastparquet is None: # pragma: no cover + raise RuntimeError("Please install fastparquet first.") + return engine + else: # pragma: no cover + raise RuntimeError("Unsupported engine {} to read parquet.".format(engine)) + + +def get_engine(engine): + if engine == "pyarrow": + return ArrowEngine() + elif engine == "fastparquet": + return FastpaquetEngine() + else: # pragma: no cover + raise RuntimeError("Unsupported engine {}".format(engine)) + + +class ParquetEngine: + def get_row_num(self, f): + raise NotImplementedError + + def read_dtypes(self, f, **kwargs): + raise NotImplementedError + + def read_to_pandas( + self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + raise NotImplementedError + + def read_group_to_pandas( + self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + raise NotImplementedError + + def read_partitioned_to_pandas( + self, + f, + partitions: Dict, + partition_keys: Dict, + columns=None, + nrows=None, + use_arrow_dtype=None, + **kwargs, + ): + raw_df = self.read_to_pandas( + f, columns=columns, nrows=nrows, use_arrow_dtype=use_arrow_dtype, **kwargs + ) + for col, value in partition_keys.items(): + dictionary = partitions[col] + raw_df[col] = pd.Series( + value, + dtype=pd.CategoricalDtype(categories=dictionary.tolist()), + index=raw_df.index, + ) + return raw_df + + def read_partitioned_dtypes(self, fs: FileSystem, directory, storage_options): + # As ParquetDataset will iterate all files, + # here we just find one file to infer dtypes + current_path = directory + partition_cols = [] + while fs.isdir(current_path): + _, dirs, files = next(fs.walk(current_path)) + dirs = [d for d in dirs if not 
d.startswith(".")] + files = [f for f in files if not f.startswith(".")] + if len(files) == 0: + # directory as partition + partition_cols.append(dirs[0].split("=", 1)[0]) + current_path = os.path.join(current_path, dirs[0]) + elif len(dirs) == 0: + # parquet files in deepest directory + current_path = os.path.join(current_path, files[0]) + else: # pragma: no cover + raise ValueError( + "Files and directories are mixed in an intermediate directory" + ) + + # current path is now a parquet file + with open_file(current_path, storage_options=storage_options) as f: + dtypes = self.read_dtypes(f) + for partition in partition_cols: + dtypes[partition] = pd.CategoricalDtype() + return dtypes + + +def _parse_prefix(path): + path_prefix = "" + if isinstance(path, str): + parsed_path = urlparse(path) + if parsed_path.scheme: + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + return path_prefix + + +class ArrowEngine(ParquetEngine): + def get_row_num(self, f): + file = pq.ParquetFile(f) + return file.metadata.num_rows + + def read_dtypes(self, f, **kwargs): + file = pq.ParquetFile(f) + return file.schema_arrow.empty_table().to_pandas().dtypes + + @classmethod + def _table_to_pandas(cls, t, nrows=None, use_arrow_dtype=None): + if nrows is not None: + t = t.slice(0, nrows) + if use_arrow_dtype: + df = t.to_pandas(types_mapper={pa.string(): ArrowStringDtype()}.get) + else: + df = t.to_pandas() + return df + + def read_to_pandas( + self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + file = pq.ParquetFile(f) + t = file.read(columns=columns, **kwargs) + return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype) + + def read_group_to_pandas( + self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + file = pq.ParquetFile(f) + t = file.read_row_group(group_index, columns=columns, **kwargs) + return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype) + + +class FastpaquetEngine(ParquetEngine): + def get_row_num(self, f): + file = fastparquet.ParquetFile(f) + return file.count() + + def read_dtypes(self, f, **kwargs): + file = fastparquet.ParquetFile(f) + dtypes_dict = file._dtypes() + return pd.Series(dict((c, dtypes_dict[c]) for c in file.columns)) + + def read_to_pandas( + self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs + ): + file = fastparquet.ParquetFile(f) + df = file.to_pandas(columns, **kwargs) + if nrows is not None: + df = df.head(nrows) + if use_arrow_dtype: + df = df.astype(to_arrow_dtypes(df.dtypes).to_dict()) + return df + + +class CudfEngine: + @classmethod + def read_to_cudf(cls, file, columns: list = None, nrows: int = None, **kwargs): + df = cudf.read_parquet(file, columns=columns, **kwargs) + if nrows is not None: + df = df.head(nrows) + return df + + def read_group_to_cudf( + self, file, group_index: int, columns: list = None, nrows: int = None, **kwargs + ): + return self.read_to_cudf( + file, columns=columns, nrows=nrows, row_groups=group_index, **kwargs + ) + + @classmethod + def read_partitioned_to_cudf( + cls, + file, + partitions: Dict, + partition_keys: Dict, + columns=None, + nrows=None, + **kwargs, + ): + # cudf will read entire partitions even if only one partition provided, + # so we just read with pyarrow and convert to cudf DataFrame + file = pq.ParquetFile(file) + t = file.read(columns=columns, **kwargs) + t = t.slice(0, nrows) if nrows is not None else t + t = pa.table(t.columns, names=t.column_names) + raw_df = cudf.DataFrame.from_arrow(t) + for col, value 
in partition_keys.items(): + dictionary = partitions[col].tolist() + codes = cudf.core.column.as_column( + dictionary.index(value), length=len(raw_df) + ) + raw_df[col] = cudf.core.column.build_categorical_column( + categories=dictionary, + codes=codes, + size=codes.size, + offset=codes.offset, + ordered=False, + ) + return raw_df + + +class DataFrameReadParquet( + IncrementalIndexDatasource, + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDataSourceMixin, +): + _op_type_ = OperandDef.READ_PARQUET + + path = AnyField("path") + engine = StringField("engine") + columns = ListField("columns") + use_arrow_dtype = BoolField("use_arrow_dtype") + groups_as_chunks = BoolField("groups_as_chunks") + group_index = Int32Field("group_index") + read_kwargs = DictField("read_kwargs") + incremental_index = BoolField("incremental_index") + storage_options = DictField("storage_options") + is_partitioned = BoolField("is_partitioned") + merge_small_files = BoolField("merge_small_files") + merge_small_file_options = DictField("merge_small_file_options") + # for chunk + partitions = DictField("partitions", default=None) + partition_keys = DictField("partition_keys", default=None) + num_group_rows = Int64Field("num_group_rows", default=None) + # as read meta may be too time-consuming when number of files is large, + # thus we only read first file to get row number and raw file size + first_chunk_row_num = Int64Field("first_chunk_row_num") + first_chunk_raw_bytes = Int64Field("first_chunk_raw_bytes") + + def get_columns(self): + return self.columns + + def set_pruned_columns(self, columns, *, keep_order=None): + self.columns = columns + + @classmethod + def _to_arrow_dtypes(cls, dtypes, op): + if ( + op.use_arrow_dtype is None + and not op.gpu + and options.dataframe.use_arrow_dtype + ): # pragma: no cover + # check if use_arrow_dtype set on the server side + dtypes = to_arrow_dtypes(dtypes) + return dtypes + + @classmethod + def _tile_partitioned(cls, op: "DataFrameReadParquet"): + out_df = op.outputs[0] + shape = (np.nan, out_df.shape[1]) + dtypes = cls._to_arrow_dtypes(out_df.dtypes, op) + dataset = pq.ParquetDataset(op.path, use_legacy_dataset=False) + + path_prefix = _parse_prefix(op.path) + + chunk_index = 0 + out_chunks = [] + first_chunk_row_num, first_chunk_raw_bytes = None, None + for i, fragment in enumerate(dataset.fragments): + chunk_op = op.copy().reset_key() + chunk_op.path = chunk_path = path_prefix + fragment.path + relpath = os.path.relpath(chunk_path, op.path) + partition_keys = dict( + tuple(s.split("=")) for s in relpath.split(os.sep)[:-1] + ) + chunk_op.partition_keys = partition_keys + chunk_op.partitions = dict( + zip( + dataset.partitioning.schema.names, dataset.partitioning.dictionaries + ) + ) + if i == 0: + first_row_group = fragment.row_groups[0] + first_chunk_raw_bytes = first_row_group.total_byte_size + first_chunk_row_num = first_row_group.num_rows + chunk_op.first_chunk_row_num = first_chunk_row_num + chunk_op.first_chunk_raw_bytes = first_chunk_raw_bytes + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def 
_tile_no_partitioned(cls, op: "DataFrameReadParquet"): + chunk_index = 0 + out_chunks = [] + out_df = op.outputs[0] + + dtypes = cls._to_arrow_dtypes(out_df.dtypes, op) + shape = (np.nan, out_df.shape[1]) + + path_prefix = "" + if isinstance(op.path, (tuple, list)): + paths = op.path + elif get_fs(op.path, op.storage_options).isdir(op.path): + parsed_path = urlparse(op.path) + if parsed_path.scheme.lower() == "hdfs": + path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" + paths = get_fs(op.path, op.storage_options).ls(op.path) + else: + paths = glob(op.path, storage_options=op.storage_options) + + first_chunk_row_num, first_chunk_raw_bytes = None, None + for i, pth in enumerate(paths): + pth = path_prefix + pth + if i == 0: + with open_file(pth, storage_options=op.storage_options) as f: + first_chunk_row_num = get_engine(op.engine).get_row_num(f) + first_chunk_raw_bytes = file_size( + pth, storage_options=op.storage_options + ) + + if op.groups_as_chunks: + num_row_groups = pq.ParquetFile(pth).num_row_groups + for group_idx in range(num_row_groups): + chunk_op = op.copy().reset_key() + chunk_op.path = pth + chunk_op.group_index = group_idx + chunk_op.first_chunk_row_num = first_chunk_row_num + chunk_op.first_chunk_raw_bytes = first_chunk_raw_bytes + chunk_op.num_group_rows = num_row_groups + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + else: + chunk_op = op.copy().reset_key() + chunk_op.path = pth + chunk_op.first_chunk_row_num = first_chunk_row_num + chunk_op.first_chunk_raw_bytes = first_chunk_raw_bytes + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def _tile(cls, op: "DataFrameReadParquet"): + if op.is_partitioned: + tiled = cls._tile_partitioned(op) + else: + tiled = cls._tile_no_partitioned(op) + if op.merge_small_files: + tiled = [ + merge_small_files(tiled[0], **(op.merge_small_file_options or dict())) + ] + return tiled + + @classmethod + def _execute_partitioned(cls, ctx, op: "DataFrameReadParquet"): + out = op.outputs[0] + engine = get_engine(op.engine) + with open_file(op.path, storage_options=op.storage_options) as f: + ctx[out.key] = engine.read_partitioned_to_pandas( + f, + op.partitions, + op.partition_keys, + columns=op.columns, + nrows=op.nrows, + use_arrow_dtype=op.use_arrow_dtype, + **op.read_kwargs or dict(), + ) + + @classmethod + def _pandas_read_parquet(cls, ctx: dict, op: "DataFrameReadParquet"): + out = op.outputs[0] + path = op.path + + if op.partitions is not None: + return cls._execute_partitioned(ctx, op) + + engine = get_engine(op.engine) + with open_file(path, storage_options=op.storage_options) as f: + use_arrow_dtype = contain_arrow_dtype(out.dtypes) + if op.groups_as_chunks: + df = engine.read_group_to_pandas( + f, + op.group_index, + columns=op.columns, + nrows=op.nrows, + use_arrow_dtype=use_arrow_dtype, + **op.read_kwargs or dict(), + ) + else: + df = engine.read_to_pandas( + f, + columns=op.columns, + 
nrows=op.nrows, + use_arrow_dtype=use_arrow_dtype, + **op.read_kwargs or dict(), + ) + + ctx[out.key] = df + + @classmethod + def _cudf_read_parquet(cls, ctx: dict, op: "DataFrameReadParquet"): + out = op.outputs[0] + path = op.path + + engine = CudfEngine() + if os.path.exists(path): + file = op.path + close = lambda: None + else: # pragma: no cover + file = open_file(path, storage_options=op.storage_options) + close = file.close + + try: + if op.partitions is not None: + ctx[out.key] = engine.read_partitioned_to_cudf( + file, + op.partitions, + op.partition_keys, + columns=op.columns, + nrows=op.nrows, + **op.read_kwargs or dict(), + ) + else: + if op.groups_as_chunks: + df = engine.read_group_to_cudf( + file, + op.group_index, + columns=op.columns, + nrows=op.nrows, + **op.read_kwargs or dict(), + ) + else: + df = engine.read_to_cudf( + file, + columns=op.columns, + nrows=op.nrows, + **op.read_kwargs or dict(), + ) + ctx[out.key] = df + finally: + close() + + @classmethod + def execute(cls, ctx, op: "DataFrameReadParquet"): + if not op.gpu: + cls._pandas_read_parquet(ctx, op) + else: + cls._cudf_read_parquet(ctx, op) + + @classmethod + def estimate_size(cls, ctx, op: "DataFrameReadParquet"): + first_chunk_row_num = op.first_chunk_row_num + first_chunk_raw_bytes = op.first_chunk_raw_bytes + raw_bytes = file_size(op.path, storage_options=op.storage_options) + if op.num_group_rows: + raw_bytes = ( + np.ceil(np.divide(raw_bytes, op.num_group_rows)).astype(np.int64).item() + ) + + estimated_row_num = ( + np.ceil(first_chunk_row_num * (raw_bytes / first_chunk_raw_bytes)) + .astype(np.int64) + .item() + ) + phy_size = raw_bytes * (op.memory_scale or PARQUET_MEMORY_SCALE) + n_strings = len([dt for dt in op.outputs[0].dtypes if is_object_dtype(dt)]) + pd_size = phy_size + n_strings * estimated_row_num * STRING_FIELD_OVERHEAD + ctx[op.outputs[0].key] = (pd_size, pd_size + phy_size) + + def __call__(self, index_value=None, columns_value=None, dtypes=None): + self._output_types = [OutputType.dataframe] + shape = (np.nan, len(dtypes)) + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + +def read_parquet( + path, + engine: str = "auto", + columns: list = None, + groups_as_chunks: bool = False, + use_arrow_dtype: bool = None, + incremental_index: bool = False, + storage_options: dict = None, + memory_scale: int = None, + merge_small_files: bool = True, + merge_small_file_options: dict = None, + gpu: bool = None, + **kwargs, +): + """ + Load a parquet object from the file path, returning a DataFrame. + + Parameters + ---------- + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. + For file URLs, a host is expected. A local file could be: + ``file://localhost/path/to/table.parquet``. + A file URL can also be a path to a directory that contains multiple + partitioned parquet files. Both pyarrow and fastparquet support + paths to directories as well as file URLs. A directory path could be: + ``file://localhost/path/to/tables``. + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + Parquet library to use. The default behavior is to try 'pyarrow', + falling back to 'fastparquet' if 'pyarrow' is unavailable. + columns : list, default=None + If not None, only these columns will be read from the file. 
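For the partitioned directory layout described under ``path``, a minimal sketch (not part of this module, with hypothetical paths) of how hive-style partition keys are recovered from a fragment path, mirroring the ``relpath`` handling in ``DataFrameReadParquet._tile_partitioned`` above:

import os

def partition_keys(fragment_path: str, dataset_root: str) -> dict:
    # Every directory level except the file itself is expected to look like
    # "column=value", which is how _tile_partitioned fills chunk_op.partition_keys.
    relpath = os.path.relpath(fragment_path, dataset_root)
    return dict(tuple(part.split("=", 1)) for part in relpath.split(os.sep)[:-1])

# partition_keys("/data/sales/year=2023/month=01/part-0.parquet", "/data/sales")
# -> {"year": "2023", "month": "01"}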
+ groups_as_chunks : bool, default False + if True, each row group correspond to a chunk. + if False, each file correspond to a chunk. + Only available for 'pyarrow' engine. + incremental_index: bool, default False + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + storage_options: dict, optional + Options for storage connection. + memory_scale: int, optional + Scale that real memory occupation divided with raw file size. + merge_small_files: bool, default True + Merge small files whose size is small. + merge_small_file_options: dict + Options for merging small files + **kwargs + Any additional kwargs are passed to the engine. + + Returns + ------- + Mars DataFrame + """ + + engine_type = check_engine(engine) + engine = get_engine(engine_type) + + single_path = path[0] if isinstance(path, list) else path + fs = get_fs(single_path, storage_options) + is_partitioned = False + if fs.isdir(single_path): + paths = fs.ls(path) + if all(fs.isdir(p) for p in paths): + # If all are directories, it is read as a partitioned dataset. + dtypes = engine.read_partitioned_dtypes(fs, path, storage_options) + is_partitioned = True + else: + with fs.open(paths[0], mode="rb") as f: + dtypes = engine.read_dtypes(f) + else: + if not isinstance(path, list): + file_path = glob(path, storage_options=storage_options)[0] + else: + file_path = path[0] + + with open_file(file_path, storage_options=storage_options) as f: + dtypes = engine.read_dtypes(f) + + if columns: + dtypes = dtypes[columns] + + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if use_arrow_dtype: + dtypes = to_arrow_dtypes(dtypes) + + index_value = parse_index(pd.RangeIndex(-1)) + columns_value = parse_index(dtypes.index, store_data=True) + op = DataFrameReadParquet( + path=path, + engine=engine_type, + columns=columns, + groups_as_chunks=groups_as_chunks, + use_arrow_dtype=use_arrow_dtype, + read_kwargs=kwargs, + incremental_index=incremental_index, + storage_options=storage_options, + is_partitioned=is_partitioned, + memory_scale=memory_scale, + merge_small_files=merge_small_files, + merge_small_file_options=merge_small_file_options, + gpu=gpu, + ) + return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes) diff --git a/python/xorbits/_mars/dataframe/datasource/read_raydataset.py b/python/xorbits/_mars/dataframe/datasource/read_raydataset.py new file mode 100644 index 000000000..76191437c --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_raydataset.py @@ -0,0 +1,249 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import warnings + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import ( + AnyField, + BoolField, + Int64Field, + ListField, + ReferenceField, +) +from ..utils import lazy_import, parse_index, tokenize +from .core import ( + HeadOptimizedDataSource, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, +) + +ray = lazy_import("ray") +# Ray Datasets is available in early preview at ray.data with Ray 1.6+ +# (and ray.experimental.data in Ray 1.5) +ray_dataset = lazy_import("ray.data", rename="ray_dataset") +ray_exp_dataset = lazy_import("ray.experimental.data", rename="ray_exp_dataset") +real_ray_dataset = ray_dataset or ray_exp_dataset + + +class DataFrameReadRayDataset( + IncrementalIndexDatasource, IncrementalIndexDataSourceMixin +): + _op_type_ = OperandDef.READ_RAYDATASET + + refs = AnyField("refs", default=None) + columns = ListField("columns", default=None) + incremental_index = BoolField("incremental_index", default=None) + nrows = Int64Field("nrows", default=None) + + @classmethod + def _tile_partitioned(cls, op: "DataFrameReadRayDataset"): + out_df = op.outputs[0] + shape = (np.nan, out_df.shape[1]) + dtypes = out_df.dtypes + dataset = op.refs + + chunk_index = 0 + out_chunks = [] + for object_ref in dataset: + chunk_op = op.copy().reset_key() + chunk_op._refs = [object_ref] + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(chunk_index, 0), + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=dtypes, + ) + out_chunks.append(new_chunk) + chunk_index += 1 + + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def _tile(cls, op): + return cls._tile_partitioned(op) + + @classmethod + def execute(cls, ctx, op: "DataFrameReadRayDataset"): + out = op.outputs[0] + ref = op.refs[0] + + df = ray.get(ref) + ctx[out.key] = df + + def __call__(self, index_value=None, columns_value=None, dtypes=None): + shape = (np.nan, len(dtypes)) + return self.new_dataframe( + None, + shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + +def read_ray_dataset(ds, columns=None, incremental_index=False, **kwargs): + assert isinstance(ds, real_ray_dataset.Dataset) + refs = ds.to_pandas_refs() + schema = ds.schema() + + import pyarrow as pa + + try: + from ray.data._internal.pandas_block import PandasBlockSchema + except ImportError: + try: + from ray.data.impl.pandas_block import PandasBlockSchema + except ImportError: # pragma: no cover + PandasBlockSchema = type(None) + + if isinstance(schema, PandasBlockSchema): + dtypes = pd.Series(schema.types, index=schema.names) + elif isinstance(schema, pa.Schema): + dtypes = schema.empty_table().to_pandas().dtypes + else: + raise NotImplementedError(f"Unsupported format of schema {schema}") + + index_value = parse_index(pd.RangeIndex(-1)) + columns_value = parse_index(dtypes.index, store_data=True) + op = DataFrameReadRayDataset( + refs=refs, columns=columns, incremental_index=incremental_index + ) + return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes) + + +# keep it for back compatibility +@functools.wraps(read_ray_dataset) +def read_raydataset(*args, **kwargs): + warnings.warn( + "read_raydataset has been renamed to read_ray_dataset", + DeprecationWarning, + ) + return 
read_ray_dataset(*args, **kwargs) + + +class DataFrameReadMLDataset(HeadOptimizedDataSource): + _op_type_ = OperandDef.READ_MLDATASET + + mldataset = ReferenceField("mldataset", "ray.util.data.MLDataset", default=None) + columns = ListField("columns", default=None) + + def __init__(self, **kw): + super().__init__(_output_types=[OutputType.dataframe], **kw) + + def _update_key(self): + """We can't direct generate token for mldataset when we use + ray client, so we use all mldataset's actor_id to generate + token. + """ + datas = [] + for value in self._values_: + if isinstance(value, ray.util.data.MLDataset): + actor_sets = [ + ([str(actor) for actor in actor_set.actors], actor_set.transforms) + for actor_set in value.actor_sets + ] + datas.append(actor_sets) + continue + datas.append(value) + self._obj_set("_key", tokenize(type(self).__name__, *datas)) + return self + + def __call__(self, dtypes, nrows: int): + columns_value = parse_index(dtypes.index, store_data=True) + index_value = parse_index(pd.RangeIndex(nrows)) + return self.new_dataframe( + None, + (nrows, len(dtypes)), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameReadMLDataset"): + count_iter = op.mldataset.for_each(lambda df: len(df)) + nsplits = [sum(shard) for shard in count_iter.shards()] + nsplits_acc = np.cumsum(nsplits) + out_df = op.outputs[0] + out_chunks = [] + for shard_index in range(op.mldataset.num_shards()): + chunk_op = op.copy().reset_key() + # Make chunk key unique, otherwise all chunk will have same key. + # See `DataFrameFromRecords#tile` + chunk_op.extra_params["shard_index"] = shard_index + shape = (nsplits[shard_index], out_df.shape[1]) + begin_index = nsplits_acc[shard_index] - nsplits[shard_index] + end_index = nsplits_acc[shard_index] + index = parse_index(pd.RangeIndex(start=begin_index, stop=end_index)) + new_chunk = chunk_op.new_chunk( + None, + shape=shape, + index=(shard_index, 0), + index_value=index, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(new_chunk) + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (out_df.shape[1],)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + shard = op.mldataset.get_shard(chunk.index[0]) + pd_dfs = list(shard) + pd_df = pd.concat(pd_dfs).set_index(chunk.index_value.to_pandas()) + ctx[chunk.key] = pd_df + + +def read_ray_mldataset(mldataset, **kwargs): + import ray.util.data + + assert isinstance(mldataset, ray.util.data.MLDataset) + not_empty_dfs = mldataset.filter(lambda df: len(df) > 0).take(1) + if not not_empty_dfs: + raise ValueError( + f"MLDataset {mldataset} is empty, please provide an non-empty dataset." + ) + df_record: pd.DataFrame = not_empty_dfs[0] + columns = df_record.columns.names + nrows = sum(mldataset.for_each(lambda df: len(df)).gather_async()) + op = DataFrameReadMLDataset(mldataset=mldataset, columns=columns, nrows=nrows) + return op(df_record.dtypes, nrows) diff --git a/python/xorbits/_mars/dataframe/datasource/read_sql.py b/python/xorbits/_mars/dataframe/datasource/read_sql.py new file mode 100644 index 000000000..a239ef9cc --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/read_sql.py @@ -0,0 +1,935 @@ +# Copyright 2022-2023 XProbe Inc. 
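A minimal usage sketch for the Ray datasource above: read_ray_dataset wraps each pandas block reference of a Ray Dataset as one Mars chunk via to_pandas_refs(). The import path and the two toy blocks below are illustrative only; a working Ray installation and an executing Mars session are assumed, and the snippet is a sketch rather than part of this patch.

import pandas as pd
import ray

from xorbits._mars.dataframe.datasource.read_raydataset import read_ray_dataset

# two pandas blocks -> a Ray Dataset with two blocks
ds = ray.data.from_pandas(
    [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3, 4]})]
)

# each block reference becomes one chunk of the lazy Mars DataFrame
mdf = read_ray_dataset(ds)
print(mdf.dtypes)  # dtypes are inferred from the dataset schema, no execution needed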
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import binascii +import datetime +import pickle +import uuid +from typing import List, Union + +import cloudpickle +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core.context import Context +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + BytesField, + Float64Field, + Int64Field, + ListField, + StringField, +) +from ...tensor.utils import normalize_chunk_sizes +from ...typing import OperandType, TileableType +from ..arrays import ArrowStringDtype +from ..utils import create_sa_connection, parse_index, to_arrow_dtypes +from .core import ( + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDatasource, + IncrementalIndexDataSourceMixin, +) + + +class DataFrameReadSQLLogicKeyGenerator(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + fields_to_tokenize = [ + getattr(self, k, None) + for k in [ + "table_or_sql", + "schema", + "coerce_float", + "parse_dates", + "columns", + "method", + "incremental_index", + "use_arrow_dtype", + "partition_col", + ] + ] + return super()._get_logic_key_token_values() + fields_to_tokenize + + +class DataFrameReadSQL( + IncrementalIndexDatasource, + ColumnPruneSupportedDataSourceMixin, + IncrementalIndexDataSourceMixin, + DataFrameReadSQLLogicKeyGenerator, +): + _op_type_ = OperandDef.READ_SQL + + table_or_sql = AnyField("table_or_sql") + selectable = BytesField( + "selectable", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + con = AnyField("con") + schema = StringField("schema") + index_col = AnyField("index_col") + coerce_float = BoolField("coerce_float") + parse_dates = AnyField("parse_dates") + columns = ListField("columns") + engine_kwargs = BytesField( + "engine_kwargs", + on_serialize=cloudpickle.dumps, + on_deserialize=cloudpickle.loads, + ) + row_memory_usage = Float64Field("row_memory_usage") + method = StringField("method") + incremental_index = BoolField("incremental_index") + use_arrow_dtype = BoolField("use_arrow_dtype") + chunk_size = AnyField("chunk_size") + # for chunks + offset = Int64Field("offset") + partition_col = StringField("partition_col") + num_partitions = Int64Field("num_partitions") + low_limit = AnyField("low_limit") + high_limit = AnyField("high_limit") + left_end = BoolField("left_end") + right_end = BoolField("right_end") + nrows = Int64Field("nrows", default=None) + + def get_columns(self): + return self.columns + + def set_pruned_columns(self, columns, *, keep_order=None): + self.columns = columns + + def _get_selectable(self, engine_or_conn, columns=None): + import sqlalchemy as sa + from sqlalchemy import sql + from sqlalchemy.exc import SQLAlchemyError + + # process table_name + if self.selectable is not None: + selectable = self.selectable + else: + if isinstance(self.table_or_sql, sa.Table): + selectable = self.table_or_sql + self.table_or_sql = selectable.name + else: + m = 
sa.MetaData() + try: + selectable = sa.Table( + self.table_or_sql, + m, + autoload=True, + autoload_with=engine_or_conn, + schema=self.schema, + ) + except SQLAlchemyError: + temp_name_1 = "t1_" + binascii.b2a_hex(uuid.uuid4().bytes).decode() + temp_name_2 = "t2_" + binascii.b2a_hex(uuid.uuid4().bytes).decode() + if columns: + selectable = ( + sql.text(self.table_or_sql) + .columns(*[sql.column(c) for c in columns]) + .alias(temp_name_2) + ) + else: + selectable = sql.select( + "*", + from_obj=sql.text( + f"({self.table_or_sql}) AS {temp_name_1}" + ), + ).alias(temp_name_2) + self.selectable = selectable + return selectable + + def _collect_info(self, engine_or_conn, selectable, columns, test_rows): + from sqlalchemy import sql + + # fetch test DataFrame + if columns: + query = sql.select( + [sql.column(c) for c in columns], from_obj=selectable + ).limit(test_rows) + else: + query = sql.select(selectable.columns, from_obj=selectable).limit(test_rows) + test_df = pd.read_sql( + query, + engine_or_conn, + index_col=self.index_col, + coerce_float=self.coerce_float, + parse_dates=self.parse_dates, + ) + if len(test_df) == 0: + self.row_memory_usage = None + else: + self.row_memory_usage = test_df.memory_usage( + deep=True, index=True + ).sum() / len(test_df) + + if self.method == "offset": + # fetch size + size = list( + engine_or_conn.execute( + sql.select([sql.func.count()]).select_from(selectable) + ) + )[0][0] + shape = (size, test_df.shape[1]) + else: + shape = (np.nan, test_df.shape[1]) + + return test_df, shape + + def __call__(self, test_rows, chunk_size): + import sqlalchemy as sa + from sqlalchemy.sql import elements + + with create_sa_connection(self.con, **(self.engine_kwargs or dict())) as con: + self.con = str(con.engine.url) + selectable = self._get_selectable(con) + + # process index_col + index_col = self.index_col + if index_col is not None: + if not isinstance(index_col, (list, tuple)): + index_col = (index_col,) + new_index_col = [] + for col in index_col: + if isinstance(col, (sa.Column, elements.Label)): + new_index_col.append(col.name) + elif isinstance(col, str): + new_index_col.append(col) + elif col is not None: + raise TypeError(f"unknown index_col type: {type(col)}") + self.index_col = new_index_col + + # process columns + columns = self.columns or [] + new_columns = [] + for col in columns: + if isinstance(col, str): + new_columns.append(col) + else: + new_columns.append(col.name) + self.columns = new_columns + + if self.columns: + collect_cols = self.columns + (self.index_col or []) + else: + collect_cols = [] + test_df, shape = self._collect_info( + con, selectable, collect_cols, test_rows + ) + + # reconstruct selectable using known column names + if not collect_cols: + self.columns = list(test_df.columns) + if self.selectable is not None: + self.selectable = None + self._get_selectable( + con, columns=self.columns + (self.index_col or []) + ) + + if self.method == "partition": + if not self.index_col or self.partition_col not in self.index_col: + part_frame = test_df + else: + part_frame = test_df.index.to_frame() + + if not issubclass( + part_frame[self.partition_col].dtype.type, + (np.number, np.datetime64), + ): + raise TypeError( + "Type of partition column should be numeric or datetime, " + f"now it is {test_df[self.partition_col].dtype}" + ) + + if isinstance(test_df.index, pd.RangeIndex): + index_value = parse_index( + pd.RangeIndex(shape[0] if not np.isnan(shape[0]) else -1), + str(selectable), + self.con, + ) + else: + index_value = 
parse_index(test_df.index) + + columns_value = parse_index(test_df.columns, store_data=True) + + dtypes = test_df.dtypes + use_arrow_dtype = self.use_arrow_dtype + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if use_arrow_dtype: + dtypes = to_arrow_dtypes(dtypes, test_df=test_df) + + return self.new_dataframe( + None, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + raw_chunk_size=chunk_size, + ) + + @classmethod + def _tile_offset(cls, op: "DataFrameReadSQL"): + df = op.outputs[0] + + if op.row_memory_usage is not None: + # Data selected + chunk_size = df.extra_params.raw_chunk_size or options.chunk_size + if chunk_size is None: + chunk_size = ( + int(options.chunk_store_limit / op.row_memory_usage), + df.shape[1], + ) + row_chunk_sizes = normalize_chunk_sizes(df.shape, chunk_size)[0] + else: + # No data selected + row_chunk_sizes = (0,) + offsets = np.cumsum((0,) + row_chunk_sizes).tolist() + + out_chunks = [] + for i, row_size in enumerate(row_chunk_sizes): + chunk_op = op.copy().reset_key() + chunk_op._row_memory_usage = None # no need for chunk + offset = chunk_op.offset = offsets[i] + if df.index_value.has_value(): + # range index + index_value = parse_index( + df.index_value.to_pandas()[offset : offsets[i + 1]] + ) + else: + index_value = parse_index( + df.index_value.to_pandas(), + op.table_or_sql or str(op.selectable), + op.con, + i, + row_size, + ) + out_chunk = chunk_op.new_chunk( + None, + shape=(row_size, df.shape[1]), + columns_value=df.columns_value, + index_value=index_value, + dtypes=df.dtypes, + index=(i, 0), + ) + out_chunks.append(out_chunk) + + nsplits = (row_chunk_sizes, (df.shape[1],)) + new_op = op.copy() + return new_op.new_dataframes( + None, chunks=out_chunks, nsplits=nsplits, **df.params + ) + + def _parse_datetime(self, val): + if isinstance(self.parse_dates, list): + return pd.to_datetime(val) + args = self.parse_dates[self.partition_col] + args = {"format": args} if isinstance(args, str) else args + return pd.to_datetime(val, **args) + + @classmethod + def _tile_partition(cls, op: "DataFrameReadSQL"): + df = op.outputs[0] + + selectable = op._get_selectable(None) + + if op.low_limit is None or op.high_limit is None: + import sqlalchemy as sa + from sqlalchemy import sql + + engine = sa.create_engine(op.con, **(op.engine_kwargs or dict())) + try: + part_col = selectable.columns[op.partition_col] + range_results = engine.execute( + sql.select([sql.func.min(part_col), sql.func.max(part_col)]) + ) + + op.low_limit, op.high_limit = next(range_results) + if op.parse_dates and op.partition_col in op.parse_dates: + op.low_limit = op._parse_datetime(op.low_limit) + op.high_limit = op._parse_datetime(op.high_limit) + finally: + engine.dispose() + + if isinstance(op.low_limit, (datetime.datetime, np.datetime64, pd.Timestamp)): + seps = pd.date_range(op.low_limit, op.high_limit, op.num_partitions + 1) + else: + seps = np.linspace( + op.low_limit, op.high_limit, op.num_partitions + 1, endpoint=True + ) + + out_chunks = [] + for i, (start, end) in enumerate(zip(seps, seps[1:])): + chunk_op = op.copy().reset_key() + chunk_op.row_memory_usage = None # no need for chunk + chunk_op.num_partitions = None + chunk_op.low_limit = start + chunk_op.high_limit = end + chunk_op.left_end = i == 0 + chunk_op.right_end = i == op.num_partitions - 1 + + if df.index_value.has_value(): + # range index + index_value = parse_index(-1, chunk_op.key, chunk_op.index_value.key) + else: + index_value = parse_index( + 
df.index_value.to_pandas(), str(selectable), op.con, i + ) + out_chunk = chunk_op.new_chunk( + None, + shape=(np.nan, df.shape[1]), + columns_value=df.columns_value, + index_value=index_value, + dtypes=df.dtypes, + index=(i, 0), + ) + out_chunks.append(out_chunk) + + nsplits = ((np.nan,) * len(out_chunks), (df.shape[1],)) + new_op = op.copy() + return new_op.new_dataframes( + None, chunks=out_chunks, nsplits=nsplits, **df.params + ) + + @classmethod + def tile(cls, op: "DataFrameReadSQL"): + if op.method == "offset": + return cls._tile_offset(op) + else: + return cls._tile_partition(op) + + @classmethod + def post_tile(cls, op: OperandType, results: List[TileableType]): + if op.method != "offset": + # method `offset` knows shape of each chunk + # just skip incremental process + return super().post_tile(op, results) + + @classmethod + def execute(cls, ctx, op: "DataFrameReadSQL"): + import sqlalchemy as sa + + def _adapt_datetime(dt): + if isinstance(dt, np.datetime64): + return dt.astype("<M8[ms]").astype(datetime.datetime) + if isinstance(dt, pd.Timestamp): + return dt.to_pydatetime() + return dt + + out = op.outputs[0] + + engine = sa.create_engine(op.con, **(op.engine_kwargs or dict())) + try: + selectable = op._get_selectable(engine) + + columns = [selectable.columns[col] for col in op.columns] + column_names = set(op.columns) + if op.index_col: + for icol in op.index_col: + if icol not in column_names: + columns.append(selectable.columns[icol]) + + # convert numpy / pandas timestamps to native datetime for the DB driver + op.low_limit = _adapt_datetime(op.low_limit) + op.high_limit = _adapt_datetime(op.high_limit) + + query = sa.sql.select(columns) + if op.method == "partition": + part_col = selectable.columns[op.partition_col] + if op.left_end: + query = query.where(part_col < op.high_limit) + elif op.right_end: + query = query.where(part_col >= op.low_limit) + else: + query = query.where( + (part_col >= op.low_limit) & (part_col < op.high_limit) + ) + + if hasattr(selectable, "primary_key") and len(selectable.primary_key) > 0: + # if table has primary key, sort as the order + query = query.order_by(*list(selectable.primary_key)) + elif op.index_col: + # if no primary key, sort as the index_col + query = query.order_by( + *[selectable.columns[col] for col in op.index_col] + ) + else: + # at last, we sort by all the columns + query = query.order_by(*columns) + + if op.method == "offset": + query = query.limit(out.shape[0]) + if op.offset > 0: + query = query.offset(op.offset) + + if op.nrows is not None: + query = query.limit(op.nrows) + + df = pd.read_sql( + query, + engine, + index_col=op.index_col, + coerce_float=op.coerce_float, + parse_dates=op.parse_dates, + ) + if op.method == "offset" and op.index_col is None and op.offset > 0: + index = pd.RangeIndex(op.offset, op.offset + out.shape[0]) + if op.nrows is not None: + index = index[: op.nrows] + df.index = index + + use_arrow_dtype = op.use_arrow_dtype + if use_arrow_dtype is None: + use_arrow_dtype = options.dataframe.use_arrow_dtype + if use_arrow_dtype: + dtypes = to_arrow_dtypes(df.dtypes, test_df=df) + for i in range(len(dtypes)): + dtype = dtypes.iloc[i] + if isinstance(dtype, ArrowStringDtype): + df.iloc[:, i] = df.iloc[:, i].astype(dtype) + + if out.ndim == 2: + ctx[out.key] = df + else: + # this happens when column pruning results in one single series + ctx[out.key] = df.iloc[:, 0] + finally: + engine.dispose() + + @classmethod + def post_execute(cls, ctx: Union[dict, Context], op: OperandType): + if op.method != "offset": + # method `offset` knows shape of each chunk + # just skip incremental process + return super().post_execute(ctx, op) + + +def _read_sql( + table_or_sql, + con, + schema=None, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, + incremental_index=False, + use_arrow_dtype=None, + test_rows=None, + chunk_size=None, + engine_kwargs=None, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + if chunksize is not None: + raise NotImplementedError("read_sql_query with chunksize not supported") + method = "offset" if partition_col is None else "partition" + + op = DataFrameReadSQL( + table_or_sql=table_or_sql, + selectable=None, + con=con, + schema=schema, + index_col=index_col, + coerce_float=coerce_float, + params=params, + parse_dates=parse_dates, + columns=columns, +
engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + method=method, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + chunk_size=chunk_size, + ) + return op(test_rows, chunk_size) + + +def read_sql( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, + test_rows=5, + chunk_size=None, + engine_kwargs=None, + incremental_index=True, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + """ + Read SQL query or database table into a DataFrame. + + This function is a convenience wrapper around ``read_sql_table`` and + ``read_sql_query`` (for backward compatibility). It will delegate + to the specific function depending on the provided input. A SQL query + will be routed to ``read_sql_query``, while a database table name will + be routed to ``read_sql_table``. Note that the delegated function might + have more specific notes about their functionality not listed here. + + Parameters + ---------- + sql : str or SQLAlchemy Selectable (select or text object) + SQL query to be executed or a table name. + con : SQLAlchemy connectable (engine/connection) or database str URI + or DBAPI2 connection (fallback mode)' + + Using SQLAlchemy makes it possible to use any DB supported by that + library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible + for engine disposal and connection closure for the SQLAlchemy connectable. See + `here `_ + index_col : str or list of strings, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table (only used when reading + a table). + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number of rows + to include in each chunk. Note that this argument is only kept for + compatibility. If a non-none value passed, an error will be reported. + test_rows: int, default 5 + The number of rows to fetch for inferring dtypes. + chunk_size: : int or tuple of ints, optional + Specifies chunk size for each dimension. + engine_kwargs: dict, default None + Extra kwargs to pass to sqlalchemy.create_engine + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. 
+ partition_col : str, default None + Specify name of the column to split the result of the query. If + specified, the range ``[low_limit, high_limit]`` will be divided + into ``n_partitions`` chunks with equal lengths. We do not + guarantee the sizes of chunks be equal. When the value is None, + ``OFFSET`` and ``LIMIT`` clauses will be used to cut the result + of the query. + num_partitions : int, default None + The number of chunks to divide the result of the query into, + when ``partition_col`` is specified. + low_limit : default None + The lower bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the minimum of + the column. + high_limit : default None + The higher bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the maximum of + the column. + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql_query : Read SQL query into a DataFrame. + """ + return _read_sql( + table_or_sql=sql, + con=con, + index_col=index_col, + coerce_float=coerce_float, + params=params, + parse_dates=parse_dates, + columns=columns, + engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + chunksize=chunksize, + test_rows=test_rows, + chunk_size=chunk_size, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + ) + + +def read_sql_table( + table_name, + con, + schema=None, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + chunksize=None, + test_rows=5, + chunk_size=None, + engine_kwargs=None, + incremental_index=True, + use_arrow_dtype=None, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + """ + Read SQL database table into a DataFrame. + + Given a table name and a SQLAlchemy connectable, returns a DataFrame. + This function does not support DBAPI connections. + + Parameters + ---------- + table_name : str + Name of SQL table in database. + con : SQLAlchemy connectable or str + A database URI could be provided as as str. + SQLite DBAPI connection mode not supported. + schema : str, default None + Name of SQL schema in database to query (if database flavor + supports this). Uses default schema if None (default). + index_col : str or list of str, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point. Can result in loss of Precision. + parse_dates : list or dict, default None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default None + List of column names to select from SQL table. + chunksize : int, default None + If specified, returns an iterator where `chunksize` is the number of + rows to include in each chunk. Note that this argument is only kept + for compatibility. If a non-none value passed, an error will be + reported. + test_rows: int, default 5 + The number of rows to fetch for inferring dtypes. 
+ chunk_size: : int or tuple of ints, optional + Specifies chunk size for each dimension. + engine_kwargs: dict, default None + Extra kwargs to pass to sqlalchemy.create_engine + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + partition_col : str, default None + Specify name of the column to split the result of the query. If + specified, the range ``[low_limit, high_limit]`` will be divided + into ``n_partitions`` chunks with equal lengths. We do not + guarantee the sizes of chunks be equal. When the value is None, + ``OFFSET`` and ``LIMIT`` clauses will be used to cut the result + of the query. + num_partitions : int, default None + The number of chunks to divide the result of the query into, + when ``partition_col`` is specified. + low_limit : default None + The lower bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the minimum of + the column. + high_limit : default None + The higher bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the maximum of + the column. + + Returns + ------- + DataFrame + A SQL table is returned as two-dimensional data structure with labeled + axes. + + See Also + -------- + read_sql_query : Read SQL query into a DataFrame. + read_sql : Read SQL query or database table into a DataFrame. + + Notes + ----- + Any datetime values with time zone information will be converted to UTC. + + Examples + -------- + >>> import mars.dataframe as md + >>> md.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP + """ + return _read_sql( + table_or_sql=table_name, + con=con, + schema=schema, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + chunksize=chunksize, + test_rows=test_rows, + chunk_size=chunk_size, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + ) + + +def read_sql_query( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, + test_rows=5, + chunk_size=None, + engine_kwargs=None, + incremental_index=True, + use_arrow_dtype=None, + partition_col=None, + num_partitions=None, + low_limit=None, + high_limit=None, +): + """ + Read SQL query into a DataFrame. + + Returns a DataFrame corresponding to the result set of the query + string. Optionally provide an `index_col` parameter to use one of the + columns as the index, otherwise default integer index will be used. + + Parameters + ---------- + sql : str SQL query or SQLAlchemy Selectable (select or text object) + SQL query to be executed. + con : SQLAlchemy connectable(engine/connection), database str URI, + or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + index_col : str or list of strings, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point. Useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. 
The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number of + rows to include in each chunk. Note that this argument is only kept + for compatibility. If a non-none value passed, an error will be + reported. + incremental_index: bool, default True + If index_col not specified, ensure range index incremental, + gain a slightly better performance if setting False. + use_arrow_dtype: bool, default None + If True, use arrow dtype to store columns. + test_rows: int, default 5 + The number of rows to fetch for inferring dtypes. + chunk_size: : int or tuple of ints, optional + Specifies chunk size for each dimension. + engine_kwargs: dict, default None + Extra kwargs to pass to sqlalchemy.create_engine + partition_col : str, default None + Specify name of the column to split the result of the query. If + specified, the range ``[low_limit, high_limit]`` will be divided + into ``n_partitions`` chunks with equal lengths. We do not + guarantee the sizes of chunks be equal. When the value is None, + ``OFFSET`` and ``LIMIT`` clauses will be used to cut the result + of the query. + num_partitions : int, default None + The number of chunks to divide the result of the query into, + when ``partition_col`` is specified. + low_limit : default None + The lower bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the minimum of + the column. + high_limit : default None + The higher bound of the range of column ``partition_col``. If not + specified, a query will be executed to query the maximum of + the column. + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql + + Notes + ----- + Any datetime values with time zone information parsed via the `parse_dates` + parameter will be converted to UTC. + """ + return _read_sql( + table_or_sql=sql, + con=con, + index_col=index_col, + coerce_float=coerce_float, + params=params, + parse_dates=parse_dates, + engine_kwargs=engine_kwargs, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + chunksize=chunksize, + test_rows=test_rows, + chunk_size=chunk_size, + partition_col=partition_col, + num_partitions=num_partitions, + low_limit=low_limit, + high_limit=high_limit, + ) diff --git a/python/xorbits/_mars/dataframe/datasource/series.py b/python/xorbits/_mars/dataframe/datasource/series.py new file mode 100644 index 000000000..bb06a3a91 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/series.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
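To make the two chunking strategies in read_sql.py above concrete, here is a hedged sketch against a throwaway SQLite database. The file name, table name, and column names are made up for illustration, and executing the resulting DataFrames assumes a running Mars/xorbits session.

import numpy as np
import pandas as pd
import sqlalchemy as sa

from xorbits._mars.dataframe.datasource.read_sql import read_sql_table

uri = "sqlite:///demo.db"  # hypothetical database file
engine = sa.create_engine(uri)
pd.DataFrame({"a": np.arange(100), "b": np.arange(100) * 0.5}).to_sql(
    "demo", engine, index=False, if_exists="replace"
)

# default method="offset": chunk boundaries are cut with LIMIT/OFFSET clauses
df_offset = read_sql_table("demo", uri, chunk_size=30)

# method="partition": the numeric column "a" is split into equal-width ranges
# and each range becomes one chunk selected with WHERE clauses
df_part = read_sql_table("demo", uri, partition_col="a", num_partitions=4)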
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import DataTypeField, SeriesField +from ...tensor.utils import get_chunk_slices +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import decide_series_chunk_size, is_cudf, parse_index + + +class SeriesDataSource(DataFrameOperand, DataFrameOperandMixin): + """ + Represents data from pandas Series + """ + + _op_type_ = OperandDef.SERIES_DATA_SOURCE + + data = SeriesField("data") + dtype = DataTypeField("dtype") + + def __init__(self, data=None, dtype=None, gpu=None, **kw): + if dtype is None and data is not None: + dtype = data.dtype + if gpu is None and is_cudf(data): # pragma: no cover + gpu = True + super().__init__( + data=data, dtype=dtype, gpu=gpu, _output_types=[OutputType.series], **kw + ) + + def __call__(self, shape, chunk_size=None): + return self.new_series( + None, + shape=shape, + dtype=self.dtype, + index_value=parse_index(self.data.index), + name=self.data.name, + raw_chunk_size=chunk_size, + ) + + @classmethod + def tile(cls, op: "SeriesDataSource"): + series = op.outputs[0] + raw_series = op.data + + memory_usage = raw_series.memory_usage(index=False, deep=True) + chunk_size = series.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_series_chunk_size(series.shape, chunk_size, memory_usage) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + for chunk_shape, chunk_idx in zip( + itertools.product(*chunk_size), itertools.product(*chunk_size_idxes) + ): + chunk_op = op.copy().reset_key() + slc = get_chunk_slices(chunk_size, chunk_idx) + if is_cudf(raw_series): # pragma: no cover + chunk_op.data = raw_series.iloc[slc[0]] + else: + chunk_op.data = raw_series.iloc[slc] + chunk_op.dtype = chunk_op.data.dtype + out_chunk = chunk_op.new_chunk( + None, + shape=chunk_shape, + dtype=op.dtype, + index=chunk_idx, + index_value=parse_index(chunk_op.data.index), + name=series.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + None, + series.shape, + dtype=op.dtype, + index_value=series.index_value, + name=series.name, + chunks=out_chunks, + nsplits=chunk_size, + ) + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = op.data + + +def from_pandas(data, chunk_size=None, gpu=None, sparse=False): + op = SeriesDataSource(data=data, gpu=gpu, sparse=sparse) + return op(data.shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/dataframe/datasource/tests/__init__.py b/python/xorbits/_mars/dataframe/datasource/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
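A small sketch of the Series datasource above, mirroring the unit tests that follow: from_pandas keeps the original pandas data on the operand, and tiling slices it into chunks according to chunk_size (or the configured chunk store limit when no size is given). tile() is used here only to inspect the resulting chunk layout.

import numpy as np
import pandas as pd

from xorbits._mars.core import tile
from xorbits._mars.dataframe.datasource.series import from_pandas

s = pd.Series(np.random.rand(10), name="x")

ms = from_pandas(s, chunk_size=4)  # lazy Mars Series backed by the pandas data
tiled = tile(ms)

# 10 rows with chunk_size=4 -> row chunks of 4, 4 and 2
assert tiled.nsplits == ((4, 4, 2),)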
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py new file mode 100644 index 000000000..c8e181a56 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py @@ -0,0 +1,649 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import string +import tempfile +from collections import OrderedDict + +import numpy as np +import pandas as pd +import pytest + +from .... import tensor as mt +from ....config import option_context +from ....core import tile +from ....tests.core import require_ray +from ....utils import lazy_import +from ...core import DatetimeIndex, Float64Index, IndexValue, Int64Index +from ...utils import ray_deprecate_ml_dataset +from ..core import merge_small_files +from ..dataframe import from_pandas as from_pandas_df +from ..date_range import date_range +from ..from_records import from_records +from ..from_tensor import ( + dataframe_from_1d_tileables, + dataframe_from_tensor, + series_from_tensor, +) +from ..index import from_pandas as from_pandas_index +from ..index import from_tileable +from ..read_csv import DataFrameReadCSV, read_csv +from ..read_raydataset import ( + DataFrameReadMLDataset, + DataFrameReadRayDataset, + read_ray_dataset, + read_ray_mldataset, +) +from ..read_sql import DataFrameReadSQL, read_sql_query, read_sql_table +from ..series import from_pandas as from_pandas_series + +ray = lazy_import("ray") + + +def test_from_pandas_dataframe(): + data = pd.DataFrame( + np.random.rand(10, 10), columns=["c" + str(i) for i in range(10)] + ) + df = from_pandas_df(data, chunk_size=4) + + pd.testing.assert_series_equal(df.op.dtypes, data.dtypes) + assert isinstance(df.index_value._index_value, IndexValue.RangeIndex) + assert df.index_value._index_value._slice == slice(0, 10, 1) + assert df.index_value.is_monotonic_increasing is True + assert df.index_value.is_monotonic_decreasing is False + assert df.index_value.is_unique is True + assert df.index_value.min_val == 0 + assert df.index_value.max_val == 9 + np.testing.assert_equal(df.columns_value._index_value._data, data.columns.values) + + df = tile(df) + + assert len(df.chunks) == 9 + pd.testing.assert_frame_equal(df.chunks[0].op.data, df.op.data.iloc[:4, :4]) + assert df.chunks[0].index_value._index_value._slice == slice(0, 4, 1) + assert df.chunks[0].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[0].index_value._index_value._is_monotonic_decreasing is False + assert 
df.chunks[0].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[1].op.data, df.op.data.iloc[:4, 4:8]) + assert df.chunks[1].index_value._index_value._slice == slice(0, 4, 1) + assert df.chunks[1].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[1].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[1].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[2].op.data, df.op.data.iloc[:4, 8:]) + assert df.chunks[2].index_value._index_value._slice == slice(0, 4, 1) + assert df.chunks[2].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[2].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[2].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[3].op.data, df.op.data.iloc[4:8, :4]) + assert df.chunks[3].index_value._index_value._slice == slice(4, 8, 1) + assert df.chunks[3].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[3].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[3].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[4].op.data, df.op.data.iloc[4:8, 4:8]) + assert df.chunks[4].index_value._index_value._slice == slice(4, 8, 1) + assert df.chunks[4].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[4].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[4].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[5].op.data, df.op.data.iloc[4:8, 8:]) + assert df.chunks[5].index_value._index_value._slice == slice(4, 8, 1) + assert df.chunks[5].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[5].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[5].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[6].op.data, df.op.data.iloc[8:, :4]) + assert df.chunks[6].index_value._index_value._slice == slice(8, 10, 1) + assert df.chunks[6].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[6].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[6].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[7].op.data, df.op.data.iloc[8:, 4:8]) + assert df.chunks[7].index_value._index_value._slice == slice(8, 10, 1) + assert df.chunks[7].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[7].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[7].index_value._index_value._is_unique is True + pd.testing.assert_frame_equal(df.chunks[8].op.data, df.op.data.iloc[8:, 8:]) + assert df.chunks[8].index_value._index_value._slice == slice(8, 10, 1) + assert df.chunks[8].index_value._index_value._is_monotonic_increasing is True + assert df.chunks[8].index_value._index_value._is_monotonic_decreasing is False + assert df.chunks[8].index_value._index_value._is_unique is True + + data2 = data[::2] + df2 = from_pandas_df(data2, chunk_size=4) + + pd.testing.assert_series_equal(df.op.dtypes, data2.dtypes) + assert isinstance(df2.index_value._index_value, IndexValue.RangeIndex) + assert df2.index_value._index_value._slice == slice(0, 10, 2) + + df2 = tile(df2) + + assert len(df2.chunks) == 6 + pd.testing.assert_frame_equal(df2.chunks[0].op.data, df2.op.data.iloc[:4, :4]) + assert df2.chunks[0].index_value._index_value._slice == 
slice(0, 8, 2) + pd.testing.assert_frame_equal(df2.chunks[1].op.data, df2.op.data.iloc[:4, 4:8]) + assert df2.chunks[1].index_value._index_value._slice == slice(0, 8, 2) + pd.testing.assert_frame_equal(df2.chunks[2].op.data, df2.op.data.iloc[:4, 8:]) + assert df2.chunks[2].index_value._index_value._slice == slice(0, 8, 2) + pd.testing.assert_frame_equal(df2.chunks[3].op.data, df2.op.data.iloc[4:, :4]) + assert df2.chunks[3].index_value._index_value._slice == slice(8, 10, 2) + pd.testing.assert_frame_equal(df2.chunks[4].op.data, df2.op.data.iloc[4:, 4:8]) + assert df2.chunks[3].index_value._index_value._slice == slice(8, 10, 2) + pd.testing.assert_frame_equal(df2.chunks[5].op.data, df2.op.data.iloc[4:, 8:]) + assert df2.chunks[3].index_value._index_value._slice == slice(8, 10, 2) + + raw = pd.DataFrame( + { + "a": [ + string.printable[i : i + 15] + for i in np.random.randint(len(string.printable), size=100) + ], + "b": np.random.rand(100), + } + ) + with option_context({"chunk_store_limit": raw["a"].memory_usage(deep=True) / 10}): + df = tile(from_pandas_df(raw)) + # see GH#2985, empty chunks are wrongly generated + assert len([ns for ns in df.nsplits[1] if ns == 0]) == 0 + + +def test_from_pandas_series(): + data = pd.Series(np.random.rand(10), name="a") + series = from_pandas_series(data, chunk_size=4) + + assert series.name == data.name + assert isinstance(series.index_value._index_value, IndexValue.RangeIndex) + assert series.index_value._index_value._slice == slice(0, 10, 1) + assert series.index_value.is_monotonic_increasing is True + assert series.index_value.is_monotonic_decreasing is False + assert series.index_value.is_unique is True + assert series.index_value.min_val == 0 + assert series.index_value.max_val == 9 + + series = tile(series) + + assert len(series.chunks) == 3 + pd.testing.assert_series_equal(series.chunks[0].op.data, series.op.data.iloc[:4]) + assert series.chunks[0].index_value._index_value._slice == slice(0, 4, 1) + assert series.chunks[0].index_value._index_value._is_monotonic_increasing is True + assert series.chunks[0].index_value._index_value._is_monotonic_decreasing is False + assert series.chunks[0].index_value._index_value._is_unique is True + pd.testing.assert_series_equal(series.chunks[1].op.data, series.op.data.iloc[4:8]) + assert series.chunks[1].index_value._index_value._slice == slice(4, 8, 1) + assert series.chunks[1].index_value._index_value._is_monotonic_increasing is True + assert series.chunks[1].index_value._index_value._is_monotonic_decreasing is False + assert series.chunks[1].index_value._index_value._is_unique is True + pd.testing.assert_series_equal(series.chunks[2].op.data, series.op.data.iloc[8:]) + assert series.chunks[2].index_value._index_value._slice == slice(8, 10, 1) + assert series.chunks[2].index_value._index_value._is_monotonic_increasing is True + assert series.chunks[2].index_value._index_value._is_monotonic_decreasing is False + assert series.chunks[2].index_value._index_value._is_unique is True + + +def test_from_pandas_index(): + data = pd.date_range("2020-1-1", periods=10, name="date") + index = from_pandas_index(data, chunk_size=4) + + assert isinstance(index, DatetimeIndex) + assert index.name == data.name + assert index.dtype == data.dtype + assert isinstance(index.index_value.value, IndexValue.DatetimeIndex) + + index = tile(index) + + for i, c in enumerate(index.chunks): + assert c.name == data.name + pd.testing.assert_index_equal(c.op.data, data[i * 4 : (i + 1) * 4]) + assert c.dtype == data.dtype + assert 
isinstance(c.index_value.value, IndexValue.DatetimeIndex) + + +def test_from_tileable_index(): + t = mt.random.rand(10, 4) + + with pytest.raises(ValueError): + from_tileable(t) + + pd_df = pd.DataFrame( + np.random.rand(10, 4), index=np.arange(10, 0, -1).astype(np.int64) + ) + pd_df.index.name = "ind" + df = from_pandas_df(pd_df, chunk_size=6) + + for o in [df, df[0]]: + index = o.index + assert isinstance(index, Int64Index) + assert index.dtype == np.int64 + assert index.name == pd_df.index.name + assert isinstance(index.index_value.value, IndexValue.Int64Index) + + index = tile(index) + + assert len(index.chunks) == 2 + for c in index.chunks: + assert c.dtype == np.int64 + assert c.name == pd_df.index.name + assert isinstance(c.index_value.value, IndexValue.Int64Index) + + t = mt.random.rand(10, chunk_size=6) + index = from_tileable(t, name="new_name") + + assert isinstance(index, Float64Index) + assert index.dtype == np.float64 + assert index.name == "new_name" + assert isinstance(index.index_value.value, IndexValue.Float64Index) + + index = tile(index) + + assert len(index.chunks) == 2 + for c in index.chunks: + assert c.dtype == np.float64 + assert c.name == "new_name" + assert isinstance(c.index_value.value, IndexValue.Float64Index) + + +def test_from_tensor(): + tensor = mt.random.rand(10, 10, chunk_size=5) + df = dataframe_from_tensor(tensor) + assert isinstance(df.index_value._index_value, IndexValue.RangeIndex) + assert df.dtypes[0] == tensor.dtype + + df = tile(df) + assert len(df.chunks) == 4 + assert isinstance(df.chunks[0].index_value._index_value, IndexValue.RangeIndex) + assert isinstance(df.chunks[0].index_value, IndexValue) + + # test converted from 1-d tensor + tensor2 = mt.array([1, 2, 3]) + # in fact, tensor3 is (3,1) + tensor3 = mt.array([tensor2]).T + + df2 = dataframe_from_tensor(tensor2) + df3 = dataframe_from_tensor(tensor3) + df2 = tile(df2) + df3 = tile(df3) + np.testing.assert_equal(df2.chunks[0].index, (0, 0)) + np.testing.assert_equal(df3.chunks[0].index, (0, 0)) + + # test converted from scalar + scalar = mt.array(1) + np.testing.assert_equal(scalar.ndim, 0) + with pytest.raises(TypeError): + dataframe_from_tensor(scalar) + + # from tensor with given index + df = dataframe_from_tensor(tensor, index=np.arange(0, 20, 2)) + df = tile(df) + pd.testing.assert_index_equal(df.chunks[0].op.index, pd.Index(np.arange(0, 10, 2))) + pd.testing.assert_index_equal(df.chunks[1].op.index, pd.Index(np.arange(0, 10, 2))) + pd.testing.assert_index_equal(df.chunks[2].op.index, pd.Index(np.arange(10, 20, 2))) + pd.testing.assert_index_equal(df.chunks[3].op.index, pd.Index(np.arange(10, 20, 2))) + + # from tensor with index that is a tensor as well + df = dataframe_from_tensor(tensor, index=mt.arange(0, 20, 2)) + df = tile(df) + assert len(df.chunks[0].inputs) == 2 + assert df.chunks[0].index_value.has_value() is False + + # from tensor with given columns + df = dataframe_from_tensor(tensor, columns=list("abcdefghij")) + df = tile(df) + pd.testing.assert_index_equal(df.dtypes.index, pd.Index(list("abcdefghij"))) + pd.testing.assert_index_equal( + df.chunks[0].columns_value.to_pandas(), pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[0].dtypes.index, pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[1].columns_value.to_pandas(), pd.Index(["f", "g", "h", "i", "j"]) + ) + pd.testing.assert_index_equal( + df.chunks[1].dtypes.index, pd.Index(["f", "g", "h", "i", "j"]) + ) + pd.testing.assert_index_equal( + 
df.chunks[2].columns_value.to_pandas(), pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[2].dtypes.index, pd.Index(["a", "b", "c", "d", "e"]) + ) + pd.testing.assert_index_equal( + df.chunks[3].columns_value.to_pandas(), pd.Index(["f", "g", "h", "i", "j"]) + ) + pd.testing.assert_index_equal( + df.chunks[3].dtypes.index, pd.Index(["f", "g", "h", "i", "j"]) + ) + + # test series from tensor + tensor = mt.random.rand(10, chunk_size=4) + series = series_from_tensor(tensor, name="a") + + assert series.dtype == tensor.dtype + assert series.name == "a" + pd.testing.assert_index_equal(series.index_value.to_pandas(), pd.RangeIndex(10)) + + series = tile(series) + assert len(series.chunks) == 3 + pd.testing.assert_index_equal( + series.chunks[0].index_value.to_pandas(), pd.RangeIndex(0, 4) + ) + assert series.chunks[0].name == "a" + pd.testing.assert_index_equal( + series.chunks[1].index_value.to_pandas(), pd.RangeIndex(4, 8) + ) + assert series.chunks[1].name == "a" + pd.testing.assert_index_equal( + series.chunks[2].index_value.to_pandas(), pd.RangeIndex(8, 10) + ) + assert series.chunks[2].name == "a" + + d = OrderedDict( + [(0, mt.tensor(np.random.rand(4))), (1, mt.tensor(np.random.rand(4)))] + ) + df = dataframe_from_1d_tileables(d) + pd.testing.assert_index_equal(df.columns_value.to_pandas(), pd.RangeIndex(2)) + + df = tile(df) + + pd.testing.assert_index_equal( + df.chunks[0].index_value.to_pandas(), pd.RangeIndex(4) + ) + + series = series_from_tensor(mt.random.rand(4)) + pd.testing.assert_index_equal(series.index_value.to_pandas(), pd.RangeIndex(4)) + + series = series_from_tensor(mt.random.rand(4), index=[1, 2, 3]) + pd.testing.assert_index_equal(series.op.index, pd.Index([1, 2, 3])) + + series = series_from_tensor( + mt.random.rand(4), index=pd.Index([1, 2, 3], name="my_index") + ) + pd.testing.assert_index_equal(series.op.index, pd.Index([1, 2, 3], name="my_index")) + assert series.index_value.name == "my_index" + + with pytest.raises(TypeError): + series_from_tensor(mt.ones((10, 10))) + + # index has wrong shape + with pytest.raises(ValueError): + dataframe_from_tensor(mt.random.rand(4, 3), index=mt.random.rand(5)) + + # columns have wrong shape + with pytest.raises(ValueError): + dataframe_from_tensor(mt.random.rand(4, 3), columns=["a", "b"]) + + # index should be 1-d + with pytest.raises(ValueError): + dataframe_from_tensor( + mt.tensor(np.random.rand(3, 2)), index=mt.tensor(np.random.rand(3, 2)) + ) + + # 1-d tensors should have same shape + with pytest.raises(ValueError): + dataframe_from_1d_tileables( + OrderedDict( + [(0, mt.tensor(np.random.rand(3))), (1, mt.tensor(np.random.rand(2)))] + ) + ) + + # index has wrong shape + with pytest.raises(ValueError): + dataframe_from_1d_tileables( + {0: mt.tensor(np.random.rand(3))}, index=mt.tensor(np.random.rand(2)) + ) + + # columns have wrong shape + with pytest.raises(ValueError): + dataframe_from_1d_tileables( + {0: mt.tensor(np.random.rand(3))}, columns=["a", "b"] + ) + + # index should be 1-d + with pytest.raises(ValueError): + series_from_tensor(mt.random.rand(4), index=mt.random.rand(4, 3)) + + +def test_from_records(): + dtype = np.dtype([("x", "int"), ("y", "double"), ("z", " 1 + finally: + shutil.rmtree(tempdir) + + +def test_read_sql(): + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + + with tempfile.TemporaryDirectory() as d: + table_name = "test" + uri = "sqlite:///" + os.path.join(d, "test.db") + + 
test_df.to_sql(table_name, uri, index=False) + + df = read_sql_table(table_name, uri, chunk_size=4) + + assert df.shape == test_df.shape + pd.testing.assert_index_equal(df.index_value.to_pandas(), test_df.index) + pd.testing.assert_series_equal(df.dtypes, test_df.dtypes) + + df = tile(df) + assert df.nsplits == ((4, 4, 2), (2,)) + for c in df.chunks: + assert isinstance(c.op, DataFrameReadSQL) + assert c.op.offset is not None + + with pytest.raises(NotImplementedError): + read_sql_table(table_name, uri, chunksize=4, index_col=b"a") + with pytest.raises(TypeError): + read_sql_table(table_name, uri, chunk_size=4, index_col=b"a") + with pytest.raises(TypeError): + read_sql_query("select * from " + table_name, uri, partition_col="b") + + +@require_ray +def test_read_ray_dataset(ray_start_regular): + test_df1 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + df = pd.concat([test_df1, test_df2]) + ds = ray.data.from_pandas_refs([ray.put(test_df1), ray.put(test_df2)]) + mdf = read_ray_dataset(ds) + + assert mdf.shape[1] == 2 + pd.testing.assert_index_equal(df.columns, mdf.columns_value.to_pandas()) + pd.testing.assert_series_equal(df.dtypes, mdf.dtypes) + + mdf = tile(mdf) + assert len(mdf.chunks) == 2 + for chunk in mdf.chunks: + assert isinstance(chunk.op, DataFrameReadRayDataset) + + +def test_date_range(): + with pytest.raises(TypeError): + _ = date_range("2020-1-1", periods="2") + + with pytest.raises(ValueError): + _ = date_range("2020-1-1", "2020-1-10", periods=10, freq="D") + + with pytest.raises(ValueError): + _ = date_range(pd.NaT, periods=10) + + expected = pd.date_range("2020-1-1", periods=9.0, name="date") + + dr = date_range("2020-1-1", periods=9.0, name="date", chunk_size=3) + assert isinstance(dr, DatetimeIndex) + assert dr.shape == (9,) + assert dr.dtype == expected.dtype + assert isinstance(dr.index_value.value, IndexValue.DatetimeIndex) + assert dr.index_value.min_val == expected.min() + assert dr.index_value.min_val_close is True + assert dr.index_value.max_val == expected.max() + assert dr.index_value.max_val_close is True + assert dr.index_value.is_unique == expected.is_unique + assert dr.index_value.is_monotonic_increasing == expected.is_monotonic_increasing + assert dr.name == expected.name + + dr = tile(dr) + + for i, c in enumerate(dr.chunks): + ec = expected[i * 3 : (i + 1) * 3] + assert c.shape == (3,) + assert c.dtype == ec.dtype + assert isinstance(c.index_value.value, IndexValue.DatetimeIndex) + assert c.index_value.min_val == ec.min() + assert c.index_value.min_val_close is True + assert c.index_value.max_val == ec.max() + assert c.index_value.max_val_close is True + assert c.index_value.is_unique == ec.is_unique + assert c.index_value.is_monotonic_increasing == ec.is_monotonic_increasing + assert c.name == ec.name + + +@require_ray +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +def test_read_ray_mldataset(ray_start_regular): + test_df1 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + df = pd.concat([test_df1, test_df2]) + import ray.util.iter + from ray.util.data import from_parallel_iter + + ml_dataset = 
from_parallel_iter( + ray.util.iter.from_items([test_df1, test_df2], num_shards=2), need_convert=False + ) + mdf = read_ray_mldataset(ml_dataset) + + assert mdf.shape[1] == 2 + pd.testing.assert_index_equal(df.columns, mdf.columns_value.to_pandas()) + pd.testing.assert_series_equal(df.dtypes, mdf.dtypes) + + mdf = tile(mdf) + assert len(mdf.chunks) == 2 + for chunk in mdf.chunks: + assert isinstance(chunk.op, DataFrameReadMLDataset) + + +def test_merge_small_files(): + raw = pd.DataFrame(np.random.rand(16, 4)) + df = tile(from_pandas_df(raw, chunk_size=4)) + + chunk_size = 4 * 4 * 8 + # number of chunks < 10 + assert df is merge_small_files(df, n_sample_file=10) + # merged_chunk_size + assert df is merge_small_files( + df, n_sample_file=2, merged_file_size=chunk_size + 0.1 + ) + + df2 = merge_small_files(df, n_sample_file=2, merged_file_size=2 * chunk_size) + assert len(df2.chunks) == 2 + assert df2.chunks[0].shape == (8, 4) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(8) + ) + assert df2.chunks[1].shape == (8, 4) + pd.testing.assert_index_equal( + df2.chunks[1].index_value.to_pandas(), pd.RangeIndex(8, 16) + ) + assert df2.nsplits == ((8, 8), (4,)) diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py new file mode 100644 index 000000000..4f8a815de --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py @@ -0,0 +1,1325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import os +import tempfile +import time +from collections import OrderedDict +from datetime import datetime +from string import printable + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None +try: + import fastparquet +except ImportError: # pragma: no cover + fastparquet = None +try: + import sqlalchemy +except ImportError: # pragma: no cover + sqlalchemy = None + + +from .... import dataframe as md +from .... 
import tensor as mt +from ....config import option_context +from ....tests.core import require_cudf, require_ray +from ....utils import arrow_array_to_objects, lazy_import, pd_release_version +from ...utils import ray_deprecate_ml_dataset +from ..dataframe import from_pandas as from_pandas_df +from ..from_records import from_records +from ..from_tensor import dataframe_from_1d_tileables, dataframe_from_tensor +from ..index import from_pandas as from_pandas_index +from ..index import from_tileable +from ..series import from_pandas as from_pandas_series + +ray = lazy_import("ray") +_date_range_use_inclusive = pd_release_version[:2] >= (1, 4) + + +def test_from_pandas_dataframe_execution(setup): + # test empty DataFrame + pdf = pd.DataFrame() + df = from_pandas_df(pdf) + + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + pdf = pd.DataFrame(columns=list("ab")) + df = from_pandas_df(pdf) + + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + pdf = pd.DataFrame( + np.random.rand(20, 30), index=[np.arange(20), np.arange(20, 0, -1)] + ) + df = from_pandas_df(pdf, chunk_size=(13, 21)) + + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + +def test_from_pandas_series_execution(setup): + # test empty Series + ps = pd.Series(name="a") + series = from_pandas_series(ps, chunk_size=13) + + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + series = from_pandas_series(ps) + + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + ps = pd.Series( + np.random.rand(20), index=[np.arange(20), np.arange(20, 0, -1)], name="a" + ) + series = from_pandas_series(ps, chunk_size=13) + + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + +def test_from_pandas_index_execution(setup): + pd_index = pd.timedelta_range("1 days", periods=10) + index = from_pandas_index(pd_index, chunk_size=7) + + result = index.execute().fetch() + pd.testing.assert_index_equal(pd_index, result) + + +def test_index_execution(setup): + rs = np.random.RandomState(0) + pdf = pd.DataFrame( + rs.rand(20, 10), + index=np.arange(20, 0, -1), + columns=["a" + str(i) for i in range(10)], + ) + df = from_pandas_df(pdf, chunk_size=13) + + # test df.index + result = df.index.execute().fetch() + pd.testing.assert_index_equal(result, pdf.index) + + result = df.columns.execute().fetch() + pd.testing.assert_index_equal(result, pdf.columns) + + # df has unknown chunk shape on axis 0 + df = df[df.a1 < 0.5] + + # test df.index + result = df.index.execute().fetch() + pd.testing.assert_index_equal(result, pdf[pdf.a1 < 0.5].index) + + s = pd.Series(pdf["a1"], index=pd.RangeIndex(20)) + series = from_pandas_series(s, chunk_size=13) + + # test series.index which has value + result = series.index.execute().fetch() + pd.testing.assert_index_equal(result, s.index) + + s = pdf["a2"] + series = from_pandas_series(s, chunk_size=13) + + # test series.index + result = series.index.execute().fetch() + pd.testing.assert_index_equal(result, s.index) + + # test tensor + raw = rs.random(20) + t = mt.tensor(raw, chunk_size=13) + + result = from_tileable(t).execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw)) + + +def test_initializer_execution(setup): + arr = np.random.rand(20, 30) + + pdf = pd.DataFrame(arr, index=[np.arange(20), np.arange(20, 0, -1)]) + df = md.DataFrame(pdf, chunk_size=(15, 10)) + result = df.execute().fetch() + pd.testing.assert_frame_equal(pdf, result) + + df = 
md.DataFrame(arr, index=md.date_range("2020-1-1", periods=20)) + result = df.execute().fetch() + pd.testing.assert_frame_equal( + result, pd.DataFrame(arr, index=pd.date_range("2020-1-1", periods=20)) + ) + + df = md.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, + index=md.date_range("1/1/2010", periods=6, freq="D"), + ) + result = df.execute().fetch() + pd.testing.assert_frame_equal( + result, + pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, + index=pd.date_range("1/1/2010", periods=6, freq="D"), + ), + ) + + s = np.random.rand(20) + + ps = pd.Series(s, index=[np.arange(20), np.arange(20, 0, -1)], name="a") + series = md.Series(ps, chunk_size=7) + result = series.execute().fetch() + pd.testing.assert_series_equal(ps, result) + + series = md.Series(s, index=md.date_range("2020-1-1", periods=20)) + result = series.execute().fetch() + pd.testing.assert_series_equal( + result, pd.Series(s, index=pd.date_range("2020-1-1", periods=20)) + ) + + pi = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + index = md.Index(md.Index(pi)) + result = index.execute().fetch() + pd.testing.assert_index_equal(pi, result) + + +def test_index_only(setup): + df = md.DataFrame(index=[1, 2, 3]) + pd.testing.assert_frame_equal(df.execute().fetch(), pd.DataFrame(index=[1, 2, 3])) + + s = md.Series(index=[1, 2, 3]) + pd.testing.assert_series_equal(s.execute().fetch(), pd.Series(index=[1, 2, 3])) + + df = md.DataFrame(index=md.Index([1, 2, 3])) + pd.testing.assert_frame_equal(df.execute().fetch(), pd.DataFrame(index=[1, 2, 3])) + + s = md.Series(index=md.Index([1, 2, 3]), dtype=object) + pd.testing.assert_series_equal( + s.execute().fetch(), pd.Series(index=[1, 2, 3], dtype=object) + ) + + +def test_series_from_tensor(setup): + data = np.random.rand(10) + series = md.Series(mt.tensor(data), name="a") + pd.testing.assert_series_equal(series.execute().fetch(), pd.Series(data, name="a")) + + series = md.Series(mt.tensor(data, chunk_size=3)) + pd.testing.assert_series_equal(series.execute().fetch(), pd.Series(data)) + + series = md.Series(mt.ones((10,), chunk_size=4)) + pd.testing.assert_series_equal( + series.execute().fetch(), + pd.Series(np.ones(10)), + ) + + index_data = np.random.rand(10) + series = md.Series( + mt.tensor(data, chunk_size=3), + name="a", + index=mt.tensor(index_data, chunk_size=4), + ) + pd.testing.assert_series_equal( + series.execute().fetch(), pd.Series(data, name="a", index=index_data) + ) + + series = md.Series( + mt.tensor(data, chunk_size=3), + name="a", + index=md.date_range("2020-1-1", periods=10), + ) + pd.testing.assert_series_equal( + series.execute().fetch(), + pd.Series(data, name="a", index=pd.date_range("2020-1-1", periods=10)), + ) + + +def test_from_tensor_execution(setup): + tensor = mt.random.rand(10, 10, chunk_size=5) + df = dataframe_from_tensor(tensor) + tensor_res = tensor.execute().fetch() + pdf_expected = pd.DataFrame(tensor_res) + df_result = df.execute().fetch() + pd.testing.assert_index_equal(df_result.index, pd.RangeIndex(0, 10)) + pd.testing.assert_index_equal(df_result.columns, pd.RangeIndex(0, 10)) + pd.testing.assert_frame_equal(df_result, pdf_expected) + + # test from tensor with unknown shape + tensor2 = tensor[tensor[:, 0] < 0.9] + df = dataframe_from_tensor(tensor2) + df_result = df.execute().fetch() + tensor_res = tensor2.execute().fetch() + pdf_expected = pd.DataFrame(tensor_res) + pd.testing.assert_frame_equal(df_result.reset_index(drop=True), pdf_expected) + + # test converted with specified index_value and columns + tensor2 = 
mt.random.rand(2, 2, chunk_size=1) + df2 = dataframe_from_tensor( + tensor2, index=pd.Index(["a", "b"]), columns=pd.Index([3, 4]) + ) + df_result = df2.execute().fetch() + pd.testing.assert_index_equal(df_result.index, pd.Index(["a", "b"])) + pd.testing.assert_index_equal(df_result.columns, pd.Index([3, 4])) + + # test converted from 1-d tensor + tensor3 = mt.array([1, 2, 3]) + df3 = dataframe_from_tensor(tensor3) + result3 = df3.execute().fetch() + pdf_expected = pd.DataFrame(np.array([1, 2, 3])) + pd.testing.assert_frame_equal(pdf_expected, result3) + + # test converted from identical chunks + tensor4 = mt.ones((10, 10), chunk_size=3) + df4 = dataframe_from_tensor(tensor4) + result4 = df4.execute().fetch() + pdf_expected = pd.DataFrame(tensor4.execute().fetch()) + pd.testing.assert_frame_equal(pdf_expected, result4) + + # from tensor with given index + tensor5 = mt.ones((10, 10), chunk_size=3) + df5 = dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2)) + result5 = df5.execute().fetch() + pdf_expected = pd.DataFrame(np.ones((10, 10)), index=np.arange(0, 20, 2)) + pd.testing.assert_frame_equal(pdf_expected, result5) + + # from tensor with given index that is a tensor + raw7 = np.random.rand(10, 10) + tensor7 = mt.tensor(raw7, chunk_size=3) + index_raw7 = np.random.rand(10) + index7 = mt.tensor(index_raw7, chunk_size=4) + df7 = dataframe_from_tensor(tensor7, index=index7) + result7 = df7.execute().fetch() + pdf_expected = pd.DataFrame(raw7, index=index_raw7) + pd.testing.assert_frame_equal(pdf_expected, result7) + + # from tensor with given index is a md.Index + raw10 = np.random.rand(10, 10) + tensor10 = mt.tensor(raw10, chunk_size=3) + index10 = md.date_range("2020-1-1", periods=10, chunk_size=3) + df10 = dataframe_from_tensor(tensor10, index=index10) + result10 = df10.execute().fetch() + pdf_expected = pd.DataFrame(raw10, index=pd.date_range("2020-1-1", periods=10)) + pd.testing.assert_frame_equal(pdf_expected, result10) + + # from tensor with given columns + tensor6 = mt.ones((10, 10), chunk_size=3) + df6 = dataframe_from_tensor(tensor6, columns=list("abcdefghij")) + result6 = df6.execute().fetch() + pdf_expected = pd.DataFrame(tensor6.execute().fetch(), columns=list("abcdefghij")) + pd.testing.assert_frame_equal(pdf_expected, result6) + + # from 1d tensors + raws8 = [ + ("a", np.random.rand(8)), + ("b", np.random.randint(10, size=8)), + ("c", ["".join(np.random.choice(list(printable), size=6)) for _ in range(8)]), + ] + tensors8 = OrderedDict((r[0], mt.tensor(r[1], chunk_size=3)) for r in raws8) + raws8.append(("d", 1)) + raws8.append(("e", pd.date_range("2020-1-1", periods=8))) + tensors8["d"] = 1 + tensors8["e"] = raws8[-1][1] + df8 = dataframe_from_1d_tileables(tensors8, columns=[r[0] for r in raws8]) + result = df8.execute().fetch() + pdf_expected = pd.DataFrame(OrderedDict(raws8)) + pd.testing.assert_frame_equal(result, pdf_expected) + + # from 1d tensors and specify index with a tensor + index_raw9 = np.random.rand(8) + index9 = mt.tensor(index_raw9, chunk_size=4) + df9 = dataframe_from_1d_tileables( + tensors8, columns=[r[0] for r in raws8], index=index9 + ) + result = df9.execute().fetch() + pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9) + pd.testing.assert_frame_equal(result, pdf_expected) + + # from 1d tensors and specify index + df11 = dataframe_from_1d_tileables( + tensors8, + columns=[r[0] for r in raws8], + index=md.date_range("2020-1-1", periods=8), + ) + result = df11.execute().fetch() + pdf_expected = pd.DataFrame( + OrderedDict(raws8), 
index=pd.date_range("2020-1-1", periods=8) + ) + pd.testing.assert_frame_equal(result, pdf_expected) + + df12 = dataframe_from_1d_tileables({"a": [md.Series([1, 2, 3]).sum() + 1]}) + result = df12.execute().fetch() + pdf_expected = pd.DataFrame({"a": [pd.Series([1, 2, 3]).sum() + 1]}) + pd.testing.assert_frame_equal(result, pdf_expected) + + +def test_from_records_execution(setup): + dtype = np.dtype([("x", "int"), ("y", "double"), ("z", " 0.5", uri, parse_dates=["d"], chunk_size=4 + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal( + result, test_df[test_df.c > 0.5].reset_index(drop=True) + ) + + # test read with sql string and partition method with integer cols + r = md.read_sql( + "select * from test where b > 's5'", + uri, + parse_dates=["d"], + partition_col="a", + num_partitions=3, + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal( + result, test_df[test_df.b > "s5"].reset_index(drop=True) + ) + + # test read with sql string and partition method with datetime cols + r = md.read_sql_query( + "select * from test where b > 's5'", + uri, + parse_dates={"d": "%Y-%m-%d %H:%M:%S"}, + partition_col="d", + num_partitions=3, + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal( + result, test_df[test_df.b > "s5"].reset_index(drop=True) + ) + + # test read with sql string and partition method with datetime cols + r = md.read_sql_query( + "select * from test where b > 's5'", + uri, + parse_dates=["d"], + partition_col="d", + num_partitions=3, + index_col="d", + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, test_df[test_df.b > "s5"].set_index("d")) + + # test SQL that return no result + r = md.read_sql_query("select * from test where a > 1000", uri) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(columns=test_df.columns)) + + engine = sa.create_engine(uri) + m = sa.MetaData() + try: + # test index_col and columns + r = md.read_sql_table( + "test", + engine.connect(), + chunk_size=4, + index_col="a", + columns=["b", "d"], + ) + result = r.execute().fetch() + expected = test_df.copy(deep=True) + expected.set_index("a", inplace=True) + del expected["c"] + pd.testing.assert_frame_equal(result, expected) + + # do not specify chunk_size + r = md.read_sql_table( + "test", engine.connect(), index_col="a", columns=["b", "d"] + ) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + table = sa.Table(table_name, m, autoload=True, autoload_with=engine) + r = md.read_sql_table( + table, + engine, + chunk_size=4, + index_col=[table.columns["a"], table.columns["b"]], + columns=[table.columns["c"], "d"], + ) + result = r.execute().fetch() + expected = test_df.copy(deep=True) + expected.set_index(["a", "b"], inplace=True) + pd.testing.assert_frame_equal(result, expected) + + # test table with primary key + sa.Table( + table_name2, + m, + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("a", sa.Integer), + sa.Column("b", sa.String), + sa.Column("c", sa.Float), + sa.Column("d", sa.DateTime), + ) + m.create_all(engine) + test_df = test_df.copy(deep=True) + test_df.index.name = "id" + test_df.to_sql(table_name2, uri, if_exists="append") + + r = md.read_sql_table(table_name2, engine, chunk_size=4, index_col="id") + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, test_df) + finally: + engine.dispose() + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_read_sql_use_arrow_dtype(setup): + rs = np.random.RandomState(0) + 
test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": rs.rand(10), + "d": [ + datetime.fromtimestamp(time.time() + 3600 * (i - 5)) for i in range(10) + ], + } + ) + + with tempfile.TemporaryDirectory() as d: + table_name = "test" + uri = "sqlite:///" + os.path.join(d, "test.db") + + test_df.to_sql(table_name, uri, index=False) + + r = md.read_sql_table("test", uri, chunk_size=4, use_arrow_dtype=True) + result = r.execute().fetch() + assert isinstance(r.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + pd.testing.assert_frame_equal(arrow_array_to_objects(result), test_df) + + # test read with sql string and offset method + r = md.read_sql_query( + "select * from test where c > 0.5", + uri, + parse_dates=["d"], + chunk_size=4, + use_arrow_dtype=True, + ) + result = r.execute().fetch() + assert isinstance(r.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + pd.testing.assert_frame_equal( + arrow_array_to_objects(result), + test_df[test_df.c > 0.5].reset_index(drop=True), + ) + + +@pytest.mark.pd_compat +def test_date_range_execution(setup): + chunk_sizes = [None, 3] + inclusives = ["both", "neither", "left", "right"] + + if _date_range_use_inclusive: + with pytest.warns(FutureWarning, match="closed"): + md.date_range("2020-1-1", periods=10, closed="right") + + for chunk_size, inclusive in itertools.product(chunk_sizes, inclusives): + kw = dict() + if _date_range_use_inclusive: + kw["inclusive"] = inclusive + else: + if inclusive == "neither": + continue + elif inclusive == "both": + inclusive = None + kw["closed"] = inclusive + + # start, periods, freq + dr = md.date_range("2020-1-1", periods=10, chunk_size=chunk_size, **kw) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=10, **kw) + pd.testing.assert_index_equal(result, expected) + + # end, periods, freq + dr = md.date_range(end="2020-1-10", periods=10, chunk_size=chunk_size, **kw) + + result = dr.execute().fetch() + expected = pd.date_range(end="2020-1-10", periods=10, **kw) + pd.testing.assert_index_equal(result, expected) + + # start, end, freq + dr = md.date_range("2020-1-1", "2020-1-10", chunk_size=chunk_size, **kw) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-10", **kw) + pd.testing.assert_index_equal(result, expected) + + # start, end and periods + dr = md.date_range( + "2020-1-1", "2020-1-10", periods=19, chunk_size=chunk_size, **kw + ) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-10", periods=19, **kw) + pd.testing.assert_index_equal(result, expected) + + # start, end and freq + dr = md.date_range( + "2020-1-1", "2020-1-10", freq="12H", chunk_size=chunk_size, **kw + ) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-10", freq="12H", **kw) + pd.testing.assert_index_equal(result, expected) + + # test timezone + dr = md.date_range("2020-1-1", periods=10, tz="Asia/Shanghai", chunk_size=7) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=10, tz="Asia/Shanghai") + pd.testing.assert_index_equal(result, expected) + + # test periods=0 + dr = md.date_range("2020-1-1", periods=0) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=0) + pd.testing.assert_index_equal(result, expected) + + # test start == end + dr = md.date_range("2020-1-1", "2020-1-1", periods=1) + + result = 
dr.execute().fetch() + expected = pd.date_range("2020-1-1", "2020-1-1", periods=1) + pd.testing.assert_index_equal(result, expected) + + # test normalize=True + dr = md.date_range("2020-1-1", periods=10, normalize=True, chunk_size=4) + + result = dr.execute().fetch() + expected = pd.date_range("2020-1-1", periods=10, normalize=True) + pd.testing.assert_index_equal(result, expected) + + # test freq + dr = md.date_range(start="1/1/2018", periods=5, freq="M", chunk_size=3) + + result = dr.execute().fetch() + expected = pd.date_range(start="1/1/2018", periods=5, freq="M") + pd.testing.assert_index_equal(result, expected) + + dr = md.date_range(start="2018/01/01", end="2018/07/01", freq="M") + result = dr.execute().fetch() + expected = pd.date_range(start="2018/01/01", end="2018/07/01", freq="M") + pd.testing.assert_index_equal(result, expected) + + +parquet_engines = ["auto"] +if pa is not None: + parquet_engines.append("pyarrow") +if fastparquet is not None: + parquet_engines.append("fastparquet") + + +@pytest.mark.skipif( + len(parquet_engines) == 1, reason="pyarrow and fastparquet are not installed" +) +@pytest.mark.parametrize("engine", parquet_engines) +def test_read_parquet_arrow(setup, engine): + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + test_df.to_parquet(file_path) + + df = md.read_parquet(file_path, engine=engine) + result = df.execute().fetch() + pd.testing.assert_frame_equal(result, test_df) + # size_res = self.executor.execute_dataframe(df, mock=True) + # assert sum(s[0] for s in size_res) > test_df.memory_usage(deep=True).sum() + + if engine != "fastparquet": + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + test_df.to_parquet(file_path, row_group_size=3) + + df = md.read_parquet( + file_path, groups_as_chunks=True, columns=["a", "b"], engine=engine + ) + result = df.execute().fetch() + pd.testing.assert_frame_equal( + result.reset_index(drop=True), test_df[["a", "b"]] + ) + + if engine != "fastparquet": + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + test_df.to_parquet(file_path, row_group_size=5) + + df = md.read_parquet( + file_path, + groups_as_chunks=True, + use_arrow_dtype=True, + incremental_index=True, + engine=engine, + ) + result = df.execute().fetch() + assert isinstance(df.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + pd.testing.assert_frame_equal(arrow_array_to_objects(result), test_df) + + # test wildcards in path + for merge_small_file_option in [{"n_sample_file": 1}, None]: + with tempfile.TemporaryDirectory() as tempdir: + df = pd.DataFrame( + { + "a": np.arange(300).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(300)], + "c": np.random.rand(300), + } + ) + + file_paths = [os.path.join(tempdir, f"test{i}.parquet") for i in range(3)] + df[:100].to_parquet(file_paths[0], row_group_size=50) + df[100:200].to_parquet(file_paths[1], row_group_size=30) + df[200:].to_parquet(file_paths[2]) + + mdf = md.read_parquet(f"{tempdir}/*.parquet", engine=engine) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal(df, r.sort_values("a").reset_index(drop=True)) + + mdf = md.read_parquet(f"{tempdir}", engine=engine) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal(df, 
r.sort_values("a").reset_index(drop=True)) + + file_list = [os.path.join(tempdir, name) for name in os.listdir(tempdir)] + mdf = md.read_parquet(file_list, engine=engine) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal(df, r.sort_values("a").reset_index(drop=True)) + + # test `use_arrow_dtype=True` + mdf = md.read_parquet( + f"{tempdir}/*.parquet", engine=engine, use_arrow_dtype=True + ) + result = mdf.execute().fetch() + assert isinstance(mdf.dtypes.iloc[1], md.ArrowStringDtype) + assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype) + + if engine != "fastparquet": + mdf = md.read_parquet( + f"{tempdir}/*.parquet", + groups_as_chunks=True, + engine=engine, + merge_small_file_options=merge_small_file_option, + ) + r = mdf.execute().fetch() + pd.testing.assert_frame_equal( + df, r.sort_values("a").reset_index(drop=True) + ) + + # test partitioned + with tempfile.TemporaryDirectory() as tempdir: + df = pd.DataFrame( + { + "a": np.random.rand(300), + "b": [f"s{i}" for i in range(300)], + "c": np.random.choice(["a", "b", "c"], (300,)), + } + ) + df.to_parquet(tempdir, partition_cols=["c"]) + mdf = md.read_parquet(tempdir, engine=engine) + r = mdf.execute().fetch().astype(df.dtypes) + pd.testing.assert_frame_equal( + df.sort_values("a").reset_index(drop=True), + r.sort_values("a").reset_index(drop=True), + ) + + +@pytest.mark.skipif(fastparquet is None, reason="fastparquet not installed") +def test_read_parquet_fast_parquet(setup): + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + + # test fastparquet engine + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + test_df.to_parquet(file_path, compression=None) + + df = md.read_parquet(file_path, engine="fastparquet") + result = df.execute().fetch() + pd.testing.assert_frame_equal(result, test_df) + # size_res = self.executor.execute_dataframe(df, mock=True) + # assert sum(s[0] for s in size_res) > test_df.memory_usage(deep=True).sum() + + +@require_cudf +def test_read_parquet_gpu_execution(setup_gpu): + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + + df = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.random.choice(["a", "b", "c"], (100,)), + "col3": np.arange(100), + } + ) + df.to_parquet(file_path, index=False) + + pdf = pd.read_parquet(file_path) + mdf = md.read_parquet(file_path, gpu=True).execute().fetch() + pd.testing.assert_frame_equal( + pdf.reset_index(drop=True), mdf.to_pandas().reset_index(drop=True) + ) + + mdf2 = md.read_parquet(file_path, gpu=True).execute().fetch() + pd.testing.assert_frame_equal( + pdf.reset_index(drop=True), mdf2.to_pandas().reset_index(drop=True) + ) + + mdf3 = md.read_parquet(file_path, gpu=True).head(3).execute().fetch() + pd.testing.assert_frame_equal( + pdf.reset_index(drop=True).head(3), mdf3.to_pandas().reset_index(drop=True) + ) + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.parquet") + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + test_df.to_parquet(file_path, row_group_size=3) + + df = md.read_parquet( + file_path, groups_as_chunks=True, columns=["a", "b"], gpu=True + ) + result = df.execute().fetch().to_pandas() + pd.testing.assert_frame_equal( + result.reset_index(drop=True), test_df[["a", "b"]] + ) + + # test 
partitioned + with tempfile.TemporaryDirectory() as tempdir: + df = pd.DataFrame( + { + "a": np.random.rand(300), + "b": [f"s{i}" for i in range(300)], + "c": np.random.choice(["a", "b", "c"], (300,)), + } + ) + df.to_parquet(tempdir, partition_cols=["c"]) + mdf = md.read_parquet(tempdir, gpu=True) + r = mdf.execute().fetch().to_pandas().astype(df.dtypes) + pd.testing.assert_frame_equal( + df.sort_values("a").reset_index(drop=True), + r.sort_values("a").reset_index(drop=True), + ) + + +@require_ray +@pytest.mark.skip_ray_dag # raydataset is not compatible with Ray DAG +def test_read_raydataset(ray_start_regular, ray_create_mars_cluster): + test_df1 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + } + ) + df = pd.concat([test_df1, test_df2]) + ds = ray.data.from_pandas_refs([ray.put(test_df1), ray.put(test_df2)]) + mdf = md.read_ray_dataset(ds) + assert df.equals(mdf.execute().fetch()) + + n = 10000 + pdf = pd.DataFrame({"a": list(range(n)), "b": list(range(n, 2 * n))}) + df = md.DataFrame(pdf) + + # Convert mars dataframe to ray dataset + ds = md.to_ray_dataset(df) + pd.testing.assert_frame_equal(ds.to_pandas(), df.to_pandas()) + ds2 = ds.filter(lambda row: row["a"] % 2 == 0) + assert ds2.take(5) == [{"a": 2 * i, "b": n + 2 * i} for i in range(5)] + + # Convert ray dataset to mars dataframe + df2 = md.read_ray_dataset(ds2) + pd.testing.assert_frame_equal( + df2.head(5).to_pandas(), + pd.DataFrame({"a": list(range(0, 10, 2)), "b": list(range(n, n + 10, 2))}), + ) + + # Test Arrow Dataset + pdf2 = pd.DataFrame({c: range(5) for c in "abc"}) + ds3 = ray.data.from_arrow([pa.Table.from_pandas(pdf2) for _ in range(3)]) + df3 = md.read_ray_dataset(ds3) + pd.testing.assert_frame_equal( + df3.head(5).to_pandas(), + pdf2, + ) + + # Test simple datasets + with pytest.raises(NotImplementedError): + ray.data.range(10).to_mars() + + +@require_ray +@pytest.mark.skipif( + ray_deprecate_ml_dataset in (True, None), + reason="Ray (>=2.0) has deprecated MLDataset.", +) +def test_read_ray_mldataset(ray_start_regular, ray_create_mars_cluster): + test_dfs = [ + pd.DataFrame( + { + "a": np.arange(i * 10, (i + 1) * 10).astype(np.int64, copy=False), + "b": [f"s{j}" for j in range(i * 10, (i + 1) * 10)], + } + ) + for i in range(5) + ] + import ray.util.iter + from ray.util.data import from_parallel_iter + + ml_dataset = from_parallel_iter( + ray.util.iter.from_items(test_dfs, num_shards=4), need_convert=False + ) + dfs = [] + for shard in ml_dataset.shards(): + dfs.extend(list(shard)) + df = pd.concat(dfs).reset_index(drop=True) + mdf = md.read_ray_mldataset(ml_dataset) + pd.testing.assert_frame_equal(df, mdf.execute().fetch()) + pd.testing.assert_frame_equal(df.head(5), mdf.head(5).execute().fetch()) + pd.testing.assert_frame_equal(df.head(15), mdf.head(15).execute().fetch()) diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py new file mode 100644 index 000000000..f1f3ae5fe --- /dev/null +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from io import BytesIO, StringIO + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from ....tests.core import require_hadoop + +TEST_DIR = "/tmp/test" + + +@require_hadoop +@pytest.fixture(scope="module") +def setup_hdfs(): + import pyarrow + + hdfs = pyarrow.hdfs.connect(host="localhost", port=8020) + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + try: + yield hdfs + finally: + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + + +@require_hadoop +def test_read_csv_execution(setup, setup_hdfs): + hdfs = setup_hdfs + + with hdfs.open(f"{TEST_DIR}/simple_test.csv", "wb", replication=1) as f: + f.write(b"name,amount,id\nAlice,100,1\nBob,200,2") + + df = md.read_csv(f"hdfs://localhost:8020{TEST_DIR}/simple_test.csv") + expected = pd.read_csv(BytesIO(b"name,amount,id\nAlice,100,1\nBob,200,2")) + res = df.to_pandas() + pd.testing.assert_frame_equal(expected, res) + + test_df = pd.DataFrame( + { + "A": np.random.rand(20), + "B": [ + pd.Timestamp("2020-01-01") + pd.Timedelta(days=random.randint(0, 31)) + for _ in range(20) + ], + "C": np.random.rand(20), + "D": np.random.randint(0, 100, size=(20,)), + "E": ["foo" + str(random.randint(0, 999999)) for _ in range(20)], + } + ) + buf = StringIO() + test_df[:10].to_csv(buf) + csv_content = buf.getvalue().encode() + + buf = StringIO() + test_df[10:].to_csv(buf) + csv_content2 = buf.getvalue().encode() + + with hdfs.open(f"{TEST_DIR}/chunk_test.csv", "wb", replication=1) as f: + f.write(csv_content) + + df = md.read_csv(f"hdfs://localhost:8020{TEST_DIR}/chunk_test.csv", chunk_bytes=50) + expected = pd.read_csv(BytesIO(csv_content)) + res = df.to_pandas() + pd.testing.assert_frame_equal( + expected.reset_index(drop=True), res.reset_index(drop=True) + ) + + test_read_dir = f"{TEST_DIR}/test_read_csv_directory" + hdfs.mkdir(test_read_dir) + with hdfs.open(f"{test_read_dir}/part.csv", "wb", replication=1) as f: + f.write(csv_content) + with hdfs.open(f"{test_read_dir}/part2.csv", "wb", replication=1) as f: + f.write(csv_content2) + + df = md.read_csv(f"hdfs://localhost:8020{test_read_dir}", chunk_bytes=50) + expected = pd.concat( + [pd.read_csv(BytesIO(csv_content)), pd.read_csv(BytesIO(csv_content2))] + ) + res = df.to_pandas() + pd.testing.assert_frame_equal( + expected.reset_index(drop=True), res.reset_index(drop=True) + ) + + +@require_hadoop +def test_read_parquet_execution(setup, setup_hdfs): + hdfs = setup_hdfs + + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + test_df2 = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + + with hdfs.open(f"{TEST_DIR}/test.parquet", "wb", replication=1) as f: + test_df.to_parquet(f, row_group_size=3) + + df = md.read_parquet(f"hdfs://localhost:8020{TEST_DIR}/test.parquet") + res = df.to_pandas() + pd.testing.assert_frame_equal(res, test_df) + + hdfs.mkdir(f"{TEST_DIR}/test_partitioned") + + with hdfs.open( + 
f"{TEST_DIR}/test_partitioned/file1.parquet", "wb", replication=1 + ) as f: + test_df.to_parquet(f, row_group_size=3) + with hdfs.open( + f"{TEST_DIR}/test_partitioned/file2.parquet", "wb", replication=1 + ) as f: + test_df2.to_parquet(f, row_group_size=3) + + df = md.read_parquet(f"hdfs://localhost:8020{TEST_DIR}/test_partitioned") + res = df.to_pandas() + pd.testing.assert_frame_equal(res, pd.concat([test_df, test_df2])) diff --git a/python/xorbits/_mars/dataframe/datastore/__init__.py b/python/xorbits/_mars/dataframe/datastore/__init__.py new file mode 100644 index 000000000..3931660d4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from ..operands import DATAFRAME_TYPE, SERIES_TYPE + from .to_csv import to_csv + from .to_parquet import to_parquet + from .to_sql import to_sql + from .to_vineyard import to_vineyard + + for cls in DATAFRAME_TYPE: + setattr(cls, "to_csv", to_csv) + setattr(cls, "to_sql", to_sql) + setattr(cls, "to_parquet", to_parquet) + setattr(cls, "to_vineyard", to_vineyard) + + for cls in SERIES_TYPE: + setattr(cls, "to_csv", to_csv) + setattr(cls, "to_sql", to_sql) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/datastore/tests/__init__.py b/python/xorbits/_mars/dataframe/datastore/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore.py new file mode 100644 index 000000000..e13685a3a --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +from ....core import tile +from ... import DataFrame + + +def test_to_csv(): + raw = pd.DataFrame(np.random.rand(10, 5)) + df = DataFrame(raw, chunk_size=4) + + r = df.to_csv("*.csv") + r = tile(r) + + assert r.chunk_shape[1] == 1 + for i, c in enumerate(r.chunks): + assert type(c.op).__name__ == "DataFrameToCSV" + assert c.inputs[0] is r.inputs[0].chunks[i].data + + # test one file + r = df.to_csv("out.csv") + r = tile(r) + + assert r.chunk_shape[1] == 1 + for i, c in enumerate(r.chunks): + assert len(c.inputs) == 2 + assert c.inputs[0].inputs[0] is r.inputs[0].chunks[i].data + assert type(c.inputs[1].op).__name__ == "DataFrameToCSVStat" diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py new file mode 100644 index 000000000..83dca8737 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py @@ -0,0 +1,253 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +try: + import vineyard +except ImportError: + vineyard = None +try: + import sqlalchemy +except ImportError: + sqlalchemy = None +try: + import pyarrow as pa +except ImportError: + pa = None +try: + import fastparquet +except ImportError: + fastparquet = None + +from .... import dataframe as md +from ....tests.core import flaky +from ... 
import DataFrame + + +def test_to_csv_execution(setup): + index = pd.RangeIndex(100, 0, -1, name="index") + raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.random.choice(["a", "b", "c"], (100,)), + "col3": np.arange(100), + }, + index=index, + ) + df = DataFrame(raw, chunk_size=33) + + with tempfile.TemporaryDirectory() as base_path: + # DATAFRAME TESTS + # test one file with dataframe + path = os.path.join(base_path, "out.csv") + + df.to_csv(path).execute() + + result = pd.read_csv(path, dtype=raw.dtypes.to_dict()) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw) + + # test multi files with dataframe + path = os.path.join(base_path, "out-*.csv") + df.to_csv(path).execute() + + dfs = [ + pd.read_csv( + os.path.join(base_path, f"out-{i}.csv"), dtype=raw.dtypes.to_dict() + ) + for i in range(4) + ] + result = pd.concat(dfs, axis=0) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw) + pd.testing.assert_frame_equal(dfs[1].set_index("index"), raw.iloc[33:66]) + + # test df with unknown shape + df2 = DataFrame(raw, chunk_size=(50, 2)) + df2 = df2[df2["col1"] < 1] + path2 = os.path.join(base_path, "out2.csv") + df2.to_csv(path2).execute() + + result = pd.read_csv(path2, dtype=raw.dtypes.to_dict()) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw) + + # SERIES TESTS + series = md.Series(raw.col1, chunk_size=33) + + # test one file with series + path = os.path.join(base_path, "out.csv") + series.to_csv(path).execute() + + result = pd.read_csv(path, dtype=raw.dtypes.to_dict()) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw.col1.to_frame()) + + # test multi files with series + path = os.path.join(base_path, "out-*.csv") + series.to_csv(path).execute() + + dfs = [ + pd.read_csv( + os.path.join(base_path, f"out-{i}.csv"), dtype=raw.dtypes.to_dict() + ) + for i in range(4) + ] + result = pd.concat(dfs, axis=0) + result.set_index("index", inplace=True) + pd.testing.assert_frame_equal(result, raw.col1.to_frame()) + pd.testing.assert_frame_equal( + dfs[1].set_index("index"), raw.col1.to_frame().iloc[33:66] + ) + + +@pytest.mark.skipif(sqlalchemy is None, reason="sqlalchemy not installed") +def test_to_sql(): + index = pd.RangeIndex(100, 0, -1, name="index") + raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.random.choice(["a", "b", "c"], (100,)), + "col3": np.arange(100).astype("int64"), + }, + index=index, + ) + + with tempfile.TemporaryDirectory() as d: + table_name1 = "test_table" + table_name2 = "test_table2" + uri = "sqlite:///" + os.path.join(d, "test.db") + + engine = sqlalchemy.create_engine(uri) + + # test write dataframe + df = DataFrame(raw, chunk_size=33) + df.to_sql(table_name1, con=engine).execute() + + written = pd.read_sql(table_name1, con=engine, index_col="index").sort_index( + ascending=False + ) + pd.testing.assert_frame_equal(raw, written) + + # test write with existing table + with pytest.raises(ValueError): + df.to_sql(table_name1, con=uri).execute() + + # test write series + series = md.Series(raw.col1, chunk_size=33) + with engine.connect() as conn: + series.to_sql(table_name2, con=conn).execute() + + written = pd.read_sql(table_name2, con=engine, index_col="index").sort_index( + ascending=False + ) + pd.testing.assert_frame_equal(raw.col1.to_frame(), written) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +@flaky(max_runs=3) +def test_to_parquet_arrow_execution(setup): + 
raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.arange(100), + "col3": np.random.choice(["a", "b", "c"], (100,)), + } + ) + df = DataFrame(raw, chunk_size=33) + + with tempfile.TemporaryDirectory() as base_path: + # DATAFRAME TESTS + path = os.path.join(base_path, "out-*.parquet") + df.to_parquet(path).execute() + + read_df = md.read_parquet(path) + result = read_df.execute().fetch() + result = result.sort_index() + pd.testing.assert_frame_equal(result, raw) + + # test read_parquet then to_parquet + read_df = md.read_parquet(path) + read_df.to_parquet(path).execute() + + # test partition_cols + path = os.path.join(base_path, "out-partitioned") + df.to_parquet(path, partition_cols=["col3"]).execute() + + read_df = md.read_parquet(path) + result = read_df.execute().fetch() + result["col3"] = result["col3"].astype("object") + pd.testing.assert_frame_equal( + result.sort_values("col1").reset_index(drop=True), + raw.sort_values("col1").reset_index(drop=True), + ) + + +@pytest.mark.skipif(fastparquet is None, reason="fastparquet not installed") +def test_to_parquet_fast_parquet_execution(): + raw = pd.DataFrame( + { + "col1": np.random.rand(100), + "col2": np.arange(100), + "col3": np.random.choice(["a", "b", "c"], (100,)), + } + ) + df = DataFrame(raw, chunk_size=33) + + with tempfile.TemporaryDirectory() as base_path: + # test fastparquet + path = os.path.join(base_path, "out-fastparquet-*.parquet") + df.to_parquet(path, engine="fastparquet", compression="gzip").execute() + + +@pytest.mark.skipif(vineyard is None, reason="vineyard not installed") +def test_vineyard_execution(setup): + raw = np.random.RandomState(0).rand(55, 55) + + extra_config = { + "check_dtype": False, + "check_nsplits": False, + "check_shape": False, + "check_dtypes": False, + "check_columns_value": False, + "check_index_value": False, + } + + with vineyard.deploy.local.start_vineyardd() as (_, vineyard_socket, _): + raw = pd.DataFrame({"a": np.arange(0, 55), "b": np.arange(55, 110)}) + a = md.DataFrame(raw, chunk_size=15) + a.execute() # n.b.: pre-execute + + b = a.to_vineyard(vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0][0] + + c = md.from_vineyard(object_id, vineyard_socket=vineyard_socket) + df = c.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(df, raw) + + raw = pd.DataFrame({"a": np.arange(0, 55), "b": np.arange(55, 110)}) + a = md.DataFrame(raw, chunk_size=15) # n.b.: no pre-execute + + b = a.to_vineyard(vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0][0] + + c = md.from_vineyard(object_id, vineyard_socket=vineyard_socket) + df = c.execute(extra_config=extra_config).fetch() + pd.testing.assert_frame_equal(df, raw) diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py new file mode 100644 index 000000000..47e794d2c --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from ....tests.core import require_hadoop + +TEST_DIR = "/tmp/test" + + +@require_hadoop +@pytest.fixture(scope="module") +def setup_hdfs(): + import pyarrow + + hdfs = pyarrow.hdfs.connect(host="localhost", port=8020) + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + + yield hdfs + + if hdfs.exists(TEST_DIR): + hdfs.rm(TEST_DIR, recursive=True) + + +@require_hadoop +def test_to_parquet_execution(setup, setup_hdfs): + hdfs = setup_hdfs + + test_df = pd.DataFrame( + { + "a": np.arange(10).astype(np.int64, copy=False), + "b": [f"s{i}" for i in range(10)], + "c": np.random.rand(10), + } + ) + df = md.DataFrame(test_df, chunk_size=5) + + dir_name = f"hdfs://localhost:8020{TEST_DIR}/test_to_parquet/" + hdfs.mkdir(dir_name) + df.to_parquet(dir_name).execute() + + result = md.read_parquet(dir_name).to_pandas() + pd.testing.assert_frame_equal(result.reset_index(drop=True), test_df) + + # test wildcard + dir_name = f"hdfs://localhost:8020{TEST_DIR}/test_to_parquet2/*.parquet" + hdfs.mkdir(dir_name.rsplit("/", 1)[0]) + df.to_parquet(dir_name).execute() + + result = md.read_parquet(dir_name.rsplit("/", 1)[0]).to_pandas() + pd.testing.assert_frame_equal(result.reset_index(drop=True), test_df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_csv.py b/python/xorbits/_mars/dataframe/datastore/to_csv.py new file mode 100644 index 000000000..3bd4500c2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_csv.py @@ -0,0 +1,601 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from io import StringIO + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...lib.filesystem import open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + KeyField, + ListField, + StringField, +) +from ...tensor.core import TensorOrder +from ...tensor.operands import TensorOperand, TensorOperandMixin +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class DataFrameToCSV(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.TO_CSV + + _input = KeyField("input") + _path = AnyField("path") + _sep = StringField("sep") + _na_rep = StringField("na_rep") + _float_format = StringField("float_format") + _columns = ListField("columns") + _header = AnyField("header") + _index = BoolField("index") + _index_label = AnyField("index_label") + _mode = StringField("mode") + _encoding = StringField("encoding") + _compression = AnyField("compression") + _quoting = Int32Field("quoting") + _quotechar = StringField("quotechar") + _line_terminator = StringField("line_terminator") + _chunksize = Int64Field("chunksize") + _date_format = StringField("date_format") + _doublequote = BoolField("doublequote") + _escapechar = StringField("escapechar") + _decimal = StringField("decimal") + _storage_options = DictField("storage_options") + # for chunk + _output_stat = BoolField("output_stat") + + def __init__( + self, + path=None, + sep=None, + na_rep=None, + float_format=None, + columns=None, + header=None, + index=None, + index_label=None, + mode=None, + encoding=None, + compression=None, + quoting=None, + quotechar=None, + line_terminator=None, + chunksize=None, + date_format=None, + doublequote=None, + escapechar=None, + decimal=None, + output_stat=None, + storage_options=None, + output_types=None, + **kw + ): + super().__init__( + _path=path, + _sep=sep, + _na_rep=na_rep, + _float_format=float_format, + _columns=columns, + _header=header, + _index=index, + _index_label=index_label, + _mode=mode, + _encoding=encoding, + _compression=compression, + _quoting=quoting, + _quotechar=quotechar, + _line_terminator=line_terminator, + _chunksize=chunksize, + _date_format=date_format, + _doublequote=doublequote, + _escapechar=escapechar, + _decimal=decimal, + _output_stat=output_stat, + _storage_options=storage_options, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def path(self): + return self._path + + @property + def sep(self): + return self._sep + + @property + def na_rep(self): + return self._na_rep + + @property + def float_format(self): + return self._float_format + + @property + def columns(self): + return self._columns + + @property + def header(self): + return self._header + + @property + def index(self): + return self._index + + @property + def index_label(self): + return self._index_label + + @property + def mode(self): + return self._mode + + @property + def encoding(self): + return self._encoding + + @property + def compression(self): + return self._compression + + @property + def quoting(self): + return self._quoting + + @property + def quotechar(self): + return self._quotechar + + @property + def line_terminator(self): + return self._line_terminator + + @property + def chunksize(self): + return self._chunksize + + @property + def date_format(self): + return self._date_format + + @property + def doublequote(self): + return self._doublequote + + @property + def 
escapechar(self): + return self._escapechar + + @property + def decimal(self): + return self._decimal + + @property + def storage_options(self): + return self._storage_options + + @property + def one_file(self): + # if wildcard in path, write csv into multiple files + return "*" not in self._path + + @property + def output_stat(self): + return self._output_stat + + @property + def output_limit(self): + return 1 if not self.output_stat else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op: "DataFrameToCSV"): + in_df = op.input + out_df = op.outputs[0] + + if in_df.ndim == 2 and in_df.chunk_shape[1] > 1: + # make sure only 1 chunk on the column axis + in_df = yield from recursive_tile(in_df.rechunk({1: in_df.shape[1]})) + + one_file = op.one_file + + out_chunks = [], [] + for chunk in in_df.chunks: + chunk_op = op.copy().reset_key() + if not one_file: + index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk) + if chunk.ndim == 2: + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(0, 0), + index_value=index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + index=chunk.index, + ) + else: + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(0,), + index_value=index_value, + dtype=out_df.dtype, + index=chunk.index, + ) + out_chunks[0].append(out_chunk) + else: + chunk_op._output_stat = True + chunk_op.stage = OperandStage.map + chunk_op.output_types = [OutputType.scalar] * 2 + # bytes of csv + kws = [ + { + "shape": (), + "dtype": np.dtype(np.str_), + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "output_type": OutputType.scalar, + "type": "csv", + }, + { + "shape": (), + "dtype": np.dtype(np.intp), + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "output_type": OutputType.scalar, + "type": "stat", + }, + ] + chunks = chunk_op.new_chunks([chunk], kws=kws, output_limit=len(kws)) + out_chunks[0].append(chunks[0]) + out_chunks[1].append(chunks[1]) + + if not one_file: + out_chunks = out_chunks[0] + else: + stat_chunk = DataFrameToCSVStat( + path=op.path, + dtype=np.dtype(np.int64), + storage_options=op.storage_options, + ).new_chunk( + out_chunks[1], shape=(len(out_chunks[0]),), order=TensorOrder.C_ORDER + ) + new_out_chunks = [] + for c in out_chunks[0]: + op = DataFrameToCSV( + stage=OperandStage.agg, + path=op.path, + storage_options=op.storage_options, + output_types=op.output_types, + ) + if out_df.ndim == 2: + out_chunk = op.new_chunk( + [c, stat_chunk], + shape=(0, 0), + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + index=c.index, + ) + else: + out_chunk = op.new_chunk( + [c, stat_chunk], + shape=(0,), + dtype=out_df.dtype, + index_value=out_df.index_value, + index=c.index, + ) + new_out_chunks.append(out_chunk) + out_chunks = new_out_chunks + + new_op = op.copy() + params = out_df.params.copy() + if out_df.ndim == 2: + params.update( + dict(chunks=out_chunks, nsplits=((0,) * in_df.chunk_shape[0], (0,))) + ) + else: + params.update( + dict(chunks=out_chunks, nsplits=((0,) * in_df.chunk_shape[0],)) + ) + return new_op.new_tileables([in_df], **params) + + def __call__(self, df): + index_value = parse_index(df.index_value.to_pandas()[:0], df) + if df.ndim == 2: + columns_value = parse_index( + df.columns_value.to_pandas()[:0], store_data=True + ) + return self.new_dataframe( + [df], + shape=(0, 0), + dtypes=df.dtypes[:0], + index_value=index_value, + columns_value=columns_value, + ) + else: + return 
self.new_series( + [df], shape=(0,), dtype=df.dtype, index_value=index_value + ) + + @classmethod + def _to_csv(cls, op, df, path, header=None): + if header is None: + header = op.header + df.to_csv( + path, + sep=op.sep, + na_rep=op.na_rep, + float_format=op.float_format, + columns=op.columns, + header=header, + index=op.index, + index_label=op.index_label, + mode=op.mode, + encoding=op.encoding, + compression=op.compression, + quoting=op.quoting, + quotechar=op.quotechar, + line_terminator=op.line_terminator, + chunksize=op.chunksize, + date_format=op.date_format, + doublequote=op.doublequote, + escapechar=op.escapechar, + decimal=op.decimal, + ) + + @classmethod + def _execute_map(cls, ctx, op): + out = op.outputs[0] + + df = ctx[op.input.key] + sio = StringIO() + header = op.header if out.index[0] == 0 else False + # do not output header if index of chunk > 0 + cls._to_csv(op, df, sio, header=header) + + ret = sio.getvalue().encode(op.encoding or "utf-8") + ctx[op.outputs[0].key] = ret + ctx[op.outputs[1].key] = len(ret) + + @classmethod + def _execute_agg(cls, ctx, op): + out = op.outputs[0] + i = out.index[0] + path = cls._get_path(op.path, i) + + csv_bytes, offsets = [ctx[inp.key] for inp in op.inputs] + offset_start = offsets[i] + + # write csv bytes into file + with open_file(path, mode="r+b", storage_options=op.storage_options) as f: + f.seek(offset_start) + f.write(csv_bytes) + + ctx[out.key] = ( + pd.DataFrame() if out.ndim == 2 else pd.Series([], dtype=out.dtype) + ) + + @classmethod + def _get_path(cls, path, i): + if "*" not in path: + return path + return path.replace("*", str(i)) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_agg(ctx, op) + else: + assert op.stage is None + df = ctx[op.input.key] + out = op.outputs[0] + path = cls._get_path(op.path, op.outputs[0].index[0]) + with open_file(path, mode="w", storage_options=op.storage_options) as f: + cls._to_csv(op, df, f) + ctx[out.key] = ( + pd.DataFrame() if out.ndim == 2 else pd.Series([], dtype=out.dtype) + ) + + +class DataFrameToCSVStat(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TO_CSV_STAT + + _path = AnyField("path") + _storage_options = DictField("storage_options") + + def __init__(self, path=None, storage_options=None, dtype=None, **kw): + super().__init__( + _path=path, _storage_options=storage_options, dtype=dtype, **kw + ) + + @property + def path(self): + return self._path + + @property + def storage_options(self): + return self._storage_options + + @classmethod + def execute(cls, ctx, op): + sizes = [ctx[inp.key] for inp in op.inputs] + total_bytes = sum(sizes) + offsets = np.cumsum([0] + sizes)[:-1] + + # write NULL bytes into file + with open_file(op.path, mode="wb", storage_options=op.storage_options) as f: + rest = total_bytes + while rest > 0: + # at most 4M + write_bytes = min(4 * 1024**2, rest) + f.write(b"\00" * write_bytes) + rest -= write_bytes + + ctx[op.outputs[0].key] = offsets + + +def to_csv( + df, + path, + sep=",", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression="infer", + quoting=None, + quotechar='"', + line_terminator=None, + chunksize=None, + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + storage_options=None, +): + r""" + Write object to a comma-separated values (csv) file. + + Parameters + ---------- + path : str + File path. 
+ If path is a string with wildcard e.g. '/to/path/out-*.csv', + to_csv will try to write multiple files, for instance, + chunk (0, 0) will write data into '/to/path/out-0.csv'. + If path is a string without wildcard, + all data will be written into a single file. + sep : str, default ',' + String of length 1. Field delimiter for the output file. + na_rep : str, default '' + Missing data representation. + float_format : str, default None + Format string for floating point numbers. + columns : sequence, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of strings is given it is + assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + index_label : str or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the object uses MultiIndex. If + False do not print fields for index names. Use index_label=False + for easier importing in R. + mode : str + Python write mode, default 'w'. + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'utf-8'. + compression : str or dict, default 'infer' + If str, represents compression mode. If dict, value at 'method' is + the compression mode. Compression mode may be any of the following + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + compression mode is 'infer' and `path_or_buf` is path-like, then + detect compression mode from the following extensions: '.gz', + '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given + and mode is 'zip' or inferred as 'zip', other entries passed as + additional compression options. + quoting : optional constant from csv module + Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC + will treat them as non-numeric. + quotechar : str, default '\"' + String of length 1. Character used to quote fields. + line_terminator : str, optional + The newline character or character sequence to use in the output + file. Defaults to `os.linesep`, which depends on the OS in which + this method is called ('\n' for linux, '\r\n' for Windows, i.e.). + chunksize : int or None + Rows to write at a time. + date_format : str, default None + Format string for datetime objects. + doublequote : bool, default True + Control quoting of `quotechar` inside a field. + escapechar : str, default None + String of length 1. Character used to escape `sep` and `quotechar` + when appropriate. + decimal : str, default '.' + Character recognized as decimal separator. E.g. use ',' for + European data. + Returns + ------- + None or str + If path_or_buf is None, returns the resulting csv format as a + string. Otherwise returns None. + + See Also + -------- + read_csv : Load a CSV file into a DataFrame. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 
'weapon': ['sai', 'bo staff']}) + >>> df.to_csv('out.csv', index=False).execute() + """ + + if mode != "w": # pragma: no cover + raise NotImplementedError("only support to_csv with mode 'w' for now") + op = DataFrameToCSV( + path=path, + sep=sep, + na_rep=na_rep, + float_format=float_format, + columns=columns, + header=header, + index=index, + index_label=index_label, + mode=mode, + encoding=encoding, + compression=compression, + quoting=quoting, + quotechar=quotechar, + line_terminator=line_terminator, + chunksize=chunksize, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + decimal=decimal, + storage_options=storage_options, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_parquet.py b/python/xorbits/_mars/dataframe/datastore/to_parquet.py new file mode 100644 index 000000000..e9793d82b --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_parquet.py @@ -0,0 +1,282 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...lib.filesystem import get_fs, open_file +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + KeyField, + ListField, + StringField, +) +from ...utils import has_unknown_shape +from ..datasource.read_parquet import check_engine +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +try: + import pyarrow as pa + import pyarrow.parquet as pq +except ImportError: # pragma: no cover + pq = None + pa = None + + +class DataFrameToParquet(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.TO_PARQUET + + _input = KeyField("input") + _path = AnyField("path") + _engine = StringField("engine") + _index = BoolField("index") + _compression = AnyField("compression") + _partition_cols = ListField("partition_cols") + _additional_kwargs = DictField("additional_kwargs") + _storage_options = DictField("storage_options") + + def __init__( + self, + path=None, + engine=None, + index=None, + compression=None, + partition_cols=None, + storage_options=None, + additional_kwargs=None, + **kw, + ): + super().__init__( + _path=path, + _engine=engine, + _index=index, + _compression=compression, + _partition_cols=partition_cols, + _storage_options=storage_options, + _additional_kwargs=additional_kwargs, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def path(self): + return self._path + + @property + def engine(self): + return self._engine + + @property + def index(self): + return self._index + + @property + def compression(self): + return self._compression + + @property + def partition_cols(self): + return self._partition_cols + + @property + def storage_options(self): + return self._storage_options + + @property + def additional_kwargs(self): + return self._additional_kwargs + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + 
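A minimal standalone sketch of the single-file write protocol implemented by the to_csv operands above (illustrative only; the chunk bytes, sizes and output file name are made up). The map stage serializes each chunk to CSV bytes and reports its size, the stat operand pre-allocates the target file and turns the sizes into start offsets with a cumulative sum, and each agg chunk then seeks to its own offset and writes its bytes independently:

    import numpy as np

    # per-chunk CSV bytes produced by the map stage (header only in the first chunk)
    chunk_bytes = [b"a,b\n1,2\n", b"3,4\n", b"5,6\n"]
    sizes = [len(b) for b in chunk_bytes]

    # "stat" step: derive start offsets and pre-allocate the file with NUL bytes
    offsets = np.cumsum([0] + sizes)[:-1]
    with open("out.csv", "wb") as f:
        f.write(b"\x00" * sum(sizes))

    # "agg" step: every chunk seeks to its offset and writes its own bytes
    for data, offset in zip(chunk_bytes, offsets):
        with open("out.csv", "r+b") as f:
            f.seek(int(offset))
            f.write(data)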
@classmethod + def _get_path(cls, path, i): + if "*" not in path: + return path + return path.replace("*", str(i)) + + @classmethod + def tile(cls, op): + in_df = op.input + out_df = op.outputs[0] + + # make sure only 1 chunk on the column axis + if in_df.chunk_shape[1] > 1: + if has_unknown_shape(in_df): + yield + in_df = yield from recursive_tile(in_df.rechunk({1: in_df.shape[1]})) + + out_chunks = [] + for chunk in in_df.chunks: + chunk_op = op.copy().reset_key() + index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk) + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(0, 0), + index_value=index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + index=chunk.index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out_df.params.copy() + params.update( + dict(chunks=out_chunks, nsplits=((0,) * in_df.chunk_shape[0], (0,))) + ) + return new_op.new_tileables([in_df], **params) + + @classmethod + def execute(cls, ctx, op): + df = ctx[op.input.key] + out = op.outputs[0] + i = op.outputs[0].index[0] + path = op.path + has_wildcard = False + if "*" in path: + path = path.replace("*", str(i)) + has_wildcard = True + + if op.partition_cols is None: + if not has_wildcard: + fs = get_fs(path, op.storage_options) + path = fs.pathsep.join([path.rstrip(fs.pathsep), f"{i}.parquet"]) + if op.engine == "fastparquet": + df.to_parquet( + path, + engine=op.engine, + compression=op.compression, + index=op.index, + open_with=open_file, + **op.additional_kwargs, + ) + else: + with open_file( + path, mode="wb", storage_options=op.storage_options + ) as f: + df.to_parquet( + f, + engine=op.engine, + compression=op.compression, + index=op.index, + **op.additional_kwargs or dict(), + ) + else: + if op.engine == "pyarrow": + pq.write_to_dataset( + pa.Table.from_pandas(df), path, partition_cols=op.partition_cols + ) + else: # pragma: no cover + raise NotImplementedError( + "Only support pyarrow engine when specify `partition_cols`." + ) + + ctx[out.key] = pd.DataFrame() + + def __call__(self, df): + index_value = parse_index(df.index_value.to_pandas()[:0], df) + columns_value = parse_index(df.columns_value.to_pandas()[:0], store_data=True) + return self.new_dataframe( + [df], + shape=(0, 0), + dtypes=df.dtypes[:0], + index_value=index_value, + columns_value=columns_value, + ) + + +def to_parquet( + df, + path, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + storage_options: dict = None, + **kwargs, +): + """ + Write a DataFrame to the binary parquet format, each chunk will be + written to a Parquet file. + + Parameters + ---------- + path : str or file-like object + If path is a string with wildcard e.g. '/to/path/out-*.parquet', + `to_parquet` will try to write multiple files, for instance, + chunk (0, 0) will write data into '/to/path/out-0.parquet'. + If path is a string without wildcard, we will treat it as a directory. + + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + Parquet library to use. The default behavior is to try 'pyarrow', + falling back to 'fastparquet' if 'pyarrow' is unavailable. + + compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. + If ``None``, similar to ``True`` the dataframe's index(es) + will be saved. 
However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + + partition_cols : list, optional, default None + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + Must be None if path is not a string. + + **kwargs + Additional arguments passed to the parquet library. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_parquet('*.parquet.gzip', + ... compression='gzip').execute() # doctest: +SKIP + >>> md.read_parquet('*.parquet.gzip').execute() # doctest: +SKIP + col1 col2 + 0 1 3 + 1 2 4 + + >>> import io + >>> f = io.BytesIO() + >>> df.to_parquet(f).execute() + >>> f.seek(0) + 0 + >>> content = f.read() + """ + engine = check_engine(engine) + op = DataFrameToParquet( + path=path, + engine=engine, + compression=compression, + index=index, + partition_cols=partition_cols, + storage_options=storage_options, + additional_kwargs=kwargs, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_sql.py b/python/xorbits/_mars/dataframe/datastore/to_sql.py new file mode 100644 index 000000000..00d4e33aa --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_sql.py @@ -0,0 +1,352 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cloudpickle +import pandas as pd + +from ... 
import opcodes +from ...core import recursive_tile +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + BytesField, + Int64Field, + StringField, +) +from ..core import DATAFRAME_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_empty_df, + build_empty_series, + create_sa_connection, + parse_index, +) + + +class DataFrameToSQLTableLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + fields_to_tokenize = [ + getattr(self, k, None) + for k in [ + "table_name", + "schema", + "if_exists", + "index", + "index_label", + "chunksize", + "dtype", + "method", + ] + ] + return super()._get_logic_key_token_values() + fields_to_tokenize + + +class DataFrameToSQLTable( + DataFrameOperand, DataFrameOperandMixin, DataFrameToSQLTableLogicKeyGeneratorMixin +): + _op_type_ = opcodes.TO_SQL + + table_name = StringField("table_name") + con = AnyField("con") + schema = StringField("schema") + if_exists = StringField("if_exists") + index = BoolField("index") + index_label = AnyField("index_label") + chunksize = Int64Field("chunksize") + dtype = AnyField("dtype") + method = AnyField("method") + engine_kwargs = BytesField( + "engine_kwargs", + on_serialize=cloudpickle.dumps, + on_deserialize=cloudpickle.loads, + default=None, + ) + + def __call__(self, df_or_series): + with create_sa_connection(self.con, **(self.engine_kwargs or dict())) as con: + self.con = str(con.engine.url) + empty_index = df_or_series.index_value.to_pandas()[:0] + if isinstance(df_or_series, DATAFRAME_TYPE): + empty_obj = build_empty_df(df_or_series.dtypes, index=empty_index) + else: + empty_obj = build_empty_series( + df_or_series.dtype, index=empty_index, name=df_or_series.name + ) + + empty_obj.to_sql( + self.table_name, + con=con, + schema=self.schema, + if_exists=self.if_exists, + index=self.index, + index_label=self.index_label, + dtype=self.dtype, + ) + + index_value = parse_index( + df_or_series.index_value.to_pandas()[:0], df_or_series.key, "index" + ) + if isinstance(df_or_series, DATAFRAME_TYPE): + columns_value = parse_index( + df_or_series.columns_value.to_pandas()[:0], + df_or_series.key, + "columns", + store_data=True, + ) + return self.new_dataframe( + [df_or_series], + shape=(0, 0), + dtypes=df_or_series.dtypes[:0], + index_value=index_value, + columns_value=columns_value, + ) + else: + return self.new_series( + [df_or_series], + shape=(0,), + dtype=df_or_series.dtype, + index_value=index_value, + ) + + @classmethod + def tile(cls, op: "DataFrameToSQLTable"): + inp = op.inputs[0] + out = op.outputs[0] + if inp.ndim == 2: + inp = yield from recursive_tile(inp.rechunk({1: (inp.shape[1],)})) + + chunks = [] + for c in inp.chunks: + new_op = op.copy().reset_key() + new_op.if_exists = "append" + + index_value = parse_index(c.index_value.to_pandas()[:0], c) + if c.ndim == 2: + columns_value = parse_index( + c.columns_value.to_pandas()[:0], store_data=True + ) + chunks.append( + new_op.new_chunk( + [c], + shape=(0, 0), + index=c.index, + dtypes=out.dtypes, + index_value=index_value, + columns_value=columns_value, + ) + ) + else: + chunks.append( + new_op.new_chunk( + [c], + shape=(0,), + index=c.index, + dtype=out.dtype, + index_value=index_value, + ) + ) + + new_op = op.copy().reset_key() + params = out.params.copy() + params["nsplits"] = tuple((0,) * len(sp) for sp in inp.nsplits) + return new_op.new_tileables([inp], chunks=chunks, **params) + + @classmethod 
+ def execute(cls, ctx, op: "DataFrameToSQLTable"): + in_df = op.inputs[0] + out_df = op.outputs[0] + in_data = ctx[in_df.key] + + import sqlalchemy as sa + + engine = sa.create_engine(op.con, **(op.engine_kwargs or dict())) + + try: + with engine.connect() as connection: + with connection.begin(): + in_data.to_sql( + op.table_name, + con=connection, + if_exists=op.if_exists, + index=op.index, + index_label=op.index_label, + chunksize=op.chunksize, + dtype=op.dtype, + method=op.method, + ) + finally: + engine.dispose() + + if in_df.ndim == 2: + ctx[out_df.key] = pd.DataFrame() + else: + ctx[out_df.key] = pd.Series([], dtype=in_data.dtype) + + +def to_sql( + df, + name: str, + con, + schema=None, + if_exists: str = "fail", + index: bool = True, + index_label=None, + chunksize=None, + dtype=None, + method=None, +): + """ + Write records stored in a DataFrame to a SQL database. + + Databases supported by SQLAlchemy [1]_ are supported. Tables can be + newly created, appended to, or overwritten. + + Parameters + ---------- + name : str + Name of SQL table. + con : sqlalchemy.engine.Engine or sqlite3.Connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. Legacy support is provided for sqlite3.Connection objects. The user + is responsible for engine disposal and connection closure for the SQLAlchemy + connectable See `here `_ + + schema : str, optional + Specify the schema (if database flavor supports this). If None, use + default schema. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + How to behave if the table already exists. + + * fail: Raise a ValueError. + * replace: Drop the table before inserting new values. + * append: Insert new values to the existing table. + + index : bool, default True + Write DataFrame index as a column. Uses `index_label` as the column + name in the table. + index_label : str or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 legacy mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + + Raises + ------ + ValueError + When the table already exists and `if_exists` is 'fail' (the + default). + + See Also + -------- + read_sql : Read a DataFrame from a table. + + Notes + ----- + Timezone aware datetime columns will be written as + ``Timestamp with timezone`` type with SQLAlchemy if supported by the + database. Otherwise, the datetimes will be stored as timezone unaware + timestamps local to the original timezone. + + .. versionadded:: 0.24.0 + + References + ---------- + .. [1] http://docs.sqlalchemy.org + .. 
[2] https://www.python.org/dev/peps/pep-0249/ + + Examples + -------- + + Create an in-memory SQLite database. + + >>> import mars.dataframe as md + >>> from sqlalchemy import create_engine + >>> engine = create_engine('sqlite:////tmp/temp.db') + + Create a table from scratch with 3 rows. + + >>> df = md.DataFrame({'name' : ['User 1', 'User 2', 'User 3']}) + >>> df.execute() + name + 0 User 1 + 1 User 2 + 2 User 3 + + >>> df.to_sql('users', con=engine).execute() + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] + + >>> df1 = md.DataFrame({'name' : ['User 4', 'User 5']}) + >>> df1.to_sql('users', con=engine, if_exists='append').execute() + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'), + (0, 'User 4'), (1, 'User 5')] + + Overwrite the table with just ``df1``. + + >>> df1.to_sql('users', con=engine, if_exists='replace', + ... index_label='id').execute() + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 4'), (1, 'User 5')] + + Specify the dtype (especially useful for integers with missing values). + Notice that while pandas is forced to store the data as floating point, + the database supports nullable integers. When fetching the data with + Python, we get back integer scalars. + + >>> df = md.DataFrame({"A": [1, None, 2]}) + >>> df.execute() + A + 0 1.0 + 1 NaN + 2 2.0 + + >>> from sqlalchemy.types import Integer + >>> df.to_sql('integers', con=engine, index=False, + ... dtype={"A": Integer()}).execute() + + >>> engine.execute("SELECT * FROM integers").fetchall() + [(1,), (None,), (2,)] + """ + op = DataFrameToSQLTable( + table_name=name, + con=con, + schema=schema, + if_exists=if_exists, + index=index, + index_label=index_label, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/datastore/to_vineyard.py b/python/xorbits/_mars/dataframe/datastore/to_vineyard.py new file mode 100644 index 000000000..25236caa1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/datastore/to_vineyard.py @@ -0,0 +1,192 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import FieldTypes, StringField, TupleField +from ...tensor.datastore.to_vineyard import resolve_vineyard_socket +from ...utils import lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +vineyard = lazy_import("vineyard") +vy_data_df = lazy_import("vineyard.data.dataframe", rename="vy_data_df") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +class DataFrameToVineyardChunk(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_STORE_VINEYARD_CHUNK + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # a dummy attr to make sure ops have different keys + operator_index = TupleField("operator_index", FieldTypes.int32) + + def __init__(self, vineyard_socket=None, dtypes=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, + _dtypes=dtypes, + _output_types=[OutputType.dataframe], + **kw + ) + + def __call__(self, df): + return self.new_dataframe( + [df], + shape=(0, 0), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + dtypes = pd.Series([np.dtype("O")], index=pd.Index([0])) + merge_op = DataFrameToVinyardStoreMeta( + vineyard_socket=op.vineyard_socket, + chunk_shape=op.inputs[0].chunk_shape, + shape=(1, 1), + dtypes=dtypes, + ) + return merge_op.new_chunks( + out_chunks, shape=(1, 1), dtypes=dtypes, index=(0, 0) + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + dtypes = pd.Series([np.dtype("O")], index=pd.Index([0])) + for idx, chunk in enumerate(op.inputs[0].chunks): + chunk_op = op.copy().reset_key() + chunk_op.operator_index = chunk.index + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(1, 1), + dtypes=dtypes, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + index=(idx, 0), + ) + out_chunks.append(out_chunk) + out_chunks = cls._process_out_chunks(op, out_chunks) + + in_df = op.inputs[0] + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=(len(out_chunks), 1), + dtypes=dtypes, + index_value=in_df.index_value, + columns_value=in_df.columns_value, + chunks=out_chunks, + nsplits=((np.prod(op.inputs[0].chunk_shape),),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket, needs_put = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # some op might be fused and executed twice on different workers + if not needs_put: + # might be fused + try: # pragma: no cover + meta = ctx.get_chunks_meta([op.inputs[0].key])[0] + df_id = vineyard.ObjectID(meta["object_ref"]) + if not client.exists(df_id): + needs_put = True + except KeyError: + needs_put = True + if needs_put: + df_id = client.put( + ctx[op.inputs[0].key], partition_index=op.inputs[0].index + ) + else: # pragma: no cover + meta = client.get_meta(df_id) + new_meta = vineyard.ObjectMeta() + for k, v in meta.items(): + if k not in ["id", "signature", "instance_id"]: + if isinstance(v, vineyard.ObjectMeta): + new_meta.add_member(k, v) + else: + new_meta[k] = v + new_meta["partition_index_"] = vy_data_utils.to_json(op.inputs[0].index) + df_id = client.create_metadata(new_meta).id + + client.persist(df_id) + ctx[op.outputs[0].key] = pd.DataFrame({0: [df_id]}) + + +class DataFrameToVinyardStoreMeta(DataFrameOperand, 
DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_STORE_VINEYARD_META + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + def __init__(self, vineyard_socket=None, dtypes=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, + dtypes=dtypes, + _output_types=[OutputType.dataframe], + **kw + ) + + @classmethod + def tile(cls, op): + dtypes = pd.Series([np.dtype("O")], index=pd.Index([0])) + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + op.inputs[0].chunks, + shape=(1, 1), + dtypes=dtypes, + index_value=parse_index(pd.Index([-1])), + columns_value=parse_index(pd.Index([0])), + index=(0, 0), + ) + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + shape=(1, 1), + dtypes=dtypes, + index_value=parse_index(pd.Index([0])), + columns_value=parse_index(pd.Index([0])), + chunks=[out_chunk], + nsplits=((1,), (1,)), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket, _ = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # # store the result object id to execution context + chunks = [ctx[chunk.key][0][0] for chunk in op.inputs] + ctx[op.outputs[0].key] = pd.DataFrame( + {0: [vy_data_df.make_global_dataframe(client, chunks).id]} + ) + + +def to_vineyard(df, vineyard_socket=None): + op = DataFrameToVineyardChunk(vineyard_socket=vineyard_socket) + return op(df) diff --git a/python/xorbits/_mars/dataframe/fetch/__init__.py b/python/xorbits/_mars/dataframe/fetch/__init__.py new file mode 100644 index 000000000..358b547f5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/fetch/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import DataFrameFetch, DataFrameFetchShuffle diff --git a/python/xorbits/_mars/dataframe/fetch/core.py b/python/xorbits/_mars/dataframe/fetch/core.py new file mode 100644 index 000000000..2557abc9e --- /dev/null +++ b/python/xorbits/_mars/dataframe/fetch/core.py @@ -0,0 +1,94 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
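A minimal usage sketch for the vineyard store path above, assuming a vineyard instance is reachable at the given IPC socket (the socket path and data are made up; the result is a one-cell frame whose value is the global vineyard ObjectID of the persisted dataframe):

    import mars.dataframe as md

    df = md.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}, chunk_size=2)
    # store every chunk in vineyard, then fetch the 1x1 frame holding the global ObjectID
    result = df.to_vineyard(vineyard_socket="/tmp/vineyard.sock").execute().fetch()
    object_id = result.iloc[0, 0]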
+ +from ...core import OutputType, register_fetch_class +from ...core.operand import Fetch, FetchMixin, FetchShuffle +from ...serialization.serializables import FieldTypes, TupleField +from ...utils import on_deserialize_shape, on_serialize_shape +from ..operands import DataFrameOperandMixin + + +class DataFrameFetchMixin(DataFrameOperandMixin, FetchMixin): + __slots__ = () + + +class DataFrameFetch(Fetch, DataFrameFetchMixin): + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _extract_dataframe_or_series_kws(self, kws, **kw): + if kws is None: + kws = [kw] + new_kws = [] + new_output_types = [] + for output_type, kwargs in zip(self._output_types, kws): + if output_type == OutputType.df_or_series: + data_params = kwargs["data_params"] + data_type = kwargs["data_type"] + if data_type == "series": + new_output_types.append(OutputType.series) + else: + new_output_types.append(OutputType.dataframe) + new_kws.append(data_params) + else: + new_output_types.append(output_type) + new_kws.append(kwargs) + self._output_types = new_output_types + return new_kws + + def _new_chunks(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + if "_shape" in kw and self._shape is None: + self._shape = kw["_shape"] + new_kws = self._extract_dataframe_or_series_kws(kws, **kw) + return super()._new_chunks(inputs, kws=new_kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + new_kws = self._extract_dataframe_or_series_kws(kws, **kw) + return super()._new_tileables(inputs, kws=new_kws, **kw) + + +class DataFrameFetchShuffle(FetchShuffle, DataFrameFetchMixin): + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + +register_fetch_class(OutputType.dataframe, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class( + OutputType.dataframe_groupby, DataFrameFetch, DataFrameFetchShuffle +) +register_fetch_class(OutputType.df_or_series, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.series, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.series_groupby, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.index, DataFrameFetch, DataFrameFetchShuffle) +register_fetch_class(OutputType.categorical, DataFrameFetch, DataFrameFetchShuffle) diff --git a/python/xorbits/_mars/dataframe/groupby/__init__.py b/python/xorbits/_mars/dataframe/groupby/__init__.py new file mode 100644 index 000000000..21d22109b --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/__init__.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# noinspection PyUnresolvedReferences +from ..core import DataFrameGroupBy, GroupBy, SeriesGroupBy + + +def _install(): + from ..core import DATAFRAME_GROUPBY_TYPE, DATAFRAME_TYPE, GROUPBY_TYPE, SERIES_TYPE + from .aggregation import agg + from .apply import groupby_apply + from .core import groupby + from .cum import cumcount, cummax, cummin, cumprod, cumsum + from .fill import bfill, ffill, fillna + from .getitem import df_groupby_getitem + from .head import head + + # Just for enabling custom agg function registration. + # Therefore, del this immediately after import. + from .nunique import DataFrameCustomGroupByNuniqueMixin + from .sample import groupby_sample + from .transform import groupby_transform + + del DataFrameCustomGroupByNuniqueMixin + + for cls in DATAFRAME_TYPE: + setattr(cls, "groupby", groupby) + + for cls in SERIES_TYPE: + setattr(cls, "groupby", groupby) + + for cls in GROUPBY_TYPE: + setattr(cls, "agg", agg) + setattr(cls, "aggregate", agg) + + setattr(cls, "sum", lambda groupby, **kw: agg(groupby, "sum", **kw)) + setattr(cls, "prod", lambda groupby, **kw: agg(groupby, "prod", **kw)) + setattr(cls, "max", lambda groupby, **kw: agg(groupby, "max", **kw)) + setattr(cls, "min", lambda groupby, **kw: agg(groupby, "min", **kw)) + setattr(cls, "count", lambda groupby, **kw: agg(groupby, "count", **kw)) + setattr(cls, "size", lambda groupby, **kw: agg(groupby, "size", **kw)) + setattr(cls, "mean", lambda groupby, **kw: agg(groupby, "mean", **kw)) + setattr(cls, "var", lambda groupby, **kw: agg(groupby, "var", **kw)) + setattr(cls, "std", lambda groupby, **kw: agg(groupby, "std", **kw)) + setattr(cls, "all", lambda groupby, **kw: agg(groupby, "all", **kw)) + setattr(cls, "any", lambda groupby, **kw: agg(groupby, "any", **kw)) + setattr(cls, "skew", lambda groupby, **kw: agg(groupby, "skew", **kw)) + setattr(cls, "kurt", lambda groupby, **kw: agg(groupby, "kurt", **kw)) + setattr(cls, "kurtosis", lambda groupby, **kw: agg(groupby, "kurtosis", **kw)) + setattr(cls, "sem", lambda groupby, **kw: agg(groupby, "sem", **kw)) + setattr(cls, "nunique", lambda groupby, **kw: agg(groupby, "nunique", **kw)) + + setattr(cls, "apply", groupby_apply) + setattr(cls, "transform", groupby_transform) + + setattr(cls, "cumcount", cumcount) + setattr(cls, "cummin", cummin) + setattr(cls, "cummax", cummax) + setattr(cls, "cumprod", cumprod) + setattr(cls, "cumsum", cumsum) + + setattr(cls, "head", head) + + setattr(cls, "sample", groupby_sample) + + setattr(cls, "ffill", ffill) + setattr(cls, "bfill", bfill) + setattr(cls, "backfill", bfill) + setattr(cls, "fillna", fillna) + + for cls in DATAFRAME_GROUPBY_TYPE: + setattr(cls, "__getitem__", df_groupby_getitem) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/groupby/aggregation.py b/python/xorbits/_mars/dataframe/groupby/aggregation.py new file mode 100644 index 000000000..fd449d271 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/aggregation.py @@ -0,0 +1,1350 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +import logging +import uuid +from typing import Callable, Dict, List, Union + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ENTITY_TYPE, OutputType +from ...core.context import get_context +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...typing import ChunkType, TileableType +from ...utils import ( + enter_current_session, + estimate_pandas_size, + lazy_import, + pd_release_version, +) +from ..arrays import ArrowArray +from ..core import GROUPBY_TYPE +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin, DataFrameShuffleProxy +from ..reduction.aggregation import is_funcs_aggregate, normalize_reduction_funcs +from ..reduction.core import ReductionAggStep, ReductionCompiler, ReductionSteps +from ..utils import ( + build_concatenated_rows_frame, + concat_on_columns, + is_cudf, + parse_index, +) +from .core import DataFrameGroupByOperand +from .custom_aggregation import custom_agg_functions +from .sort import ( + DataFrameGroupbyConcatPivot, + DataFrameGroupbySortShuffle, + DataFramePSRSGroupbySample, +) + +cp = lazy_import("cupy", rename="cp") +cudf = lazy_import("cudf") + +logger = logging.getLogger(__name__) +CV_THRESHOLD = 0.2 +MEAN_RATIO_THRESHOLD = 2 / 3 +_support_get_group_without_as_index = pd_release_version[:2] > (1, 0) + + +class SizeRecorder: + def __init__(self): + self._raw_records = [] + self._agg_records = [] + + def record(self, raw_record: int, agg_record: int): + self._raw_records.append(raw_record) + self._agg_records.append(agg_record) + + def get(self): + return self._raw_records, self._agg_records + + +_agg_functions = { + "sum": lambda x: x.sum(), + "prod": lambda x: x.prod(), + "product": lambda x: x.product(), + "min": lambda x: x.min(), + "max": lambda x: x.max(), + "all": lambda x: x.all(), + "any": lambda x: x.any(), + "count": lambda x: x.count(), + "size": lambda x: x._reduction_size(), + "mean": lambda x: x.mean(), + "var": lambda x, ddof=1: x.var(ddof=ddof), + "std": lambda x, ddof=1: x.std(ddof=ddof), + "sem": lambda x, ddof=1: x.sem(ddof=ddof), + "skew": lambda x, bias=False: x.skew(bias=bias), + "kurt": lambda x, bias=False: x.kurt(bias=bias), + "kurtosis": lambda x, bias=False: x.kurtosis(bias=bias), + "nunique": lambda x: x.nunique(), +} +_series_col_name = "col_name" + + +def _patch_groupby_kurt(): + try: + from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy + + if not hasattr(DataFrameGroupBy, "kurt"): # pragma: no branch + + def _kurt_by_frame(a, *args, **kwargs): + data = a.to_frame().kurt(*args, **kwargs).iloc[0] + if is_cudf(data): # pragma: no cover + data = data.copy() + return data + + def _group_kurt(x, *args, **kwargs): + if kwargs.get("numeric_only") is not None: + return x.agg(functools.partial(_kurt_by_frame, *args, **kwargs)) + else: + return 
x.agg(functools.partial(pd.Series.kurt, *args, **kwargs)) + + DataFrameGroupBy.kurt = DataFrameGroupBy.kurtosis = _group_kurt + SeriesGroupBy.kurt = SeriesGroupBy.kurtosis = _group_kurt + except (AttributeError, ImportError): # pragma: no cover + pass + + +_patch_groupby_kurt() +del _patch_groupby_kurt + + +def build_mock_agg_result( + groupby: GROUPBY_TYPE, + groupby_params: Dict, + raw_func: Callable, + **raw_func_kw, +): + try: + agg_result = groupby.op.build_mock_groupby().aggregate(raw_func, **raw_func_kw) + except ValueError: + if ( + groupby_params.get("as_index") or _support_get_group_without_as_index + ): # pragma: no cover + raise + agg_result = ( + groupby.op.build_mock_groupby(as_index=True) + .aggregate(raw_func, **raw_func_kw) + .to_frame() + ) + agg_result.index.names = [None] * agg_result.index.nlevels + return agg_result + + +class DataFrameGroupByAgg(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_AGG + + raw_func = AnyField("raw_func") + raw_func_kw = DictField("raw_func_kw") + func = AnyField("func") + func_rename = ListField("func_rename") + + raw_groupby_params = DictField("raw_groupby_params") + groupby_params = DictField("groupby_params") + + method = StringField("method") + use_inf_as_na = BoolField("use_inf_as_na") + + # for chunk + combine_size = Int32Field("combine_size") + chunk_store_limit = Int64Field("chunk_store_limit") + pre_funcs = ListField("pre_funcs") + agg_funcs = ListField("agg_funcs") + post_funcs = ListField("post_funcs") + index_levels = Int32Field("index_levels") + size_recorder_name = StringField("size_recorder_name") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if len(self._inputs) > 1: + by = [] + for v in self.groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(next(inputs_iter)) + else: + by.append(v) + self.groupby_params["by"] = by + + def _get_inputs(self, inputs): + if isinstance(self.groupby_params["by"], list): + for v in self.groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + inputs.append(v) + return inputs + + def _get_index_levels(self, groupby, mock_index): + if not self.groupby_params["as_index"]: + try: + as_index_agg_df = groupby.op.build_mock_groupby( + as_index=True + ).aggregate(self.raw_func, **self.raw_func_kw) + except: # noqa: E722 # nosec # pylint: disable=bare-except + # handling cases like mdf.groupby("b", as_index=False).b.agg({"c": "count"}) + if isinstance(self.groupby_params["by"], list): + return len(self.groupby_params["by"]) + raise # pragma: no cover + pd_index = as_index_agg_df.index + else: + pd_index = mock_index + return 1 if not isinstance(pd_index, pd.MultiIndex) else len(pd_index.levels) + + def _fix_as_index(self, result_index: pd.Index): + # make sure if as_index=False takes effect + if isinstance(result_index, pd.MultiIndex): + # if MultiIndex, as_index=False definitely takes no effect + self.groupby_params["as_index"] = True + elif result_index.name is not None: + # if not MultiIndex and agg_df.index has a name + # means as_index=False takes no effect + self.groupby_params["as_index"] = True + + def _call_dataframe(self, groupby, input_df): + agg_df = build_mock_agg_result( + groupby, self.groupby_params, self.raw_func, **self.raw_func_kw + ) + + shape = (np.nan, agg_df.shape[1]) + if isinstance(agg_df.index, pd.RangeIndex): + index_value = parse_index( + pd.RangeIndex(-1), groupby.key, groupby.index_value.key + ) + else: + index_value = parse_index( + agg_df.index, groupby.key, 
groupby.index_value.key + ) + + # make sure if as_index=False takes effect + self._fix_as_index(agg_df.index) + + # determine num of indices to group in intermediate steps + self.index_levels = self._get_index_levels(groupby, agg_df.index) + + inputs = self._get_inputs([input_df]) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=agg_df.dtypes, + index_value=index_value, + columns_value=parse_index(agg_df.columns, store_data=True), + ) + + def _call_series(self, groupby, in_series): + agg_result = build_mock_agg_result( + groupby, self.groupby_params, self.raw_func, **self.raw_func_kw + ) + + # make sure if as_index=False takes effect + self._fix_as_index(agg_result.index) + + index_value = parse_index( + agg_result.index, groupby.key, groupby.index_value.key + ) + + inputs = self._get_inputs([in_series]) + + # determine num of indices to group in intermediate steps + self.index_levels = self._get_index_levels(groupby, agg_result.index) + + # update value type + if isinstance(agg_result, pd.DataFrame): + return self.new_dataframe( + inputs, + shape=(np.nan, len(agg_result.columns)), + dtypes=agg_result.dtypes, + index_value=index_value, + columns_value=parse_index(agg_result.columns, store_data=True), + ) + else: + return self.new_series( + inputs, + shape=(np.nan,), + dtype=agg_result.dtype, + name=agg_result.name, + index_value=index_value, + ) + + def __call__(self, groupby): + normalize_reduction_funcs(self, ndim=groupby.ndim) + df = groupby + while df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + df = df.inputs[0] + + if self.raw_func == "size": + self.output_types = [OutputType.series] + else: + self.output_types = ( + [OutputType.dataframe] + if groupby.op.output_types[0] == OutputType.dataframe_groupby + else [OutputType.series] + ) + + if self.output_types[0] == OutputType.dataframe: + return self._call_dataframe(groupby, df) + else: + return self._call_series(groupby, df) + + @classmethod + def partition_merge_data( + cls, + op: "DataFrameGroupByAgg", + partition_chunks: List[ChunkType], + proxy_chunk: ChunkType, + ): + # stage 4: all *ith* classes are gathered and merged + partition_sort_chunks = [] + properties = dict(by=op.groupby_params["by"], gpu=op.is_gpu()) + out_df = op.outputs[0] + + for i, partition_chunk in enumerate(partition_chunks): + output_types = ( + [OutputType.dataframe_groupby] + if out_df.ndim == 2 + else [OutputType.series_groupby] + ) + partition_shuffle_reduce = DataFrameGroupbySortShuffle( + stage=OperandStage.reduce, + reducer_index=(i, 0), + n_reducers=len(partition_chunks), + output_types=output_types, + **properties, + ) + chunk_shape = list(partition_chunk.shape) + chunk_shape[0] = np.nan + + kw = dict( + shape=tuple(chunk_shape), + index=partition_chunk.index, + index_value=partition_chunk.index_value, + ) + if op.outputs[0].ndim == 2: + kw.update( + dict( + columns_value=partition_chunk.columns_value, + dtypes=partition_chunk.dtypes, + ) + ) + else: + kw.update(dict(dtype=partition_chunk.dtype, name=partition_chunk.name)) + cs = partition_shuffle_reduce.new_chunks([proxy_chunk], **kw) + partition_sort_chunks.append(cs[0]) + return partition_sort_chunks + + @classmethod + def partition_local_data( + cls, + op: "DataFrameGroupByAgg", + sorted_chunks: List[ChunkType], + concat_pivot_chunk: ChunkType, + in_df: TileableType, + ): + out_df = op.outputs[0] + map_chunks = [] + chunk_shape = (in_df.chunk_shape[0], 1) + for chunk in sorted_chunks: + chunk_inputs = [chunk, concat_pivot_chunk] + output_types = ( + 
[OutputType.dataframe_groupby] + if out_df.ndim == 2 + else [OutputType.series_groupby] + ) + map_chunk_op = DataFrameGroupbySortShuffle( + shuffle_size=chunk_shape[0], + stage=OperandStage.map, + n_partition=len(sorted_chunks), + output_types=output_types, + ) + kw = dict() + if out_df.ndim == 2: + kw.update( + dict( + columns_value=chunk_inputs[0].columns_value, + dtypes=chunk_inputs[0].dtypes, + ) + ) + else: + kw.update(dict(dtype=chunk_inputs[0].dtype, name=chunk_inputs[0].name)) + + map_chunks.append( + map_chunk_op.new_chunk( + chunk_inputs, + shape=chunk_shape, + index=chunk.index, + index_value=chunk_inputs[0].index_value, + # **kw + ) + ) + + return map_chunks + + @classmethod + def _gen_shuffle_chunks_with_pivot( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + chunks: List[ChunkType], + pivot: ChunkType, + ): + map_chunks = cls.partition_local_data(op, chunks, pivot, in_df) + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + + partition_sort_chunks = cls.partition_merge_data(op, map_chunks, proxy_chunk) + + return partition_sort_chunks + + @classmethod + def _gen_shuffle_chunks(cls, op, chunks): + # generate map chunks + map_chunks = [] + chunk_shape = (len(chunks), 1) + for chunk in chunks: + # no longer consider as_index=False for the intermediate phases, + # will do reset_index at last if so + map_op = DataFrameGroupByOperand( + stage=OperandStage.map, + shuffle_size=chunk_shape[0], + output_types=[OutputType.dataframe_groupby], + ) + map_chunks.append( + map_op.new_chunk( + [chunk], + shape=(np.nan, np.nan), + index=chunk.index, + index_value=op.outputs[0].index_value, + ) + ) + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk(map_chunks, shape=()) + + # generate reduce chunks + reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in chunk_shape))) + for out_idx in out_indices: + reduce_op = DataFrameGroupByOperand( + stage=OperandStage.reduce, + output_types=[OutputType.dataframe_groupby], + n_reducers=len(out_indices), + ) + reduce_chunks.append( + reduce_op.new_chunk( + [proxy_chunk], + shape=(np.nan, np.nan), + index=out_idx, + index_value=None, + ) + ) + return reduce_chunks + + @classmethod + def _gen_map_chunks( + cls, + op: "DataFrameGroupByAgg", + in_chunks: List[ChunkType], + out_df: TileableType, + func_infos: ReductionSteps, + ): + map_chunks = [] + for chunk in in_chunks: + chunk_inputs = [chunk] + map_op = op.copy().reset_key() + # force as_index=True for map phase + map_op.output_types = op.output_types + map_op.groupby_params = map_op.groupby_params.copy() + map_op.groupby_params["as_index"] = True + if isinstance(map_op.groupby_params["by"], list): + by = [] + for v in map_op.groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by_chunk = v.cix[chunk.index[0],] + chunk_inputs.append(by_chunk) + by.append(by_chunk) + else: + by.append(v) + map_op.groupby_params["by"] = by + map_op.stage = OperandStage.map + map_op.pre_funcs = func_infos.pre_funcs + map_op.agg_funcs = func_infos.agg_funcs + new_index = chunk.index if len(chunk.index) == 2 else (chunk.index[0],) + if out_df.ndim == 2: + new_index = (new_index[0], 0) if len(new_index) == 1 else new_index + map_chunk = map_op.new_chunk( + chunk_inputs, + shape=out_df.shape, + index=new_index, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + else: + new_index = new_index[:1] if len(new_index) == 2 else new_index + 
map_chunk = map_op.new_chunk( + chunk_inputs, + shape=(out_df.shape[0],), + index=new_index, + index_value=out_df.index_value, + dtype=out_df.dtype, + ) + map_chunks.append(map_chunk) + return map_chunks + + @classmethod + def _compile_funcs(cls, op: "DataFrameGroupByAgg", in_df) -> ReductionSteps: + compiler = ReductionCompiler(store_source=True) + if isinstance(op.func, list): + func_iter = ((None, f) for f in op.func) + else: + func_iter = ((col, f) for col, funcs in op.func.items() for f in funcs) + + func_renames = ( + op.func_rename + if getattr(op, "func_rename", None) is not None + else itertools.repeat(None) + ) + for func_rename, (col, f) in zip(func_renames, func_iter): + func_name = None + if isinstance(f, str): + f, func_name = _agg_functions[f], f + if func_rename is not None: + func_name = func_rename + + func_cols = None + if col is not None: + func_cols = [col] + compiler.add_function(f, in_df.ndim, cols=func_cols, func_name=func_name) + return compiler.compile() + + @classmethod + def _tile_with_shuffle( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + # First, perform groupby and aggregation on each chunk. + agg_chunks = cls._gen_map_chunks(op, in_df.chunks, out_df, func_infos) + return cls._perform_shuffle(op, agg_chunks, in_df, out_df, func_infos) + + @classmethod + def _gen_pivot_chunk( + cls, + op: "DataFrameGroupByAgg", + sample_chunks: List[ChunkType], + agg_chunk_len: int, + ): + properties = dict( + by=op.groupby_params["by"], + gpu=op.is_gpu(), + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + kind = "quicksort" + output_types = [OutputType.tensor] + + concat_pivot_op = DataFrameGroupbyConcatPivot( + kind=kind, + n_partition=agg_chunk_len, + output_types=output_types, + **properties, + ) + + concat_pivot_chunk = concat_pivot_op.new_chunk( + sample_chunks, + shape=(agg_chunk_len,), + dtype=np.dtype(object), + ) + return concat_pivot_chunk + + @classmethod + def _sample_chunks( + cls, + op: "DataFrameGroupByAgg", + agg_chunks: List[ChunkType], + ): + chunk_shape = len(agg_chunks) + sampled_chunks = [] + + properties = dict( + by=op.groupby_params["by"], + gpu=op.is_gpu(), + ) + + for i, chunk in enumerate(agg_chunks): + kws = [] + sampled_shape = ( + (chunk_shape, chunk.shape[1]) if chunk.ndim == 2 else (chunk_shape,) + ) + chunk_index = (i, 0) if chunk.ndim == 2 else (i,) + chunk_op = DataFramePSRSGroupbySample( + kind="quicksort", + n_partition=chunk_shape, + output_types=op.output_types, + **properties, + ) + if op.output_types[0] == OutputType.dataframe: + kws.append( + { + "shape": sampled_shape, + "index_value": chunk.index_value, + "index": chunk_index, + "type": "regular_sampled", + } + ) + else: + kws.append( + { + "shape": sampled_shape, + "index_value": chunk.index_value, + "index": chunk_index, + "type": "regular_sampled", + "dtype": chunk.dtype, + } + ) + chunk = chunk_op.new_chunk([chunk], kws=kws) + sampled_chunks.append(chunk) + + return sampled_chunks + + @classmethod + def _perform_shuffle( + cls, + op: "DataFrameGroupByAgg", + agg_chunks: List[ChunkType], + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + if op.groupby_params["sort"] and len(in_df.chunks) > 1: + agg_chunk_len = len(agg_chunks) + sample_chunks = cls._sample_chunks(op, agg_chunks) + pivot_chunk = cls._gen_pivot_chunk(op, sample_chunks, agg_chunk_len) + reduce_chunks = cls._gen_shuffle_chunks_with_pivot( + op, in_df, agg_chunks, pivot_chunk + ) + 
else: + reduce_chunks = cls._gen_shuffle_chunks(op, agg_chunks) + + # Combine groups + agg_chunks = [] + for chunk in reduce_chunks: + agg_op = op.copy().reset_key() + agg_op.tileable_op_key = op.key + agg_op.groupby_params = agg_op.groupby_params.copy() + agg_op.groupby_params.pop("selection", None) + # use levels instead of by for reducer + agg_op.groupby_params.pop("by", None) + agg_op.groupby_params["level"] = list(range(op.index_levels)) + agg_op.stage = OperandStage.agg + agg_op.agg_funcs = func_infos.agg_funcs + agg_op.post_funcs = func_infos.post_funcs + if op.output_types[0] == OutputType.dataframe: + agg_chunk = agg_op.new_chunk( + [chunk], + shape=out_df.shape, + index=chunk.index, + index_value=out_df.index_value, + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + ) + else: + agg_chunk = agg_op.new_chunk( + [chunk], + shape=out_df.shape, + index=(chunk.index[0],), + dtype=out_df.dtype, + index_value=out_df.index_value, + name=out_df.name, + ) + agg_chunks.append(agg_chunk) + + new_op = op.copy() + if op.output_types[0] == OutputType.dataframe: + nsplits = ((np.nan,) * len(agg_chunks), (out_df.shape[1],)) + else: + nsplits = ((np.nan,) * len(agg_chunks),) + kw = out_df.params.copy() + kw.update(dict(chunks=agg_chunks, nsplits=nsplits)) + return new_op.new_tileables([in_df], **kw) + + @classmethod + def _tile_with_tree( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + chunks = cls._gen_map_chunks(op, in_df.chunks, out_df, func_infos) + return cls._combine_tree(op, chunks, out_df, func_infos) + + @classmethod + def _build_tree_chunks( + cls, + op: "DataFrameGroupByAgg", + chunks: List[ChunkType], + func_infos: ReductionSteps, + combine_size: int, + input_chunk_size: float = None, + chunk_store_limit: int = None, + ): + out_df = op.outputs[0] + # if concat chunk's size is greater than chunk_store_limit, + # stop combining them. 
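        # Descriptive note (not in the original patch): the loop below repeatedly groups
        # `combine_size` pre-aggregated chunks, concatenates each group and re-aggregates
        # it with a combine-stage copy of the operand. `concat_chunk_size` estimates the
        # size of one combined chunk and is multiplied by `combine_size` per tree level,
        # so combining stops once a further concat would exceed `chunk_store_limit`
        # (when a limit is given) or once at most `combine_size` chunks remain.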
+ check_size = False + if chunk_store_limit is not None: + assert input_chunk_size is not None + check_size = True + concat_chunk_size = input_chunk_size + while (not check_size or concat_chunk_size < chunk_store_limit) and ( + len(chunks) > combine_size + ): + new_chunks = [] + for idx, i in enumerate(range(0, len(chunks), combine_size)): + chks = chunks[i : i + combine_size] + if len(chks) == 1: + chk = chks[0] + else: + concat_op = DataFrameConcat(output_types=out_df.op.output_types) + # Change index for concatenate + for j, c in enumerate(chks): + c._index = (j, 0) + if out_df.ndim == 2: + chk = concat_op.new_chunk(chks, dtypes=chks[0].dtypes) + else: + chk = concat_op.new_chunk(chks, dtype=chunks[0].dtype) + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = None + chunk_op.output_types = out_df.op.output_types + chunk_op.stage = OperandStage.combine + chunk_op.groupby_params = chunk_op.groupby_params.copy() + chunk_op.groupby_params.pop("selection", None) + # use levels instead of by for agg + chunk_op.groupby_params.pop("by", None) + chunk_op.groupby_params["level"] = list(range(op.index_levels)) + chunk_op.agg_funcs = func_infos.agg_funcs + + new_shape = ( + (np.nan, out_df.shape[1]) if len(out_df.shape) == 2 else (np.nan,) + ) + + new_chunks.append( + chunk_op.new_chunk( + [chk], + index=(idx, 0), + shape=new_shape, + index_value=chks[0].index_value, + columns_value=getattr(out_df, "columns_value", None), + ) + ) + chunks = new_chunks + if concat_chunk_size is not None: + concat_chunk_size *= combine_size + if concat_chunk_size: + return chunks, concat_chunk_size + else: + return chunks + + @classmethod + def _build_out_tileable( + cls, + op: "DataFrameGroupByAgg", + out_df: TileableType, + combined_chunks: List[ChunkType], + func_infos: ReductionSteps, + ): + if len(combined_chunks) == 1: + chk = combined_chunks[0] + else: + concat_op = DataFrameConcat(output_types=out_df.op.output_types) + if out_df.ndim == 2: + chk = concat_op.new_chunk( + combined_chunks, dtypes=combined_chunks[0].dtypes + ) + else: + chk = concat_op.new_chunk( + combined_chunks, dtype=combined_chunks[0].dtype + ) + chunk_op = op.copy().reset_key() + chunk_op.tileable_op_key = op.key + chunk_op.stage = OperandStage.agg + chunk_op.groupby_params = chunk_op.groupby_params.copy() + chunk_op.groupby_params.pop("selection", None) + # use levels instead of by for agg + chunk_op.groupby_params.pop("by", None) + chunk_op.groupby_params["level"] = list(range(op.index_levels)) + chunk_op.agg_funcs = func_infos.agg_funcs + chunk_op.post_funcs = func_infos.post_funcs + kw = out_df.params.copy() + kw["index"] = (0, 0) if op.output_types[0] == OutputType.dataframe else (0,) + chunk = chunk_op.new_chunk([chk], **kw) + new_op = op.copy() + if op.output_types[0] == OutputType.dataframe: + nsplits = ((out_df.shape[0],), (out_df.shape[1],)) + else: + nsplits = ((out_df.shape[0],),) + + kw = out_df.params.copy() + kw.update(dict(chunks=[chunk], nsplits=nsplits)) + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def _combine_tree( + cls, + op: "DataFrameGroupByAgg", + chunks: List[ChunkType], + out_df: TileableType, + func_infos: ReductionSteps, + ): + combine_size = op.combine_size + chunks = cls._build_tree_chunks(op, chunks, func_infos, combine_size) + return cls._build_out_tileable(op, out_df, chunks, func_infos) + + @classmethod + def _build_tree_and_shuffle_chunks( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + sample_map_chunks: 
List[ChunkType], + sample_agg_sizes: List[int], + ): + combine_size = op.combine_size + left_chunks = cls._gen_map_chunks( + op, in_df.chunks[combine_size:], out_df, func_infos + ) + input_size = sum(sample_agg_sizes) / len(sample_agg_sizes) + combine_chunk_limit = op.chunk_store_limit / 4 + combined_chunks, concat_size = cls._build_tree_chunks( + op, + sample_map_chunks + left_chunks, + func_infos, + combine_size, + input_size, + combine_chunk_limit, + ) + logger.debug( + "Combine map chunks to %s chunks for groupby operand %s", + len(combined_chunks), + op, + ) + if concat_size <= combine_chunk_limit: + logger.debug( + "Choose tree method after combining chunks for groupby operand %s", op + ) + return cls._build_out_tileable(op, out_df, combined_chunks, func_infos) + else: + logger.debug( + "Choose shuffle method after combining chunks for " + "groupby operand %s, chunk count is %s", + op, + len(combined_chunks), + ) + return cls._perform_shuffle( + op, + combined_chunks, + in_df, + out_df, + func_infos, + ) + + @classmethod + def _tile_auto( + cls, + op: "DataFrameGroupByAgg", + in_df: TileableType, + out_df: TileableType, + func_infos: ReductionSteps, + ): + ctx = get_context() + combine_size = op.combine_size + size_recorder_name = str(uuid.uuid4()) + size_recorder = ctx.create_remote_object(size_recorder_name, SizeRecorder) + + # collect the first combine_size chunks, run it + # to get the size before and after agg + chunks = cls._gen_map_chunks( + op, in_df.chunks[:combine_size], out_df, func_infos + ) + for chunk in chunks: + chunk.op.size_recorder_name = size_recorder_name + # yield to trigger execution + yield chunks + + raw_sizes, agg_sizes = size_recorder.get() + # destroy size recorder + ctx.destroy_remote_object(size_recorder_name) + + logger.debug( + "Start to choose method for Groupby, agg_sizes: %s, raw_sizes: %s, " + "sample_count: %s, total_count: %s, chunk_store_limit: %s", + agg_sizes, + raw_sizes, + len(agg_sizes), + len(in_df.chunks), + op.chunk_store_limit, + ) + + return cls._build_tree_and_shuffle_chunks( + op, in_df, out_df, func_infos, chunks, agg_sizes + ) + + @classmethod + def tile(cls, op: "DataFrameGroupByAgg"): + in_df = op.inputs[0] + if len(in_df.shape) > 1: + in_df = build_concatenated_rows_frame(in_df) + out_df = op.outputs[0] + + func_infos = cls._compile_funcs(op, in_df) + + if op.method == "auto": + logger.debug("Choose auto method for groupby operand %s", op) + if len(in_df.chunks) <= op.combine_size: + return cls._tile_with_tree(op, in_df, out_df, func_infos) + else: + return (yield from cls._tile_auto(op, in_df, out_df, func_infos)) + if op.method == "shuffle": + logger.debug("Choose shuffle method for groupby operand %s", op) + return cls._tile_with_shuffle(op, in_df, out_df, func_infos) + elif op.method == "tree": + logger.debug("Choose tree method for groupby operand %s", op) + return cls._tile_with_tree(op, in_df, out_df, func_infos) + else: # pragma: no cover + raise NotImplementedError + + @classmethod + def _get_grouped(cls, op: "DataFrameGroupByAgg", df, ctx, copy=False, grouper=None): + if copy: + df = df.copy() + + params = op.groupby_params.copy() + params.pop("as_index", None) + selection = params.pop("selection", None) + + if grouper is not None: + params["by"] = grouper + params.pop("level", None) + elif isinstance(params.get("by"), list): + new_by = [] + for v in params["by"]: + if isinstance(v, ENTITY_TYPE): + new_by.append(ctx[v.key]) + else: + new_by.append(v) + params["by"] = new_by + + try: + grouped = df.groupby(**params) + 
except ValueError: # pragma: no cover + if isinstance(df.index.values, ArrowArray): + df = df.copy() + df.index = pd.Index(df.index.to_numpy(), name=df.index.name) + grouped = df.groupby(**params) + else: + raise + + if selection is not None: + grouped = grouped[selection] + return grouped + + @staticmethod + def _pack_inputs(agg_funcs: List[ReductionAggStep], in_data): + pos = 0 + out_dict = dict() + for step in agg_funcs: + if step.custom_reduction is None: + out_dict[step.output_key] = in_data[pos] + else: + out_dict[step.output_key] = tuple( + in_data[pos : pos + step.output_limit] + ) + pos += step.output_limit + return out_dict + + @staticmethod + def _do_custom_agg( + func_name: str, op: "DataFrameGroupByAgg", in_data: pd.DataFrame + ) -> Union[pd.Series, pd.DataFrame]: + if op.stage == OperandStage.map: + return custom_agg_functions[func_name].execute_map(op, in_data) + elif op.stage == OperandStage.combine: + return custom_agg_functions[func_name].execute_combine(op, in_data) + else: # must be OperandStage.agg, since OperandStage.reduce has been excluded in the execute function. + return custom_agg_functions[func_name].execute_agg(op, in_data) + + @staticmethod + def _do_predefined_agg(input_obj, agg_func, single_func=False, **kwds): + ndim = getattr(input_obj, "ndim", None) or input_obj.obj.ndim + if agg_func == "str_concat": + agg_func = lambda x: x.str.cat(**kwds) + elif isinstance(agg_func, str) and not kwds.get("skipna", True): + func_name = agg_func + agg_func = lambda x: getattr(x, func_name)(skipna=False) + agg_func.__name__ = func_name + + if ndim == 2: + if single_func: + result = input_obj.agg(agg_func) + if result.ndim == 1: + # when agg_func == size, agg only returns one single series. + result = result.to_frame(agg_func) + else: + result = input_obj.agg([agg_func]) + result.columns = result.columns.droplevel(-1) + return result + else: + return input_obj.agg(agg_func) + + @staticmethod + def _series_to_df(in_series, gpu): + xdf = cudf if gpu else pd + + in_df = in_series.to_frame() + if in_series.name is not None: + in_df.columns = xdf.Index([in_series.name]) + return in_df + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameGroupByAgg"): + xdf = cudf if op.gpu else pd + + in_data = ctx[op.inputs[0].key] + if ( + isinstance(in_data, xdf.Series) + and op.output_types[0] == OutputType.dataframe + ): + in_data = cls._series_to_df(in_data, op.gpu) + + # map according to map groups + ret_map_groupbys = dict() + grouped = cls._get_grouped(op, in_data, ctx) + grouper = None + drop_names = False + + for input_key, output_key, cols, func in op.pre_funcs: + if input_key == output_key: + if cols is None or getattr(grouped, "_selection", None) is not None: + ret_map_groupbys[output_key] = grouped + else: + ret_map_groupbys[output_key] = grouped[cols] + else: + + def _wrapped_func(col): + try: + return func(col, gpu=op.is_gpu()) + except TypeError: + return col + + pre_df = in_data if cols is None else in_data[cols] + try: + pre_df = func(pre_df, gpu=op.is_gpu()) + except TypeError: + pre_df = pre_df.transform(_wrapped_func) + + if grouper is None: + try: + grouper = grouped.grouper + except AttributeError: # cudf does not have GroupBy.grouper + grouper = xdf.Series( + grouped.grouping.keys, index=grouped.obj.index + ) + if in_data.ndim == 2: + drop_names = True + + if drop_names: + pre_df = pre_df.drop( + columns=grouped.grouping.names, errors="ignore" + ) + ret_map_groupbys[output_key] = cls._get_grouped( + op, pre_df, ctx, grouper=grouper + ) + + agg_dfs = [] + for ( + 
input_key, + raw_func_name, + map_func_name, + _agg_func_name, + custom_reduction, + _output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = ret_map_groupbys[input_key] + if map_func_name == "custom_reduction": + agg_dfs.append(cls._do_custom_agg(raw_func_name, op, in_data)) + else: + single_func = map_func_name == op.raw_func + agg_dfs.append( + cls._do_predefined_agg( + input_obj, map_func_name, single_func, **kwds + ) + ) + + if getattr(op, "size_recorder_name", None) is not None: + # record_size + raw_size = estimate_pandas_size(in_data) + # when agg by a list of methods, agg_size should be sum + agg_size = sum([estimate_pandas_size(item) for item in agg_dfs]) + size_recorder = ctx.get_remote_object(op.size_recorder_name) + size_recorder.record(raw_size, agg_size) + + ctx[op.outputs[0].key] = tuple(agg_dfs) + + @classmethod + def _execute_combine(cls, ctx, op: "DataFrameGroupByAgg"): + xdf = cudf if op.gpu else pd + + in_data_tuple = ctx[op.inputs[0].key] + in_data_list = [] + for in_data in in_data_tuple: + if ( + isinstance(in_data, xdf.Series) + and op.output_types[0] == OutputType.dataframe + ): + in_data = cls._series_to_df(in_data, op.gpu) + in_data_list.append(cls._get_grouped(op, in_data, ctx)) + in_data_tuple = tuple(in_data_list) + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data_tuple) + + combines = [] + for raw_input, ( + _input_key, + raw_func_name, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in zip(ctx[op.inputs[0].key], op.agg_funcs): + input_obj = in_data_dict[output_key] + if agg_func_name == "custom_reduction": + combines.append(cls._do_custom_agg(raw_func_name, op, raw_input)) + else: + combines.append( + cls._do_predefined_agg(input_obj, agg_func_name, **kwds) + ) + ctx[op.outputs[0].key] = tuple(combines) + + @classmethod + def _execute_agg(cls, ctx, op: "DataFrameGroupByAgg"): + xdf = cudf if op.gpu else pd + out_chunk = op.outputs[0] + col_value = ( + out_chunk.columns_value.to_pandas() + if hasattr(out_chunk, "columns_value") + else None + ) + + in_data_tuple = ctx[op.inputs[0].key] + in_data_list = [] + for in_data in in_data_tuple: + if ( + isinstance(in_data, xdf.Series) + and op.output_types[0] == OutputType.dataframe + ): + in_data = cls._series_to_df(in_data, op.gpu) + in_data_list.append(in_data) + in_data_tuple = tuple(in_data_list) + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data_tuple) + + for ( + _input_key, + raw_func_name, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + if agg_func_name == "custom_reduction": + in_data_dict[output_key] = cls._do_custom_agg( + raw_func_name, op, in_data_dict[output_key] + ) + else: + input_obj = cls._get_grouped(op, in_data_dict[output_key], ctx) + in_data_dict[output_key] = cls._do_predefined_agg( + input_obj, agg_func_name, **kwds + ) + + aggs = [] + for input_keys, _output_key, func_name, cols, func in op.post_funcs: + if func_name in custom_agg_functions: + agg_df = in_data_dict[_output_key] + else: + if cols is None: + func_inputs = [in_data_dict[k] for k in input_keys] + else: + func_inputs = [in_data_dict[k][cols] for k in input_keys] + + if ( + func_inputs[0].ndim == 2 + and len(set(inp.shape[1] for inp in func_inputs)) > 1 + ): + common_cols = func_inputs[0].columns + for inp in func_inputs[1:]: + common_cols = common_cols.join(inp.columns, how="inner") + func_inputs = [inp[common_cols] for inp in func_inputs] + + agg_df = func(*func_inputs, gpu=op.is_gpu()) 
+ if isinstance(agg_df, np.ndarray): + agg_df = xdf.DataFrame(agg_df, index=func_inputs[0].index) + + new_cols = None + if out_chunk.ndim == 2 and col_value is not None: + if col_value.nlevels > agg_df.columns.nlevels: + new_cols = xdf.MultiIndex.from_product( + [agg_df.columns, [func_name]] + ) + elif agg_df.shape[-1] == 1 and func_name in col_value: + new_cols = xdf.Index([func_name]) + aggs.append((agg_df, new_cols)) + + for agg_df, new_cols in aggs: + if new_cols is not None: + agg_df.columns = new_cols + aggs = [item[0] for item in aggs] + + if out_chunk.ndim == 2: + result = concat_on_columns(aggs) + if ( + not op.groupby_params.get("as_index", True) + and col_value.nlevels == result.columns.nlevels + ): + result.reset_index( + inplace=True, drop=result.index.name in result.columns + ) + result = result.reindex(col_value, axis=1) + + if result.ndim == 2 and len(result) == 0: + result = result.astype(out_chunk.dtypes) + else: + result = xdf.concat(aggs) + if result.ndim == 2: + result = result.iloc[:, 0] + if is_cudf(result): # pragma: no cover + result = result.copy() + result.name = out_chunk.name + + ctx[out_chunk.key] = result + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameGroupByAgg"): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_agg(ctx, op) + else: # pragma: no cover + raise ValueError("Aggregation operand not executable") + finally: + pd.reset_option("mode.use_inf_as_na") + + +def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs): + """ + Aggregate using one or more operations on grouped data. + + Parameters + ---------- + groupby : Mars Groupby + Groupby data. + func : str or list-like + Aggregation functions. + method : {'auto', 'shuffle', 'tree'}, default 'auto' + The 'tree' method provides better performance; 'shuffle' is recommended + if the aggregated result is very large; 'auto' uses the 'shuffle' method + in distributed mode and the 'tree' method in local mode. + combine_size : int + The number of chunks to combine when method is 'tree'. + + Returns + ------- + Series or DataFrame + Aggregated result. + """ + + # When performing a computation on the grouped data, we don't shuffle + # the data in the groupby stage; the shuffle happens after aggregation.
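A hypothetical usage sketch of the `method` argument documented above (assumes a running local session; the data and column names are made up):

```python
import mars.dataframe as md

df = md.DataFrame({"a": [1, 1, 2, 2], "b": [1.0, 2.0, 3.0, 4.0]})
# 'auto' (the default) samples chunk sizes, then picks 'tree' or 'shuffle'
print(df.groupby("a").agg("sum").execute())
# force a strategy when the aggregated result is known to be very large
print(df.groupby("a").agg(["sum", "max"], method="shuffle").execute())
```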
+ + if not isinstance(groupby, GROUPBY_TYPE): + raise TypeError(f"Input should be a groupby object, not {type(groupby)}") + + if method is None: + method = "auto" + if method not in ["shuffle", "tree", "auto"]: + raise ValueError( + f"Method {method} is not available, please specify 'tree', 'shuffle' or 'auto'" + ) + + if not is_funcs_aggregate(func, ndim=groupby.ndim): + # pass index to transform, otherwise it will lose name info for index + agg_result = build_mock_agg_result( + groupby, groupby.op.groupby_params, func, **kwargs + ) + if isinstance(agg_result.index, pd.RangeIndex): + # set -1 to represent unknown size for RangeIndex + index_value = parse_index( + pd.RangeIndex(-1), groupby.key, groupby.index_value.key + ) + else: + index_value = parse_index( + agg_result.index, groupby.key, groupby.index_value.key + ) + return groupby.transform( + func, *args, _call_agg=True, index=index_value, **kwargs + ) + + use_inf_as_na = kwargs.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + + agg_op = DataFrameGroupByAgg( + raw_func=func, + raw_func_kw=kwargs, + method=method, + raw_groupby_params=groupby.op.groupby_params, + groupby_params=groupby.op.groupby_params, + combine_size=combine_size or options.combine_size, + chunk_store_limit=options.chunk_store_limit, + use_inf_as_na=use_inf_as_na, + ) + return agg_op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/apply.py b/python/xorbits/_mars/dataframe/groupby/apply.py new file mode 100644 index 000000000..c16fefdbd --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/apply.py @@ -0,0 +1,358 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ...
import opcodes +from ...core import OutputType +from ...core.context import get_context +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperatorLogicKeyGeneratorMixin +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FunctionField, + StringField, + TupleField, +) +from ...utils import enter_current_session, get_func_token, quiet_stdio, tokenize +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + auto_merge_chunks, + build_empty_df, + build_empty_series, + clean_up_func, + make_dtype, + make_dtypes, + parse_index, + restore_func, + validate_output_types, +) + + +class GroupByApplyLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin): + def _get_logic_key_token_values(self): + token_values = super()._get_logic_key_token_values() + if self.func: + return token_values + [get_func_token(self.func)] + else: # pragma: no cover + return token_values + + +class GroupByApply( + DataFrameOperand, DataFrameOperandMixin, GroupByApplyLogicKeyGeneratorMixin +): + _op_type_ = opcodes.APPLY + _op_module_ = "dataframe.groupby" + + func = FunctionField("func") + args = TupleField("args", default_factory=tuple) + kwds = DictField("kwds", default_factory=dict) + maybe_agg = BoolField("maybe_agg", default=None) + logic_key = StringField("logic_key", default=None) + func_key = AnyField("func_key", default=None) + need_clean_up_func = BoolField("need_clean_up_func", default=False) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + def _update_key(self): + values = [v for v in self._values_ if v is not self.func] + [ + get_func_token(self.func) + ] + self._obj_set("_key", tokenize(type(self).__name__, *values)) + return self + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + restore_func(ctx, op) + in_data = ctx[op.inputs[0].key] + out = op.outputs[0] + if not in_data: + if op.output_types[0] == OutputType.dataframe: + ctx[op.outputs[0].key] = build_empty_df(op.outputs[0].dtypes) + elif op.output_types[0] == OutputType.series: + ctx[op.outputs[0].key] = build_empty_series( + op.outputs[0].dtype, name=out.name + ) + else: + raise ValueError( + "Chunk can not be empty except for dataframe/series, " + "please specify output types" + ) + return + + applied = in_data.apply(op.func, *op.args, **op.kwds) + + if isinstance(applied, pd.DataFrame): + # when there is only one group, pandas tend to return a DataFrame, while + # we need to convert it into a compatible series + if op.output_types[0] == OutputType.series: + assert len(applied.index) == 1 + applied_idx = pd.MultiIndex.from_arrays( + [ + [applied.index[0]] * len(applied.columns), + applied.columns.tolist(), + ] + ) + applied_idx.names = [applied.index.name, None] + applied = pd.Series( + np.array(applied.iloc[0]), applied_idx, name=applied.columns.name + ) + else: + applied.columns.name = None + ctx[out.key] = applied + + @classmethod + def tile(cls, op): + clean_up_func(op) + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_groupby.chunks: + inp_chunks = [c] + + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + if op.output_types[0] == OutputType.df_or_series: + chunks.append( + new_op.new_chunk(inp_chunks, index=c.index, collapse_axis=1) + ) + elif op.output_types[0] == OutputType.dataframe: + chunks.append( + new_op.new_chunk( + inp_chunks, + index=c.index, + shape=(np.nan, len(out_df.dtypes)), + 
dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=out_df.index_value, + ) + ) + else: + chunks.append( + new_op.new_chunk( + inp_chunks, + name=out_df.name, + index=(c.index[0],), + shape=(np.nan,), + dtype=out_df.dtype, + index_value=out_df.index_value, + ) + ) + + new_op = op.copy() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ((np.nan,) * len(chunks), (out_df.shape[1],)) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + ret = new_op.new_tileable([in_groupby], **kw) + if not op.maybe_agg: + return [ret] + else: + # auto merge small chunks if df.groupby().apply(func) + # may be an aggregation operation + yield ret.chunks # trigger execution for chunks + return [auto_merge_chunks(get_context(), ret)] + + def _infer_df_func_returns( + self, in_groupby, in_df, dtypes=None, dtype=None, name=None, index=None + ): + index_value, output_type, new_dtypes = None, None, None + + if self.output_types is not None and (dtypes is not None or dtype is not None): + ret_dtypes = dtypes if dtypes is not None else (dtype, name) + ret_index_value = parse_index(index) if index is not None else None + return ret_dtypes, ret_index_value + + try: + infer_df = in_groupby.op.build_mock_groupby().apply( + self.func, *self.args, **self.kwds + ) + + if len(infer_df) <= 2: + # we create mock df with 4 rows, 2 groups + # if return df has 2 rows, we assume that + # it's an aggregation operation + self.maybe_agg = True + + # todo return proper index when sort=True is implemented + index_value = parse_index(infer_df.index[:0], in_df.key, self.func) + + # for backward compatibility + dtype = dtype if dtype is not None else dtypes + if isinstance(infer_df, pd.DataFrame): + output_type = output_type or OutputType.dataframe + new_dtypes = new_dtypes or infer_df.dtypes + elif isinstance(infer_df, pd.Series): + output_type = output_type or OutputType.series + new_dtypes = new_dtypes or ( + name or infer_df.name, + dtype or infer_df.dtype, + ) + else: + output_type = OutputType.series + new_dtypes = (name, dtype or pd.Series(infer_df).dtype) + except: # noqa: E722 # nosec + pass + + self.output_types = ( + [output_type] if not self.output_types else self.output_types + ) + dtypes = new_dtypes if dtypes is None else dtypes + index_value = index_value if index is None else parse_index(index) + return dtypes, index_value + + def __call__(self, groupby, dtypes=None, dtype=None, name=None, index=None): + in_df = groupby + if self.output_types and self.output_types[0] == OutputType.df_or_series: + return self.new_df_or_series([groupby]) + while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + in_df = in_df.inputs[0] + + with quiet_stdio(): + dtypes, index_value = self._infer_df_func_returns( + groupby, in_df, dtypes, dtype=dtype, name=name, index=index + ) + if index_value is None: + index_value = parse_index(None, (in_df.key, in_df.index_value.key)) + for arg, desc in zip((self.output_types, dtypes), ("output_types", "dtypes")): + if arg is None: + raise TypeError( + f"Cannot determine {desc} by calculating with enumerate data, " + "please specify it as arguments" + ) + + if self.output_types[0] == OutputType.dataframe: + new_shape = (np.nan, len(dtypes)) + return self.new_dataframe( + [groupby], + shape=new_shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + name = name or dtypes[0] + dtype = dtype or dtypes[1] + new_shape = 
(np.nan,) + return self.new_series( + [groupby], + name=name, + shape=new_shape, + dtype=dtype, + index_value=index_value, + ) + + +def groupby_apply( + groupby, + func, + *args, + output_type=None, + dtypes=None, + dtype=None, + name=None, + index=None, + skip_infer=None, + **kwargs, +): + """ + Apply function `func` group-wise and combine the results together. + + The function passed to `apply` must take a dataframe as its first + argument and return a DataFrame, Series or scalar. `apply` will + then take care of combining the results back together into a single + dataframe or series. `apply` is therefore a highly flexible + grouping method. + + While `apply` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods + like `agg` or `transform`. Pandas offers a wide range of method that will + be much faster than using `apply` for their specific purposes, so try to + use them before reaching for `apply`. + + Parameters + ---------- + func : callable + A callable that takes a dataframe as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments. + + output_type : {'dataframe', 'series'}, default None + Specify type of returned object. See `Notes` for more details. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + index : Index, default None + Specify index of returned object. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to `func`. + + Returns + ------- + applied : Series or DataFrame + + See Also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate : Apply aggregate function to the GroupBy object. + transform : Apply function column-by-column to the GroupBy object. + Series.apply : Apply a function to a Series. + DataFrame.apply : Apply a function to each row or column of a DataFrame. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock grouped object, and the apply call + may fail. When this happens, you need to specify the type of apply + call (DataFrame or Series) in output_type. + + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. 
+ """ + output_types = kwargs.pop("output_types", None) + object_type = kwargs.pop("object_type", None) + output_types = validate_output_types( + output_types=output_types, output_type=output_type, object_type=object_type + ) + if output_types is None and skip_infer: + output_types = [OutputType.df_or_series] + + dtypes = make_dtypes(dtypes) + dtype = make_dtype(dtype) + op = GroupByApply(func=func, args=args, kwds=kwargs, output_types=output_types) + return op(groupby, dtypes=dtypes, dtype=dtype, name=name, index=index) diff --git a/python/xorbits/_mars/dataframe/groupby/core.py b/python/xorbits/_mars/dataframe/groupby/core.py new file mode 100644 index 000000000..52f6ef8df --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/core.py @@ -0,0 +1,533 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from functools import partial + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, Entity, OutputType +from ...core.operand import MapReduceOperand, OperandStage +from ...lib.groupby_wrapper import wrapped_groupby +from ...serialization.serializables import AnyField, BoolField, Int32Field +from ...utils import lazy_import, no_default, pd_release_version +from ..align import align_dataframe_series, align_series_series +from ..core import SERIES_CHUNK_TYPE, SERIES_TYPE +from ..initializer import Series as asseries +from ..operands import DataFrameOperandMixin, DataFrameShuffleProxy +from ..utils import ( + build_concatenated_rows_frame, + build_df, + build_series, + hash_dataframe_on, + is_cudf, + parse_index, +) + +cudf = lazy_import("cudf") + +_GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0) +_default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True + + +class DataFrameGroupByOperand(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY + + _by = AnyField("by", on_serialize=lambda x: x.data if isinstance(x, Entity) else x) + _level = AnyField("level") + _as_index = BoolField("as_index") + _sort = BoolField("sort") + _group_keys = BoolField("group_keys") + + _shuffle_size = Int32Field("shuffle_size") + + def __init__( + self, + by=None, + level=None, + as_index=None, + sort=None, + group_keys=None, + shuffle_size=None, + output_types=None, + **kw + ): + super().__init__( + _by=by, + _level=level, + _as_index=as_index, + _sort=sort, + _group_keys=group_keys, + _shuffle_size=shuffle_size, + _output_types=output_types, + **kw + ) + if output_types: + if self.stage in (OperandStage.map, OperandStage.reduce): + if output_types[0] in ( + OutputType.dataframe, + OutputType.dataframe_groupby, + ): + output_types = [OutputType.dataframe] + else: + output_types = [OutputType.series] + else: + if output_types[0] in ( + OutputType.dataframe, + OutputType.dataframe_groupby, + ): + output_types = [OutputType.dataframe_groupby] + elif output_types[0] == OutputType.series: + output_types = [OutputType.series_groupby] + self.output_types = output_types 
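The constructor above downgrades the declared groupby output type to a plain dataframe/series while the operand runs its shuffle (map/reduce) stages, and promotes it to a `*_groupby` type otherwise. A stand-alone restatement of that mapping, with simple strings in place of the real enums (illustrative only):

```python
def resolve_output_type(declared: str, stage: str) -> str:
    # Intermediate shuffle data is a concrete frame/series; only the terminal
    # operand yields a groupby object that further calls can be chained on.
    if stage in ("map", "reduce"):
        return "dataframe" if declared.startswith("dataframe") else "series"
    return "dataframe_groupby" if declared.startswith("dataframe") else "series_groupby"

assert resolve_output_type("dataframe_groupby", "map") == "dataframe"
assert resolve_output_type("series", "tile") == "series_groupby"
```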
+ + @property + def by(self): + return self._by + + @property + def level(self): + return self._level + + @property + def as_index(self): + return self._as_index + + @property + def sort(self): + return self._sort + + @property + def group_keys(self): + return self._group_keys + + @property + def shuffle_size(self): + return self._shuffle_size + + @property + def is_dataframe_obj(self): + return self.output_types[0] in ( + OutputType.dataframe_groupby, + OutputType.dataframe, + ) + + @property + def groupby_params(self): + return dict( + by=self.by, + level=self.level, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + ) + + def build_mock_groupby(self, **kwargs): + in_df = self.inputs[0] + if self.is_dataframe_obj: + mock_obj = build_df( + in_df, size=[2, 2], fill_value=[1, 2], ensure_string=True + ) + else: + mock_obj = build_series( + in_df, + size=[2, 2], + fill_value=[1, 2], + name=in_df.name, + ensure_string=True, + ) + + new_kw = self.groupby_params + new_kw.update(kwargs) + if new_kw.get("level"): + new_kw["level"] = 0 + if isinstance(new_kw["by"], list): + new_by = [] + for v in new_kw["by"]: + if isinstance(v, ENTITY_TYPE): + build_fun = build_df if v.ndim == 2 else build_series + mock_by = pd.concat( + [ + build_fun(v, size=2, fill_value=1, name=v.name), + build_fun(v, size=2, fill_value=2, name=v.name), + ] + ) + new_by.append(mock_by) + else: + new_by.append(v) + new_kw["by"] = new_by + return mock_obj.groupby(**new_kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if len(inputs) > 1: + by = [] + for k in self._by: + if isinstance(k, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + by.append(next(inputs_iter)) + else: + by.append(k) + self._by = by + + def __call__(self, df): + params = df.params.copy() + params["index_value"] = parse_index(None, df.key, df.index_value.key) + if df.ndim == 2: + if isinstance(self.by, list): + index, types = [], [] + for k in self.by: + if isinstance(k, SERIES_TYPE): + index.append(k.name) + types.append(k.dtype) + elif k in df.dtypes: + index.append(k) + types.append(df.dtypes[k]) + else: + raise KeyError(k) + params["key_dtypes"] = pd.Series(types, index=index) + + inputs = [df] + if isinstance(self.by, list): + for k in self.by: + if isinstance(k, SERIES_TYPE): + inputs.append(k) + + return self.new_tileable(inputs, **params) + + @classmethod + def _align_input_and_by(cls, op, inp, by): + align_method = ( + partial(align_dataframe_series, axis="index") + if op.is_dataframe_obj + else align_series_series + ) + nsplits, _, inp_chunks, by_chunks = align_method(inp, by) + + inp_params = inp.params + inp_params["chunks"] = inp_chunks + inp_params["nsplits"] = nsplits + inp = inp.op.copy().new_tileable(op.inputs, kws=[inp_params]) + + by_params = by.params + by_params["chunks"] = by_chunks + if len(nsplits) == 2: + by_nsplits = nsplits[:1] + else: + by_nsplits = nsplits + by_params["nsplits"] = by_nsplits + by = by.op.copy().new_tileable(by.op.inputs, kws=[by_params]) + + return inp, by + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + by = op.by + + series_in_by = False + new_inputs = [] + if len(op.inputs) > 1: + # by series + new_by = [] + for k in by: + if isinstance(k, SERIES_TYPE): + in_df, k = cls._align_input_and_by(op, in_df, k) + if len(new_inputs) == 0: + new_inputs.append(in_df) + new_inputs.append(k) + series_in_by = True + new_by.append(k) + by = new_by + else: + new_inputs = op.inputs + + is_dataframe_obj = op.is_dataframe_obj + if 
is_dataframe_obj: + in_df = build_concatenated_rows_frame(in_df) + output_type = OutputType.dataframe + chunk_shape = (in_df.chunk_shape[0], 1) + else: + output_type = OutputType.series + chunk_shape = (in_df.chunk_shape[0],) + + # generate map chunks + map_chunks = [] + for chunk in in_df.chunks: + map_op = op.copy().reset_key() + map_op.stage = OperandStage.map + map_op._shuffle_size = chunk_shape[0] + map_op._output_types = [output_type] + chunk_inputs = [chunk] + if len(op.inputs) > 1: + chunk_by = [] + for k in by: + if isinstance(k, SERIES_TYPE): + by_chunk = k.cix[chunk.index[0],] + chunk_by.append(by_chunk) + chunk_inputs.append(by_chunk) + else: + chunk_by.append(k) + map_op._by = chunk_by + map_chunks.append( + map_op.new_chunk( + chunk_inputs, + shape=(np.nan, np.nan), + index=chunk.index, + ) + ) + + proxy_chunk = DataFrameShuffleProxy(output_types=[output_type]).new_chunk( + map_chunks, shape=() + ) + + # generate reduce chunks + reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in chunk_shape))) + for ordinal, out_idx in enumerate(out_indices): + reduce_op = op.copy().reset_key() + reduce_op._by = None + reduce_op._output_types = [output_type] + reduce_op.stage = OperandStage.reduce + reduce_op.reducer_ordinal = ordinal + reduce_op.n_reducers = len(out_indices) + reduce_chunks.append( + reduce_op.new_chunk( + [proxy_chunk], shape=(np.nan, np.nan), index=out_idx + ) + ) + + # generate groupby chunks + out_chunks = [] + for chunk in reduce_chunks: + groupby_op = op.copy().reset_key() + if series_in_by: + # set by to None, cuz data of by will be passed from map to reduce to groupby + groupby_op._by = None + if is_dataframe_obj: + new_shape = (np.nan, in_df.shape[1]) + else: + new_shape = (np.nan,) + params = dict(shape=new_shape, index=chunk.index) + if op.is_dataframe_obj: + params.update( + dict( + dtypes=in_df.dtypes, + columns_value=in_df.columns_value, + index_value=parse_index(None, chunk.key, proxy_chunk.key), + ) + ) + else: + params.update( + dict( + name=in_df.name, + dtype=in_df.dtype, + index_value=parse_index(None, chunk.key, proxy_chunk.key), + ) + ) + out_chunks.append(groupby_op.new_chunk([chunk], **params)) + + new_op = op.copy() + params = op.outputs[0].params.copy() + if is_dataframe_obj: + params["nsplits"] = ((np.nan,) * len(out_chunks), (in_df.shape[1],)) + else: + params["nsplits"] = ((np.nan,) * len(out_chunks),) + params["chunks"] = out_chunks + return new_op.new_tileables(new_inputs, **params) + + @classmethod + def execute_map(cls, ctx, op): + is_dataframe_obj = op.is_dataframe_obj + by = op.by + chunk = op.outputs[0] + df = ctx[op.inputs[0].key] + + deliver_by = False # output by for the upcoming process + if isinstance(by, list): + new_by = [] + for v in by: + if isinstance(v, ENTITY_TYPE): + deliver_by = True + new_by.append(ctx[v.key]) + else: + new_by.append(v) + by = new_by + + if isinstance(by, list) or callable(by): + on = by + else: + on = None + + # Get the filter rule corresponding to each df. 
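The map-stage code below computes, for every incoming frame, one filter per reducer by hashing the group key, so that each reducer ends up holding complete groups. A stand-alone sketch of the idea (the `hash_partition` helper is hypothetical; the real code goes through `hash_dataframe_on` and keeps boolean filters instead of materialized parts):

```python
import pandas as pd

def hash_partition(df: pd.DataFrame, on: str, n_reducers: int):
    # Bucket rows by the hash of their group key; rows of the same group
    # always land in the same bucket, so reducers can group independently.
    buckets = pd.util.hash_pandas_object(df[on], index=False) % n_reducers
    return [df[buckets == i] for i in range(n_reducers)]

df = pd.DataFrame({"key": list("abab"), "val": [1, 2, 3, 4]})
parts = hash_partition(df, "key", 2)
assert sum(len(p) for p in parts) == len(df)
```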
+ dfs = df if isinstance(df, tuple) else (df,) + counter = itertools.count() + df_filters = [] + idx_to_index_and_filters = dict() + for item in dfs: + is_new = True + for _, (index, filters) in idx_to_index_and_filters.items(): + if item.index.equals(index): + df_filters.append(filters) + is_new = False + break + if is_new: + filters = hash_dataframe_on(item, on, op.shuffle_size, level=op.level) + idx_to_index_and_filters[next(counter)] = (item.index, filters) + df_filters.append(filters) + + def _take_index(src, f): + result = src.iloc[f] + if src.index.names: + result.index.names = src.index.names + if isinstance(src.index, pd.MultiIndex): + result.index = result.index.remove_unused_levels() + if is_cudf(result): # pragma: no cover + result = result.copy() + return result + + for index_idx in range(len(df_filters[0])): + if is_dataframe_obj: + reducer_index = (index_idx, chunk.index[1]) + else: + reducer_index = (index_idx,) + filtered = [] + filtered_by = [] + for d, filters in zip(dfs, df_filters): + index_filter = filters[index_idx] + if deliver_by: + for v in by: + if isinstance(v, pd.Series): + filtered_by.append(_take_index(v, index_filter)) + else: + filtered_by.append(v) + filtered.append(_take_index(d, index_filter)) + if deliver_by: + ctx[chunk.key, reducer_index] = ctx.get_current_chunk().index, ( + *filtered, + filtered_by, + deliver_by, + ) + else: + if isinstance(df, tuple): + ctx[chunk.key, reducer_index] = ( + ctx.get_current_chunk().index, + tuple(filtered) + (deliver_by,), + ) + else: + ctx[chunk.key, reducer_index] = ( + ctx.get_current_chunk().index, + filtered[0], + ) + + @classmethod + def execute_reduce(cls, ctx, op: "DataFrameGroupByOperand"): + xdf = cudf if op.gpu else pd + chunk = op.outputs[0] + input_idx_to_df = dict(op.iter_mapper_data(ctx)) + row_idxes = sorted(input_idx_to_df.keys()) + + res = [] + for row_idx in row_idxes: + row_df = input_idx_to_df.get(row_idx, None) + if row_df is not None: + res.append(row_df) + by = None + if isinstance(res[0], tuple): + # By is series + deliver_by = res[0][-1] + r = [] + part_len = len(res[0]) + part_len -= 1 if not deliver_by else 2 + for n in range(part_len): + r.append(xdf.concat([it[n] for it in res], axis=0)) + r = tuple(r) + + if deliver_by: + by = [None] * len(res[0][-2]) + for it in res: + for i, v in enumerate(it[1]): + if isinstance(v, pd.Series): + if by[i] is None: + by[i] = v + else: + by[i] = pd.concat([by[i], v], axis=0) + else: + by[i] = v + else: + r = pd.concat(res, axis=0) + + if chunk.index_value is not None: + if isinstance(r, tuple): + for s in r: + s.index.name = chunk.index_value.name + else: + r.index.name = chunk.index_value.name + if by is None: + ctx[chunk.key] = r + elif isinstance(r, tuple): + ctx[chunk.key] = r + (by,) + else: + ctx[chunk.key] = (r, by) + + @classmethod + def execute(cls, ctx, op: "DataFrameGroupByOperand"): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls.execute_reduce(ctx, op) + else: + inp = ctx[op.inputs[0].key] + if isinstance(inp, tuple): + # df, by + df, by = inp + else: + df = inp + by = op.by + ctx[op.outputs[0].key] = wrapped_groupby( + df, + by=by, + level=op.level, + as_index=op.as_index, + sort=op.sort, + group_keys=op.group_keys if op.group_keys is not None else no_default, + ) + + +def groupby( + df, by=None, level=None, as_index=True, sort=True, group_keys=_default_group_keys +): + if not as_index and df.op.output_types[0] == OutputType.series: + raise TypeError("as_index=False only valid with 
DataFrame") + + output_types = ( + [OutputType.dataframe_groupby] if df.ndim == 2 else [OutputType.series_groupby] + ) + if isinstance(by, (SERIES_TYPE, pd.Series)): + if isinstance(by, pd.Series): + by = asseries(by) + by = [by] + elif df.ndim > 1 and by is not None and not isinstance(by, list): + by = [by] + op = DataFrameGroupByOperand( + by=by, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys if group_keys is not no_default else None, + output_types=output_types, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/groupby/cum.py b/python/xorbits/_mars/dataframe/groupby/cum.py new file mode 100644 index 000000000..4ca56c857 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/cum.py @@ -0,0 +1,200 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ...utils import lazy_import +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis + +cudf = lazy_import("cudf") + + +class GroupByCumReductionOperand(DataFrameOperandMixin, DataFrameOperand): + _op_module_ = "dataframe.groupby" + + _axis = AnyField("axis") + _ascending = BoolField("ascending") + + def __init__(self, axis=None, ascending=None, output_types=None, **kw): + super().__init__( + _axis=axis, _ascending=ascending, _output_types=output_types, **kw + ) + + @property + def axis(self) -> int: + return self._axis + + @property + def ascending(self) -> bool: + return self._ascending + + def _calc_out_dtypes(self, in_groupby): + mock_groupby = in_groupby.op.build_mock_groupby() + func_name = getattr(self, "_func_name") + + if func_name == "cumcount": + result_df = mock_groupby.cumcount(ascending=self.ascending) + else: + result_df = getattr(mock_groupby, func_name)(axis=self.axis) + + if isinstance(result_df, pd.DataFrame): + self.output_types = [OutputType.dataframe] + return result_df.dtypes + else: + self.output_types = [OutputType.series] + return result_df.name, result_df.dtype + + def __call__(self, groupby): + in_df = groupby + while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + in_df = in_df.inputs[0] + + self._axis = validate_axis(self.axis or 0, in_df) + + out_dtypes = self._calc_out_dtypes(groupby) + + kw = in_df.params.copy() + kw["index_value"] = parse_index(pd.RangeIndex(-1), groupby.key) + if self.output_types[0] == OutputType.dataframe: + kw.update( + dict( + columns_value=parse_index(out_dtypes.index, store_data=True), + dtypes=out_dtypes, + shape=(groupby.shape[0], len(out_dtypes)), + ) + ) + else: + name, dtype = out_dtypes + kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],)) + return self.new_tileable([groupby], **kw) + + @classmethod + def tile(cls, op): + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in 
in_groupby.chunks: + new_op = op.copy().reset_key() + + new_index = parse_index(pd.RangeIndex(-1), c.key) + if op.output_types[0] == OutputType.dataframe: + chunks.append( + new_op.new_chunk( + [c], + index=c.index, + shape=(np.nan, len(out_df.dtypes)), + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=new_index, + ) + ) + else: + chunks.append( + new_op.new_chunk( + [c], + index=(c.index[0],), + shape=(np.nan,), + dtype=out_df.dtype, + index_value=new_index, + name=out_df.name, + ) + ) + + new_op = op.copy().reset_key() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ((np.nan,) * len(chunks), (len(out_df.dtypes),)) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + return new_op.new_tileables([in_groupby], **kw) + + @classmethod + def execute(cls, ctx, op: "GroupByCumReductionOperand"): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if not in_data or in_data.empty: + ctx[out_chunk.key] = ( + build_empty_df(out_chunk.dtypes) + if op.output_types[0] == OutputType.dataframe + else build_empty_series(out_chunk.dtype, name=out_chunk.name) + ) + return + + func_name = getattr(op, "_func_name") + if func_name == "cumcount": + ctx[out_chunk.key] = in_data.cumcount(ascending=op.ascending) + else: + result = getattr(in_data, func_name)(axis=op.axis) + if result.ndim == 2: + ctx[out_chunk.key] = result.astype(out_chunk.dtypes, copy=False) + else: + ctx[out_chunk.key] = result.astype(out_chunk.dtype, copy=False) + + +class GroupByCummin(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMMIN + _func_name = "cummin" + + +class GroupByCummax(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMMAX + _func_name = "cummax" + + +class GroupByCumsum(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMSUM + _func_name = "cumsum" + + +class GroupByCumprod(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMPROD + _func_name = "cumprod" + + +class GroupByCumcount(GroupByCumReductionOperand): + _op_type_ = opcodes.CUMCOUNT + _func_name = "cumcount" + + +def cumcount(groupby, ascending: bool = True): + op = GroupByCumcount(ascending=ascending) + return op(groupby) + + +def cummin(groupby, axis=0): + op = GroupByCummin(axis=axis) + return op(groupby) + + +def cummax(groupby, axis=0): + op = GroupByCummax(axis=axis) + return op(groupby) + + +def cumprod(groupby, axis=0): + op = GroupByCumprod(axis=axis) + return op(groupby) + + +def cumsum(groupby, axis=0): + op = GroupByCumsum(axis=axis) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/custom_aggregation.py b/python/xorbits/_mars/dataframe/groupby/custom_aggregation.py new file mode 100644 index 000000000..14c3818fc --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/custom_aggregation.py @@ -0,0 +1,86 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
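The module whose header appears just above (custom_aggregation.py) defines a registration hook for three-stage (map/combine/agg) custom groupby aggregations; see the `DataFrameCustomGroupByAggMixin` ABC and `register_custom_groupby_agg_func` below. A purely illustrative sketch of how such a hook might be wired up, with an invented "my_count" aggregation and deliberately simplified stage logic:

```python
import pandas as pd

@register_custom_groupby_agg_func("my_count")  # invented name, for illustration
class MyCountAgg(DataFrameCustomGroupByAggMixin):
    @classmethod
    def execute_map(cls, op, in_data: pd.DataFrame):
        # per-chunk partial counts (assumes the group key is in the index here)
        return in_data.groupby(level=0).size().to_frame("my_count")

    @classmethod
    def execute_combine(cls, op, in_data: pd.DataFrame):
        # merge partial counts coming from several chunks
        return in_data.groupby(level=0).sum()

    @classmethod
    def execute_agg(cls, op, in_data: pd.DataFrame):
        # final per-group counts
        return in_data.groupby(level=0).sum()
```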
+from abc import ABC, abstractmethod +from typing import Dict, Type, Union + +import pandas as pd + + +class DataFrameCustomGroupByAggMixin(ABC): + @classmethod + @abstractmethod + def execute_map(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + """ + Map stage implement. + + Parameters + ------- + op : Any operand + DataFrame operand. + in_data : pd.DataFrame + Input dataframe. + + Returns + ------- + The result of op map stage. + """ + + @classmethod + @abstractmethod + def execute_combine( + cls, op, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + """ + Combine stage implement. + + Parameters + ---------- + op : Any operand + DataFrame operand. + in_data : pd.Dataframe + Input dataframe. + + Returns + ------- + The result of op combine stage. + """ + + @classmethod + @abstractmethod + def execute_agg(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + """ + Agg stage implement. + + Parameters + ---------- + op : Any operand + DataFrame operand. + in_data : pd.Dataframe + Input dataframe. + + Returns + ------- + The result of op agg stage. + """ + + +custom_agg_functions: Dict[str, Type[DataFrameCustomGroupByAggMixin]] = {} + + +def register_custom_groupby_agg_func(method_name: str): + def wrap(func_type: Type[DataFrameCustomGroupByAggMixin]): + custom_agg_functions[method_name] = func_type + return func_type + + return wrap diff --git a/python/xorbits/_mars/dataframe/groupby/fill.py b/python/xorbits/_mars/dataframe/groupby/fill.py new file mode 100644 index 000000000..5a4549af4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/fill.py @@ -0,0 +1,212 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField, DictField, Int64Field, StringField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index + + +class GroupByFillOperand(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "dataframe.groupby" + + value = AnyField("value", default=None) + method = StringField("method", default=None) + axis = AnyField("axis", default=0) + limit = Int64Field("limit", default=None) + downcast = DictField("downcast", default=None) + + def _calc_out_dtypes(self, in_groupby): + mock_groupby = in_groupby.op.build_mock_groupby() + func_name = getattr(self, "_func_name") + + if func_name == "fillna": + result_df = mock_groupby.fillna( + value=self.value, + method=self.method, + axis=self.axis, + limit=self.limit, + downcast=self.downcast, + ) + else: + result_df = getattr(mock_groupby, func_name)(limit=self.limit) + + if isinstance(result_df, pd.DataFrame): + self.output_types = [OutputType.dataframe] + return result_df.dtypes + else: + self.output_types = [OutputType.series] + return result_df.name, result_df.dtype + + def __call__(self, groupby): + in_df = groupby + while in_df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + in_df = in_df.inputs[0] + out_dtypes = self._calc_out_dtypes(groupby) + + kw = in_df.params.copy() + kw["index_value"] = parse_index(pd.RangeIndex(-1), groupby.key) + if self.output_types[0] == OutputType.dataframe: + kw.update( + dict( + columns_value=parse_index(out_dtypes.index, store_data=True), + dtypes=out_dtypes, + shape=(groupby.shape[0], len(out_dtypes)), + ) + ) + else: + name, dtype = out_dtypes + kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],)) + return self.new_tileable([groupby], **kw) + + @classmethod + def tile(cls, op): + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_groupby.chunks: + new_op = op.copy().reset_key() + + new_index = parse_index(pd.RangeIndex(-1), c.key) + if op.output_types[0] == OutputType.dataframe: + chunks.append( + new_op.new_chunk( + [c], + index=c.index, + shape=(np.nan, len(out_df.dtypes)), + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=new_index, + ) + ) + else: + chunks.append( + new_op.new_chunk( + [c], + index=(c.index[0],), + shape=(np.nan,), + dtype=out_df.dtype, + index_value=new_index, + name=out_df.name, + ) + ) + new_op = op.copy().reset_key() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ((np.nan,) * len(chunks), (len(out_df.dtypes),)) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + return new_op.new_tileables([in_groupby], **kw) + + @classmethod + def execute(cls, ctx, op: "GroupByFillOperand"): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if not in_data or in_data.empty: + ctx[out_chunk.key] = ( + build_empty_df(out_chunk.dtypes) + if op.output_types[0] == OutputType.dataframe + else build_empty_series(out_chunk.dtype, name=out_chunk.name) + ) + return + + func_name = getattr(op, "_func_name") + if func_name == "fillna": + ctx[out_chunk.key] = in_data.fillna( + value=op.value, + method=op.method, + axis=op.axis, + limit=op.limit, + downcast=op.downcast, + ) + else: + result = getattr(in_data, func_name)(limit=op.limit) + if result.ndim == 2: + ctx[out_chunk.key] = result.astype(out_chunk.dtypes, copy=False) + else: + ctx[out_chunk.key] = 
result.astype(out_chunk.dtype, copy=False) + + +class GroupByFFill(GroupByFillOperand): + _op_type_ = opcodes.FILL_NA + _func_name = "ffill" + + +class GroupByBFill(GroupByFillOperand): + _op_type_ = opcodes.FILL_NA + _func_name = "bfill" + + +class GroupByFillNa(GroupByFillOperand): + _op_type_ = opcodes.FILL_NA + _func_name = "fillna" + + +def ffill(groupby, limit=None): + """ + Forward fill the values. + + limit: int, default None + Limit number of values to fill + + return: Series or DataFrame + """ + op = GroupByFFill(limit=limit) + return op(groupby) + + +def bfill(groupby, limit=None): + """ + Backward fill the values. + + limit: int, default None + Limit number of values to fill + + return: Series or DataFrame + """ + op = GroupByBFill(limit=limit) + return op(groupby) + + +def fillna(groupby, value=None, method=None, axis=None, limit=None, downcast=None): + """ + Fill NA/NaN values using the specified method. + + value: scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame + of values specifying which value to use for each index (for a Series) or + column (for a DataFrame). Values not in the dict/Series/DataFrame + will not be filled. This value cannot be a list. + method: {'backfill', 'bfill', 'ffill', None}, default None + axis: {0 or 'index', 1 or 'columns'} + limit: int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill + downcast: dict, default None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate equal type + + return: DataFrame or None + """ + op = GroupByFillNa( + value=value, method=method, axis=axis, limit=limit, downcast=downcast + ) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/getitem.py b/python/xorbits/_mars/dataframe/groupby/getitem.py new file mode 100644 index 000000000..c4cbd8c7a --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/getitem.py @@ -0,0 +1,137 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +from ...
import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + + +class GroupByIndex(DataFrameOperandMixin, DataFrameOperand): + _op_type_ = opcodes.INDEX + _op_module_ = "dataframe.groupby" + + _selection = AnyField("selection") + + def __init__(self, selection=None, output_types=None, **kw): + super().__init__(_selection=selection, _output_types=output_types, **kw) + + @property + def selection(self): + return self._selection + + @property + def groupby_params(self): + params = self.inputs[0].op.groupby_params + params["selection"] = self.selection + return params + + def build_mock_groupby(self, **kwargs): + groupby_op = self.inputs[0].op + return groupby_op.build_mock_groupby(**kwargs)[self.selection] + + def __call__(self, groupby): + indexed = groupby.op.build_mock_groupby()[self.selection] + + if indexed.ndim == 1: + self.output_types = [OutputType.series_groupby] + params = dict( + shape=(groupby.shape[0],), + name=self.selection, + dtype=groupby.dtypes[self.selection], + index_value=groupby.index_value, + key_dtypes=groupby.key_dtypes, + ) + else: + self.output_types = [OutputType.dataframe_groupby] + + if isinstance(self.selection, Iterable) and not isinstance( + self.selection, str + ): + item_list = list(self.selection) + else: + item_list = [self.selection] + + params = groupby.params.copy() + params["dtypes"] = new_dtypes = groupby.dtypes[item_list] + params["selection"] = self.selection + params["shape"] = (groupby.shape[0], len(item_list)) + params["columns_value"] = parse_index(new_dtypes.index, store_data=True) + + return self.new_tileable([groupby], **params) + + @classmethod + def tile(cls, op: "GroupByIndex"): + in_groupby = op.inputs[0] + out_groupby = op.outputs[0] + + chunks = [] + for c in in_groupby.chunks: + if op.output_types[0] == OutputType.series_groupby: + params = dict( + shape=(c.shape[0],), + name=op.selection, + index=(c.index[0],), + dtype=c.dtypes[op.selection], + index_value=c.index_value, + key_dtypes=c.key_dtypes, + ) + else: + params = c.params.copy() + params["dtypes"] = out_groupby.dtypes + params["selection"] = op.selection + params["shape"] = (c.shape[0], len(op.selection)) + params["columns_value"] = out_groupby.columns_value + + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + params = out_groupby.params.copy() + new_nsplits = ( + (in_groupby.nsplits[0], (len(op.selection),)) + if out_groupby.ndim == 2 + else (in_groupby.nsplits[0],) + ) + params.update(dict(chunks=chunks, nsplits=new_nsplits)) + return new_op.new_tileables([in_groupby], **params) + + @classmethod + def execute(cls, ctx, op: "GroupByIndex"): + in_data = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = in_data[op.selection] + + +def df_groupby_getitem(df_groupby, item): + try: + hash(item) + hashable = True + except TypeError: + hashable = False + + if hashable and item in df_groupby.dtypes: + output_types = [OutputType.series_groupby] + elif isinstance(item, Iterable) and all(it in df_groupby.dtypes for it in item): + output_types = [OutputType.dataframe_groupby] + else: + raise NameError(f"Cannot slice groupby with {item!r}") + + if df_groupby.selection: + raise IndexError(f"Column(s) {df_groupby.selection!r} already selected") + + op = GroupByIndex(selection=item, output_types=output_types) + return op(df_groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/head.py 
b/python/xorbits/_mars/dataframe/groupby/head.py new file mode 100644 index 000000000..b1b4f4a86 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/head.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType, get_output_types, recursive_tile +from ...serialization.serializables import BoolField, DictField, Int64Field +from ...utils import pd_release_version +from ..core import IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_concatenated_rows_frame, parse_index + +_pandas_enable_negative = pd_release_version >= (1, 4, 0) + + +class GroupByHead(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.GROUPBY_HEAD + _op_module_ = "dataframe.groupby" + + row_count = Int64Field("row_count") + groupby_params = DictField("groupby_params") + enable_negative = BoolField("enable_negative") + + def __call__(self, groupby): + df = groupby + while df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + df = df.inputs[0] + + selection = groupby.op.groupby_params.pop("selection", None) + if df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in df.dtypes: + selection = list(selection) + + result_df = df[selection] + else: + result_df = df + + self._output_types = ( + [OutputType.dataframe] if result_df.ndim == 2 else [OutputType.series] + ) + + params = result_df.params + params["shape"] = (np.nan,) + result_df.shape[1:] + if isinstance(df.index_value.value, IndexValue.RangeIndex): + params["index_value"] = parse_index(pd.RangeIndex(-1), df.key) + + return self.new_tileable([df], **params) + + @classmethod + def tile(cls, op: "GroupByHead"): + in_df = op.inputs[0] + groupby_params = op.groupby_params.copy() + selection = groupby_params.pop("selection", None) + + enable_negative = _pandas_enable_negative and op.enable_negative + + if len(in_df.shape) > 1: + in_df = build_concatenated_rows_frame(in_df) + out_df = op.outputs[0] + + # when row_count is not positive and pandas does not support negative head, + # or there is only one chunk, tile with a single chunk + if (not enable_negative and op.row_count <= 0) or len(in_df.chunks) <= 1: + row_num = 0 if not enable_negative and op.row_count <= 0 else np.nan + new_shape = (row_num,) + new_nsplits = ((row_num,),) + if out_df.ndim > 1: + new_shape += (out_df.shape[1],) + new_nsplits += ((out_df.shape[1],),) + + c = in_df.chunks[0] + chunk_op = op.copy().reset_key() + params = out_df.params + params["shape"] = new_shape + params["index"] = (0,) * out_df.ndim + out_chunk = chunk_op.new_chunk([c], **params) + + tileable_op = op.copy().reset_key() + return tileable_op.new_tileables( + [in_df], nsplits=new_nsplits, chunks=[out_chunk], **params + ) + + if in_df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in in_df.dtypes: + selection = list(selection) + + if not 
isinstance(selection, list): + pre_selection = [selection] + else: + pre_selection = list(selection) + + if isinstance(groupby_params.get("by"), list): + pre_selection += [ + el for el in groupby_params["by"] if el not in pre_selection + ] + + if len(pre_selection) != in_df.shape[1]: + in_df = yield from recursive_tile(in_df[pre_selection]) + + # generate pre chunks + if op.row_count < 0: + # when we have negative row counts, pre-groupby optimization is not possible + pre_chunks = in_df.chunks + else: + pre_chunks = [] + for c in in_df.chunks: + pre_op = op.copy().reset_key() + pre_op._output_types = get_output_types(c) + pre_op.groupby_params = op.groupby_params.copy() + pre_op.groupby_params.pop("selection", None) + params = c.params + params["shape"] = (np.nan,) + c.shape[1:] + pre_chunks.append(pre_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + new_op._output_types = get_output_types(in_df) + new_nsplits = ((np.nan,) * len(in_df.nsplits[0]),) + in_df.nsplits[1:] + pre_tiled = new_op.new_tileable( + [in_df], chunks=pre_chunks, nsplits=new_nsplits, **in_df.params + ) + + # generate groupby + grouped = yield from recursive_tile(pre_tiled.groupby(**groupby_params)) + if selection: + grouped = yield from recursive_tile(grouped[selection]) + + # generate post chunks + post_chunks = [] + for c in grouped.chunks: + post_op = op.copy().reset_key() + post_op.groupby_params = op.groupby_params.copy() + post_op.groupby_params.pop("selection", None) + if op.output_types[0] == OutputType.dataframe: + index = c.index + else: + index = (c.index[0],) + params = out_df.params + params["index"] = index + post_chunks.append(post_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + new_nsplits = ((np.nan,) * len(in_df.nsplits[0]),) + if out_df.ndim > 1: + new_nsplits += ((out_df.shape[1],),) + return new_op.new_tileables( + [in_df], chunks=post_chunks, nsplits=new_nsplits, **out_df.params + ) + + @classmethod + def execute(cls, ctx, op: "GroupByHead"): + in_data = ctx[op.inputs[0].key] + + params = op.groupby_params.copy() + selection = params.pop("selection", None) + + if hasattr(in_data, "groupby"): + grouped = in_data.groupby(**params) + else: + grouped = in_data + + if selection: + grouped = grouped[selection] + + result = grouped.head(op.row_count) + if not op.enable_negative and op.row_count < 0: + result = result.iloc[:0] + ctx[op.outputs[0].key] = result + + +def head(groupby, n=5): + """ + Return first n rows of each group. + + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original Series or DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Does not work for negative values of `n`. + + Returns + ------- + Series or DataFrame + + See Also + -------- + Series.groupby + DataFrame.groupby + + Examples + -------- + + >>> import mars.dataframe as md + >>> df = md.DataFrame([[1, 2], [1, 4], [5, 6]], + ... 
columns=['A', 'B']) + >>> df.groupby('A').head(1).execute() + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(-1).execute() + Empty DataFrame + Columns: [A, B] + Index: [] + """ + groupby_params = groupby.op.groupby_params.copy() + groupby_params.pop("as_index", None) + + op = GroupByHead( + row_count=n, + groupby_params=groupby_params, + enable_negative=_pandas_enable_negative, + ) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/nunique.py b/python/xorbits/_mars/dataframe/groupby/nunique.py new file mode 100644 index 000000000..9a9f97c31 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/nunique.py @@ -0,0 +1,157 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Union + +import pandas as pd + +from ...core import OutputType +from ...utils import implements +from .aggregation import DataFrameGroupByAgg +from .custom_aggregation import ( + DataFrameCustomGroupByAggMixin, + register_custom_groupby_agg_func, +) + + +@register_custom_groupby_agg_func("nunique") +class DataFrameCustomGroupByNuniqueMixin(DataFrameCustomGroupByAggMixin): + @classmethod + def _get_level_indexes( + cls, op: DataFrameGroupByAgg, data: pd.DataFrame + ) -> List[int]: + """ + When group by level, get the level index list. + Level can be int, level name, or sequence of such. + This function calculates the corresponding indexes. + Parameters + ---------- + op + data + + Returns + ------- + + """ + index = [data.index.name] if data.index.name else data.index.names + index = pd.Index(index) + level = op.groupby_params["level"] + if isinstance(level, int): + indexes = [level] + elif isinstance(level, str): + indexes = [index.get_loc(level)] + else: + level = list(level) + if isinstance(level[0], int): + indexes = level + else: + indexes = index.get_indexer(level).tolist() + return indexes + + @classmethod + def _get_selection_columns(cls, op: DataFrameGroupByAgg) -> Union[None, List]: + """ + Get groupby selection columns from op parameters. + If this returns None, it means all columns are required. 
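+ For example, a ``selection`` of ``"a"`` or ``("a", "b")`` in ``op.groupby_params`` is normalized to ``["a"]`` or ``["a", "b"]`` respectively; when no selection is present, ``None`` is returned.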
+ Parameters + ---------- + op + + Returns + ------- + + """ + if "selection" in op.groupby_params: + selection = op.groupby_params["selection"] + if isinstance(selection, (tuple, list)): + selection = [n for n in selection] + else: + selection = [selection] + return selection + + @classmethod + def _get_execute_map_result( + cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + selections = cls._get_selection_columns(op) + by_cols = op.raw_groupby_params["by"] + if by_cols is not None: + cols = ( + [*selections, *by_cols] if selections is not None else in_data.columns + ) + res = in_data[cols].drop_duplicates(subset=cols).set_index(by_cols) + else: # group by level + selections = selections if selections is not None else in_data.columns + level_indexes = cls._get_level_indexes(op, in_data) + in_data = in_data.reset_index() + index_names = in_data.columns[level_indexes].tolist() + cols = [*index_names, *selections] + res = in_data[cols].drop_duplicates().set_index(index_names) + + # if sort=True is specified, sort index when finishing drop_duplicates. + if op.raw_groupby_params["sort"]: + res = res.sort_index() + + if op.output_types[0] == OutputType.series: + res = res.squeeze() + + return res + + @classmethod + def _get_execute_combine_result( + cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + # in_data.index.names means MultiIndex (groupby on multi cols) + index_col = in_data.index.name or in_data.index.names + res = in_data.reset_index().drop_duplicates().set_index(index_col) + if op.output_types[0] == OutputType.series: + res = res.squeeze() + return res + + @classmethod + def _get_execute_agg_result( + cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + groupby_params = op.groupby_params.copy() + cols = in_data.index.name or in_data.index.names + by = op.raw_groupby_params["by"] + + if by is not None: + if op.output_types[0] == OutputType.dataframe: + groupby_params.pop("level", None) + groupby_params["by"] = cols + in_data = in_data.reset_index() + else: + # When group by multi levels, we must get the actual all levels from raw_groupby_params, + # since level field in op.groupby_params is not correct. + groupby_params["level"] = op.raw_groupby_params["level"] + + res = in_data.groupby(**groupby_params).nunique() + return res + + @classmethod + @implements(DataFrameCustomGroupByAggMixin.execute_map) + def execute_map(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + return cls._get_execute_map_result(op, in_data) + + @classmethod + @implements(DataFrameCustomGroupByAggMixin.execute_combine) + def execute_combine( + cls, op, in_data: pd.DataFrame + ) -> Union[pd.DataFrame, pd.Series]: + return cls._get_execute_combine_result(op, in_data) + + @classmethod + @implements(DataFrameCustomGroupByAggMixin.execute_agg) + def execute_agg(cls, op, in_data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]: + return cls._get_execute_agg_result(op, in_data) diff --git a/python/xorbits/_mars/dataframe/groupby/sample.py b/python/xorbits/_mars/dataframe/groupby/sample.py new file mode 100644 index 000000000..49b2c9635 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/sample.py @@ -0,0 +1,626 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import itertools +import random +from collections.abc import Iterable +from typing import Optional, Sequence, Union + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import ENTITY_TYPE, OutputType, get_output_types, recursive_tile +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import ( + BoolField, + DictField, + Float32Field, + Int32Field, + Int64Field, + KeyField, + NDArrayField, + StringField, +) +from ...tensor.operands import TensorShuffleProxy +from ...tensor.random import RandomStateField +from ...tensor.utils import gen_random_seeds +from ...utils import has_unknown_shape +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index + +_ILOC_COL_HEADER = "_gsamp_iloc_col_" +_WEIGHT_COL_HEADER = "_gsamp_weight_col_" + + +# code adapted from pandas.core.groupby.groupby.DataFrameGroupBy.sample +def _sample_groupby_iter( + groupby, obj_index, n, frac, replace, weights, random_state=None, errors="ignore" +): + if weights is None: + ws = [None] * groupby.ngroups + elif not isinstance(weights, Iterable) or isinstance(weights, str): + ws = [weights] * groupby.ngroups + else: + weights = pd.Series(weights, index=obj_index) + ws = [weights.iloc[idx] for idx in groupby.indices.values()] + + group_iterator = groupby.grouper.get_iterator(groupby._selected_obj) + if not replace and errors == "ignore": + for (_, obj), w in zip(group_iterator, ws): + yield obj.sample( + n=n, frac=frac, replace=replace, weights=w, random_state=random_state + ) if len(obj) > n else obj + else: + for (_, obj), w in zip(group_iterator, ws): + yield obj.sample( + n=n, frac=frac, replace=replace, weights=w, random_state=random_state + ) + + +class GroupBySampleILoc(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.GROUPBY_SAMPLE_ILOC + _op_module_ = "dataframe.groupby" + + groupby_params = DictField("groupby_params", default=None) + size = Int64Field("size", default=None) + frac = Float32Field("frac", default=None) + replace = BoolField("replace", default=None) + weights = KeyField("weights", default=None) + seed = Int32Field("seed", default=None) + _random_state = RandomStateField("random_state", default=None) + errors = StringField("errors", default=None) + + random_col_id = Int32Field("random_col_id", default=None) + + # for chunks + # num of instances for chunks + left_iloc_bound = Int64Field("left_iloc_bound", default=None) + + def __init__(self, random_state=None, **kw): + super().__init__(_random_state=random_state, **kw) + if self.random_col_id is None: + self.random_col_id = random.randint(10000, 99999) + + @property + def random_state(self): + if self._random_state is None: + self._random_state = np.random.RandomState(self.seed) + return self._random_state + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs) + next(input_iter) + if isinstance(self.weights, ENTITY_TYPE): + self.weights = next(input_iter) + + def __call__(self, df): + self._output_types = 
[OutputType.tensor] + inp_tileables = [df] + if self.weights is not None: + inp_tileables.append(self.weights) + return self.new_tileable( + inp_tileables, dtype=np.dtype(np.int_), shape=(np.nan,) + ) + + @classmethod + def tile(cls, op: "GroupBySampleILoc"): + in_df = op.inputs[0] + out_tensor = op.outputs[0] + iloc_col_header = _ILOC_COL_HEADER + str(op.random_col_id) + weight_col_header = _WEIGHT_COL_HEADER + str(op.random_col_id) + + if has_unknown_shape(in_df): + yield + + if op.weights is None: + weights_iter = itertools.repeat(None) + else: + weights_iter = iter(op.weights.chunks) + + if isinstance(op.groupby_params["by"], list): + map_cols = list(op.groupby_params["by"]) + else: # pragma: no cover + map_cols = [] + + dtypes = in_df.dtypes.copy() + dtypes.at[iloc_col_header] = np.dtype(np.int_) + map_cols.append(iloc_col_header) + if op.weights is not None: + dtypes.at[weight_col_header] = op.weights.dtype + map_cols.append(weight_col_header) + + new_dtypes = dtypes[map_cols] + new_columns_value = parse_index(new_dtypes.index, store_data=True) + + map_chunks = [] + left_ilocs = np.array((0,) + in_df.nsplits[0]).cumsum() + for inp_chunk, weight_chunk in zip(in_df.chunks, weights_iter): + new_op = op.copy().reset_key() + new_op.left_iloc_bound = int(left_ilocs[inp_chunk.index[0]]) + new_op.stage = OperandStage.map + new_op.output_types = [OutputType.dataframe] + + inp_chunks = [inp_chunk] + if weight_chunk is not None: + inp_chunks.append(weight_chunk) + params = inp_chunk.params + params.update( + dict( + dtypes=new_dtypes, + columns_value=new_columns_value, + shape=(inp_chunk.shape[0], len(new_dtypes)), + index=inp_chunk.index, + ) + ) + map_chunks.append(new_op.new_chunk(inp_chunks, **params)) + + new_op = op.copy().reset_key() + new_op._output_types = [OutputType.dataframe] + params = in_df.params + params.update( + dict( + chunks=map_chunks, + nsplits=(in_df.nsplits[0], (len(new_dtypes),)), + dtypes=new_dtypes, + columns_value=new_columns_value, + shape=(in_df.shape[0], len(new_dtypes)), + ) + ) + map_df = new_op.new_tileable(op.inputs, **params) + + groupby_params = op.groupby_params.copy() + groupby_params.pop("selection", None) + grouped = yield from recursive_tile(map_df.groupby(**groupby_params)) + + result_chunks = [] + seeds = gen_random_seeds(len(grouped.chunks), op.random_state) + for group_chunk, seed in zip(grouped.chunks, seeds): + new_op = op.copy().reset_key() + new_op.stage = OperandStage.reduce + new_op.weights = None + new_op._random_state = None + new_op.seed = seed + + result_chunks.append( + new_op.new_chunk( + [group_chunk], + shape=(np.nan,), + index=(group_chunk.index[0],), + dtype=out_tensor.dtype, + ) + ) + + new_op = op.copy().reset_key() + params = out_tensor.params + params.update( + dict(chunks=result_chunks, nsplits=((np.nan,) * len(result_chunks),)) + ) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "GroupBySampleILoc"): + in_data = ctx[op.inputs[0].key] + iloc_col = _ILOC_COL_HEADER + str(op.random_col_id) + weight_col = _WEIGHT_COL_HEADER + str(op.random_col_id) + if op.stage == OperandStage.map: + if op.weights is not None: + ret = pd.DataFrame( + { + iloc_col: np.arange( + op.left_iloc_bound, op.left_iloc_bound + len(in_data) + ), + weight_col: ctx[op.weights.key], + }, + index=in_data.index, + ) + else: + ret = pd.DataFrame( + { + iloc_col: np.arange( + op.left_iloc_bound, op.left_iloc_bound + len(in_data) + ), + }, + index=in_data.index, + ) + + if isinstance(op.groupby_params["by"], list): + 
ret = pd.concat([in_data[op.groupby_params["by"]], ret], axis=1) + + ctx[op.outputs[0].key] = ret + else: + if weight_col not in in_data.obj.columns: + weight_col = None + + if len(in_data.obj) == 0 or in_data.ngroups == 0: + ctx[op.outputs[0].key] = np.array([], dtype=np.int_) + else: + ctx[op.outputs[0].key] = np.concatenate( + [ + sample_pd[iloc_col].to_numpy() + for sample_pd in _sample_groupby_iter( + in_data, + in_data.obj.index, + n=op.size, + frac=op.frac, + replace=op.replace, + weights=weight_col, + random_state=op.random_state, + errors=op.errors, + ) + ] + ) + + +class GroupBySample(MapReduceOperand, DataFrameOperandMixin): + _op_code_ = opcodes.RAND_SAMPLE + _op_module_ = "dataframe.groupby" + + groupby_params = DictField("groupby_params", default=None) + size = Int64Field("size", default=None) + frac = Float32Field("frac", default=None) + replace = BoolField("replace", default=None) + weights = KeyField("weights", default=None) + seed = Int32Field("seed", default=None) + _random_state = RandomStateField("random_state", default=None) + errors = StringField("errors", default=None) + + # for chunks + # num of instances for chunks + input_nsplits = NDArrayField("input_nsplits", default=None) + + def __init__(self, random_state=None, **kw): + super().__init__(_random_state=random_state, **kw) + + @property + def random_state(self): + return self._random_state + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs) + next(input_iter) + if isinstance(self.weights, ENTITY_TYPE): + self.weights = next(input_iter) + + def __call__(self, groupby): + df = groupby + while df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + df = df.inputs[0] + + selection = groupby.op.groupby_params.pop("selection", None) + if df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in df.dtypes: + selection = list(selection) + result_df = df[selection] + else: + result_df = df + + params = result_df.params + params["shape"] = ( + (np.nan,) if result_df.ndim == 1 else (np.nan, result_df.shape[-1]) + ) + params["index_value"] = parse_index(result_df.index_value.to_pandas()[:0]) + + input_dfs = [df] + if isinstance(self.weights, ENTITY_TYPE): + input_dfs.append(self.weights) + + self._output_types = get_output_types(result_df) + return self.new_tileable(input_dfs, **params) + + @classmethod + def _tile_one_chunk(cls, op: "GroupBySample", in_df, weights): + out = op.outputs[0] + + input_dfs = [in_df] + if isinstance(weights, ENTITY_TYPE): + input_dfs.append(weights) + + params = out.params + chunk_op = op.copy().reset_key() + if isinstance(weights, ENTITY_TYPE): + chunk_op._weights = weights + params["index"] = (0,) * out.ndim + chunk = chunk_op.new_chunk([c.chunks[0] for c in input_dfs], **params) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=[chunk], nsplits=((s,) for s in out.shape), **params + ) + + @classmethod + def _tile_distributed(cls, op: "GroupBySample", in_df, weights): + out_df = op.outputs[0] + if has_unknown_shape(in_df): + yield + + sample_iloc_op = GroupBySampleILoc( + groupby_params=op.groupby_params, + size=op.size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + errors=op.errors, + seed=None, + left_iloc_bound=None, + ) + sampled_iloc = yield from recursive_tile(sample_iloc_op(in_df)) + + map_chunks = [] + for c in sampled_iloc.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + new_op.weights = None + 
new_op.output_types = [OutputType.tensor] + new_op.input_nsplits = np.array(in_df.nsplits[0]) + + map_chunks.append( + new_op.new_chunk( + [c], dtype=sampled_iloc.dtype, shape=(np.nan,), index=c.index + ) + ) + + proxy_chunk = TensorShuffleProxy(dtype=sampled_iloc.dtype).new_chunk( + map_chunks, shape=() + ) + + reduce_chunks = [] + for ordinal, src_chunk in enumerate(in_df.chunks): + new_op = op.copy().reset_key() + new_op.weights = None + new_op.output_types = [OutputType.tensor] + new_op.stage = OperandStage.reduce + new_op.reducer_index = (src_chunk.index[0],) + new_op.reducer_ordinal = ordinal + new_op.n_reducers = len(in_df.chunks) + new_op.input_nsplits = np.array(in_df.nsplits[0]) + + reduce_chunks.append( + new_op.new_chunk( + [proxy_chunk], + index=src_chunk.index, + dtype=sampled_iloc.dtype, + shape=(np.nan,), + ) + ) + + combine_chunks = [] + for src_chunk, reduce_chunk in zip(in_df.chunks, reduce_chunks): + new_op = op.copy().reset_key() + new_op.stage = OperandStage.combine + new_op._weights = None + + params = out_df.params + if out_df.ndim == 2: + params.update( + dict( + index=src_chunk.index, + dtypes=out_df.dtypes, + shape=(np.nan, out_df.shape[1]), + columns_value=out_df.columns_value, + ) + ) + else: + params.update( + dict( + index=(src_chunk.index[0],), + dtype=out_df.dtype, + shape=(np.nan,), + name=out_df.name, + ) + ) + combine_chunks.append(new_op.new_chunk([src_chunk, reduce_chunk], **params)) + + new_op = op.copy().reset_key() + if out_df.ndim == 2: + new_nsplits = ((np.nan,) * in_df.chunk_shape[0], (out_df.shape[1],)) + else: + new_nsplits = ((np.nan,) * in_df.chunk_shape[0],) + return new_op.new_tileables( + out_df.inputs, chunks=combine_chunks, nsplits=new_nsplits, **out_df.params + ) + + @classmethod + def tile(cls, op: "GroupBySample"): + in_df = op.inputs[0] + if in_df.ndim == 2: + in_df = yield from recursive_tile(in_df.rechunk({1: (in_df.shape[1],)})) + + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = yield from recursive_tile(weights.rechunk({0: in_df.nsplits[0]})) + + if len(in_df.chunks) == 1: + return cls._tile_one_chunk(op, in_df, weights) + return (yield from cls._tile_distributed(op, in_df, weights)) + + @classmethod + def execute(cls, ctx, op: "GroupBySample"): + out_df = op.outputs[0] + + if op.stage == OperandStage.map: + in_data = ctx[op.inputs[0].key] + in_data = np.sort(in_data) + input_nsplits = np.copy(op.input_nsplits).tolist() + pos_array = np.cumsum([0] + input_nsplits) + poses = np.searchsorted(in_data, pos_array).tolist() + for idx, (left, right) in enumerate(zip(poses, poses[1:])): + ctx[op.outputs[0].key, (idx,)] = in_data[left:right] + elif op.stage == OperandStage.reduce: + in_indexes = list(op.iter_mapper_data(ctx)) + idx = np.sort(np.concatenate(in_indexes)) + if op.outputs[0].index[0] > 0: + acc_nsplits = np.cumsum(op.input_nsplits) + idx -= acc_nsplits[op.outputs[0].index[0] - 1] + ctx[op.outputs[0].key] = idx + elif op.stage == OperandStage.combine: + in_data = ctx[op.inputs[0].key] + idx = ctx[op.inputs[1].key] + selection = op.groupby_params.get("selection") + if selection: + in_data = in_data[selection] + ctx[op.outputs[0].key] = in_data.iloc[idx] + else: + in_data = ctx[op.inputs[0].key] + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = ctx[weights.key] + params = op.groupby_params.copy() + selection = params.pop("selection", None) + + grouped = in_data.groupby(**params) + if selection is not None: + grouped = grouped[selection] + + result = pd.concat( + [ + sample_df + 
for sample_df in _sample_groupby_iter( + grouped, + in_data.index, + n=op.size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + errors=op.errors, + ) + ] + ) + ctx[out_df.key] = result + + +def groupby_sample( + groupby, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + weights: Union[Sequence, pd.Series, None] = None, + random_state: Optional[np.random.RandomState] = None, + errors: str = "ignore", +): + """ + Return a random sample of items from each group. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items to return for each group. Cannot be used with + `frac` and must be no larger than the smallest group unless + `replace` is True. Default is one if `frac` is None. + frac : float, optional + Fraction of items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : list-like, optional + Default None results in equal probability weighting. + If passed a list-like then values must have the same length as + the underlying DataFrame or Series object and will be used as + sampling probabilities after normalization within each group. + Values must be non-negative with at least one positive element + within each group. + random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. + errors : {'ignore', 'raise'}, default 'ignore' + If ignore, errors will not be raised when `replace` is False + and size of some group is less than `n`. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing items randomly + sampled within each group from the caller object. + + See Also + -------- + DataFrame.sample: Generate random samples from a DataFrame object. + numpy.random.choice: Generate a random sample from a given 1-D numpy + array. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame( + ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} + ... ) + >>> df.execute() + a b + 0 red 0 + 1 red 1 + 2 blue 2 + 3 blue 3 + 4 black 4 + 5 black 5 + + Select one row at random for each distinct value in column a. The + `random_state` argument can be used to guarantee reproducibility: + + >>> df.groupby("a").sample(n=1, random_state=1).execute() + a b + 4 black 4 + 2 blue 2 + 1 red 1 + + Set `frac` to sample fixed proportions rather than counts: + + >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2).execute() + 5 5 + 2 2 + 0 0 + Name: b, dtype: int64 + + Control sample probabilities within groups by setting weights: + + >>> df.groupby("a").sample( + ... n=1, + ... weights=[1, 1, 1, 0, 0, 1], + ... random_state=1, + ... 
).execute() + a b + 5 black 5 + 2 blue 2 + 0 red 0 + """ + groupby_params = groupby.op.groupby_params.copy() + groupby_params.pop("as_index", None) + + if weights is not None and not isinstance(weights, ENTITY_TYPE): + weights = asseries(weights) + + n = 1 if n is None and frac is None else n + rs = copy.deepcopy( + random_state.to_numpy() if hasattr(random_state, "to_numpy") else random_state + ) + if not isinstance(rs, np.random.RandomState): # pragma: no cover + rs = np.random.RandomState(rs) + + op = GroupBySample( + size=n, + frac=frac, + replace=replace, + weights=weights, + random_state=rs, + groupby_params=groupby_params, + errors=errors, + ) + return op(groupby) diff --git a/python/xorbits/_mars/dataframe/groupby/sort.py b/python/xorbits/_mars/dataframe/groupby/sort.py new file mode 100644 index 000000000..df43b4156 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/sort.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import Int32Field, ListField +from ...utils import lazy_import +from ..operands import DataFrameOperandMixin +from ..sort.psrs import DataFramePSRSChunkOperand + +cudf = lazy_import("cudf") + + +def _series_to_df(in_series, xdf): + in_df = in_series.to_frame() + if in_series.name is not None: + in_df.columns = xdf.Index([in_series.name]) + return in_df + + +class DataFrameGroupbyConcatPivot(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_SORT_PIVOT + + @property + def output_limit(self): + return 1 + + @classmethod + def execute(cls, ctx, op: "DataFrameGroupbyConcatPivot"): + inputs = [ctx[c.key] for c in op.inputs if len(ctx[c.key]) > 0] + + xdf = pd if isinstance(inputs[0], (pd.DataFrame, pd.Series)) else cudf + + a = xdf.concat(inputs, axis=0) + a = a.sort_index() + index = a.index.drop_duplicates() + + p = len(inputs) + if len(index) < p: + num = p // len(index) + 1 + index = index.append([index] * (num - 1)) + + index = index.sort_values() + + values = index.values + + slc = np.linspace( + p - 1, len(index) - 1, num=len(op.inputs) - 1, endpoint=False + ).astype(int) + out = values[slc] + ctx[op.outputs[-1].key] = out + + +class DataFramePSRSGroupbySample(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_SORT_REGULAR_SAMPLE + + @property + def output_limit(self): + return 1 + + @classmethod + def execute(cls, ctx, op: "DataFramePSRSGroupbySample"): + a = ctx[op.inputs[0].key][0] + xdf = pd if isinstance(a, (pd.DataFrame, pd.Series)) else cudf + if isinstance(a, xdf.Series) and op.output_types[0] == OutputType.dataframe: + a = _series_to_df(a, xdf) + + n = op.n_partition + if a.shape[0] < n: + num = n // a.shape[0] + 1 + a = xdf.concat([a] * num).sort_index() + + w = a.shape[0] * 1.0 / (n + 1) + + slc = np.linspace(max(w - 1, 
0), a.shape[0] - 1, num=n, endpoint=False).astype( + int + ) + + out = a.iloc[slc] + if op.output_types[0] == OutputType.series and out.ndim == 2: + assert out.shape[1] == 1 + out = out.iloc[:, 0] + ctx[op.outputs[-1].key] = out + + +class DataFrameGroupbySortShuffle(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_SORT_SHUFFLE + + # for shuffle map + by = ListField("by") + n_partition = Int32Field("n_partition") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @property + def output_limit(self): + return 1 + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameGroupbySortShuffle"): + df, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + def _get_out_df(p_index, in_df): + if p_index == 0: + out_df = in_df.loc[: pivots[p_index]] + elif p_index == op.n_partition - 1: + out_df = in_df.loc[pivots[p_index - 1] :].drop( + index=pivots[p_index - 1], errors="ignore" + ) + else: + out_df = in_df.loc[pivots[p_index - 1] : pivots[p_index]].drop( + index=pivots[p_index - 1], errors="ignore" + ) + return out_df + + for i in range(op.n_partition): + index = (i, 0) + out_df = tuple(_get_out_df(i, x) for x in df) + ctx[out.key, index] = out_df + + @classmethod + def _execute_reduce(cls, ctx, op: "DataFrameGroupbySortShuffle"): + raw_inputs = list(op.iter_mapper_data(ctx, pop=False)) + by = op.by + xdf = cudf if op.gpu else pd + + r = [] + + tuple_len = len(raw_inputs[0]) + for i in range(tuple_len): + r.append(xdf.concat([inp[i] for inp in raw_inputs], axis=0)) + r = tuple(r) + + ctx[op.outputs[0].key] = r + (by,) + + @classmethod + def estimate_size(cls, ctx, op: "DataFrameGroupbySortShuffle"): + super().estimate_size(ctx, op) + result = ctx[op.outputs[0].key] + if op.stage == OperandStage.reduce: + ctx[op.outputs[0].key] = (result[0], result[1] * 1.5) + else: + ctx[op.outputs[0].key] = result + + @classmethod + def execute(cls, ctx, op: "DataFrameGroupbySortShuffle"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) diff --git a/python/xorbits/_mars/dataframe/groupby/tests/__init__.py b/python/xorbits/_mars/dataframe/groupby/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby.py new file mode 100644 index 000000000..79e4eec0b --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby.py @@ -0,0 +1,526 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import opcodes +from ....config import option_context +from ....core import OutputType, tile +from ....core.operand import OperandStage +from ...core import DataFrame, DataFrameGroupBy, SeriesGroupBy +from ..aggregation import DataFrameGroupByAgg +from ..core import DataFrameGroupByOperand, DataFrameShuffleProxy +from ..getitem import GroupByIndex +from ..sort import DataFrameGroupbySortShuffle + + +def test_groupby(): + df = pd.DataFrame( + {"a": [3, 4, 5, 3, 5, 4, 1, 2, 3], "b": [1, 3, 4, 5, 6, 5, 4, 4, 4]} + ) + mdf = md.DataFrame(df, chunk_size=2) + with pytest.raises(KeyError): + mdf.groupby("c2") + with pytest.raises(KeyError): + mdf.groupby(["b", "c2"]) + + grouped = mdf.groupby("b") + assert isinstance(grouped, DataFrameGroupBy) + assert isinstance(grouped.op, DataFrameGroupByOperand) + assert list(grouped.key_dtypes.index) == ["b"] + + grouped = tile(grouped) + assert len(grouped.chunks) == 5 + for chunk in grouped.chunks: + assert isinstance(chunk.op, DataFrameGroupByOperand) + + series = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms = md.Series(series, chunk_size=3) + grouped = ms.groupby(lambda x: x + 1) + + assert isinstance(grouped, SeriesGroupBy) + assert isinstance(grouped.op, DataFrameGroupByOperand) + + grouped = tile(grouped) + assert len(grouped.chunks) == 3 + for chunk in grouped.chunks: + assert isinstance(chunk.op, DataFrameGroupByOperand) + + with pytest.raises(TypeError): + ms.groupby(lambda x: x + 1, as_index=False) + + +def test_groupby_get_item(): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + } + ) + mdf = md.DataFrame(df1, chunk_size=3) + + r = tile(mdf.groupby("b")[["a", "b"]]) + assert isinstance(r, DataFrameGroupBy) + assert isinstance(r.op, GroupByIndex) + assert r.selection == ["a", "b"] + assert list(r.key_dtypes.index) == ["b"] + assert len(r.chunks) == 3 + + r = tile(mdf.groupby("b").a) + assert isinstance(r, SeriesGroupBy) + assert isinstance(r.op, GroupByIndex) + assert r.name == "a" + assert list(r.key_dtypes.index) == ["b"] + assert len(r.chunks) == 3 + + with pytest.raises(IndexError): + getattr(mdf.groupby("b")[["a", "b"]], "a") + + +def test_groupby_agg(): + df = pd.DataFrame( + { + "a": np.random.choice([2, 3, 4], size=(20,)), + "b": np.random.choice([2, 3, 4], size=(20,)), + } + ) + mdf = md.DataFrame(df, chunk_size=3) + r = mdf.groupby("a").agg("sum", method="tree") + assert isinstance(r.op, DataFrameGroupByAgg) + assert isinstance(r, DataFrame) + assert r.op.method == "tree" + r = tile(r) + assert len(r.chunks) == 1 + assert r.chunks[0].op.stage == OperandStage.agg + assert len(r.chunks[0].inputs) == 1 + assert len(r.chunks[0].inputs[0].inputs) == 2 + + df = pd.DataFrame( + { + "c1": range(10), + "c2": np.random.choice(["a", "b", "c"], (10,)), + "c3": np.random.rand(10), + } + ) + mdf = md.DataFrame(df, chunk_size=2) + r = mdf.groupby("c2", sort=False).sum(method="shuffle") + + assert isinstance(r.op, 
DataFrameGroupByAgg) + assert isinstance(r, DataFrame) + + r = tile(r) + assert len(r.chunks) == 5 + for chunk in r.chunks: + assert isinstance(chunk.op, DataFrameGroupByAgg) + assert chunk.op.stage == OperandStage.agg + assert isinstance(chunk.inputs[0].op, DataFrameGroupByOperand) + assert chunk.inputs[0].op.stage == OperandStage.reduce + assert isinstance(chunk.inputs[0].inputs[0].op, DataFrameShuffleProxy) + assert isinstance( + chunk.inputs[0].inputs[0].inputs[0].op, DataFrameGroupByOperand + ) + assert chunk.inputs[0].inputs[0].inputs[0].op.stage == OperandStage.map + + agg_chunk = chunk.inputs[0].inputs[0].inputs[0].inputs[0] + assert agg_chunk.op.stage == OperandStage.map + + r = mdf.groupby( + "c2", + ).sum(method="shuffle") + + assert isinstance(r.op, DataFrameGroupByAgg) + assert isinstance(r, DataFrame) + + r = tile(r) + assert len(r.chunks) == 5 + for chunk in r.chunks: + assert isinstance(chunk.op, DataFrameGroupByAgg) + assert chunk.op.stage == OperandStage.agg + assert isinstance(chunk.inputs[0].op, DataFrameGroupbySortShuffle) + assert chunk.inputs[0].op.stage == OperandStage.reduce + assert isinstance(chunk.inputs[0].inputs[0].op, DataFrameShuffleProxy) + assert isinstance( + chunk.inputs[0].inputs[0].inputs[0].op, DataFrameGroupbySortShuffle + ) + assert chunk.inputs[0].inputs[0].inputs[0].op.stage == OperandStage.map + + agg_chunk = chunk.inputs[0].inputs[0].inputs[0].inputs[0] + assert agg_chunk.op.stage == OperandStage.map + + # test unknown method + with pytest.raises(ValueError): + mdf.groupby("c2").sum(method="not_exist") + + +def test_groupby_auto_on_cluster(): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + # test DataFrameGroupByAgg._tile_auto_on_distributed + with option_context({"chunk_store_limit": 80}): + # chunk_store_limit is 30, each chunk's size is 8, + # will combine once, then shuffle 5 combined chunk + mdf = md.DataFrame(raw, chunk_size=5) + tiled_mdf = tile(mdf) + r = mdf.groupby("c2").sum() + func_infos = DataFrameGroupByAgg._compile_funcs(r.op, mdf) + tiled = DataFrameGroupByAgg._build_tree_and_shuffle_chunks( + r.op, tiled_mdf, r, func_infos, tiled_mdf.chunks[:4], [8] * 4 + )[0] + assert len(tiled.chunks) == 5 + + +def test_groupby_apply(): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + } + ) + + def apply_call_with_err(_): + raise ValueError + + def apply_df(df): + return df.sort_index() + + def apply_df_with_error(df): + assert len(df) > 2 + return df.sort_index() + + def apply_series(s): + return s.sort_index() + + mdf = md.DataFrame(df1, chunk_size=3) + + # when dtype and output_type specified, apply function + # shall not be called + applied = mdf.groupby("b").apply( + apply_call_with_err, output_type="series", dtype=int + ) + assert applied.dtype == int + assert applied.op.output_types[0] == OutputType.series + + with pytest.raises(TypeError): + mdf.groupby("b").apply(apply_df_with_error) + + applied = tile( + mdf.groupby("b").apply( + apply_df_with_error, output_type="dataframe", dtypes=df1.dtypes + ) + ) + pd.testing.assert_series_equal(applied.dtypes, df1.dtypes) + assert applied.shape == (np.nan, 3) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.dataframe + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan, 3) + pd.testing.assert_series_equal(applied.chunks[0].dtypes, 
df1.dtypes) + + applied = tile(mdf.groupby("b").apply(apply_df)) + pd.testing.assert_series_equal(applied.dtypes, df1.dtypes) + assert applied.shape == (np.nan, 3) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.dataframe + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan, 3) + pd.testing.assert_series_equal(applied.chunks[0].dtypes, df1.dtypes) + + applied = tile(mdf.groupby("b").apply(lambda df: df.a)) + assert applied.dtype == df1.a.dtype + assert applied.shape == (np.nan,) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.series + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan,) + assert applied.chunks[0].dtype == df1.a.dtype + + applied = mdf.groupby("b").apply(lambda df: df.a.sum()) + assert applied.op.maybe_agg is True + # force set to pass test + applied.op.maybe_agg = None + applied = tile(applied) + assert applied.dtype == df1.a.dtype + assert applied.shape == (np.nan,) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.series + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan,) + assert applied.chunks[0].dtype == df1.a.dtype + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + + ms1 = md.Series(series1, chunk_size=3) + applied = tile(ms1.groupby(lambda x: x % 3).apply(apply_series)) + assert applied.dtype == series1.dtype + assert applied.shape == (np.nan,) + assert applied.op._op_type_ == opcodes.APPLY + assert applied.op.output_types[0] == OutputType.series + assert len(applied.chunks) == 3 + assert applied.chunks[0].shape == (np.nan,) + assert applied.chunks[0].dtype == series1.dtype + + +def test_groupby_transform(): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + "d": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "e": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "f": list("aabaaddce"), + } + ) + + def transform_df(df): + return df.sort_index() + + def transform_df_with_err(df): + assert len(df) > 2 + return df.sort_index() + + mdf = md.DataFrame(df1, chunk_size=3) + + with pytest.raises(TypeError): + mdf.groupby("b").transform(["cummax", "cumcount"]) + + with pytest.raises(TypeError): + mdf.groupby("b").transform(transform_df_with_err) + + r = tile( + mdf.groupby("b").transform(transform_df_with_err, dtypes=df1.dtypes.drop("b")) + ) + assert r.dtypes.index.tolist() == list("acdef") + assert r.shape == (9, 5) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 5) + assert r.chunks[0].dtypes.index.tolist() == list("acdef") + + r = tile(mdf.groupby("b").transform(transform_df)) + assert r.dtypes.index.tolist() == list("acdef") + assert r.shape == (9, 5) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 5) + assert r.chunks[0].dtypes.index.tolist() == list("acdef") + + r = tile(mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)) + assert r.shape == (np.nan, 6) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 6) + + agg_dict = OrderedDict([("d", "cummax"), ("b", "cumsum")]) + r = tile(mdf.groupby("b").transform(agg_dict, _call_agg=True)) + assert 
r.shape == (np.nan, 2) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + + agg_list = ["sum", lambda s: s.sum()] + r = tile(mdf.groupby("b").transform(agg_list, _call_agg=True)) + assert r.shape == (np.nan, 10) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 10) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + r = tile(ms1.groupby(lambda x: x % 3).transform(lambda x: x + 1)) + assert r.dtype == series1.dtype + assert r.shape == series1.shape + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan,) + assert r.chunks[0].dtype == series1.dtype + + r = tile(ms1.groupby(lambda x: x % 3).transform("cummax", _call_agg=True)) + assert r.shape == (np.nan,) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan,) + + agg_list = ["cummax", "cumcount"] + r = tile(ms1.groupby(lambda x: x % 3).transform(agg_list, _call_agg=True)) + assert r.shape == (np.nan, 2) + assert r.op._op_type_ == opcodes.TRANSFORM + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + + +def test_groupby_cum(): + df1 = pd.DataFrame( + { + "a": [3, 5, 2, 7, 1, 2, 4, 6, 2, 4], + "b": [8, 3, 4, 1, 8, 2, 2, 2, 2, 3], + "c": [1, 8, 8, 5, 3, 5, 0, 0, 5, 4], + } + ) + mdf = md.DataFrame(df1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum"]: + r = tile(getattr(mdf.groupby("b"), fun)()) + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 4 + assert r.shape == (len(df1), 2) + assert r.chunks[0].shape == (np.nan, 2) + pd.testing.assert_index_equal( + r.chunks[0].columns_value.to_pandas(), pd.Index(["a", "c"]) + ) + + r = tile(getattr(mdf.groupby("b"), fun)(axis=1)) + assert r.op.output_types[0] == OutputType.dataframe + assert len(r.chunks) == 4 + assert r.shape == (len(df1), 3) + assert r.chunks[0].shape == (np.nan, 3) + pd.testing.assert_index_equal( + r.chunks[0].columns_value.to_pandas(), df1.columns + ) + + r = tile(mdf.groupby("b").cumcount()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(df1),) + assert r.chunks[0].shape == (np.nan,) + + series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6]) + ms1 = md.Series(series1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum", "cumcount"]: + r = tile(getattr(ms1.groupby(lambda x: x % 2), fun)()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(series1),) + assert r.chunks[0].shape == (np.nan,) + + +def test_groupby_fill(): + df1 = pd.DataFrame( + [ + [1, 1, 10], + [1, 1, np.nan], + [1, 1, np.nan], + [1, 2, np.nan], + [1, 2, 20], + [1, 2, np.nan], + [1, 3, np.nan], + [1, 3, np.nan], + ], + columns=["one", "two", "three"], + ) + mdf = md.DataFrame(df1, chunk_size=3) + + r = tile(mdf.groupby(["one", "two"]).ffill()) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 1) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 1) + assert r.dtypes.index.tolist() == ["three"] + + r = 
tile(mdf.groupby(["two"]).bfill()) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 2) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + assert r.dtypes.index.tolist() == ["one", "three"] + + r = tile(mdf.groupby(["two"]).backfill()) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 2) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + assert r.dtypes.index.tolist() == ["one", "three"] + + r = tile(mdf.groupby(["one"]).fillna(5)) + assert r.op.output_types[0] == OutputType.dataframe + assert r.shape == (len(df1), 2) + assert len(r.chunks) == 3 + assert r.chunks[0].shape == (np.nan, 2) + assert r.dtypes.index.tolist() == ["two", "three"] + + s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6]) + ms1 = md.Series(s1, chunk_size=3) + r = tile(ms1.groupby(lambda x: x % 2).ffill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).bfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).backfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).fillna(5)) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6]) + ms1 = md.Series(s1, chunk_size=3) + + r = tile(ms1.groupby(lambda x: x % 2).ffill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).bfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).backfill()) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) + + r = tile(ms1.groupby(lambda x: x % 2).fillna(5)) + assert r.op.output_types[0] == OutputType.series + assert len(r.chunks) == 4 + assert r.shape == (len(s1),) + assert r.chunks[0].shape == (np.nan,) diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py new file mode 100644 index 000000000..08b1e9bd7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py @@ -0,0 +1,1513 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import OrderedDict + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .... import dataframe as md +from ....config import option_context +from ....core.operand import OperandStage +from ....tests.core import assert_groupby_equal, require_cudf +from ....utils import arrow_array_to_objects, pd_release_version +from ...core import DATAFRAME_OR_SERIES_TYPE +from ..aggregation import DataFrameGroupByAgg + +pytestmark = pytest.mark.pd_compat + +_agg_size_as_frame = pd_release_version[:2] > (1, 0) + + +class MockReduction1(md.CustomReduction): + def agg(self, v1): + return v1.sum() + + +def test_groupby(setup): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.randint(0, 10, size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + } + + # test groupby with DataFrames and RangeIndex + df1 = pd.DataFrame(data_dict) + mdf = md.DataFrame(df1, chunk_size=13) + grouped = mdf.groupby("b") + assert_groupby_equal(grouped.execute().fetch(), df1.groupby("b")) + + # test groupby with string index with duplications + df2 = pd.DataFrame(data_dict, index=["i" + str(i % 3) for i in range(data_size)]) + mdf = md.DataFrame(df2, chunk_size=13) + grouped = mdf.groupby("b") + assert_groupby_equal(grouped.execute().fetch(), df2.groupby("b")) + + # test groupby with DataFrames by series + grouped = mdf.groupby(mdf["b"]) + assert_groupby_equal(grouped.execute().fetch(), df2.groupby(df2["b"])) + + # test groupby with DataFrames by multiple series + grouped = mdf.groupby(by=[mdf["b"], mdf["c"]]) + assert_groupby_equal( + grouped.execute().fetch(), df2.groupby(by=[df2["b"], df2["c"]]) + ) + + # test groupby with DataFrames with MultiIndex + df3 = pd.DataFrame( + data_dict, + index=pd.MultiIndex.from_tuples( + [(i % 3, "i" + str(i)) for i in range(data_size)] + ), + ) + mdf = md.DataFrame(df3, chunk_size=13) + grouped = mdf.groupby(level=0) + assert_groupby_equal(grouped.execute().fetch(), df3.groupby(level=0)) + + # test groupby with DataFrames by integer columns + df4 = pd.DataFrame(list(data_dict.values())).T + mdf = md.DataFrame(df4, chunk_size=13) + grouped = mdf.groupby(0) + assert_groupby_equal(grouped.execute().fetch(), df4.groupby(0)) + + series1 = pd.Series(data_dict["a"]) + ms1 = md.Series(series1, chunk_size=13) + grouped = ms1.groupby(lambda x: x % 3) + assert_groupby_equal(grouped.execute().fetch(), series1.groupby(lambda x: x % 3)) + + # test groupby series + grouped = ms1.groupby(ms1) + assert_groupby_equal(grouped.execute().fetch(), series1.groupby(series1)) + + series2 = pd.Series(data_dict["a"], index=["i" + str(i) for i in range(data_size)]) + ms2 = md.Series(series2, chunk_size=13) + grouped = ms2.groupby(lambda x: int(x[1:]) % 3) + assert_groupby_equal( + grouped.execute().fetch(), series2.groupby(lambda x: int(x[1:]) % 3) + ) + + +def test_groupby_getitem(setup): + rs = np.random.RandomState(0) + data_size = 100 + raw = pd.DataFrame( + { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.randint(0, 10, size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + }, + index=pd.MultiIndex.from_tuples( + [(i % 3, "i" + str(i)) for i in range(data_size)] + ), + ) + mdf = md.DataFrame(raw, chunk_size=13) + + r = mdf.groupby(level=0)[["a", "b"]] + assert_groupby_equal( + r.execute().fetch(), raw.groupby(level=0)[["a", "b"]], with_selection=True + ) + + for method in ("tree", "shuffle"): + r = 
mdf.groupby(level=0)[["a", "b"]].sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(level=0)[["a", "b"]].sum().sort_index(), + ) + + r = mdf.groupby(level=0)[["a", "b"]].apply(lambda x: x + 1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(level=0)[["a", "b"]].apply(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]] + assert_groupby_equal( + r.execute().fetch(), raw.groupby("b")[["a", "b"]], with_selection=True + ) + + r = mdf.groupby("b")[["a", "c"]] + assert_groupby_equal( + r.execute().fetch(), raw.groupby("b")[["a", "c"]], with_selection=True + ) + + for method in ("tree", "shuffle"): + r = mdf.groupby("b")[["a", "b"]].sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].sum().sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].agg(["sum", "count"], method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].agg(["sum", "count"]).sort_index(), + ) + + r = mdf.groupby("b")[["a", "c"]].agg(["sum", "count"], method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "c"]].agg(["sum", "count"]).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].apply(lambda x: x + 1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].apply(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].transform(lambda x: x + 1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].transform(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b")[["a", "b"]].cumsum() + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b")[["a", "b"]].cumsum().sort_index(), + ) + + r = mdf.groupby("b").a + assert_groupby_equal(r.execute().fetch(), raw.groupby("b").a, with_selection=True) + + for method in ("shuffle", "tree"): + r = mdf.groupby("b").a.sum(method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), raw.groupby("b").a.sum().sort_index() + ) + + r = mdf.groupby("b").a.agg(["sum", "mean", "var"], method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("b").a.agg(["sum", "mean", "var"]).sort_index(), + ) + + r = mdf.groupby("b", as_index=False).a.sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values("b", ignore_index=True), + raw.groupby("b", as_index=False) + .a.sum() + .sort_values("b", ignore_index=True), + ) + + r = mdf.groupby("b", as_index=False).b.count(method=method) + result = r.execute().fetch().sort_values("b", ignore_index=True) + try: + expected = ( + raw.groupby("b", as_index=False) + .b.count() + .sort_values("b", ignore_index=True) + ) + except ValueError: + expected = raw.groupby("b").b.count().to_frame() + expected.index.names = [None] * expected.index.nlevels + expected = expected.sort_values("b", ignore_index=True) + pd.testing.assert_frame_equal(result, expected) + + r = mdf.groupby("b", as_index=False).b.agg({"cnt": "count"}, method=method) + result = r.execute().fetch().sort_values("b", ignore_index=True) + try: + expected = ( + raw.groupby("b", as_index=False) + .b.agg({"cnt": "count"}) + .sort_values("b", ignore_index=True) + ) + except ValueError: + expected = raw.groupby("b").b.agg({"cnt": "count"}).to_frame() + expected.index.names = [None] * expected.index.nlevels + 
expected = expected.sort_values("b", ignore_index=True) + pd.testing.assert_frame_equal(result, expected) + + r = mdf.groupby("b").a.apply(lambda x: x + 1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + raw.groupby("b").a.apply(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b").a.transform(lambda x: x + 1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + raw.groupby("b").a.transform(lambda x: x + 1).sort_index(), + ) + + r = mdf.groupby("b").a.cumsum() + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), raw.groupby("b").a.cumsum().sort_index() + ) + + # special test for selection key == 0 + raw = pd.DataFrame(rs.rand(data_size, 10)) + raw[0] = 0 + mdf = md.DataFrame(raw, chunk_size=13) + r = mdf.groupby(0, as_index=False)[0].agg({"cnt": "count"}, method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(0, as_index=False)[0].agg({"cnt": "count"}), + ) + + # test groupby getitem then agg(#GH 2640) + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(0, 10, size=(100,)).astype(np.int64), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + "c4": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + + r = mdf.groupby(["c2"])[["c1", "c3"]].agg({"c1": "max", "c3": "min"}, method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby(["c2"])[["c1", "c3"]].agg({"c1": "max", "c3": "min"}), + ) + + mdf = md.DataFrame(raw.copy(), chunk_size=30) + r = mdf.groupby(["c2"])[["c1", "c4"]].agg( + {"c1": "max", "c4": "mean"}, method="shuffle" + ) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(["c2"])[["c1", "c4"]].agg({"c1": "max", "c4": "mean"}), + ) + + # test anonymous function lists + agg_funs = [lambda x: (x + 1).sum()] + r = mdf.groupby(["c2"])["c1"].agg(agg_funs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.groupby(["c2"])["c1"].agg(agg_funs) + ) + + # test group by multiple cols + r = mdf.groupby(["c1", "c2"], as_index=False)["c3"].sum() + expected = raw.groupby(["c1", "c2"], as_index=False)["c3"].sum() + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "c2"]).reset_index(drop=True), + expected.sort_values(["c1", "c2"]).reset_index(drop=True), + ) + + r = mdf.groupby(["c1", "c2"], as_index=False)["c3"].agg(["sum"]) + expected = raw.groupby(["c1", "c2"], as_index=False)["c3"].agg(["sum"]) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "c2"]), + expected.sort_values(["c1", "c2"]), + ) + + +def test_dataframe_groupby_agg(setup): + agg_funs = [ + "std", + "mean", + "var", + "max", + "count", + "size", + "all", + "any", + "skew", + "kurt", + "sem", + "nunique", + ] + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": np.arange(100).astype(np.int64), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=13) + + for method in ["tree", "shuffle"]: + for sort in [True, False]: + r = mdf.groupby("c2").agg("size", method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg("size").sort_index(), + ) + + for agg_fun in agg_funs: + if agg_fun == "size": + continue + r = mdf.groupby("c2", sort=sort).agg(agg_fun, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg_fun).sort_index(), + ) + + r = mdf.groupby("c2", sort=sort).agg(agg_funs, method=method) + 
pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg_funs).sort_index(), + ) + + agg = OrderedDict([("c1", ["min", "mean"]), ("c3", "std")]) + r = mdf.groupby("c2", sort=sort).agg(agg, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg).sort_index(), + ) + + agg = OrderedDict([("c1", "min"), ("c3", "sum")]) + r = mdf.groupby("c2", sort=sort).agg(agg, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(agg).sort_index(), + ) + + r = mdf.groupby("c2", sort=sort).agg( + {"c1": "min", "c3": "min"}, method=method + ) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg({"c1": "min", "c3": "min"}).sort_index(), + ) + + r = mdf.groupby("c2", sort=sort).agg({"c1": "min"}, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg({"c1": "min"}).sort_index(), + ) + + # test groupby series + r = mdf.groupby(mdf["c2"], sort=sort).sum(method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby(raw["c2"]).sum().sort_index(), + ) + + r = mdf.groupby("c2").size(method="tree") + pd.testing.assert_series_equal(r.execute().fetch(), raw.groupby("c2").size()) + + # test inserted kurt method + r = mdf.groupby("c2").kurtosis(method="tree") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").kurtosis()) + + for agg_fun in agg_funs: + if agg_fun == "size" or callable(agg_fun): + continue + r = getattr(mdf.groupby("c2"), agg_fun)(method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), getattr(raw.groupby("c2"), agg_fun)() + ) + + # test as_index=False + for method in ["tree", "shuffle"]: + r = mdf.groupby("c2", as_index=False).agg("size", method=method) + if _agg_size_as_frame: + result = r.execute().fetch().sort_values("c2", ignore_index=True) + expected = ( + raw.groupby("c2", as_index=False) + .agg("size") + .sort_values("c2", ignore_index=True) + ) + pd.testing.assert_frame_equal(result, expected) + else: + result = r.execute().fetch().sort_index() + expected = raw.groupby("c2", as_index=False).agg("size").sort_index() + pd.testing.assert_series_equal(result, expected) + + r = mdf.groupby("c2", as_index=False).agg("mean", method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values("c2", ignore_index=True), + raw.groupby("c2", as_index=False) + .agg("mean") + .sort_values("c2", ignore_index=True), + ) + assert r.op.groupby_params["as_index"] is False + + r = mdf.groupby(["c1", "c2"], as_index=False).agg("mean", method=method) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_values(["c1", "c2"], ignore_index=True), + raw.groupby(["c1", "c2"], as_index=False) + .agg("mean") + .sort_values(["c1", "c2"], ignore_index=True), + ) + assert r.op.groupby_params["as_index"] is False + + # test as_index=False takes no effect + r = mdf.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]), + ) + assert r.op.groupby_params["as_index"] is True + + r = mdf.groupby("c2").agg(["cumsum", "cumcount"]) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + raw.groupby("c2").agg(["cumsum", "cumcount"]).sort_index(), + ) + + r = mdf.groupby("c2").agg( + sum_c1=md.NamedAgg("c1", "sum"), + min_c1=md.NamedAgg("c1", "min"), + 
mean_c3=md.NamedAgg("c3", "mean"), + method="tree", + ) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg( + sum_c1=md.NamedAgg("c1", "sum"), + min_c1=md.NamedAgg("c1", "min"), + mean_c3=md.NamedAgg("c3", "mean"), + ), + ) + + +def test_dataframe_groupby_agg_sort(setup): + agg_funs = [ + "std", + "mean", + "var", + "max", + "count", + "size", + "all", + "any", + "skew", + "kurt", + "sem", + "nunique", + ] + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": np.arange(100).astype(np.int64), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=13) + + for method in ["tree", "shuffle"]: + r = mdf.groupby("c2").agg("size", method=method) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.groupby("c2").agg("size") + ) + + for agg_fun in agg_funs: + if agg_fun == "size": + continue + r = mdf.groupby("c2").agg(agg_fun, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg(agg_fun), + ) + + r = mdf.groupby("c2").agg(agg_funs, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg(agg_funs), + ) + + agg = OrderedDict([("c1", ["min", "mean"]), ("c3", "std")]) + r = mdf.groupby("c2").agg(agg, method=method) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").agg(agg)) + + agg = OrderedDict([("c1", "min"), ("c3", "sum")]) + r = mdf.groupby("c2").agg(agg, method=method) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").agg(agg)) + + r = mdf.groupby("c2").agg({"c1": "min", "c3": "min"}, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg({"c1": "min", "c3": "min"}), + ) + + r = mdf.groupby("c2").agg({"c1": "min"}, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby("c2").agg({"c1": "min"}), + ) + + # test groupby series + r = mdf.groupby(mdf["c2"]).sum(method=method) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby(raw["c2"]).sum()) + + r = mdf.groupby("c2").size(method="tree") + pd.testing.assert_series_equal(r.execute().fetch(), raw.groupby("c2").size()) + + # test inserted kurt method + r = mdf.groupby("c2").kurtosis(method="tree") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.groupby("c2").kurtosis()) + + for agg_fun in agg_funs: + if agg_fun == "size" or callable(agg_fun): + continue + r = getattr(mdf.groupby("c2"), agg_fun)(method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), getattr(raw.groupby("c2"), agg_fun)() + ) + + # test as_index=False takes no effect + r = mdf.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw.groupby(["c1", "c2"], as_index=False).agg(["mean", "count"]), + ) + assert r.op.groupby_params["as_index"] is True + + +def test_series_groupby_agg(setup): + rs = np.random.RandomState(0) + series1 = pd.Series(rs.rand(10)) + ms1 = md.Series(series1, chunk_size=3) + + agg_funs = [ + "std", + "mean", + "var", + "max", + "count", + "size", + "all", + "any", + "skew", + "kurt", + "sem", + ] + + for method in ["tree", "shuffle"]: + for agg_fun in agg_funs: + r = ms1.groupby(lambda x: x % 2).agg(agg_fun, method=method) + pd.testing.assert_series_equal( + r.execute().fetch(), series1.groupby(lambda x: x % 2).agg(agg_fun) + ) + + r = ms1.groupby(lambda x: x % 2).agg(agg_funs, method=method) + pd.testing.assert_frame_equal( + r.execute().fetch(), series1.groupby(lambda x: x 
% 2).agg(agg_funs) + ) + + # test groupby series + r = ms1.groupby(ms1).sum(method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(series1).sum().sort_index(), + ) + + r = ms1.groupby(ms1).sum(method=method) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(series1).sum().sort_index(), + ) + + # test inserted kurt method + r = ms1.groupby(ms1).kurtosis(method="tree") + pd.testing.assert_series_equal( + r.execute().fetch(), series1.groupby(series1).kurtosis() + ) + + for agg_fun in agg_funs: + r = getattr(ms1.groupby(lambda x: x % 2), agg_fun)(method="tree") + pd.testing.assert_series_equal( + r.execute().fetch(), getattr(series1.groupby(lambda x: x % 2), agg_fun)() + ) + + r = ms1.groupby(lambda x: x % 2).agg(["cumsum", "cumcount"], method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 2).agg(["cumsum", "cumcount"]).sort_index(), + ) + + r = ms1.groupby(lambda x: x % 2).agg(col_var="var", col_skew="skew", method="tree") + pd.testing.assert_frame_equal( + r.execute().fetch(), + series1.groupby(lambda x: x % 2).agg(col_var="var", col_skew="skew"), + ) + + +def test_groupby_agg_auto_method(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + + def _disallow_reduce(ctx, op): + assert op.stage != OperandStage.reduce + op.execute(ctx, op) + + r = mdf.groupby("c2").agg("sum") + operand_executors = {DataFrameGroupByAgg: _disallow_reduce} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c2").agg("sum")) + + r = mdf.groupby("c3").agg("min") + operand_executors = {DataFrameGroupByAgg: _disallow_reduce} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c3").agg("min")) + + def _disallow_combine_and_agg(ctx, op): + assert op.stage != OperandStage.combine + op.execute(ctx, op) + + with option_context({"chunk_store_limit": 1}): + raw2 = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.rand(100), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw2, chunk_size=20) + r = mdf.groupby("c3").agg("min") + operand_executors = {DataFrameGroupByAgg: _disallow_combine_and_agg} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal( + result.sort_index(), raw2.groupby("c3").agg("min") + ) + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": list(range(4)) * 12, + "c2": rs.choice(["a", "b", "c"], (48,)), + "c3": rs.rand(48), + } + ) + + mdf = md.DataFrame(raw, chunk_size=8) + r = mdf.groupby("c1").agg("sum") + operand_executors = {DataFrameGroupByAgg: _disallow_reduce} + result = r.execute( + extra_config={"operand_executors": operand_executors, "check_all": False} + ).fetch() + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c1").agg("sum")) + + +@pytest.mark.skip_ray_dag # _fetch_infos() is not supported by ray backend. 
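+# The chunk counts asserted via _fetch_infos()["memory_size"] below indicate whether
+# the tree or the shuffle strategy was chosen for the distributed groupby aggregation.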
+def test_distributed_groupby_agg(setup_cluster): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(50000, 10)) + df = md.DataFrame(raw, chunk_size=raw.shape[0] // 2) + with option_context({"chunk_store_limit": 1024**2}): + r = df.groupby(0).sum(combine_size=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, raw.groupby(0).sum()) + # test use shuffle + assert len(r._fetch_infos()["memory_size"]) > 1 + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + r = mdf.groupby("c2").sum().execute() + pd.testing.assert_frame_equal(r.fetch(), raw.groupby("c2").sum()) + # test use tree + assert len(r._fetch_infos()["memory_size"]) == 1 + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=10) + with option_context({"chunk_store_limit": 2048}): + r = mdf.groupby("c2", sort=False).sum().execute() + pd.testing.assert_frame_equal( + r.fetch().sort_index(), raw.groupby("c2", sort=False).sum().sort_index() + ) + # use tree and shuffle + assert len(r._fetch_infos()["memory_size"]) == 3 + + +def test_groupby_agg_str_cat(setup): + agg_fun = lambda x: x.str.cat(sep="_", na_rep="NA") + + rs = np.random.RandomState(0) + raw_df = pd.DataFrame( + { + "a": rs.choice(["A", "B", "C"], size=(100,)), + "b": rs.choice([None, "alfa", "bravo", "charlie"], size=(100,)), + } + ) + + mdf = md.DataFrame(raw_df, chunk_size=13) + + r = mdf.groupby("a").agg(agg_fun, method="tree") + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.groupby("a").agg(agg_fun)) + + raw_series = pd.Series(rs.choice([None, "alfa", "bravo", "charlie"], size=(100,))) + + ms = md.Series(raw_series, chunk_size=13) + + r = ms.groupby(lambda x: x % 2).agg(agg_fun, method="tree") + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.groupby(lambda x: x % 2).agg(agg_fun) + ) + + +@require_cudf +def test_gpu_groupby_agg(setup_gpu): + rs = np.random.RandomState(0) + df1 = pd.DataFrame( + {"a": rs.choice([2, 3, 4], size=(100,)), "b": rs.choice([2, 3, 4], size=(100,))} + ) + mdf = md.DataFrame(df1, chunk_size=13).to_gpu() + + r = mdf.groupby("a").sum() + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), df1.groupby("a").sum() + ) + + r = mdf.groupby("a").kurt() + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), df1.groupby("a").kurt() + ) + + r = mdf.groupby("a").agg(["sum", "var"]) + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), df1.groupby("a").agg(["sum", "var"]) + ) + + rs = np.random.RandomState(0) + idx = pd.Index(np.where(rs.rand(10) > 0.5, "A", "B")) + series1 = pd.Series(rs.rand(10), index=idx) + ms = md.Series(series1, index=idx, chunk_size=3).to_gpu().to_gpu() + + r = ms.groupby(level=0).sum() + pd.testing.assert_series_equal( + r.execute().fetch().to_pandas(), series1.groupby(level=0).sum() + ) + + r = ms.groupby(level=0).kurt() + pd.testing.assert_series_equal( + r.execute().fetch().to_pandas(), series1.groupby(level=0).kurt() + ) + + r = ms.groupby(level=0).agg(["sum", "var"]) + pd.testing.assert_frame_equal( + r.execute().fetch().to_pandas(), series1.groupby(level=0).agg(["sum", "var"]) + ) + + +def test_groupby_apply(setup): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": 
list("aabaaddce"), + } + ) + + def apply_df(df, ret_series=False): + df = df.sort_index() + df.a += df.b + if len(df.index) > 0: + if not ret_series: + df = df.iloc[:-1, :] + else: + df = df.iloc[-1, :] + return df + + def apply_series(s, truncate=True): + s = s.sort_index() + if truncate and len(s.index) > 0: + s = s.iloc[:-1] + return s + + mdf = md.DataFrame(df1, chunk_size=3) + + applied = mdf.groupby("b").apply(lambda df: None) + pd.testing.assert_frame_equal( + applied.execute().fetch(), df1.groupby("b").apply(lambda df: None) + ) + + applied = mdf.groupby("b").apply(apply_df) + pd.testing.assert_frame_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(apply_df).sort_index(), + ) + + applied = mdf.groupby("b").apply(apply_df, ret_series=True) + pd.testing.assert_frame_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(apply_df, ret_series=True).sort_index(), + ) + + applied = mdf.groupby("b").apply(lambda df: df.a, output_type="series") + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(lambda df: df.a).sort_index(), + ) + + applied = mdf.groupby("b").apply(lambda df: df.a.sum()) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(lambda df: df.a.sum()).sort_index(), + ) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + applied = ms1.groupby(lambda x: x % 3).apply(lambda df: None) + pd.testing.assert_series_equal( + applied.execute().fetch(), + series1.groupby(lambda x: x % 3).apply(lambda df: None), + ) + + applied = ms1.groupby(lambda x: x % 3).apply(apply_series) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).apply(apply_series).sort_index(), + ) + + sindex2 = pd.MultiIndex.from_arrays([list(range(9)), list("ABCDEFGHI")]) + series2 = pd.Series(list("CDECEDABC"), index=sindex2) + ms2 = md.Series(series2, chunk_size=3) + + applied = ms2.groupby(lambda x: x[0] % 3).apply(apply_series) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series2.groupby(lambda x: x[0] % 3).apply(apply_series).sort_index(), + ) + + +def test_groupby_apply_with_df_or_series_output(setup): + raw = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [6, 3, 3, 5, 6, 5, 4, 4, 4], + "c": list("aabaabbbb"), + } + ) + mdf = md.DataFrame(raw, chunk_size=3) + + def f1(df): + return df.a.iloc[2] + + with pytest.raises(TypeError): + mdf.groupby("c").apply(f1) + + with pytest.raises(ValueError): + mdf.groupby("c").apply(f1, output_types=["df_or_series"]).execute() + + for kwargs in [dict(output_type="df_or_series"), dict(skip_infer=True)]: + mdf = md.DataFrame(raw, chunk_size=5) + applied = mdf.groupby("c").apply(f1, **kwargs) + assert isinstance(applied, DATAFRAME_OR_SERIES_TYPE) + applied = applied.execute() + assert applied.data_type == "series" + assert not ("dtypes" in applied.data_params) + assert applied.shape == (2,) + pd.testing.assert_series_equal( + applied.fetch().sort_index(), raw.groupby("c").apply(f1).sort_index() + ) + + def f2(df): + return df[["a"]] + + mdf = md.DataFrame(raw, chunk_size=5) + applied = mdf.groupby("c").apply(f2, output_types=["df_or_series"]) + assert isinstance(applied, DATAFRAME_OR_SERIES_TYPE) + applied = applied.execute() + assert applied.data_type == "dataframe" + assert not ("dtype" in applied.data_params) + assert applied.shape == (9, 1) + expected = raw.groupby("c", 
as_index=True).apply(f2) + pd.testing.assert_series_equal(applied.dtypes, expected.dtypes) + pd.testing.assert_frame_equal(applied.fetch().sort_index(), expected.sort_index()) + + +def test_groupby_apply_closure(setup): + # DataFrame + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + } + ) + + x, y = 10, 11 + + def apply_closure_df(df): + return df["a"].max() * x + + def apply_closure_series(s): + return s.mean() * y + + class callable_df: + def __init__(self): + self.x = 10 + + def __call__(self, df): + return df["a"].max() * x + + class callable_series: + def __init__(self): + self.y = 11 + + def __call__(self, s): + return s.mean() * y + + mdf = md.DataFrame(df1, chunk_size=3) + + applied = mdf.groupby("b").apply(apply_closure_df) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(apply_closure_df).sort_index(), + ) + + cdf = callable_df() + applied = mdf.groupby("b").apply(cdf) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + df1.groupby("b").apply(cdf).sort_index(), + ) + + # Series + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + applied = ms1.groupby(lambda x: x % 3).apply(apply_closure_series) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).apply(apply_closure_series).sort_index(), + ) + + cs = callable_series() + applied = ms1.groupby(lambda x: x % 3).apply(cs) + pd.testing.assert_series_equal( + applied.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).apply(cs).sort_index(), + ) + + +def test_groupby_transform(setup): + df1 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + "d": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "e": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "f": list("aabaaddce"), + } + ) + + def transform_series(s, truncate=True): + s = s.sort_index() + if truncate and len(s.index) > 1: + s = s.iloc[:-1].reset_index(drop=True) + return s + + mdf = md.DataFrame(df1, chunk_size=3) + + r = mdf.groupby("b").transform(transform_series, truncate=False) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df1.groupby("b").transform(transform_series, truncate=False).sort_index(), + ) + + df2 = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaabbba"), + } + ) + + def f(df): + if df.iloc[2]: + return df + else: + return df + df.max() + + mdf2 = md.DataFrame(df2, chunk_size=5) + with pytest.raises(TypeError): + mdf2.groupby("c").transform(f) + + r = mdf2.groupby("c").transform(f, skip_infer=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df2.groupby("c").transform(f).sort_index(), + ) + + if pd.__version__ != "1.1.0": + r = mdf.groupby("b").transform(["cummax", "cumsum"], _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df1.groupby("b").agg(["cummax", "cumsum"]).sort_index(), + ) + + agg_list = ["cummax", "cumsum"] + r = mdf.groupby("b").transform(agg_list, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df1.groupby("b").agg(agg_list).sort_index(), + ) + + agg_dict = OrderedDict([("d", "cummax"), ("b", "cumsum")]) + r = mdf.groupby("b").transform(agg_dict, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + 
df1.groupby("b").agg(agg_dict).sort_index(), + ) + + agg_list = ["sum", lambda s: s.sum()] + r = mdf.groupby("b").transform(agg_list, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").agg(agg_list).sort_index() + ) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + r = ms1.groupby(lambda x: x % 3).transform(lambda x: x + 1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).transform(lambda x: x + 1).sort_index(), + ) + + r = ms1.groupby(lambda x: x % 3).transform("cummax", _call_agg=True) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).agg("cummax").sort_index(), + ) + + agg_list = ["cummax", "cumcount"] + r = ms1.groupby(lambda x: x % 3).transform(agg_list, _call_agg=True) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 3).agg(agg_list).sort_index(), + ) + + +def test_groupby_cum(setup): + df1 = pd.DataFrame( + { + "a": [3, 5, 2, 7, 1, 2, 4, 6, 2, 4], + "b": [8, 3, 4, 1, 8, 2, 2, 2, 2, 3], + "c": [1, 8, 8, 5, 3, 5, 0, 0, 5, 4], + } + ) + mdf = md.DataFrame(df1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum"]: + r1 = getattr(mdf.groupby("b"), fun)() + pd.testing.assert_frame_equal( + r1.execute().fetch().sort_index(), + getattr(df1.groupby("b"), fun)().sort_index(), + ) + + r2 = getattr(mdf.groupby("b"), fun)(axis=1) + pd.testing.assert_frame_equal( + r2.execute().fetch().sort_index(), + getattr(df1.groupby("b"), fun)(axis=1).sort_index(), + ) + + r3 = mdf.groupby("b").cumcount() + pd.testing.assert_series_equal( + r3.execute().fetch().sort_index(), df1.groupby("b").cumcount().sort_index() + ) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms1 = md.Series(series1, chunk_size=3) + + for fun in ["cummin", "cummax", "cumprod", "cumsum", "cumcount"]: + r1 = getattr(ms1.groupby(lambda x: x % 2), fun)() + pd.testing.assert_series_equal( + r1.execute().fetch().sort_index(), + getattr(series1.groupby(lambda x: x % 2), fun)().sort_index(), + ) + + +def test_groupby_fill(setup): + df1 = pd.DataFrame( + [ + [1, 1, 10], + [1, 1, np.nan], + [1, 1, np.nan], + [1, 2, np.nan], + [1, 2, 20], + [1, 2, np.nan], + [1, 3, np.nan], + [1, 3, np.nan], + ], + columns=["one", "two", "three"], + ) + mdf = md.DataFrame(df1, chunk_size=3) + r1 = getattr(mdf.groupby(["one", "two"]), "ffill")() + pd.testing.assert_frame_equal( + r1.execute().fetch().sort_index(), + getattr(df1.groupby(["one", "two"]), "ffill")().sort_index(), + ) + + r2 = getattr(mdf.groupby("two"), "bfill")() + pd.testing.assert_frame_equal( + r2.execute().fetch().sort_index(), + getattr(df1.groupby("two"), "bfill")().sort_index(), + ) + + r3 = getattr(mdf.groupby("one"), "fillna")(5) + pd.testing.assert_frame_equal( + r3.execute().fetch().sort_index(), + getattr(df1.groupby("one"), "fillna")(5).sort_index(), + ) + + r4 = getattr(mdf.groupby("two"), "backfill")() + pd.testing.assert_frame_equal( + r4.execute().fetch().sort_index(), + getattr(df1.groupby("two"), "backfill")().sort_index(), + ) + + s1 = pd.Series([4, 3, 9, np.nan, np.nan, 7, 10, 8, 1, 6]) + ms1 = md.Series(s1, chunk_size=3) + + r1 = getattr(ms1.groupby(lambda x: x % 2), "ffill")() + pd.testing.assert_series_equal( + r1.execute().fetch().sort_index(), + getattr(s1.groupby(lambda x: x % 2), "ffill")().sort_index(), + ) + + r2 = getattr(ms1.groupby(lambda x: x % 2), "bfill")() + 
pd.testing.assert_series_equal( + r2.execute().fetch().sort_index(), + getattr(s1.groupby(lambda x: x % 2), "bfill")().sort_index(), + ) + + r4 = getattr(ms1.groupby(lambda x: x % 2), "backfill")() + pd.testing.assert_series_equal( + r4.execute().fetch().sort_index(), + getattr(s1.groupby(lambda x: x % 2), "backfill")().sort_index(), + ) + + +def test_groupby_head(setup): + df1 = pd.DataFrame( + { + "a": [3, 5, 2, 7, 1, 2, 4, 6, 2, 4], + "b": [8, 3, 4, 1, 8, 2, 2, 2, 2, 3], + "c": [1, 8, 8, 5, 3, 5, 0, 0, 5, 4], + "d": [9, 7, 6, 3, 6, 3, 2, 1, 5, 8], + } + ) + # test single chunk + mdf = md.DataFrame(df1) + + r = mdf.groupby("b").head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(1) + ) + r = mdf.groupby("b").head(-1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(-1) + ) + r = mdf.groupby("b")["a", "c"].head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["a", "c"].head(1) + ) + + # test multiple chunks + mdf = md.DataFrame(df1, chunk_size=3) + + r = mdf.groupby("b").head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(1) + ) + + r = mdf.groupby("b").head(-1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").head(-1) + ) + + # test head with selection + r = mdf.groupby("b")["a", "d"].head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["a", "d"].head(1) + ) + r = mdf.groupby("b")["c", "a", "d"].head(1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["c", "a", "d"].head(1) + ) + r = mdf.groupby("b")["c"].head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["c"].head(1) + ) + + # test single chunk + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms = md.Series(series1) + + r = ms.groupby(lambda x: x % 2).head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), series1.groupby(lambda x: x % 2).head(1) + ) + r = ms.groupby(lambda x: x % 2).head(-1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), series1.groupby(lambda x: x % 2).head(-1) + ) + + # test multiple chunk + ms = md.Series(series1, chunk_size=3) + + r = ms.groupby(lambda x: x % 2).head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), series1.groupby(lambda x: x % 2).head(1) + ) + + # test with special index + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3], index=[4, 1, 2, 3, 5, 8, 6, 7, 9]) + ms = md.Series(series1, chunk_size=3) + + r = ms.groupby(lambda x: x % 2).head(1) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 2).head(1).sort_index(), + ) + + +def test_groupby_sample(setup): + rs = np.random.RandomState(0) + sample_count = 10 + src_data_list = [] + for b in range(5): + data_count = int(rs.randint(20, 100)) + src_data_list.append( + pd.DataFrame( + { + "a": rs.randint(0, 100, size=data_count), + "b": np.array([b] * data_count), + "c": rs.randint(0, 100, size=data_count), + "d": rs.randint(0, 100, size=data_count), + } + ) + ) + df1 = pd.concat(src_data_list) + shuffle_idx = np.arange(len(df1)) + rs.shuffle(shuffle_idx) + df1 = df1.iloc[shuffle_idx].reset_index(drop=True) + + # test single chunk + mdf = md.DataFrame(df1) + + r1 = mdf.groupby("b").sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample(sample_count, 
random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert len(result1.columns) == 2 + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_series_equal(result1, result2) + + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs) + result1 = r1.execute().fetch() + assert len(result1) == len(df1) + + with pytest.raises(ValueError): + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs, errors="raises") + r1.execute().fetch() + + # test multiple chunks + mdf = md.DataFrame(df1, chunk_size=47) + + r1 = mdf.groupby("b").sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").sample( + sample_count, weights=df1["c"] / df1["c"].sum(), random_state=rs + ) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b")[["b", "c"]].sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_frame_equal(result1, result2) + assert len(result1.columns) == 2 + assert not (result1.groupby("b").count() - sample_count).any()[0] + + r1 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result1 = r1.execute().fetch() + r2 = mdf.groupby("b").c.sample(sample_count, random_state=rs) + result2 = r2.execute().fetch() + pd.testing.assert_series_equal(result1, result2) + + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs) + result1 = r1.execute().fetch() + assert len(result1) == len(df1) + + with pytest.raises(ValueError): + r1 = mdf.groupby("b").c.sample(len(df1), random_state=rs, errors="raises") + r1.execute().fetch() + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_groupby_agg_with_arrow_dtype(setup): + df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + mdf = md.DataFrame(df1) + mdf["b"] = mdf["b"].astype("Arrow[string]") + + r = mdf.groupby("a").count() + result = r.execute().fetch() + expected = df1.groupby("a").count() + pd.testing.assert_frame_equal(result, expected) + + r = mdf.groupby("b").count() + result = 
r.execute().fetch() + result.index = result.index.astype(object) + expected = df1.groupby("b").count() + pd.testing.assert_frame_equal(result, expected) + + series1 = df1["b"] + mseries = md.Series(series1).astype("Arrow[string]") + + r = mseries.groupby(mseries).count() + result = r.execute().fetch() + result.index = result.index.astype(object) + expected = series1.groupby(series1).count() + pd.testing.assert_series_equal(result, expected) + + series2 = series1.copy() + series2.index = pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + mseries = md.Series(series2).astype("Arrow[string]") + + r = mseries.groupby(mseries).count() + result = r.execute().fetch() + result.index = result.index.astype(object) + expected = series2.groupby(series2).count() + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_groupby_apply_with_arrow_dtype(setup): + df1 = pd.DataFrame({"a": [1, 2, 1], "b": ["a", "b", "a"]}) + mdf = md.DataFrame(df1) + mdf["b"] = mdf["b"].astype("Arrow[string]") + + applied = mdf.groupby("b").apply(lambda df: df.a.sum()) + result = applied.execute().fetch() + result.index = result.index.astype(object) + expected = df1.groupby("b").apply(lambda df: df.a.sum()) + pd.testing.assert_series_equal(result, expected) + + series1 = df1["b"] + mseries = md.Series(series1).astype("Arrow[string]") + + applied = mseries.groupby(mseries).apply(lambda s: s) + result = applied.execute().fetch() + result.index = result.index.astype(np.int64) + expected = series1.groupby(series1).apply(lambda s: s) + pd.testing.assert_series_equal(arrow_array_to_objects(result), expected) + + +def test_groupby_nunique(setup): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.choice(list("abcd"), size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + } + df1 = pd.DataFrame(data_dict) + + # one chunk + mdf = md.DataFrame(df1) + pd.testing.assert_frame_equal( + mdf.groupby("c").nunique().execute().fetch().sort_index(), + df1.groupby("c").nunique().sort_index(), + ) + + # multiple chunks + mdf = md.DataFrame(df1, chunk_size=13) + pd.testing.assert_frame_equal( + mdf.groupby("b").nunique().execute().fetch().sort_index(), + df1.groupby("b").nunique().sort_index(), + ) + + # getitem and nunique + mdf = md.DataFrame(df1, chunk_size=13) + pd.testing.assert_series_equal( + mdf.groupby("b")["a"].nunique().execute().fetch().sort_index(), + df1.groupby("b")["a"].nunique().sort_index(), + ) + + # test with as_index=False + mdf = md.DataFrame(df1, chunk_size=13) + if _agg_size_as_frame: + pd.testing.assert_frame_equal( + mdf.groupby("b", as_index=False)["a"] + .nunique() + .execute() + .fetch() + .sort_values(by="b", ignore_index=True), + df1.groupby("b", as_index=False)["a"] + .nunique() + .sort_values(by="b", ignore_index=True), + ) diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py new file mode 100644 index 000000000..f05deee11 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py @@ -0,0 +1,330 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pandas as pd +import pytest + + +from .... import dataframe as md + + +@pytest.fixture +def gen_data1(): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.choice(list("abcd"), size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + } + df = pd.DataFrame(data_dict) + yield df + + +@pytest.fixture +def gen_data2(): + rs = np.random.RandomState(0) + data_size = 100 + data_dict = { + "a": rs.randint(0, 10, size=(data_size,)), + "b": rs.choice(list("abcd"), size=(data_size,)), + "c": rs.choice(list("abcd"), size=(data_size,)), + "d": rs.randint(0, 10, size=(data_size,)), + } + df = pd.DataFrame(data_dict) + yield df + + +@pytest.fixture +def gen_data3(): + arrays = [ + ["Falcon", "Falcon", "Parrot", "Parrot"], + ["Captive", "Wild", "Captive", "Wild"], + ] + index = pd.MultiIndex.from_arrays(arrays, names=("Animal", "Type")) + df = pd.DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index) + yield df + + +def test_groupby_nunique_without_index(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + r1 = mdf.groupby("b", sort=False)[["a"]].nunique(method="tree").execute().fetch() + r2 = ( + mdf.groupby("b", sort=False)[["a"]] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_index(level=0) + ) + r3 = ( + mdf.groupby("b", sort=False)[["a"]] + .nunique(method="auto") + .execute() + .fetch() + .sort_index(level=0) + ) + + expected = df.groupby("b", sort=False)[["a"]].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal(r2, expected.sort_index(level=0)) + pd.testing.assert_frame_equal(r3, expected.sort_index(level=0)) + + +def test_groupby_nunique_with_index(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r1 = ( + mdf.groupby("b", as_index=False, sort=False)["a"] + .nunique(method="tree") + .execute() + .fetch() + ) + # shuffle cannot ensure its order + r2 = ( + mdf.groupby("b", as_index=False, sort=False)["a"] + .nunique(method="auto") + .execute() + .fetch() + .sort_values(by="b") + .reset_index(drop=True) + ) + r3 = ( + mdf.groupby("b", as_index=False, sort=False)["a"] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_values(by="b") + .reset_index(drop=True) + ) + + expected = df.groupby("b", as_index=False, sort=False)["a"].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal( + r2, expected.sort_values(by="b").reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + r3, expected.sort_values(by="b").reset_index(drop=True) + ) + + +def test_groupby_nunique_series(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + # When method = shuffle and output is series, mars has issue about that. + # Therefore, skip the case. 
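+ # Only the "tree" and "auto" methods are exercised for the Series result below.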
+ r1 = mdf.groupby("b", sort=False)["a"].nunique(method="tree").execute().fetch() + r2 = ( + mdf.groupby("b", sort=False)["a"] + .nunique(method="auto") + .execute() + .fetch() + .sort_index(level=0) + ) + + expected = df.groupby("b", sort=False)["a"].nunique() + pd.testing.assert_series_equal(r1, expected) + pd.testing.assert_series_equal(r2, expected.sort_index(level=0)) + + +def test_groupby_nunique_frame(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r1 = mdf.groupby("b", sort=False)["a", "c"].nunique(method="tree").execute().fetch() + r2 = ( + mdf.groupby("b", sort=False)["a", "c"] + .nunique(method="auto") + .execute() + .fetch() + .sort_values(by="b") + .reset_index() + ) + r3 = ( + mdf.groupby("b", sort=False)["a", "c"] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_values(by="b") + .reset_index() + ) + + expected = df.groupby("b", sort=False)["a", "c"].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal(r2, expected.sort_values(by="b").reset_index()) + pd.testing.assert_frame_equal(r3, expected.sort_values(by="b").reset_index()) + + +def test_groupby_nunique_with_sort(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r = mdf.groupby("b", sort=True)["a", "c"].nunique().execute().fetch() + + expected = df.groupby("b", sort=True)["a", "c"].nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"], sort=True)["a"].nunique().execute().fetch() + expected = df.groupby(["b", "c"], sort=True)["a"].nunique() + pd.testing.assert_series_equal(r, expected) + + +def test_groupby_nunique_multiindex(setup, gen_data2): + df = gen_data2 + mdf = md.DataFrame(df, chunk_size=13) + + r1 = ( + mdf.groupby(["b", "c"], sort=False)["a", "d"] + .nunique(method="tree") + .execute() + .fetch() + ) + r2 = ( + mdf.groupby(["b", "c"], sort=False)["a", "d"] + .nunique(method="shuffle") + .execute() + .fetch() + .sort_values(by=["b", "c"]) + .reset_index() + ) + r3 = ( + mdf.groupby(["b", "c"], sort=False)["a", "d"] + .nunique(method="auto") + .execute() + .fetch() + .sort_values(by=["b", "c"]) + .reset_index() + ) + + expected = df.groupby(["b", "c"], sort=False)["a", "d"].nunique() + pd.testing.assert_frame_equal(r1, expected) + pd.testing.assert_frame_equal(r2, expected.sort_values(by=["b", "c"]).reset_index()) + pd.testing.assert_frame_equal(r3, expected.sort_values(by=["b", "c"]).reset_index()) + + +def test_groupby_nunique_level(setup, gen_data1, gen_data3): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r = ( + mdf.groupby(level=0, as_index=False, sort=False)["a"] + .nunique() + .execute() + .fetch() + ) + + expected = df.groupby(level=0, as_index=False, sort=False)["a"].nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(level=0, sort=False)["a"].nunique().execute().fetch() + expected = df.groupby(level=0, sort=False)["a"].nunique() + pd.testing.assert_series_equal(r, expected, check_index=False) + + r = mdf.groupby(level=0, sort=False)["a", "b"].nunique().execute().fetch() + expected = df.groupby(level=0, sort=False)["a", "b"].nunique() + pd.testing.assert_frame_equal( + r.reset_index(drop=True), expected.reset_index(drop=True) + ) + + df2 = gen_data3 + mdf2 = md.DataFrame(df2, chunk_size=2) + r = mdf2.groupby(level="Type", sort=False).nunique().execute().fetch() + expected = df2.groupby(level="Type", sort=False).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=["Animal", "Type"], 
sort=False).nunique().execute().fetch() + expected = df2.groupby(level=["Animal", "Type"], sort=False).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=(0, 1), sort=False).nunique().execute().fetch() + expected = df2.groupby(level=(0, 1), sort=False).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=["Type", "Animal"]).nunique().execute().fetch() + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = ( + mdf2.groupby(level=(0, 1), sort=False) + .nunique(method="shuffle") + .execute() + .fetch() + ) + expected = df2.groupby(level=(0, 1), sort=False).nunique() + pd.testing.assert_frame_equal(r.sort_index(), expected.sort_index()) + + r = mdf2.groupby(level=["Type", "Animal"]).nunique(method="tree").execute().fetch() + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = mdf2.groupby(level=["Type", "Animal"]).nunique(method="auto").execute().fetch() + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r, expected) + + r = ( + mdf2.groupby(level=["Type", "Animal"], sort=False) + .nunique(method="shuffle") + .execute() + .fetch() + ) + expected = df2.groupby(level=["Type", "Animal"]).nunique() + pd.testing.assert_frame_equal(r.sort_index(), expected.sort_index()) + + +def test_groupby_agg_nunique(setup, gen_data1): + df = gen_data1 + mdf = md.DataFrame(df, chunk_size=13) + + r = mdf.groupby(["b", "c"]).agg("nunique").execute().fetch() + expected = df.groupby(["b", "c"]).agg("nunique") + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"]).agg(["nunique"], method="tree").execute().fetch() + expected = df.groupby(["b", "c"]).agg(["nunique"]) + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"]).agg(["nunique"], method="auto").execute().fetch() + expected = df.groupby(["b", "c"]).agg(["nunique"]) + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"]).agg(["nunique"], method="shuffle").execute().fetch() + expected = df.groupby(["b", "c"]).agg(["nunique"]) + pd.testing.assert_frame_equal(r, expected) + + r = mdf.groupby(["b", "c"], as_index=False).agg("nunique").execute().fetch() + expected = df.groupby(["b", "c"], as_index=False).agg("nunique") + pd.testing.assert_frame_equal(r, expected) + + r = ( + mdf.groupby(["b", "c"], as_index=False, sort=False) + .agg("nunique") + .execute() + .fetch() + ) + expected = df.groupby(["b", "c"], as_index=False, sort=False).agg("nunique") + pd.testing.assert_frame_equal(r, expected) + + is_sort = [True, False] + methods = ["auto", "shuffle", "tree"] + for sort in is_sort: + for method in methods: + r = ( + mdf.groupby("b", sort=sort) + .agg(["sum", "nunique"], method=method) + .execute() + .fetch() + ) + expected = df.groupby("b", sort=sort).agg(["sum", "nunique"]) + pd.testing.assert_frame_equal(r.sort_index(), expected.sort_index()) diff --git a/python/xorbits/_mars/dataframe/groupby/transform.py b/python/xorbits/_mars/dataframe/groupby/transform.py new file mode 100644 index 000000000..d2050faa2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/groupby/transform.py @@ -0,0 +1,374 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import OutputType +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import AnyField, BoolField, DictField, TupleField +from ...utils import enter_current_session, quiet_stdio +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, build_empty_series, parse_index + + +class GroupByTransform(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.TRANSFORM + _op_module_ = "dataframe.groupby" + + _func = AnyField("func") + _args = TupleField("args") + _kwds = DictField("kwds") + + _call_agg = BoolField("call_agg") + + def __init__( + self, func=None, args=None, kwds=None, call_agg=None, output_types=None, **kw + ): + super().__init__( + _func=func, + _args=args, + _kwds=kwds, + _call_agg=call_agg, + _output_types=output_types, + **kw, + ) + + @property + def func(self): + return self._func + + @property + def args(self): + return getattr(self, "_args", None) or () + + @property + def kwds(self): + return getattr(self, "_kwds", None) or dict() + + @property + def call_agg(self): + return self._call_agg + + def _infer_df_func_returns(self, in_groupby, dtypes, index): + index_value, output_types, new_dtypes = None, None, None + + output_types = ( + [OutputType.dataframe] + if in_groupby.op.output_types[0] == OutputType.dataframe_groupby + else [OutputType.series] + ) + + try: + mock_groupby = in_groupby.op.build_mock_groupby() + with np.errstate(all="ignore"), quiet_stdio(): + if self.call_agg: + infer_df = mock_groupby.agg(self.func, *self.args, **self.kwds) + else: + infer_df = mock_groupby.transform( + self.func, *self.args, **self.kwds + ) + + # todo return proper index when sort=True is implemented + index_value = parse_index(None, in_groupby.key, self.func) + + if isinstance(infer_df, pd.DataFrame): + output_types = [OutputType.dataframe] + new_dtypes = new_dtypes or infer_df.dtypes + else: + output_types = [OutputType.series] + new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype) + except: # noqa: E722 # nosec + pass + + self.output_types = output_types if not self.output_types else self.output_types + dtypes = new_dtypes if dtypes is None else dtypes + index_value = index_value if index is None else parse_index(index) + return dtypes, index_value + + def __call__( + self, groupby, dtypes=None, dtype=None, name=None, index=None, skip_infer=None + ): + in_df = groupby.inputs[0] + + if dtypes is None and dtype is not None: + dtypes = (name, dtype) + if skip_infer: + dtypes, index_value = None, None + self.output_types = ( + [OutputType.dataframe] + if groupby.op.output_types[0] == OutputType.dataframe_groupby + else [OutputType.series] + ) + else: + dtypes, index_value = self._infer_df_func_returns(groupby, dtypes, index) + for arg, desc in zip( + (self.output_types, dtypes), ("output_types", "dtypes") + ): + if arg is None: + raise TypeError( + f"Cannot determine {desc} by calculating with enumerate data, " + "please specify it as arguments" + ) + if index_value is None: + index_value = 
parse_index(None, (in_df.key, in_df.index_value.key)) + + if self.output_types[0] == OutputType.dataframe: + new_shape = ( + np.nan if self.call_agg else in_df.shape[0], + len(dtypes) if dtypes is not None else np.nan, + ) + columns_value = ( + parse_index(dtypes.index, store_data=True) + if dtypes is not None + else None + ) + return self.new_dataframe( + [groupby], + shape=new_shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + else: + name, dtype = dtypes + new_shape = (np.nan,) if self.call_agg else groupby.shape + return self.new_series( + [groupby], + name=name, + shape=new_shape, + dtype=dtype, + index_value=index_value, + ) + + @classmethod + def tile(cls, op): + in_groupby = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + if op.output_types[0] == OutputType.dataframe: + chunk_shape = ( + np.nan, + len(out_df.dtypes) if out_df.dtypes is not None else np.nan, + ) + else: + chunk_shape = (np.nan,) + for c in in_groupby.chunks: + inp_chunks = [c] + + new_op = op.copy().reset_key() + new_op.tileable_op_key = op.key + if op.output_types[0] == OutputType.dataframe: + new_index = c.index if c.ndim == 2 else c.index + (0,) + chunks.append( + new_op.new_chunk( + inp_chunks, + index=new_index, + shape=chunk_shape, + dtypes=out_df.dtypes, + columns_value=out_df.columns_value, + index_value=out_df.index_value, + ) + ) + else: + chunks.append( + new_op.new_chunk( + inp_chunks, + name=out_df.name, + index=(c.index[0],), + shape=chunk_shape, + dtype=out_df.dtype, + index_value=out_df.index_value, + ) + ) + + new_op = op.copy() + kw = out_df.params.copy() + kw["chunks"] = chunks + if op.output_types[0] == OutputType.dataframe: + kw["nsplits"] = ( + (np.nan,) * len(chunks), + (len(out_df.dtypes) if out_df.dtypes is not None else np.nan,), + ) + else: + kw["nsplits"] = ((np.nan,) * len(chunks),) + return new_op.new_tileables([in_groupby], **kw) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + if in_data is None: + if op.output_types[0] == OutputType.dataframe: + ctx[op.outputs[0].key] = build_empty_df( + out_chunk.dtypes, index=out_chunk.index_value.to_pandas() + ) + else: + ctx[op.outputs[0].key] = build_empty_series( + out_chunk.dtype, + name=out_chunk.name, + index=out_chunk.index_value.to_pandas(), + ) + return + + if op.call_agg: + result = in_data.agg(op.func, *op.args, **op.kwds) + elif in_data.shape[0] > 0: + # cannot perform groupby-transform over empty dataframe + result = in_data.transform(op.func, *op.args, **op.kwds) + else: + if out_chunk.ndim == 2: + result = pd.DataFrame(columns=out_chunk.dtypes.index) + else: + result = pd.Series([], name=out_chunk.name, dtype=out_chunk.dtype) + + if result.ndim == 2: + if out_chunk.dtypes is not None: + result = result.astype(out_chunk.dtypes, copy=False) + else: + if out_chunk.dtype is not None: + result = result.astype(out_chunk.dtype, copy=False) + ctx[op.outputs[0].key] = result + + +def groupby_transform( + groupby, + f, + *args, + dtypes=None, + dtype=None, + name=None, + index=None, + output_types=None, + skip_infer=False, + **kwargs, +): + """ + Call function producing a like-indexed DataFrame on each group and + return a DataFrame having the same indexes as the original object + filled with the transformed values + + Parameters + ---------- + f : function + Function to apply to each group. + + dtypes : Series, default None + Specify dtypes of returned DataFrames. 
See `Notes` for more details. + + dtype : numpy.dtype, default None + Specify dtype of returned Series. See `Notes` for more details. + + name : str, default None + Specify name of returned Series. See `Notes` for more details. + + skip_infer: bool, default False + Whether infer dtypes when dtypes or output_type is not specified. + + *args + Positional arguments to pass to func + + **kwargs + Keyword arguments to be passed into func. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.groupby.apply + DataFrame.groupby.aggregate + DataFrame.transform + + Notes + ----- + Each group is endowed the attribute 'name' in case you need to know + which group you are working on. + + The current implementation imposes three requirements on f: + + * f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe. + For example, if `f` returns a scalar it will be broadcast to have the + same shape as the input subframe. + * if this is a DataFrame, f must support application column-by-column + in the subframe. If f also supports application to the entire subframe, + then a fast path is used starting from the second chunk. + * f must not mutate groups. Mutation is not supported and may + produce unexpected results. + + Notes + ----- + When deciding output dtypes and shape of the return value, Mars will + try applying ``func`` onto a mock grouped object, and the transform call + may fail. + + * For DataFrame output, you need to specify a list or a pandas Series + as ``dtypes`` of output DataFrame. ``index`` of output can also be + specified. + * For Series output, you need to specify ``dtype`` and ``name`` of + output Series. + + Examples + -------- + + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : ['one', 'one', 'two', 'three', + ... 'two', 'two'], + ... 'C' : [1, 5, 5, 2, 5, 5], + ... 'D' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()).execute() + C D + 0 -1.154701 -0.577350 + 1 0.577350 0.000000 + 2 0.577350 1.154701 + 3 -1.154701 -1.000000 + 4 0.577350 -0.577350 + 5 0.577350 1.000000 + + Broadcast result of the transformation + + >>> grouped.transform(lambda x: x.max() - x.min()).execute() + C D + 0 4 6.0 + 1 3 8.0 + 2 4 6.0 + 3 3 8.0 + 4 4 6.0 + 5 3 8.0 + """ + call_agg = kwargs.pop("_call_agg", False) + if not call_agg and isinstance(f, (dict, list)): + raise TypeError(f"Does not support transform with {type(f)}") + + op = GroupByTransform( + func=f, args=args, kwds=kwargs, output_types=output_types, call_agg=call_agg + ) + return op( + groupby, + dtypes=dtypes, + dtype=dtype, + name=name, + index=index, + skip_infer=skip_infer, + ) diff --git a/python/xorbits/_mars/dataframe/indexing/__init__.py b/python/xorbits/_mars/dataframe/indexing/__init__.py new file mode 100644 index 000000000..10805c3a4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/__init__.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from pandas.util import cache_readonly + + from ..operands import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE + from .add_prefix_suffix import ( + df_add_prefix, + df_add_suffix, + series_add_prefix, + series_add_suffix, + ) + from .align import align + from .at import at + from .getitem import dataframe_getitem, series_getitem + from .iat import iat + from .iloc import head, iloc, index_getitem, index_setitem, tail + from .insert import df_insert + from .loc import loc + from .reindex import reindex, reindex_like + from .rename import df_rename, index_rename, index_set_names, series_rename + from .rename_axis import rename_axis + from .reset_index import df_reset_index, series_reset_index + from .sample import sample + from .set_axis import df_set_axis, series_set_axis + from .set_index import set_index + from .setitem import dataframe_setitem + from .where import mask, where + + for cls in DATAFRAME_TYPE + SERIES_TYPE: + setattr(cls, "iloc", cache_readonly(iloc)) + setattr(cls, "loc", cache_readonly(loc)) + setattr(cls, "iat", cache_readonly(iat)) + setattr(cls, "at", cache_readonly(at)) + setattr(cls, "head", head) + setattr(cls, "reindex", reindex) + setattr(cls, "reindex_like", reindex_like) + setattr(cls, "rename_axis", rename_axis) + setattr(cls, "tail", tail) + setattr(cls, "mask", mask) + setattr(cls, "where", where) + setattr(cls, "sample", sample) + + for cls in DATAFRAME_TYPE: + setattr(cls, "set_index", set_index) + setattr(cls, "__getitem__", dataframe_getitem) + setattr(cls, "__setitem__", dataframe_setitem) + setattr(cls, "insert", df_insert) + setattr(cls, "reset_index", df_reset_index) + setattr(cls, "rename", df_rename) + setattr(cls, "set_axis", df_set_axis) + setattr(cls, "add_prefix", df_add_prefix) + setattr(cls, "add_suffix", df_add_suffix) + setattr(cls, "align", align) + + for cls in SERIES_TYPE: + setattr(cls, "__getitem__", series_getitem) + setattr(cls, "reset_index", series_reset_index) + setattr(cls, "rename", series_rename) + setattr(cls, "set_axis", series_set_axis) + setattr(cls, "add_prefix", series_add_prefix) + setattr(cls, "add_suffix", series_add_suffix) + setattr(cls, "align", align) + + for cls in INDEX_TYPE: + setattr(cls, "__getitem__", index_getitem) + setattr(cls, "__setitem__", index_setitem) + setattr(cls, "rename", index_rename) + setattr(cls, "set_names", index_set_names) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/indexing/add_prefix_suffix.py b/python/xorbits/_mars/dataframe/indexing/add_prefix_suffix.py new file mode 100644 index 000000000..ed87cea1a --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/add_prefix_suffix.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
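The ``_install()`` hook above wires the indexing accessors (``iloc``, ``loc``, ``at``, ``iat``, ``head``, ``reindex`` and friends) onto the DataFrame, Series and Index types at import time via ``setattr``. A minimal, self-contained sketch of that registration pattern, using toy stand-in classes rather than the actual Mars types:

# Hypothetical stand-ins used purely for illustration; the real code patches
# DATAFRAME_TYPE / SERIES_TYPE / INDEX_TYPE and wraps accessors such as `iloc`
# in `cache_readonly`.
class ToyDataFrame:
    pass


class ToySeries:
    pass


def head(obj, n=5):
    # placeholder for the real accessor implementation
    return f"first {n} rows of {type(obj).__name__}"


def _install():
    for cls in (ToyDataFrame, ToySeries):
        setattr(cls, "head", head)


_install()
del _install

print(ToyDataFrame().head(3))  # -> first 3 rows of ToyDataFrame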
+ +import textwrap +from functools import partial + + +def _get_prefix_suffix_docs(is_prefix: bool): + if is_prefix: + action, pos = "prefix", "before" + r_action = "suffix" + else: + action, pos = "suffix", "after" + r_action = "prefix" + + def mk_col(ch: str, s: str): + return f"{ch}_{s}" if is_prefix else f"{s}_{ch}" + + doc = f""" + {action.capitalize()} labels with string `{action}`. + + For Series, the row labels are {action}ed. + For DataFrame, the column labels are {action}ed. + + Parameters + ---------- + {action} : str + The string to add {pos} each label. + + Returns + ------- + Series or DataFrame + New Series or DataFrame with updated labels. + + See Also + -------- + Series.add_{r_action}: Suffix row labels with string `{r_action}`. + DataFrame.add_{r_action}: Suffix column labels with string `{r_action}`. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4]) + >>> s.execute() + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.add_prefix({mk_col('item', '')!r}).execute() + {mk_col('item', '0')} 1 + {mk_col('item', '1')} 2 + {mk_col('item', '2')} 3 + {mk_col('item', '3')} 4 + dtype: int64 + + >>> df = md.DataFrame({{'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}}) + >>> df.execute() + A B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + + >>> df.add_prefix({mk_col('col', '')!r}).execute() + {mk_col('col', 'A')} {mk_col('col', 'B')} + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + """ + return textwrap.dedent(doc).strip() + + +def df_add_prefix(df, prefix): + f = partial("{prefix}{}".format, prefix=prefix) + return df.rename(columns=f) + + +def series_add_prefix(series, prefix): + f = partial("{prefix}{}".format, prefix=prefix) + return series.rename(index=f) + + +def df_add_suffix(df, suffix): + f = partial("{}{suffix}".format, suffix=suffix) + return df.rename(columns=f) + + +def series_add_suffix(series, suffix): + f = partial("{}{suffix}".format, suffix=suffix) + return series.rename(index=f) + + +df_add_prefix.__doc__ = _get_prefix_suffix_docs(True) +series_add_prefix.__doc__ = df_add_prefix.__doc__ +df_add_suffix.__doc__ = _get_prefix_suffix_docs(False) +series_add_suffix.__doc__ = df_add_suffix.__doc__ diff --git a/python/xorbits/_mars/dataframe/indexing/align.py b/python/xorbits/_mars/dataframe/indexing/align.py new file mode 100644 index 000000000..c12b4ed8d --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/align.py @@ -0,0 +1,554 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Optional, Union + +import numpy as np + +from ... 
import opcodes +from ...core import OutputType, get_output_types, recursive_tile +from ...serialization.serializables import ( + AnyField, + Int16Field, + Int64Field, + KeyField, + StringField, +) +from ...typing import TileableType +from ..align import ( + align_dataframe_dataframe, + align_dataframe_series, + align_series_series, +) +from ..core import IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index, validate_axis + + +class _NoNeedToAlign(Exception): + pass + + +class DataFrameAlign(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.ALIGN + + lhs = KeyField("lhs") + rhs = KeyField("rhs") + join = StringField("join", default=None) + axis = Int16Field("axis", default=None) + level = AnyField("level", default=None) + fill_value = AnyField("fill_value", default=None) + method = StringField("method", default=None) + limit = Int64Field("limit", default=None) + fill_axis = Int16Field("fill_axis", default=None) + broadcast_axis = Int16Field("broadcast_axis", default=None) + + @property + def output_limit(self) -> int: + return 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.lhs = inputs[0] + self.rhs = inputs[1] + + def __call__(self, lhs: TileableType, rhs: TileableType): + if self.broadcast_axis != 1 or lhs.ndim == rhs.ndim: + self._output_types = get_output_types(lhs, rhs) + else: + self._output_types = [OutputType.dataframe, OutputType.dataframe] + + if lhs.ndim == rhs.ndim: + if lhs.ndim == 1: + return self._call_series_series(lhs, rhs) + else: + return self._call_dataframe_dataframe(lhs, rhs) + else: + if lhs.ndim == 1: + # join order need to be reversed if not symmetric + asym_joins = {"left", "right"} - {self.join} + if len(asym_joins) == 1: # self.join in {"left", "right"} + self.join = asym_joins.pop() + # need to put dataframe first + self._output_types = get_output_types(rhs, lhs) + return self._call_dataframe_series(rhs, lhs)[::-1] + else: + return self._call_dataframe_series(lhs, rhs) + + def _call_dataframe_dataframe(self, lhs: TileableType, rhs: TileableType): + l_shape = list(lhs.shape) + r_shape = list(rhs.shape) + if self.axis is None or self.axis == 0: + l_idx_val = r_idx_val = self._merge_index( + lhs.index_value, rhs.index_value, how=self.join + ) + l_shape[0] = r_shape[0] = np.nan + else: + l_idx_val, r_idx_val = lhs.index_value, rhs.index_value + + if self.axis is None or self.axis == 1: + l_empty = build_empty_df(lhs.dtypes) + r_empty = build_empty_df(rhs.dtypes) + aligned, _ = l_empty.align(r_empty, axis=1) + l_dtypes = r_dtypes = aligned.dtypes + l_col_val = r_col_val = parse_index(aligned.columns, store_data=True) + l_shape[1] = r_shape[1] = len(l_dtypes) + else: + l_dtypes, r_dtypes = lhs.dtypes, rhs.dtypes + l_col_val, r_col_val = lhs.columns_value, rhs.columns_value + + l_kws = { + "index_value": l_idx_val, + "dtypes": l_dtypes, + "shape": tuple(l_shape), + "columns_value": l_col_val, + } + r_kws = { + "index_value": r_idx_val, + "dtypes": r_dtypes, + "shape": tuple(r_shape), + "columns_value": r_col_val, + } + return self.new_tileables([lhs, rhs], kws=[l_kws, r_kws]) + + def _call_dataframe_series(self, lhs: TileableType, rhs: TileableType): + l_shape = list(lhs.shape) + if self.axis == 0 or self.broadcast_axis == 1: + dtypes = lhs.dtypes + col_val = lhs.columns_value + l_idx_val = r_idx_val = self._merge_index( + lhs.index_value, rhs.index_value, how=self.join + ) + l_shape[0] = r_size = np.nan + else: + l_idx_val = lhs.index_value + if not 
rhs.index_value.has_value(): + dtypes = None + l_shape[1] = r_size = np.nan + col_val = r_idx_val = self._merge_index( + lhs.columns_value, rhs.index_value, how=self.join + ) + else: + series_index = rhs.index_value.to_pandas() + dtypes = lhs.dtypes.reindex( + lhs.dtypes.index.join(series_index, how=self.join) + ).fillna(np.dtype(np.float_)) + l_shape[1] = r_size = len(dtypes) + col_val = r_idx_val = parse_index(dtypes.index, store_data=True) + + l_kws = { + "index_value": l_idx_val, + "dtypes": dtypes, + "shape": tuple(l_shape), + "columns_value": col_val, + } + if self.broadcast_axis == 1: + r_kws = { + "index_value": r_idx_val, + "dtypes": dtypes, + "shape": tuple(l_shape), + "columns_value": col_val, + } + else: + r_kws = { + "index_value": r_idx_val, + "shape": (r_size,), + "dtype": rhs.dtype, + } + return self.new_tileables([lhs, rhs], kws=[l_kws, r_kws]) + + def _call_series_series(self, lhs: TileableType, rhs: TileableType): + idx = self._merge_index(lhs.index_value, rhs.index_value, how=self.join) + kws = [ + {"index_value": idx, "shape": (np.nan,), "dtype": lhs.dtype}, + {"index_value": idx, "shape": (np.nan,), "dtype": rhs.dtype}, + ] + return self.new_tileables([lhs, rhs], kws=kws) + + @staticmethod + def _merge_index( + left_index_value: IndexValue, right_index_value: IndexValue, how: str = "outer" + ): + left_pd = left_index_value.to_pandas() + right_pd = right_index_value.to_pandas() + + if not left_index_value.has_value() or not right_index_value.has_value(): + left_pd = left_pd[:0] + right_pd = right_pd[:0] + store_data = False + else: + store_data = True + + joined = left_pd.join(right_pd, how=how) + if store_data: + return parse_index(joined, store_data=store_data) + else: + return parse_index( + joined, + {left_index_value.key, right_index_value.key}, + store_data=store_data, + ) + + @classmethod + def _select_nsplits( + cls, op: "DataFrameAlign", tileable: TileableType, val_to_replace: list + ): + if op.axis is None: + return val_to_replace[: tileable.ndim] + else: + attr_val = tileable.nsplits + axis = op.axis % tileable.ndim + return [ + tuple(val_to_replace[op.axis]) if i == axis else attr_val[i] + for i in range(len(attr_val)) + ] + + @classmethod + def _build_tiled_kw( + cls, op: "DataFrameAlign", idx: int, chunks: list, nsplits: list + ): + in_tileable = op.inputs[idx] + out_tileable = op.outputs[idx] + kw = out_tileable.params.copy() + kw.update( + { + "chunks": chunks, + "nsplits": tuple(cls._select_nsplits(op, in_tileable, nsplits)), + } + ) + return kw + + @classmethod + def _check_align_needed( + cls, op: "DataFrameAlign", left_chunks: list, right_chunks: list + ): + lhs, rhs = op.lhs, op.rhs + if all(lc.key == rc.key for lc, rc in zip(lhs.chunks, left_chunks)) and all( + lc.key == rc.key for lc, rc in zip(rhs.chunks, right_chunks) + ): + raise _NoNeedToAlign + + @classmethod + def _tile_dataframe_dataframe(cls, op: "DataFrameAlign"): + lhs, rhs = op.lhs, op.rhs + nsplits, chunk_shapes, left_chunks, right_chunks = align_dataframe_dataframe( + lhs, rhs, axis=op.axis + ) + cls._check_align_needed(op, left_chunks, right_chunks) + + left_chunk_array = np.array(left_chunks, dtype="O").reshape(chunk_shapes[0]) + right_chunk_array = np.array(right_chunks, dtype="O").reshape(chunk_shapes[1]) + + left_idx_to_chunk = dict() + l_chunks, r_chunks = [], [] + + iterator = np.nditer(right_chunk_array, flags=["refs_ok", "multi_index"]) + for rc_obj in iterator: + rc = rc_obj.tolist() + r_index = iterator.multi_index + l_index = tuple(r_index[i] % chunk_shapes[0][i] for i in 
(0, 1)) + lc = left_chunk_array[l_index] + + kws = [lc.params, rc.params] + kws[0]["index"] = l_index + kws[1]["index"] = r_index + + chunk_op = op.copy().reset_key() + l_chunk, r_chunk = chunk_op.new_chunks([lc, rc], kws=kws) + left_idx_to_chunk[l_index] = l_chunk + r_chunks.append(r_chunk) + + iterator = np.nditer(left_chunk_array, flags=["refs_ok", "multi_index"]) + for lc_obj in iterator: + lc = lc_obj.tolist() + l_index = iterator.multi_index + try: + l_chunk = left_idx_to_chunk[l_index] + l_chunks.append(l_chunk) + continue + except KeyError: + pass + + r_index = tuple(l_index[i] % chunk_shapes[1][i] for i in (0, 1)) + rc = right_chunk_array[r_index] + + kws = [lc.params, rc.params] + kws[0]["index"] = l_index + + chunk_op = op.copy().reset_key() + l_chunk, _r_chunk = chunk_op.new_chunks([lc, rc], kws=kws) + l_chunks.append(l_chunk) + + return nsplits, l_chunks, r_chunks + + @classmethod + def _tile_dataframe_series(cls, op: "DataFrameAlign"): + lhs, rhs = op.lhs, op.rhs + nsplits, left_chunk_shape, left_chunks, right_chunks = align_dataframe_series( + lhs, rhs, axis=op.axis + ) + cls._check_align_needed(op, left_chunks, right_chunks) + + left_chunk_array = np.array(left_chunks, dtype="O").reshape(left_chunk_shape) + axis = op.axis if op.broadcast_axis != 1 else 0 + l_chunks, r_chunks = [], [] + iterator = np.nditer(left_chunk_array, flags=["refs_ok", "multi_index"]) + for c_obj in iterator: + c = c_obj.tolist() + l_index = iterator.multi_index + + right_chunk = right_chunks[l_index[axis]] + kws = [c.params, right_chunk.params] + kws[0]["index"] = l_index + if op.broadcast_axis != 1: + kws[1]["index"] = (l_index[axis],) + else: + kws[1]["index"] = l_index + + chunk_op = op.copy().reset_key() + l_chunk, r_chunk = chunk_op.new_chunks([c, right_chunk], kws=kws) + + l_chunks.append(l_chunk) + if op.broadcast_axis == 1 or l_index[1 - axis] == 0: + r_chunks.append(r_chunk) + + return nsplits, l_chunks, r_chunks + + @classmethod + def _tile_series_series(cls, op: "DataFrameAlign"): + nsplits, _, left_chunks, right_chunks = align_series_series(op.lhs, op.rhs) + cls._check_align_needed(op, left_chunks, right_chunks) + + l_chunks, r_chunks = [], [] + for idx, (lc, rc) in enumerate(zip(left_chunks, right_chunks)): + kws = [lc.params, rc.params] + kws[0]["index"] = kws[1]["index"] = (idx,) + + chunk_op = op.copy().reset_key() + l_chunk, r_chunk = chunk_op.new_chunks([lc, rc], kws=kws) + l_chunks.append(l_chunk) + r_chunks.append(r_chunk) + return nsplits, l_chunks, r_chunks + + @classmethod + def _tile_with_fillna(cls, tileable: TileableType): + op = tileable.op + if op.method is None: + return tileable + axis = op.fill_axis if tileable.ndim == 2 else 0 + tileable = tileable.fillna(method=op.method, limit=op.limit, axis=axis) + return (yield from recursive_tile(tileable)) + + @classmethod + def _make_direct_output_kws(cls, left: TileableType, right: TileableType): + kws = [left.params, right.params] + kws[0].update(dict(chunks=left.chunks, nsplits=left.nsplits)) + kws[1].update(dict(chunks=right.chunks, nsplits=right.nsplits)) + return kws + + @classmethod + def tile(cls, op: "DataFrameAlign"): + try: + if op.lhs.ndim == op.rhs.ndim: + if op.lhs.ndim == 2: + nsplits, left_chunks, right_chunks = cls._tile_dataframe_dataframe( + op + ) + else: + nsplits, left_chunks, right_chunks = cls._tile_series_series(op) + else: + nsplits, left_chunks, right_chunks = cls._tile_dataframe_series(op) + except _NoNeedToAlign: + kws = cls._make_direct_output_kws(op.lhs, op.rhs) + else: + kws = [ + 
cls._build_tiled_kw(op, 0, left_chunks, nsplits), + cls._build_tiled_kw(op, 1, right_chunks, nsplits), + ] + new_left, new_right = op.copy().new_tileables(op.inputs, kws=kws) + + new_left_filled = yield from cls._tile_with_fillna(new_left) + new_right_filled = yield from cls._tile_with_fillna(new_right) + if new_left_filled is not new_left or new_right_filled is not new_right: + kws = cls._make_direct_output_kws(new_left_filled, new_right_filled) + new_left, new_right = op.copy().new_tileables(op.inputs, kws=kws) + + return [new_left, new_right] + + @classmethod + def execute(cls, ctx, op: "DataFrameAlign"): + lhs_val = ctx[op.lhs.key] + rhs_val = ctx[op.rhs.key] + l_res, r_res = lhs_val.align( + rhs_val, + axis=op.axis, + join=op.join, + fill_value=op.fill_value, + broadcast_axis=op.broadcast_axis, + ) + ctx[op.outputs[0].key] = l_res + ctx[op.outputs[1].key] = r_res + + +def align( + df, + other, + join: str = "outer", + axis: Union[int, str, None] = None, + level: Union[int, str, None] = None, + copy: bool = True, + fill_value: Any = None, + method: str = None, + limit: Optional[int] = None, + fill_axis: Union[int, str] = 0, + broadcast_axis: Union[int, str] = None, +): + """ + Align two objects on their axes with the specified join method. + + Join method is specified for each axis Index. + + Parameters + ---------- + other : DataFrame or Series + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + axis : allowed axis of the other object, default None + Align on index (0), columns (1), or both (None). + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level. + copy : bool, default True + Always returns new objects. If copy=False and no reindexing is + required then original objects are returned. + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series: + + - pad / ffill: propagate last valid observation forward to next valid. + - backfill / bfill: use NEXT valid observation to fill gap. + + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + fill_axis : {0 or 'index', 1 or 'columns'}, default 0 + Filling axis, method and limit. + broadcast_axis : {0 or 'index', 1 or 'columns'}, default None + Broadcast values along this axis, if aligning two objects of + different dimensions. + + Notes + ----- + Currently argument `level` is not supported. + + Returns + ------- + (left, right) : (DataFrame, type of other) + Aligned objects. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame( + ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2] + ... ) + >>> other = md.DataFrame( + ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]], + ... columns=["A", "B", "C", "D"], + ... index=[2, 3, 4], + ... 
) + >>> df.execute() + D B E A + 1 1 2 3 4 + 2 6 7 8 9 + >>> other.execute() + A B C D + 2 10 20 30 40 + 3 60 70 80 90 + 4 600 700 800 900 + + Align on columns: + + >>> left, right = df.align(other, join="outer", axis=1) + >>> left.execute() + A B C D E + 1 4 2 NaN 1 3 + 2 9 7 NaN 6 8 + >>> right.execute() + A B C D E + 2 10 20 30 40 NaN + 3 60 70 80 90 NaN + 4 600 700 800 900 NaN + + We can also align on the index: + + >>> left, right = df.align(other, join="outer", axis=0) + >>> left.execute() + D B E A + 1 1.0 2.0 3.0 4.0 + 2 6.0 7.0 8.0 9.0 + 3 NaN NaN NaN NaN + 4 NaN NaN NaN NaN + >>> right.execute() + A B C D + 1 NaN NaN NaN NaN + 2 10.0 20.0 30.0 40.0 + 3 60.0 70.0 80.0 90.0 + 4 600.0 700.0 800.0 900.0 + + Finally, the default `axis=None` will align on both index and columns: + + >>> left, right = df.align(other, join="outer", axis=None) + >>> left.execute() + A B C D E + 1 4.0 2.0 NaN 1.0 3.0 + 2 9.0 7.0 NaN 6.0 8.0 + 3 NaN NaN NaN NaN NaN + 4 NaN NaN NaN NaN NaN + >>> right.execute() + A B C D E + 1 NaN NaN NaN NaN NaN + 2 10.0 20.0 30.0 40.0 NaN + 3 60.0 70.0 80.0 90.0 NaN + 4 600.0 700.0 800.0 900.0 NaN + """ + axis = validate_axis(axis) if axis is not None else None + fill_axis = validate_axis(fill_axis) if fill_axis is not None else None + broadcast_axis = ( + validate_axis(broadcast_axis) if broadcast_axis is not None else None + ) + + if level is not None: + raise NotImplementedError(f"Argument `level` not supported") + if df.ndim != other.ndim and axis is None: + raise ValueError("Must specify axis=0 or 1") + + op = DataFrameAlign( + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/indexing/at.py b/python/xorbits/_mars/dataframe/indexing/at.py new file mode 100644 index 000000000..35b37460f --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/at.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .loc import DataFrameLoc + + +class DataFrameAt: + def __init__(self, obj): + self._obj = obj + self._loc = DataFrameLoc(self._obj) + + def __getitem__(self, indexes): + if not isinstance(indexes, tuple): + indexes = (indexes,) + + for index in indexes: + if not np.isscalar(index): + raise ValueError("Invalid call for scalar access (getting)!") + + return self._loc[indexes] + + +def at(a): + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + If 'label' does not exist in DataFrame. + + See Also + -------- + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + Series.at : Access a single value using a label. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df.execute() + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'].execute() + 2 + + # Set value at specified row/column pair + # + # >>> df.at[4, 'B'] = 10 + # >>> df.at[4, 'B'] + # 10 + + Get value within a Series + + >>> df.loc[5].at['B'].execute() + 4 + """ + return DataFrameAt(a) diff --git a/python/xorbits/_mars/dataframe/indexing/getitem.py b/python/xorbits/_mars/dataframe/indexing/getitem.py new file mode 100644 index 000000000..aea1f3e90 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/getitem.py @@ -0,0 +1,635 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from numbers import Integral + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ...tensor.datasource import tensor as astensor +from ...utils import has_unknown_shape +from ..align import align_dataframe_dataframe, align_dataframe_series +from ..core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + is_chunk_meta_lazy, +) +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import in_range_index, parse_index +from .utils import calc_columns_index + + +class SeriesIndex(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "series" + _op_type_ = OperandDef.INDEX + + _labels = AnyField("labels") + + _combine_size = Int32Field("combine_size") + _is_intermediate = BoolField("is_intermediate") + + def __init__( + self, + labels=None, + combine_size=None, + is_intermediate=None, + output_types=None, + **kw, + ): + super().__init__( + _labels=labels, + _combine_size=combine_size, + _is_intermediate=is_intermediate, + _output_types=output_types, + **kw, + ) + + @property + def labels(self): + return self._labels + + @property + def combine_size(self): + return self._combine_size + + @property + def is_intermediate(self): + return self._is_intermediate + + def __call__(self, series, name=None): + return self.new_tileable([series], dtype=series.dtype, name=name) + + def _new_tileables(self, inputs, kws=None, **kw): + # Override this method to automatically decide the output type, + # when `labels` is a list, we will set `output_types` as series, + # otherwise it will be a scalar. 
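# For example (hypothetical series ``s``): ``s[['a', 'b']]`` hits the list branch below and
# produces a Series of length 2, while ``s['a']`` takes the scalar branch and produces a
# scalar result with shape ``()``.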
+ output_types = getattr(self, "_output_types", None) + shape = kw.pop("shape", None) + is_scalar = not isinstance(self._labels, list) + if not output_types: + output_types = [OutputType.scalar] if is_scalar else [OutputType.series] + self.output_types = output_types + if shape is None: + shape = () if is_scalar else ((len(self._labels)),) + kw["shape"] = shape + if not is_scalar: + index_value = kw.pop("index_value", None) or parse_index( + pd.Index(self._labels) + ) + kw["index_value"] = index_value + return super()._new_tileables(inputs, kws=kws, **kw) + + def _new_chunks(self, inputs, kws=None, **kw): + # Override this method to automatically decide the output type, + # when `labels` is a list, we will set `output_types` as series, + # otherwise it will be a scalar. + output_types = getattr(self, "_output_types", None) + is_scalar = not isinstance(self._labels, list) + if not output_types: + output_types = [OutputType.scalar] if is_scalar else [OutputType.series] + self.output_types = output_types + if kw.get("shape", None) is None: + shape = () if is_scalar else ((len(self._labels)),) + kw["shape"] = shape + if not is_scalar: + index_value = kw.pop("index_value", None) or parse_index( + pd.Index(self._labels) + ) + kw["index_value"] = index_value + else: + # tensor chunk cannot accept index_value + kw.pop("index_value", None) + return super()._new_chunks(inputs, kws=kws, **kw) + + @classmethod + def _calc_chunk_index(cls, label, chunk_indexes): + for i, index in enumerate(chunk_indexes): + if isinstance(index, pd.RangeIndex) and in_range_index(label, index): + return i + elif label in index: + return i + raise TypeError(f"label {label} doesn't exist") + + @classmethod + def _tile_one_chunk(cls, op): + in_series = op.inputs[0] + out_series = op.outputs[0] + + index_op = SeriesIndex(labels=op.labels) + kw = {"name": out_series.name} if hasattr(out_series, "name") else {} + index_chunk = index_op.new_chunk(in_series.chunks, dtype=out_series.dtype, **kw) + new_op = op.copy() + nsplits = ((len(op.labels),),) if isinstance(op.labels, list) else () + return new_op.new_tileables( + op.inputs, chunks=[index_chunk], nsplits=nsplits, dtype=out_series.dtype + ) + + @classmethod + def _tree_getitem(cls, op): + """ + DataFrame doesn't store the index value except RangeIndex or specify `store=True` in `parse_index`, + So we build a tree structure to avoid too much dependence for getitem node. 
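        Chunks are concatenated in groups of at most ``combine_size`` and filtered by the
        requested labels at every level, so no single node in the graph depends on more
        than ``combine_size`` inputs.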
+ """ + out_series = op.outputs[0] + combine_size = options.combine_size + chunks = op.inputs[0].chunks + while len(chunks) > combine_size: + new_chunks = [] + for i in range(0, len(chunks), combine_size): + chks = chunks[i : i + combine_size] + if len(chks) == 1: + chk = chks[0] + else: + concat_op = DataFrameConcat(output_types=[OutputType.series]) + chk = concat_op.new_chunk(chks, dtype=chks[0].dtype) + chk_op = SeriesIndex(labels=op.labels, is_intermediate=True) + kw = {"name": out_series.name} if hasattr(out_series, "name") else {} + chk = chk_op.new_chunk( + [chk], + shape=(np.nan,), + dtype=chk.dtype, + index_value=parse_index(pd.RangeIndex(-1)), + **kw, + ) + new_chunks.append(chk) + chunks = new_chunks + + concat_op = DataFrameConcat(output_types=[OutputType.series]) + kw = {"name": out_series.name} if hasattr(out_series, "name") else {} + kw["index"] = (0,) + chk = concat_op.new_chunk(chunks, dtype=chunks[0].dtype, **kw) + index_op = SeriesIndex(labels=op.labels) + chunk = index_op.new_chunk([chk], dtype=chk.dtype, **kw) + new_op = op.copy() + nsplits = ((len(op.labels),),) if isinstance(op.labels, list) else () + kw = out_series.params + kw["nsplits"] = nsplits + kw["chunks"] = [chunk] + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def tile(cls, op): + in_series = op.inputs[0] + out_series = op.outputs[0] + + if len(in_series.chunks) == 1: + return cls._tile_one_chunk(op) + if not in_series.index_value.has_value(): + return cls._tree_getitem(op) + + chunk_indexes = [c.index_value.to_pandas() for c in in_series.chunks] + if not isinstance(op.labels, list): + selected_chunk = in_series.chunks[ + cls._calc_chunk_index(op.labels, chunk_indexes) + ] + index_op = op.copy().reset_key() + out_chunk = index_op.new_chunk( + [selected_chunk], shape=(), dtype=selected_chunk.dtype + ) + new_op = op.copy() + return new_op.new_scalars( + op.inputs, dtype=out_series.dtype, chunks=[out_chunk] + ) + else: + # When input series's index is RangeIndex(5), chunk_size is 3, and labels is [4, 2, 3, 4], + # Combine the labels in the same chunk, so the splits will be [[4], [2], [3, 4]], + # the corresponding chunk index is [1, 0, 1]. + selected_index = [ + cls._calc_chunk_index(label, chunk_indexes) for label in op.labels + ] + condition = np.where(np.diff(selected_index))[0] + 1 + column_splits = np.split(op.labels, condition) + column_indexes = np.split(selected_index, condition) + + out_chunks = [] + nsplits = [] + for i, (labels, idx) in enumerate(zip(column_splits, column_indexes)): + index_op = SeriesIndex(labels=list(labels)) + c = in_series.chunks[idx[0]] + nsplits.append(len(labels)) + index_value = parse_index( + pd.Index([], dtype=c.index_value.to_pandas().dtype), c, labels + ) + out_chunks.append( + index_op.new_chunk( + [c], + shape=(len(labels),), + dtype=c.dtype, + index_value=index_value, + name=c.name, + index=(i,), + ) + ) + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + shape=out_series.shape, + dtype=out_series.dtype, + index_value=out_series.index_value, + nsplits=(tuple(nsplits),), + chunks=out_chunks, + name=out_series.name, + ) + + @classmethod + def execute(cls, ctx, op): + series = ctx[op.inputs[0].key] + labels = op.labels + if op.is_intermediate: + # for intermediate result, it is always a series even if labels is a scalar. 
+ labels = labels if isinstance(labels, list) else [labels] + labels = [label for label in set(labels) if label in series] + ctx[op.outputs[0].key] = series[labels] + + +class DataFrameIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.INDEX + + col_names = AnyField("col_names", default=None) + + # for bool index + mask = AnyField("mask", default=None) + identical_index = BoolField("identical_index") + + def __init__(self, output_types=None, **kw): + output_types = output_types or [OutputType.series] + super().__init__(_output_types=output_types, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.col_names, ENTITY_TYPE): + self.col_names = self._inputs[0] + if isinstance(self.mask, ENTITY_TYPE): + self.mask = self._inputs[-1] + + def __call__(self, df): + if self.col_names is not None: + # if col_names is a list, return a DataFrame, else return a Series + col_names = self.col_names + if not isinstance(col_names, list): + col_names = [col_names] + is_list = False + else: + is_list = True + + dtypes_list = df._get_dtypes_by_columns(col_names) + if is_list or len(dtypes_list) > 1: + if len(col_names) != len(dtypes_list): + col_names = df._get_columns_by_columns(col_names) + columns = parse_index(pd.Index(col_names), store_data=True) + return self.new_dataframe( + [df], + shape=(df.shape[0], len(col_names)), + dtypes=pd.Series(dtypes_list, index=col_names, dtype=np.dtype("O")), + index_value=df.index_value, + columns_value=columns, + ) + else: + dtype = dtypes_list[0] + return self.new_series( + [df], + shape=(df.shape[0],), + dtype=dtype, + index_value=df.index_value, + name=self.col_names, + ) + else: + if isinstance(self.mask, (SERIES_TYPE, DATAFRAME_TYPE, TENSOR_TYPE)): + index_value = parse_index( + pd.Index( + [], + dtype=df.index_value.to_pandas().dtype, + name=df.index_value.name, + ), + df, + self.mask, + ) + return self.new_dataframe( + [df, self.mask], + shape=(np.nan, df.shape[1]), + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + else: + index_value = parse_index( + pd.Index( + [], + dtype=df.index_value.to_pandas().dtype, + name=df.index_value.name, + ), + df, + self.mask, + ) + return self.new_dataframe( + [df], + shape=(np.nan, df.shape[1]), + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + + @classmethod + def tile(cls, op): + if op.col_names is not None: + return cls.tile_with_columns(op) + else: + return (yield from cls.tile_with_mask(op)) + + @classmethod + def tile_with_mask(cls, op: "DataFrameIndex"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + + if isinstance(op.mask, (SERIES_TYPE, DATAFRAME_TYPE, TENSOR_TYPE)): + mask = op.inputs[1] + + if hasattr(mask, "index_value") and mask.ndim == 1 and op.identical_index: + if has_unknown_shape(in_df, mask): + yield + nsplits = ((np.nan,) * in_df.chunk_shape[0], in_df.nsplits[1]) + out_shape = in_df.chunk_shape + df_chunks = in_df.chunks + aligned_mask = yield from recursive_tile( + mask.rechunk(in_df.nsplits[: mask.ndim]) + ) + mask_chunks = aligned_mask.chunks + elif isinstance(mask, SERIES_TYPE): + nsplits, out_shape, df_chunks, mask_chunks = align_dataframe_series( + in_df, mask, axis="index" + ) + elif isinstance(mask, DATAFRAME_TYPE): + nsplits, out_shapes, df_chunks, mask_chunks = align_dataframe_dataframe( + in_df, mask + ) + out_shape = out_shapes[0] + else: + # tensor + nsplits = in_df.nsplits + mask = yield from recursive_tile(mask.rechunk(nsplits[: 
mask.ndim])) + out_shape = in_df.chunk_shape + df_chunks = in_df.chunks + mask_chunks = mask.chunks + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for i, idx, df_chunk in zip( + itertools.count(), out_chunk_indexes, df_chunks + ): + if op.mask.ndim == 1: + mask_chunk = mask_chunks[df_chunk.index[0]] + else: + mask_chunk = mask_chunks[i] + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [df_chunk, mask_chunk], + index=idx, + shape=(np.nan, df_chunk.shape[1]), + ) + ) + out_chunk._set_tileable_meta( + tileable_key=out_df.key, + nsplits=nsplits, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + out_chunks.append(out_chunk) + else: + if has_unknown_shape(in_df): + yield + nsplits_acc = np.cumsum((0,) + in_df.nsplits[0]) + for idx in range(in_df.chunk_shape[0]): + for idxj in range(in_df.chunk_shape[1]): + in_chunk = in_df.cix[idx, idxj] + chunk_op = op.copy().reset_key() + chunk_op.mask = op.mask.iloc[ + nsplits_acc[idx] : nsplits_acc[idx + 1] + ] + out_chunk = chunk_op.new_chunk( + [in_chunk], + index=in_chunk.index, + shape=(np.nan, in_chunk.shape[1]), + dtypes=in_chunk.dtypes, + index_value=in_df.index_value, + columns_value=in_chunk.columns_value, + ) + out_chunks.append(out_chunk) + + nsplits_on_columns = tuple(c.shape[1] for c in out_chunks if c.index[0] == 0) + row_chunk_num = len([c.shape[0] for c in out_chunks if c.index[1] == 0]) + nsplits = ((np.nan,) * row_chunk_num, nsplits_on_columns) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def tile_with_columns(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + col_names = op.col_names + chunk_meta_lazy = is_chunk_meta_lazy(in_df.chunks[0]) + if out_df.ndim < 2: + # Series + column_index = calc_columns_index(col_names, in_df)[0] + out_chunks = [] + dtype = in_df.dtypes[col_names] + out_nsplits = (in_df.nsplits[0],) + for i in range(in_df.chunk_shape[0]): + c = in_df.cix[(i, column_index)] + chunk_op = DataFrameIndex(col_names=col_names) + if chunk_meta_lazy: + out_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0],), + index=(i,), + dtype=dtype, + name=col_names, + ) + out_chunk._set_tileable_meta( + tileable_key=out_df.key, + nsplits=out_nsplits, + index_value=out_df.index_value, + ) + else: + out_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0],), + index=(i,), + dtype=dtype, + index_value=c.index_value, + name=col_names, + ) + out_chunks.append(out_chunk) + new_op = op.copy() + params = out_df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = out_nsplits + return new_op.new_seriess(op.inputs, kws=[params]) + else: + # combine columns into one chunk and keep the columns order at the same time. + # When chunk columns are ['c1', 'c2', 'c3'], ['c4', 'c5'], + # selected columns are ['c2', 'c3', 'c4', 'c2'], `column_splits` will be + # [(['c2', 'c3'], 0), ('c4', 1), ('c2', 0)]. 
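# (The grouping is computed by ``np.where(np.diff(selected_index))[0] + 1`` below: for the
# example above ``selected_index`` is [0, 0, 1, 0], ``condition`` becomes [2, 3], and
# ``np.split`` then yields exactly the three groups shown.)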
+ if not isinstance(col_names, _list_like_types): + col_names = [col_names] + selected_index = [calc_columns_index(col, in_df) for col in col_names] + selected_index = list(itertools.chain.from_iterable(selected_index)) + condition = np.where(np.diff(selected_index))[0] + 1 + column_splits = np.split(col_names, condition) + column_indexes = np.split(selected_index, condition) + + out_chunks = [[] for _ in range(in_df.chunk_shape[0])] + nsplits = [in_df.nsplits[0], []] + column_nsplits = nsplits[1] + for i, (columns, column_idx) in enumerate( + zip(column_splits, column_indexes) + ): + try: + dtypes = in_df.dtypes[columns] + except ValueError: # pragma: no cover + dtypes = in_df.dtypes[list(columns)] + column_nsplits.append(len(dtypes)) + for j in range(in_df.chunk_shape[0]): + c = in_df.cix[(j, column_idx[0])] + index_op = DataFrameIndex( + col_names=list(columns), output_types=[OutputType.dataframe] + ) + if chunk_meta_lazy: + out_chunk = index_op.new_chunk( + [c], shape=(c.shape[0], len(dtypes)), index=(j, i) + ) + out_chunk._set_tileable_meta( + tileable_key=out_df.key, + nsplits=nsplits, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + else: + out_chunk = index_op.new_chunk( + [c], + shape=(c.shape[0], len(dtypes)), + index=(j, i), + dtypes=dtypes, + index_value=c.index_value, + columns_value=parse_index( + pd.Index(dtypes.index), store_data=True + ), + ) + out_chunks[j].append(out_chunk) + out_chunks = [item for cl in out_chunks for item in cl] + new_op = op.copy() + params = out_df.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = nsplits + return new_op.new_dataframes(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameIndex"): + if op.mask is None: + df = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = df[op.col_names] + else: + df = ctx[op.inputs[0].key] + if isinstance( + op.mask, (SERIES_CHUNK_TYPE, DATAFRAME_CHUNK_TYPE, TENSOR_CHUNK_TYPE) + ): + mask = ctx[op.inputs[1].key] + else: + mask = op.mask + if hasattr(mask, "reindex_like") and not op.identical_index: + mask = mask.reindex_like(df).fillna(False) + ctx[op.outputs[0].key] = df[mask] + + @classmethod + def estimate_size(cls, ctx: dict, op: "DataFrameIndex"): + super().estimate_size(ctx, op) + result_size = ctx[op.outputs[0].key][0] + ctx[op.outputs[0].key] = (result_size, result_size) + + +_list_like_types = (list, np.ndarray, SERIES_TYPE, pd.Series, TENSOR_TYPE) + + +def dataframe_getitem(df, item): + columns_set = set(df.dtypes.keys()) + + if isinstance(item, (np.ndarray, pd.Series)) and item.dtype != np.bool_: + item = item.tolist() + + if isinstance(item, slice): + edge = item.start if item.start is not None else item.stop + if isinstance(edge, Integral): + return df.iloc[item] + else: + return df.loc[item] + elif isinstance(item, list): + for col_name in item: + if col_name not in columns_set: + raise KeyError(f"{col_name} not in columns") + op = DataFrameIndex(col_names=item, output_types=[OutputType.dataframe]) + elif isinstance(item, _list_like_types) or hasattr(item, "dtypes"): + # NB: don't enforce the dtype of `item` to be `bool` since it may be unknown + if isinstance(item, DATAFRAME_TYPE + SERIES_TYPE): + identical_index = df.index_value.key == item.index_value.key + else: + identical_index = False + op = DataFrameIndex( + mask=item, + identical_index=identical_index, + output_types=[OutputType.dataframe], + ) + else: + if item not in columns_set: + raise KeyError(f"{item} not in columns {columns_set}") + op = 
DataFrameIndex(col_names=item)
+    return op(df)
+
+
+def series_getitem(series, labels, combine_size=None):
+    if isinstance(labels, list) or np.isscalar(labels):
+        op = SeriesIndex(labels=labels, combine_size=combine_size)
+        return op(series, name=series.name)
+    elif isinstance(labels, _list_like_types) and astensor(labels).dtype == np.bool_:
+        return series.loc[labels]
+    elif isinstance(labels, slice):
+        edge = labels.start if labels.start is not None else labels.stop
+        if isinstance(edge, Integral):
+            return series.iloc[labels]
+        else:
+            return series.loc[labels]
+    else:
+        raise NotImplementedError(f"type {type(labels)} is not supported for getitem")
diff --git a/python/xorbits/_mars/dataframe/indexing/iat.py b/python/xorbits/_mars/dataframe/indexing/iat.py
new file mode 100644
index 000000000..c03dfb17b
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/indexing/iat.py
@@ -0,0 +1,37 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numbers import Integral
+
+from .iloc import DataFrameIloc
+
+
+class DataFrameIat:
+    def __init__(self, obj):
+        self._obj = obj
+        self._iloc = DataFrameIloc(self._obj)
+
+    def __getitem__(self, indexes):
+        if not isinstance(indexes, tuple):
+            indexes = (indexes,)
+
+        for index in indexes:
+            if not isinstance(index, Integral):
+                raise ValueError("Invalid call for scalar access (getting)!")
+
+        return self._iloc[indexes]
+
+
+def iat(a):
+    return DataFrameIat(a)
diff --git a/python/xorbits/_mars/dataframe/indexing/iloc.py b/python/xorbits/_mars/dataframe/indexing/iloc.py
new file mode 100644
index 000000000..225cf6fd7
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/indexing/iloc.py
@@ -0,0 +1,893 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+from numbers import Integral
+
+import numpy as np
+import pandas as pd
+from pandas.core.dtypes.cast import find_common_type
+from pandas.core.indexing import IndexingError
+
+from ...
import opcodes as OperandDef +from ...config import options +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...serialization.serializables import AnyField, KeyField, ListField +from ...tensor import asarray +from ...tensor.datasource.empty import empty +from ...tensor.indexing.core import calc_shape +from ...utils import ceildiv +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import indexing_index_value, is_cudf +from .index_lib import DataFrameIlocIndexesHandler + +_ILOC_ERROR_MSG = ( + "Location based indexing can only have [integer, " + "integer slice (START point is INCLUDED, END point is EXCLUDED), " + "listlike of integers, boolean array] types" +) + + +def process_iloc_indexes(inp, indexes): + ndim = inp.ndim + + if not isinstance(indexes, tuple): + indexes = (indexes,) + if len(indexes) < ndim: + indexes += (slice(None),) * (ndim - len(indexes)) + if len(indexes) > ndim: + raise IndexingError("Too many indexers") + + new_indexes = [] + # check each index + for ax, index in enumerate(indexes): + if isinstance(index, tuple): + # a tuple should already have been caught by this point + # so don't treat a tuple as a valid indexer + raise IndexingError("Too many indexers") + elif isinstance(index, slice): + if any(v is not None for v in [index.start, index.stop, index.step]): + pd_index = ( + inp.index_value if ax == 0 else inp.columns_value + ).to_pandas() + for val in [index.start, index.stop, index.step]: + if val is not None: + try: + pd_index[val] # check on the pandas + except IndexError: + pass + except TypeError: + raise TypeError( + f"cannot do slice indexing on {type(pd_index)} " + f"with these indexers [{val}] of {type(val)}" + ) + new_indexes.append(index) + elif isinstance(index, (list, np.ndarray, pd.Series, ENTITY_TYPE)): + if not isinstance(index, ENTITY_TYPE): + index = np.asarray(index) + else: + index = asarray(index) + if ax == 1: + # do not support tensor index on axis 1 + # because if so, the dtypes and columns_value would be unknown + try: + index = index.fetch() + except (RuntimeError, ValueError): + raise NotImplementedError( + "indexer on axis columns cannot be non-executed tensor" + ) + if index.dtype != np.bool_: + index = index.astype(np.int64) + if index.ndim != 1: + raise ValueError( + "Buffer has wrong number of dimensions " + f"(expected 1, got {index.ndim})" + ) + new_indexes.append(index) + elif isinstance(index, Integral): + shape = inp.shape[ax] + if not np.isnan(shape): + if index < -shape or index >= shape: + raise IndexError("single positional indexer is out-of-bounds") + new_indexes.append(index) + else: + raise ValueError(_ILOC_ERROR_MSG) + + return new_indexes + + +class DataFrameIloc: + def __init__(self, obj): + self._obj = obj + + def __getitem__(self, indexes): + if isinstance(self._obj, DATAFRAME_TYPE): + op = DataFrameIlocGetItem(indexes=process_iloc_indexes(self._obj, indexes)) + else: + op = SeriesIlocGetItem(indexes=process_iloc_indexes(self._obj, indexes)) + return op(self._obj) + + def __setitem__(self, indexes, value): + if not np.isscalar(value): + raise NotImplementedError("Only scalar value is supported to set by iloc") + + if isinstance(self._obj, DATAFRAME_TYPE): + op = DataFrameIlocSetItem( + indexes=process_iloc_indexes(self._obj, indexes), value=value + ) + else: + op = SeriesIlocSetItem( + indexes=process_iloc_indexes(self._obj, indexes), value=value + ) + + ret = op(self._obj) + self._obj.data = ret.data + + +class 
HeadTailOptimizedOperandMixin(DataFrameOperandMixin): + __slots__ = () + + @classmethod + def _is_head(cls, index0): + return ( + (index0.start is None or index0.start == 0) + and index0.stop is not None + and index0.stop > 0 + ) + + @classmethod + def _is_tail(cls, index0): + return index0.start is not None and index0.start < 0 and index0.stop is None + + @classmethod + def _is_indexes_head_or_tail(cls, indexes): + index0 = indexes[0] + if not isinstance(index0, slice): + # have to be slice + return False + if index0.step is not None and index0.step != 1: + return False + if len(indexes) == 2: + if not isinstance(indexes[1], slice): + return False + if indexes[1] != slice(None): + return False + if cls._is_tail(index0): + # tail + return True + if cls._is_head(index0): + # head + return True + return False + + @classmethod + def _need_tile_head_tail(cls, op): + # first, the input DataFrame should + # have unknown chunk shapes on the index axis, + inp = op.input + if not any(np.isnan(s) for s in inp.nsplits[0]): + return False + + # if input is a DataFrame, + # should have 1 chunk on columns axis + if inp.ndim > 1 and inp.chunk_shape[1] > 1: + return False + + return cls._is_indexes_head_or_tail(op.indexes) + + @classmethod + def _tile_head_tail(cls, op): + from ..merge import DataFrameConcat + + inp = op.input + out = op.outputs[0] + combine_size = options.combine_size + + chunks = inp.chunks + + new_chunks = [] + for c in chunks: + chunk_op = op.copy().reset_key() + params = out.params + params["index"] = c.index + params["shape"] = c.shape if np.isnan(c.shape[0]) else out.shape + new_chunks.append(chunk_op.new_chunk([c], kws=[params])) + chunks = new_chunks + + while len(chunks) > 1: + new_size = ceildiv(len(chunks), combine_size) + new_chunks = [] + for i in range(new_size): + in_chunks = chunks[combine_size * i : combine_size * (i + 1)] + chunk_index = (i, 0) if in_chunks[0].ndim == 2 else (i,) + if len(inp.shape) == 1: + shape = (sum(c.shape[0] for c in in_chunks),) + else: + shape = (sum(c.shape[0] for c in in_chunks), in_chunks[0].shape[1]) + concat_chunk = DataFrameConcat( + axis=0, output_types=in_chunks[0].op.output_types + ).new_chunk(in_chunks, index=chunk_index, shape=shape) + chunk_op = op.copy().reset_key() + params = out.params + params["index"] = chunk_index + params["shape"] = ( + in_chunks[0].shape if np.isnan(in_chunks[0].shape[0]) else out.shape + ) + new_chunks.append(chunk_op.new_chunk([concat_chunk], kws=[params])) + chunks = new_chunks + + new_op = op.copy() + params = out.params + params["nsplits"] = tuple((s,) for s in out.shape) + params["chunks"] = chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + def can_be_optimized(self): + return ( + self._is_indexes_head_or_tail(self._indexes) + and self._is_head(self._indexes[0]) + and self._indexes[0].stop <= options.optimize.head_optimize_threshold + ) + + @classmethod + def tile(cls, op): + if cls._need_tile_head_tail(op): + return cls._tile_head_tail(op) + + +class DataFrameIlocGetItem(DataFrameOperand, HeadTailOptimizedOperandMixin): + _op_type_ = OperandDef.DATAFRAME_ILOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + if not self.output_types: + self.output_types = [OutputType.dataframe] + + @property + def input(self): + return self._input + + @property + def indexes(self): + return 
self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = indexes + + def __call__(self, df): + # Note [Fancy Index of Numpy and Pandas] + # + # The numpy and pandas.iloc have different semantic when processing fancy index: + # + # >>> np.ones((3,3))[[1,2],[1,2]] + # array([1., 1.]) + # + # >>> pd.DataFrame(np.ones((3,3))).iloc[[1,2],[1,2]] + # 1 2 + # 1 1.0 1.0 + # 2 1.0 1.0 + # + # Thus, we processing the index along two axis of DataFrame separately. + shape0 = tuple(calc_shape((df.shape[0],), (self.indexes[0],))) + shape1 = tuple(calc_shape((df.shape[1],), (self.indexes[1],))) + + inputs = [df] + [ + index for index in self._indexes if isinstance(index, ENTITY_TYPE) + ] + + # NB: pandas only compresses the result to series when index on one of axis is integral + if isinstance(self.indexes[1], Integral): + shape = shape0 + dtype = df.dtypes.iloc[self.indexes[1]] + index_value = indexing_index_value(df.index_value, self.indexes[0]) + if isinstance(self.indexes[0], Integral): + # scalar + return self.new_scalar(inputs, dtype=dtype) + else: + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=df.dtypes.index[self.indexes[1]], + ) + elif isinstance(self.indexes[0], Integral): + shape = shape1 + dtype = find_common_type(list(df.dtypes.iloc[self.indexes[1]].values)) + index_value = indexing_index_value(df.columns_value, self.indexes[1]) + return self.new_series( + inputs, shape=shape, dtype=dtype, index_value=index_value + ) + else: + return self.new_dataframe( + inputs, + shape=shape0 + shape1, + dtypes=df.dtypes.iloc[self.indexes[1]], + index_value=indexing_index_value(df.index_value, self.indexes[0]), + columns_value=indexing_index_value( + df.columns_value, self.indexes[1], store_data=True + ), + ) + + # FIXME The view behavior of DataFrame.iloc + # + # The pandas's iloc has complicated behavior about whether to create a view or not, it depends + # on the further operation on the view, as illustrated by the following example: + # + # >>> df = pd.DataFrame([[1,2], [3,4]]) + # >>> x = df.iloc[:] + # >>> df + # 0 1 + # 0 1 2 + # 1 3 4 + # >>> x + # 0 1 + # 0 1 2 + # 1 3 4 + # + # >>> x.iloc[:] = 1000 + # >>> x + # 0 1 + # 0 1000 1000 + # 1 1000 1000 + # df + # 0 1 + # 0 1000 1000 + # 1 1000 1000 + # + # >>> x.iloc[:] = 2000.0 + # >>> x + # 0 1 + # 0 2000.0 2000.0 + # 1 2000.0 2000.0 + # >>> df + # 0 1 + # 0 1000 1000 + # 1 1000 1000 + + @classmethod + def tile(cls, op): + tileds = super().tile(op) + if tileds is not None: + return tileds + + handler = DataFrameIlocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + df = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + r = df.iloc[indexes] + if isinstance(r, pd.Series) and r.dtype != chunk.dtype: + r = r.astype(chunk.dtype) + if is_cudf(r): # pragma: no cover + r = r.copy() + ctx[chunk.key] = r + + +class DataFrameIlocSetItem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_ILOC_SETITEM + + _indexes = ListField("indexes") + _value = AnyField("value") + + def __init__( + self, indexes=None, value=None, 
gpu=None, sparse=False, output_types=None, **kw + ): + super().__init__( + _indexes=indexes, + _value=value, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + **kw, + ) + if not self.output_types: + self.output_types = [OutputType.dataframe] + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def __call__(self, df): + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + # See Note [Fancy Index of Numpy and Pandas] + tensor0 = yield from recursive_tile( + empty(in_df.shape[0], chunk_size=(in_df.nsplits[0],))[op.indexes[0]] + ) + tensor1 = yield from recursive_tile( + empty(in_df.shape[1], chunk_size=(in_df.nsplits[1],))[op.indexes[1]] + ) + + chunk_mapping = { + c0.inputs[0].index + c1.inputs[0].index: (c0, c1) + for c0, c1 in itertools.product(tensor0.chunks, tensor1.chunks) + } + + out_chunks = [] + for chunk in in_df.chunks: + if chunk.index not in chunk_mapping: + out_chunks.append(chunk) + else: + chunk_op = op.copy().reset_key() + index_chunk, column_chunk = chunk_mapping[chunk.index] + chunk_op._indexes = [ + index_chunk.op.indexes[0], + column_chunk.op.indexes[0], + ] + chunk_op._value = op.value + out_chunk = chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtypes=chunk.dtypes, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=in_df.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + r = ctx[op.inputs[0].key].copy(deep=True) + r.iloc[tuple(op.indexes)] = op.value + ctx[chunk.key] = r + + +class SeriesIlocGetItem(DataFrameOperand, HeadTailOptimizedOperandMixin): + _op_module_ = "series" + _op_type_ = OperandDef.DATAFRAME_ILOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + if not self.output_types: + self.output_types = [OutputType.series] + + @property + def input(self): + return self._input + + @property + def indexes(self): + return self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = indexes + + @classmethod + def tile(cls, op): + tileds = super().tile(op) + if tileds is not None: + return tileds + + handler = DataFrameIlocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + series = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + if hasattr(series, "iloc"): + ctx[chunk.key] = series.iloc[indexes] + else: + # index, only happen for calling from rechunk + ctx[chunk.key] = series[indexes if len(indexes) > 1 else indexes[0]] 
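+    # A minimal, hedged sketch of the user-facing behaviour this operand backs
+    # (doctest-style; assumes a default local session; the sample data is
+    # illustrative only and follows the `md` alias used in docstrings below):
+    #
+    #   >>> import mars.dataframe as md
+    #   >>> s = md.Series([10, 20, 30, 40], chunk_size=2)
+    #   >>> s.iloc[1:3].execute()   # positional slice crossing a chunk boundary
+    #   1    20
+    #   2    30
+    #   dtype: int64
+    #
+    # At execution time each chunk applies plain ``pandas.Series.iloc`` with the
+    # per-chunk indexes computed during tiling, as in ``execute`` above.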
+ + def __call__(self, series): + if isinstance(self._indexes[0], Integral): + return self.new_scalar([series], dtype=series.dtype) + else: + shape = tuple(calc_shape(series.shape, self.indexes)) + index_value = indexing_index_value(series.index_value, self.indexes[0]) + inputs = [series] + [ + index for index in self._indexes if isinstance(index, ENTITY_TYPE) + ] + return self.new_series( + inputs, + shape=shape, + dtype=series.dtype, + index_value=index_value, + name=series.name, + ) + + +class SeriesIlocSetItem(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "series" + _op_type_ = OperandDef.DATAFRAME_ILOC_SETITEM + + _indexes = ListField("indexes") + _value = AnyField("value") + + def __init__(self, indexes=None, value=None, gpu=None, sparse=False, **kw): + super().__init__( + _indexes=indexes, + _value=value, + gpu=gpu, + sparse=sparse, + _output_types=[OutputType.series], + **kw, + ) + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def __call__(self, series): + return self.new_series( + [series], + shape=series.shape, + dtype=series.dtype, + index_value=series.index_value, + name=series.name, + ) + + @classmethod + def tile(cls, op): + in_series = op.inputs[0] + out = op.outputs[0] + + # Reuse the logic of fancy indexing in tensor module. + tensor = yield from recursive_tile( + empty(in_series.shape, chunk_size=in_series.nsplits)[op.indexes[0]] + ) + + chunk_mapping = dict((c.inputs[0].index, c) for c in tensor.chunks) + + out_chunks = [] + for chunk in in_series.chunks: + if chunk.index not in chunk_mapping: + out_chunks.append(chunk) + else: + chunk_op = op.copy().reset_key() + index_chunk = chunk_mapping[chunk.index] + chunk_op._indexes = index_chunk.op.indexes + chunk_op._value = op.value + out_chunk = chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + dtype=chunk.dtype, + index_value=chunk.index_value, + name=chunk.name, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + shape=out.shape, + dtype=out.dtype, + index_value=out.index_value, + name=out.name, + chunks=out_chunks, + nsplits=in_series.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + r = ctx[op.inputs[0].key].copy(deep=True) + r.iloc[tuple(op.indexes)] = op.value + ctx[chunk.key] = r + + +class IndexIlocGetItem(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "index" + _op_type_ = OperandDef.DATAFRAME_ILOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + if not self.output_types: + self.output_types = [OutputType.index] + + @property + def input(self): + return self._input + + @property + def indexes(self): + return self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = indexes + + @classmethod + def tile(cls, op): + handler = DataFrameIlocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + idx = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if 
hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + if len(indexes) == 1: + indexes = indexes[0] + ctx[chunk.key] = idx[indexes] + + def __call__(self, idx): + if isinstance(self._indexes[0], Integral): + return self.new_scalar([idx], dtype=idx.dtype) + else: + shape = tuple(calc_shape(idx.shape, self.indexes)) + index_value = indexing_index_value(idx.index_value, self.indexes[0]) + inputs = [idx] + [ + index for index in self._indexes if isinstance(index, ENTITY_TYPE) + ] + return self.new_index( + inputs, + shape=shape, + dtype=idx.dtype, + index_value=index_value, + name=idx.name, + ) + + +def index_getitem(idx, indexes): + op = IndexIlocGetItem(indexes=process_iloc_indexes(idx, indexes)) + return op(idx) + + +def index_setitem(_idx, *_): + raise TypeError("Index does not support mutable operations") + + +def iloc(a): + return DataFrameIloc(a) + + +def head(a, n=5): + """ + Return the first `n` rows. + + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + same type as caller + The first `n` rows of the caller object. + + See Also + -------- + DataFrame.tail: Returns the last `n` rows. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df.execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the first 5 lines + + >>> df.head().execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + + Viewing the first `n` lines (three in this case) + + >>> df.head(3).execute() + animal + 0 alligator + 1 bee + 2 falcon + + For negative values of `n` + + >>> df.head(-3).execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + """ + return DataFrameIloc(a)[0:n] + + +def tail(a, n=5): + """ + Return the last `n` rows. + + This function returns last `n` rows from the object based on + position. It is useful for quickly verifying data, for example, + after sorting or appending rows. + + For negative values of `n`, this function returns all rows except + the first `n` rows, equivalent to ``df[n:]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + type of caller + The last `n` rows of the caller object. + + See Also + -------- + DataFrame.head : The first `n` rows of the caller object. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 
'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df.execute() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last 5 lines + + >>> df.tail().execute() + animal + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last `n` lines (three in this case) + + >>> df.tail(3).execute() + animal + 6 shark + 7 whale + 8 zebra + + For negative values of `n` + + >>> df.tail(-3).execute() + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + """ + return DataFrameIloc(a)[-n:] diff --git a/python/xorbits/_mars/dataframe/indexing/index_lib.py b/python/xorbits/_mars/dataframe/indexing/index_lib.py new file mode 100644 index 000000000..bc701e40c --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/index_lib.py @@ -0,0 +1,1202 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections import namedtuple +from typing import List, Tuple, Union + +import numpy as np +import pandas as pd +from pandas.core.dtypes.cast import find_common_type + +from ...core import Chunk, OutputType, Tileable, recursive_tile +from ...core.operand import OperandStage +from ...tensor.core import TENSOR_TYPE +from ...tensor.indexing.index_lib import ChunkIndexInfo as ChunkIndexInfoBase +from ...tensor.indexing.index_lib import ( + IndexesHandler, + IndexHandler, + IndexHandlerContext, + IndexInfo, + IndexType, + IntegralIndexHandler, +) +from ...tensor.indexing.index_lib import ( + NDArrayBoolIndexHandler as NDArrayBoolIndexHandlerBase, +) +from ...tensor.indexing.index_lib import SliceIndexHandler as SliceIndexHandlerBase +from ...tensor.indexing.index_lib import ( + TensorBoolIndexHandler as TensorBoolIndexHandlerBase, +) +from ...tensor.utils import ( + calc_pos, + calc_sliced_size, + filter_inputs, + normalize_chunk_sizes, + slice_split, + split_indexes_into_chunks, + to_numpy, +) +from ...utils import classproperty, has_unknown_shape, is_full_slice +from ..core import SERIES_CHUNK_TYPE, SERIES_TYPE, IndexValue +from ..utils import parse_index +from .utils import convert_labels_into_positions + +ChunkIndexAxisInfo = namedtuple( + "chunk_index_axis_info", + ["output_axis_index", "processed_index", "output_shape", "index_value", "dtypes"], +) + + +class ChunkIndexInfo(ChunkIndexInfoBase): + def __init__(self): + super().__init__() + self.index_values = [] + self.dtypes = None + + def set(self, info: ChunkIndexAxisInfo): + super().set(info) + if getattr(info, "index_value", None) is not None: + self.index_values.append(info.index_value) + if getattr(info, "dtypes", None) is not None: + self.dtypes = info.dtypes + + +class FancyIndexInfo(IndexInfo): + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + super().__init__(index_type, input_axis, output_axis, raw_index, handler) + + # extra info for DataFrame fancy index + # split info + # - chunk_index_to_fancy_index_arrays + # - 
chunk_index_to_raw_positions + # - is_fancy_index_asc_sorted + self.split_info = None + + +class LabelFancyIndexInfo(IndexInfo): + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + super().__init__(index_type, input_axis, output_axis, raw_index, handler) + + # store chunk_index -> labels + self.chunk_index_to_labels = None + self.is_label_asc_sorted = None + + +class DataFrameIndexHandlerContext(IndexHandlerContext): + def set_tileable(self, tileable: Tileable): + for chunk in tileable.chunks: + self.chunk_index_to_info[chunk.index] = ChunkIndexInfo() + + def concat_chunks(self, chunks: List[Chunk], axis: Union[Tuple[int], int]) -> Chunk: + dataframe_op_type = type(chunks[0].op) + # create tileable from chunks + concat_tileable = dataframe_op_type.create_tileable_from_chunks( + chunks, inputs=chunks + ) + # concat chunks + chunk = dataframe_op_type.concat_tileable_chunks(concat_tileable).chunks[0] + if chunk.ndim > 1 and ( + (isinstance(axis, tuple) and len(axis) == 1) or isinstance(axis, int) + ): + # adjust index and axis + axis = axis[0] if isinstance(axis, tuple) else axis + chunk.op._axis = axis + chunk_index = list(chunk.index) + chunk_index[1 - axis] = chunks[0].index[1 - axis] + chunk._index = tuple(chunk_index) + return chunk + + def create_chunk( + self, chunk_index: Tuple[int], chunk_index_info: ChunkIndexInfo + ) -> Chunk: + chunk_op = self.op.copy().reset_key() + chunk_op._indexes = indexes = chunk_index_info.indexes + chunk_op.stage = OperandStage.map + + chunk_input = self.tileable.cix[chunk_index] + chunk_inputs = filter_inputs([chunk_input] + indexes) + + kw = {} + kw["shape"] = shape = tuple(chunk_index_info.output_chunk_shape) + kw["index"] = tuple(chunk_index_info.output_chunk_index) + index_values = chunk_index_info.index_values + if len(shape) == 0: + # scalar + chunk_op.output_types = [OutputType.scalar] + kw["dtype"] = self.op.outputs[0].dtype + elif len(shape) == 1: + # Series or Index + chunk_op.output_types = ( + [OutputType.index] + if chunk_op._op_module_ == "index" + else [OutputType.series] + ) + kw["index_value"] = index_values[0] + kw["dtype"] = self.op.outputs[0].dtype + kw["name"] = getattr(self.op.outputs[0], "name", None) + else: + # dataframe + chunk_op.output_types = [OutputType.dataframe] + kw["index_value"] = index_values[0] + kw["columns_value"] = index_values[1] + kw["dtypes"] = chunk_index_info.dtypes + + return chunk_op.new_chunk(chunk_inputs, kws=[kw]) + + +class SliceIndexHandler(SliceIndexHandlerBase): + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + slc = index + + kw = { + "output_axis_index": output_axis_index, + "processed_index": slc, + "output_shape": output_shape, + "dtypes": None, + } + if index_info.input_axis == 0: + if is_full_slice(slc): + kw["index_value"] = chunk_input.index_value + else: + index = chunk_input.index_value.to_pandas() + kw["index_value"] = parse_index( + index[slc], chunk_input, slc, store_data=False + ) + else: + assert index_info.input_axis == 1 + index = chunk_input.columns_value.to_pandas() + # do not store index value if output axis is 0 + store_data = True if index_info.output_axis == 1 else False + kw["index_value"] = parse_index(index[slc], store_data=store_data) + kw["dtypes"] = 
chunk_input.dtypes[slc] + + chunk_index_info.set(ChunkIndexAxisInfo(**kw)) + + +class LabelSliceIndexHandler(IndexHandler): + def accept(cls, raw_index): + return isinstance(raw_index, slice) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.label_slice, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + @staticmethod + def _slice_all(slc): + return ( + slc.start is None + and slc.stop is None + and (slc.step is None or slc.step == 1) + ) + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + if isinstance(tileable, SERIES_TYPE): + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + + # check if chunks have unknown shape + if ( + not self._slice_all(index_info.raw_index) + and index_value.has_value() + and any(np.isnan(ns) for ns in tileable.nsplits[input_axis]) + ): # pragma: no cover + yield + + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + slc = index + + kw = { + "output_axis_index": output_axis_index, + "processed_index": slc, + "output_shape": output_shape, + "dtypes": None, + } + if index_info.input_axis == 0: + if is_full_slice(index): + kw["index_value"] = chunk_input.index_value + else: + index = chunk_input.index_value.to_pandas() + start, stop = index.slice_locs( + slc.start, slc.stop, slc.step, kind="loc" + ) + pos_slc = slice(start, stop, slc.step) + kw["index_value"] = parse_index( + index[pos_slc], chunk_input, slc, store_data=False + ) + else: + assert index_info.input_axis == 1 + dtypes = chunk_input.dtypes + # do not store index value if output axis is 0 + store_data = True if index_info.output_axis == 1 else False + columns = dtypes.loc[slc].index + kw["index_value"] = parse_index(columns, store_data=store_data) + kw["dtypes"] = chunk_input.dtypes[slc] + + chunk_index_info.set(ChunkIndexAxisInfo(**kw)) + + def _process_slice_all_index( + self, + tileable: Tileable, + index_info: IndexInfo, + input_axis: int, + context: IndexHandlerContext, + ) -> None: + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + size = tileable.nsplits[input_axis][i] + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + i, + slice(None), + size, + ) + + def _process_has_value_index( + self, + tileable: Tileable, + index_info: IndexInfo, + index_value, + input_axis: int, + context: IndexHandlerContext, + ) -> None: + pd_index = index_value.to_pandas() + # turn label-based slice into position-based slice + start, end = pd_index.slice_locs( + index_info.raw_index.start, + index_info.raw_index.stop, + index_info.raw_index.step, + kind="loc", + ) + slc = slice(start, end, index_info.raw_index.step) + + cum_nsplit = [0] + np.cumsum(tileable.nsplits[index_info.input_axis]).tolist() + # split position-based slice into chunk slices + effected_i_to_slc = slice_split(slc, tileable.nsplits[index_info.input_axis]) + is_reversed = (slc.step or 0) < 0 + output_axis_index_range = ( + 
range(len(effected_i_to_slc)) + if not is_reversed + else range(len(effected_i_to_slc) - 1, -1, -1) + ) + other_index_to_iter = dict() + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + other_index = chunk_index[:input_axis] + chunk_index[input_axis + 1 :] + size = tileable.nsplits[input_axis][i] + if i not in effected_i_to_slc: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + chunk_slc = effected_i_to_slc[i] + output_shape = calc_sliced_size(size, chunk_slc) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = iter(output_axis_index_range) + output_axis_index = next(other_index_to_iter[other_index]) + + # turn position-based slice back into label-based slice + start = chunk_slc.start + if start is not None: + abs_start = cum_nsplit[i] + start + label_start = pd_index[abs_start] + else: + label_start = None + stop = chunk_slc.stop + if stop is not None: + abs_stop = cum_nsplit[i] + stop - 1 # label slice include the stop + label_stop = ( + pd_index[abs_stop] if abs_stop < len(pd_index) else None + ) + else: + label_stop = None + + label_slc = slice(label_start, label_stop, chunk_slc.step) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + label_slc, + output_shape, + ) + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + if isinstance(tileable, SERIES_TYPE): + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + + if self._slice_all(index_info.raw_index): + self._process_slice_all_index(tileable, index_info, input_axis, context) + elif index_value.has_value(): + self._process_has_value_index( + tileable, index_info, index_value, input_axis, context + ) + else: + other_index_to_iter = dict() + # slice on all chunks on the specified axis + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + other_index = chunk_index[:1] if input_axis == 1 else chunk_index[1:] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + index_info.raw_index, + np.nan, + ) + + +class LabelIndexHandler(IndexHandler): + def accept(cls, raw_index): + # accept type other than slice, ndarray and tensor + return not isinstance(raw_index, (slice, np.ndarray, TENSOR_TYPE)) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + tileable = context.tileable + input_axis = context.input_axis + if tileable.ndim == 2: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + else: + index_value = tileable.index_value + + if index_value.has_value(): + pd_index = index_value.to_pandas() + loc = pd_index.get_loc(raw_index) + if isinstance(loc, slice): + # if is slice, means index not unique, but monotonic + # just call LabelSliceIndexHandler + new_raw_index = slice(raw_index, raw_index) + return LabelSliceIndexHandler.get_instance().parse( + new_raw_index, context + ) + elif isinstance(loc, np.ndarray): + # bool indexing, non unique, and not monotonic + return NDArrayBoolIndexHandler.get_instance().parse(loc, context) + else: + return 
LabelNDArrayFancyIndexHandler.get_instance().parse( + raw_index, context + ) + + info = IndexInfo( + IndexType.label, context.input_axis, context.output_axis, raw_index, self + ) + context.input_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # if index has value on input axis, + # label will be converted to position, + # thus chunks cannot have unknown shape on this axis + tileable = context.tileable + input_axis = index_info.input_axis + if tileable.ndim == 1: + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + if index_value.has_value(): + if any( + np.isnan(ns) for ns in tileable.nsplits[input_axis] + ): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + if tileable.ndim == 1: + index_value = tileable.index_value + else: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + + if index_value.has_value(): + pd_index = index_value.to_pandas() + loc = pd_index.get_loc(index_info.raw_index) + + # other situations have been delegated to different handlers + assert isinstance(loc, int) + + effected_i_to_slc = slice_split( + loc, tileable.nsplits[index_info.input_axis] + ) + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + if i not in effected_i_to_slc: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=None, + processed_index=index_info.raw_index, + output_shape=None, + index_value=None, + dtypes=None, + ) + ) + + +class DataFrameIndexHandler: + @classmethod + def _calc_dtypes(cls, dtypes, index, context: IndexHandlerContext): + if getattr(context.op, "can_index_miss", False): + # reindex + return dtypes.reindex(index).fillna(np.dtype(np.float64)) + else: + # loc, iloc + return getattr(dtypes, cls.kind)[index] + + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + + dtypes = None + if index_info.input_axis == 0: + index_value = parse_index( + chunk_input.index_value.to_pandas()[:0], + chunk_input, + index, + store_data=False, + ) + else: + dtypes = cls._calc_dtypes(chunk_input.dtypes, index, context) + columns = dtypes.index + index_value = parse_index(columns, store_data=True) + + info = ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + index_value=index_value, + dtypes=dtypes, + ) + chunk_index_info.set(info) + + +class NDArrayBoolIndexHandler(NDArrayBoolIndexHandlerBase): + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + + if index_info.input_axis == 0: + dtype = chunk_input.index_value.to_pandas().dtype + index_value = parse_index( + pd.Index([], dtype=dtype), chunk_input, index, store_data=False + ) + dtypes = 
None + else: + pd_index = chunk_input.columns_value.to_pandas() + filtered_index = pd_index[index] + index_value = parse_index(filtered_index, store_data=True) + dtypes = chunk_input.dtypes[index] + + info = ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + index_value=index_value, + dtypes=dtypes, + ) + chunk_index_info.set(info) + + +class TensorBoolIndexHandler(TensorBoolIndexHandlerBase): + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + tileable = context.tileable + chunk_input = tileable.cix[chunk_index] + + assert ( + index_info.input_axis == 0 + ), "bool indexing on axis columns cannot be tensor" + + index_value = parse_index( + pd.Index([], chunk_input.index_value.to_pandas().dtype), + chunk_input, + index, + store_data=False, + ) + + info = ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + index_value=index_value, + dtypes=None, + ) + chunk_index_info.set(info) + + +class _FancyIndexHandler(DataFrameIndexHandler, IndexHandler): + @classproperty + def kind(self): # pylint: disable=no-self-use + return "iloc" + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = FancyIndexInfo( + IndexType.fancy_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + +class NDArrayFancyIndexHandler(_FancyIndexHandler): + def accept(cls, raw_index): + # raw index like list, and pd.Series + # would have been converted to ndarray or tensor already + return isinstance(raw_index, np.ndarray) and raw_index.dtype != np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + if has_unknown_shape(tileable): # pragma: no cover + yield + + # split raw index into chunks on the given axis + split_info = split_indexes_into_chunks( + [tileable.nsplits[index_info.input_axis]], [index_info.raw_index] + ) + index_info.split_info = split_info + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + chunk_index_to_fancy_index_arrays = index_info.split_info[0] + + other_index_to_iter = dict() + chunk_index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in chunk_index_to_info.items(): + i = chunk_index[index_info.input_axis] + fancy_index_array = chunk_index_to_fancy_index_arrays[(i,)][0] + + if fancy_index_array.size == 0: + # not effected + del context.chunk_index_to_info[chunk_index] + continue + + other_index = ( + chunk_index[:1] if index_info.input_axis == 1 else chunk_index[1:] + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = fancy_index_array.shape[0] + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + fancy_index_array, + output_axis_shape, + ) + + @classmethod + def need_postprocess(cls, index_info: IndexInfo, context: IndexHandlerContext): + tileable = context.tileable + + if tileable.chunk_shape[index_info.input_axis] == 1: + # if tileable only has 1 chunk on this axis + # do not need postprocess + return False + # if ascending sorted, 
no need to postprocess + return not index_info.split_info[2] + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # could be 2 fancy indexes at most + fancy_indexes = context.get_indexes(index_info.index_type) + i_fancy_index = fancy_indexes.index(index_info) + need_postprocesses = [ + fancy_index.handler.need_postprocess(fancy_index, context) + for fancy_index in fancy_indexes + ] + + if not need_postprocesses[i_fancy_index]: + # do not need postprocess + return + + if ( + i_fancy_index == 0 + and len(fancy_indexes) == 2 + and need_postprocesses[1] + and isinstance(fancy_indexes[1].raw_index, np.ndarray) + ): + # check if need postprocess if 2 fancy indexes and now it's the first, + # if so, skip postprocess for this one, + # and do MapReduce just once for the second postprocess + return + + chunks, nsplits = context.out_chunks, context.out_nsplits + index_to_chunks = {c.index: c for c in chunks} + + to_concat_axes = tuple( + fancy_index.output_axis + for i, fancy_index in enumerate(fancy_indexes) + if need_postprocesses[i] + ) + reorder_indexes = [ + calc_pos(fancy_index.raw_index.shape, fancy_index.split_info[1]) + for i, fancy_index in enumerate(fancy_indexes) + if need_postprocesses[i] + ] + new_out_chunks = [] + for chunk_index in itertools.product( + *( + range(len(ns)) + for ax, ns in enumerate(nsplits) + if ax not in to_concat_axes + ) + ): + if len(to_concat_axes) == 2: + to_concat_chunks = chunks + else: + to_concat_chunks = [] + for i in range(len(nsplits[to_concat_axes[0]])): + to_concat_index = list(chunk_index) + to_concat_index.insert(to_concat_axes[0], i) + to_concat_chunks.append(index_to_chunks[tuple(to_concat_index)]) + concat_chunk = context.concat_chunks(to_concat_chunks, to_concat_axes) + reorder_chunk = self._create_reorder_chunk( + concat_chunk, to_concat_axes, reorder_indexes, context + ) + new_out_chunks.append(reorder_chunk) + + new_nsplits = list(nsplits) + for fancy_index in fancy_indexes: + new_nsplits[fancy_index.output_axis] = (fancy_index.raw_index.shape[0],) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + @classmethod + def _create_reorder_chunk( + cls, + concat_chunk: Chunk, + to_concat_axes: Tuple, + reorder_indexes: List, + context: IndexHandlerContext, + ): + reorder_chunk_op = context.op.copy().reset_key() + indexes = [slice(None)] * concat_chunk.ndim + for ax, reorder_index in zip(to_concat_axes, reorder_indexes): + indexes[ax] = reorder_index + reorder_chunk_op._indexes = indexes + + params = concat_chunk.params + if isinstance(concat_chunk, SERIES_CHUNK_TYPE): + if concat_chunk.index_value.has_value(): + # if concat chunk's index has value, we could calculate the new index + reorder_index = concat_chunk.index_value.to_pandas()[reorder_indexes[0]] + params["index_value"] = parse_index(reorder_index, store_data=True) + else: + params["index_value"] = parse_index( + concat_chunk.index_value.to_pandas(), indexes + ) + return reorder_chunk_op.new_chunk([concat_chunk], kws=[params]) + else: + if 0 in to_concat_axes: + if concat_chunk.index_value.has_value(): + # if concat chunk's index has value, and index on axis 0, + # we could calculate the new index + reorder_index = concat_chunk.index_value.to_pandas()[ + reorder_indexes[0] + ] + params["index_value"] = parse_index(reorder_index, store_data=True) + else: + params["index_value"] = parse_index( + concat_chunk.index_value.to_pandas(), indexes[0] + ) + if 1 in to_concat_axes: + reorder_columns = concat_chunk.columns_value.to_pandas()[ + 
reorder_indexes[-1] + ] + params["columns_value"] = parse_index(reorder_columns, store_data=True) + params["dtypes"] = concat_chunk.dtypes[reorder_indexes[-1]] + + return reorder_chunk_op.new_chunk([concat_chunk], kws=[params]) + + +class _LabelFancyIndexHandler(DataFrameIndexHandler, IndexHandler): + @classproperty + def kind(self): # pylint: disable=no-self-use + return "loc" + + +class LabelNDArrayFancyIndexHandler(_LabelFancyIndexHandler): + def accept(cls, raw_index): + return isinstance(raw_index, np.ndarray) and raw_index.dtype != np.bool_ + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = LabelFancyIndexInfo( + IndexType.label_fancy_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + if not np.isscalar(raw_index): + context.output_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + op = context.op + + input_axis = index_info.input_axis + + # check unknown shape + if any(np.isnan(s) for s in tileable.nsplits[input_axis]): + yield + + if tileable.ndim == 2: + index_value = [tileable.index_value, tileable.columns_value][input_axis] + else: + index_value = tileable.index_value + cum_nsplit = [0] + np.cumsum(tileable.nsplits[input_axis]).tolist() + if not op.can_index_miss and index_value.has_value(): + # df.loc cannot have missed index, reindex can have + # thus for reindex, do not try to resolve by converting to positions + # turn label-based fancy index into position-based + pd_index = index_value.to_pandas() + positions = convert_labels_into_positions(pd_index, index_info.raw_index) + split_info = split_indexes_into_chunks( + [tileable.nsplits[input_axis]], [positions] + ) + chunk_index_to_pos = split_info[0] + is_asc_sorted = split_info[-1] + + # convert back to labels for chunk_index + chunk_index_to_labels = dict() + for chunk_index, pos in chunk_index_to_pos.items(): + # chunk_index and pos are all list with 1 element + abs_pos = pos[0] + cum_nsplit[chunk_index[0]] + if isinstance(pd_index, pd.RangeIndex) and len(abs_pos) == 0: + chunk_labels = np.array([], dtype=pd_index.dtype) + else: + chunk_labels = to_numpy(pd_index[abs_pos]) + chunk_index_to_labels[chunk_index[0]] = chunk_labels + + index_info.is_label_asc_sorted = is_asc_sorted + index_info.chunk_index_to_labels = chunk_index_to_labels + else: + index = index_info.raw_index + if np.isscalar(index): + # delegation from label index handler + index = np.atleast_1d(index) + # does not know the right positions, need postprocess always + index_info.is_label_asc_sorted = False + # do df.loc on each chunk + index_info.chunk_index_to_labels = { + i: index for i in range(tileable.chunk_shape[input_axis]) + } + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + chunk_index_to_labels = index_info.chunk_index_to_labels + full_label_size = sum(labels.size for labels in chunk_index_to_labels.values()) + + other_index_to_iter = dict() + chunk_index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in chunk_index_to_info.items(): + i = chunk_index[input_axis] + chunk_labels = chunk_index_to_labels[i] + size = chunk_labels.size + + if size == 0 and full_label_size > 0 and tileable.shape[0] > 0: + # not effected when + # 1) tileable not empty + # 2) full index not empty + # 3) no index chosen for this chunk + del 
context.chunk_index_to_info[chunk_index] + continue + + if ( + np.isscalar(index_info.raw_index) + and isinstance(tileable.index_value.value, IndexValue.DatetimeIndex) + and isinstance(chunk_labels[0], str) + ): + # special case when index is DatetimeIndex and loc by string + # convert back list to scalar because if keep list, + # KeyError will always happen + chunk_labels = chunk_labels[0].item() + + other_index = chunk_index[:1] if input_axis == 1 else chunk_index[1:] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = size + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + chunk_labels, + output_axis_shape, + ) + + @classmethod + def need_postprocess(cls, index_info: IndexInfo, context: IndexHandlerContext): + # if ascending sorted, no need to postprocess + return not index_info.is_label_asc_sorted + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if not self.need_postprocess(index_info, context): + # do not need postprocess + return + + chunks, nsplits = context.out_chunks, context.out_nsplits + index_to_chunks = {c.index: c for c in chunks} + + axis = index_info.output_axis + new_out_chunks = [] + chunk_axis_shapes = dict() + for chunk_index in itertools.product( + *(range(len(ns)) for ax, ns in enumerate(nsplits) if ax != axis) + ): + to_concat_chunks = [] + for i in range(len(nsplits[axis])): + if axis == 0: + to_concat_index = (i,) + chunk_index + else: + to_concat_index = chunk_index + (i,) + to_concat_chunks.append(index_to_chunks[to_concat_index]) + concat_chunk = context.concat_chunks(to_concat_chunks, axis) + chunk_op = context.op.copy().reset_key() + indexes = [slice(None)] * len(nsplits) + indexes[axis] = index_info.raw_index + params = concat_chunk.params + if np.isscalar(index_info.raw_index): + assert axis == 0 + if "columns_value" in params: + params["index_value"] = params.pop("columns_value") + params["dtype"] = find_common_type(params["dtypes"].tolist()) + del params["dtypes"] + if getattr(context.op.outputs[0], "name", None) is not None: + params["name"] = context.op.outputs[0].name + if len(params["index"]) == chunks[0].ndim: + index = list(params["index"]) + index.pop(index_info.output_axis) + params["index"] = tuple(index) + shape = list(params["shape"]) + shape.pop(index_info.output_axis) + params["shape"] = tuple(shape) + if context.op.outputs[0].ndim == 0: + del params["index_value"] + elif axis == 0: + pd_index = pd.Index(index_info.raw_index) + params["index_value"] = parse_index(pd_index, store_data=False) + shape = list(params["shape"]) + shape[0] = len(pd_index) + params["shape"] = shape + else: + if context.op.can_index_miss: + # reindex + params["dtypes"] = dtypes = to_concat_chunks[0].dtypes + else: + params["dtypes"] = dtypes = concat_chunk.dtypes.loc[ + index_info.raw_index + ] + params["columns_value"] = parse_index(dtypes.index, store_data=True) + shape = list(params["shape"]) + shape[1] = len(dtypes) + params["shape"] = tuple(shape) + chunk_op._indexes = indexes + chunk_op.stage = OperandStage.agg + out_chunk = chunk_op.new_chunk([concat_chunk], kws=[params]) + if len(out_chunk.shape) != 0: + chunk_axis_shapes[out_chunk.index[axis]] = out_chunk.shape[axis] + new_out_chunks.append(out_chunk) + + new_nsplits = list(nsplits) + if np.isscalar(index_info.raw_index): + new_nsplits = new_nsplits[:axis] + new_nsplits[axis + 1 :] 
+ else: + new_nsplits[axis] = (sum(chunk_axis_shapes.values()),) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + +class LabelTensorFancyIndexHandler(_LabelFancyIndexHandler): + def accept(cls, raw_index): + return isinstance(raw_index, TENSOR_TYPE) and raw_index.dtype != np.bool_ + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + if context.input_axis == 1: # pragma: no cover + raise NotImplementedError( + "do not support tensor-based index on columns axis" + ) + info = LabelFancyIndexInfo( + IndexType.label_fancy_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if has_unknown_shape(index_info.raw_index): + yield + # rechunk index into one + index_info.unprocessed_raw_index = index_info.raw_index + index_info.raw_index = yield from recursive_tile( + index_info.raw_index.rechunk(index_info.raw_index.shape) + ) + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + + assert len(index_info.raw_index.chunks) == 1 + chunk_labels = index_info.raw_index.chunks[0] + + other_index_to_iter = dict() + for chunk in tileable.chunks: + chunk_index = chunk.index + other_index = chunk_index[:1] if input_axis == 1 else chunk_index[1:] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + context.chunk_index_to_info[chunk.index], + output_axis_index, + chunk_labels, + len(chunk_labels), + ) + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + from .iloc import DataFrameIlocGetItem + + tileable = context.tileable + out = context.op.outputs[0] + max_chunk_size = max(tileable.nsplits[index_info.input_axis]) + max_chunk_size = 0 if np.isnan(max_chunk_size) else max_chunk_size + max_chunk_size = max( + max_chunk_size, max(index_info.unprocessed_raw_index.nsplits[0]) + ) + new_chunk_sizes = normalize_chunk_sizes( + index_info.raw_index.shape[0], max_chunk_size + )[0] + cum_new_chunk_sizes = [0] + np.cumsum(new_chunk_sizes).tolist() + + chunks, nsplits = context.out_chunks, context.out_nsplits + index_to_chunks = {c.index: c for c in chunks} + + axis = index_info.output_axis + assert axis == 0 + new_out_chunks = [] + for chunk_index in itertools.product( + *(range(len(ns)) for ax, ns in enumerate(nsplits) if ax != axis) + ): + select_chunks = [] + for i in range(len(nsplits[axis])): + select_index = (i,) + chunk_index + select_chunks.append(index_to_chunks[select_index]) + + for j in range(len(new_chunk_sizes)): + slc = slice(cum_new_chunk_sizes[j], cum_new_chunk_sizes[j + 1]) + indexes = [slice(None)] * len(nsplits) + indexes[axis] = slc + + slice_chunks = [] + for select_chunk in select_chunks: + output_types = ( + [OutputType.series] + if len(nsplits) == 1 + else [OutputType.dataframe] + ) + slc_op = DataFrameIlocGetItem( + indexes=indexes, output_types=output_types + ) + slice_chunk_shape = list(select_chunk.shape) + slice_chunk_shape[axis] = new_chunk_sizes[j] + slice_chunk = slc_op.new_chunk( + [select_chunk], shape=tuple(slice_chunk_shape) + ) + slice_chunks.append(slice_chunk) + + chunk_op = context.op.copy().reset_key() + chunk_op.stage = 
OperandStage.agg + chunk_op._indexes = (None,) * len(nsplits) + chunk_op._fill_value = None + assert axis == 0 + params = dict() + params["index"] = ( + (j,) + chunk_index if axis == 0 else chunk_index + (j,) + ) + params["index_value"] = parse_index( + out.index_value.to_pandas()[slc], slice_chunks, store_data=False + ) + if select_chunks[0].ndim == 2: + params["columns_value"] = select_chunks[0].columns_value + params["dtypes"] = select_chunks[0].dtypes + else: + params["dtype"] = select_chunks[0].dtype + params["name"] = select_chunks[0].name + params["shape"] = slice_chunks[0].shape + out_chunk = chunk_op.new_chunk(slice_chunks, kws=[params]) + new_out_chunks.append(out_chunk) + + new_nsplits = list(nsplits) + new_nsplits[axis] = tuple(new_chunk_sizes) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + +class DataFrameIlocIndexesHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + IntegralIndexHandler, + SliceIndexHandler, + NDArrayBoolIndexHandler, + TensorBoolIndexHandler, + NDArrayFancyIndexHandler, + ) + + def create_context(self, op): + return DataFrameIndexHandlerContext(op) + + +class DataFrameLocIndexesHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + LabelIndexHandler, + LabelSliceIndexHandler, + NDArrayBoolIndexHandler, + TensorBoolIndexHandler, + LabelNDArrayFancyIndexHandler, + ) + + def create_context(self, op): + return DataFrameIndexHandlerContext(op) + + +class DataFrameReindexHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + LabelTensorFancyIndexHandler, + LabelNDArrayFancyIndexHandler, + LabelSliceIndexHandler, + ) + + def create_context(self, op): + return DataFrameIndexHandlerContext(op) diff --git a/python/xorbits/_mars/dataframe/indexing/insert.py b/python/xorbits/_mars/dataframe/indexing/insert.py new file mode 100644 index 000000000..72f5d4277 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/insert.py @@ -0,0 +1,190 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int64Field +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..core import SERIES_CHUNK_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index + + +class DataFrameInsert(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.INSERT + + _loc = Int64Field("loc") + _column = AnyField("column") + _value = AnyField("value") + _allow_duplicates = BoolField("allow_duplicates") + + def __init__(self, loc=None, column=None, value=None, allow_duplicates=None, **kw): + super().__init__( + _loc=loc, + _column=column, + _value=value, + _allow_duplicates=allow_duplicates, + **kw, + ) + + @property + def loc(self) -> int: + return self._loc + + @property + def column(self): + return self._column + + @property + def value(self): + return self._value + + @property + def allow_duplicates(self): + return self._allow_duplicates + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(inputs) > 1: + self._value = self._inputs[-1] + + def __call__(self, df): + inputs = [df] + if isinstance(self.value, (SERIES_TYPE, TENSOR_TYPE)): + value_dtype = self.value.dtype + inputs.append(self.value) + else: + value_dtype = pd.Series(self.value).dtype + + empty_df = build_empty_df(df.dtypes) + empty_df.insert( + loc=self.loc, + column=self.column, + allow_duplicates=self.allow_duplicates, + value=pd.Series([], dtype=value_dtype), + ) + + params = df.params + params["columns_value"] = parse_index(empty_df.columns, store_data=True) + params["dtypes"] = empty_df.dtypes + params["shape"] = (df.shape[0], df.shape[1] + 1) + return self.new_dataframe(inputs, **params) + + @classmethod + def tile(cls, op: "DataFrameInsert"): + inp = op.inputs[0] + value = op.value + if isinstance(value, (SERIES_TYPE, TENSOR_TYPE)): + value = yield from recursive_tile(value.rechunk({0: inp.nsplits[0]})) + out = op.outputs[0] + + chunk_bounds = np.cumsum((0,) + inp.nsplits[1]) + chunk_bounds[-1] += 1 + + chunks = [] + new_split = list(inp.nsplits[1]) + chunk_dtypes = None + chunk_columns_value = None + for c in inp.chunks: + left_bound = int(chunk_bounds[c.index[1]]) + right_bound = int(chunk_bounds[c.index[1] + 1]) + if left_bound > op.loc or right_bound <= op.loc: + chunks.append(c) + continue + + if chunk_dtypes is None: + new_split[c.index[1]] = inp.nsplits[1][c.index[1]] + 1 + + if isinstance(value, (SERIES_TYPE, TENSOR_TYPE)): + value_dtype = value.dtype + else: + value_dtype = pd.Series(value).dtype + + empty_df = build_empty_df(c.dtypes) + empty_df.insert( + loc=op.loc - left_bound, + column=op.column, + allow_duplicates=op.allow_duplicates, + value=pd.Series([], dtype=value_dtype), + ) + + chunk_dtypes = empty_df.dtypes + chunk_columns_value = parse_index(chunk_dtypes.index, store_data=True) + + params = c.params + params["columns_value"] = chunk_columns_value + params["dtypes"] = chunk_dtypes + params["shape"] = (c.shape[0], c.shape[1] + 1) + + new_op = op.copy().reset_key() + new_op._loc = op.loc - left_bound + + if isinstance(value, (SERIES_TYPE, TENSOR_TYPE)): + inputs = [c, value.chunks[c.index[0]]] + else: + inputs = [c] + chunks.append(new_op.new_chunk(inputs, **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [inp], + chunks=chunks, + nsplits=(inp.nsplits[0], tuple(new_split)), + **out.params, + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameInsert"): + input_ = 
ctx[op.inputs[0].key] + value = op.value + if isinstance(value, (SERIES_CHUNK_TYPE, TENSOR_CHUNK_TYPE)): + value = ctx[value.key] + ctx[op.outputs[0].key] = copied = input_.copy() + copied.insert( + loc=op.loc, + column=op.column, + allow_duplicates=op.allow_duplicates, + value=value, + ) + + +def df_insert(df, loc, column, value, allow_duplicates=False): + """ + Insert column into DataFrame at specified location. + + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. + + Parameters + ---------- + loc : int + Insertion index. Must verify 0 <= loc <= len(columns). + column : str, number, or hashable object + Label of the inserted column. + value : int, Series, or array-like + allow_duplicates : bool, optional + """ + if isinstance(value, TENSOR_TYPE) and value.ndim > 1: + raise ValueError( + f"Wrong number of items passed {value.ndim}, placement implies 1" + ) + + op = DataFrameInsert( + loc=loc, column=column, value=value, allow_duplicates=allow_duplicates + ) + out_df = op(df) + df.data = out_df.data diff --git a/python/xorbits/_mars/dataframe/indexing/loc.py b/python/xorbits/_mars/dataframe/indexing/loc.py new file mode 100644 index 000000000..bc3aca577 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/loc.py @@ -0,0 +1,555 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Integral +from typing import Dict + +import numpy as np +import pandas as pd +from pandas.core.dtypes.cast import find_common_type +from pandas.core.indexing import IndexingError + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, KeyField, ListField +from ...tensor.datasource import asarray +from ...tensor.utils import calc_sliced_size, filter_inputs +from ...utils import is_full_slice, lazy_import +from ..core import DATAFRAME_TYPE, IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import is_index_value_identical, parse_index +from .iloc import DataFrameIlocSetItem +from .index_lib import DataFrameLocIndexesHandler + +cudf = lazy_import("cudf") + + +def process_loc_indexes(inp, indexes, fetch_index: bool = True): + ndim = inp.ndim + + if not isinstance(indexes, tuple): + indexes = (indexes,) + if len(indexes) < ndim: + indexes += (slice(None),) * (ndim - len(indexes)) + if len(indexes) > ndim: + raise IndexingError("Too many indexers") + + new_indexes = [] + for ax, index in enumerate(indexes): + if isinstance(index, (list, np.ndarray, pd.Series, ENTITY_TYPE)): + if not isinstance(index, ENTITY_TYPE): + index = np.asarray(index) + elif fetch_index: + index = asarray(index) + if ax == 1: + # do not support tensor index on axis 1 + # because if so, the dtypes and columns_value would be unknown + try: + index = index.fetch() + except (RuntimeError, ValueError): + raise NotImplementedError( + "indexer on axis columns cannot be non-executed tensor" + ) + new_indexes.append(index) + + return new_indexes + + +class DataFrameLoc: + def __init__(self, obj): + self._obj = obj + + def _use_iloc(self, indexes): + # for RangeIndex from 0, use iloc instead of loc + index_value = self._obj.index_value.value + if len(indexes) == 2: + if not isinstance(indexes[1], slice): + return False, None + elif indexes[1] != slice(None): + return False, None + if not isinstance(index_value, IndexValue.RangeIndex): + return False, None + if index_value.slice.start != 0 and index_value.slice.start is not None: + return False, None + if not isinstance(indexes[0], (Integral, slice)): + return False, None + if isinstance(indexes[0], Integral): + if indexes[0] < 0: + return False, None + else: + index0 = indexes[0] + for v in (index0.start, index0.stop, index0.step): + if v is None: + continue + if not isinstance(v, Integral): + return False, None + if v < 0: + return False, None + if index0.stop is not None: + # adjust slice right bound + return ( + True, + [slice(index0.start, index0.stop + 1, index0.step)] + indexes[1:], + ) + return True, None + + def __getitem__(self, indexes): + indexes = process_loc_indexes(self._obj, indexes) + + use_iloc, new_indexes = self._use_iloc(indexes) + if use_iloc: + # use iloc instead + return self._obj.iloc[tuple(new_indexes or indexes)] + + op = DataFrameLocGetItem(indexes=indexes) + return op(self._obj) + + def __setitem__(self, indexes, value): + if not np.isscalar(value): + raise NotImplementedError("Only scalar value is supported to set by loc") + if not isinstance(self._obj, DATAFRAME_TYPE): + raise NotImplementedError("Only DataFrame is supported to set by loc") + indexes = process_loc_indexes(self._obj, indexes, fetch_index=False) + use_iloc, new_indexes = self._use_iloc(indexes) + if use_iloc: + op = DataFrameIlocSetItem(indexes=new_indexes, value=value) + ret = op(self._obj) + self._obj.data = ret.data + else: + other_indices = [] + indices_tileable = [ + idx + for idx in indexes + if isinstance(idx, ENTITY_TYPE) or other_indices.append(idx) + ] + op = DataFramelocSetItem(indexes=other_indices, 
value=value) + ret = op([self._obj] + indices_tileable) + self._obj.data = ret.data + + +class DataFramelocSetItem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_ILOC_SETITEM + + _indexes = ListField("indexes") + _value = AnyField("value") + + def __init__( + self, indexes=None, value=None, gpu=None, sparse=False, output_types=None, **kw + ): + super().__init__( + _indexes=indexes, + _value=value, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + **kw, + ) + if not self.output_types: + self.output_types = [OutputType.dataframe] + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def __call__(self, inputs): + df = inputs[0] + return self.new_dataframe( + inputs, + shape=df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + out_chunks = [] + if len(op.inputs) > 1: + index_series = op.inputs[1] + is_identical = is_index_value_identical(in_df, index_series) + if not is_identical: + raise NotImplementedError("Only identical index value is supported") + if len(in_df.nsplits[1]) != 1: + raise NotImplementedError("Column-split chunks are not supported") + for target_chunk, index_chunk in zip(in_df.chunks, index_series.chunks): + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [target_chunk, index_chunk], + shape=target_chunk.shape, + index=target_chunk.index, + dtypes=target_chunk.dtypes, + index_value=target_chunk.index_value, + columns_value=target_chunk.columns_value, + ) + out_chunks.append(out_chunk) + else: + for target_chunk in in_df.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [target_chunk], + shape=target_chunk.shape, + index=target_chunk.index, + dtypes=target_chunk.dtypes, + index_value=target_chunk.index_value, + columns_value=target_chunk.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=in_df.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + r = ctx[op.inputs[0].key].copy(deep=True) + if len(op.inputs) > 1: + row_index = ctx[op.inputs[1].key] + r.loc[(row_index,) + tuple(op.indexes)] = op.value + else: + r.loc[tuple(op.indexes)] = op.value + ctx[chunk.key] = r + + +class DataFrameLocGetItem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_LOC_GETITEM + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, gpu=None, sparse=False, output_types=None, **kw): + super().__init__( + _indexes=indexes, gpu=gpu, sparse=sparse, _output_types=output_types, **kw + ) + + @property + def input(self): + return self._input + + @property + def indexes(self): + return self._indexes + + @property + def can_index_miss(self): + return False + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + indexes = [] + for index in self._indexes: + if isinstance(index, ENTITY_TYPE): + indexes.append(next(inputs_iter)) + else: + indexes.append(index) + self._indexes = list(indexes) + + @classmethod + def _calc_slice_param( + cls, + input_index_value: IndexValue, + pd_index: pd.Index, + inp, + index: slice, + axis: int, + ) -> 
Dict: + param = dict() + if is_full_slice(index): + # full slice on this axis + param["shape"] = inp.shape[axis] + param["index_value"] = input_index_value + if axis == 1: + param["dtypes"] = inp.dtypes + elif input_index_value.has_value(): + start, end = pd_index.slice_locs( + index.start, index.stop, index.step, kind="loc" + ) + slc = slice(start, end, index.step) + size = calc_sliced_size(inp.shape[axis], slc) + param["shape"] = size + out_index = pd_index[slc] + param["index_value"] = parse_index(out_index, store_data=axis == 1) + if axis == 1: + param["dtypes"] = inp.dtypes[slc] + else: + assert axis == 0 + if index.start is None and index.stop is None: + param["shape"] = calc_sliced_size(inp.shape[axis], index) + else: + param["shape"] = np.nan + param["index_value"] = parse_index(pd_index, inp, index) + + return param + + @classmethod + def _calc_bool_index_param( + cls, input_index_value: IndexValue, pd_index: pd.Index, inp, index, axis: int + ) -> Dict: + param = dict() + if input_index_value.has_value(): + if isinstance(index, np.ndarray): + filtered_index = pd_index[index] + param["shape"] = len(filtered_index) + param["index_value"] = parse_index(filtered_index, store_data=axis == 1) + if axis == 1: + param["dtypes"] = inp.dtypes[index] + else: + # tensor, cannot be indexer on axis 1 + assert axis == 0 + param["shape"] = np.nan + param["index_value"] = parse_index( + pd.Index([], dtype=pd_index.dtype), inp, index, store_data=False + ) + else: + assert axis == 0 + if isinstance(index, np.ndarray): + param["shape"] = int(index.sum()) + else: + param["shape"] = np.nan + param["index_value"] = parse_index(pd_index, inp, index, store_data=False) + + return param + + @classmethod + def _calc_fancy_index_param( + cls, input_index_value: IndexValue, pd_index: pd.Index, inp, index, axis: int + ) -> Dict: + param = dict() + if input_index_value.has_value(): + if isinstance(index, np.ndarray): + if not pd_index.is_unique: + assert axis == 1 + # as there's no direct method in pandas to handle fancy indexes + # we creates a empty + new_dtypes = inp.dtypes.loc[index] + param["shape"] = len(new_dtypes) + param["index_value"] = parse_index( + new_dtypes.index, store_data=True + ) + param["dtypes"] = new_dtypes + else: + for it in index: + if it not in pd_index: + axis_name = "index" if axis == 0 else "columns" + raise KeyError( + f"Label [{it}] not found in the [{axis_name}]" + ) + param["shape"] = len(index) + param["index_value"] = parse_index(pd.Index(index), store_data=True) + if axis == 1: + param["dtypes"] = inp.dtypes[index] + else: + assert axis == 0 + param["shape"] = index.shape[0] + param["index_value"] = parse_index( + pd.Index([], dtype=pd_index.dtype), inp, index + ) + else: + assert axis == 0 + param["shape"] = index.shape[0] + param["index_value"] = parse_index(pd_index, inp, index) + + return param + + @classmethod + def _calc_param(cls, inp, axis: int, index) -> Dict: + input_index_value = inp.index_value if axis == 0 else inp.columns_value + pd_index = input_index_value.to_pandas() + + if isinstance(index, slice): + return cls._calc_slice_param(input_index_value, pd_index, inp, index, axis) + elif hasattr(index, "dtype") and index.ndim == 1: + if index.dtype == np.bool_: + # bool indexing + return cls._calc_bool_index_param( + input_index_value, pd_index, inp, index, axis + ) + else: + # fancy indexing + return cls._calc_fancy_index_param( + input_index_value, pd_index, inp, index, axis + ) + else: + param = dict() + if input_index_value.has_value(): + loc = 
pd_index.get_loc(index) + if isinstance(loc, (slice, np.ndarray)): + assert axis == 1 + new_dtypes = inp.dtypes[loc] + param["shape"] = len(new_dtypes) + param["index_value"] = parse_index( + new_dtypes.index, store_data=True + ) + param["dtypes"] = new_dtypes + else: + # append None to indicate returning Series + param["shape"] = None + else: + param["shape"] = None + return param + + def __call__(self, inp): + inputs = [inp] + filter_inputs(self._indexes) + + shape = [] + sizes = [] + index_value = columns_value = dtypes = None + for ax, index in enumerate(self._indexes): + param = self._calc_param(inp, ax, index) + + size = param.get("shape") + sizes.append(size) + if size is not None: + shape.append(size) + + if ax == 0: + index_value = param.get("index_value") + else: + columns_value = param.get("index_value") + dtypes = param.get("dtypes") + + shape = tuple(shape) + if len(shape) == 0: + # scalar + if isinstance(inp, DATAFRAME_TYPE): + dtype = inp.dtypes[self._indexes[1]] + else: + dtype = inp.dtype + return self.new_scalar(inputs, dtype=dtype) + elif len(shape) == 1: + # series + if isinstance(inp, DATAFRAME_TYPE): + if sizes[0] is None: + # label on axis 0 + dtype = find_common_type(list(dtypes)) + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=columns_value, + name=self._indexes[0], + ) + else: + # label on axis 1 + dtype = inp.dtypes[self._indexes[1]] + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=self._indexes[1], + ) + else: + return self.new_series( + inputs, + shape=shape, + dtype=inp.dtype, + index_value=index_value, + name=inp.name, + ) + else: + # dataframe + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op): + handler = DataFrameLocIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + df = ctx[op.input.key] + if len(op.inputs) > 1: + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index + for index in op.indexes + ) + else: + indexes = tuple(op.indexes) + xdf = pd if isinstance(df, (pd.Series, pd.DataFrame)) or cudf is None else cudf + + if op.stage != OperandStage.map: + try: + r = df.loc[indexes] + except AttributeError: + # workaround for error when calling series.loc[(index,)] + r = df.loc[indexes[0]] + else: + # for map stage, and when some index is fancy index + # ignore keys that do not exist + new_indexes = [] + str_loc_on_datetime_index = False + for ax, index in enumerate(indexes): + if ax == 0: + if isinstance(index, np.ndarray) and index.dtype != np.bool_: + new_indexes.append(df.index.intersection(index)) + elif isinstance(df.index, pd.DatetimeIndex) and isinstance( + index, str + ): + # special process for datetime index + str_loc_on_datetime_index = True + new_indexes.append(index) + else: + new_indexes.append(index) + else: + new_indexes.append(index) + + try: + r = df.loc[tuple(new_indexes)] + if str_loc_on_datetime_index: + # convert back to DataFrame or Series + if r.ndim == 0: + index = df.index[df.index.get_loc(new_indexes[0])] + r = xdf.Series([r], index=[index]) + elif r.ndim == 1: + rdf = xdf.DataFrame(columns=r.index) + rdf.loc[r.name] = r + r = rdf + except KeyError: + if str_loc_on_datetime_index: + new_indexes[0] = [] + r = df.loc[tuple(new_indexes)] + else: # pragma: no cover + raise + + if isinstance(r, pd.Series) and r.dtype != chunk.dtype: + r = 
r.astype(chunk.dtype) + ctx[chunk.key] = r + + +def loc(a): + return DataFrameLoc(a) diff --git a/python/xorbits/_mars/dataframe/indexing/reindex.py b/python/xorbits/_mars/dataframe/indexing/reindex.py new file mode 100644 index 000000000..f17290b58 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/reindex.py @@ -0,0 +1,900 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +try: + import scipy.sparse as sps +except ImportError: # pragma: no cover + sps = None + +from ... import opcodes +from ...core import ENTITY_TYPE, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + Int64Field, + KeyField, + StringField, +) +from ...tensor import tensor as astensor +from ...utils import lazy_import, pd_release_version +from ..core import INDEX_TYPE +from ..core import Index as DataFrameIndexType +from ..initializer import Index as asindex +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis_style_args +from .index_lib import DataFrameReindexHandler + +cudf = lazy_import("cudf") + +# under pandas<1.1, SparseArray ignores zeros on creation +_pd_sparse_miss_zero = pd_release_version[:2] < (1, 1) + + +class DataFrameReindex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.REINDEX + + _input = KeyField("input") + _index = AnyField("index") + _index_freq = AnyField("index_freq") + _columns = AnyField("columns") + _method = StringField("method") + _level = AnyField("level") + _fill_value = AnyField("fill_value") + _limit = Int64Field("limit") + _enable_sparse = BoolField("enable_sparse") + + def __init__( + self, + index=None, + index_freq=None, + columns=None, + method=None, + level=None, + fill_value=None, + limit=None, + enable_sparse=None, + **kw, + ): + super().__init__( + _index=index, + _index_freq=index_freq, + _columns=columns, + _method=method, + _level=level, + _fill_value=fill_value, + _limit=limit, + _enable_sparse=enable_sparse, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def index(self): + return self._index + + @property + def index_freq(self): + return self._index_freq + + @property + def columns(self): + return self._columns + + @property + def method(self): + return self._method + + @property + def level(self): + return self._level + + @property + def fill_value(self): + return self._fill_value + + @property + def limit(self): + return self._limit + + @property + def enable_sparse(self): + return self._enable_sparse + + @property + def _indexes(self): + # used for index_lib + indexes = [] + names = ("index", "columns") + for ax in range(self.input.ndim): + index = names[ax] + val = getattr(self, index) + if val is not None: + indexes.append(val) + else: + indexes.append(slice(None)) + return indexes + + @_indexes.setter + def _indexes(self, new_indexes): + for index_field, new_index in zip(["_index", "_columns"], 
new_indexes): + setattr(self, index_field, new_index) + + @property + def indexes(self): + return self._indexes + + @property + def can_index_miss(self): + return True + + def _new_chunks(self, inputs, kws=None, **kw): + if self.stage == OperandStage.map and len(inputs) < len(self._inputs): + assert len(inputs) == len(self._inputs) - 1 + inputs.append(self._fill_value.chunks[0]) + + if self.stage == OperandStage.agg and self._fill_value is not None: + # fill_value is not required + self._fill_value = None + + return super()._new_chunks(inputs, kws=kws, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if self._index is not None and isinstance(self._index, ENTITY_TYPE): + self._index = next(inputs_iter) + if self._fill_value is not None and isinstance(self._fill_value, ENTITY_TYPE): + self._fill_value = next(inputs_iter) + + def __call__(self, df_or_series): + inputs = [df_or_series] + shape = list(df_or_series.shape) + index_value = df_or_series.index_value + columns_value = dtypes = None + if df_or_series.ndim == 2: + columns_value = df_or_series.columns_value + dtypes = df_or_series.dtypes + + if self._index is not None: + shape[0] = self._index.shape[0] + index_value = asindex(self._index).index_value + self._index = astensor(self._index) + if isinstance(self._index, ENTITY_TYPE): + inputs.append(self._index) + if self._columns is not None: + shape[1] = self._columns.shape[0] + dtypes = df_or_series.dtypes.reindex(index=self._columns).fillna( + np.dtype(np.float64) + ) + columns_value = parse_index(dtypes.index, store_data=True) + if self._fill_value is not None and isinstance(self._fill_value, ENTITY_TYPE): + inputs.append(self._fill_value) + + if df_or_series.ndim == 1: + return self.new_series( + inputs, + shape=tuple(shape), + dtype=df_or_series.dtype, + index_value=index_value, + name=df_or_series.name, + ) + else: + return self.new_dataframe( + inputs, + shape=tuple(shape), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + @classmethod + def tile(cls, op): + if all(len(inp.chunks) == 1 for inp in op.inputs): + # tile one chunk + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_params = out.params.copy() + chunk_params["index"] = (0,) * out.ndim + out_chunk = chunk_op.new_chunk( + [inp.chunks[0] for inp in op.inputs], kws=[chunk_params] + ) + + params = out.params.copy() + params["nsplits"] = ((s,) for s in out.shape) + params["chunks"] = [out_chunk] + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + handler = DataFrameReindexHandler() + result = yield from handler.handle(op) + if op.method is None and op.fill_value is None: + return [result] + else: + axis = 1 if op.columns is not None and op.index is None else 0 + result = result.fillna( + value=op.fill_value, method=op.method, axis=axis, limit=op.limit + ) + return [(yield from recursive_tile(result))] + + @classmethod + def _get_value(cls, ctx, obj): + if obj is not None and hasattr(obj, "key"): + return ctx[obj.key] + return obj + + @classmethod + def _convert_to_writable(cls, obj): + if isinstance(obj, np.ndarray) and not obj.flags.writeable: + return obj.copy() + return obj + + @classmethod + def _sparse_reindex(cls, inp, index=None, columns=None): + if inp.ndim == 2: + columns = inp.columns if columns is None else columns + index_shape = len(index) if index is not None else len(inp) + i_to_columns = dict() + + for i, col in enumerate(columns): + if col in 
inp.dtypes: + if index is None: + i_to_columns[i] = inp[col] + else: + indexer = inp.index.reindex(index)[1] + cond = indexer >= 0 + available_indexer = indexer[cond] + del indexer + data = inp[col].iloc[available_indexer].to_numpy() + ind = cond.nonzero()[0] + spmatrix = sps.csc_matrix( + (data, (ind, np.zeros_like(ind))), + shape=(index_shape, 1), + dtype=inp[col].dtype, + ) + # convert to SparseDtype(xxx, np.nan) + # to ensure 0 in sparse_array not converted to np.nan + if not _pd_sparse_miss_zero: + sparse_array = pd.arrays.SparseArray.from_spmatrix(spmatrix) + sparse_array = pd.arrays.SparseArray( + sparse_array.sp_values, + sparse_index=sparse_array.sp_index, + fill_value=np.nan, + dtype=pd.SparseDtype(sparse_array.dtype, np.nan), + ) + else: + from pandas._libs.sparse import IntIndex + + sparse_array = pd.arrays.SparseArray( + data, + sparse_index=IntIndex(index_shape, ind), + fill_value=np.nan, + dtype=pd.SparseDtype(data.dtype, np.nan), + ) + series = pd.Series(sparse_array, index=index) + + i_to_columns[i] = series + else: + ind = index if index is not None else inp.index + i_to_columns[i] = pd.DataFrame.sparse.from_spmatrix( + sps.coo_matrix((index_shape, 1), dtype=np.float64), index=ind + ).iloc[:, 0] + + df = pd.DataFrame(i_to_columns) + df.columns = columns + return df + else: + indexer = inp.index.reindex(index)[1] + cond = indexer >= 0 + available_indexer = indexer[cond] + del indexer + data = inp.iloc[available_indexer].to_numpy() + ind = cond.nonzero()[0] + spmatrix = sps.csc_matrix( + (data, (ind, np.zeros_like(ind))), + shape=(len(index), 1), + dtype=inp.dtype, + ) + sparse_array = pd.arrays.SparseArray.from_spmatrix(spmatrix) + # convert to SparseDtype(xxx, np.nan) + # to ensure 0 in sparse_array not converted to np.nan + sparse_array = pd.arrays.SparseArray( + sparse_array.sp_values, + sparse_index=sparse_array.sp_index, + fill_value=np.nan, + dtype=pd.SparseDtype(sparse_array.dtype, np.nan), + ) + series = pd.Series(sparse_array, index=index, name=inp.name) + return series + + @classmethod + def _reindex(cls, ctx, op, fill=True, try_sparse=None): + inp = cls._convert_to_writable(ctx[op.input.key]) + index = cls._get_value(ctx, op.index) + if op.index_freq is not None: + index = pd.Index(index, freq=op.index_freq) + columns = cls._get_value(ctx, op.columns) + kw = {"level": op.level} + if index is not None and not isinstance(index, slice): + kw["index"] = cls._convert_to_writable(index) + if columns is not None and not isinstance(columns, slice): + kw["columns"] = cls._convert_to_writable(columns) + if fill: + kw["method"] = op.method + kw["fill_value"] = cls._get_value(ctx, op.fill_value) + kw["limit"] = op.limit + + if ( + try_sparse + and not fill + and op.level is None + and isinstance(inp, (pd.DataFrame, pd.Series)) + and sps is not None + ): + # 1. sparse is used in map only + # 2. for MultiIndex, sparse is not needed as well + # 3. only consider cpu + # 4. 
scipy is installed + + if op.enable_sparse is None: + # try to use sparse if estimated size > 2 * input_size + cur_size = inp.memory_usage(deep=True) + if inp.ndim == 2: + cur_size = cur_size.sum() + element_size = cur_size / inp.size + shape = list(inp.shape) + if "index" in kw: + shape[0] = len(kw["index"]) + if "columns" in kw: + shape[1] = len(kw["columns"]) + estimate_size = np.prod(shape) * element_size + + fitted = estimate_size > cur_size * 2 + else: + # specified when op.enable_sparse == True + fitted = True + + if fitted: + # use sparse instead + return cls._sparse_reindex( + inp, index=kw.get("index"), columns=kw.get("columns") + ) + + return inp.reindex(**kw) + + @classmethod + def _execute_reindex(cls, ctx, op): + ctx[op.outputs[0].key] = cls._reindex(ctx, op) + + @classmethod + def _execute_map(cls, ctx, op): + if op.enable_sparse is not None: + try_sparse = op.enable_sparse + else: + try_sparse = True + ctx[op.outputs[0].key] = cls._reindex( + ctx, op, fill=False, try_sparse=try_sparse + ) + + @classmethod + def _convert_to_dense(cls, series): + if isinstance(series.dtype, pd.SparseDtype): + return series.astype( + pd.SparseDtype(series.dtype.subtype, np.nan) + ).sparse.to_dense() + return series + + @classmethod + def _merge_chunks(cls, inputs): + xdf = cls._get_xdf(inputs[0]) + + ndim = inputs[0].ndim + if ndim == 2: + columns = inputs[0].columns + result = xdf.DataFrame( + np.full(inputs[0].shape, np.nan), columns=columns, index=inputs[0].index + ) + else: + columns = [inputs[0].name] + result = None + + for i in range(len(columns)): + if ndim == 1: + curr = cls._convert_to_dense(inputs[0]).copy() + else: + curr = cls._convert_to_dense(inputs[0].iloc[:, i]).copy() + for j in range(len(inputs) - 1): + if ndim == 2: + left = cls._convert_to_dense(inputs[j].iloc[:, i]) + right = cls._convert_to_dense(inputs[j + 1].iloc[:, i]) + else: + left = cls._convert_to_dense(inputs[j]) + right = cls._convert_to_dense(inputs[j + 1]) + + left_notna = left.notna() + right_notna = right.notna() + if (left_notna & right_notna).sum() > 0: + raise ValueError("cannot reindex from a duplicate axis") + curr.loc[left_notna] = left.loc[left_notna] + curr.loc[right_notna] = right.loc[right_notna] + if ndim == 1: + result = curr + else: + result.iloc[:, i] = curr + + return result + + @classmethod + def _get_xdf(cls, obj): + return ( + pd if isinstance(obj, (pd.DataFrame, pd.Series)) or cudf is None else cudf + ) + + @classmethod + def _execute_agg(cls, ctx, op): + out = op.outputs[0] + + if op.index is None and op.columns is None: + # index is tensor + inputs = [ctx[inp.key] for inp in op.inputs] + + xdf = cls._get_xdf(inputs[0]) + + if inputs[0].index.nlevels > 1 and op.level is not None: + # multi index + result = xdf.concat(inputs) + else: + result = cls._merge_chunks(inputs) if len(inputs) > 1 else inputs[0] + + ctx[out.key] = result + + else: + # ndarray index or columns + if isinstance(op.index, slice) and op.index == slice(None): + axis = 1 + labels = op.columns + else: + assert op.columns is None or ( + isinstance(op.columns, slice) and op.columns == slice(None) + ) + axis = 0 + labels = op.index + + inp = ctx[op.inputs[0].key] + if inp.index.nlevels > 1 and op.level is not None: + new_inp = inp + else: + # split input + size = out.shape[axis] + assert inp.shape[axis] % size == 0 + inputs = [] + for i in range(inp.shape[axis] // size): + slc = [slice(None)] * inp.ndim + slc[axis] = slice(size * i, size * (i + 1)) + inputs.append(inp.iloc[tuple(slc)]) + new_inp = cls._merge_chunks(inputs) + 
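        # [editor's note -- illustrative sketch, not part of this patch] The block
        # below mirrors, in plain pandas, the column-wise merge performed by
        # _merge_chunks above: each partial reindex result is NaN where its source
        # chunk lacked the requested label, overlapping non-NaN positions mean the
        # axis holds duplicates ("cannot reindex from a duplicate axis"), and the
        # surviving values are combined. Variable names are hypothetical and the
        # snippet is meant to be run standalone, not here inside _execute_agg.
        #
        #     import numpy as np
        #     import pandas as pd
        #
        #     part_a = pd.Series([1.0, np.nan, np.nan], index=["x", "y", "z"])
        #     part_b = pd.Series([np.nan, 2.0, np.nan], index=["x", "y", "z"])
        #     if (part_a.notna() & part_b.notna()).any():
        #         raise ValueError("cannot reindex from a duplicate axis")
        #     merged = part_a.copy()
        #     merged[part_b.notna()] = part_b[part_b.notna()]
        #     # merged is now x=1.0, y=2.0, z=NaN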
+ labels = cls._convert_to_writable(labels) + if out.ndim == 2: + result = new_inp.reindex(labels=labels, axis=axis, level=op.level) + else: + result = new_inp.reindex(index=labels, level=op.level) + ctx[out.key] = result + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.agg: + return cls._execute_agg(ctx, op) + else: + assert op.stage is None + return cls._execute_reindex(ctx, op) + + +def reindex(df_or_series, *args, **kwargs): + """ + Conform Series/DataFrame to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + ``copy=False``. + + Parameters + ---------- + labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to. + index, columns : array-like, optional + New labels / index to conform to, should be specified using + keywords. Preferably an Index object to avoid duplicating data. + axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1). + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next + valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + Series/DataFrame with changed index. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + + ``DataFrame.reindex`` supports two calling conventions + + * ``(index=index_labels, columns=column_labels, ...)`` + * ``(labels, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Create a dataframe with some fictional data. + + >>> import mars.dataframe as md + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + >>> df = md.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... 
index=index) + >>> df.execute() + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default + values in the new index that do not have corresponding + records in the dataframe are assigned ``NaN``. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 'Chrome'] + >>> df.reindex(new_index).execute() + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to + the keyword ``fill_value``. Because the index is not monotonically + increasing or decreasing, we cannot use arguments to the keyword + ``method`` to fill the ``NaN`` values. + + >>> df.reindex(new_index, fill_value=0).execute() + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value='missing').execute() + http_status response_time + Safari 404 0.07 + Iceweasel missing missing + Comodo Dragon missing missing + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']).execute() + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + Or we can use "axis-style" keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns").execute() + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + To further illustrate the filling functionality in + ``reindex``, we will create a dataframe with a + monotonically increasing index (for example, a sequence + of dates). + + >>> date_index = md.date_range('1/1/2010', periods=6, freq='D') + >>> df2 = md.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + ... index=date_index) + >>> df2.execute() + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider + date range. + + >>> date_index2 = md.date_range('12/29/2009', periods=10, freq='D') + >>> df2.reindex(date_index2).execute() + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame + (for example, '2009-12-29') are by default filled with ``NaN``. + If desired, we can fill in the missing values using one of several + options. + + For example, to back-propagate the last valid value to fill the ``NaN`` + values, pass ``bfill`` as an argument to the ``method`` keyword. + + >>> df2.reindex(date_index2, method='bfill').execute() + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the ``NaN`` value present in the original dataframe + (at index value 2010-01-03) will not be filled by any of the + value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and + desired indexes. If you do want to fill in the ``NaN`` values present + in the original dataframe, use the ``fillna()`` method. 
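    [Editor's note, not part of the pandas-derived docstring] The implementation in
    this patch also accepts an ``enable_sparse`` keyword, popped from ``kwargs`` in
    the function body below and forwarded to ``DataFrameReindex``. Judging from
    ``_execute_map``/``_reindex`` above, ``None`` (the default) lets the map stage
    decide heuristically whether to build intermediate results as pandas sparse
    columns, while ``True``/``False`` appear to force or disable that path. A
    hedged, illustrative call (``df`` and ``new_index`` as in the examples above):

    >>> df.reindex(new_index, enable_sparse=False)  # doctest: +SKIP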
+ + See the :ref:`user guide ` for more. + """ + axes = validate_axis_style_args(df_or_series, args, kwargs, "labels", "reindex") + # Pop these, since the values are in `kwargs` under different names + kwargs.pop("index", None) + if df_or_series.ndim > 1: + kwargs.pop("columns", None) + kwargs.pop("axis", None) + kwargs.pop("labels", None) + method = kwargs.pop("method", None) + level = kwargs.pop("level", None) + copy = kwargs.pop("copy", True) + limit = kwargs.pop("limit", None) + tolerance = kwargs.pop("tolerance", None) + fill_value = kwargs.pop("fill_value", None) + enable_sparse = kwargs.pop("enable_sparse", None) + + if kwargs: + raise TypeError( + "reindex() got an unexpected keyword " + f'argument "{list(kwargs.keys())[0]}"' + ) + + if tolerance is not None: # pragma: no cover + raise NotImplementedError("`tolerance` is not supported yet") + + if method == "nearest": # pragma: no cover + raise NotImplementedError("method=nearest is not supported yet") + + index = axes.get("index") + index_freq = None + if isinstance(index, ENTITY_TYPE): + if isinstance(index, DataFrameIndexType): + index_freq = getattr(index.index_value.value, "freq", None) + if not isinstance(index, INDEX_TYPE): + index = astensor(index) + elif index is not None: + index = np.asarray(index) + index_freq = getattr(index, "freq", None) + + columns = axes.get("columns") + if isinstance(columns, ENTITY_TYPE): # pragma: no cover + try: + columns = columns.fetch() + except ValueError: + raise NotImplementedError( + "`columns` need to be executed first if it's a Mars object" + ) + elif columns is not None: + columns = np.asarray(columns) + + if isinstance(fill_value, ENTITY_TYPE) and getattr(fill_value, "ndim", 0) != 0: + raise ValueError("fill_value must be a scalar") + + op = DataFrameReindex( + index=index, + index_freq=index_freq, + columns=columns, + method=method, + level=level, + fill_value=fill_value, + limit=limit, + enable_sparse=enable_sparse, + ) + ret = op(df_or_series) + + if copy: + return ret.copy() + return ret + + +def reindex_like( + df_or_series, other, method=None, copy=True, limit=None, tolerance=None +): + """ + Return an object with matching indices as other object. + + Conform the object to the same index on all axes. Optional + filling logic, placing NaN in locations having no value + in the previous index. A new object is produced unless the + new index is equivalent to the current one and copy=False. + + Parameters + ---------- + other : Object of the same data type + Its row and column indices are used to define the new indices + of this object. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: propagate last valid observation forward to next + valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + limit : int, default None + Maximum number of consecutive labels to fill for inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. 
+ + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + Series or DataFrame + Same type as caller, but with changed indices on each axis. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + + Notes + ----- + Same as calling + ``.reindex(index=other.index, columns=other.columns,...)``. + + Examples + -------- + >>> import pandas as pd + >>> import mars.dataframe as md + >>> df1 = md.DataFrame([[24.3, 75.7, 'high'], + ... [31, 87.8, 'high'], + ... [22, 71.6, 'medium'], + ... [35, 95, 'medium']], + ... columns=['temp_celsius', 'temp_fahrenheit', + ... 'windspeed'], + ... index=md.date_range(start='2014-02-12', + ... end='2014-02-15', freq='D')) + + >>> df1.execute() + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31 87.8 high + 2014-02-14 22 71.6 medium + 2014-02-15 35 95 medium + + >>> df2 = md.DataFrame([[28, 'low'], + ... [30, 'low'], + ... [35.1, 'medium']], + ... columns=['temp_celsius', 'windspeed'], + ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', + ... '2014-02-15'])) + + >>> df2.execute() + temp_celsius windspeed + 2014-02-12 28.0 low + 2014-02-13 30.0 low + 2014-02-15 35.1 medium + + >>> df2.reindex_like(df1).execute() + temp_celsius temp_fahrenheit windspeed + 2014-02-12 28.0 NaN low + 2014-02-13 30.0 NaN low + 2014-02-14 NaN NaN NaN + 2014-02-15 35.1 NaN medium + """ + cond = df_or_series.index_value.key == other.index_value.key + if df_or_series.ndim == 2: + cond &= df_or_series.columns_value.key == other.columns_value.key + if cond and not copy: + return df_or_series + + kw = { + "index": other.index, + "method": method, + "limit": limit, + "tolerance": tolerance, + } + if df_or_series.ndim == 2: + kw["columns"] = other.dtypes.index + return reindex(df_or_series, **kw) diff --git a/python/xorbits/_mars/dataframe/indexing/rename.py b/python/xorbits/_mars/dataframe/indexing/rename.py new file mode 100644 index 000000000..5a38462a6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/rename.py @@ -0,0 +1,555 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +from ... 
import opcodes +from ...core import OutputType, get_output_types +from ...serialization.serializables import AnyField, StringField +from ..core import SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, parse_index, validate_axis + + +class DataFrameRename(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.RENAME + + _columns_mapper = AnyField("columns_mapper") + _index_mapper = AnyField("index_mapper") + _new_name = AnyField("new_name") + _level = AnyField("level") + _errors = StringField("errors") + + def __init__( + self, + columns_mapper=None, + index_mapper=None, + new_name=None, + level=None, + errors=None, + output_types=None, + **kw + ): + super().__init__( + _columns_mapper=columns_mapper, + _index_mapper=index_mapper, + _new_name=new_name, + _level=level, + _errors=errors, + _output_types=output_types, + **kw + ) + + @property + def columns_mapper(self): + return self._columns_mapper + + @property + def index_mapper(self): + return self._index_mapper + + @property + def new_name(self): + return self._new_name + + @property + def level(self): + return self._level + + @property + def errors(self) -> str: + return self._errors + + def _calc_renamed_df(self, df, errors="ignore"): + empty_df = build_df(df) + return empty_df.rename( + columns=self._columns_mapper, + index=self._index_mapper, + level=self._level, + errors=errors, + ) + + def _calc_renamed_series(self, df, errors="ignore"): + empty_series = build_series(df, name=df.name) + new_series = empty_series.rename( + index=self._index_mapper, level=self._level, errors=errors + ) + if self._new_name: + new_series.name = self._new_name + return new_series + + def __call__(self, df): + params = df.params + raw_index = df.index_value.to_pandas() + if df.ndim == 2: + new_df = self._calc_renamed_df(df, errors=self.errors) + new_index = new_df.index + elif isinstance(df, SERIES_TYPE): + new_df = self._calc_renamed_series(df, errors=self.errors) + new_index = new_df.index + else: + new_df = new_index = raw_index.set_names( + self._index_mapper or self._new_name, level=self._level + ) + + if self._columns_mapper is not None: + params["columns_value"] = parse_index(new_df.columns, store_data=True) + params["dtypes"] = new_df.dtypes + if self._index_mapper is not None: + params["index_value"] = parse_index(new_index) + if df.ndim == 1: + params["name"] = new_df.name + return self.new_tileable([df], **params) + + @classmethod + def tile(cls, op: "DataFrameRename"): + inp = op.inputs[0] + out = op.outputs[0] + chunks = [] + + dtypes_cache = dict() + for c in inp.chunks: + params = c.params + new_op = op.copy().reset_key() + + if op.columns_mapper is not None: + try: + new_dtypes = dtypes_cache[c.index[1]] + except KeyError: + new_dtypes = dtypes_cache[c.index[1]] = op._calc_renamed_df( + c + ).dtypes + + params["columns_value"] = parse_index(new_dtypes.index, store_data=True) + params["dtypes"] = new_dtypes + if op.index_mapper is not None: + params["index_value"] = out.index_value + if out.ndim == 1: + params["name"] = out.name + + if isinstance(op.columns_mapper, dict): + idx = params["dtypes"].index + if op._level is not None: + idx = idx.get_level_values(op._level) + new_op._columns_mapper = { + k: v for k, v in op.columns_mapper.items() if v in idx + } + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [inp], chunks=chunks, nsplits=inp.nsplits, **out.params + ) + + @classmethod + def 
execute(cls, ctx, op: "DataFrameRename"): + input_ = ctx[op.inputs[0].key] + if input_.ndim == 2: + ctx[op.outputs[0].key] = input_.rename( + index=op.index_mapper, columns=op.columns_mapper, level=op.level + ) + elif op.output_types[0] == OutputType.series: + ctx[op.outputs[0].key] = input_.rename( + index=op.index_mapper or op.new_name, level=op.level + ) + else: + ctx[op.outputs[0].key] = input_.set_names( + op.index_mapper or op.new_name, level=op.level + ) + + +def _rename( + df_obj, + index_mapper=None, + columns_mapper=None, + copy=True, + inplace=False, + level=None, + errors="ignore", +): + if not copy: + raise NotImplementedError("`copy=False` not implemented") + + if index_mapper is not None and errors == "raise" and not inplace: + warnings.warn("Errors will not raise for non-existing indices") + + op = DataFrameRename( + columns_mapper=columns_mapper, + index_mapper=index_mapper, + level=level, + errors=errors, + output_types=get_output_types(df_obj), + ) + ret = op(df_obj) + if inplace: + df_obj.data = ret.data + else: + return ret + + +def df_rename( + df, + mapper=None, + index=None, + columns=None, + axis="index", + copy=True, + inplace=False, + level=None, + errors="ignore", +): + """ + Alter axes labels. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + Parameters + ---------- + mapper : dict-like or function + Dict-like or functions transformations to apply to + that axis' values. Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` and + ``columns``. + index : dict-like or function + Alternative to specifying axis (``mapper, axis=0`` + is equivalent to ``index=mapper``). + columns : dict-like or function + Alternative to specifying axis (``mapper, axis=1`` + is equivalent to ``columns=mapper``). + axis : int or str + Axis to target with ``mapper``. Can be either the axis name + ('index', 'columns') or number (0, 1). The default is 'index'. + copy : bool, default True + Also copy underlying data. + inplace : bool, default False + Whether to return a new DataFrame. If True then value of copy is + ignored. + level : int or level name, default None + In case of a MultiIndex, only rename labels in the specified + level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. + + Returns + ------- + DataFrame + DataFrame with the renamed axis labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". + + See Also + -------- + DataFrame.rename_axis : Set the name of the axis. + + Examples + -------- + + ``DataFrame.rename`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. 
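    [Editor's note] The function body below warns that when an index mapper is
    combined with ``errors="raise"`` and ``inplace=False``, errors will not be
    raised for non-existing indices; the ``KeyError`` example further down therefore
    illustrates the column-label case only. A minimal, hedged sketch (``df`` as in
    the examples below; the index label is made up):

    >>> df.rename(index={99: "zz"}, errors="raise")  # doctest: +SKIP  (emits a warning)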
+ + Rename columns using a mapping: + + >>> import mars.dataframe as md + >>> df = md.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df.rename(columns={"A": "a", "B": "c"}).execute() + a c + 0 1 4 + 1 2 5 + 2 3 6 + + Rename index using a mapping: + + >>> df.rename(index={0: "x", 1: "y", 2: "z"}).execute() + A B + x 1 4 + y 2 5 + z 3 6 + + Cast index labels to a different type: + + >>> df.index.execute() + RangeIndex(start=0, stop=3, step=1) + >>> df.rename(index=str).index.execute() + Index(['0', '1', '2'], dtype='object') + + >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise").execute() + Traceback (most recent call last): + KeyError: ['C'] not found in axis + + Using axis-style parameters + + >>> df.rename(str.lower, axis='columns').execute() + a b + 0 1 4 + 1 2 5 + 2 3 6 + + >>> df.rename({1: 2, 2: 4}, axis='index').execute() + A B + 0 1 4 + 2 2 5 + 4 3 6 + + """ + axis = validate_axis(axis, df) + if axis == 0: + index_mapper = index if index is not None else mapper + columns_mapper = columns + else: + columns_mapper = columns if columns is not None else mapper + index_mapper = index + + if index_mapper is not None and errors == "raise" and not inplace: + warnings.warn("Errors will not raise for non-existing indices") + + return _rename( + df, + index_mapper=index_mapper, + columns_mapper=columns_mapper, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + + +def series_rename( + series, + index=None, + *, + axis="index", + copy=True, + inplace=False, + level=None, + errors="ignore" +): + """ + Alter Series index labels or name. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + Alternatively, change ``Series.name`` with a scalar value. + + Parameters + ---------- + axis : {0 or "index"} + Unused. Accepted for compatibility with DataFrame method only. + index : scalar, hashable sequence, dict-like or function, optional + Functions or dict-like are transformations to apply to + the index. + Scalar or hashable sequence-like will alter the ``Series.name`` + attribute. + + **kwargs + Additional keyword arguments passed to the function. Only the + "inplace" keyword is used. + + Returns + ------- + Series + Series with index labels or name altered. + + See Also + -------- + DataFrame.rename : Corresponding DataFrame method. + Series.rename_axis : Set the name of the axis. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3]) + >>> s.execute() + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.rename("my_name").execute() # scalar, changes Series.name.execute() + 0 1 + 1 2 + 2 3 + Name: my_name, dtype: int64 + >>> s.rename(lambda x: x ** 2).execute() # function, changes labels.execute() + 0 1 + 1 2 + 4 3 + dtype: int64 + >>> s.rename({1: 3, 2: 5}).execute() # mapping, changes labels.execute() + 0 1 + 3 2 + 5 3 + dtype: int64 + """ + validate_axis(axis) + return _rename( + series, + index_mapper=index, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + + +def index_rename(index, name, inplace=False): + """ + Alter Index or MultiIndex name. + + Able to set new names without level. Defaults to returning new index. + Length of names must match number of levels in MultiIndex. + + Parameters + ---------- + name : label or list of labels + Name(s) to set. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. 
+ + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + Index.set_names : Able to set new names partially and by level. + + Examples + -------- + >>> import mars.dataframe as md + >>> idx = md.Index(['A', 'C', 'A', 'B'], name='score') + >>> idx.rename('grade').execute() + Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') + + >>> idx = md.Index([('python', 2018), + ... ('python', 2019), + ... ('cobra', 2018), + ... ('cobra', 2019)], + ... names=['kind', 'year']) + >>> idx.execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['kind', 'year']) + >>> idx.rename(['species', 'year']).execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + >>> idx.rename('species').execute() + Traceback (most recent call last): + TypeError: Must pass list-like as `names`. + """ + op = DataFrameRename(index_mapper=name, output_types=get_output_types(index)) + ret = op(index) + if inplace: + index.data = ret.data + else: + return ret + + +def index_set_names(index, names, level=None, inplace=False): + """ + Set Index or MultiIndex name. + + Able to set new names partially and by level. + + Parameters + ---------- + names : label or list of label + Name(s) to set. + level : int, label or list of int or label, optional + If the index is a MultiIndex, level(s) to set (None for all + levels). Otherwise level must be None. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + Index.rename : Able to set new names without level. + + Examples + -------- + >>> import mars.dataframe as md + >>> idx = md.Index([1, 2, 3, 4]) + >>> idx.execute() + Int64Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter').execute() + Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + + >>> idx = md.MultiIndex.from_product([['python', 'cobra'], + ... [2018, 2019]]) + >>> idx.execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + ) + >>> idx.set_names(['kind', 'year'], inplace=True) + >>> idx.execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['kind', 'year']) + >>> idx.set_names('species', level=0).execute() + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + """ + op = DataFrameRename( + index_mapper=names, level=level, output_types=get_output_types(index) + ) + ret = op(index) + + if inplace: + df_or_series = getattr(index, "_get_df_or_series", lambda: None)() + if df_or_series is not None: + from .rename_axis import rename_axis_with_level + + rename_axis_with_level( + df_or_series, names, axis=index._axis, level=level, inplace=True + ) + index.data = df_or_series.axes[index._axis].data + else: + index.data = ret.data + else: + return ret diff --git a/python/xorbits/_mars/dataframe/indexing/rename_axis.py b/python/xorbits/_mars/dataframe/indexing/rename_axis.py new file mode 100644 index 000000000..6bee645ea --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/rename_axis.py @@ -0,0 +1,277 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ..core import DATAFRAME_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, parse_index, validate_axis + + +class DataFrameRenameAxis(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.RENAME_AXIS + + _index = AnyField("index") + _columns = AnyField("columns") + _copy_value = BoolField("copy_value") + _level = AnyField("level") + + def __init__(self, index=None, columns=None, copy_value=None, level=None, **kw): + super().__init__( + _index=index, _columns=columns, _copy_value=copy_value, _level=level, **kw + ) + + @property + def index(self): + return self._index + + @property + def columns(self): + return self._columns + + @property + def copy_value(self): + return self._copy_value + + @property + def level(self): + return self._level + + @staticmethod + def _update_params(params, obj, mapper, axis, level): + if obj.ndim == 2: + test_obj = build_df(obj) + else: + test_obj = build_series(obj) + + if level is None: + test_obj = test_obj.rename_axis(mapper, axis=axis) + else: + test_obj.axes[axis].set_names(mapper, level=level, inplace=True) + + if axis == 0: + params["index_value"] = parse_index(test_obj.index, store_data=False) + else: + params["dtypes"] = test_obj.dtypes + params["columns_value"] = parse_index(test_obj.columns, store_data=True) + + def __call__(self, df_or_series): + params = df_or_series.params + + if isinstance(df_or_series, DATAFRAME_TYPE): + self._output_types = [OutputType.dataframe] + else: + self._output_types = [OutputType.series] + + if self.index is not None: + self._update_params( + params, df_or_series, self.index, axis=0, level=self.level + ) + else: + self._update_params( + params, df_or_series, self.columns, axis=1, level=self.level + ) + + return self.new_tileable([df_or_series], **params) + + @classmethod + def tile(cls, op: "DataFrameRenameAxis"): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + + chunks = [] + idx_cache = dict() + for c in in_obj.chunks: + params = c.params + if op.index is not None: + try: + params["index_value"] = idx_cache[c.index[0]] + except KeyError: + cls._update_params(params, c, op.index, axis=0, level=op.level) + idx_cache[c.index[0]] = params["index_value"] + else: + try: + params["columns_value"], params["dtypes"] = idx_cache[c.index[1]] + except KeyError: + cls._update_params(params, c, op.columns, axis=1, level=op.level) + idx_cache[c.index[1]] = params["columns_value"], params["dtypes"] + + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [in_obj], chunks=chunks, nsplits=in_obj.nsplits, **out_obj.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameRenameAxis"): + in_data = ctx[op.inputs[0].key] + if op.index is not None: + val, axis = op.index, 0 + else: + val, axis = op.columns, 1 + + if op.level is None: + ctx[op.outputs[0].key] = in_data.rename_axis( + val, axis=axis, copy=op.copy_value + ) + 
else: + ret = in_data.copy() if op.copy_value else in_data + ret.axes[axis].set_names(val, level=op.level, inplace=True) + ctx[op.outputs[0].key] = ret + + +def rename_axis_with_level( + df_or_series, + mapper=None, + index=None, + columns=None, + axis=0, + copy=True, + level=None, + inplace=False, +): + axis = validate_axis(axis, df_or_series) + if mapper is not None: + if axis == 0: + index = mapper + else: + columns = mapper + op = DataFrameRenameAxis(index=index, columns=columns, copy_value=copy, level=level) + result = op(df_or_series) + if not inplace: + return result + else: + df_or_series.data = result.data + + +def rename_axis( + df_or_series, + mapper=None, + index=None, + columns=None, + axis=0, + copy=True, + inplace=False, +): + """ + Set the name of the axis for the index or columns. + + Parameters + ---------- + mapper : scalar, list-like, optional + Value to set the axis name attribute. + index, columns : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. + Note that the ``columns`` parameter is not allowed if the + object is a Series. This parameter only apply for DataFrame + type objects. + + Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` + and/or ``columns``. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to rename. + copy : bool, default True + Also copy underlying data. + inplace : bool, default False + Modifies the object directly, instead of creating a new Series + or DataFrame. + + Returns + ------- + Series, DataFrame, or None + The same type as the caller or None if `inplace` is True. + + See Also + -------- + Series.rename : Alter Series index labels or name. + DataFrame.rename : Alter DataFrame index labels or name. + Index.rename : Set new names on index. + + Notes + ----- + ``DataFrame.rename_axis`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + The first calling convention will only modify the names of + the index and/or the names of the Index object that is the columns. + In this case, the parameter ``copy`` is ignored. + + The second calling convention will modify the names of the + the corresponding index if mapper is a list or a scalar. + However, if mapper is dict-like or a function, it will use the + deprecated behavior of modifying the axis *labels*. + + We *highly* recommend using keyword arguments to clarify your + intent. + + Examples + -------- + **Series** + + >>> import mars.dataframe as md + >>> s = md.Series(["dog", "cat", "monkey"]) + >>> s.execute() + 0 dog + 1 cat + 2 monkey + dtype: object + >>> s.rename_axis("animal").execute() + animal + 0 dog + 1 cat + 2 monkey + dtype: object + + **DataFrame** + + >>> df = md.DataFrame({"num_legs": [4, 4, 2], + ... "num_arms": [0, 0, 2]}, + ... 
["dog", "cat", "monkey"]) + >>> df.execute() + num_legs num_arms + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("animal") + >>> df.execute() + num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("limbs", axis="columns") + >>> df.execute() + limbs num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + """ + return rename_axis_with_level( + df_or_series, + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + ) diff --git a/python/xorbits/_mars/dataframe/indexing/reset_index.py b/python/xorbits/_mars/dataframe/indexing/reset_index.py new file mode 100644 index 000000000..30fad4aab --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/reset_index.py @@ -0,0 +1,619 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ...utils import calc_nsplits, no_default +from ..core import IndexValue +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_empty_df, + build_empty_series, + parse_index, + standardize_range_index, +) + + +class DataFrameResetIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.RESET_INDEX + + _level = AnyField("level") + _drop = BoolField("drop") + _name = AnyField("name") + _col_level = AnyField("col_level") + _col_fill = AnyField("col_fill") + _incremental_index = BoolField("incremental_index") + + def __init__( + self, + level=None, + drop=None, + name=None, + col_level=None, + col_fill=None, + incremental_index=None, + output_types=None, + **kwargs + ): + super().__init__( + _level=level, + _drop=drop, + _name=name, + _col_level=col_level, + _col_fill=col_fill, + _incremental_index=incremental_index, + _output_types=output_types, + **kwargs + ) + + @property + def level(self): + return self._level + + @property + def drop(self): + return self._drop + + @property + def name(self): + return self._name + + @property + def col_level(self): + return self._col_level + + @property + def col_fill(self): + return self._col_fill + + @property + def incremental_index(self): + return self._incremental_index + + @classmethod + def _tile_series(cls, op: "DataFrameResetIndex"): + out_chunks = [] + out = op.outputs[0] + is_range_index = out.index_value.has_value() + cum_range = np.cumsum((0,) + op.inputs[0].nsplits[0]) + for c in op.inputs[0].chunks: + if is_range_index: + index_value = parse_index( + pd.RangeIndex(cum_range[c.index[0]], cum_range[c.index[0] + 1]) + ) + else: + index_value = out.index_value + chunk_op = op.copy().reset_key() + if op.drop: + out_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + index=c.index, + dtype=c.dtype, + name=c.name, + index_value=index_value, + ) + else: + shape = (c.shape[0], out.shape[1]) + out_chunk = chunk_op.new_chunk( + [c], + shape=shape, + 
index=c.index + (0,), + dtypes=out.dtypes, + index_value=index_value, + columns_value=out.columns_value, + ) + out_chunks.append(out_chunk) + if ( + not is_range_index + and isinstance(out.index_value.value, IndexValue.RangeIndex) + and op.incremental_index + ): + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + new_op = op.copy() + nsplits = calc_nsplits({c.index: c.shape for c in out_chunks}) + if op.drop: + return new_op.new_seriess( + op.inputs, + op.inputs[0].shape, + name=out.name, + chunks=out_chunks, + nsplits=nsplits, + dtype=out.dtype, + index_value=out.index_value, + ) + else: + return new_op.new_dataframes( + op.inputs, + out.shape, + nsplits=nsplits, + chunks=out_chunks, + index_value=out.index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + ) + + @classmethod + def _tile_dataframe(cls, op: "DataFrameResetIndex"): + in_df = op.inputs[0] + out_df = op.outputs[0] + added_columns_num = len(out_df.dtypes) - len(in_df.dtypes) + out_chunks = [] + index_has_value = out_df.index_value.has_value() + chunk_has_nan = any(np.isnan(s) for s in in_df.nsplits[0]) + cum_range = np.cumsum((0,) + in_df.nsplits[0]) + for c in in_df.chunks: + if index_has_value: + if chunk_has_nan: + index_value = parse_index(pd.RangeIndex(-1)) + else: + index_value = parse_index( + pd.RangeIndex(cum_range[c.index[0]], cum_range[c.index[0] + 1]) + ) + else: + index_value = out_df.index_value + if c.index[1] == 0: + chunk_op = op.copy().reset_key() + dtypes = out_df.dtypes[: (added_columns_num + len(c.dtypes))] + columns_value = parse_index(dtypes.index) + new_chunk = chunk_op.new_chunk( + [c], + shape=(c.shape[0], c.shape[1] + added_columns_num), + index=c.index, + index_value=index_value, + columns_value=columns_value, + dtypes=dtypes, + ) + else: + chunk_op = op.copy().reset_key() + chunk_op._drop = True + new_chunk = chunk_op.new_chunk( + [c], + shape=c.shape, + index_value=index_value, + index=c.index, + columns_value=c.columns_value, + dtypes=c.dtypes, + ) + out_chunks.append(new_chunk) + if not index_has_value or chunk_has_nan: + if ( + isinstance(out_df.index_value.value, IndexValue.RangeIndex) + and op.incremental_index + ): + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + new_op = op.copy() + columns_splits = list(in_df.nsplits[1]) + columns_splits[0] += added_columns_num + nsplits = calc_nsplits({c.index: c.shape for c in out_chunks}) + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def tile(cls, op): + if isinstance(op.inputs[0], DATAFRAME_TYPE): + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + @classmethod + def execute(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + out = op.outputs[0] + + kwargs = dict() + if op.name is not None: + kwargs["name"] = op.name + if op.col_level is not None: + kwargs["col_level"] = op.col_level + if op.col_fill is not None: + kwargs["col_fill"] = op.col_fill + + r = in_data.reset_index(level=op.level, drop=op.drop, **kwargs) + if out.index_value.has_value(): + r.index = out.index_value.to_pandas() + ctx[out.key] = r + + @classmethod + def _get_out_index(cls, df, out_shape): + if isinstance(df.index, pd.RangeIndex): + range_value = -1 if np.isnan(out_shape[0]) else out_shape[0] + index_value = parse_index(pd.RangeIndex(range_value)) + else: + index_value = parse_index(df.index) + return 
index_value + + def _call_series(self, a): + if self.drop: + range_value = -1 if np.isnan(a.shape[0]) else a.shape[0] + index_value = parse_index(pd.RangeIndex(range_value)) + return self.new_series( + [a], shape=a.shape, dtype=a.dtype, name=a.name, index_value=index_value + ) + else: + empty_series = build_empty_series( + dtype=a.dtype, index=a.index_value.to_pandas()[:0], name=a.name + ) + empty_df = empty_series.reset_index(level=self.level, name=self.name) + shape = (a.shape[0], len(empty_df.dtypes)) + index_value = self._get_out_index(empty_df, shape) + return self.new_dataframe( + [a], + shape=shape, + index_value=index_value, + columns_value=parse_index(empty_df.columns), + dtypes=empty_df.dtypes, + ) + + def _call_dataframe(self, a): + if self.drop: + shape = a.shape + columns_value = a.columns_value + dtypes = a.dtypes + range_value = -1 if np.isnan(a.shape[0]) else a.shape[0] + index_value = parse_index(pd.RangeIndex(range_value)) + else: + empty_df = build_empty_df(a.dtypes) + empty_df.index = a.index_value.to_pandas()[:0] + empty_df = empty_df.reset_index( + level=self.level, col_level=self.col_level, col_fill=self.col_fill + ) + shape = (a.shape[0], len(empty_df.columns)) + columns_value = parse_index(empty_df.columns, store_data=True) + dtypes = empty_df.dtypes + index_value = self._get_out_index(empty_df, shape) + return self.new_dataframe( + [a], + shape=shape, + columns_value=columns_value, + index_value=index_value, + dtypes=dtypes, + ) + + def __call__(self, a): + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a) + else: + return self._call_series(a) + + +def df_reset_index( + df, + level=None, + drop=False, + inplace=False, + col_level=0, + col_fill="", + incremental_index=False, +): + """ + Reset the index, or a level of it. + + Reset the index of the DataFrame, and use the default one instead. + If the DataFrame has a MultiIndex, this method can remove one or more + levels. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + col_level : int or str, default 0 + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill : object, default '' + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. + incremental_index: bool, default False + Ensure RangeIndex incremental, when output DataFrame has multiple chunks, + ensuring index incremental costs more computation, + so by default, each chunk will have index which starts from 0, + setting incremental_index=True,reset_index will guarantee that + output DataFrame's index is from 0 to n - 1. + + Returns + ------- + DataFrame or None + DataFrame with the new index or None if ``inplace=True``. + + See Also + -------- + DataFrame.set_index : Opposite of reset_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', mt.nan)], + ... 
index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df.execute() + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal NaN + + When we reset the index, the old index is added as a column, and a + new sequential index is used: + + >>> df.reset_index().execute() + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + We can use the `drop` parameter to avoid the old index being added as + a column: + + >>> df.reset_index(drop=True).execute() + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal NaN + + You can also use `reset_index` with `MultiIndex`. + + >>> import pandas as pd + >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), + ... ('species', 'type')]) + >>> df = md.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (mt.nan, 'jump')], + ... index=index, + ... columns=columns) + >>> df.execute() + speed species + max type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey NaN jump + + If the index has multiple levels, we can reset a subset of them: + + >>> df.reset_index(level='class').execute() + class speed species + max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we are not dropping the index, by default, it is placed in the top + level. We can place it in another level: + + >>> df.reset_index(level='class', col_level=1).execute() + speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + When the index is inserted under another level, we can specify under + which one with the parameter `col_fill`: + + >>> df.reset_index(level='class', col_level=1, col_fill='species').execute() + species speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we specify a nonexistent level for `col_fill`, it is created: + + >>> df.reset_index(level='class', col_level=1, col_fill='genus').execute() + genus speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + """ + op = DataFrameResetIndex( + level=level, + drop=drop, + col_level=col_level, + col_fill=col_fill, + incremental_index=incremental_index, + output_types=[OutputType.dataframe], + ) + ret = op(df) + if not inplace: + return ret + else: + df.data = ret.data + + +def series_reset_index( + series, + level=None, + drop=False, + name=no_default, + inplace=False, + incremental_index=False, +): + """ + Generate a new DataFrame or Series with the index reset. + + This is useful when the index needs to be treated as a column, or + when the index is meaningless and needs to be reset to the default + before another operation. + + Parameters + ---------- + level : int, str, tuple, or list, default optional + For a Series with a MultiIndex, only remove the specified levels + from the index. Removes all levels by default. + drop : bool, default False + Just reset the index, without inserting it as a column in + the new DataFrame. + name : object, optional + The name to use for the column containing the original Series + values. Uses ``self.name`` by default. 
This argument is ignored + when `drop` is True. + inplace : bool, default False + Modify the Series in place (do not create a new object). + incremental_index: bool, default False + Ensure RangeIndex incremental, when output Series has multiple chunks, + ensuring index incremental costs more computation, + so by default, each chunk will have index which starts from 0, + setting incremental_index=True,reset_index will guarantee that + output Series's index is from 0 to n - 1. + + Returns + ------- + Series or DataFrame + When `drop` is False (the default), a DataFrame is returned. + The newly created columns will come first in the DataFrame, + followed by the original Series values. + When `drop` is True, a `Series` is returned. + In either case, if ``inplace=True``, no value is returned. + + See Also + -------- + DataFrame.reset_index: Analogous function for DataFrame. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4], name='foo', + ... index=md.Index(['a', 'b', 'c', 'd'], name='idx')) + + Generate a DataFrame with default index. + + >>> s.reset_index().execute() + idx foo + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To specify the name of the new column use `name`. + + >>> s.reset_index(name='values').execute() + idx values + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To generate a new Series with the default set `drop` to True. + + >>> s.reset_index(drop=True).execute() + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + To update the Series in place, without generating a new one + set `inplace` to True. Note that it also requires ``drop=True``. + + >>> s.reset_index(inplace=True, drop=True) + >>> s.execute() + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + The `level` parameter is interesting for Series with a multi-level + index. + + >>> import numpy as np + >>> import pandas as pd + >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']), + ... np.array(['one', 'two', 'one', 'two'])] + >>> s2 = md.Series( + ... range(4), name='foo', + ... index=pd.MultiIndex.from_arrays(arrays, + ... names=['a', 'b'])) + + To remove a specific level from the Index, use `level`. + + >>> s2.reset_index(level='a').execute() + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 + + If `level` is not set, all levels are removed from the Index. + + >>> s2.reset_index().execute() + a b foo + 0 bar one 0 + 1 bar two 1 + 2 baz one 2 + 3 baz two 3 + """ + if name is no_default: + name = series.name if series.name is not None else 0 + + op = DataFrameResetIndex( + level=level, + drop=drop, + name=name, + incremental_index=incremental_index, + output_types=[OutputType.series], + ) + ret = op(series) + if not inplace: + return ret + elif ret.ndim == 2: + raise TypeError("Cannot reset_index inplace on a Series to create a DataFrame") + else: + series.data = ret.data diff --git a/python/xorbits/_mars/dataframe/indexing/sample.py b/python/xorbits/_mars/dataframe/indexing/sample.py new file mode 100644 index 000000000..29e4e0d67 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/sample.py @@ -0,0 +1,600 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
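For illustration, the ``incremental_index`` flag accepted by the reset_index
implementation above controls whether each chunk's new RangeIndex restarts at 0
(the default) or whether the chunks receive consecutive, non-overlapping ranges
covering 0 to n - 1. A minimal NumPy/pandas sketch of the offset arithmetic,
assuming hypothetical chunk sizes (the operand derives the same offsets from
``np.cumsum((0,) + nsplits[0])``):

    import numpy as np
    import pandas as pd

    nsplits = (3, 2, 4)                      # rows per chunk after tiling
    offsets = np.cumsum((0,) + nsplits)

    # incremental_index=False: every chunk restarts at 0.
    local = [pd.RangeIndex(n) for n in nsplits]

    # incremental_index=True: chunks get consecutive, globally unique ranges.
    consecutive = [
        pd.RangeIndex(offsets[i], offsets[i + 1]) for i in range(len(nsplits))
    ]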
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import itertools + +import numpy as np + +from ... import opcodes +from ...core import ENTITY_TYPE, get_output_types, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + Float64Field, + Int8Field, + Int64Field, + KeyField, +) +from ...tensor import searchsorted +from ...tensor.base import TensorMapChunk +from ...tensor.merge import TensorConcatenate +from ...tensor.random import RandomState as TensorRandomState +from ...tensor.random import RandomStateField +from ...tensor.utils import gen_random_seeds, normalize_chunk_sizes +from ...utils import ceildiv, has_unknown_shape +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + + +class DataFrameSample(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.RAND_SAMPLE + + _size = Int64Field("size") + _frac = Float64Field("frac") + _replace = BoolField("replace") + _weights = AnyField("weights") + _axis = Int8Field("axis") + _seed = Int64Field("seed") + _random_state = RandomStateField("random_state") + _always_multinomial = BoolField("always_multinomial") + + # for chunks + # num of instances for chunks + _chunk_samples = KeyField("chunk_samples") + + def __init__( + self, + size=None, + frac=None, + replace=None, + weights=None, + seed=None, + axis=None, + random_state=None, + always_multinomial=None, + chunk_samples=None, + **kw + ): + super().__init__( + _size=size, + _frac=frac, + _replace=replace, + _weights=weights, + _seed=seed, + _axis=axis, + _random_state=random_state, + _always_multinomial=always_multinomial, + _chunk_samples=chunk_samples, + **kw + ) + + @property + def size(self): + return self._size + + @property + def frac(self): + return self._frac + + @property + def replace(self): + return self._replace + + @property + def weights(self): + return self._weights + + @property + def seed(self): + return self._seed + + @property + def axis(self): + return self._axis + + @property + def random_state(self): + if self._random_state is None: + self._random_state = np.random.RandomState(self.seed) + return self._random_state + + @property + def always_multinomial(self): + return self._always_multinomial + + @property + def chunk_samples(self): + return self._chunk_samples + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + next(it) + if isinstance(self.weights, ENTITY_TYPE): + self._weights = next(it) + if isinstance(self.chunk_samples, ENTITY_TYPE): + self._chunk_samples = next(it) + + def __call__(self, df): + params = df.params + new_shape = list(df.shape) + + if self.frac is not None and not np.isnan(df.shape[self.axis]): + self._size = int(self.frac * df.shape[self.axis]) + self._frac = None + + if self.size is not None: + new_shape[self.axis] = self.size + params["shape"] = tuple(new_shape) + params["index_value"] = parse_index(df.index_value.to_pandas()[:0]) + + input_dfs = [df] + if isinstance(self.weights, ENTITY_TYPE): + input_dfs.append(self.weights) + + self._output_types = get_output_types(df) + return self.new_tileable(input_dfs, **params) + + @classmethod + 
def _tile_one_chunk(cls, op: "DataFrameSample", in_df, weights): + out = op.outputs[0] + + input_dfs = [in_df] + if isinstance(weights, ENTITY_TYPE): + input_dfs.append(weights) + + params = out.params + chunk_op = op.copy().reset_key() + if isinstance(weights, ENTITY_TYPE): + chunk_op._weights = weights + params["index"] = (0,) * out.ndim + chunk = chunk_op.new_chunk([c.chunks[0] for c in input_dfs], **params) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=[chunk], nsplits=((s,) for s in out.shape), **params + ) + + @classmethod + def _tile_multinomial(cls, op: "DataFrameSample", in_df, weights): + out_data = op.outputs[0] + input_dfs = [in_df] + size = op.size + + weight_chunks = itertools.repeat(None) + if isinstance(op.weights, ENTITY_TYPE): + input_dfs.append(weights) + weight_chunks = weights.chunks + + chunks = [] + new_nsplits = list(in_df.nsplits) + rs = op.random_state + seeds = gen_random_seeds(len(in_df.chunks), op.random_state) + if weights is None: + # weights is None, use nsplits to sample num of instances for each chunk + probs = np.array(in_df.nsplits[op.axis]) + probs = 1.0 * probs / probs.sum() + chunk_sizes = rs.multinomial(size, probs) + new_nsplits[op.axis] = tuple(int(s) for s in chunk_sizes if s > 0) + + chunk_idx = 0 + for data_chunk, chunk_size, seed in zip(in_df.chunks, chunk_sizes, seeds): + if chunk_size == 0: + continue + + chunk_op = op.copy().reset_key() + chunk_op._random_state = None + chunk_op._seed = seed + chunk_op._size = int(chunk_size) + + params = data_chunk.params + params["index_value"] = parse_index( + params["index_value"].to_pandas()[:0] + ) + new_shape = list(data_chunk.shape) + new_shape[op.axis] = int(chunk_size) + params["shape"] = tuple(new_shape) + + idx_list = [0] * data_chunk.ndim + idx_list[op.axis] = chunk_idx + params["index"] = tuple(idx_list) + + chunks.append(chunk_op.new_chunk([data_chunk], **params)) + chunk_idx += 1 + else: + mn_seed = gen_random_seeds(1, op.random_state)[0] + + # weights is specified, use weights to sample num of instances for each chunk + chunk_weights = yield from recursive_tile( + weights.to_tensor().map_chunk(lambda x: x.sum(keepdims=True)) + ) + chunk_weights_chunk = TensorConcatenate( + dtype=chunk_weights.dtype + ).new_chunk( + chunk_weights.chunks, shape=(len(chunk_weights.chunks),), index=(0,) + ) + chunk_samples = TensorMapChunk( + func=lambda x: np.random.RandomState(mn_seed).multinomial( + size, x / x.sum() + ) + ).new_chunk( + [chunk_weights_chunk], shape=(len(chunk_weights.chunks),), index=(0,) + ) + new_nsplits[op.axis] = (np.nan,) * len(chunk_weights.chunks) + for chunk_idx, (data_chunk, weight_chunk, seed) in enumerate( + zip(in_df.chunks, weight_chunks, seeds) + ): + input_chunks = [data_chunk] + + chunk_op = op.copy().reset_key() + chunk_op._size = None + chunk_op._random_state = None + chunk_op._seed = seed + chunk_op._chunk_samples = chunk_samples + if weight_chunk is not None: + chunk_op._weights = weight_chunk + input_chunks.append(weight_chunk) + + params = data_chunk.params + params["index_value"] = parse_index( + params["index_value"].to_pandas()[:0] + ) + new_shape = list(data_chunk.shape) + new_shape[op.axis] = np.nan + params["shape"] = tuple(new_shape) + + idx_list = [0] * data_chunk.ndim + idx_list[op.axis] = chunk_idx + params["index"] = tuple(idx_list) + + chunks.append( + chunk_op.new_chunk(input_chunks + [chunk_samples], **params) + ) + + params = out_data.params + new_shape = list(in_df.shape) + new_shape[op.axis] = size + params["shape"] 
= tuple(new_shape) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=chunks, nsplits=tuple(new_nsplits), **params + ) + + @classmethod + def _tile_reservoirs(cls, op: "DataFrameSample", in_df, weights): + out_data = op.outputs[0] + input_dfs = [in_df] + size = op.size + + weight_chunks = itertools.repeat(None) + if isinstance(weights, ENTITY_TYPE): + input_dfs.append(weights) + weight_chunks = weights.chunks + + if any(cs < size for cs in in_df.nsplits[op.axis]): + # make sure all chunk > m + n_records = in_df.shape[op.axis] + n_chunk = min(max(ceildiv(n_records, size), 1), in_df.chunk_shape[0]) + chunk_size = ceildiv(in_df.shape[op.axis], n_chunk) + chunk_sizes = list(normalize_chunk_sizes(n_records, chunk_size)[0]) + if chunk_sizes[-1] < size and len(chunk_sizes) > 1: + # the last chunk may still less than m + # merge it into previous one + chunk_sizes[-2] += chunk_sizes[-1] + chunk_sizes = chunk_sizes[:-1] + in_df = yield from recursive_tile(in_df.rechunk({0: tuple(chunk_sizes)})) + if isinstance(weights, ENTITY_TYPE): + weights = yield from recursive_tile( + weights.rechunk({0: tuple(chunk_sizes)}) + ) + if len(chunk_sizes) == 1: + return cls._tile_one_chunk(op, in_df, weights) + + # for each chunk in a, do regular sampling + sampled_chunks = [] + seeds = gen_random_seeds(len(in_df.chunks), op.random_state) + for data_chunk, weights_chunk, seed in zip(in_df.chunks, weight_chunks, seeds): + input_chunks = [data_chunk] + + chunk_op = op.copy().reset_key() + chunk_op._random_state = None + chunk_op._seed = seed + if isinstance(op.weights, ENTITY_TYPE): + input_chunks.append(weights_chunk) + chunk_op._weights = weights_chunk + + params = data_chunk.params + new_shape = list(data_chunk.shape) + new_shape[op.axis] = size + params["shape"] = tuple(new_shape) + sampled_chunks.append(chunk_op.new_chunk(input_chunks, **params)) + + # generate a random variable for samples in every chunk + state = TensorRandomState.from_numpy(op.random_state) + indices = state.rand(size) + + if weights is None: + # weights not specified, use nsplits to calculate cumulative probability + # to distribute samples in each chunk + cum_offsets = np.cumsum(in_df.nsplits[op.axis]) + cum_offsets = cum_offsets * 1.0 / cum_offsets[-1] + else: + # weights specified, use weights to calculate cumulative probability + # to distribute samples in each chunk + chunk_weights = yield from recursive_tile( + weights.to_tensor().map_chunk(lambda x: x.sum(keepdims=True)) + ) + chunk_weights_chunk = TensorConcatenate( + dtype=chunk_weights.dtype + ).new_chunk( + chunk_weights.chunks, shape=(len(chunk_weights.chunks),), index=(0,) + ) + + cum_chunk = TensorMapChunk(func=lambda x: (x / x.sum()).cumsum()).new_chunk( + [chunk_weights_chunk], shape=(len(chunk_weights.chunks),), index=(0,) + ) + cum_offsets = TensorMapChunk(func=cum_chunk.op.func).new_tensor( + [weights], + chunks=[cum_chunk], + nsplits=((s,) for s in cum_chunk.shape), + **cum_chunk.params + ) + + index_chunks = [] + # seek which chunk the final sample will select + chunk_sel = yield from recursive_tile( + searchsorted(cum_offsets, indices, side="right") + ) + # for every chunk, select samples with bool indexing + for idx, sampled_chunk in enumerate(sampled_chunks): + chunk_index = chunk_sel.map_chunk( + func=lambda x, i: x == i, args=(idx,), elementwise=True, dtype=bool + ) + sampled_df_op = sampled_chunk.op.copy().reset_key() + sampled_chunk._index = (0,) * sampled_chunk.ndim + sampled_df = sampled_df_op.new_tileable( + input_dfs, + 
chunks=[sampled_chunk], + nsplits=((s,) for s in sampled_chunk.shape), + **sampled_chunk.params + ) + index_chunk = ( + yield from recursive_tile(sampled_df.iloc[chunk_index]) + ).chunks[0] + + chunk_idx = [0] * sampled_chunk.ndim + chunk_idx[op.axis] = idx + index_chunk._index = tuple(chunk_idx) + index_chunks.append(index_chunk) + + params = out_data.params + new_shape = list(in_df.shape) + new_shape[op.axis] = size + params["shape"] = tuple(new_shape) + + new_nsplits = list(in_df.nsplits) + new_nsplits[op.axis] = (np.nan,) * len(index_chunks) + + df_op = op.copy().reset_key() + return df_op.new_tileables( + input_dfs, chunks=index_chunks, nsplits=tuple(new_nsplits), **params + ) + + @classmethod + def tile(cls, op: "DataFrameSample"): + if has_unknown_shape(*op.inputs): + yield + + in_df = op.inputs[0] + if in_df.ndim == 2: + in_df = yield from recursive_tile( + in_df.rechunk({(1 - op.axis): (in_df.shape[1 - op.axis],)}) + ) + + if op.size is None: + op._size = int(op.frac * in_df.shape[op.axis]) + + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = yield from recursive_tile( + weights.rechunk({0: in_df.nsplits[op.axis]}) + ) + elif in_df.ndim > 1 and weights in in_df.dtypes.index: + weights = yield from recursive_tile(in_df[weights]) + + if len(in_df.chunks) == 1: + return cls._tile_one_chunk(op, in_df, weights) + + if op.replace or op.always_multinomial: + return (yield from cls._tile_multinomial(op, in_df, weights)) + else: + return (yield from cls._tile_reservoirs(op, in_df, weights)) + + @classmethod + def execute(cls, ctx, op: "DataFrameSample"): + in_data = ctx[op.inputs[0].key] + weights = op.weights + if isinstance(weights, ENTITY_TYPE): + weights = ctx[weights.key] + + size = op.size + chunk_samples = op.chunk_samples + if isinstance(chunk_samples, ENTITY_TYPE): + chunk_samples = ctx[chunk_samples.key] + if chunk_samples is not None: + size = chunk_samples[op.inputs[0].index[op.axis]] + + try: + ctx[op.outputs[0].key] = in_data.sample( + n=size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + axis=op.axis, + ) + except ValueError: # pragma: no cover + ctx[op.outputs[0].key] = in_data.copy().sample( + n=size, + frac=op.frac, + replace=op.replace, + weights=weights, + random_state=op.random_state, + axis=op.axis, + ) + + +def sample( + df_or_series, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + always_multinomial=False, +): + """ + Return a random sample of items from an axis of object. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if `frac` = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. Index + values in weights not found in sampled object will be ignored and + index values in sampled object not in weights will be assigned + weights of zero. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Unless weights are a Series, weights must be same length as axis + being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. 
+ Infinite values not allowed. + random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. + axis : {0 or ‘index’, 1 or ‘columns’, None}, default None + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames). + always_multinomial : bool, default False + If True, always treat distribution of sample counts between data chunks + as multinomial distribution. This will accelerate sampling when data + is huge, but may affect randomness of samples when number of instances + is not very large. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing `n` items randomly + sampled from the caller object. + + See Also + -------- + DataFrameGroupBy.sample: Generates random samples from each group of a + DataFrame object. + SeriesGroupBy.sample: Generates random samples from each group of a + Series object. + numpy.random.choice: Generates a random sample from a given 1-D numpy + array. + + Notes + ----- + If `frac` > 1, `replacement` should be set to `True`. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'num_legs': [2, 4, 8, 0], + ... 'num_wings': [2, 0, 0, 0], + ... 'num_specimen_seen': [10, 2, 1, 8]}, + ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df.execute() + num_legs num_wings num_specimen_seen + falcon 2 2 10 + dog 4 0 2 + spider 8 0 1 + fish 0 0 8 + + Extract 3 random elements from the ``Series`` ``df['num_legs']``: + Note that we use `random_state` to ensure the reproducibility of + the examples. + + >>> df['num_legs'].sample(n=3, random_state=1).execute() + fish 0 + spider 8 + falcon 2 + Name: num_legs, dtype: int64 + + A random 50% sample of the ``DataFrame`` with replacement: + + >>> df.sample(frac=0.5, replace=True, random_state=1).execute() + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + + An upsample sample of the ``DataFrame`` with replacement: + Note that `replace` parameter has to be `True` for `frac` parameter > 1. + + >>> df.sample(frac=2, replace=True, random_state=1).execute() + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + falcon 2 2 10 + falcon 2 2 10 + fish 0 0 8 + dog 4 0 2 + fish 0 0 8 + dog 4 0 2 + + Using a DataFrame column as weights. Rows with larger value in the + `num_specimen_seen` column are more likely to be sampled. + + >>> df.sample(n=2, weights='num_specimen_seen', random_state=1).execute() + num_legs num_wings num_specimen_seen + falcon 2 2 10 + fish 0 0 8 + + """ + axis = validate_axis(axis or 0, df_or_series) + if axis == 1: + raise NotImplementedError("Currently cannot sample over columns") + rs = copy.deepcopy( + random_state.to_numpy() if hasattr(random_state, "to_numpy") else random_state + ) + if isinstance(rs, (int, np.ndarray)): + rs = np.random.RandomState(rs) + op = DataFrameSample( + size=n, + frac=frac, + replace=replace, + weights=weights, + random_state=rs, + axis=axis, + always_multinomial=always_multinomial, + ) + return op(df_or_series) diff --git a/python/xorbits/_mars/dataframe/indexing/set_axis.py b/python/xorbits/_mars/dataframe/indexing/set_axis.py new file mode 100644 index 000000000..282f6e657 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/set_axis.py @@ -0,0 +1,292 @@ +# Copyright 2022-2023 XProbe Inc. 
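For illustration, the sampling operator above splits the requested sample across
chunks in one of two ways: with replacement (or ``always_multinomial=True``) the
per-chunk counts follow a multinomial over chunk sizes or chunk weight sums,
while the without-replacement path assigns every drawn sample to a chunk by
``searchsorted`` over cumulative probabilities. A small NumPy sketch of both
ideas, assuming hypothetical chunk sizes:

    import numpy as np

    rs = np.random.RandomState(42)
    nsplits = np.array([100, 80, 120])       # rows per chunk
    size = 30                                 # rows requested

    # Multinomial split of the sample, proportional to chunk sizes.
    per_chunk = rs.multinomial(size, nsplits / nsplits.sum())
    assert per_chunk.sum() == size

    # Without replacement: draw uniforms and map each one onto a chunk.
    cum = np.cumsum(nsplits / nsplits.sum())
    chunk_of_sample = np.searchsorted(cum, rs.rand(size), side="right")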
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import ENTITY_TYPE, get_output_types, recursive_tile +from ...serialization.serializables import AnyField, Int8Field, KeyField +from ...utils import has_unknown_shape +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + + +class DataFrameSetAxis(DataFrameOperand, DataFrameOperandMixin): + _op_code_ = opcodes.DATAFRAME_SET_AXIS + + _input = KeyField("input") + _axis = Int8Field("axis") + _value = AnyField("value") + + def __init__(self, value=None, axis=None, **kw): + super().__init__(_value=value, _axis=axis, **kw) + + @property + def input(self): + return self._input + + @property + def value(self): + return self._value + + @property + def axis(self): + return self._axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + if isinstance(self.value, ENTITY_TYPE): + self._value = inputs[-1] + + def __call__(self, df_or_series): + new_size = self.value.shape[0] + expect_size = df_or_series.axes[self.axis].shape[0] + if ( + not np.isnan(new_size) + and not np.isnan(expect_size) + and new_size != expect_size + ): + raise ValueError( + f"Length mismatch: Expected axis has {expect_size} elements, " + f"new values have {new_size} elements" + ) + + params = df_or_series.params + if self.axis == 0: + params["index_value"] = ( + parse_index(self.value) + if isinstance(self.value, pd.Index) + else self.value.index_value + ) + else: + params["columns_value"] = ( + parse_index(self.value, store_data=True) + if isinstance(self.value, pd.Index) + else self.value.index_value + ) + pd_columns = ( + self.value.index_value.to_pandas() + if isinstance(self.value, ENTITY_TYPE) + else self.value + ) + params["dtypes"] = params["dtypes"].set_axis(pd_columns) + + self._output_types = get_output_types(df_or_series) + inputs = [df_or_series] + if isinstance(self.value, ENTITY_TYPE): + inputs += [self.value] + return self.new_tileable(inputs, **params) + + @classmethod + def tile(cls, op: "DataFrameSetAxis"): + output = op.outputs[0] + input_tileables = [op.input] + + value = op.value + if isinstance(value, ENTITY_TYPE): + input_tileables.append(value) + if has_unknown_shape(value): + yield + + if any(np.isnan(s) for s in op.input.nsplits[op.axis]): + yield + + if op.input.shape[op.axis] != value.shape[0]: + raise ValueError( + f"Length mismatch: Expected axis has {value.shape[0]} elements, " + f"new values have {op.input.shape[op.axis]} elements" + ) + + if isinstance(value, ENTITY_TYPE): + value = yield from recursive_tile( + value.rechunk({0: op.input.nsplits[op.axis]}) + ) + input_tileables[-1] = value + + slices = np.array((0,) + op.input.nsplits[op.axis]).cumsum() + slice_left = slices[:-1] + slice_right = slices[1:] + + chunks = [] + param_cache = [None] * len(op.input.nsplits[op.axis]) + for inp_chunk in op.input.chunks: + input_chunks = [inp_chunk] + value_index = 
inp_chunk.index[op.axis] + params = inp_chunk.params + + if isinstance(value, ENTITY_TYPE): + value_data = value.chunks[value_index] + input_chunks.append(value_data) + else: + value_data = value[slice_left[value_index] : slice_right[value_index]] + + if param_cache[value_index] is None: + cached_params = param_cache[value_index] = dict() + if isinstance(value, ENTITY_TYPE): + if op.axis == 0: + cached_params["index_value"] = value_data.index_value + else: + cached_params["columns_value"] = value_data.index_value + cached_params["dtypes"] = output.dtypes.iloc[ + slice_left[value_index] : slice_right[value_index] + ] + else: + if op.axis == 0: + cached_params["index_value"] = parse_index(value_data) + else: + cached_params["columns_value"] = parse_index( + value_data, store_data=True + ) + cached_params["dtypes"] = params["dtypes"].set_axis(value_data) + + params.update(param_cache[value_index]) + + new_op = op.copy().reset_key() + new_op._value = value_data + chunks.append(new_op.new_chunk(input_chunks, **params)) + + params = op.outputs[0].params + params["chunks"] = chunks + params["nsplits"] = op.input.nsplits + new_op = op.copy().reset_key() + return new_op.new_tileables(input_tileables, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameSetAxis"): + in_data = ctx[op.input.key] + value = op.value + if isinstance(value, ENTITY_TYPE): + value = ctx[value.key] + ctx[op.outputs[0].key] = in_data.set_axis(value, axis=op.axis) + + +def _set_axis(df_or_axis, labels, axis=0, inplace=False): + axis = validate_axis(axis, df_or_axis) + if not isinstance(labels, ENTITY_TYPE) and not isinstance(labels, pd.Index): + labels = pd.Index(labels) + + op = DataFrameSetAxis(value=labels, axis=axis) + result = op(df_or_axis) + if inplace: + df_or_axis.data = result.data + else: + return result + + +def df_set_axis(df, labels, axis=0, inplace=False): + """ + Assign desired index to given axis. + + Indexes for column or row labels can be changed by assigning + a list-like or Index. + + Parameters + ---------- + labels : list-like, Index + The values for the new index. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to update. The value 0 identifies the rows, and 1 identifies the columns. + + inplace : bool, default False + Whether to return a new DataFrame instance. + + Returns + ------- + renamed : DataFrame or None + An object of type DataFrame or None if ``inplace=True``. + + See Also + -------- + DataFrame.rename_axis : Alter the name of the index or columns. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index').execute() + A B + a 1 4 + b 2 5 + c 3 6 + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns').execute() + I II + 0 1 4 + 1 2 5 + 2 3 6 + + Now, update the labels inplace. + + >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) + >>> df.execute() + i ii + 0 1 4 + 1 2 5 + 2 3 6 + """ + return _set_axis(df, labels, axis=axis, inplace=inplace) + + +def series_set_axis(series, labels, axis=0, inplace=False): + """ + Assign desired index to given axis. + + Indexes for row labels can be changed by assigning + a list-like or Index. + + Parameters + ---------- + labels : list-like, Index + The values for the new index. + + axis : {0 or 'index'}, default 0 + The axis to update. The value 0 identifies the rows. + + inplace : bool, default False + Whether to return a new Series instance. 
+ + Returns + ------- + renamed : Series or None + An object of type Series or None if ``inplace=True``. + + See Also + -------- + Series.rename_axis : Alter the name of the index. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3]) + >>> s.execute() + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> s.set_axis(['a', 'b', 'c'], axis=0).execute() + a 1 + b 2 + c 3 + dtype: int64 + """ + return _set_axis(series, labels, axis=axis, inplace=inplace) diff --git a/python/xorbits/_mars/dataframe/indexing/set_index.py b/python/xorbits/_mars/dataframe/indexing/set_index.py new file mode 100644 index 000000000..762a7fc03 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/set_index.py @@ -0,0 +1,212 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import AnyField, BoolField +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index + + +class DataFrameSetIndex(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_SET_INDEX + + _keys = AnyField("keys") + _drop = BoolField("drop") + _append = BoolField("append") + _verify_integrity = BoolField("verify_integrity") + + def __init__( + self, + keys=None, + drop=True, + append=False, + verify_integrity=False, + output_types=None, + **kw + ): + super().__init__( + _keys=keys, + _drop=drop, + _append=append, + _verify_integrity=verify_integrity, + _output_types=output_types, + **kw + ) + + @property + def keys(self): + return self._keys + + @property + def drop(self): + return self._drop + + @property + def append(self): + return self._append + + @property + def verify_integrity(self): + return self._verify_integrity + + def __call__(self, df): + new_df = build_empty_df(df.dtypes).set_index( + keys=self.keys, + drop=self.drop, + append=self.append, + verify_integrity=self.verify_integrity, + ) + return self.new_dataframe( + [df], + shape=(df.shape[0], new_df.shape[1]), + dtypes=new_df.dtypes, + index_value=parse_index(new_df.index), + columns_value=parse_index(new_df.columns, store_data=True), + ) + + @classmethod + def _tile_column_axis_n_chunk(cls, op, in_df, out_df, out_chunks): + if not isinstance(op.keys, str): # pragma: no cover + raise NotImplementedError("DataFrame.set_index only support label") + if op.verify_integrity: # pragma: no cover + raise NotImplementedError( + "DataFrame.set_index not support verify_integrity yet" + ) + + try: + column_index = in_df.columns_value.to_pandas().get_loc(op.keys) + except KeyError: # pragma: no cover + raise NotImplementedError( + "The new index label must be a column of the original dataframe" + ) + + chunk_index = np.searchsorted(np.cumsum(in_df.nsplits[1]), column_index + 1) + + for row_idx in range(in_df.chunk_shape[0]): + index_chunk = in_df.cix[row_idx, chunk_index] + for col_idx in range(in_df.chunk_shape[1]): + 
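                # Pair every chunk of this row with the chunk that holds the
                # new index column; only the chunk actually containing the key
                # loses a column when drop=True, the rest keep shape and dtypes.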
input_chunk = in_df.cix[row_idx, col_idx] + if op.drop and input_chunk.key == index_chunk.key: + new_shape = (input_chunk.shape[0], input_chunk.shape[1] - 1) + selected = input_chunk.columns_value.to_pandas().drop(op.keys) + columns = parse_index(selected, store_data=True) + dtypes = input_chunk.dtypes.loc[selected] + else: + new_shape = input_chunk.shape + columns = input_chunk.columns_value + dtypes = input_chunk.dtypes + out_op = op.copy().reset_key() + out_chunk = out_op.new_chunk( + [index_chunk, input_chunk], + shape=new_shape, + dtypes=dtypes, + index=input_chunk.index, + index_value=parse_index(pd.Index([], dtype=np.int64)), + columns_value=columns, + ) + out_chunks.append(out_chunk) + + @classmethod + def _tile_column_axis_1_chunk(cls, op, in_df, out_df, out_chunks): + out_pd_index = out_df.index_value.to_pandas() + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + chunk_shape = (c.shape[0], out_df.shape[1]) + index_value = parse_index(out_pd_index, c) + out_chunk = chunk_op.new_chunk( + [c], + shape=chunk_shape, + dtypes=out_df.dtypes, + index=c.index, + index_value=index_value, + columns_value=out_df.columns_value, + ) + out_chunks.append(out_chunk) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + out_chunks = [] + if in_df.chunk_shape[1] > 1: + cls._tile_column_axis_n_chunk(op, in_df, out_df, out_chunks) + else: + cls._tile_column_axis_1_chunk(op, in_df, out_df, out_chunks) + + new_op = op.copy() + columns_nsplits = list(in_df.nsplits[1]) + if op.drop: + columns_nsplits = tuple( + split - 1 if i == 0 else split + for i, split in enumerate(columns_nsplits) + ) + nsplits = (in_df.nsplits[0], columns_nsplits) + return new_op.new_dataframes( + op.inputs, + out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + + if len(op.inputs) == 2: + # axis 1 has more than 1 chunk + index_chunk, input_chunk = op.inputs + # Optimization: we don't need to get value of the column + # that is set as new index. + if input_chunk.key == index_chunk.key: + new_index = op.keys + else: + new_index = ctx[index_chunk.key][op.keys] + ctx[chunk.key] = ctx[input_chunk.key].set_index( + new_index, + drop=op.drop, + append=op.append, + verify_integrity=op.verify_integrity, + ) + else: + # axis 1 has 1 chunk + inp = ctx[op.inputs[0].key] + ctx[chunk.key] = inp.set_index( + op.keys, + drop=op.drop, + append=op.append, + verify_integrity=op.verify_integrity, + ) + + +def set_index(df, keys, drop=True, append=False, inplace=False, verify_integrity=False): + op = DataFrameSetIndex( + keys=keys, + drop=drop, + append=append, + verify_integrity=verify_integrity, + output_types=[OutputType.dataframe], + ) + result = op(df) + if not inplace: + return result + else: + df.data = result.data diff --git a/python/xorbits/_mars/dataframe/indexing/setitem.py b/python/xorbits/_mars/dataframe/indexing/setitem.py new file mode 100644 index 000000000..b7ced7513 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/setitem.py @@ -0,0 +1,337 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
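For illustration, the setitem operand defined below reindexes a Series or
DataFrame value to the target's index before tiling whenever the two indexes
differ, which matches pandas' own alignment rule for column assignment. A
plain-pandas sketch with hypothetical data:

    import pandas as pd

    target = pd.DataFrame({"a": [1, 2, 3]}, index=[0, 1, 2])
    value = pd.Series([10, 20], index=[1, 5])   # partially overlapping index
    target["b"] = value                          # aligned on target.index
    # target["b"] is now [NaN, 10.0, NaN], the same result the operand gets
    # by calling value.reindex(target.index) up front.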
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections + +import numpy as np +import pandas as pd +from pandas.api.types import is_list_like + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, KeyField +from ...tensor.core import TENSOR_TYPE +from ...utils import pd_release_version +from ..core import DATAFRAME_TYPE, SERIES_TYPE, DataFrame +from ..initializer import DataFrame as asframe +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import is_index_value_identical, parse_index + +# in pandas 1.0.x, __setitem__ with a list with missing items are not allowed +_allow_set_missing_list = pd_release_version[:2] >= (1, 1) + + +class DataFrameSetitem(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.INDEXSETVALUE + + _target = KeyField("target") + _indexes = AnyField("indexes") + _value = AnyField("value") + + def __init__(self, target=None, indexes=None, value=None, output_types=None, **kw): + super().__init__( + _target=target, + _indexes=indexes, + _value=value, + _output_types=output_types, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.dataframe] + + @property + def target(self): + return self._target + + @property + def indexes(self): + return self._indexes + + @property + def value(self): + return self._value + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._target = self._inputs[0] + if len(inputs) > 1: + self._value = self._inputs[-1] + + @staticmethod + def _is_scalar_tensor(t): + return isinstance(t, TENSOR_TYPE) and t.ndim == 0 + + def __call__(self, target: DataFrame, value): + raw_target = target + + inputs = [target] + if np.isscalar(value): + value_dtype = np.array(value).dtype + elif self._is_scalar_tensor(value): + inputs.append(value) + value_dtype = value.dtype + else: + if isinstance(value, (pd.Series, SERIES_TYPE)): + value = asseries(value) + value_dtype = value.dtype + elif isinstance(value, (pd.DataFrame, DATAFRAME_TYPE)): + if len(self.indexes) != value.shape[1]: # pragma: no cover + raise ValueError("Columns must be same length as key") + + value = asframe(value) + value_dtype = pd.Series(list(value.dtypes), index=self._indexes) + elif is_list_like(value) or isinstance(value, TENSOR_TYPE): + # convert to numpy to get actual dim and shape + if is_list_like(value): + value = np.array(value) + + if value.ndim == 1: + value = asseries(value, index=target.index) + value_dtype = value.dtype + else: + if len(self.indexes) != value.shape[1]: # pragma: no cover + raise ValueError("Columns must be same length as key") + + value = asframe(value, index=target.index) + value_dtype = pd.Series(list(value.dtypes), index=self._indexes) + else: # pragma: no cover + raise TypeError( + "Wrong value type, could be one of scalar, Series or tensor" + ) + + if target.shape[0] == 0: + # target empty, reindex target first + target = target.reindex(value.index) + inputs[0] = target + elif value.index_value.key != target.index_value.key: + # need reindex when target df is not empty and index different + 
value = value.reindex(target.index) + inputs.append(value) + + index_value = target.index_value + dtypes = target.dtypes.copy(deep=True) + + try: + dtypes.loc[self._indexes] = value_dtype + except KeyError: + # when some index not exist, try update one by one + if isinstance(value_dtype, pd.Series): + for idx in self._indexes: + dtypes.loc[idx] = value_dtype.loc[idx] + else: + for idx in self._indexes: + dtypes.loc[idx] = value_dtype + + columns_value = parse_index(dtypes.index, store_data=True) + ret = self.new_dataframe( + inputs, + shape=(target.shape[0], len(dtypes)), + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + raw_target.data = ret.data + + @classmethod + def tile(cls, op: "DataFrameSetitem"): + from ..merge.concat import DataFrameConcat + + out = op.outputs[0] + target = op.target + value = op.value + indexes = op.indexes + columns = target.columns_value.to_pandas() + last_column_index = target.chunk_shape[1] - 1 + is_value_scalar = np.isscalar(value) or cls._is_scalar_tensor(value) + has_multiple_cols = getattr(out.dtypes[indexes], "ndim", 0) > 0 + target_index_to_value_index = collections.defaultdict(list) + + if has_multiple_cols: + append_cols = [c for c in indexes if c not in columns] + else: + append_cols = [indexes] if indexes not in columns else [] + + if not is_value_scalar: + rechunk_arg = {} + + # check if all chunk's index_value are identical + is_identical = is_index_value_identical(target, value) + if not is_identical: + # do rechunk + if any(np.isnan(s) for s in target.nsplits[0]) or any( + np.isnan(s) for s in value.nsplits[0] + ): # pragma: no cover + yield + + rechunk_arg[0] = target.nsplits[0] + + if isinstance(value, DATAFRAME_TYPE): + if len(append_cols) < len(indexes): + # rechunk in column dim given distribution of indexes in target chunks + target_col_to_chunk_index = { + col: head_chunk.index[1] + for head_chunk in target.cix[0, :] + for col in head_chunk.dtypes.keys() + } + value_chunk_indexes = [ + target_col_to_chunk_index.get(vc, None) for vc in indexes + ] + col_nsplits = [] + last_cidx = value_chunk_indexes[0] + match_idxes = [] + for cidx, idx in zip(value_chunk_indexes, indexes): + if cidx != last_cidx: + target_index_to_value_index[last_cidx].append( + len(col_nsplits) + ) + col_nsplits.append(len(match_idxes)) + last_cidx = cidx + match_idxes = [idx] + else: + match_idxes.append(idx) + target_index_to_value_index[last_cidx].append(len(col_nsplits)) + col_nsplits.append(len(match_idxes)) + + # merge last column indexes and keep column order + last_value_index = target_index_to_value_index.pop( + last_column_index, [] + ) + append_value_index = target_index_to_value_index.pop(None, []) + target_index_to_value_index[None] = ( + last_value_index + append_value_index + ) + + rechunk_arg[1] = col_nsplits + else: + target_index_to_value_index[None] = [0] + rechunk_arg[1] = [len(append_cols)] + + if rechunk_arg: + value = yield from recursive_tile(value.rechunk(rechunk_arg)) + + out_chunks = [] + nsplits = [list(ns) for ns in target.nsplits] + nsplits[1][-1] += len(append_cols) + nsplits = tuple(tuple(ns) for ns in nsplits) + + for c in target.chunks: + result_chunk = c + + if has_multiple_cols: + new_indexes = [vc for vc in indexes if vc in c.dtypes] + else: + new_indexes = [indexes] if indexes in c.dtypes else [] + + if c.index[-1] == last_column_index: + new_indexes.extend(append_cols) + + if new_indexes: + # update needed on current chunk + chunk_op = op.copy().reset_key() + chunk_op._indexes = new_indexes if 
has_multiple_cols else new_indexes[0] + + if pd.api.types.is_scalar(value): + chunk_inputs = [c] + elif is_value_scalar: + chunk_inputs = [c, value.chunks[0]] + else: + # get proper chunk from value chunks + if has_multiple_cols: + value_chunks = [] + target_index = ( + None if c.index[-1] == last_column_index else c.index[1] + ) + for value_index in target_index_to_value_index[target_index]: + value_chunk = value.cix[c.index[0], value_index] + value_chunks.append(value_chunk) + if len(value_chunks) == 1: + value_chunk = value_chunks[0] + else: + # concat multiple columns by order + shape = ( + value_chunks[0].shape[0], + sum(c.shape[1] for c in value_chunks), + ) + dtypes = pd.concat([c.dtypes for c in value_chunks]) + concat_op = DataFrameConcat(output_types=op.output_types) + value_chunk = concat_op.new_chunk( + value_chunks, shape=shape, dtypes=dtypes + ) + else: + value_chunk = value.cix[c.index[0],] + + chunk_inputs = [c, value_chunk] + + shape = c.shape + if append_cols and c.index[-1] == last_column_index: + # some columns appended at the last column of chunks + shape = (shape[0], shape[1] + len(append_cols)) + + result_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=shape, + index=c.index, + ) + result_chunk._set_tileable_meta( + tileable_key=out.key, + nsplits=nsplits, + index_value=out.index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + ) + out_chunks.append(result_chunk) + + params = out.params + params["nsplits"] = nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def estimate_size(cls, ctx: dict, op: "DataFrameSetitem"): + result_size = ctx[op.target.key][0] + ctx[op.outputs[0].key] = (result_size, result_size) + + @classmethod + def execute(cls, ctx, op: "DataFrameSetitem"): + target = ctx[op.target.key] + # only deep copy when updating + indexes = ( + (op.indexes,) + if not isinstance(op.indexes, (tuple, list, set)) + else op.indexes + ) + deep = bool(set(indexes) & set(target.columns)) + target = ctx[op.target.key].copy(deep=deep) + value = ctx[op.value.key] if not np.isscalar(op.value) else op.value + try: + target[op.indexes] = value + except KeyError: + if _allow_set_missing_list: # pragma: no cover + raise + else: + existing = set(target.columns) + new_columns = target.columns.append( + pd.Index([idx for idx in op.indexes if idx not in existing]) + ) + target = target.reindex(new_columns, axis=1) + target[op.indexes] = value + + ctx[op.outputs[0].key] = target + + +def dataframe_setitem(df, col, value): + op = DataFrameSetitem(target=df, indexes=col, value=value) + return op(df, value) diff --git a/python/xorbits/_mars/dataframe/indexing/tests/__init__.py b/python/xorbits/_mars/dataframe/indexing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
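For illustration, at execution time the setitem operand above only deep-copies
the chunk data when the assignment overwrites an existing column; a pure append
can work on a shallow copy. A minimal pandas sketch of that decision, with
hypothetical data:

    import pandas as pd

    target = pd.DataFrame({"a": [1, 2, 3]})
    assigned = {"b"}                                  # columns being set
    deep = bool(assigned & set(target.columns))       # False here: pure append
    target = target.copy(deep=deep)                   # shallow copy suffices
    target["b"] = [4, 5, 6]                           # appended after "a"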
diff --git a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing.py b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing.py new file mode 100644 index 000000000..ac8ac194d --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing.py @@ -0,0 +1,959 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....core import tile +from ....tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, Tensor +from ...core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + DataFrame, + Series, +) +from ...datasource.from_tensor import dataframe_from_tensor +from ..iloc import ( + DataFrameIlocGetItem, + DataFrameIlocSetItem, + HeadTailOptimizedOperandMixin, + IndexingError, +) +from ..loc import DataFrameLocGetItem + + +def test_set_index(): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + df3 = df2.set_index("y", drop=True) + df3 = tile(df3) + assert df3.chunk_shape == (2, 2) + pd.testing.assert_index_equal( + df3.chunks[0].columns_value.to_pandas(), pd.Index(["x"]) + ) + pd.testing.assert_index_equal( + df3.chunks[1].columns_value.to_pandas(), pd.Index(["z"]) + ) + + df4 = df2.set_index("y", drop=False) + df4 = tile(df4) + assert df4.chunk_shape == (2, 2) + pd.testing.assert_index_equal( + df4.chunks[0].columns_value.to_pandas(), pd.Index(["x", "y"]) + ) + pd.testing.assert_index_equal( + df4.chunks[1].columns_value.to_pandas(), pd.Index(["z"]) + ) + + +def test_iloc_getitem(): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + with pytest.raises(IndexingError): + _ = df2.iloc[1, 1, 1] + + # index cannot be tuple + with pytest.raises(IndexingError): + _ = df2.iloc[((1,),)] + + # index wrong type + with pytest.raises(TypeError): + _ = df2.iloc["a1":] + + with pytest.raises(NotImplementedError): + _ = df2.iloc[0, md.Series(["a2", "a3"])] + + # fancy index should be 1-d + with pytest.raises(ValueError): + _ = df2.iloc[[[0, 1], [1, 2]]] + + with pytest.raises(ValueError): + _ = df2.iloc[1, ...] 
+ + with pytest.raises(IndexError): + _ = df2.iloc[-4] + + with pytest.raises(IndexError): + _ = df2.iloc[3] + + # plain index + df3 = df2.iloc[1] + df3 = tile(df3) + assert isinstance(df3, SERIES_TYPE) + assert isinstance(df3.op, DataFrameIlocGetItem) + assert df3.shape == (3,) + assert df3.chunk_shape == (2,) + assert df3.chunks[0].shape == (2,) + assert df3.chunks[1].shape == (1,) + assert df3.chunks[0].op.indexes == [1, slice(None, None, None)] + assert df3.chunks[1].op.indexes == [1, slice(None, None, None)] + assert df3.chunks[0].inputs[0].index == (0, 0) + assert df3.chunks[0].inputs[0].shape == (2, 2) + assert df3.chunks[1].inputs[0].index == (0, 1) + assert df3.chunks[1].inputs[0].shape == (2, 1) + + # slice index + df4 = df2.iloc[:, 2:4] + df4 = tile(df4) + assert isinstance(df4, DATAFRAME_TYPE) + assert isinstance(df4.op, DataFrameIlocGetItem) + assert df4.index_value.key == df2.index_value.key + assert df4.shape == (3, 1) + assert df4.chunk_shape == (2, 1) + assert df4.chunks[0].shape == (2, 1) + pd.testing.assert_index_equal( + df4.chunks[0].columns_value.to_pandas(), df1.columns[2:3] + ) + pd.testing.assert_series_equal(df4.chunks[0].dtypes, df1.dtypes[2:3]) + assert isinstance(df4.chunks[0].index_value.to_pandas(), type(df1.index)) + assert df4.chunks[1].shape == (1, 1) + pd.testing.assert_index_equal( + df4.chunks[1].columns_value.to_pandas(), df1.columns[2:3] + ) + pd.testing.assert_series_equal(df4.chunks[1].dtypes, df1.dtypes[2:3]) + assert df4.chunks[0].index_value.key != df4.chunks[1].index_value.key + assert isinstance(df4.chunks[1].index_value.to_pandas(), type(df1.index)) + assert df4.chunks[0].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + assert df4.chunks[1].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + assert df4.chunks[0].inputs[0].index == (0, 1) + assert df4.chunks[0].inputs[0].shape == (2, 1) + assert df4.chunks[1].inputs[0].index == (1, 1) + assert df4.chunks[1].inputs[0].shape == (1, 1) + + # plain fancy index + df5 = df2.iloc[[0], [0, 1, 2]] + df5 = tile(df5) + assert isinstance(df5, DATAFRAME_TYPE) + assert isinstance(df5.op, DataFrameIlocGetItem) + assert df5.shape == (1, 3) + assert df5.chunk_shape == (1, 2) + assert df5.chunks[0].shape == (1, 2) + pd.testing.assert_index_equal( + df5.chunks[0].columns_value.to_pandas(), df1.columns[:2] + ) + pd.testing.assert_series_equal(df5.chunks[0].dtypes, df1.dtypes[:2]) + assert isinstance(df5.chunks[0].index_value.to_pandas(), type(df1.index)) + assert df5.chunks[1].shape == (1, 1) + pd.testing.assert_index_equal( + df5.chunks[1].columns_value.to_pandas(), df1.columns[2:] + ) + pd.testing.assert_series_equal(df5.chunks[1].dtypes, df1.dtypes[2:]) + assert isinstance(df5.chunks[1].index_value.to_pandas(), type(df1.index)) + np.testing.assert_array_equal(df5.chunks[0].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[1], [0]) + assert df5.chunks[0].inputs[0].index == (0, 0) + assert df5.chunks[0].inputs[0].shape == (2, 2) + assert df5.chunks[1].inputs[0].index == (0, 1) + assert df5.chunks[1].inputs[0].shape == (2, 1) + + # fancy index + df6 = df2.iloc[[1, 2], [0, 1, 2]] + df6 = tile(df6) + assert isinstance(df6, DATAFRAME_TYPE) + assert isinstance(df6.op, DataFrameIlocGetItem) + assert df6.shape == (2, 3) + assert df6.chunk_shape == (2, 2) + assert df6.chunks[0].shape == (1, 2) + assert 
df6.chunks[1].shape == (1, 1) + assert df6.chunks[2].shape == (1, 2) + assert df6.chunks[3].shape == (1, 1) + np.testing.assert_array_equal(df6.chunks[0].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[1], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[1], [0]) + assert df6.chunks[0].inputs[0].index == (0, 0) + assert df6.chunks[0].inputs[0].shape == (2, 2) + assert df6.chunks[1].inputs[0].index == (0, 1) + assert df6.chunks[1].inputs[0].shape == (2, 1) + assert df6.chunks[2].inputs[0].index == (1, 0) + assert df6.chunks[2].inputs[0].shape == (1, 2) + assert df6.chunks[3].inputs[0].index == (1, 1) + assert df6.chunks[3].inputs[0].shape == (1, 1) + + # plain index + df7 = df2.iloc[1, 2] + df7 = tile(df7) + assert isinstance(df7, TENSOR_TYPE) # scalar + assert isinstance(df7.op, DataFrameIlocGetItem) + assert df7.shape == () + assert df7.chunk_shape == () + assert df7.chunks[0].dtype == df7.dtype + assert df7.chunks[0].shape == () + assert df7.chunks[0].op.indexes == [1, 0] + assert df7.chunks[0].inputs[0].index == (0, 1) + assert df7.chunks[0].inputs[0].shape == (2, 1) + + # test Series iloc getitem + + # slice + series = md.Series(pd.Series(np.arange(10)), chunk_size=3).iloc[4:8] + series = tile(series) + + assert series.shape == (4,) + + assert len(series.chunks) == 2 + assert series.chunks[0].shape == (2,) + assert series.chunks[0].index == (0,) + assert series.chunks[0].op.indexes == [slice(1, 3, 1)] + assert series.chunks[1].shape == (2,) + assert series.chunks[1].op.indexes == [slice(0, 2, 1)] + assert series.chunks[1].index == (1,) + + # fancy index + series = md.Series(pd.Series(np.arange(10)), chunk_size=3).iloc[[2, 4, 8]] + series = tile(series) + + assert series.shape == (3,) + + assert len(series.chunks) == 3 + assert series.chunks[0].shape == (1,) + assert series.chunks[0].index == (0,) + assert series.chunks[0].op.indexes[0] == [2] + assert series.chunks[1].shape == (1,) + assert series.chunks[1].op.indexes[0] == [1] + assert series.chunks[1].index == (1,) + assert series.chunks[2].shape == (1,) + assert series.chunks[2].op.indexes[0] == [2] + assert series.chunks[2].index == (2,) + + +def test_iloc_setitem(): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + df2 = tile(df2) + + # plain index + df3 = md.DataFrame(df1, chunk_size=2) + df3.iloc[1] = 100 + df3 = tile(df3) + assert isinstance(df3.op, DataFrameIlocSetItem) + assert df3.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df3.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df3.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df3.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + assert df3.chunks[0].op.indexes == [1, slice(None, None, None)] + 
assert df3.chunks[1].op.indexes == [1, slice(None, None, None)] + + # # slice index + df4 = md.DataFrame(df1, chunk_size=2) + df4.iloc[:, 2:4] = 1111 + df4 = tile(df4) + assert isinstance(df4.op, DataFrameIlocSetItem) + assert df4.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df4.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df4.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df4.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + assert df4.chunks[1].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + assert df4.chunks[3].op.indexes == [ + slice(None, None, None), + slice(None, None, None), + ] + + # plain fancy index + df5 = md.DataFrame(df1, chunk_size=2) + df5.iloc[[0], [0, 1, 2]] = 2222 + df5 = tile(df5) + assert isinstance(df5.op, DataFrameIlocSetItem) + assert df5.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df5.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df5.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df5.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + np.testing.assert_array_equal(df5.chunks[0].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[0], [0]) + np.testing.assert_array_equal(df5.chunks[1].op.indexes[1], [0]) + + # fancy index + df6 = md.DataFrame(df1, chunk_size=2) + df6.iloc[[1, 2], [0, 1, 2]] = 3333 + df6 = tile(df6) + assert isinstance(df6.op, DataFrameIlocSetItem) + assert df6.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df6.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df6.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df6.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + np.testing.assert_array_equal(df6.chunks[0].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[0].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[0], [1]) + np.testing.assert_array_equal(df6.chunks[1].op.indexes[1], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[2].op.indexes[1], [0, 1]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[0], [0]) + np.testing.assert_array_equal(df6.chunks[3].op.indexes[1], [0]) + + # plain index + df7 = md.DataFrame(df1, chunk_size=2) + df7.iloc[1, 2] = 4444 + df7 = tile(df7) + assert isinstance(df7.op, 
DataFrameIlocSetItem) + assert df7.chunk_shape == df2.chunk_shape + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df7.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), df7.columns_value.to_pandas() + ) + for c1, c2 in zip(df2.chunks, df7.chunks): + assert c1.shape == c2.shape + pd.testing.assert_index_equal( + c1.index_value.to_pandas(), c2.index_value.to_pandas() + ) + pd.testing.assert_index_equal( + c1.columns_value.to_pandas(), c2.columns_value.to_pandas() + ) + if isinstance(c2.op, DataFrameIlocSetItem): + assert c1.key == c2.inputs[0].key + else: + assert c1.key == c2.key + assert df7.chunks[1].op.indexes == [1, 0] + + # test Series + + # slice + series = md.Series(pd.Series(np.arange(10)), chunk_size=3) + series.iloc[:4] = 2 + series = tile(series) + + assert series.shape == (10,) + assert len(series.chunks) == 4 + + assert series.chunks[0].op.indexes == [ + slice(None, None, None), + ] + assert series.chunks[0].op.value == 2 + assert series.chunks[1].op.indexes == [ + slice(0, 1, 1), + ] + assert series.chunks[1].op.value == 2 + + raw = pd.DataFrame( + np.random.rand(9, 2), + index=["a1", "a2", "a3"] * 3, + columns=["x", "y"], + ) + df = md.DataFrame(raw, chunk_size=4) + iloc_df = df.iloc[:, 1:] + tiled_df, tiled_iloc_df = tile(df, iloc_df) + # for full slice, index_value should be same as input chunk + for loc_chunk, chunk in zip(tiled_iloc_df.chunks, tiled_df.chunks): + assert loc_chunk.index_value.key == chunk.index_value.key + + # fancy index + series = md.Series(pd.Series(np.arange(10)), chunk_size=3) + series.iloc[[2, 4, 9]] = 3 + series = tile(series) + + assert series.shape == (10,) + + assert len(series.chunks) == 4 + assert series.chunks[0].index == (0,) + assert series.chunks[0].op.indexes[0].tolist() == [2] + assert series.chunks[0].op.value == 3 + assert series.chunks[1].index == (1,) + assert series.chunks[1].op.indexes[0].tolist() == [1] + assert series.chunks[1].op.value == 3 + assert series.chunks[3].index == (3,) + assert series.chunks[3].op.indexes[0].tolist() == [0] + assert series.chunks[3].op.value == 3 + + +def test_dataframe_loc(): + raw = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df = md.DataFrame(raw, chunk_size=2) + raw2 = raw.copy() + raw2.reset_index(inplace=True, drop=True) + df3 = md.DataFrame(raw2, chunk_size=2) + s = pd.Series([1, 3, 5], index=["a1", "a2", "a3"]) + series = md.Series(s, chunk_size=2) + + # test return scalar + df2 = df.loc["a1", "z"] + assert isinstance(df2, Tensor) + assert df2.shape == () + assert df2.dtype == raw["z"].dtype + + df2 = tile(df2) + assert len(df2.chunks) == 1 + assert isinstance(df2.chunks[0], TENSOR_CHUNK_TYPE) + + # test return series for index axis + df2 = df.loc[:, "y"] + assert isinstance(df2, Series) + assert df2.shape == (3,) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.name == "y" + assert df2.index_value.key == df.index_value.key + + df2 = tile(df2) + assert len(df2.chunks) == 2 + for c in df2.chunks: + assert isinstance(c, SERIES_CHUNK_TYPE) + assert isinstance(c.index_value.to_pandas(), type(raw.index)) + assert c.name == "y" + assert c.dtype == raw["y"].dtype + + # test return series for column axis + df2 = df.loc["a2", :] + assert isinstance(df2, Series) + assert df2.shape == (3,) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.columns_value.to_pandas() + ) + assert df2.name == "a2" + + df2 = 
tile(df2) + assert len(df2.chunks) == 2 + for c in df2.chunks: + assert isinstance(c, SERIES_CHUNK_TYPE) + assert isinstance(c.index_value.to_pandas(), type(raw.columns)) + assert c.name == "a2" + assert c.dtype == raw.loc["a2"].dtype + + # test slice + df2 = df.loc["a2":"a3", "y":"z"] + assert isinstance(df2, DataFrame) + assert df2.shape == (np.nan, 2) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, "y":"z"].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, "y":"z"].dtypes) + + # test fancy index on index axis + df2 = df.loc[["a3", "a2"], [True, False, True]] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 2) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, [True, False, True]].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, [True, False, True]].dtypes) + + # test fancy index which is md.Series on index axis + df2 = df.loc[md.Series(["a3", "a2"]), [True, False, True]] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 2) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, [True, False, True]].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, [True, False, True]].dtypes) + + # test fancy index on columns axis + df2 = df.loc[[True, False, True], ["z", "x", "y"]] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 3) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + df2.columns_value.to_pandas(), raw.loc[:, ["z", "x", "y"]].columns + ) + pd.testing.assert_series_equal(df2.dtypes, raw.loc[:, ["z", "x", "y"]].dtypes) + + df2 = tile(df2) + assert len(df2.chunks) == 2 + for c in df2.chunks: + assert isinstance(c, DATAFRAME_CHUNK_TYPE) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert c.index_value.key != df.index_value.key + pd.testing.assert_index_equal( + c.columns_value.to_pandas(), raw.loc[:, ["z", "x", "y"]].columns + ) + pd.testing.assert_series_equal(c.dtypes, raw.loc[:, ["z", "x", "y"]].dtypes) + + df2 = df.loc[md.Series([True, False, True])] + assert isinstance(df2, DataFrame) + assert df2.shape == (np.nan, 3) + pd.testing.assert_index_equal( + df2.index_value.to_pandas(), df.index_value.to_pandas() + ) + assert df2.index_value.key != df.index_value.key + pd.testing.assert_index_equal(df2.columns_value.to_pandas(), raw.columns) + pd.testing.assert_series_equal(df2.dtypes, raw.dtypes) + + df2 = df3.loc[md.Series([True, False, True])] + assert isinstance(df2, DataFrame) + assert df2.shape == (np.nan, 3) + assert isinstance( + df2.index_value.to_pandas(), type(raw.loc[[True, False, True]].index) + ) + assert df2.index_value.key != df3.index_value.key + pd.testing.assert_index_equal(df2.columns_value.to_pandas(), raw.columns) + pd.testing.assert_series_equal(df2.dtypes, raw.dtypes) + + df2 = df3.loc[md.Series([2, 1])] + assert isinstance(df2, DataFrame) + assert df2.shape == (2, 3) + assert 
isinstance(df2.index_value.to_pandas(), type(raw2.loc[[2, 1]].index)) + assert df2.index_value.key != df3.index_value.key + pd.testing.assert_index_equal(df2.columns_value.to_pandas(), raw.columns) + pd.testing.assert_series_equal(df2.dtypes, raw.dtypes) + + series2 = series.loc["a2"] + assert isinstance(series2, Tensor) + assert series2.shape == () + assert series2.dtype == s.dtype + + series2 = series.loc[["a2", "a3"]] + assert isinstance(series2, Series) + assert series2.shape == (2,) + assert series2.dtype == s.dtype + assert series2.name == s.name + + with pytest.raises(IndexingError): + _ = df.loc["a1", "z", ...] + + with pytest.raises(NotImplementedError): + _ = df.loc[:, md.Series([True, False, True])] + + with pytest.raises(KeyError): + _ = df.loc[:, ["non_exist"]] + + # test loc chunk's index_value + raw = pd.DataFrame( + np.random.rand(9, 2), + index=["a1", "a2", "a3"] * 3, + columns=["x", "y"], + ) + df = md.DataFrame(raw, chunk_size=4) + loc_df = df.loc[:, ["x"]] + tiled_df, tiled_loc_df = tile(df, loc_df) + # for full slice, index_value should be same as input chunk + for loc_chunk, chunk in zip(tiled_loc_df.chunks, tiled_df.chunks): + assert loc_chunk.index_value.key == chunk.index_value.key + + # test loc on filtered df + df2 = df[df["x"] < 1] + loc_df = df2.loc[:, ["y", "x"]] + tiled_loc_df = tile(loc_df) + assert len(tiled_loc_df.chunks) == 3 + + +def test_loc_use_iloc(): + raw = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], columns=["x", "y", "z"]) + df = md.DataFrame(raw, chunk_size=2) + + assert isinstance(df.loc[:3].op, DataFrameIlocGetItem) + assert isinstance(df.loc[1:3].op, DataFrameIlocGetItem) + assert isinstance(df.loc[1].op, DataFrameIlocGetItem) + # negative + assert isinstance(df.loc[:-3].op, DataFrameLocGetItem) + with pytest.raises(KeyError): + _ = df.loc[-3] + # index 1 not None + assert isinstance(df.loc[:3, :"y"].op, DataFrameLocGetItem) + # index 1 not slice + assert isinstance(df.loc[:3, [True, False, True]].op, DataFrameLocGetItem) + assert isinstance(df.loc[[True, False, True]].op, DataFrameLocGetItem) + + raw2 = raw.copy() + raw2.index = pd.RangeIndex(1, 4) + df2 = md.DataFrame(raw2, chunk_size=2) + + assert isinstance(df2.loc[:3].op, DataFrameLocGetItem) + assert isinstance(df2.loc["a3":].op, DataFrameLocGetItem) + + raw2 = raw.copy() + raw2.index = [f"a{i}" for i in range(3)] + df2 = md.DataFrame(raw2, chunk_size=2) + + assert isinstance(df2.loc[:3].op, DataFrameLocGetItem) + + +def test_dataframe_getitem(): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + series = df["c3"] + assert isinstance(series, Series) + assert series.shape == (10,) + assert series.name == "c3" + assert series.dtype == data["c3"].dtype + assert series.index_value == df.index_value + + series = tile(series) + assert isinstance(series, SERIES_TYPE) + assert all(not i.is_coarse() for i in series.inputs) is True + assert series.nsplits == ((2, 2, 2, 2, 2),) + assert len(series.chunks) == 5 + for i, c in enumerate(series.chunks): + assert isinstance(c, SERIES_CHUNK_TYPE) + assert c.index == (i,) + assert c.shape == (2,) + + df1 = df[["c1", "c2", "c3"]] + assert isinstance(df1, DataFrame) + assert df1.shape == (10, 3) + assert df1.index_value == df.index_value + pd.testing.assert_index_equal( + df1.columns_value.to_pandas(), data[["c1", "c2", "c3"]].columns + ) + pd.testing.assert_series_equal(df1.dtypes, data[["c1", "c2", "c3"]].dtypes) + + df1 = tile(df1) + assert df1.nsplits == ((2, 2, 2, 2, 2), 
(2, 1)) + assert len(df1.chunks) == 10 + for i, c in enumerate(df1.chunks[slice(0, 10, 2)]): + assert isinstance(c, DATAFRAME_CHUNK_TYPE) + assert c.index == (i, 0) + assert c.shape == (2, 2) + for i, c in enumerate(df1.chunks[slice(1, 10, 2)]): + assert isinstance(c, DATAFRAME_CHUNK_TYPE) + assert c.index == (i, 1) + assert c.shape == (2, 1) + + +def test_dataframe_getitem_bool(): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c1", "c2", "c3", "c4", "c5"], + index=pd.RangeIndex(10, name="i"), + ) + df = md.DataFrame(data, chunk_size=2) + + mask_data1 = data.c1 > 0.5 + mask_data2 = data.c1 < 0.5 + mask1 = md.Series(mask_data1, chunk_size=2) + mask2 = md.Series(mask_data2, chunk_size=2) + + r1 = df[mask1] + r2 = df[mask2] + r3 = df[mask1] + + assert r1.index_value.key != df.index_value.key + assert r1.index_value.key != mask1.index_value.key + assert r1.columns_value.key == df.columns_value.key + assert r1.columns_value is df.columns_value + assert r1.index_value.name == "i" + + assert r1.index_value.key != r2.index_value.key + assert r1.columns_value.key == r2.columns_value.key + assert r1.columns_value is r2.columns_value + + assert r1.index_value.key == r3.index_value.key + assert r1.columns_value.key == r3.columns_value.key + assert r1.columns_value is r3.columns_value + + +def test_series_getitem(): + data = pd.Series(np.random.rand(10), name="a") + series = md.Series(data, chunk_size=3) + + result1 = series[2] + assert result1.shape == () + + result1 = tile(result1) + assert result1.nsplits == () + assert len(result1.chunks) == 1 + assert isinstance(result1.chunks[0], TENSOR_CHUNK_TYPE) + assert result1.chunks[0].shape == () + assert result1.chunks[0].dtype == data.dtype + + result2 = series[[4, 5, 1, 2, 3]] + assert result2.shape == (5,) + + result2 = tile(result2) + assert result2.nsplits == ((2, 2, 1),) + assert len(result2.chunks) == 3 + assert result2.chunks[0].op.labels == [4, 5] + assert result2.chunks[1].op.labels == [1, 2] + assert result2.chunks[2].op.labels == [3] + + data = pd.Series(np.random.rand(10), index=["i" + str(i) for i in range(10)]) + series = md.Series(data, chunk_size=3) + + result1 = series["i2"] + assert result1.shape == () + + result1 = tile(result1) + assert result1.nsplits == () + assert result1.chunks[0].dtype == data.dtype + assert result1.chunks[0].op.labels == "i2" + + result2 = series[["i2", "i4"]] + assert result2.shape == (2,) + + result2 = tile(result2) + assert result2.nsplits == ((2,),) + assert result2.chunks[0].dtype == data.dtype + assert result2.chunks[0].op.labels == ["i2", "i4"] + + +def test_setitem(): + data = pd.DataFrame(np.random.rand(10, 2), columns=["c1", "c2"]) + df = md.DataFrame(data, chunk_size=4) + + df["new"] = 1 + assert df.shape == (10, 3) + pd.testing.assert_series_equal(df.inputs[0].dtypes, data.dtypes) + + tiled = tile(df) + assert tiled.chunks[0].shape == (4, 3) + pd.testing.assert_series_equal(tiled.inputs[0].dtypes, data.dtypes) + assert tiled.chunks[1].shape == (4, 3) + pd.testing.assert_series_equal(tiled.inputs[0].dtypes, data.dtypes) + assert tiled.chunks[2].shape == (2, 3) + pd.testing.assert_series_equal(tiled.inputs[0].dtypes, data.dtypes) + + for c in tiled.chunks: + pd.testing.assert_series_equal(c.inputs[0].dtypes, data.dtypes) + + +def test_reset_index(): + data = pd.DataFrame( + [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], + index=["falcon", "parrot", "lion", "monkey"], + columns=("class", "max_speed"), + ) + df = md.DataFrame(data, chunk_size=2).reset_index() + r = 
data.reset_index() + + assert df.shape == (4, 3) + pd.testing.assert_series_equal(df.dtypes, r.dtypes) + pd.testing.assert_index_equal(df.columns_value.to_pandas(), r.columns) + + df2 = tile(df) + + assert len(df2.chunks) == 2 + assert df2.chunks[0].shape == (2, 3) + pd.testing.assert_index_equal( + df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(2) + ) + pd.testing.assert_series_equal(df2.chunks[0].dtypes, r.dtypes) + assert df2.chunks[1].shape == (2, 3) + pd.testing.assert_index_equal( + df2.chunks[1].index_value.to_pandas(), pd.RangeIndex(2, 4) + ) + pd.testing.assert_series_equal(df2.chunks[1].dtypes, r.dtypes) + + df = md.DataFrame(data, chunk_size=1).reset_index(drop=True) + r = data.reset_index(drop=True) + + assert df.shape == (4, 2) + pd.testing.assert_series_equal(df.dtypes, r.dtypes) + + df2 = tile(df) + + assert len(df2.chunks) == 8 + + for c in df2.chunks: + assert c.shape == (1, 1) + pd.testing.assert_index_equal( + c.index_value.to_pandas(), pd.RangeIndex(c.index[0], c.index[0] + 1) + ) + pd.testing.assert_series_equal(c.dtypes, r.dtypes[c.index[1] : c.index[1] + 1]) + + # test Series + series_data = pd.Series( + [1, 2, 3, 4], name="foo", index=pd.Index(["a", "b", "c", "d"], name="idx") + ) + s = md.Series(series_data, chunk_size=2).reset_index() + r = series_data.reset_index() + + assert s.shape == (4, 2) + pd.testing.assert_series_equal(s.dtypes, r.dtypes) + + s2 = tile(s) + assert len(s2.chunks) == 2 + assert s2.chunks[0].shape == (2, 2) + pd.testing.assert_index_equal( + s2.chunks[0].index_value.to_pandas(), pd.RangeIndex(2) + ) + assert s2.chunks[1].shape == (2, 2) + pd.testing.assert_index_equal( + s2.chunks[1].index_value.to_pandas(), pd.RangeIndex(2, 4) + ) + + with pytest.raises(TypeError): + md.Series(series_data, chunk_size=2).reset_index(inplace=True) + + +def test_head_tail_optimize(): + raw = pd.DataFrame(np.random.rand(4, 3)) + + df = md.DataFrame(raw, chunk_size=2) + + # no nan chunk shape + assert ( + HeadTailOptimizedOperandMixin._need_tile_head_tail(tile(df).head(2).op) is False + ) + + df2 = tile(df[df[0] < 0.5]) + # chunk shape on axis 1 greater than 1 + assert HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.head(2).op) is False + + df = md.DataFrame(raw, chunk_size=(2, 3)) + df2 = tile(df[df[0] < 0.5]) + # not slice + assert HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[2].op) is False + # step not None + assert ( + HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[:2:2].op) is False + ) + # not head or tail + assert HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[1:3].op) is False + # slice 1 is not slice(None) + assert ( + HeadTailOptimizedOperandMixin._need_tile_head_tail(df2.iloc[:3, :2].op) is False + ) + + +def test_reindex(): + raw = pd.DataFrame(np.random.rand(4, 3)) + + df = md.DataFrame(raw, chunk_size=2) + + with pytest.raises(TypeError): + df.reindex(unknown_arg=1) + + with pytest.raises(ValueError): + df.reindex([1, 2], fill_value=mt.tensor([1, 2])) + + +def test_getitem_lazy_chunk_meta(): + df = dataframe_from_tensor(mt.random.rand(10, 3, chunk_size=3)) + df2 = df[[0, 2]] + df2 = tile(df2) + + chunk = df2.chunks[0].data + assert chunk._FIELDS["_dtypes"].get(chunk) is None + pd.testing.assert_series_equal(chunk.dtypes, df.dtypes[[0, 2]]) + assert chunk._FIELDS["_dtypes"].get(chunk) is not None + assert chunk._FIELDS["_index_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.index_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_index_value"].get(chunk) is not None + 
assert chunk._FIELDS["_columns_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.columns_value.to_pandas(), pd.Index([0, 2])) + assert chunk._FIELDS["_columns_value"].get(chunk) is not None + + df2 = df[2] + df2 = tile(df2) + + chunk = df2.chunks[0].data + assert chunk._FIELDS["_index_value"].get(chunk) is None + pd.testing.assert_index_equal(chunk.index_value.to_pandas(), pd.RangeIndex(3)) + assert chunk._FIELDS["_index_value"].get(chunk) is not None diff --git a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py new file mode 100644 index 000000000..2b5a9b0d2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py @@ -0,0 +1,1851 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import mars +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None +try: + import fastparquet as fp +except ImportError: # pragma: no cover + fp = None + +from .... import dataframe as md +from .... import tensor as mt +from ....utils import pd_release_version +from ...datasource.read_csv import DataFrameReadCSV +from ...datasource.read_parquet import DataFrameReadParquet +from ...datasource.read_sql import DataFrameReadSQL + +_allow_set_missing_list = pd_release_version[:2] >= (1, 1) + + +@pytest.mark.parametrize("chunk_size", [2, (2, 3)]) +def test_set_index(setup, chunk_size): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + + df2 = md.DataFrame(df1, chunk_size=chunk_size) + + expected = df1.set_index("y", drop=True) + df3 = df2.set_index("y", drop=True) + pd.testing.assert_frame_equal(expected, df3.execute().fetch()) + + expected = df1.set_index("y", drop=False) + df4 = df2.set_index("y", drop=False) + pd.testing.assert_frame_equal(expected, df4.execute().fetch()) + + expected = df1.set_index("y") + df2.set_index("y", inplace=True) + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + +def test_iloc_getitem(setup): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + # plain index + expected = df1.iloc[1] + df3 = df2.iloc[1] + result = df3.execute(extra_config={"check_series_name": False}).fetch() + pd.testing.assert_series_equal(expected, result) + + # plain index on axis 1 + expected = df1.iloc[:2, 1] + df4 = df2.iloc[:2, 1] + pd.testing.assert_series_equal(expected, df4.execute().fetch()) + + # slice index + expected = df1.iloc[:, 2:4] + df5 = df2.iloc[:, 2:4] + pd.testing.assert_frame_equal(expected, df5.execute().fetch()) + + # plain fancy index + expected = df1.iloc[[0], [0, 1, 2]] + df6 = df2.iloc[[0], [0, 1, 2]] + pd.testing.assert_frame_equal(expected, df6.execute().fetch()) + + # plain fancy index with shuffled 
order + expected = df1.iloc[[0], [1, 2, 0]] + df7 = df2.iloc[[0], [1, 2, 0]] + pd.testing.assert_frame_equal(expected, df7.execute().fetch()) + + # fancy index + expected = df1.iloc[[1, 2], [0, 1, 2]] + df8 = df2.iloc[[1, 2], [0, 1, 2]] + pd.testing.assert_frame_equal(expected, df8.execute().fetch()) + + # fancy index with shuffled order + expected = df1.iloc[[2, 1], [1, 2, 0]] + df9 = df2.iloc[[2, 1], [1, 2, 0]] + pd.testing.assert_frame_equal(expected, df9.execute().fetch()) + + # one fancy index + expected = df1.iloc[[2, 1]] + df10 = df2.iloc[[2, 1]] + pd.testing.assert_frame_equal(expected, df10.execute().fetch()) + + # plain index + expected = df1.iloc[1, 2] + df11 = df2.iloc[1, 2] + assert expected == df11.execute().fetch() + + # bool index array + expected = df1.iloc[[True, False, True], [2, 1]] + df12 = df2.iloc[[True, False, True], [2, 1]] + pd.testing.assert_frame_equal(expected, df12.execute().fetch()) + + # bool index array on axis 1 + expected = df1.iloc[[2, 1], [True, False, True]] + df14 = df2.iloc[[2, 1], [True, False, True]] + pd.testing.assert_frame_equal(expected, df14.execute().fetch()) + + # bool index + expected = df1.iloc[[True, False, True], [2, 1]] + df13 = df2.iloc[md.Series([True, False, True], chunk_size=1), [2, 1]] + pd.testing.assert_frame_equal(expected, df13.execute().fetch()) + + # test Series + data = pd.Series(np.arange(10)) + series = md.Series(data, chunk_size=3).iloc[:3] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[:3]) + + series = md.Series(data, chunk_size=3).iloc[4] + assert series.execute().fetch() == data.iloc[4] + + series = md.Series(data, chunk_size=3).iloc[[2, 3, 4, 9]] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[[2, 3, 4, 9]]) + + series = md.Series(data, chunk_size=3).iloc[[4, 3, 9, 2]] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[[4, 3, 9, 2]]) + + series = md.Series(data).iloc[5:] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[5:]) + + # bool index array + selection = np.random.RandomState(0).randint(2, size=10, dtype=bool) + series = md.Series(data).iloc[selection] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[selection]) + + # bool index + series = md.Series(data).iloc[md.Series(selection, chunk_size=4)] + pd.testing.assert_series_equal(series.execute().fetch(), data.iloc[selection]) + + # test index + data = pd.Index(np.arange(10)) + index = md.Index(data, chunk_size=3)[:3] + pd.testing.assert_index_equal(index.execute().fetch(), data[:3]) + + index = md.Index(data, chunk_size=3)[4] + assert index.execute().fetch() == data[4] + + index = md.Index(data, chunk_size=3)[[2, 3, 4, 9]] + pd.testing.assert_index_equal(index.execute().fetch(), data[[2, 3, 4, 9]]) + + index = md.Index(data, chunk_size=3)[[4, 3, 9, 2]] + pd.testing.assert_index_equal(index.execute().fetch(), data[[4, 3, 9, 2]]) + + index = md.Index(data)[5:] + pd.testing.assert_index_equal(index.execute().fetch(), data[5:]) + + # bool index array + selection = np.random.RandomState(0).randint(2, size=10, dtype=bool) + index = md.Index(data)[selection] + pd.testing.assert_index_equal(index.execute().fetch(), data[selection]) + + index = md.Index(data)[mt.tensor(selection, chunk_size=4)] + pd.testing.assert_index_equal(index.execute().fetch(), data[selection]) + + +def test_iloc_setitem(setup): + df1 = pd.DataFrame( + [[1, 3, 3], [4, 2, 6], [7, 8, 9]], + index=["a1", "a2", "a3"], + columns=["x", "y", "z"], + ) + df2 = md.DataFrame(df1, chunk_size=2) + + # plain 
index + expected = df1 + expected.iloc[1] = 100 + df2.iloc[1] = 100 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # slice index + expected.iloc[:, 2:4] = 1111 + df2.iloc[:, 2:4] = 1111 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # plain fancy index + expected.iloc[[0], [0, 1, 2]] = 2222 + df2.iloc[[0], [0, 1, 2]] = 2222 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # fancy index + expected.iloc[[1, 2], [0, 1, 2]] = 3333 + df2.iloc[[1, 2], [0, 1, 2]] = 3333 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # plain index + expected.iloc[1, 2] = 4444 + df2.iloc[1, 2] = 4444 + pd.testing.assert_frame_equal(expected, df2.execute().fetch()) + + # test Series + data = pd.Series(np.arange(10)) + series = md.Series(data, chunk_size=3) + series.iloc[:3] = 1 + data.iloc[:3] = 1 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + series.iloc[4] = 2 + data.iloc[4] = 2 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + series.iloc[[2, 3, 4, 9]] = 3 + data.iloc[[2, 3, 4, 9]] = 3 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + series.iloc[5:] = 4 + data.iloc[5:] = 4 + pd.testing.assert_series_equal(series.execute().fetch(), data) + + # test Index + data = pd.Index(np.arange(10)) + index = md.Index(data, chunk_size=3) + with pytest.raises(TypeError): + index[5:] = 4 + + +def test_loc_getitem(setup): + rs = np.random.RandomState(0) + # index and columns are labels + raw1 = pd.DataFrame( + rs.randint(10, size=(5, 4)), + index=["a1", "a2", "a3", "a4", "a5"], + columns=["a", "b", "c", "d"], + ) + # columns are labels + raw2 = raw1.copy() + raw2.reset_index(inplace=True, drop=True) + # columns are non unique and monotonic + raw3 = raw1.copy() + raw3.columns = ["a", "b", "b", "d"] + # columns are non unique and non monotonic + raw4 = raw1.copy() + raw4.columns = ["b", "a", "b", "d"] + # index that is timestamp + raw5 = raw1.copy() + raw5.index = pd.date_range("2020-1-1", periods=5) + raw6 = raw1[:0] + + df1 = md.DataFrame(raw1, chunk_size=2) + df2 = md.DataFrame(raw2, chunk_size=2) + df3 = md.DataFrame(raw3, chunk_size=2) + df4 = md.DataFrame(raw4, chunk_size=2) + df5 = md.DataFrame(raw5, chunk_size=2) + df6 = md.DataFrame(raw6) + + df = df2.loc[3, "b"] + result = df.execute().fetch() + expected = raw2.loc[3, "b"] + assert result == expected + + df = df1.loc["a3", "b"] + result = df.execute(extra_config={"check_shape": False}).fetch() + expected = raw1.loc["a3", "b"] + assert result == expected + + # test empty list + df = df1.loc[[]] + result = df.execute().fetch() + expected = raw1.loc[[]] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[[]] + result = df.execute().fetch() + expected = raw2.loc[[]] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[1:4] + result = df.execute().fetch() + expected = raw2.loc[1:4] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[1:4, "b":"d"] + result = df.execute().fetch() + expected = raw2.loc[1:4, "b":"d"] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[:4, "b":] + result = df.execute().fetch() + expected = raw2.loc[:4, "b":] + pd.testing.assert_frame_equal(result, expected) + + # slice on axis index whose index_value does not have value + df = df1.loc["a2":"a4", "b":] + result = df.execute().fetch() + expected = raw1.loc["a2":"a4", "b":] + pd.testing.assert_frame_equal(result, expected) + + df = df2.loc[:, "b"] + result = df.execute().fetch() + expected = 
raw2.loc[:, "b"] + pd.testing.assert_series_equal(result, expected) + df = df2.loc[:, ["b", "a"]] + result = df.execute().fetch() + expected = raw2.loc[:, ["b", "a"]] + pd.testing.assert_frame_equal(result, expected) + + # 'b' is non-unique + df = df3.loc[:, "b"] + result = df.execute().fetch() + expected = raw3.loc[:, "b"] + pd.testing.assert_frame_equal(result, expected) + + # 'b' is non-unique, and non-monotonic + df = df4.loc[:, "b"] + result = df.execute().fetch() + expected = raw4.loc[:, "b"] + pd.testing.assert_frame_equal(result, expected) + + # label on axis 0 + df = df1.loc["a2", :] + result = df.execute().fetch() + expected = raw1.loc["a2", :] + pd.testing.assert_series_equal(result, expected) + + # label-based fancy index + df = df2.loc[[3, 0, 1], ["c", "a", "d"]] + result = df.execute().fetch() + expected = raw2.loc[[3, 0, 1], ["c", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + df = df2[df2["a"] < 10] + df = df.loc[[3, 0, 1], ["c", "a", "d"]] + result = df.execute().fetch() + expected = raw2.loc[[3, 0, 1], ["c", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # label-based fancy index, asc sorted + df = df2.loc[[0, 1, 3], ["a", "c", "d"]] + result = df.execute().fetch() + expected = raw2.loc[[0, 1, 3], ["a", "c", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # label-based fancy index in which non-unique exists + selection = rs.randint(2, size=(5,), dtype=bool) + df = df3.loc[selection, ["b", "a", "d"]] + result = df.execute().fetch() + expected = raw3.loc[selection, ["b", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + df = df3.loc[md.Series(selection), ["b", "a", "d"]] + result = df.execute().fetch() + expected = raw3.loc[selection, ["b", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # label-based fancy index on index + # whose index_value does not have value + df = df1.loc[["a3", "a1"], ["b", "a", "d"]] + result = df.execute(extra_config={"check_nsplits": False}).fetch() + expected = raw1.loc[["a3", "a1"], ["b", "a", "d"]] + pd.testing.assert_frame_equal(result, expected) + + # get timestamp by str + df = df5.loc["20200101"] + result = df.execute(extra_config={"check_series_name": False}).fetch( + extra_config={"check_series_name": False} + ) + expected = raw5.loc["20200101"] + pd.testing.assert_series_equal(result, expected) + + # get timestamp by str, return scalar + df = df5.loc["2020-1-1", "c"] + result = df.execute().fetch() + expected = raw5.loc["2020-1-1", "c"] + assert result == expected + + # test empty df + df = df6.loc[[]] + result = df.execute().fetch() + expected = raw6.loc[[]] + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.pd_compat +def test_dataframe_getitem(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + data2 = data.copy() + data2.index = pd.date_range("2020-1-1", periods=10) + mdf = md.DataFrame(data2, chunk_size=3) + + series1 = df["c2"] + pd.testing.assert_series_equal(series1.execute().fetch(), data["c2"]) + + series2 = df["c5"] + pd.testing.assert_series_equal(series2.execute().fetch(), data["c5"]) + + df1 = df[["c1", "c2", "c3"]] + pd.testing.assert_frame_equal(df1.execute().fetch(), data[["c1", "c2", "c3"]]) + + df2 = df[["c3", "c2", "c1"]] + pd.testing.assert_frame_equal(df2.execute().fetch(), data[["c3", "c2", "c1"]]) + + df3 = df[["c1"]] + pd.testing.assert_frame_equal(df3.execute().fetch(), data[["c1"]]) + + df4 = df[["c3", "c1", "c2", "c1"]] + 
pd.testing.assert_frame_equal(df4.execute().fetch(), data[["c3", "c1", "c2", "c1"]]) + + df5 = df[np.array(["c1", "c2", "c3"])] + pd.testing.assert_frame_equal(df5.execute().fetch(), data[["c1", "c2", "c3"]]) + + df6 = df[["c3", "c2", "c1"]] + pd.testing.assert_frame_equal(df6.execute().fetch(), data[["c3", "c2", "c1"]]) + + df7 = df[1:7:2] + pd.testing.assert_frame_equal(df7.execute().fetch(), data[1:7:2]) + + df8 = df[["c1", "c1"]]["c1"] + pd.testing.assert_frame_equal(df8.execute().fetch(), data[["c1", "c1"]]["c1"]) + + series3 = df["c1"][0] + assert series3.execute().fetch() == data["c1"][0] + + df8 = mdf[3:7] + pd.testing.assert_frame_equal(df8.execute().fetch(), data2[3:7]) + + df9 = mdf["2020-1-2":"2020-1-5"] + pd.testing.assert_frame_equal(df9.execute().fetch(), data2["2020-1-2":"2020-1-5"]) + + +def test_dataframe_getitem_bool(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + mask_data = data.c1 > 0.5 + mask = md.Series(mask_data, chunk_size=2) + + # getitem by mars series + assert df[mask].execute().fetch().shape == data[mask_data].shape + pd.testing.assert_frame_equal(df[mask].execute().fetch(), data[mask_data]) + + # getitem by pandas series + pd.testing.assert_frame_equal(df[mask_data].execute().fetch(), data[mask_data]) + + # getitem by mars series with alignment but no shuffle + mask_data = pd.Series( + [True, True, True, False, False, True, True, False, False, True], + index=range(9, -1, -1), + ) + mask = md.Series(mask_data, chunk_size=2) + pd.testing.assert_frame_equal(df[mask].execute().fetch(), data[mask_data]) + + # getitem by mars series with shuffle alignment + mask_data = pd.Series( + [True, True, True, False, False, True, True, False, False, True], + index=[0, 3, 6, 2, 9, 8, 5, 7, 1, 4], + ) + mask = md.Series(mask_data, chunk_size=2) + pd.testing.assert_frame_equal( + df[mask].execute().fetch().sort_index(), data[mask_data] + ) + + # getitem by mars series with shuffle alignment and extra element + mask_data = pd.Series( + [True, True, True, False, False, True, True, False, False, True, False], + index=[0, 3, 6, 2, 9, 8, 5, 7, 1, 4, 10], + ) + mask = md.Series(mask_data, chunk_size=2) + pd.testing.assert_frame_equal( + df[mask].execute().fetch().sort_index(), data[mask_data] + ) + + # getitem by DataFrame with all bool columns + r = df[df > 0.5] + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, data[data > 0.5]) + + # getitem by tensor mask + r = df[(df["c1"] > 0.5).to_tensor()] + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, data[data["c1"] > 0.5]) + + # test input data with unknown shape + data = pd.DataFrame(np.random.rand(10, 2), columns=["c1", "c2"]) + mask_data = data[data["c2"] > 0.5] + df = md.DataFrame(data, chunk_size=2) + s = md.Series(mask_data["c1"] > 0.5, chunk_size=2) + df1 = df[df["c2"] > 0.5] + s._index_value = df1.index_value + r = df1[s] + pd.testing.assert_frame_equal(r.execute().fetch(), mask_data[mask_data["c1"] > 0.5]) + + +def test_dataframe_getitem_using_attr(setup): + data = pd.DataFrame( + np.random.rand(10, 5), columns=["c1", "c2", "key", "dtypes", "size"] + ) + df = md.DataFrame(data, chunk_size=2) + + series1 = df.c2 + pd.testing.assert_series_equal(series1.execute().fetch(), data.c2) + + # accessing column using attribute shouldn't overwrite existing attributes + assert df.key == getattr(getattr(df, "_data"), "_key") + assert df.size == data.size + pd.testing.assert_series_equal(df.dtypes, data.dtypes) + 
+ # accessing non-existing attributes should trigger exception + with pytest.raises(AttributeError): + _ = df.zzz # noqa: F841 + + +def test_series_getitem(setup): + data = pd.Series(np.random.rand(10)) + series = md.Series(data) + assert series[1].execute().fetch() == data[1] + + data = pd.Series(np.random.rand(10), name="a") + series = md.Series(data, chunk_size=4) + + for i in range(10): + series1 = series[i] + assert series1.execute().fetch() == data[i] + + series2 = series[[0, 1, 2, 3, 4]] + pd.testing.assert_series_equal(series2.execute().fetch(), data[[0, 1, 2, 3, 4]]) + + series3 = series[[4, 3, 2, 1, 0]] + pd.testing.assert_series_equal(series3.execute().fetch(), data[[4, 3, 2, 1, 0]]) + + series4 = series[[1, 2, 3, 2, 1, 0]] + pd.testing.assert_series_equal(series4.execute().fetch(), data[[1, 2, 3, 2, 1, 0]]) + # + index = ["i" + str(i) for i in range(20)] + data = pd.Series(np.random.rand(20), index=index, name="a") + series = md.Series(data, chunk_size=3) + + for idx in index: + series1 = series[idx] + assert series1.execute().fetch() == data[idx] + + selected = ["i1", "i2", "i3", "i4", "i5"] + series2 = series[selected] + pd.testing.assert_series_equal(series2.execute().fetch(), data[selected]) + + selected = ["i4", "i7", "i0", "i1", "i5"] + series3 = series[selected] + pd.testing.assert_series_equal(series3.execute().fetch(), data[selected]) + + selected = ["i0", "i1", "i5", "i4", "i0", "i1"] + series4 = series[selected] + pd.testing.assert_series_equal(series4.execute().fetch(), data[selected]) + + selected = ["i0"] + series5 = series[selected] + pd.testing.assert_series_equal(series5.execute().fetch(), data[selected]) + + data = pd.Series(np.random.rand(10)) + series = md.Series(data, chunk_size=3) + selected = series[:2] + pd.testing.assert_series_equal(selected.execute().fetch(), data[:2]) + + selected = series[2:8:2] + pd.testing.assert_series_equal(selected.execute().fetch(), data[2:8:2]) + + data = pd.Series(np.random.rand(9), index=["c" + str(i) for i in range(9)]) + series = md.Series(data, chunk_size=3) + selected = series[:"c2"] + pd.testing.assert_series_equal(selected.execute().fetch(), data[:"c2"]) + selected = series["c2":"c9"] + pd.testing.assert_series_equal(selected.execute().fetch(), data["c2":"c9"]) + + +def test_head(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + pd.testing.assert_frame_equal(df.head().execute().fetch(), data.head()) + pd.testing.assert_frame_equal(df.head(3).execute().fetch(), data.head(3)) + pd.testing.assert_frame_equal(df.head(-3).execute().fetch(), data.head(-3)) + pd.testing.assert_frame_equal(df.head(8).execute().fetch(), data.head(8)) + pd.testing.assert_frame_equal(df.head(-8).execute().fetch(), data.head(-8)) + pd.testing.assert_frame_equal(df.head(13).execute().fetch(), data.head(13)) + pd.testing.assert_frame_equal(df.head(-13).execute().fetch(), data.head(-13)) + + +def test_tail(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=2) + + pd.testing.assert_frame_equal(df.tail().execute().fetch(), data.tail()) + pd.testing.assert_frame_equal(df.tail(3).execute().fetch(), data.tail(3)) + pd.testing.assert_frame_equal(df.tail(-3).execute().fetch(), data.tail(-3)) + pd.testing.assert_frame_equal(df.tail(8).execute().fetch(), data.tail(8)) + pd.testing.assert_frame_equal(df.tail(-8).execute().fetch(), data.tail(-8)) + 
pd.testing.assert_frame_equal(df.tail(13).execute().fetch(), data.tail(13)) + pd.testing.assert_frame_equal(df.tail(-13).execute().fetch(), data.tail(-13)) + + +def test_at(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + df = md.DataFrame(data, chunk_size=3) + data2 = data.copy() + data2.index = np.arange(10) + df2 = md.DataFrame(data2, chunk_size=3) + + with pytest.raises(ValueError): + _ = df.at[["i3, i4"], "c1"] + + result = df.at["i3", "c1"].execute().fetch() + assert result == data.at["i3", "c1"] + + result = df["c1"].at["i2"].execute().fetch() + assert result == data["c1"].at["i2"] + + result = df2.at[3, "c2"].execute().fetch() + assert result == data2.at[3, "c2"] + + result = df2.loc[3].at["c2"].execute().fetch() + assert result == data2.loc[3].at["c2"] + + +def test_iat(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + df = md.DataFrame(data, chunk_size=3) + + with pytest.raises(ValueError): + _ = df.iat[[1, 2], 3] + + result = df.iat[3, 4].execute().fetch() + assert result == data.iat[3, 4] + + result = df.iloc[:, 2].iat[3].execute().fetch() + assert result == data.iloc[:, 2].iat[3] + + +@pytest.mark.pd_compat +def test_setitem(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + data2 = np.random.rand(10) + data3 = np.random.rand(10, 2) + df = md.DataFrame(data, chunk_size=3) + + df["c3"] = df["c3"] + 1 + df["c10"] = 10 + df[4] = mt.tensor(data2, chunk_size=4) + df["d1"] = df["c4"].mean() + df["e1"] = data2 * 2 + + result = df.execute().fetch() + expected = data.copy() + expected["c3"] = expected["c3"] + 1 + expected["c10"] = 10 + expected[4] = data2 + expected["d1"] = data["c4"].mean() + expected["e1"] = data2 * 2 + pd.testing.assert_frame_equal(result, expected) + + # test set multiple cols with scalar + df = md.DataFrame(data, chunk_size=3) + df[["c0", "c2"]] = 1 + df[["c1", "c10"]] = df["c4"].mean() + df[["c11", "c12"]] = mt.tensor(data3, chunk_size=4) + + result = df.execute().fetch() + if not _allow_set_missing_list: + expected = data.copy().reindex( + ["c" + str(i) for i in range(5)] + ["c10", "c11", "c12"], + axis=1, + ) + else: + expected = data.copy() + expected[["c0", "c2"]] = 1 + expected[["c1", "c10"]] = expected["c4"].mean() + expected[["c11", "c12"]] = data3 + pd.testing.assert_frame_equal(result, expected) + + # test set multiple rows + df = md.DataFrame(data, chunk_size=3) + df[["c1", "c4", "c10"]] = df[["c2", "c3", "c4"]] * 2 + + result = df.execute().fetch() + expected = data.copy() + expected[["c1", "c4", "c10"]] = expected[["c2", "c3", "c4"]] * 2 + pd.testing.assert_frame_equal(result, expected) + + # test setitem into empty DataFrame + df = md.DataFrame() + df["a"] = md.Series(np.arange(1, 11), chunk_size=3) + pd.testing.assert_index_equal(df.index_value.to_pandas(), pd.RangeIndex(10)) + + result = df.execute().fetch() + expected = pd.DataFrame() + expected["a"] = pd.Series(np.arange(1, 11)) + pd.testing.assert_frame_equal(result, expected) + + df["b"] = md.Series(np.arange(2, 12), index=pd.RangeIndex(1, 11), chunk_size=3) + result = df.execute().fetch() + expected["b"] = pd.Series(np.arange(2, 12), index=pd.RangeIndex(1, 11)) + pd.testing.assert_frame_equal(result, expected) + + # test set multiple item order + data = pd.DataFrame( + [list(range(5))] * 10, + 
columns=["cc" + str(i) for i in range(5)], + index=["i" + str(i) for i in range(10)], + ) + df = md.DataFrame(data, chunk_size=3) + df2 = df.apply( + lambda x: x * 2, + axis=1, + result_type="expand", + dtypes=[np.int64, np.int64, np.int64, np.int64, np.int64], + output_type="dataframe", + ) + columns = ["dd" + str(i) for i in range(5)] + columns[1] = "cc2" + columns[3] = "cc1" + columns[4] = "cc3" + df2.columns = columns + df[columns] = df2[columns] + result = df.execute().fetch() + df2 = data.apply(lambda x: x * 2) + df2.columns = columns + data[columns] = df2[columns] + pd.testing.assert_frame_equal(result, data) + + +def test_reset_index_execution(setup): + data = pd.DataFrame( + [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], + index=["falcon", "parrot", "lion", "monkey"], + columns=("class", "max_speed"), + ) + df = md.DataFrame(data) + df2 = df.reset_index() + result = df2.execute().fetch() + expected = data.reset_index() + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(data, chunk_size=2) + df2 = df.reset_index() + result = df2.execute().fetch() + expected = data.reset_index() + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(data, chunk_size=1) + df2 = df.reset_index(drop=True) + result = df2.execute().fetch() + expected = data.reset_index(drop=True) + pd.testing.assert_frame_equal(result, expected) + + index = pd.MultiIndex.from_tuples( + [ + ("bird", "falcon"), + ("bird", "parrot"), + ("mammal", "lion"), + ("mammal", "monkey"), + ], + names=["class", "name"], + ) + data = pd.DataFrame( + [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)], + index=index, + columns=("type", "max_speed"), + ) + df = md.DataFrame(data, chunk_size=1) + df2 = df.reset_index(level="class") + result = df2.execute().fetch() + expected = data.reset_index(level="class") + pd.testing.assert_frame_equal(result, expected) + + columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")]) + data.columns = columns + df = md.DataFrame(data, chunk_size=2) + df2 = df.reset_index(level="class", col_level=1, col_fill="species") + result = df2.execute().fetch() + expected = data.reset_index(level="class", col_level=1, col_fill="species") + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(data, chunk_size=3) + df.reset_index(level="class", col_level=1, col_fill="species", inplace=True) + result = df.execute().fetch() + expected = data.reset_index(level="class", col_level=1, col_fill="species") + pd.testing.assert_frame_equal(result, expected) + + # Test Series + + s = pd.Series( + [1, 2, 3, 4], name="foo", index=pd.Index(["a", "b", "c", "d"], name="idx") + ) + + series = md.Series(s) + s2 = series.reset_index(name="bar") + result = s2.execute().fetch() + expected = s.reset_index(name="bar") + pd.testing.assert_frame_equal(result, expected) + + series = md.Series(s, chunk_size=2) + s2 = series.reset_index(drop=True) + result = s2.execute().fetch() + expected = s.reset_index(drop=True) + pd.testing.assert_series_equal(result, expected) + + # Test Unknown shape + data1 = pd.DataFrame(np.random.rand(10, 3), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9]) + df1 = md.DataFrame(data1, chunk_size=5) + data2 = pd.DataFrame(np.random.rand(10, 3), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3]) + df2 = md.DataFrame(data2, chunk_size=6) + df = (df1 + df2).reset_index(incremental_index=True) + result = df.execute().fetch() + pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) + # Inconsistent with Pandas when 
input dataframe's shape is unknown. + result = result.sort_values(by=result.columns[0]) + expected = (data1 + data2).reset_index() + np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) + + data1 = pd.Series( + np.random.rand(10), + index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], + ) + series1 = md.Series(data1, chunk_size=3) + data2 = pd.Series( + np.random.rand(10), + index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], + ) + series2 = md.Series(data2, chunk_size=3) + df = (series1 + series2).reset_index(incremental_index=True) + result = df.execute().fetch() + pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) + # Inconsistent with Pandas when input dataframe's shape is unknown. + result = result.sort_values(by=result.columns[0]) + expected = (data1 + data2).reset_index() + np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) + + series1 = md.Series(data1, chunk_size=3) + series1.reset_index(inplace=True, drop=True) + result = series1.execute().fetch() + pd.testing.assert_index_equal(result.index, pd.RangeIndex(10)) + + # case from https://github.com/mars-project/mars/issues/1286 + data = pd.DataFrame(np.random.rand(10, 3), columns=list("abc")) + df = md.DataFrame(data, chunk_size=3) + + r = df.sort_values("a").reset_index(drop=True, incremental_index=True) + result = r.execute().fetch() + expected = data.sort_values("a").reset_index(drop=True) + pd.testing.assert_frame_equal(result, expected) + + +def test_rename(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + df = md.DataFrame(raw, chunk_size=3) + + with pytest.warns(Warning): + df.rename(str, errors="raise") + + with pytest.raises(NotImplementedError): + df.rename({"A": "a", "B": "b"}, axis=1, copy=False) + + r = df.rename(str) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.rename(str)) + + r = df.rename({"A": "a", "B": "b"}, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.rename({"A": "a", "B": "b"}, axis=1) + ) + + df.rename({"A": "a", "B": "b"}, axis=1, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename({"A": "a", "B": "b"}, axis=1) + ) + + raw = pd.DataFrame( + rs.rand(10, 4), + columns=pd.MultiIndex.from_tuples( + (("A", "C"), ("A", "D"), ("B", "E"), ("B", "F")) + ), + ) + df = md.DataFrame(raw, chunk_size=3) + + r = df.rename({"C": "a", "D": "b"}, level=1, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.rename({"C": "a", "D": "b"}, level=1, axis=1) + ) + + raw = pd.Series(rs.rand(10), name="series") + series = md.Series(raw, chunk_size=3) + + r = series.rename("new_series") + pd.testing.assert_series_equal(r.execute().fetch(), raw.rename("new_series")) + + r = series.rename(lambda x: 2**x) + pd.testing.assert_series_equal(r.execute().fetch(), raw.rename(lambda x: 2**x)) + + with pytest.raises(TypeError): + series.name = {1: 10, 2: 20} + + series.name = "new_series" + pd.testing.assert_series_equal(series.execute().fetch(), raw.rename("new_series")) + + raw = pd.MultiIndex.from_frame(pd.DataFrame(rs.rand(10, 2), columns=["A", "B"])) + idx = md.Index(raw) + + r = idx.rename(["C", "D"]) + pd.testing.assert_index_equal(r.execute().fetch(), raw.rename(["C", "D"])) + + r = idx.set_names("C", level=0) + pd.testing.assert_index_equal(r.execute().fetch(), raw.set_names("C", level=0)) + + +def test_rename_axis(setup): + rs = np.random.RandomState(0) + + # test dataframe cases + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + df = md.DataFrame(raw, 
chunk_size=3) + + r = df.rename_axis("idx") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.rename_axis("idx")) + + r = df.rename_axis("cols", axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.rename_axis("cols", axis=1)) + + df.rename_axis("c", axis=1, inplace=True) + pd.testing.assert_frame_equal(df.execute().fetch(), raw.rename_axis("c", axis=1)) + + df.columns.name = "df_cols" + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename_axis("df_cols", axis=1) + ) + + # test dataframe cases with MultiIndex + raw = pd.DataFrame( + rs.rand(10, 4), + columns=pd.MultiIndex.from_tuples([("A", 1), ("B", 2), ("C", 3), ("D", 4)]), + ) + df = md.DataFrame(raw, chunk_size=3) + + df.columns.names = ["c1", "c2"] + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename_axis(["c1", "c2"], axis=1) + ) + + df.columns.set_names("c2_1", level=1, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), raw.rename_axis(["c1", "c2_1"], axis=1) + ) + + # test series cases + raw = pd.Series(rs.rand(10)) + s = md.Series(raw, chunk_size=3) + + r = s.rename_axis("idx") + pd.testing.assert_series_equal(r.execute().fetch(), raw.rename_axis("idx")) + + s.index.name = "series_idx" + pd.testing.assert_series_equal(s.execute().fetch(), raw.rename_axis("series_idx")) + + +def test_insert(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + + with pytest.raises(ValueError): + tensor = mt.tensor(rs.rand(10, 10), chunk_size=4) + df = md.DataFrame(raw.copy(deep=True), chunk_size=3) + df.insert(4, "E", tensor) + + df = md.DataFrame(raw.copy(deep=True), chunk_size=3) + df.insert(4, "E", 0) + raw_dup = raw.copy(deep=True) + raw_dup.insert(4, "E", 0) + pd.testing.assert_frame_equal(df.execute().fetch(), raw_dup) + + raw_tensor = rs.rand(10) + tensor = mt.tensor(raw_tensor, chunk_size=4) + df = md.DataFrame(raw.copy(deep=True), chunk_size=3) + df.insert(4, "E", tensor) + raw_dup = raw.copy(deep=True) + raw_dup.insert(4, "E", raw_tensor) + pd.testing.assert_frame_equal(df.execute().fetch(), raw_dup) + + +def _wrap_execute_data_source(limit, op_cls): + def _execute_data_source(ctx, op): + op_cls.execute(ctx, op) + result = ctx[op.outputs[0].key] + if len(result) > limit: + raise RuntimeError("have data more than expected") # pragma: no cover + + return _execute_data_source + + +def _wrap_execute_data_source_usecols(usecols, op_cls): + def _execute_data_source(ctx, op): # pragma: no cover + op_cls.execute(ctx, op) + result = ctx[op.outputs[0].key] + if not isinstance(usecols, list): + if not isinstance(result, pd.Series): + raise RuntimeError(f"Out data should be a Series, got {type(result)}") + elif len(result.columns) > len(usecols): + params = dict( + (k, getattr(op, k, None)) + for k in op._keys_ + if k not in op._no_copy_attrs_ + ) + raise RuntimeError( + f"have data more than expected, got {result.columns}, " + f"result {result}, op params {params}" + ) + + return _execute_data_source + + +def _wrap_execute_data_source_mixed(limit, usecols, op_cls): + def _execute_data_source(ctx, op): # pragma: no cover + op_cls.execute(ctx, op) + result = ctx[op.outputs[0].key] + if not isinstance(usecols, list): + if not isinstance(result, pd.Series): + raise RuntimeError("Out data should be a Series") + elif len(result.columns) > len(usecols): + raise RuntimeError("have data more than expected") + if len(result) > limit: + raise RuntimeError("have data more than expected") + + return _execute_data_source + + 
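+# The three ``_wrap_execute_data_source*`` helpers above substitute an operand's
+# ``execute`` with a checked version, so the optimization tests below can assert
+# that head/column pruning really limited what each read operand produced.
+# A minimal sketch of the intended wiring (illustrative only, mirroring the
+# tests that follow; the limit of 3 is an arbitrary example value):
+#
+#     executors = {DataFrameReadCSV: _wrap_execute_data_source(3, DataFrameReadCSV)}
+#     md.read_csv(filename, chunk_bytes=chunk_bytes).head(3).execute(
+#         extra_config={"operand_executors": executors}
+#     )
+#
+# If the optimizer fails to push ``head(3)`` down into the read, the wrapped
+# executor raises ``RuntimeError("have data more than expected")``.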
+@pytest.mark.skip_ray_dag # operand_executors is not supported by ray backend. +@pytest.mark.pd_compat +def test_optimization(setup): + import sqlalchemy as sa + + with tempfile.TemporaryDirectory() as tempdir: + filename = os.path.join(tempdir, "test_head.csv") + rs = np.random.RandomState(0) + pd_df = pd.DataFrame( + { + "a": rs.randint(1000, size=(2000,)).astype(np.int64), + "b": rs.randint(1000, size=(2000,)).astype(np.int64), + "c": ["sss" for _ in range(2000)], + "d": ["eeee" for _ in range(2000)], + } + ) + pd_df.to_csv(filename, index=False) + + size = os.path.getsize(filename) + chunk_bytes = size / 3 - 2 + + df = md.read_csv(filename, chunk_bytes=chunk_bytes) + + cols = ["b", "a", "c"] + r = df[cols] + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_usecols(cols, DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df[cols] + result.reset_index(drop=True, inplace=True) + pd.testing.assert_frame_equal(result, expected) + + cols = ["b", "a", "b"] + r = df[cols].head(20) + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_usecols(cols, DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df[cols].head(20) + result.reset_index(drop=True, inplace=True) + pd.testing.assert_frame_equal(result, expected) + + r = df["c"] + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_usecols("c", DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df["c"] + result.reset_index(drop=True, inplace=True) + pd.testing.assert_series_equal(result, expected) + + r = df["d"].head(3) + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source_mixed(3, "d", DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df["d"].head(3) + pd.testing.assert_series_equal(result, expected) + + # test DataFrame.head + r = df.head(3) + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source(3, DataFrameReadCSV) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + # test DataFrame.tail + r = df.tail(3) + result = r.execute().fetch() + expected = pd_df.tail(3) + pd.testing.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + # test head more than 1 chunk + r = df.head(99) + result = r.execute().fetch() + result.reset_index(drop=True, inplace=True) + expected = pd_df.head(99) + pd.testing.assert_frame_equal(result, expected) + + # test Series.tail more than 1 chunk + r = df.tail(99) + result = r.execute().fetch() + expected = pd_df.tail(99) + pd.testing.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + # test head number greater than limit + df = md.read_csv(filename, chunk_bytes=chunk_bytes) + r = df.head(1100) + + with pytest.raises(RuntimeError): + operand_executors = { + DataFrameReadCSV: _wrap_execute_data_source(3, DataFrameReadCSV) + } + r.execute(extra_config={"operand_executors": operand_executors}) + + result = r.execute().fetch() + expected = pd_df.head(1100) + pd.testing.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + filename = os.path.join(tempdir, "test_sql.db") + conn = 
sa.create_engine("sqlite:///" + filename) + pd_df.to_sql("test_sql", conn) + + df = md.read_sql("test_sql", conn, index_col="index", chunk_size=20) + + # test DataFrame.head + r = df.head(3) + operand_executors = { + DataFrameReadSQL: _wrap_execute_data_source(3, DataFrameReadSQL) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + result.index.name = None + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + # test head on read_parquet + filename = os.path.join(tempdir, "test_parquet.db") + pd_df.to_parquet(filename, index=False, compression="gzip") + + engines = [] + if pa is not None: + engines.append("pyarrow") + if fp is not None: + engines.append("fastparquet") + + for engine in engines: + df = md.read_parquet(filename, engine=engine) + r = df.head(3) + + operand_executors = { + DataFrameReadParquet: _wrap_execute_data_source(3, DataFrameReadParquet) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + dirname = os.path.join(tempdir, "test_parquet2") + os.makedirs(dirname) + pd_df[:1000].to_parquet(os.path.join(dirname, "q1.parquet")) + pd_df[1000:].to_parquet(os.path.join(dirname, "q2.parquet")) + + df = md.read_parquet(dirname) + r = df.head(3) + + operand_executors = { + DataFrameReadParquet: _wrap_execute_data_source(3, DataFrameReadParquet) + } + result = r.execute( + extra_config={"operand_executors": operand_executors} + ).fetch() + expected = pd_df.head(3) + pd.testing.assert_frame_equal(result, expected) + + +def test_reindex_execution(setup): + data = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(data, chunk_size=4) + + for enable_sparse in [True, False, None]: + r = df.reindex( + index=mt.arange(10, 1, -1, chunk_size=3), enable_sparse=enable_sparse + ) + + result = r.execute().fetch() + expected = data.reindex(index=np.arange(10, 1, -1)) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex(columns=["c5", "c6", "c2"], enable_sparse=enable_sparse) + + result = r.execute().fetch() + expected = data.reindex(columns=["c5", "c6", "c2"]) + pd.testing.assert_frame_equal(result, expected) + + for enable_sparse in [True, False]: + r = df.reindex( + index=[5, 11, 1], columns=["c5", "c6", "c2"], enable_sparse=enable_sparse + ) + + result = r.execute().fetch() + expected = data.reindex(index=[5, 11, 1], columns=["c5", "c6", "c2"]) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex( + index=mt.tensor([2, 4, 10]), + columns=["c2", "c3", "c5", "c7"], + method="bfill", + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex( + index=[2, 4, 10], columns=["c2", "c3", "c5", "c7"], method="bfill" + ) + pd.testing.assert_frame_equal(result, expected) + + for fill_value, test_fill_value in [ + (3, 3), + (df.iloc[:, 0].max(), data.iloc[:, 0].max()), + ]: + r = df.reindex( + index=mt.tensor([2, 4, 10]), + columns=["c2", "c3", "c5", "c7"], + fill_value=fill_value, + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex( + index=[2, 4, 10], + columns=["c2", "c3", "c5", "c7"], + fill_value=test_fill_value, + ) + pd.testing.assert_frame_equal(result, expected) + + # test date_range index + data = pd.DataFrame( + np.random.rand(10, 5), index=pd.date_range("2020-1-1", periods=10) + ) + df = md.DataFrame(data, chunk_size=5) + + r = df.reindex( + 
index=md.date_range("2020-1-6", periods=6), + method="ffill", + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex( + index=pd.date_range("2020-1-6", periods=6), method="ffill" + ) + pd.testing.assert_frame_equal(result, expected) + + # test MultiIndex + data = pd.DataFrame( + np.random.rand(10, 5), + index=pd.MultiIndex.from_arrays([np.arange(10), np.arange(11, 1, -1)]), + ) + df = md.DataFrame(data, chunk_size=5) + + r = df.reindex([2, 4, 9, 12], level=1, enable_sparse=enable_sparse) + + result = r.execute(extra_config={"check_shape": False}).fetch( + extra_config={"check_shape": False} + ) + expected = data.reindex([2, 4, 9, 12], level=1) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex( + mt.tensor([2, 4, 9, 12], chunk_size=2), level=1, enable_sparse=enable_sparse + ) + + result = r.execute(extra_config={"check_shape": False}).fetch( + extra_config={"check_shape": False} + ) + expected = data.reindex([2, 4, 9, 12], level=1) + pd.testing.assert_frame_equal(result, expected) + + # test duplicate index + index = np.arange(10) + index[-1] = 0 + data = pd.DataFrame(np.random.rand(10, 5), index=index) + df = md.DataFrame(data, chunk_size=5) + + with pytest.raises(ValueError): + r = df.reindex([0, 1], enable_sparse=enable_sparse) + r.execute() + + # test one chunk + data = pd.DataFrame( + np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"] + ) + df = md.DataFrame(data, chunk_size=10) + + r = df.reindex( + index=mt.arange(10, 1, -1, chunk_size=10), + fill_value=df["c1"].max(), + enable_sparse=enable_sparse, + ) + + result = r.execute().fetch() + expected = data.reindex(index=np.arange(10, 1, -1), fill_value=data["c1"].max()) + pd.testing.assert_frame_equal(result, expected) + + # test series + s_data = pd.Series(np.random.rand(10), index=[f"c{i + 1}" for i in range(10)]) + series = md.Series(s_data, chunk_size=6) + + r = series.reindex(["c2", "c11", "c4"], copy=False, enable_sparse=enable_sparse) + + result = r.execute().fetch() + expected = s_data.reindex(["c2", "c11", "c4"], copy=False) + pd.testing.assert_series_equal(result, expected) + + +def test_reindex_like_execution(setup): + data = pd.DataFrame( + np.random.rand(10, 5), + columns=["c1", "c2", "c3", "c4", "c5"], + index=pd.date_range("2021-1-1", periods=10), + ) + data2 = pd.DataFrame( + np.random.rand(4, 2), + columns=["c2", "c4"], + index=pd.date_range("2020-1-2", periods=4), + ) + df = md.DataFrame(data, chunk_size=4) + df2 = md.DataFrame(data2, chunk_size=3) + + r = df.reindex_like(df2) + result = r.execute().fetch() + expected = data.reindex_like(data2) + pd.testing.assert_frame_equal(result, expected) + + r = df.reindex_like(df, copy=False) + result = r.execute().fetch() + expected = data.reindex_like(data) + pd.testing.assert_frame_equal(result, expected) + + s = md.Series(data["c2"], chunk_size=4) + s2 = md.Series(data2["c2"], chunk_size=3) + + r = s.reindex_like(s2) + result = r.execute().fetch() + expected = data["c2"].reindex_like(data2["c2"]) + pd.testing.assert_series_equal(result, expected) + + r = s.reindex_like(s, copy=False) + result = r.execute().fetch() + expected = data["c2"].reindex_like(data["c2"]) + pd.testing.assert_series_equal(result, expected) + + +def test_where_execution(setup): + dates = pd.date_range("1/1/2000", periods=20) + + raw_df = pd.DataFrame( + np.random.randn(20, 10), index=dates, columns=list("ABCDEFGHIJ") + ) + raw_df2 = pd.DataFrame( + np.random.randn(20, 10), index=dates, columns=list("ABCDEFGHIJ") + ) + df = 
md.DataFrame(raw_df, chunk_size=6) + df2 = md.DataFrame(raw_df2, chunk_size=7) + + raw_series = pd.Series(np.random.randn(20), index=dates) + raw_series2 = pd.Series(np.random.randn(20), index=dates) + raw_series3 = pd.Series(np.random.randn(10), index=list("ABCDEFGHIJ")) + series = md.Series(raw_series, chunk_size=6) + series2 = md.Series(raw_series2, chunk_size=7) + series3 = md.Series(raw_series3, chunk_size=7) + + # tests for dataframes + with pytest.raises(NotImplementedError): + df.mask(df < 0, md.DataFrame(np.random.randn(5, 5))) + with pytest.raises(NotImplementedError): + df.mask(series < 0, md.Series(np.random.randn(5)), axis=0) + + r = df.mask(df < 0) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.mask(raw_df < 0)) + r = df.mask(raw_df < 0, df2) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.mask(raw_df < 0, raw_df2)) + + # tests for series + with pytest.raises(NotImplementedError): + series.mask(series < 0, md.Series(np.random.randn(5))) + + r = series.where(series < 0, 0) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.where(raw_series < 0, 0) + ) + r = series.where(series < 0, series2) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.where(raw_series < 0, raw_series2) + ) + + # test for dataframe with series + with pytest.raises(ValueError): + df.mask(df < 0, series) + + r = df.mask(df < 0, series, axis=0) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.mask(raw_df < 0, raw_series, axis=0) + ) + r = df.mask(series < 0, df2) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.mask(raw_series < 0, raw_df2) + ) + r = df.mask(series < 0, series3, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.mask(raw_series < 0, raw_series3, axis=1) + ) + + # test inplace + new_df = df.copy() + new_df.mask(new_df < 0, inplace=True) + pd.testing.assert_frame_equal(new_df.execute().fetch(), raw_df.mask(raw_df < 0)) + + +def test_set_axis_execution(setup): + raw_df = pd.DataFrame(np.random.rand(10, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + df = md.DataFrame(raw_df, chunk_size=3) + + # test axis=0 + idx_data = np.arange(0, 10) + np.random.shuffle(idx_data) + new_idx = md.Index(idx_data, chunk_size=4) + + r = df.set_axis(new_idx) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.set_axis(idx_data)) + + new_idx = pd.Index(range(9, -1, -1)) + r = df.set_axis(new_idx) + pd.testing.assert_frame_equal(r.execute().fetch(), raw_df.set_axis(new_idx)) + + df1 = df.copy() + df1.index = pd.Index(range(9, -1, -1)) + pd.testing.assert_frame_equal(df1.execute().fetch(), raw_df.set_axis(new_idx)) + + ser = md.Series(idx_data) + with pytest.raises(ValueError): + df.set_axis(ser[ser > 5]).execute() + + # test axis=1 + new_axis = ["a1", "a2", "a3", "a4", "a5"] + r = df.set_axis(new_axis, axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.set_axis(new_axis, axis=1) + ) + + r = df.set_axis(md.Index(new_axis, store_data=True), axis=1) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.set_axis(new_axis, axis=1) + ) + + df1 = df.copy() + df1.columns = new_axis + pd.testing.assert_frame_equal( + df1.execute().fetch(), raw_df.set_axis(new_axis, axis=1) + ) + + with pytest.raises(ValueError): + df.set_axis(["a1", "a2", "a3", "a4"], axis=1) + + # test series + raw_series = pd.Series(np.random.rand(10)) + s = md.Series(raw_series, chunk_size=3) + + idx_data = np.arange(0, 10) + np.random.shuffle(idx_data) + new_idx = md.Index(idx_data, chunk_size=4) + + r = 
s.set_axis(new_idx) + pd.testing.assert_series_equal(r.execute().fetch(), raw_series.set_axis(idx_data)) + + s1 = s.copy() + s1.index = new_idx + pd.testing.assert_series_equal(s1.execute().fetch(), raw_series.set_axis(idx_data)) + + +def test_sample_execution(setup): + rs = np.random.RandomState(0) + + # test dataframe + raw_df = pd.DataFrame(rs.rand(100, 5), columns=["c1", "c2", "c3", "c4", "c5"]) + + # test single chunk + df = md.DataFrame(raw_df) + r = df.sample(10, random_state=rs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(10, random_state=rs) + ) + r = df.sample(frac=0.1, weights="c1", random_state=rs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(frac=0.1, weights="c1", random_state=rs) + ) + r = df.sample(10, weights=df["c2"], random_state=rs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(10, weights=raw_df["c2"], random_state=rs) + ) + + r = df.sample(10, weights=df["c2"], random_state=0) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw_df.sample(10, weights=raw_df["c2"], random_state=0) + ) + + r = df.sample(10, weights=df["c2"], random_state=np.array([1, 2])) + pd.testing.assert_frame_equal( + r.execute().fetch(), + raw_df.sample(10, weights=raw_df["c2"], random_state=np.array([1, 2])), + ) + + # test multinomial tile & execution + df = md.DataFrame(raw_df, chunk_size=13) + r1 = df.sample(10, replace=True, random_state=rs) + r2 = df[:].sample(10, replace=True, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights="c2", always_multinomial=True, random_state=rs) + r2 = df[:].sample(frac=0.1, weights="c2", always_multinomial=True, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], always_multinomial=True, random_state=rs) + r2 = df[:].sample( + frac=0.1, weights=df["c2"], always_multinomial=True, random_state=rs + ) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], always_multinomial=True, random_state=0) + r2 = df[:].sample( + frac=0.1, weights=df["c2"], always_multinomial=True, random_state=0 + ) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample( + frac=0.1, + weights=df["c2"], + always_multinomial=True, + random_state=np.array([1, 2]), + ) + r2 = df[:].sample( + frac=0.1, + weights=df["c2"], + always_multinomial=True, + random_state=np.array([1, 2]), + ) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + # test reservoir tile & execution + df = md.DataFrame(raw_df, chunk_size=13) + r1 = df.sample(90, random_state=rs) + r2 = df[:].sample(90, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(10, random_state=rs) + r2 = df[:].sample(10, random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights="c2", random_state=rs) + r2 = df[:].sample(frac=0.1, weights="c2", random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], random_state=rs) + r2 = df[:].sample(frac=0.1, weights=df["c2"], random_state=rs) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], random_state=0) + r2 = df[:].sample(frac=0.1, 
weights=df["c2"], random_state=0) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = df.sample(frac=0.1, weights=df["c2"], random_state=np.array([1, 2])) + r2 = df[:].sample(frac=0.1, weights=df["c2"], random_state=np.array([1, 2])) + pd.testing.assert_frame_equal(r1.execute().fetch(), r2.execute().fetch()) + + # test series + raw_series = pd.Series(rs.rand(100)) + raw_weights = pd.Series(rs.rand(100)) + + # test single chunk + s = md.Series(raw_series) + r = s.sample(10, random_state=rs) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.sample(10, random_state=rs) + ) + weights = md.Series(raw_weights, chunk_size=13) + r = s.sample(10, weights=weights, random_state=rs) + pd.testing.assert_series_equal( + r.execute().fetch(), raw_series.sample(10, weights=raw_weights, random_state=rs) + ) + + # test multinomial tile & execution + s = md.Series(raw_series, chunk_size=13) + weights = md.Series(raw_weights, chunk_size=13) + + r1 = s.sample(10, replace=True, random_state=rs) + r2 = s[:].sample(10, replace=True, random_state=rs) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = s.sample(frac=0.1, weights=weights, always_multinomial=True, random_state=rs) + r2 = s[:].sample( + frac=0.1, weights=weights, always_multinomial=True, random_state=rs + ) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + # test reservoir tile & execution + r1 = s.sample(10, random_state=rs) + r2 = s[:].sample(10, random_state=rs) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + r1 = s.sample(frac=0.1, weights=weights, random_state=rs) + r2 = s[:].sample(frac=0.1, weights=weights, random_state=rs) + pd.testing.assert_series_equal(r1.execute().fetch(), r2.execute().fetch()) + + +def test_loc_setitem(setup): + raw_df = pd.DataFrame({"a": [1, 2, 3, 4, 2, 4, 5, 7, 2, 8, 9], 1: [10] * 11}) + md_data = md.DataFrame(raw_df, chunk_size=3) + md_data.loc[md_data["a"] <= 4, 1] = "v1" + pd_data = raw_df.copy(True) + pd_data.loc[pd_data["a"] <= 4, 1] = "v1" + pd.testing.assert_frame_equal(md_data.to_pandas(), pd_data) + + md_data1 = md.DataFrame(raw_df, chunk_size=3) + md_data1.loc[1:3] = "v2" + pd_data1 = raw_df.copy(True) + pd_data1.loc[1:3] = "v2" + pd.testing.assert_frame_equal(md_data1.to_pandas(), pd_data1) + + md_data2 = md.DataFrame(raw_df, chunk_size=3) + md_data2.loc[1:3, 1] = "v2" + pd_data2 = raw_df.copy(True) + pd_data2.loc[1:3, 1] = "v2" + pd.testing.assert_frame_equal(md_data2.to_pandas(), pd_data2) + + +def test_add_prefix_suffix(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame(rs.rand(10, 4), columns=["A", "B", "C", "D"]) + df = md.DataFrame(raw, chunk_size=3) + + r = df.add_prefix("col_") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.add_prefix("col_")) + + r = df.add_suffix("_col") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.add_suffix("_col")) + + raw = pd.Series(rs.rand(10), name="series") + series = md.Series(raw, chunk_size=3) + + r = series.add_prefix("item_") + pd.testing.assert_series_equal(r.execute().fetch(), raw.add_prefix("item_")) + + r = series.add_suffix("_item") + pd.testing.assert_series_equal(r.execute().fetch(), raw.add_suffix("_item")) + + +@pytest.mark.parametrize("join", ["outer", "left"]) +def test_align_execution(setup, join): + rs = np.random.RandomState(0) + raw_df1 = pd.DataFrame( + rs.rand(10, 10), columns=list("ABCDEFGHIJ"), index=pd.RangeIndex(10) + ) + raw_df2 = pd.DataFrame( + rs.rand(10, 10), + 
columns=list("ACDFGIJKLM"), + index=[2, 3, 6, 7, 8, 9, 10, 13, 15, 17], + ) + raw_s1 = pd.Series(rs.rand(10), index=[2, 3, 6, 7, 8, 9, 10, 13, 15, 17]) + raw_s2 = pd.Series(rs.rand(10), index=pd.RangeIndex(10)) + raw_s3 = raw_s4 = raw_df2.iloc[0, :] + raw_s5 = raw_df1.iloc[0, :] + + df1 = md.DataFrame(raw_df1, chunk_size=5) + df2 = md.DataFrame(raw_df2, chunk_size=4) + s1 = md.Series(raw_s1, chunk_size=4) + s2 = md.Series(raw_s2, chunk_size=4) + s3 = md.Series(raw_s3, chunk_size=4) + s4 = df2.iloc[0, :] + s5 = df1.iloc[0, :] + + # test dataframe vs dataframe + r1, r2 = mars.fetch( + mars.execute(*df1.align(df1, join=join), extra_config={"check_nsplits": False}) + ) + pd.testing.assert_frame_equal(r1, raw_df1) + pd.testing.assert_frame_equal(r2, raw_df1) + + r1, r2 = mars.fetch( + mars.execute(*df1.align(df2, join=join), extra_config={"check_nsplits": False}) + ) + exp1, exp2 = raw_df1.align(raw_df2, join=join) + pd.testing.assert_frame_equal(r1, exp1) + pd.testing.assert_frame_equal(r2, exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(df2, join=join, axis=0), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_df2, join=join, axis=0) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_frame_equal(r2.sort_index(axis=1), exp2) + + r2, r1 = mars.fetch( + mars.execute( + *df2.align(df1, join=join, axis=0, fill_value=0.0), + extra_config={"check_nsplits": False}, + ) + ) + exp2, exp1 = raw_df2.align(raw_df1, join=join, axis=0, fill_value=0.0) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_frame_equal(r2.sort_index(axis=1), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(df2, join=join, axis=1), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_df2, join=join, axis=1) + pd.testing.assert_frame_equal(r1.sort_index(), exp1) + pd.testing.assert_frame_equal(r2.sort_index(), exp2) + + # test dataframe vs series + with pytest.raises(ValueError): + # must specify align axis + df1.align(s1) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s1, join=join, axis=0, method="ffill"), + extra_config={"check_nsplits": False}, + ) + ) + exp1, exp2 = raw_df1.align(raw_s1, join=join, axis=0, method="ffill") + pd.testing.assert_frame_equal(r1.sort_index(), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s1, join=join, axis=0, broadcast_axis=1), + extra_config={"check_nsplits": False}, + ) + ) + exp1, exp2 = raw_df1.align(raw_s1, join=join, axis=0, broadcast_axis=1) + pd.testing.assert_frame_equal(r1.sort_index(), exp1) + pd.testing.assert_frame_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s3, join=join, axis=1), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_s3, join=join, axis=1) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *df1.align(s4, join=join, axis=1), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_df1.align(raw_s4, join=join, axis=1) + pd.testing.assert_frame_equal(r1.sort_index(axis=1), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute( + *s1.align(df1, join=join, axis=0), extra_config={"check_nsplits": False} + ) + ) + exp1, exp2 = raw_s1.align(raw_df1, join=join, axis=0) + pd.testing.assert_series_equal(r1.sort_index(), exp1) + 
pd.testing.assert_frame_equal(r2.sort_index(), exp2) + + # test series vs series + r1, r2 = mars.fetch( + mars.execute(*s1.align(s2, join=join), extra_config={"check_nsplits": False}) + ) + exp1, exp2 = raw_s1.align(raw_s2, join=join) + pd.testing.assert_series_equal(r1.sort_index(), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) + + r1, r2 = mars.fetch( + mars.execute(*s4.align(s5, join=join), extra_config={"check_nsplits": False}) + ) + exp1, exp2 = raw_s4.align(raw_s5, join=join) + pd.testing.assert_series_equal(r1.sort_index(), exp1) + pd.testing.assert_series_equal(r2.sort_index(), exp2) diff --git a/python/xorbits/_mars/dataframe/indexing/utils.py b/python/xorbits/_mars/dataframe/indexing/utils.py new file mode 100644 index 000000000..b8dd957d2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/utils.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + + +def calc_columns_index(column_name, df): + """ + Calculate the chunk index on the axis 1 according to the selected column. + :param column_name: selected column name + :param df: input tiled DataFrame + :return: chunk index on the columns axis + """ + column_nsplits = df.nsplits[1] + # if has duplicate columns, will return multiple values + columns = df.columns_value.to_pandas().to_numpy() + column_locs = (columns == column_name).nonzero()[0] + + return [ + np.searchsorted(np.cumsum(column_nsplits), column_loc + 1) + for column_loc in column_locs + ] + + +def convert_labels_into_positions(pandas_index, labels): + """ + Convert labels into positions + + :param pandas_index: pandas Index + :param labels: labels + :return: positions + """ + result = [] + for label in labels: + loc = pandas_index.get_loc(label) + if isinstance(loc, (int, np.integer)): + result.append(loc) + else: + # slice or boolean array + result.extend(pd.RangeIndex(len(pandas_index))[loc].tolist()) + return np.asarray(result) diff --git a/python/xorbits/_mars/dataframe/indexing/where.py b/python/xorbits/_mars/dataframe/indexing/where.py new file mode 100644 index 000000000..ccf45611c --- /dev/null +++ b/python/xorbits/_mars/dataframe/indexing/where.py @@ -0,0 +1,431 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField +from ...tensor.utils import filter_inputs +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_series, validate_axis + + +class DataFrameWhere(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.WHERE + + _input = AnyField("input") + _cond = AnyField("cond") + _other = AnyField("other") + _axis = Int32Field("axis") + _level = AnyField("level") + _errors = StringField("errors") + _try_cast = BoolField("try_cast") + _replace_true = BoolField("replace_true") + + def __init__( + self, + input=None, + cond=None, + other=None, # pylint: disable=redefined-builtin + axis=None, + level=None, + errors=None, + try_cast=None, + replace_true=None, + **kw + ): + super().__init__( + _input=input, + _cond=cond, + _other=other, + _axis=axis, + _level=level, + _errors=errors, + _try_cast=try_cast, + _replace_true=replace_true, + **kw + ) + + @property + def input(self): + return self._input + + @property + def cond(self): + return self._cond + + @property + def other(self): + return self._other + + @property + def axis(self): + return self._axis + + @property + def level(self): + return self._level + + @property + def errors(self): + return self._errors + + @property + def try_cast(self): + return self._try_cast + + @property + def replace_true(self): + return self._replace_true + + def __call__(self, df_or_series): + def _check_input_index(obj, axis=None): + axis = axis if axis is not None else self.axis + if isinstance(obj, DATAFRAME_TYPE) and ( + df_or_series.columns_value.key != obj.columns_value.key + or df_or_series.index_value.key != obj.index_value.key + ): + raise NotImplementedError("Aligning different indices not supported") + elif ( + isinstance(obj, SERIES_TYPE) + and df_or_series.axes[axis].index_value.key != obj.index_value.key + ): + raise NotImplementedError("Aligning different indices not supported") + + _check_input_index(self.cond, axis=0) + _check_input_index(self.other) + + if isinstance(df_or_series, DATAFRAME_TYPE): + mock_obj = build_df(df_or_series) + else: + mock_obj = build_series(df_or_series) + + if isinstance(self.other, (pd.DataFrame, DATAFRAME_TYPE)): + mock_other = build_df(self.other) + elif isinstance(self.other, (pd.Series, SERIES_TYPE)): + mock_other = build_series(self.other) + else: + mock_other = self.other + + result_df = mock_obj.where( + np.zeros(mock_obj.shape).astype(bool), + other=mock_other, + axis=self.axis, + level=self.level, + errors=self.errors, + try_cast=self.try_cast, + ) + + inputs = filter_inputs([df_or_series, self.cond, self.other]) + if isinstance(df_or_series, DATAFRAME_TYPE): + return self.new_dataframe( + inputs, + shape=df_or_series.shape, + dtypes=result_df.dtypes, + index_value=df_or_series.index_value, + columns_value=df_or_series.columns_value, + ) + else: + return self.new_series( + inputs, + shape=df_or_series.shape, + name=df_or_series.name, + dtype=result_df.dtype, + index_value=df_or_series.index_value, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._cond, ENTITY_TYPE): + self._cond = next(inputs_iter) + if isinstance(self._other, ENTITY_TYPE): + self._other = next(inputs_iter) + + @classmethod + def tile(cls, op: "DataFrameWhere"): + def rechunk_input(inp, 
axis=None): + axis = axis if axis is not None else op.axis + if isinstance(inp, DATAFRAME_TYPE): + inp = yield from recursive_tile(inp.rechunk(op.input.nsplits)) + elif isinstance(inp, SERIES_TYPE): + inp = yield from recursive_tile( + inp.rechunk({0: op.input.nsplits[axis]}) + ) + return inp + + def get_tiled_chunk(obj, index, axis=None): + if isinstance(obj, DATAFRAME_TYPE): + return obj.cix[index[0], index[1]] + elif isinstance(obj, SERIES_TYPE): + axis = axis if axis is not None else op.axis + return obj.cix[index[axis],] + else: + return obj + + # TODO support axis alignment for three objects + cond = yield from rechunk_input(op.cond, axis=0) + other = yield from rechunk_input(op.other) + + chunks = [] + for c in op.input.chunks: + cond_chunk = get_tiled_chunk(cond, c.index, axis=0) + other_chunk = get_tiled_chunk(other, c.index) + + new_op = op.copy().reset_key() + new_op._cond = cond_chunk + new_op._other = other_chunk + + inputs = filter_inputs([c, cond_chunk, other_chunk]) + chunks.append(new_op.new_chunk(inputs, **c.params)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, chunks=chunks, nsplits=op.input.nsplits, **op.input.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameWhere"): + out_obj = op.outputs[0] + + input_data = ctx[op.input.key] + cond = op.cond + if isinstance(cond, ENTITY_TYPE): + cond = ctx[cond.key] + + other = op.other + if isinstance(other, ENTITY_TYPE): + other = ctx[other.key] + + if op.replace_true: + ctx[out_obj.key] = input_data.mask( + cond, + other, + axis=op.axis, + level=op.level, + errors=op.errors, + try_cast=op.try_cast, + ) + else: + ctx[out_obj.key] = input_data.where( + cond, + other, + axis=op.axis, + level=op.level, + errors=op.errors, + try_cast=op.try_cast, + ) + + +_doc_template = """ +Replace values where the condition is {replace_true}. + +Parameters +---------- +cond : bool Series/DataFrame, array-like, or callable + Where `cond` is False, keep the original value. Where + True, replace with corresponding value from `other`. + If `cond` is callable, it is computed on the Series/DataFrame and + should return boolean Series/DataFrame or array. The callable must + not change input Series/DataFrame (though pandas doesn't check it). +other : scalar, Series/DataFrame, or callable + Entries where `cond` is True are replaced with + corresponding value from `other`. + If other is callable, it is computed on the Series/DataFrame and + should return scalar or Series/DataFrame. The callable must not + change input Series/DataFrame (though pandas doesn't check it). +inplace : bool, default False + Whether to perform the operation in place on the data. +axis : int, default None + Alignment axis if needed. +level : int, default None + Alignment level if needed. +errors : str, {{'raise', 'ignore'}}, default 'raise' + Note that currently this parameter won't affect + the results and will always coerce to a suitable dtype. + + - 'raise' : allow exceptions to be raised. + - 'ignore' : suppress exceptions. On error return original object. + +try_cast : bool, default False + Try to cast the result back to the input type (if possible). + +Returns +------- +Same type as caller + +See Also +-------- +:func:`DataFrame.{opposite}` : Return an object of same shape as + self. + +Notes +----- +The mask method is an application of the if-then idiom. For each +element in the calling DataFrame, if ``cond`` is ``False`` the +element is used; otherwise the corresponding element from the DataFrame +``other`` is used. 
+ +The signature for :func:`DataFrame.where` differs from +:func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to +``np.where(m, df1, df2)``. + +For further details and examples see the ``mask`` documentation in +:ref:`indexing `. + +Examples +-------- +>>> import mars.tensor as mt +>>> import mars.dataframe as md +>>> s = md.Series(range(5)) +>>> s.where(s > 0).execute() +0 NaN +1 1.0 +2 2.0 +3 3.0 +4 4.0 +dtype: float64 + +>>> s.mask(s > 0).execute() +0 0.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +>>> s.where(s > 1, 10).execute() +0 10 +1 10 +2 2 +3 3 +4 4 +dtype: int64 + +>>> df = md.DataFrame(mt.arange(10).reshape(-1, 2), columns=['A', 'B']) +>>> df.execute() + A B +0 0 1 +1 2 3 +2 4 5 +3 6 7 +4 8 9 +>>> m = df % 3 == 0 +>>> df.where(m, -df).execute() + A B +0 0 -1 +1 -2 3 +2 -4 -5 +3 6 -7 +4 -8 9 +>>> df.where(m, -df) == mt.where(m, df, -df).execute() + A B +0 True True +1 True True +2 True True +3 True True +4 True True +>>> df.where(m, -df) == df.mask(~m, -df).execute() + A B +0 True True +1 True True +2 True True +3 True True +4 True True +""" + + +def _where( + df_or_series, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + replace_true=False, +): + if df_or_series.ndim == 2 and getattr(other, "ndim", 2) == 1 and axis is None: + raise ValueError("Must specify axis=0 or 1") + + axis = validate_axis(axis or 0, df_or_series) + op = DataFrameWhere( + cond=cond, + other=other, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + replace_true=replace_true, + ) + result = op(df_or_series) + if inplace: + df_or_series.data = result.data + else: + return result + + +def where( + df_or_series, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, +): + return _where( + df_or_series, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + replace_true=False, + ) + + +def mask( + df_or_series, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, +): + return _where( + df_or_series, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + replace_true=True, + ) + + +mask.__doc__ = _doc_template.format(replace_true=True, opposite="where") +where.__doc__ = _doc_template.format(replace_true=False, opposite="mask") diff --git a/python/xorbits/_mars/dataframe/initializer.py b/python/xorbits/_mars/dataframe/initializer.py new file mode 100644 index 000000000..46f4b8e90 --- /dev/null +++ b/python/xorbits/_mars/dataframe/initializer.py @@ -0,0 +1,255 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
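+
+# This module provides the pandas-like ``DataFrame``, ``Series`` and ``Index``
+# initializers. Each constructor inspects its ``data`` argument (a pandas
+# object, a tensor, another mars entity, or a dict/list containing tensors),
+# converts it through the matching ``datasource`` helper, and rebalances into
+# ``num_partitions`` chunks when requested. A minimal usage sketch (the
+# argument values are illustrative, not taken from this change):
+#
+#     import numpy as np
+#     import pandas as pd
+#
+#     df = DataFrame(pd.DataFrame(np.random.rand(10, 3)), chunk_size=4)
+#     s = Series(np.arange(10), num_partitions=2)
+#     idx = Index(pd.RangeIndex(10), chunk_size=5)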
+ +import pandas as pd +from pandas.core.dtypes.common import pandas_dtype + +from ..core import ENTITY_TYPE +from ..serialization.serializables import SerializableMeta +from ..tensor import stack +from ..tensor import tensor as astensor +from ..tensor.array_utils import is_cupy +from ..tensor.core import TENSOR_TYPE +from ..utils import ceildiv, lazy_import +from .core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE +from .core import DataFrame as _Frame +from .core import Index as _Index +from .core import Series as _Series +from .datasource.dataframe import from_pandas as from_pandas_df +from .datasource.from_tensor import ( + dataframe_from_1d_tileables, + dataframe_from_tensor, + series_from_tensor, +) +from .datasource.index import from_pandas as from_pandas_index +from .datasource.index import from_tileable as from_tileable_index +from .datasource.series import from_pandas as from_pandas_series +from .utils import is_cudf, is_index + +cudf = lazy_import("cudf") + + +class InitializerMeta(SerializableMeta): + def __instancecheck__(cls, instance): + return isinstance(instance, (cls.__base__,) + getattr(cls, "_allow_data_type_")) + + +class DataFrame(_Frame, metaclass=InitializerMeta): + def __init__( + self, + data=None, + index=None, + columns=None, + dtype=None, + copy=False, + chunk_size=None, + gpu=None, + sparse=None, + num_partitions=None, + ): + need_repart = False + if isinstance(data, TENSOR_TYPE): + if chunk_size is not None: + data = data.rechunk(chunk_size) + df = dataframe_from_tensor( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(data, SERIES_TYPE): + df = data.to_frame() + need_repart = num_partitions is not None + elif isinstance(data, DATAFRAME_TYPE): + if not hasattr(data, "data"): + # DataFrameData + df = _Frame(data) + else: + df = data + need_repart = num_partitions is not None + elif isinstance(data, dict) and self._can_process_by_1d_tileables(data): + # data is a dict and some value is tensor + df = dataframe_from_1d_tileables( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(data, list) and any(isinstance(v, ENTITY_TYPE) for v in data): + # stack data together + data = stack(data) + df = dataframe_from_tensor( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(index, (INDEX_TYPE, SERIES_TYPE)): + if isinstance(data, dict): + data = {k: astensor(v, chunk_size=chunk_size) for k, v in data.items()} + df = dataframe_from_1d_tileables( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + else: + if data is not None: + data = astensor(data, chunk_size=chunk_size) + df = dataframe_from_tensor( + data, index=index, columns=columns, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + else: + if is_cudf(data) or is_cupy(data): # pragma: no cover + pdf = cudf.DataFrame(data, index=index, columns=columns, dtype=dtype) + if copy: + pdf = pdf.copy() + else: + pdf = pd.DataFrame( + data, index=index, columns=columns, dtype=dtype, copy=copy + ) + if num_partitions is not None: + chunk_size = ceildiv(len(pdf), num_partitions) + df = from_pandas_df(pdf, chunk_size=chunk_size, gpu=gpu, sparse=sparse) + + if need_repart: + df = df.rebalance(num_partitions=num_partitions) + super().__init__(df.data) + + @classmethod + def _can_process_by_1d_tileables(cls, data: dict): + for value in data.values(): + if isinstance(value, 
ENTITY_TYPE): + return True + elif isinstance(value, (list, tuple)) and any( + isinstance(v, ENTITY_TYPE) for v in value + ): + return True + return False + + +class Series(_Series, metaclass=InitializerMeta): + def __init__( + self, + data=None, + index=None, + dtype=None, + name=None, + copy=False, + chunk_size=None, + gpu=None, + sparse=None, + num_partitions=None, + ): + if dtype is not None: + dtype = pandas_dtype(dtype) + need_repart = False + if isinstance(data, (TENSOR_TYPE, INDEX_TYPE)): + if chunk_size is not None: + data = data.rechunk(chunk_size) + name = name or getattr(data, "name", None) + series = series_from_tensor( + data, index=index, name=name, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(index, INDEX_TYPE): + if data is not None: + data = astensor(data, chunk_size=chunk_size) + series = series_from_tensor( + data, index=index, name=name, dtype=dtype, gpu=gpu, sparse=sparse + ) + need_repart = num_partitions is not None + elif isinstance(data, SERIES_TYPE): + if not hasattr(data, "data"): + # SeriesData + series = _Series(data) + else: + series = data + need_repart = num_partitions is not None + else: + if is_cudf(data) or is_cupy(data): # pragma: no cover + pd_series = cudf.Series(data, index=index, dtype=dtype, name=name) + if copy: + pd_series = pd_series.copy() + else: + pd_series = pd.Series( + data, index=index, dtype=dtype, name=name, copy=copy + ) + if num_partitions is not None: + chunk_size = ceildiv(len(pd_series), num_partitions) + series = from_pandas_series( + pd_series, chunk_size=chunk_size, gpu=gpu, sparse=sparse + ) + + if need_repart: + series = series.rebalance(num_partitions=num_partitions) + super().__init__(series.data) + + +class Index(_Index, metaclass=InitializerMeta): + def __new__(cls, data, **_): + # just return cls always until we support other Index's initializers + return object.__new__(cls) + + def __init__( + self, + data=None, + dtype=None, + copy=False, + name=None, + tupleize_cols=True, + chunk_size=None, + gpu=None, + sparse=None, + names=None, + num_partitions=None, + store_data=False, + ): + need_repart = False + if isinstance(data, INDEX_TYPE): + if not hasattr(data, "data"): + # IndexData + index = _Index(data) + else: + index = data + need_repart = num_partitions is not None + else: + if isinstance(data, ENTITY_TYPE): + name = name if name is not None else getattr(data, "name", None) + index = from_tileable_index(data, dtype=dtype, name=name, names=names) + need_repart = num_partitions is not None + else: + if not is_index(data): + name = name if name is not None else getattr(data, "name", None) + xdf = cudf if is_cudf(data) or is_cupy(data) else pd + try: + pd_index = xdf.Index( + data=data, + dtype=dtype, + copy=copy, + name=name, + tupleize_cols=tupleize_cols, + ) + except TypeError: # pragma: no cover + pd_index = xdf.Index( + data=data, dtype=dtype, copy=copy, name=name + ) + else: + pd_index = data + + if num_partitions is not None: + chunk_size = ceildiv(len(pd_index), num_partitions) + index = from_pandas_index( + pd_index, + chunk_size=chunk_size, + gpu=gpu, + sparse=sparse, + store_data=store_data, + ) + + if need_repart: + index = index.rebalance(num_partitions=num_partitions) + super().__init__(index.data) diff --git a/python/xorbits/_mars/dataframe/merge/__init__.py b/python/xorbits/_mars/dataframe/merge/__init__.py new file mode 100644 index 000000000..db87d09ce --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 
XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .concat import DataFrameConcat, concat +from .merge import join, merge, DataFrameMerge, DataFrameMergeAlign +from .append import DataFrameAppend, append # isort: skip + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + + for cls in DATAFRAME_TYPE: + setattr(cls, "join", join) + setattr(cls, "merge", merge) + + for cls in DATAFRAME_TYPE + SERIES_TYPE: + setattr(cls, "append", append) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/merge/append.py b/python/xorbits/_mars/dataframe/merge/append.py new file mode 100644 index 000000000..8b32231e5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/append.py @@ -0,0 +1,222 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...serialization.serializables import BoolField +from ..datasource.dataframe import from_pandas +from ..indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem +from ..operands import ( + DATAFRAME_TYPE, + SERIES_TYPE, + DataFrameOperand, + DataFrameOperandMixin, +) +from ..utils import parse_index, standardize_range_index + + +class DataFrameAppend(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.APPEND + + ignore_index = BoolField("ignore_index") + verify_integrity = BoolField("verify_integrity") + sort = BoolField("sort") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @classmethod + def _tile_dataframe(cls, op: "DataFrameAppend"): + out_df = op.outputs[0] + inputs = op.inputs + first_df, others = inputs[0], inputs[1:] + column_splits = first_df.nsplits[1] + new_others = [] + for item in others: + r = yield from recursive_tile(item.rechunk({1: column_splits})) + new_others.append(r) + others = new_others + out_chunks = [] + nsplits = [[], list(first_df.nsplits[1])] + row_index = 0 + for df in [first_df] + others: + for c in df.chunks: + index = (c.index[0] + row_index, c.index[1]) + iloc_op = DataFrameIlocGetItem(indexes=[slice(None)] * 2) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=c.shape, + index=index, + dtypes=c.dtypes, + index_value=c.index_value, + columns_value=c.columns_value, + ) + ) + nsplits[0] += df.nsplits[0] + row_index += len(df.nsplits[0]) + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + nsplits = tuple(tuple(n) for n in nsplits) + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def _tile_series(cls, op: "DataFrameAppend"): + out_series = op.outputs[0] + inputs = op.inputs + first_series, others = inputs[0], inputs[1:] + out_chunks = [] + nsplits = () + row_index = 0 + for series in [first_series] + others: + for c in series.chunks: + index = (c.index[0] + row_index,) + iloc_op = SeriesIlocGetItem(indexes=(slice(None),)) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=c.shape, + index=index, + index_value=c.index_value, + dtype=c.dtype, + name=c.name, + ) + ) + nsplits += series.nsplits[0] + row_index += len(series.nsplits[0]) + + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + nsplits = (tuple(nsplits),) + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + out_series.shape, + nsplits=nsplits, + chunks=out_chunks, + dtype=out_series.dtype, + index_value=out_series.index_value, + name=out_series.name, + ) + + @classmethod + def tile(cls, op: "DataFrameAppend"): + if op.output_types[0] == OutputType.dataframe: + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + def _call_dataframe(self, df, other): + if isinstance(other, DATAFRAME_TYPE): + shape = (df.shape[0] + other.shape[0], df.shape[1]) + inputs = [df, other] + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index( + df.index_value.to_pandas().append(other.index_value.to_pandas()) + ) + elif isinstance(other, list): + row_length = df.shape[0] + index = df.index_value.to_pandas() + for item in other: + if not isinstance(item, DATAFRAME_TYPE): 
# pragma: no cover + raise ValueError(f"Invalid type {type(item)} to append") + row_length += item.shape[0] + index = index.append(item.index_value.to_pandas()) + shape = (row_length, df.shape[1]) + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index(index) + inputs = [df] + other + else: # pragma: no cover + raise ValueError(f"Invalid type {type(other)} to append") + return self.new_dataframe( + inputs, + shape=shape, + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + + def _call_series(self, df, other): + if isinstance(other, SERIES_TYPE): + shape = (df.shape[0] + other.shape[0],) + inputs = [df, other] + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index( + df.index_value.to_pandas().append(other.index_value.to_pandas()) + ) + elif isinstance(other, list): + row_length = df.shape[0] + index = df.index_value.to_pandas() + for item in other: + if not isinstance(item, SERIES_TYPE): # pragma: no cover + raise ValueError(f"Invalid type {type(item)} to append") + row_length += item.shape[0] + index = index.append(item.index_value.to_pandas()) + shape = (row_length,) + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(shape[0])) + else: + index_value = parse_index(index) + inputs = [df] + other + else: # pragma: no cover + raise ValueError(f"Invalid type {type(other)} to append") + return self.new_series( + inputs, shape=shape, dtype=df.dtype, index_value=index_value, name=df.name + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameAppend"): + first, others = ctx[op.inputs[0].key], [ctx[inp.key] for inp in op.inputs[1:]] + r = first.append(others, verify_integrity=op.verify_integrity, sort=op.sort) + ctx[op.outputs[0].key] = r + + def __call__(self, df, other): + if isinstance(df, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + return self._call_dataframe(df, other) + else: + self.output_types = [OutputType.series] + return self._call_series(df, other) + + +def append(df, other, ignore_index=False, verify_integrity=False, sort=False): + if verify_integrity or sort: # pragma: no cover + raise NotImplementedError("verify_integrity and sort are not supported now") + if isinstance(other, dict): + other = from_pandas(pd.DataFrame(dict((k, [v]) for k, v in other.items()))) + op = DataFrameAppend( + ignore_index=ignore_index, verify_integrity=verify_integrity, sort=sort + ) + return op(df, other) diff --git a/python/xorbits/_mars/dataframe/merge/concat.py b/python/xorbits/_mars/dataframe/merge/concat.py new file mode 100644 index 000000000..b3c21d66b --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/concat.py @@ -0,0 +1,617 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + ListField, + StringField, +) +from ...utils import has_unknown_shape, lazy_import +from ..operands import SERIES_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_empty_df, + build_empty_series, + parse_index, + standardize_range_index, + validate_axis, +) + +cudf = lazy_import("cudf") + + +class DataFrameConcat(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.CONCATENATE + + axis = AnyField("axis", default=None) + join = StringField("join", default=None) + ignore_index = BoolField("ignore_index", default=None) + keys = ListField("keys", default=None) + levels = ListField("levels", default=None) + names = ListField("names", default=None) + verify_integrity = BoolField("verify_integrity", default=None) + sort = BoolField("sort", default=None) + copy_ = BoolField("copy", default=None) + + def __init__(self, copy=None, output_types=None, **kw): + super().__init__(copy_=copy, _output_types=output_types, **kw) + + @property + def level(self): + return self.levels + + @property + def name(self): + return self.names + + @classmethod + def _tile_dataframe(cls, op): + from ..indexing.iloc import DataFrameIlocGetItem + + out_df = op.outputs[0] + inputs = op.inputs + axis = op.axis + + if not all( + inputs[i].nsplits[1 - axis] == inputs[i + 1].nsplits[1 - axis] + for i in range(len(inputs) - 1) + ): + # need rechunk + if has_unknown_shape(*inputs): + yield + normalized_nsplits = {1 - axis: inputs[0].nsplits[1 - axis]} + new_inputs = [] + for inp in inputs: + new_inputs.append( + (yield from recursive_tile(inp.rechunk(normalized_nsplits))) + ) + inputs = new_inputs + + out_chunks = [] + nsplits = [] + cum_index = 0 + for df in inputs: + for c in df.chunks: + if op.axis == 0: + index = (c.index[0] + cum_index, c.index[1]) + else: + index = (c.index[0], c.index[1] + cum_index) + + iloc_op = DataFrameIlocGetItem(indexes=[slice(None)] * 2) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=c.shape, + index=index, + dtypes=c.dtypes, + index_value=c.index_value, + columns_value=c.columns_value, + ) + ) + nsplits.extend(df.nsplits[op.axis]) + cum_index += len(df.nsplits[op.axis]) + out_nsplits = ( + (tuple(nsplits), inputs[0].nsplits[1]) + if op.axis == 0 + else (inputs[0].nsplits[0], tuple(nsplits)) + ) + + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=out_nsplits, + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def _tile_series(cls, op: "DataFrameConcat"): + from ..datasource.from_tensor import DataFrameFromTensor + from ..indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + + out = op.outputs[0] + inputs = op.inputs + out_chunks = [] + + if op.axis == 1: + if has_unknown_shape(*inputs): + yield + new_inputs = [] + for inp in inputs: + new_inputs.append( + (yield from recursive_tile(inp.rechunk(op.inputs[0].nsplits))) + ) + inputs = new_inputs + + cum_index = 0 + offset = 0 + nsplits = [] + for series in inputs: + for c in series.chunks: + if op.axis == 0: + index = (c.index[0] + cum_index,) + shape = c.shape + iloc_op = SeriesIlocGetItem(indexes=(slice(None),)) + out_chunks.append( + iloc_op.new_chunk( + [c], + shape=shape, + index=index, + 
index_value=c.index_value, + dtype=c.dtype, + name=c.name, + ) + ) + else: + index = (c.index[0], cum_index) + shape = (c.shape[0], 1) + to_frame_op = DataFrameFromTensor( + input=c, + index=None, + columns=None, + ) + if c.name: + dtypes = pd.Series([c.dtype], index=[c.name]) + else: + dtypes = pd.Series( + [c.dtype], index=pd.RangeIndex(offset, offset + 1) + ) + df_chunk = to_frame_op.new_chunk( + [c], + shape=shape, + index=index, + index_value=c.index_value, + columns_value=parse_index(dtypes.index, store_data=True), + dtypes=dtypes, + ) + iloc_op = DataFrameIlocGetItem(indexes=[slice(None)] * 2) + out_chunks.append( + iloc_op.new_chunk( + [df_chunk], + shape=df_chunk.shape, + index=index, + dtypes=df_chunk.dtypes, + index_value=df_chunk.index_value, + columns_value=df_chunk.columns_value, + ) + ) + + if op.axis == 0: + nsplits.extend(series.nsplits[0]) + cum_index += len(series.nsplits[op.axis]) + else: + nsplits.append(1) + cum_index += 1 + offset += 1 + + if op.ignore_index: + yield out_chunks + out_chunks = standardize_range_index(out_chunks) + + new_op = op.copy() + if op.axis == 0: + nsplits = (tuple(nsplits),) + return new_op.new_seriess( + op.inputs, + out.shape, + nsplits=nsplits, + chunks=out_chunks, + dtype=out.dtype, + index_value=out.index_value, + name=out.name, + ) + else: + nsplits = (inputs[0].nsplits[0], tuple(nsplits)) + return new_op.new_dataframes( + op.inputs, + out.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=out.dtypes, + index_value=out.index_value, + columns_value=out.columns_value, + ) + + @classmethod + def tile(cls, op: "DataFrameConcat"): + if isinstance(op.inputs[0], SERIES_TYPE): + return (yield from cls._tile_series(op)) + else: + return (yield from cls._tile_dataframe(op)) + + @classmethod + def execute(cls, ctx, op: "DataFrameConcat"): + def _base_concat(chunk, inputs): + # auto generated concat when executing a DataFrame, Series or Index + if chunk.op.output_types[0] == OutputType.dataframe: + return _auto_concat_dataframe_chunks(chunk, inputs) + elif chunk.op.output_types[0] == OutputType.series: + return _auto_concat_series_chunks(chunk, inputs) + elif chunk.op.output_types[0] == OutputType.index: + return _auto_concat_index_chunks(chunk, inputs) + elif chunk.op.output_types[0] == OutputType.categorical: + return _auto_concat_categorical_chunks(chunk, inputs) + else: # pragma: no cover + raise TypeError( + "Only DataFrameChunk, SeriesChunk, IndexChunk, " + "and CategoricalChunk can be automatically concatenated" + ) + + def _auto_concat_dataframe_chunks(chunk, inputs): + xdf = ( + pd + if isinstance(inputs[0], (pd.DataFrame, pd.Series)) or cudf is None + else cudf + ) + + if chunk.op.axis is not None: + return xdf.concat(inputs, axis=op.axis) + + # auto generated concat when executing a DataFrame + if len(inputs) == 1: + ret = inputs[0] + else: + n_rows = len(set(inp.index[0] for inp in chunk.inputs)) + n_cols = int(len(inputs) // n_rows) + assert n_rows * n_cols == len(inputs) + + concats = [] + for i in range(n_rows): + if n_cols == 1: + concats.append(inputs[i]) + else: + concat = xdf.concat( + [inputs[i * n_cols + j] for j in range(n_cols)], axis=1 + ) + concats.append(concat) + + if xdf is pd: + # The `sort=False` is to suppress a `FutureWarning` of pandas, + # when the index or column of chunks to concatenate is not aligned, + # which may happens for certain ops. + # + # See also Note [Columns of Left Join] in test_merge_execution.py. 
+ ret = xdf.concat(concats, sort=False) + else: + ret = xdf.concat(concats) + # cuDF will lost index name when concat two seriess. + ret.index.name = concats[0].index.name + + return ret + + def _auto_concat_series_chunks(chunk, inputs): + # auto generated concat when executing a Series + if len(inputs) == 1: + concat = inputs[0] + else: + xdf = pd if isinstance(inputs[0], pd.Series) or cudf is None else cudf + if chunk.op.axis is not None: + concat = xdf.concat(inputs, axis=chunk.op.axis) + else: + concat = xdf.concat(inputs) + return concat + + def _auto_concat_index_chunks(chunk, inputs): + if len(inputs) == 1: + xdf = pd if isinstance(inputs[0], pd.Index) or cudf is None else cudf + concat_df = xdf.DataFrame(index=inputs[0]) + else: + xdf = pd if isinstance(inputs[0], pd.Index) or cudf is None else cudf + empty_dfs = [xdf.DataFrame(index=inp) for inp in inputs] + concat_df = xdf.concat(empty_dfs, axis=0) + return concat_df.index + + def _auto_concat_categorical_chunks(_, inputs): + if len(inputs) == 1: # pragma: no cover + return inputs[0] + else: + # convert categorical into array + arrays = [np.asarray(inp) for inp in inputs] + array = np.concatenate(arrays) + return pd.Categorical( + array, categories=inputs[0].categories, ordered=inputs[0].ordered + ) + + chunk = op.outputs[0] + inputs = [ctx[input.key] for input in op.inputs] + + if isinstance(inputs[0], tuple): + ctx[chunk.key] = tuple( + _base_concat(chunk, [input[i] for input in inputs]) + for i in range(len(inputs[0])) + ) + else: + ctx[chunk.key] = _base_concat(chunk, inputs) + + @classmethod + def _concat_index(cls, prev_index: pd.Index, cur_index: pd.Index): + if isinstance(prev_index, pd.RangeIndex) and isinstance( + cur_index, pd.RangeIndex + ): + # handle RangeIndex that append may generate huge amount of data + # e.g. 
pd.RangeIndex(10_000) and pd.RangeIndex(10_000) + # will generate a Int64Index full of data + # for details see GH#1647 + prev_stop = prev_index.start + prev_index.size * prev_index.step + cur_start = cur_index.start + if prev_stop == cur_start and prev_index.step == cur_index.step: + # continuous RangeIndex, still return RangeIndex + return prev_index.append(cur_index) + else: + # otherwise, return an empty index + return pd.Index([], dtype=prev_index.dtype) + elif isinstance(prev_index, pd.RangeIndex): + return pd.Index([], prev_index.dtype).append(cur_index) + elif isinstance(cur_index, pd.RangeIndex): + return prev_index.append(pd.Index([], cur_index.dtype)) + return prev_index.append(cur_index) + + def _call_series(self, objs): + if self.axis == 0: + row_length = 0 + index = None + for series in objs: + if index is None: + index = series.index_value.to_pandas() + else: + index = self._concat_index(index, series.index_value.to_pandas()) + row_length += series.shape[0] + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(row_length)) + else: + index_value = parse_index(index, objs) + return self.new_series( + objs, + shape=(row_length,), + dtype=objs[0].dtype, + index_value=index_value, + name=objs[0].name, + ) + else: + col_length = 0 + columns = [] + dtypes = dict() + undefined_name = 0 + for series in objs: + if series.name is None: + dtypes[undefined_name] = series.dtype + undefined_name += 1 + columns.append(undefined_name) + else: + dtypes[series.name] = series.dtype + columns.append(series.name) + col_length += 1 + if self.ignore_index or undefined_name == len(objs): + columns_value = parse_index(pd.RangeIndex(col_length)) + else: + columns_value = parse_index(pd.Index(columns), store_data=True) + + shape = (objs[0].shape[0], col_length) + return self.new_dataframe( + objs, + shape=shape, + dtypes=pd.Series(dtypes), + index_value=objs[0].index_value, + columns_value=columns_value, + ) + + def _call_dataframes(self, objs): + if self.axis == 0: + row_length = 0 + index = None + empty_dfs = [] + for df in objs: + if index is None: + index = df.index_value.to_pandas() + else: + index = self._concat_index(index, df.index_value.to_pandas()) + row_length += df.shape[0] + if df.ndim == 2: + empty_dfs.append(build_empty_df(df.dtypes)) + else: + empty_dfs.append(build_empty_series(df.dtype, name=df.name)) + + emtpy_result = pd.concat(empty_dfs, join=self.join, sort=self.sort) + shape = (row_length, emtpy_result.shape[1]) + columns_value = parse_index(emtpy_result.columns, store_data=True) + + if self.join == "inner": + objs = [o[list(emtpy_result.columns)] for o in objs] + + if self.ignore_index: # pragma: no cover + index_value = parse_index(pd.RangeIndex(row_length)) + else: + index_value = parse_index(index, objs) + + new_objs = [] + for obj in objs: + if obj.ndim != 2: + # series + new_obj = obj.to_frame().reindex(columns=emtpy_result.dtypes.index) + else: + # dataframe + if list(obj.dtypes.index) != list(emtpy_result.dtypes.index): + new_obj = obj.reindex(columns=emtpy_result.dtypes.index) + else: + new_obj = obj + new_objs.append(new_obj) + + return self.new_dataframe( + new_objs, + shape=shape, + dtypes=emtpy_result.dtypes, + index_value=index_value, + columns_value=columns_value, + ) + else: + col_length = 0 + empty_dfs = [] + for df in objs: + if df.ndim == 2: + # DataFrame + col_length += df.shape[1] + empty_dfs.append(build_empty_df(df.dtypes)) + else: + # Series + col_length += 1 + empty_dfs.append(build_empty_series(df.dtype, name=df.name)) + + 
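+            # Note: the empty frames/series built just above carry only column
+            # names and dtypes; concatenating them is a cheap way to infer the
+            # columns and dtypes of the merged result without moving any data.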
emtpy_result = pd.concat(empty_dfs, join=self.join, axis=1, sort=True) + if self.ignore_index: + columns_value = parse_index(pd.RangeIndex(col_length)) + else: + columns_value = parse_index( + pd.Index(emtpy_result.columns), store_data=True + ) + + if self.ignore_index or len({o.index_value.key for o in objs}) == 1: + new_objs = [obj if obj.ndim == 2 else obj.to_frame() for obj in objs] + else: # pragma: no cover + raise NotImplementedError( + "Does not support concat dataframes which has different index" + ) + + shape = (objs[0].shape[0], col_length) + return self.new_dataframe( + new_objs, + shape=shape, + dtypes=emtpy_result.dtypes, + index_value=objs[0].index_value, + columns_value=columns_value, + ) + + def __call__(self, objs): + if all(isinstance(obj, SERIES_TYPE) for obj in objs): + self.output_types = [OutputType.series] + return self._call_series(objs) + else: + self.output_types = [OutputType.dataframe] + return self._call_dataframes(objs) + + +class GroupByConcat(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.GROUPBY_CONCAT + + _groups = ListField("groups", FieldTypes.key) + _groupby_params = AnyField("groupby_params") + + def __init__(self, groups=None, groupby_params=None, output_types=None, **kw): + super().__init__( + _groups=groups, + _groupby_params=groupby_params, + _output_types=output_types, + **kw + ) + + @property + def groups(self): + return self._groups + + @property + def groupby_params(self): + return self._groupby_params + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + new_groups = [] + for _ in self._groups: + new_groups.append(next(inputs_iter)) + self._groups = new_groups + + if isinstance(self._groupby_params["by"], list): + by = [] + for v in self._groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(next(inputs_iter)) + else: + by.append(v) + self._groupby_params["by"] = by + + @classmethod + def execute(cls, ctx, op): + input_data = [ctx[input.key] for input in op.groups] + obj = pd.concat([d.obj for d in input_data]) + + params = op.groupby_params.copy() + if isinstance(params["by"], list): + by = [] + for v in params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(ctx[v.key]) + else: + by.append(v) + params["by"] = by + selection = params.pop("selection", None) + + result = obj.groupby(**params) + if selection: + result = result[selection] + + ctx[op.outputs[0].key] = result + + +def concat( + objs, + axis=0, + join="outer", + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + sort=False, + copy=True, +): + if not isinstance(objs, (list, tuple)): # pragma: no cover + raise TypeError( + "first argument must be an iterable of dataframe or series objects" + ) + axis = validate_axis(axis) + if isinstance(objs, dict): # pragma: no cover + keys = objs.keys() + objs = objs.values() + if axis == 1 and join == "inner": # pragma: no cover + raise NotImplementedError("inner join is not support when specify `axis=1`") + if verify_integrity or sort or keys: # pragma: no cover + raise NotImplementedError( + "verify_integrity, sort, keys arguments are not supported now" + ) + op = DataFrameConcat( + axis=axis, + join=join, + ignore_index=ignore_index, + keys=keys, + levels=levels, + names=names, + verify_integrity=verify_integrity, + sort=sort, + copy=copy, + ) + + return op(objs) diff --git a/python/xorbits/_mars/dataframe/merge/merge.py b/python/xorbits/_mars/dataframe/merge/merge.py new file mode 100644 index 000000000..fc86bf353 --- 
/dev/null +++ b/python/xorbits/_mars/dataframe/merge/merge.py @@ -0,0 +1,1342 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +from collections import namedtuple +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, TileStatus, recursive_tile +from ...core.context import get_context +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + KeyField, + NamedTupleField, + StringField, + TupleField, +) +from ...typing import TileableType +from ...utils import has_unknown_shape, lazy_import +from ..base.bloom_filter import filter_by_bloom_filter +from ..core import DataFrame, DataFrameChunk, Series +from ..operands import DataFrameOperand, DataFrameOperandMixin, DataFrameShuffleProxy +from ..utils import ( + auto_merge_chunks, + build_concatenated_rows_frame, + build_df, + hash_dataframe_on, + infer_index_value, + is_cudf, + parse_index, +) + +logger = logging.getLogger(__name__) +DEFAULT_BLOOM_FILTER_CHUNK_THRESHOLD = 10 +# use bloom filter to filter large DataFrame +BLOOM_FILTER_OPTIONS = [ + "max_elements", + "error_rate", + "apply_chunk_size_threshold", + "filter", + "combine_size", +] +BLOOM_FILTER_ON_OPTIONS = ["large", "small", "both"] +DEFAULT_BLOOM_FILTER_ON = "large" + +cudf = lazy_import("cudf") + + +class DataFrameMergeAlign(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_SHUFFLE_MERGE_ALIGN + + index_shuffle_size = Int32Field("index_shuffle_size") + shuffle_on = AnyField("shuffle_on") + + input = KeyField("input") + # for mapper + mapper_id = Int32Field("mapper_id", default=0) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + if output_types is None: + if self.stage == OperandStage.map: + output_types = [OutputType.dataframe] + elif self.stage == OperandStage.reduce: + output_types = [OutputType.dataframe] * 2 + self._output_types = output_types + + @property + def output_limit(self) -> int: + return len(self.output_types) + + @classmethod + def execute_map(cls, ctx, op): + chunk = op.outputs[0] + df = ctx[op.inputs[0].key] + shuffle_on = op.shuffle_on + + if shuffle_on is not None: + # shuffle on field may be resident in index + to_reset_index_names = [] + if not isinstance(shuffle_on, (list, tuple)): + if shuffle_on not in df.dtypes: + to_reset_index_names.append(shuffle_on) + else: + for son in shuffle_on: + if son not in df.dtypes: + to_reset_index_names.append(shuffle_on) + if len(to_reset_index_names) > 0: + df = df.reset_index(to_reset_index_names) + + filters = hash_dataframe_on(df, shuffle_on, op.index_shuffle_size) + + # shuffle on index + for index_idx, index_filter in enumerate(filters): + reducer_index = (index_idx, chunk.index[1]) + if index_filter is not None and 
index_filter is not list(): + ctx[chunk.key, reducer_index] = ( + op.mapper_id, + ctx.get_current_chunk().index, + df.iloc[index_filter], + ) + else: + ctx[chunk.key, reducer_index] = ( + op.mapper_id, + ctx.get_current_chunk().index, + None, + ) + + @classmethod + def execute_reduce(cls, ctx, op: "DataFrameMergeAlign"): + for i, chunk in enumerate(op.outputs): + input_idx_to_df = { + partition_index: data + for mapper_id, partition_index, data in op.iter_mapper_data( + ctx, skip_none=True + ) + if mapper_id == i + } + row_idxes = sorted({idx[0] for idx in input_idx_to_df}) + res = [] + for row_idx in row_idxes: + row_df = input_idx_to_df.get((row_idx, 0), None) + if row_df is not None: + res.append(row_df) + xdf = cudf if is_cudf(res[0]) else pd + ctx[chunk.key] = xdf.concat(res, axis=0) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + else: + cls.execute_reduce(ctx, op) + + +MergeSplitInfo = namedtuple("MergeSplitInfo", "split_side, split_index, nsplits") + + +class MergeMethod(Enum): + one_chunk = 0 + broadcast = 1 + shuffle = 2 + + +class DataFrameMerge(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.DATAFRAME_MERGE + + how = StringField("how") + on = AnyField("on") + left_on = AnyField("left_on") + right_on = AnyField("right_on") + left_index = BoolField("left_index") + right_index = BoolField("right_index") + sort = BoolField("sort") + suffixes = TupleField("suffixes") + copy_ = BoolField("copy_") + indicator = BoolField("indicator") + validate = AnyField("validate") + method = StringField("method") + auto_merge = StringField("auto_merge") + auto_merge_threshold = Int32Field("auto_merge_threshold") + bloom_filter = AnyField("bloom_filter") + bloom_filter_options = DictField("bloom_filter_options") + + # only for broadcast merge + split_info = NamedTupleField("split_info") + + def __init__(self, copy=None, **kwargs): + super().__init__(copy_=copy, **kwargs) + + def __call__(self, left, right): + empty_left, empty_right = build_df(left), build_df(right) + + # validate arguments. + merged = empty_left.merge( + empty_right, + how=self.how, + on=self.on, + left_on=self.left_on, + right_on=self.right_on, + left_index=self.left_index, + right_index=self.right_index, + sort=self.sort, + suffixes=self.suffixes, + copy=self.copy_, + indicator=self.indicator, + validate=self.validate, + ) + + # update default values. + if self.on is None and self.left_on is None and self.right_on is None: + if not self.left_index or not self.right_index: + # use the common columns + left_cols = empty_left.columns + right_cols = empty_right.columns + common_cols = left_cols.intersection(right_cols) + self.left_on = self.right_on = list(common_cols) + + # the `index_value` doesn't matter. 
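+        # These objects only take part in tokenizing the output index key, so
+        # that merges with the same inputs and the same join parameters map to
+        # the same deterministic key.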
+ index_tokenize_objects = [ + left, + right, + self.how, + self.left_on, + self.right_on, + self.left_index, + self.right_index, + ] + return self.new_dataframe( + [left, right], + shape=(np.nan, merged.shape[1]), + dtypes=merged.dtypes, + index_value=parse_index(merged.index, *index_tokenize_objects), + columns_value=parse_index(merged.columns, store_data=True), + ) + + @classmethod + def _gen_map_chunk( + cls, + chunk: DataFrameChunk, + shuffle_on: Union[List, str], + out_size: int, + mapper_id: int = 0, + ): + map_op = DataFrameMergeAlign( + stage=OperandStage.map, + shuffle_on=shuffle_on, + sparse=chunk.issparse(), + mapper_id=mapper_id, + index_shuffle_size=out_size, + ) + return map_op.new_chunk( + [chunk], + shape=(np.nan, np.nan), + dtypes=chunk.dtypes, + index=chunk.index, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + ) + + @classmethod + def _gen_shuffle_chunks( + cls, + out_shape: Tuple, + shuffle_on: Union[List, str], + df: Union[DataFrame, Series], + ): + # gen map chunks + map_chunks = [ + cls._gen_map_chunk(chunk, shuffle_on, out_shape[0]) for chunk in df.chunks + ] + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk( + map_chunks, + shape=(), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + # gen reduce chunks + reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in out_shape))) + for out_idx in out_indices: + reduce_op = DataFrameMergeAlign( + stage=OperandStage.reduce, + n_reducers=len(out_indices), + sparse=proxy_chunk.issparse(), + output_types=[OutputType.dataframe], + ) + reduce_chunks.append( + reduce_op.new_chunk( + [proxy_chunk], + shape=(np.nan, np.nan), + dtypes=proxy_chunk.dtypes, + index=out_idx, + index_value=proxy_chunk.index_value, + columns_value=proxy_chunk.columns_value, + ) + ) + return reduce_chunks + + @classmethod + def _gen_both_shuffle_chunks( + cls, + out_shape: Tuple, + left_shuffle_on: Union[List, str], + right_shuffle_on: Union[List, str], + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + # gen map chunks + # for left dataframe, use 0 as mapper_id + left_map_chunks = [ + cls._gen_map_chunk(chunk, left_shuffle_on, out_shape[0], mapper_id=0) + for chunk in left.chunks + ] + # for right dataframe, use 1 as mapper_id + right_map_chunks = [ + cls._gen_map_chunk(chunk, right_shuffle_on, out_shape[0], mapper_id=1) + for chunk in right.chunks + ] + map_chunks = left_map_chunks + right_map_chunks + + proxy_chunk = DataFrameShuffleProxy( + output_types=[OutputType.dataframe] + ).new_chunk( + map_chunks, + shape=(), + dtypes=left.dtypes, + index_value=left.index_value, + columns_value=left.columns_value, + ) + + # gen reduce chunks + left_reduce_chunks = [] + right_reduce_chunks = [] + out_indices = list(itertools.product(*(range(s) for s in out_shape))) + for out_idx in out_indices: + reduce_op = DataFrameMergeAlign( + stage=OperandStage.reduce, + sparse=proxy_chunk.issparse(), + n_reducers=len(out_indices), + ) + left_param = { + "shape": (np.nan, np.nan), + "dtypes": left.dtypes, + "index": out_idx, + "index_value": left.index_value, + "columns_value": left.columns_value, + } + right_param = { + "shape": (np.nan, np.nan), + "dtypes": right.dtypes, + "index": out_idx, + "index_value": right.index_value, + "columns_value": right.columns_value, + } + params = [left_param, right_param] + left_reduce, right_reduce = reduce_op.new_chunks([proxy_chunk], kws=params) + 
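+            # Each reducer index yields one aligned left chunk and one aligned
+            # right chunk; the two lists are zipped pairwise later so that
+            # matching partitions are merged chunk by chunk.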
left_reduce_chunks.append(left_reduce) + right_reduce_chunks.append(right_reduce) + return left_reduce_chunks, right_reduce_chunks + + @classmethod + def _apply_bloom_filter( + cls, + left: TileableType, + right: TileableType, + left_on: Union[List, str], + right_on: Union[List, str], + op: "DataFrameMerge", + ): + bloom_filter_params = dict() + bloom_filter_options = op.bloom_filter_options or dict() + for option in ["max_elements", "error_rate", "combine_size"]: + if option in bloom_filter_options: + bloom_filter_params[option] = bloom_filter_options[option] + if "max_elements" not in bloom_filter_params: + bloom_filter_params["max_elements"] = max( + c.shape[0] for c in left.chunks + right.chunks + ) + filter_on = bloom_filter_options.get("filter", DEFAULT_BLOOM_FILTER_ON) + if filter_on == "large": + if len(left.chunks) > len(right.chunks): + left = filter_by_bloom_filter( + left, right, left_on, right_on, **bloom_filter_params + ) + else: + right = filter_by_bloom_filter( + right, left, right_on, left_on, **bloom_filter_params + ) + elif filter_on == "small": + if len(left.chunks) < len(right.chunks): + left = filter_by_bloom_filter( + left, right, left_on, right_on, **bloom_filter_params + ) + else: + right = filter_by_bloom_filter( + right, left, right_on, left_on, **bloom_filter_params + ) + else: + assert filter_on == "both" + # both + left = filter_by_bloom_filter( + left, right, left_on, right_on, **bloom_filter_params + ) + right = filter_by_bloom_filter( + right, left, right_on, left_on, **bloom_filter_params + ) + return left, right + + @classmethod + def _tile_one_chunk( + cls, + op: "DataFrameMerge", + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + df = op.outputs[0] + if len(left.chunks) == 1 and len(right.chunks) == 1: + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [left.chunks[0], right.chunks[0]], + shape=df.shape, + index=left.chunks[0].index, + index_value=df.index_value, + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks = [out_chunk] + nsplits = ((np.nan,), (df.shape[1],)) + elif len(left.chunks) == 1: + out_chunks = [] + left_chunk = left.chunks[0] + left_chunk.is_broadcaster = True + for c in right.chunks: + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [left_chunk, c], + shape=(np.nan, df.shape[1]), + index=c.index, + index_value=infer_index_value( + left_chunk.index_value, c.index_value + ), + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + nsplits = ((np.nan,) * len(right.chunks), (df.shape[1],)) + else: + out_chunks = [] + right_chunk = right.chunks[0] + # set `is_broadcaster` as True + right_chunk.is_broadcaster = True + for c in left.chunks: + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [c, right_chunk], + shape=(np.nan, df.shape[1]), + index=c.index, + index_value=infer_index_value( + right_chunk.index_value, c.index_value + ), + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + nsplits = ((np.nan,) * len(left.chunks), (df.shape[1],)) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=nsplits, + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_shuffle( + cls, + op: "DataFrameMerge", + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + df = op.outputs[0] + left_row_chunk_size = left.chunk_shape[0] + 
right_row_chunk_size = right.chunk_shape[0] + out_row_chunk_size = max(left_row_chunk_size, right_row_chunk_size) + + out_chunk_shape = (out_row_chunk_size, 1) + nsplits = [[np.nan for _ in range(out_row_chunk_size)], [df.shape[1]]] + + left_on = _prepare_shuffle_on(op.left_index, op.left_on, op.on) + right_on = _prepare_shuffle_on(op.right_index, op.right_on, op.on) + + # do shuffle + left_chunks, right_chunks = cls._gen_both_shuffle_chunks( + out_chunk_shape, left_on, right_on, left, right + ) + + out_chunks = [] + for left_chunk, right_chunk in zip(left_chunks, right_chunks): + merge_op = op.copy().reset_key() + out_chunk = merge_op.new_chunk( + [left_chunk, right_chunk], + shape=(np.nan, df.shape[1]), + index=left_chunk.index, + index_value=infer_index_value( + left_chunk.index_value, right_chunk.index_value + ), + dtypes=df.dtypes, + columns_value=df.columns_value, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_broadcast( + cls, + op: "DataFrameMerge", + left: Union[DataFrame, Series], + right: Union[DataFrame, Series], + ): + from .concat import DataFrameConcat + + out_df = op.outputs[0] + out_chunks = [] + if left.chunk_shape[0] < right.chunk_shape[0]: + # broadcast left + if op.how == "inner": + left_chunks = left.chunks + need_split = False + else: + left_on = _prepare_shuffle_on(op.left_index, op.left_on, op.on) + left_chunks = cls._gen_shuffle_chunks(left.chunk_shape, left_on, left) + need_split = True + # set is_broadcast property + for c in left_chunks: + c.is_broadcaster = True + right_chunks = right.chunks + for right_chunk in right_chunks: + merged_chunks = [] + # concat all merged results + for j, left_chunk in enumerate(left_chunks): + merge_op = op.copy().reset_key() + if need_split: + merge_op.split_info = MergeSplitInfo( + "right", j, len(left_chunks) + ) + merged_chunks.append( + merge_op.new_chunk( + [left_chunk, right_chunk], + index=(j, 0), + shape=(np.nan, out_df.shape[1]), + columns_value=out_df.columns_value, + ) + ) + concat_op = DataFrameConcat(output_types=[OutputType.dataframe]) + out_chunks.append( + concat_op.new_chunk( + merged_chunks, + shape=(np.nan, out_df.shape[1]), + dtypes=out_df.dtypes, + index=right_chunk.index, + index_value=infer_index_value( + left_chunks[0].index_value, right_chunk.index_value + ), + columns_value=out_df.columns_value, + ) + ) + nsplits = ((np.nan,) * len(right.chunks), (out_df.shape[1],)) + else: + # broadcast right + if op.how == "inner": + need_split = False + right_chunks = right.chunks + else: + need_split = True + right_on = _prepare_shuffle_on(op.right_index, op.right_on, op.on) + right_chunks = cls._gen_shuffle_chunks( + right.chunk_shape, right_on, right + ) + # set is_broadcast property + for c in right_chunks: + c.is_broadcaster = True + left_chunks = left.chunks + for left_chunk in left_chunks: + merged_chunks = [] + # concat all merged results + for j, right_chunk in enumerate(right_chunks): + merge_op = op.copy().reset_key() + if need_split: + merge_op.split_info = MergeSplitInfo( + "left", j, len(right_chunks) + ) + merged_chunks.append( + merge_op.new_chunk( + [left_chunk, right_chunk], + shape=(np.nan, out_df.shape[1]), + index=(j, 0), + columns_value=out_df.columns_value, + ) + ) + concat_op = DataFrameConcat(output_types=[OutputType.dataframe]) + 
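+                # Concatenate the partial results of merging this left chunk
+                # against every broadcast right chunk into a single output chunk.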
out_chunks.append( + concat_op.new_chunk( + merged_chunks, + shape=(np.nan, out_df.shape[1]), + dtypes=out_df.dtypes, + index=left_chunk.index, + index_value=infer_index_value( + left_chunk.index_value, right_chunks[0].index_value + ), + columns_value=out_df.columns_value, + ) + ) + nsplits = ((np.nan,) * len(left.chunks), (out_df.shape[1],)) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + out_df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + + @classmethod + def _can_merge_with_one_chunk( + cls, left: TileableType, right: TileableType, how: str + ) -> bool: + return (len(left.chunks) == 1 and how in ["right", "inner"]) or ( + len(right.chunks) == 1 and how in ["left", "inner"] + ) + + @classmethod + def _can_merge_with_broadcast( + cls, big_chunk_size: int, small_chunk_size: int, big_side: str, how: str + ) -> bool: + return how in [big_side, "inner"] and np.log2(big_chunk_size) > small_chunk_size + + @classmethod + def _get_auto_merge_options(cls, auto_merge: str) -> Tuple[bool, bool]: + if auto_merge == "both": + return True, True + elif auto_merge == "none": + return False, False + elif auto_merge == "before": + return True, False + else: + assert auto_merge == "after" + return False, True + + @classmethod + def _choose_merge_method( + cls, op: "DataFrameMerge", left: TileableType, right: TileableType + ): + how = op.how + method = op.method + left_row_chunk_size = left.chunk_shape[0] + right_row_chunk_size = right.chunk_shape[0] + if left_row_chunk_size > right_row_chunk_size: + big_side = "left" + big_chunk_size = left_row_chunk_size + small_chunk_size = right_row_chunk_size + else: + big_side = "right" + big_chunk_size = right_row_chunk_size + small_chunk_size = left_row_chunk_size + if method == "auto": + if cls._can_merge_with_one_chunk(left, right, how): + return MergeMethod.one_chunk + elif cls._can_merge_with_broadcast( + big_chunk_size, small_chunk_size, big_side, how + ): + return MergeMethod.broadcast + else: + return MergeMethod.shuffle + elif method == "broadcast": + if cls._can_merge_with_one_chunk(left, right, how): + return MergeMethod.one_chunk + elif how in [big_side, "inner"]: + return MergeMethod.broadcast + else: # pragma: no cover + raise ValueError("Cannot specify merge method `broadcast`") + else: + assert method == "shuffle" + return MergeMethod.shuffle + + @classmethod + def _if_apply_bloom_filter( + cls, + method: MergeMethod, + op: "DataFrameMerge", + left: TileableType, + right: TileableType, + ): + # bloom filter can only work for inner merge + if op.how != "inner" or op.bloom_filter is False: + return False + elif op.bloom_filter is True: + return True + + bloom_filter_options = op.bloom_filter_options or dict() + bloom_filter_chunk_threshold = bloom_filter_options.get( + "apply_chunk_size_threshold", DEFAULT_BLOOM_FILTER_CHUNK_THRESHOLD + ) + + # TODO(hks): disable bloom_filter for now, when it is ready, turn it on them + # bloom_filter == auto + if len(left.chunks + right.chunks) <= bloom_filter_chunk_threshold: + # if size of input chunks <= threshold, skip bloom filter + return False + elif method == MergeMethod.shuffle: + # for shuffle, enable bloom filter by default + return False + + return False + + @classmethod + def tile(cls, op: "DataFrameMerge"): + left = build_concatenated_rows_frame(op.inputs[0]) + right = build_concatenated_rows_frame(op.inputs[1]) + + ctx = get_context() + 
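+        # Rough tiling flow: optionally auto-merge small input chunks first,
+        # optionally pre-filter one or both sides with a bloom filter for inner
+        # joins, pick a merge method (one_chunk / broadcast / shuffle), and
+        # finally auto-merge the output chunks of inner joins when there are
+        # too many of them.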
auto_merge_threshold = op.auto_merge_threshold + auto_merge_before, auto_merge_after = cls._get_auto_merge_options(op.auto_merge) + + if ( + auto_merge_before + and len(left.chunks) + len(right.chunks) > auto_merge_threshold + ): + yield TileStatus([left, right] + left.chunks + right.chunks, progress=0.2) + left_chunk_size = len(left.chunks) + right_chunk_size = len(right.chunks) + left = auto_merge_chunks(ctx, left) + right = auto_merge_chunks(ctx, right) + logger.info( + "Auto merge before %s, left data shape: %s, chunk count: %s -> %s, " + "right data shape: %s, chunk count: %s -> %s.", + op, + left.shape, + left_chunk_size, + len(left.chunks), + right.shape, + right_chunk_size, + len(right.chunks), + ) + else: + logger.info( + "Skip auto merge before %s, left data shape: %s, chunk count: %d, " + "right data shape: %s, chunk count: %d.", + op, + left.shape, + len(left.chunks), + right.shape, + len(right.chunks), + ) + + method = cls._choose_merge_method(op, left, right) + if cls._if_apply_bloom_filter(method, op, left, right): + if has_unknown_shape(left, right): # pragma: no cover + yield TileStatus(left.chunks + right.chunks, progress=0.3) + left_on = _prepare_shuffle_on(op.left_index, op.left_on, op.on) + right_on = _prepare_shuffle_on(op.right_index, op.right_on, op.on) + small_one = right if len(left.chunks) > len(right.chunks) else left + logger.info( + "Apply bloom filter for operand %s, use DataFrame %s to build bloom filter.", + op, + small_one, + ) + left, right = yield from recursive_tile( + *cls._apply_bloom_filter(left, right, left_on, right_on, op) + ) + # auto merge after bloom filter + yield TileStatus([left, right] + left.chunks + right.chunks, progress=0.5) + left = auto_merge_chunks(ctx, left) + right = auto_merge_chunks(ctx, right) + + if op.method == "auto": + # if method is auto, select new method after auto merge + method = cls._choose_merge_method(op, left, right) + logger.info("Choose %s method for merge operand %s.", method, op) + if method == MergeMethod.one_chunk: + ret = cls._tile_one_chunk(op, left, right) + elif method == MergeMethod.broadcast: + ret = cls._tile_broadcast(op, left, right) + else: + assert method == MergeMethod.shuffle + ret = cls._tile_shuffle(op, left, right) + + if ( + op.how == "inner" + and auto_merge_after + and len(ret[0].chunks) > auto_merge_threshold + ): + # if how=="inner", output data size will reduce greatly with high probability, + # use auto_merge_chunks to combine small chunks. 
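+            # Yielding TileStatus here asks the scheduler to execute the merged
+            # chunks first, so auto_merge_chunks below can use their actual
+            # sizes when deciding how to combine them.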
+ yield TileStatus( + ret[0].chunks, progress=0.8 + ) # trigger execution for chunks + merged = auto_merge_chunks(get_context(), ret[0]) + logger.info( + "Auto merge after %s, data shape: %s, chunk count: %s -> %s.", + op, + merged.shape, + len(ret[0].chunks), + len(merged.chunks), + ) + return [merged] + else: + logger.info( + "Skip auto merge after %s, data shape: %s, chunk count: %d.", + op, + ret[0].shape, + len(ret[0].chunks), + ) + return ret + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + left, right = ctx[op.inputs[0].key], ctx[op.inputs[1].key] + + if getattr(op, "split_info", None) is not None: + split_info = op.split_info + if split_info.split_side == "left": + index = hash_dataframe_on(left, on=op.on, size=split_info.nsplits)[ + split_info.split_index + ] + left = left.iloc[index] + else: + index = hash_dataframe_on(right, on=op.on, size=split_info.nsplits)[ + split_info.split_index + ] + right = right.iloc[index] + + def execute_merge(x, y): + if not op.gpu: + kwargs = dict( + copy=op.copy, validate=op.validate, indicator=op.indicator + ) + else: # pragma: no cover + # cudf doesn't support 'validate' and 'copy' + kwargs = dict(indicator=op.indicator) + return x.merge( + y, + how=op.how, + on=op.on, + left_on=op.left_on, + right_on=op.right_on, + left_index=op.left_index, + right_index=op.right_index, + sort=op.sort, + suffixes=op.suffixes, + **kwargs, + ) + + # workaround for: https://github.com/pandas-dev/pandas/issues/27943 + try: + r = execute_merge(left, right) + except ValueError: + r = execute_merge(left.copy(deep=True), right.copy(deep=True)) + + # make sure column's order + if not all( + n1 == n2 for n1, n2 in zip(chunk.columns_value.to_pandas(), r.columns) + ): + r = r[list(chunk.columns_value.to_pandas())] + ctx[chunk.key] = r + + +def _prepare_shuffle_on(use_index, side_on, on): + # consistent with pandas: `left_index` precedes `left_on` and `right_index` precedes `right_on` + if use_index: + # `None` means we will shuffle on df.index. + return None + elif side_on is not None: + return side_on + else: + return on + + +def merge( + df: Union[DataFrame, Series], + right: Union[DataFrame, Series], + how: str = "inner", + on: str = None, + left_on: str = None, + right_on: str = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Tuple[Optional[str], Optional[str]] = ("_x", "_y"), + copy: bool = True, + indicator: bool = False, + validate: str = None, + method: str = "auto", + auto_merge: str = "both", + auto_merge_threshold: int = 8, + bloom_filter: Union[bool, str] = "auto", + bloom_filter_options: Dict[str, Any] = None, +) -> DataFrame: + """ + Merge DataFrame or named Series objects with a database-style join. + + A named Series object is treated as a DataFrame with a single named column. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + Parameters + ---------- + right : DataFrame or named Series + Object to merge with. + how : {'left', 'right', 'outer', 'inner'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. 
+ * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + + on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. + left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. + right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. + left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. + right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. + sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). + suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + copy : bool, default True + If False, avoid copy if possible. + indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + method : {"auto", "shuffle", "broadcast"}, default auto + "broadcast" is recommended when one DataFrame is much smaller than the other, + otherwise, "shuffle" will be a better choice. By default, we choose method + according to actual data size. 
+ auto_merge : {"both", "none", "before", "after"}, default both + Auto merge small chunks before or after merge + + * "both": auto merge small chunks before and after, + * "none": do not merge small chunks + * "before": only merge small chunks before merge + * "after": only merge small chunks after merge + auto_merge_threshold : int, default 8 + When how is "inner", merged result could be much smaller than original DataFrame, + if the number of chunks is greater than the threshold, + it will merge small chunks automatically. + bloom_filter: bool, str, default "auto" + Use bloom filter to optimize merge + bloom_filter_options: dict + * "max_elements": max elements in bloom filter, + default value is the max size of all input chunks + * "error_rate": error raite, default 0.1. + * "apply_chunk_size_threshold": min chunk size of input chunks to apply bloom filter, default 10 + when chunk size of left and right is greater than this threshold, apply bloom filter + * "filter": "large", "small", "both", default "large" + decides to filter on large, small or both DataFrames. + + Returns + ------- + DataFrame + A DataFrame of the two merged objects. + + Examples + -------- + >>> import mars.dataframe as md + >>> df1 = md.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [1, 2, 3, 5]}) + >>> df2 = md.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [5, 6, 7, 8]}) + >>> df1.execute() + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + >>> df2.execute() + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1.merge(df2, left_on='lkey', right_on='rkey').execute() + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 foo 5 foo 5 + 3 foo 5 foo 8 + 4 bar 2 bar 6 + 5 baz 3 baz 7 + + Merge DataFrames df1 and df2 with specified left and right suffixes + appended to any overlapping columns. + + >>> df1.merge(df2, left_on='lkey', right_on='rkey', + ... suffixes=('_left', '_right')).execute() + lkey value_left rkey value_right + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 foo 5 foo 5 + 3 foo 5 foo 8 + 4 bar 2 bar 6 + 5 baz 3 baz 7 + + Merge DataFrames df1 and df2, but raise an exception if the DataFrames have + any overlapping columns. + + >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)).execute() + Traceback (most recent call last): + ... 
+ ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + + >>> df1 = md.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) + >>> df2 = md.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) + >>> df1.execute() + a b + 0 foo 1 + 1 bar 2 + >>> df2.execute() + a c + 0 foo 3 + 1 baz 4 + + >>> df1.merge(df2, how='inner', on='a').execute() + a b c + 0 foo 1 3 + + >>> df1.merge(df2, how='left', on='a').execute() + a b c + 0 foo 1 3.0 + 1 bar 2 NaN + """ + if method is None: + method = "auto" + if method not in [ + "auto", + "shuffle", + "broadcast", + ]: # pragma: no cover + raise NotImplementedError(f"{method} merge is not supported") + if auto_merge not in ["both", "none", "before", "after"]: # pragma: no cover + raise ValueError( + f"auto_merge can only be `both`, `none`, `before` or `after`, got {auto_merge}" + ) + if bloom_filter not in [True, False, "auto"]: + raise ValueError( + f'bloom_filter can only be True, False, or "auto", got {bloom_filter}' + ) + if bloom_filter_options: + if not isinstance(bloom_filter_options, dict): + raise TypeError( + f"bloom_filter_options must be a dict, got {type(bloom_filter_options)}" + ) + for k, v in bloom_filter_options.items(): + if k not in BLOOM_FILTER_OPTIONS: + raise ValueError( + f"Invalid bloom filter option {k}, available: {BLOOM_FILTER_OPTIONS}" + ) + if k == "filter" and v not in BLOOM_FILTER_ON_OPTIONS: + raise ValueError( + f"Invalid filter {k}, available: {BLOOM_FILTER_ON_OPTIONS}" + ) + op = DataFrameMerge( + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + method=method, + auto_merge=auto_merge, + auto_merge_threshold=auto_merge_threshold, + bloom_filter=bloom_filter, + bloom_filter_options=bloom_filter_options, + output_types=[OutputType.dataframe], + ) + return op(df, right) + + +def join( + df: Union[DataFrame, Series], + other: Union[DataFrame, Series], + on: str = None, + how: str = "left", + lsuffix: str = "", + rsuffix: str = "", + sort: bool = False, + method: str = None, + auto_merge: str = "both", + auto_merge_threshold: int = 8, + bloom_filter: Union[bool, Dict] = True, + bloom_filter_options: Dict[str, Any] = None, +) -> DataFrame: + """ + Join columns of another DataFrame. + + Join columns with `other` DataFrame either on index or on a key + column. Efficiently join multiple DataFrame objects by index at once by + passing a list. + + Parameters + ---------- + other : DataFrame, Series, or list of DataFrame + Index should be similar to one of the columns in this one. If a + Series is passed, its name attribute must be set, and that will be + used as the column name in the resulting joined DataFrame. + on : str, list of str, or array-like, optional + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation. + how : {'left', 'right', 'outer', 'inner'}, default 'left' + How to handle the operation of the two objects. + + * left: use calling frame's index (or column if on is specified) + * right: use `other`'s index. + * outer: form union of calling frame's index (or column if on is + specified) with `other`'s index, and sort it. + lexicographically. 
+ * inner: form intersection of calling frame's index (or column if + on is specified) with `other`'s index, preserving the order + of the calling's one. + + lsuffix : str, default '' + Suffix to use from left frame's overlapping columns. + rsuffix : str, default '' + Suffix to use from right frame's overlapping columns. + sort : bool, default False + Order result DataFrame lexicographically by the join key. If False, + the order of the join key depends on the join type (how keyword). + method : {"shuffle", "broadcast"}, default None + "broadcast" is recommended when one DataFrame is much smaller than the other, + otherwise, "shuffle" will be a better choice. By default, we choose method + according to actual data size. + auto_merge : {"both", "none", "before", "after"}, default both + Auto merge small chunks before or after merge + + * "both": auto merge small chunks before and after, + * "none": do not merge small chunks + * "before": only merge small chunks before merge + * "after": only merge small chunks after merge + auto_merge_threshold : int, default 8 + When how is "inner", merged result could be much smaller than original DataFrame, + if the number of chunks is greater than the threshold, + it will merge small chunks automatically. + bloom_filter: bool, str, default "auto" + Use bloom filter to optimize merge + bloom_filter_options: dict + * "max_elements": max elements in bloom filter, + default value is the max size of all input chunks + * "error_rate": error raite, default 0.1. + * "apply_chunk_size_threshold": min chunk size of input chunks to apply bloom filter, default 10 + when chunk size of left and right is greater than this threshold, apply bloom filter + * "filter": "large", "small", "both", default "large" + decides to filter on large, small or both DataFrames. + + Returns + ------- + DataFrame + A dataframe containing columns from both the caller and `other`. + + See Also + -------- + DataFrame.merge : For column(s)-on-column(s) operations. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df.execute() + key A + 0 K0 A0 + 1 K1 A1 + 2 K2 A2 + 3 K3 A3 + 4 K4 A4 + 5 K5 A5 + + >>> other = md.DataFrame({'key': ['K0', 'K1', 'K2'], + ... 'B': ['B0', 'B1', 'B2']}) + + >>> other.execute() + key B + 0 K0 B0 + 1 K1 B1 + 2 K2 B2 + + Join DataFrames using their indexes. + + >>> df.join(other, lsuffix='_caller', rsuffix='_other').execute() + key_caller A key_other B + 0 K0 A0 K0 B0 + 1 K1 A1 K1 B1 + 2 K2 A2 K2 B2 + 3 K3 A3 NaN NaN + 4 K4 A4 NaN NaN + 5 K5 A5 NaN NaN + + If we want to join using the key columns, we need to set key to be + the index in both `df` and `other`. The joined DataFrame will have + key as its index. + + >>> df.set_index('key').join(other.set_index('key')).execute() + A B + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + K3 A3 NaN + K4 A4 NaN + K5 A5 NaN + + Another option to join using the key columns is to use the `on` + parameter. DataFrame.join always uses `other`'s index but we can use + any column in `df`. This method preserves the original DataFrame's + index in the result. + + >>> df.join(other.set_index('key'), on='key').execute() + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K2 A2 B2 + 3 K3 A3 NaN + 4 K4 A4 NaN + 5 K5 A5 NaN + + Using non-unique key values shows how they are matched. + + >>> df = md.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], + ... 
'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df.execute() + key A + 0 K0 A0 + 1 K1 A1 + 2 K1 A2 + 3 K3 A3 + 4 K0 A4 + 5 K1 A5 + + >>> df.join(other.set_index('key'), on='key').execute() + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K1 A2 B1 + 3 K3 A3 NaN + 4 K0 A4 B0 + 5 K1 A5 B1 + """ + return merge( + df, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort, + method=method, + auto_merge=auto_merge, + auto_merge_threshold=auto_merge_threshold, + bloom_filter=bloom_filter, + bloom_filter_options=bloom_filter_options, + ) diff --git a/python/xorbits/_mars/dataframe/merge/tests/__init__.py b/python/xorbits/_mars/dataframe/merge/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/merge/tests/test_merge.py b/python/xorbits/_mars/dataframe/merge/tests/test_merge.py new file mode 100644 index 000000000..cb5a7686b --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/tests/test_merge.py @@ -0,0 +1,345 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...core import IndexValue +from ...datasource.dataframe import from_pandas +from .. 
import DataFrameMerge, DataFrameMergeAlign, concat + + +def test_merge(): + df1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=3) + + parameters = [ + {}, + {"how": "left", "right_on": "x", "left_index": True}, + {"how": "right", "left_on": "a", "right_index": True}, + {"how": "left", "left_on": "a", "right_on": "x"}, + {"how": "right", "left_on": "a", "right_index": True}, + {"how": "right", "on": "a"}, + {"how": "inner", "on": ["a", "b"]}, + ] + + for kw in parameters: + df = mdf1.merge(mdf2, **kw) + df = tile(df) + + assert df.chunk_shape == (2, 1) + for chunk in df.chunks: + assert isinstance(chunk.op, DataFrameMerge) + assert chunk.op.how == kw.get("how", "inner") + left, right = chunk.op.inputs + assert isinstance(left.op, DataFrameMergeAlign) + assert left.op.stage == OperandStage.reduce + assert isinstance(right.op, DataFrameMergeAlign) + assert right.op.stage == OperandStage.reduce + assert len(left.inputs[0].inputs) == 4 + assert len(right.inputs[0].inputs) == 4 + for lchunk in left.inputs[0].inputs[:2]: + assert isinstance(lchunk.op, DataFrameMergeAlign) + assert lchunk.op.stage == OperandStage.map + assert lchunk.op.index_shuffle_size == 2 + if kw.get("on", None) or kw.get("left_on", None): + # defaults to common columns + assert lchunk.op.shuffle_on == kw.get("on", None) or kw.get( + "left_on", None + ) + for rchunk in right.inputs[0].inputs[2:]: + assert isinstance(rchunk.op, DataFrameMergeAlign) + assert rchunk.op.stage == OperandStage.map + assert rchunk.op.index_shuffle_size == 2 + if kw.get("on", None) or kw.get("right_on", None): + # defaults to common columns + assert rchunk.op.shuffle_on == kw.get("on", None) or kw.get( + "right_on", None + ) + pd.testing.assert_index_equal( + chunk.columns_value.to_pandas(), df.columns_value.to_pandas() + ) + + +def test_merge_invalid_parameters(): + pdf1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + pdf2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]) + + df1 = from_pandas(pdf1, chunk_size=2) + df2 = from_pandas(pdf2, chunk_size=3) + + with pytest.raises(ValueError): + df1.merge(df2, bloom_filter="wrong") + + with pytest.raises(TypeError): + df1.merge(df2, bloom_filter_options="wrong") + + with pytest.raises(ValueError): + df1.merge(df2, bloom_filter_options={"wrong": 1}) + + with pytest.raises(ValueError): + df1.merge(df2, bloom_filter_options={"filter": "wrong"}) + + +def test_join(): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], index=["a1", "a2", "a3"]) + df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1 + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + parameters = [ + {"lsuffix": "l_", "rsuffix": "r_"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "left"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "right"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "inner"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "left"}, + ] + + for kw in parameters: + df = mdf1.join(mdf2, auto_merge="none", bloom_filter=False, **kw) + df = tile(df) + + assert df.chunk_shape == (3, 1) + for chunk in df.chunks: + assert isinstance(chunk.op, DataFrameMerge) + assert chunk.op.how == kw.get("how", "left") + left, right = chunk.op.inputs + assert 
isinstance(left.op, DataFrameMergeAlign) + assert left.op.stage == OperandStage.reduce + assert isinstance(right.op, DataFrameMergeAlign) + assert right.op.stage == OperandStage.reduce + assert len(left.inputs[0].inputs) == 5 + assert len(right.inputs[0].inputs) == 5 + for lchunk in left.inputs[0].inputs: + assert isinstance(lchunk.op, DataFrameMergeAlign) + assert lchunk.op.stage == OperandStage.map + assert lchunk.op.index_shuffle_size == 3 + assert lchunk.op.shuffle_on == None + for rchunk in right.inputs[0].inputs: + assert isinstance(rchunk.op, DataFrameMergeAlign) + assert rchunk.op.stage == OperandStage.map + assert rchunk.op.index_shuffle_size == 3 + assert rchunk.op.shuffle_on == None + pd.testing.assert_index_equal( + chunk.columns_value.to_pandas(), df.columns_value.to_pandas() + ) + + +def test_join_on(): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], columns=["a1", "a2", "a3"]) + df2 = ( + pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], columns=["a1", "b2", "b3"]) + 1 + ) + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + parameters = [ + {"lsuffix": "l_", "rsuffix": "r_"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "left", "on": "a1"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "right", "on": "a2"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "inner", "on": "a2"}, + {"lsuffix": "l_", "rsuffix": "r_", "how": "outer", "on": "a2"}, + ] + + for kw in parameters: + df = mdf1.join(mdf2, auto_merge="none", bloom_filter=False, **kw) + df = tile(df) + + assert df.chunk_shape == (3, 1) + for chunk in df.chunks: + assert isinstance(chunk.op, DataFrameMerge) + assert chunk.op.how == kw.get("how", "left") + left, right = chunk.op.inputs + assert isinstance(left.op, DataFrameMergeAlign) + assert left.op.stage == OperandStage.reduce + assert isinstance(right.op, DataFrameMergeAlign) + assert right.op.stage == OperandStage.reduce + assert len(left.inputs[0].inputs) == 5 + assert len(right.inputs[0].inputs) == 5 + for lchunk in left.inputs[0].inputs[:2]: + assert isinstance(lchunk.op, DataFrameMergeAlign) + assert lchunk.op.stage == OperandStage.map + assert lchunk.op.index_shuffle_size == 3 + assert lchunk.op.shuffle_on == kw.get("on", None) + for rchunk in right.inputs[0].inputs[2:]: + assert isinstance(rchunk.op, DataFrameMergeAlign) + assert rchunk.op.stage == OperandStage.map + assert rchunk.op.index_shuffle_size == 3 + assert rchunk.op.shuffle_on == None + pd.testing.assert_index_equal( + chunk.columns_value.to_pandas(), df.columns_value.to_pandas() + ) + + +def test_merge_one_chunk(): + df1 = pd.DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}) + df2 = pd.DataFrame({"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}) + + # all have one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2) + df = mdf1.merge(mdf2, left_on="lkey", right_on="rkey") + tiled, tiled1, tiled2 = tile(df, mdf1, mdf2) + + assert tiled.chunk_shape == (1, 1) + assert tiled.chunks[0].inputs[0].key == tiled1.chunks[0].key + assert tiled.chunks[0].inputs[1].key == tiled2.chunks[0].key + + # left has one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2, chunk_size=2) + df = mdf1.merge(mdf2, left_on="lkey", right_on="rkey") + tiled, tiled1, tiled2 = tile(df, mdf1, mdf2) + + assert tiled.chunk_shape == (2, 1) + assert tiled.chunks[0].inputs[0].key == tiled1.chunks[0].key + assert tiled.chunks[0].inputs[1].key == tiled2.chunks[0].key + assert tiled.chunks[1].inputs[0].key == tiled1.chunks[0].key + 
assert tiled.chunks[1].inputs[1].key == tiled2.chunks[1].key + + # right has one chunk + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2) + df = mdf1.merge(mdf2, left_on="lkey", right_on="rkey") + tiled, tiled1, tiled2 = tile(df, mdf1, mdf2) + + assert tiled.chunk_shape == (2, 1) + assert tiled.chunks[0].inputs[0].key == tiled1.chunks[0].key + assert tiled.chunks[0].inputs[1].key == tiled2.chunks[0].key + assert tiled.chunks[1].inputs[0].key == tiled1.chunks[1].key + assert tiled.chunks[1].inputs[1].key == tiled2.chunks[0].key + + +def test_append(): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + adf = mdf1.append(mdf2) + + assert adf.shape == (20, 4) + assert isinstance(adf.index_value.value, IndexValue.Int64Index) + + tiled = tile(adf) + assert tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1)) + assert tiled.chunk_shape == (8, 2) + for i, c in enumerate(tiled.chunks): + index = (i // 2, i % 2) + assert c.index == index + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + adf = mdf1.append(mdf2, ignore_index=True) + + assert adf.shape == (20, 4) + assert isinstance(adf.index_value.value, IndexValue.RangeIndex) + pd.testing.assert_index_equal(adf.index_value.to_pandas(), pd.RangeIndex(20)) + + +def test_concat(): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=4) + mdf2 = from_pandas(df2, chunk_size=4) + r = concat([mdf1, mdf2], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + + tiled = tile(r) + assert tiled.nsplits == ((4, 4, 2, 4, 4, 2), (4,)) + for i, c in enumerate(tiled.chunks): + assert c.index == (i, 0) + + df3 = pd.DataFrame( + np.random.rand(10, 4), columns=list("ABCD"), index=pd.RangeIndex(10, 20) + ) + + mdf3 = from_pandas(df3, chunk_size=4) + r = concat([mdf1, mdf3], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20)) + + df4 = pd.DataFrame( + np.random.rand(10, 4), + columns=list("ABCD"), + index=np.random.permutation(np.arange(10)), + ) + + mdf4 = from_pandas(df4, chunk_size=4) + r = concat([mdf1, mdf4], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal( + r.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + r = concat([mdf4, mdf1], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal( + r.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + r = concat([mdf4, mdf4], axis="index") + + assert r.shape == (20, 4) + pd.testing.assert_series_equal(r.dtypes, df1.dtypes) + pd.testing.assert_index_equal( + r.index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=4) + r = concat([mdf1, mdf2], axis="columns") + + assert r.shape == (10, 8) + expected_dtypes = pd.concat([df1, df2], axis="columns").dtypes + pd.testing.assert_series_equal(r.dtypes, expected_dtypes) + + tiled = tile(r) + assert tiled.nsplits == ((3, 3, 3, 1), (3, 1, 4)) + for i, c in enumerate(tiled.chunks): + index = (i // 3, i % 3) + assert c.index == index + 
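For reference, the nsplits asserted above follow directly from the chunk_size used when the 10x4 frames are created and then appended or concatenated; a plain-Python sketch of that arithmetic (the `splits` helper below is hypothetical and not part of this change):

    def splits(length, chunk_size):
        # e.g. splits(10, 3) -> (3, 3, 3, 1)
        counts = [chunk_size] * (length // chunk_size)
        if length % chunk_size:
            counts.append(length % chunk_size)
        return tuple(counts)

    # appending two (10, 4) frames chunked by 3 stacks the row splits
    assert splits(10, 3) + splits(10, 3) == (3, 3, 3, 1, 3, 3, 3, 1)
    assert splits(4, 3) == (3, 1)

    # concatenating along columns with chunk_size 3 and 4 stacks the column splits
    assert splits(4, 3) + splits(4, 4) == (3, 1, 4)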
+ + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 3), columns=list("ABC")) + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + r = concat([mdf1, mdf2], join="inner") + + assert r.shape == (20, 3) + tiled = tile(r) + assert tiled.nsplits == ((3, 3, 3, 1, 3, 3, 3, 1), (3,)) diff --git a/python/xorbits/_mars/dataframe/merge/tests/test_merge_execution.py b/python/xorbits/_mars/dataframe/merge/tests/test_merge_execution.py new file mode 100644 index 000000000..ae5f5c12d --- /dev/null +++ b/python/xorbits/_mars/dataframe/merge/tests/test_merge_execution.py @@ -0,0 +1,846 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ....core.graph.builder.utils import build_graph +from ...datasource.dataframe import from_pandas +from ...datasource.series import from_pandas as series_from_pandas +from ...utils import sort_dataframe_inplace +from .. import DataFrameConcat, DataFrameMergeAlign, concat + + +def test_merge(setup): + df1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]) + df3 = df1.copy() + df3.index = pd.RangeIndex(2, 6, name="index") + df4 = df1.copy() + df4.index = pd.MultiIndex.from_tuples( + [(i, i + 1) for i in range(4)], names=["i1", "i2"] + ) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + mdf3 = from_pandas(df3, chunk_size=3) + mdf4 = from_pandas(df4, chunk_size=2) + + # Note [Index of Merge] + # + # When `left_index` and `right_index` of `merge` are both false, pandas will generate a RangeIndex for + # the final result dataframe. + # + # We chunked the `left` and `right` dataframes, thus every result chunk will have its own RangeIndex. + # When they are concatenated we don't generate a new RangeIndex for the result, thus we cannot obtain the + # same index values as pandas. But we guarantee that the content of the dataframe is correct.
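Because of the RangeIndex caveat in Note [Index of Merge], the assertions below compare content rather than index. A minimal pandas-only sketch of that comparison idiom, on made-up data (not part of the diff):

    import pandas as pd

    left = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})
    right = pd.DataFrame({"a": [2, 4, 6], "x": [200, 400, 600]})

    expected = left.merge(right)  # pandas assigns a fresh RangeIndex here
    # stand-in for a chunk-wise result: same rows, different order and index
    shuffled = expected.sample(frac=1, random_state=0)

    # compare content only: sort on the join key and drop the index
    pd.testing.assert_frame_equal(
        expected.sort_values("a").reset_index(drop=True),
        shuffled.sort_values("a").reset_index(drop=True),
    )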
+ + # merge on index + expected0 = df1.merge(df2) + jdf0 = mdf1.merge(mdf2, auto_merge="none") + result0 = jdf0.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected0, 0), sort_dataframe_inplace(result0, 0) + ) + + # merge on left index and `right_on` + expected1 = df1.merge(df2, how="left", right_on="x", left_index=True) + jdf1 = mdf1.merge( + mdf2, how="left", right_on="x", left_index=True, auto_merge="none" + ) + result1 = jdf1.execute().fetch() + expected1.set_index("a_x", inplace=True) + result1.set_index("a_x", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected1, 0), sort_dataframe_inplace(result1, 0) + ) + + # merge on `left_on` and right index + expected2 = df1.merge(df2, how="right", left_on="a", right_index=True) + jdf2 = mdf1.merge( + mdf2, how="right", left_on="a", right_index=True, auto_merge="none" + ) + result2 = jdf2.execute().fetch() + expected2.set_index("a", inplace=True) + result2.set_index("a", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected2, 0), sort_dataframe_inplace(result2, 0) + ) + + # merge on `left_on` and `right_on` + expected3 = df1.merge(df2, how="left", left_on="a", right_on="x") + jdf3 = mdf1.merge(mdf2, how="left", left_on="a", right_on="x", auto_merge="none") + result3 = jdf3.execute().fetch() + expected3.set_index("a_x", inplace=True) + result3.set_index("a_x", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected3, 0), sort_dataframe_inplace(result3, 0) + ) + + # merge on `on` + expected4 = df1.merge(df2, how="right", on="a") + jdf4 = mdf1.merge(mdf2, how="right", on="a", auto_merge="none") + result4 = jdf4.execute().fetch() + expected4.set_index("a", inplace=True) + result4.set_index("a", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0) + ) + + # merge on multiple columns + expected5 = df1.merge(df2, how="inner", on=["a", "b"]) + jdf5 = mdf1.merge(mdf2, how="inner", on=["a", "b"], auto_merge="none") + result5 = jdf5.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected5, 0), sort_dataframe_inplace(result5, 0) + ) + + # merge when some on is index + expected6 = df3.merge(df2, how="inner", left_on="index", right_on="a") + jdf6 = mdf3.merge( + mdf2, how="inner", left_on="index", right_on="a", auto_merge="none" + ) + result6 = jdf6.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected6, 0), sort_dataframe_inplace(result6, 0) + ) + + # merge when on is in MultiIndex + expected7 = df4.merge(df2, how="inner", left_on="i1", right_on="a") + jdf7 = mdf4.merge(mdf2, how="inner", left_on="i1", right_on="a", auto_merge="none") + result7 = jdf7.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected7, 0), sort_dataframe_inplace(result7, 0) + ) + + mdf5 = from_pandas(df2, chunk_size=4) + mdf6 = from_pandas(df4, chunk_size=1) + expected7 = df4.merge(df2, how="inner", left_on="i1", right_on="a") + jdf7 = mdf6.merge(mdf5, how="inner", left_on="i1", right_on="a", auto_merge="none") + result7 = jdf7.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected7, 0), sort_dataframe_inplace(result7, 0) + ) + + # merge when on is in MultiIndex, and on not in index + expected8 = df4.merge(df2, how="inner", on=["a", "b"]) + jdf8 = mdf4.merge(mdf2, how="inner", on=["a", "b"], auto_merge="none") + result8 = jdf8.execute().fetch() + pd.testing.assert_frame_equal( + 
sort_dataframe_inplace(expected8, 0), sort_dataframe_inplace(result8, 0) + ) + + +def test_join(setup): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], index=["a1", "a2", "a3"]) + df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1 + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + # default `how` + expected0 = df1.join(df2, lsuffix="l_", rsuffix="r_") + jdf0 = mdf1.join(mdf2, lsuffix="l_", rsuffix="r_", auto_merge="none") + result0 = jdf0.execute().fetch() + pd.testing.assert_frame_equal(expected0.sort_index(), result0.sort_index()) + + # how = 'left' + expected1 = df1.join(df2, how="left", lsuffix="l_", rsuffix="r_") + jdf1 = mdf1.join(mdf2, how="left", lsuffix="l_", rsuffix="r_", auto_merge="none") + result1 = jdf1.execute().fetch() + pd.testing.assert_frame_equal(expected1.sort_index(), result1.sort_index()) + + # how = 'right' + expected2 = df1.join(df2, how="right", lsuffix="l_", rsuffix="r_") + jdf2 = mdf1.join(mdf2, how="right", lsuffix="l_", rsuffix="r_", auto_merge="none") + result2 = jdf2.execute().fetch() + pd.testing.assert_frame_equal(expected2.sort_index(), result2.sort_index()) + + # how = 'inner' + expected3 = df1.join(df2, how="inner", lsuffix="l_", rsuffix="r_") + jdf3 = mdf1.join(mdf2, how="inner", lsuffix="l_", rsuffix="r_", auto_merge="none") + result3 = jdf3.execute().fetch() + pd.testing.assert_frame_equal(expected3.sort_index(), result3.sort_index()) + + # how = 'outer' + expected4 = df1.join(df2, how="outer", lsuffix="l_", rsuffix="r_") + jdf4 = mdf1.join(mdf2, how="outer", lsuffix="l_", rsuffix="r_", auto_merge="none") + result4 = jdf4.execute().fetch() + pd.testing.assert_frame_equal(expected4.sort_index(), result4.sort_index()) + + +def test_join_on(setup): + df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], columns=["a1", "a2", "a3"]) + df2 = ( + pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], columns=["a1", "b2", "b3"]) + 1 + ) + df2 = pd.concat([df2, df2 + 1]) + + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=2) + + expected0 = df1.join(df2, on=None, lsuffix="_l", rsuffix="_r") + jdf0 = mdf1.join(mdf2, on=None, lsuffix="_l", rsuffix="_r", auto_merge="none") + result0 = jdf0.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected0, 0), sort_dataframe_inplace(result0, 0) + ) + + expected1 = df1.join(df2, how="left", on="a1", lsuffix="_l", rsuffix="_r") + jdf1 = mdf1.join( + mdf2, how="left", on="a1", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result1 = jdf1.execute().fetch() + + # Note [Columns of Left Join] + # + # I believe we have no way to obtain exactly the same result as pandas here: + # + # Look at the following example: + # + # >>> df1 + # a1 a2 a3 + # 0 1 3 3 + # >>> df2 + # a1 b2 b3 + # 1 2 6 7 + # >>> df3 + # a1 b2 b3 + # 1 2 6 7 + # 1 2 6 7 + # + # >>> df1.merge(df2, how='left', left_on='a1', left_index=False, right_index=True) + # a1_x a2 a3 a1_y b2 b3 + # 0 1 3 3 2 6 7 + # >>> df1.merge(df3, how='left', left_on='a1', left_index=False, right_index=True) + # a1 a1_x a2 a3 a1_y b2 b3 + # 0 1 1 3 3 2 6 7 + # 0 1 1 3 3 2 6 7 + # + # Note that the result of `df1.merge(df3)` has an extra column `a` compared to `df1.merge(df2)`. + # The value of column `a` is the same as `a1_x`, just because `1` occurs twice in the index of `df3`. + # I haven't investigated why pandas has such behaviour...
+ # + # We cannot yield the same result as pandas, because `df3` is chunked: some of the + # result chunks have 6 columns, others may have 7 columns, and when concatenated into one DataFrame + # some cells of column `a` will have value `NaN`, which is different from the result of pandas. + # + # But we can guarantee that the other effective columns have exactly the same values as pandas. + + columns_to_compare = jdf1.columns_value.to_pandas() + + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected1[columns_to_compare], 0, 1), + sort_dataframe_inplace(result1[columns_to_compare], 0, 1), + ) + + # Note [Index of Join on EmptyDataFrame] + # + # It is non-trivial to get the same `index` result as pandas. + # + # Look at the following example: + # + # >>> df1 + # a1 a2 a3 + # 1 4 2 6 + # >>> df2 + # a1 b2 b3 + # 1 2 6 7 + # 2 8 9 10 + # >>> df3 + # Empty DataFrame + # Columns: [a1, a2, a3] + # Index: [] + # >>> df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r') + # a1_l a2 a3 a1_r b2 b3 + # 1.0 4.0 2 6.0 8 9 10 + # NaN NaN 1 NaN 2 6 7 + # >>> df3.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r') + # a1_l a2 a3 a1_r b2 b3 + # 1 NaN 1 NaN 2 6 7 + # 2 NaN 2 NaN 8 9 10 + # + # When the `left` dataframe is not empty, the mismatched rows in `right` will have index value `NaN`, + # and the matched rows have index value from `right`. When the `left` dataframe is empty, the mismatched + # rows have index value from `right`. + # + # Since we chunked the `left` dataframe, it is hard to obtain the same index values as pandas in the + # final result dataframe, but we guarantee that the dataframe content is correct. + + expected2 = df1.join(df2, how="right", on="a2", lsuffix="_l", rsuffix="_r") + jdf2 = mdf1.join( + mdf2, how="right", on="a2", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result2 = jdf2.execute().fetch() + + expected2.set_index("a2", inplace=True) + result2.set_index("a2", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected2, 0), sort_dataframe_inplace(result2, 0) + ) + + expected3 = df1.join(df2, how="inner", on="a2", lsuffix="_l", rsuffix="_r") + jdf3 = mdf1.join( + mdf2, how="inner", on="a2", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result3 = jdf3.execute().fetch() + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected3, 0), sort_dataframe_inplace(result3, 0) + ) + + expected4 = df1.join(df2, how="outer", on="a2", lsuffix="_l", rsuffix="_r") + jdf4 = mdf1.join( + mdf2, how="outer", on="a2", lsuffix="_l", rsuffix="_r", auto_merge="none" + ) + result4 = jdf4.execute().fetch() + + expected4.set_index("a2", inplace=True) + result4.set_index("a2", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0) + ) + + +def test_merge_one_chunk(setup): + df1 = pd.DataFrame( + {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}, + index=["a1", "a2", "a3", "a4"], + ) + df2 = pd.DataFrame( + {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, + index=["a1", "a2", "a3", "a4"], + ) + + # all have one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2) + + expected = df1.merge(df2, left_on="lkey", right_on="rkey") + jdf = mdf1.merge(mdf2, left_on="lkey", right_on="rkey", auto_merge="none") + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) +
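The remaining cases rely on two comparison tricks: sorting on a value column as above, and, per Note [Index of Join on EmptyDataFrame], moving the join key into the index so the comparison does not depend on which row labels the chunked execution produced. A small pandas-only illustration with made-up data (not part of the diff):

    import pandas as pd

    left = pd.DataFrame({"a1": [4, 7], "a2": [2, 8]})
    right = pd.DataFrame({"b2": [6, 9], "b3": [7, 10]}, index=[2, 8])

    expected = left.join(right, on="a2", how="right")
    # pretend a chunk-wise run attached different row labels to the same rows
    relabelled = expected.copy()
    relabelled.index = ["x", "y"]

    # keying both frames on the join column makes the comparison label-independent
    pd.testing.assert_frame_equal(
        expected.set_index("a2").sort_index(),
        relabelled.set_index("a2").sort_index(),
    )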
+ # left have one chunk + mdf1 = from_pandas(df1) + mdf2 = from_pandas(df2, chunk_size=2) + + expected = df1.merge(df2, left_on="lkey", right_on="rkey") + jdf = mdf1.merge(mdf2, left_on="lkey", right_on="rkey", auto_merge="none") + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) + + # right have one chunk + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2) + + expected = df1.merge(df2, left_on="lkey", right_on="rkey") + jdf = mdf1.merge(mdf2, left_on="lkey", right_on="rkey", auto_merge="none") + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) + + # left have one chunk and how="left", then one chunk tile + # will result in wrong results, see #GH 2107 + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2) + + expected = df2.merge(df1, left_on="rkey", right_on="lkey", how="left") + jdf = mdf2.merge( + mdf1, left_on="rkey", right_on="lkey", how="left", auto_merge="none" + ) + result = jdf.execute().fetch() + + pd.testing.assert_frame_equal( + expected.sort_values(by=expected.columns[1]).reset_index(drop=True), + result.sort_values(by=result.columns[1]).reset_index(drop=True), + ) + + +def test_broadcast_merge(setup): + ns = np.random.RandomState(0) + # small dataframe + raw1 = pd.DataFrame( + { + "key": ns.randint(0, 10, size=10), + "value": np.arange(10), + }, + index=[f"a{i}" for i in range(10)], + ) + # big dataframe + raw2 = pd.DataFrame( + { + "key": ns.randint(0, 100, size=100), + "value": np.arange(100, 200), + }, + index=[f"a{i}" for i in range(100)], + ) + + # test broadcast right and how="inner" + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df2.merge(df1, on="key", auto_merge="none", bloom_filter=False) + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # inner join doesn't need shuffle + assert all(not isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw2.merge(raw1, on="key") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected, 0), sort_dataframe_inplace(result, 0) + ) + + # test broadcast right and how="left" + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df2.merge(df1, on="key", how="left", auto_merge="none", method="broadcast") + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # left join need shuffle + assert any(isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw2.merge(raw1, on="key", how="left") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + expected.sort_values(by=["key", "value_x"]), + result.sort_values(by=["key", "value_x"]), + ) + + # test broadcast left + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df1.merge(df2, on="key", auto_merge="none", 
bloom_filter=False) + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # inner join doesn't need shuffle + assert all(not isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw1.merge(raw2, on="key") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + sort_dataframe_inplace(expected, 0), sort_dataframe_inplace(result, 0) + ) + + # test broadcast left and how="right" + df1 = from_pandas(raw1, chunk_size=5) + df2 = from_pandas(raw2, chunk_size=10) + r = df1.merge(df2, on="key", how="right", auto_merge="none") + # make sure it selects broadcast merge, for broadcast, there must be + # DataFrameConcat operands + graph = build_graph([r], tile=True) + assert any(isinstance(c.op, DataFrameConcat) for c in graph) + # right join need shuffle + assert any(isinstance(c.op, DataFrameMergeAlign) for c in graph) + + result = r.execute().fetch() + expected = raw1.merge(raw2, on="key", how="right") + + expected.set_index("key", inplace=True) + result.set_index("key", inplace=True) + pd.testing.assert_frame_equal( + expected.sort_values(by=["key", "value_x"]), + result.sort_values(by=["key", "value_x"]), + ) + + +def test_merge_with_bloom_filter(setup): + ns = np.random.RandomState(0) + raw_df1 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + raw_df2 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + + df1 = from_pandas(raw_df1, chunk_size=10) + df2 = from_pandas(raw_df2, chunk_size=15) + + expected = raw_df1.merge(raw_df2, on="col2") + + result = ( + df1.merge( + df2, + on="col2", + bloom_filter=True, + bloom_filter_options={"max_elements": 100, "error_rate": 0.01}, + auto_merge="none", + ) + .execute() + .fetch() + ) + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + result.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + ) + + result = ( + df2.merge(df1, on=["col2", "col3"], bloom_filter=True, auto_merge="none") + .execute() + .fetch() + ) + expected = raw_df2.merge(raw_df1, on=["col2", "col3"]) + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + result.sort_values(by=["col1_x", "col2"]).reset_index(drop=True), + ) + + # on index + result = df2.merge(df1, bloom_filter=True, auto_merge="none").execute().fetch() + expected = raw_df2.merge(raw_df1) + pd.testing.assert_frame_equal( + expected.sort_index().reset_index(drop=True), + result.sort_index().reset_index(drop=True), + ) + + # on float column + result = ( + df2.merge(df1, on="col1", bloom_filter=True, auto_merge="none") + .execute() + .fetch() + ) + expected = raw_df2.merge(raw_df1, on="col1") + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1", "col2_x"]).reset_index(drop=True), + result.sort_values(by=["col1", "col2_x"]).reset_index(drop=True), + ) + + # on float columns + result = ( + df2.merge(df1, on=["col1", "col2"], bloom_filter=True, auto_merge="none") + .execute() + .fetch() + ) + expected = raw_df2.merge(raw_df1, on=["col1", "col2"]) + pd.testing.assert_frame_equal( + expected.sort_values(by=["col1", "col2"]).reset_index(drop=True), + result.sort_values(by=["col1", 
"col2"]).reset_index(drop=True), + ) + + # multi index + raw_df3 = raw_df1.copy() + raw_df3.index = pd.MultiIndex.from_tuples( + [(i, i + 1) for i in range(100)], names=["i1", "i2"] + ) + df3 = from_pandas(raw_df3, chunk_size=8) + result = ( + df3.merge( + df1, left_on="i1", right_on="col2", bloom_filter=True, auto_merge="none" + ) + .execute() + .fetch() + ) + expected = raw_df3.merge(raw_df1, left_on="i1", right_on="col2") + pd.testing.assert_frame_equal( + expected.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + result.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + ) + + df4 = from_pandas(raw_df3, chunk_size=20) + result = ( + df4.merge( + df1, left_on="i1", right_on="col2", bloom_filter=True, auto_merge="none" + ) + .execute() + .fetch() + ) + expected = raw_df3.merge(raw_df1, left_on="i1", right_on="col2") + pd.testing.assert_frame_equal( + expected.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + result.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + ) + + +@pytest.mark.parametrize("filter", ["small", "large", "both"]) +def test_merge_with_bloom_filter_options(setup, filter): + ns = np.random.RandomState(0) + raw_df1 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + raw_df2 = pd.DataFrame( + { + "col1": ns.random(100), + "col2": ns.randint(0, 10, size=(100,)), + "col3": ns.randint(0, 10, size=(100,)), + } + ) + + df1 = from_pandas(raw_df1, chunk_size=25) + df2 = from_pandas(raw_df2, chunk_size=30) + m = df1.merge( + df2, + on="col2", + auto_merge="none", + method="shuffle", + bloom_filter=True, + bloom_filter_options={"filter": filter, "apply_chunk_size_threshold": 0}, + ) + + expected = raw_df1.merge(raw_df2, on="col2") + result = m.execute().fetch() + pd.testing.assert_frame_equal( + expected.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + result.sort_index().sort_values(by=["col1_x"]).reset_index(drop=True), + ) + + +@pytest.mark.parametrize("auto_merge", ["none", "both", "before", "after"]) +def test_merge_on_duplicate_columns(setup, auto_merge): + raw1 = pd.DataFrame( + [["foo", 1, "bar"], ["bar", 2, "foo"], ["baz", 3, "foo"]], + columns=["lkey", "value", "value"], + index=["a1", "a2", "a3"], + ) + raw2 = pd.DataFrame( + {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, + index=["a1", "a2", "a3", "a4"], + ) + + df1 = from_pandas(raw1, chunk_size=2) + df2 = from_pandas(raw2, chunk_size=3) + + r = df1.merge( + df2, + left_on="lkey", + right_on="rkey", + auto_merge=auto_merge, + auto_merge_threshold=0, + ) + result = r.execute().fetch() + expected = raw1.merge(raw2, left_on="lkey", right_on="rkey") + pd.testing.assert_frame_equal(expected, result) + + +def test_append_execution(setup): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + + adf = mdf1.append(mdf2) + expected = df1.append(df2) + result = adf.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + adf = mdf1.append(mdf2, ignore_index=True) + expected = df1.append(df2, ignore_index=True) + result = adf.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=2) + + adf = mdf1.append(mdf2) + expected = df1.append(df2) + result 
= adf.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + adf = mdf1.append(mdf2, ignore_index=True) + expected = df1.append(df2, ignore_index=True) + result = adf.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + df3 = pd.DataFrame(np.random.rand(8, 4), columns=list("ABCD")) + mdf3 = from_pandas(df3, chunk_size=3) + expected = df1.append([df2, df3]) + adf = mdf1.append([mdf2, mdf3]) + result = adf.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + adf = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True) + expected = df1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True) + result = adf.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + # test for series + series1 = pd.Series(np.random.rand(10)) + series2 = pd.Series(np.random.rand(10)) + + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=3) + + aseries = mseries1.append(mseries2) + expected = series1.append(series2) + result = aseries.execute().fetch() + pd.testing.assert_series_equal(expected, result) + + aseries = mseries1.append(mseries2, ignore_index=True) + expected = series1.append(series2, ignore_index=True) + result = aseries.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_series_equal(expected, result) + + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=2) + + aseries = mseries1.append(mseries2) + expected = series1.append(series2) + result = aseries.execute().fetch() + pd.testing.assert_series_equal(expected, result) + + aseries = mseries1.append(mseries2, ignore_index=True) + expected = series1.append(series2, ignore_index=True) + result = aseries.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_series_equal(expected, result) + + series3 = pd.Series(np.random.rand(4)) + mseries3 = series_from_pandas(series3, chunk_size=2) + expected = series1.append([series2, series3]) + aseries = mseries1.append([mseries2, mseries3]) + result = aseries.execute().fetch() + pd.testing.assert_series_equal(expected, result) + + +def test_concat(setup): + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + + r = concat([mdf1, mdf2]) + expected = pd.concat([df1, df2]) + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + # test different chunk size and ignore_index=True + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=3) + + r = concat([mdf1, mdf2], ignore_index=True) + expected = pd.concat([df1, df2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(expected, result) + + # test axis=1 + mdf1 = from_pandas(df1, chunk_size=2) + mdf2 = from_pandas(df2, chunk_size=3) + + r = concat([mdf1, mdf2], axis=1) + expected = pd.concat([df1, df2], axis=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + # test multiply dataframes + r = concat([mdf1, mdf2, mdf1]) + expected = pd.concat([df1, df2, df1]) + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD")) + df2 = pd.DataFrame(np.random.rand(10, 3), 
columns=list("ABC")) + + mdf1 = from_pandas(df1, chunk_size=3) + mdf2 = from_pandas(df2, chunk_size=3) + + # test join=inner + r = concat([mdf1, mdf2], join="inner") + expected = pd.concat([df1, df2], join="inner") + result = r.execute().fetch() + pd.testing.assert_frame_equal(expected, result) + + # test for series + series1 = pd.Series(np.random.rand(10)) + series2 = pd.Series(np.random.rand(10)) + + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=3) + + r = concat([mseries1, mseries2]) + expected = pd.concat([series1, series2]) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, expected) + + # test different series and ignore_index + mseries1 = series_from_pandas(series1, chunk_size=4) + mseries2 = series_from_pandas(series2, chunk_size=3) + + r = concat([mseries1, mseries2], ignore_index=True) + expected = pd.concat([series1, series2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_series_equal(result, expected) + + # test axis=1 + mseries1 = series_from_pandas(series1, chunk_size=3) + mseries2 = series_from_pandas(series2, chunk_size=3) + + r = concat([mseries1, mseries2], axis=1) + expected = pd.concat([series1, series2], axis=1) + result = r.execute(extra_config={"check_shape": False}).fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge dataframe and series + r = concat([mdf1, mseries2], ignore_index=True) + expected = pd.concat([df1, series2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge series and dataframe + r = concat([mseries1, mdf2], ignore_index=True) + expected = pd.concat([series1, df2], ignore_index=True) + result = r.execute(extra_config={"check_index_value": False}).fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge dataframe and series, axis=1 + r = concat([mdf1, mseries2], axis=1) + expected = pd.concat([df1, series2], axis=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) + + # test merge series and dataframe, axis=1 + r = concat([mseries1, mdf2], axis=1) + expected = pd.concat([series1, df2], axis=1) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, expected) diff --git a/python/xorbits/_mars/dataframe/missing/__init__.py b/python/xorbits/_mars/dataframe/missing/__init__.py new file mode 100644 index 000000000..5632476fc --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/__init__.py @@ -0,0 +1,51 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .checkna import isna, isnull, notna, notnull +from .dropna import df_dropna, index_dropna, series_dropna +from .fillna import bfill, ffill, fillna, index_fillna +from .replace import df_replace, series_replace + + +def _install(): + from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE + + for cls in DATAFRAME_TYPE + SERIES_TYPE: + setattr(cls, "fillna", fillna) + setattr(cls, "ffill", ffill) + setattr(cls, "pad", ffill) + setattr(cls, "backfill", bfill) + setattr(cls, "bfill", bfill) + setattr(cls, "isna", isna) + setattr(cls, "isnull", isnull) + setattr(cls, "notna", notna) + setattr(cls, "notnull", notnull) + + for cls in DATAFRAME_TYPE: + setattr(cls, "dropna", df_dropna) + setattr(cls, "replace", df_replace) + + for cls in SERIES_TYPE: + setattr(cls, "dropna", series_dropna) + setattr(cls, "replace", series_replace) + + for cls in INDEX_TYPE: + setattr(cls, "fillna", index_fillna) + setattr(cls, "dropna", index_dropna) + setattr(cls, "isna", isna) + setattr(cls, "notna", notna) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/missing/checkna.py b/python/xorbits/_mars/dataframe/missing/checkna.py new file mode 100644 index 000000000..65ab17555 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/checkna.py @@ -0,0 +1,295 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +import numpy as np +import pandas as pd + +from ... import dataframe as md +from ... import opcodes +from ... 
import tensor as mt +from ...config import options +from ...core import OutputType +from ...serialization.serializables import BoolField +from ..operands import ( + DATAFRAME_TYPE, + ENTITY_TYPE, + INDEX_TYPE, + SERIES_TYPE, + TENSOR_TYPE, + DataFrameOperand, + DataFrameOperandMixin, +) + + +class DataFrameCheckNA(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.CHECK_NA + + _positive = BoolField("positive") + _use_inf_as_na = BoolField("use_inf_as_na") + + def __init__( + self, positive=None, use_inf_as_na=None, sparse=None, output_types=None, **kw + ): + super().__init__( + _positive=positive, + _use_inf_as_na=use_inf_as_na, + _output_types=output_types, + sparse=sparse, + **kw, + ) + + @property + def positive(self) -> bool: + return self._positive + + @property + def use_inf_as_na(self) -> bool: + return self._use_inf_as_na + + def __call__(self, df): + if isinstance(df, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + elif isinstance(df, SERIES_TYPE): + self.output_types = [OutputType.series] + elif isinstance(df, TENSOR_TYPE) or isinstance(df, INDEX_TYPE): + self.output_types = [OutputType.tensor] + else: + raise TypeError( + f"Expecting mars dataframe, series, index, or tensor, got {type(df)}" + ) + + params = df.params.copy() + if self.output_types[0] == OutputType.dataframe: + params["dtypes"] = pd.Series( + [np.dtype("bool")] * len(df.dtypes), index=df.columns_value.to_pandas() + ) + else: + params["dtype"] = np.dtype("bool") + return self.new_tileable([df], **params) + + @classmethod + def tile(cls, op: "DataFrameCheckNA"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_df.chunks: + params = c.params.copy() + if op.output_types[0] == OutputType.dataframe: + params["dtypes"] = pd.Series( + [np.dtype("bool")] * len(c.dtypes), + index=c.columns_value.to_pandas(), + ) + else: + params["dtype"] = np.dtype("bool") + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + params = out_df.params.copy() + params.update(dict(chunks=chunks, nsplits=in_df.nsplits)) + return new_op.new_tileables([in_df], **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameCheckNA"): + in_data = ctx[op.inputs[0].key] + old_use_inf_as_na = pd.get_option("mode.use_inf_as_na") + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.positive: + ctx[op.outputs[0].key] = in_data.isna() + else: + ctx[op.outputs[0].key] = in_data.notna() + finally: + pd.set_option("mode.use_inf_as_na", old_use_inf_as_na) + + +def _from_pandas(obj: Any): + if isinstance(obj, pd.DataFrame): + from ..datasource.dataframe import from_pandas + + return from_pandas(obj) + elif isinstance(obj, pd.Series): + from ..datasource.series import from_pandas + + return from_pandas(obj) + elif isinstance(obj, np.ndarray): + return mt.tensor(obj) + else: + return obj + + +def isna(obj): + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as None or :attr:`numpy.NaN`, gets mapped to True + values. + + Everything else gets mapped to False values. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + Returns + ------- + DataFrame + Mask of bool values for each element in DataFrame that + indicates whether an element is not an NA value. + + See Also + -------- + DataFrame.isnull : Alias of isna. 
+ DataFrame.notna : Boolean inverse of isna. + DataFrame.dropna : Omit axes labels with missing values. + isna : Top-level isna. + + Examples + -------- + Show which entries in a DataFrame are NA. + + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'age': [5, 6, np.NaN], + ... 'born': [md.NaT, md.Timestamp('1939-05-27'), + ... md.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}) + >>> df.execute() + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.isna().execute() + age born name toy + 0 False True False True + 1 False False False False + 2 True False False False + + Show which entries in a Series are NA. + + >>> ser = md.Series([5, 6, np.NaN]) + >>> ser.execute() + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.isna().execute() + 0 False + 1 False + 2 True + dtype: bool + """ + if isinstance(obj, md.MultiIndex): + raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, ENTITY_TYPE): + if isinstance(obj, TENSOR_TYPE): + if options.dataframe.mode.use_inf_as_na: + return ~mt.isfinite(obj) + else: + return mt.isnan(obj) + else: + op = DataFrameCheckNA( + positive=True, use_inf_as_na=options.dataframe.mode.use_inf_as_na + ) + return op(obj) + else: + return _from_pandas(pd.isna(obj)) + + +def notna(obj): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to True. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to False + values. + + Returns + ------- + DataFrame + Mask of bool values for each element in DataFrame that + indicates whether an element is not an NA value. + + See Also + -------- + DataFrame.notnull : Alias of notna. + DataFrame.isna : Boolean inverse of notna. + DataFrame.dropna : Omit axes labels with missing values. + notna : Top-level notna. + + Examples + -------- + Show which entries in a DataFrame are not NA. + + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'age': [5, 6, np.NaN], + ... 'born': [md.NaT, md.Timestamp('1939-05-27'), + ... md.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}) + >>> df.execute() + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.notna().execute() + age born name toy + 0 True False True False + 1 True True True True + 2 False True True True + + Show which entries in a Series are not NA. 
+ + >>> ser = md.Series([5, 6, np.NaN]) + >>> ser.execute() + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.notna().execute() + 0 True + 1 True + 2 False + dtype: bool + """ + if isinstance(obj, md.MultiIndex): + raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, ENTITY_TYPE): + if isinstance(obj, TENSOR_TYPE): + if options.dataframe.mode.use_inf_as_na: + return mt.isfinite(obj) + else: + return ~mt.isnan(obj) + else: + op = DataFrameCheckNA( + positive=False, use_inf_as_na=options.dataframe.mode.use_inf_as_na + ) + return op(obj) + else: + return _from_pandas(pd.notna(obj)) + + +isnull = isna +notnull = notna diff --git a/python/xorbits/_mars/dataframe/missing/dropna.py b/python/xorbits/_mars/dataframe/missing/dropna.py new file mode 100644 index 000000000..0e595c572 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/dropna.py @@ -0,0 +1,451 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...config import options +from ...core import OutputType, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField +from ...utils import no_default, pd_release_version +from ..align import align_dataframe_series +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import parse_index, validate_axis + +_drop_na_enable_no_default = pd_release_version[:2] >= (1, 5) + + +class DataFrameDropNA(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.DROP_NA + + _axis = AnyField("axis") + _how = StringField("how") + _thresh = Int32Field("thresh") + _subset = AnyField("subset") + _use_inf_as_na = BoolField("use_inf_as_na") + + # when True, dropna will be called on the input, + # otherwise non-nan counts will be used + _drop_directly = BoolField("drop_directly") + # size of subset, used when how == 'any' + _subset_size = Int32Field("subset_size") + + def __init__( + self, + axis=None, + how=None, + thresh=None, + subset=None, + use_inf_as_na=None, + drop_directly=None, + subset_size=None, + sparse=None, + output_types=None, + **kw + ): + super().__init__( + _axis=axis, + _how=how, + _thresh=thresh, + _subset=subset, + _use_inf_as_na=use_inf_as_na, + _drop_directly=drop_directly, + _subset_size=subset_size, + _output_types=output_types, + sparse=sparse, + **kw + ) + + @property + def axis(self) -> int: + return self._axis + + @property + def how(self) -> str: + return self._how + + @property + def thresh(self) -> int: + return self._thresh + + @property + def subset(self) -> list: + return self._subset + + @property + def use_inf_as_na(self) -> bool: + return self._use_inf_as_na + + @property + def drop_directly(self) -> bool: + return self._drop_directly + + @property + def subset_size(self) -> int: + return self._subset_size + + def __call__(self, df): + new_shape = list(df.shape) + new_shape[0] = np.nan + + params = df.params.copy() + params["index_value"] = 
parse_index(None, df.key, df.index_value.key) + params["shape"] = tuple(new_shape) + return self.new_tileable([df], **params) + + @classmethod + def _tile_drop_directly(cls, op: "DataFrameDropNA"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_df.chunks: + new_shape = list(c.shape) + new_shape[0] = np.nan + + params = c.params.copy() + params["index_value"] = parse_index(None, c.key, c.index_value.key) + params["shape"] = tuple(new_shape) + + new_op = op.copy().reset_key() + new_op._drop_directly = True + chunks.append(new_op.new_chunk([c], **params)) + + new_nsplits = list(in_df.nsplits) + new_nsplits[0] = (np.nan,) * len(in_df.nsplits[0]) + + new_op = op.copy().reset_key() + params = out_df.params.copy() + params.update(dict(chunks=chunks, nsplits=new_nsplits)) + return new_op.new_tileables([in_df], **params) + + @classmethod + def tile(cls, op: "DataFrameDropNA"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + if not _drop_na_enable_no_default: + op._how = None if op.how is no_default else op.how + op._thresh = None if op.thresh is no_default else op.thresh + + # series tiling will go here + if len(in_df.chunk_shape) == 1 or in_df.chunk_shape[1] == 1: + return cls._tile_drop_directly(op) + + subset_df = in_df + if op.subset: + subset_df = in_df[op.subset] + count_series = yield from recursive_tile( + subset_df.agg("count", axis=1, _use_inf_as_na=op.use_inf_as_na) + ) + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series( + in_df, count_series, axis=0 + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, left_chunks): + series_chunk = right_chunks[out_idx[0]] + kw = dict( + shape=(np.nan, nsplits[1][out_idx[1]]), + dtypes=df_chunk.dtypes, + index_value=df_chunk.index_value, + columns_value=df_chunk.columns_value, + ) + + new_op = op.copy().reset_key() + new_op._drop_directly = False + new_op._subset_size = len(op.subset) if op.subset else len(in_df.dtypes) + out_chunks.append( + new_op.new_chunk([df_chunk, series_chunk], index=out_idx, **kw) + ) + + new_op = op.copy().reset_key() + params = out_df.params.copy() + new_nsplits = list(tuple(ns) for ns in nsplits) + new_nsplits[0] = (np.nan,) * len(new_nsplits[0]) + params.update(dict(nsplits=tuple(new_nsplits), chunks=out_chunks)) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op: "DataFrameDropNA"): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + + in_data = ctx[op.inputs[0].key] + if op.drop_directly: + if isinstance(in_data, pd.DataFrame): + result = in_data.dropna( + axis=op.axis, how=op.how, thresh=op.thresh, subset=op.subset + ) + elif isinstance(in_data, pd.Series): + result = in_data.dropna(axis=op.axis, how=op.how) + else: + result = in_data.dropna(how=op.how) + ctx[op.outputs[0].key] = result + return + + in_counts = ctx[op.inputs[1].key] + if op.how == "all": + in_counts = in_counts[in_counts > 0] + else: + if op.thresh is None or op.thresh is no_default: + thresh = op.subset_size + else: # pragma: no cover + thresh = op.thresh + in_counts = in_counts[in_counts >= thresh] + + ctx[op.outputs[0].key] = in_data.reindex(in_counts.index) + finally: + pd.reset_option("mode.use_inf_as_na") + + +def df_dropna( + df, axis=0, how=no_default, thresh=no_default, subset=None, inplace=False +): + """ + Remove missing values. 
+ + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Determine if rows or columns which contain missing values are + removed. + + * 0, or 'index' : Drop rows which contain missing values. + * 1, or 'columns' : Drop columns which contain missing value. + + .. versionchanged:: 1.0.0 + + Pass tuple or list to drop on multiple axes. + Only a single axis is allowed. + + how : {'any', 'all'}, default 'any' + Determine if row or column is removed from DataFrame, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + + thresh : int, optional + Require that many non-NA values. + subset : array-like, optional + Labels along other axis to consider, e.g. if you are dropping rows + these would be a list of columns to include. + inplace : bool, default False + If True, do operation inplace and return None. + + Returns + ------- + DataFrame + DataFrame with NA entries dropped from it. + + See Also + -------- + DataFrame.isna: Indicate missing values. + DataFrame.notna : Indicate existing (non-missing) values. + DataFrame.fillna : Replace missing values. + Series.dropna : Drop missing values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], + ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], + ... "born": [md.NaT, md.Timestamp("1940-04-25"), + ... md.NaT]}) + >>> df.execute() + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Drop the rows where at least one element is missing. + + >>> df.dropna().execute() + name toy born + 1 Batman Batmobile 1940-04-25 + + Drop the rows where all elements are missing. + + >>> df.dropna(how='all').execute() + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Keep only the rows with at least 2 non-NA values. + + >>> df.dropna(thresh=2).execute() + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'born']).execute() + name toy born + 1 Batman Batmobile 1940-04-25 + + Keep the DataFrame with valid entries in the same variable. + + >>> df.dropna(inplace=True) + >>> df.execute() + name toy born + 1 Batman Batmobile 1940-04-25 + """ + axis = validate_axis(axis, df) + if axis != 0: + raise NotImplementedError("Does not support dropna on DataFrame when axis=1") + if ( + _drop_na_enable_no_default + and (how is not no_default) + and (thresh is not no_default) + ): + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." + ) + if thresh is no_default and how is no_default: + how = "any" + + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameDropNA( + axis=axis, + how=how, + thresh=thresh, + subset=subset, + output_types=[OutputType.dataframe], + use_inf_as_na=use_inf_as_na, + ) + out_df = op(df) + if inplace: + df.data = out_df.data + else: + return out_df + + +def series_dropna(series, axis=0, inplace=False, how=None): + """ + Return a new Series with missing values removed. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. 
+ + Parameters + ---------- + axis : {0 or 'index'}, default 0 + There is only one axis to drop values from. + inplace : bool, default False + If True, do operation inplace and return None. + how : str, optional + Not in use. Kept for compatibility. + + Returns + ------- + Series + Series with NA entries dropped from it. + + See Also + -------- + Series.isna: Indicate missing values. + Series.notna : Indicate existing (non-missing) values. + Series.fillna : Replace missing values. + DataFrame.dropna : Drop rows or columns which contain NA values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> import mars.dataframe as md + >>> ser = md.Series([1., 2., np.nan]) + >>> ser.execute() + 0 1.0 + 1 2.0 + 2 NaN + dtype: float64 + + Drop NA values from a Series. + + >>> ser.dropna().execute() + 0 1.0 + 1 2.0 + dtype: float64 + + Keep the Series with valid entries in the same variable. + + >>> ser.dropna(inplace=True) + >>> ser.execute() + 0 1.0 + 1 2.0 + dtype: float64 + + Empty strings are not considered NA values. ``None`` is considered an + NA value. + + >>> ser = md.Series([np.NaN, 2, md.NaT, '', None, 'I stay']) + >>> ser.execute() + 0 NaN + 1 2 + 2 NaT + 3 + 4 None + 5 I stay + dtype: object + >>> ser.dropna().execute() + 1 2 + 3 + 5 I stay + dtype: object + """ + axis = validate_axis(axis, series) + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameDropNA( + axis=axis, + how=how, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + ) + out_series = op(series) + if inplace: + series.data = out_series.data + else: + return out_series + + +def index_dropna(index, how="any"): + """ + Return Index without NA/NaN values. + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns + ------- + Index + """ + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameDropNA( + axis=0, how=how, output_types=[OutputType.index], use_inf_as_na=use_inf_as_na + ) + return op(index) diff --git a/python/xorbits/_mars/dataframe/missing/fillna.py b/python/xorbits/_mars/dataframe/missing/fillna.py new file mode 100644 index 000000000..73c46cbc9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/fillna.py @@ -0,0 +1,678 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd + +from ... 
import opcodes +from ...config import options +from ...core import ENTITY_TYPE, Entity, OutputType, get_output_types +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, BoolField, Int64Field, StringField +from ..align import ( + align_dataframe_dataframe, + align_dataframe_series, + align_series_series, +) +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import validate_axis + + +class FillNA(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.FILL_NA + + _value = AnyField( + "value", on_serialize=lambda x: x.data if isinstance(x, Entity) else x + ) + _method = StringField("method") + _axis = AnyField("axis") + _limit = Int64Field("limit") + _downcast = AnyField("downcast") + _use_inf_as_na = BoolField("use_inf_as_na") + + _output_limit = Int64Field("output_limit") + + def __init__( + self, + value=None, + method=None, + axis=None, + limit=None, + downcast=None, + use_inf_as_na=None, + output_types=None, + output_limit=None, + **kw + ): + super().__init__( + _value=value, + _method=method, + _axis=axis, + _limit=limit, + _downcast=downcast, + _use_inf_as_na=use_inf_as_na, + _output_types=output_types, + _output_limit=output_limit, + **kw + ) + + @property + def value(self): + return self._value + + @property + def method(self): + return self._method + + @property + def axis(self): + return self._axis + + @property + def limit(self): + return self._limit + + @property + def downcast(self): + return self._downcast + + @property + def use_inf_as_na(self): + return self._use_inf_as_na + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._method is None and len(inputs) > 1: + self._value = self._inputs[1] + + @property + def output_limit(self): + return self._output_limit or 1 + + @staticmethod + def _get_first_slice(op, df, end): + if op.method == "bfill": + if op.output_types[0] == OutputType.series: + return df.iloc[:end] + else: + if op.axis == 1: + return df.iloc[:, :end] + else: + return df.iloc[:end, :] + else: + if op.output_types[0] == OutputType.series: + return df.iloc[-end:] + else: + if op.axis == 1: + return df.iloc[:, -end:] + else: + return df.iloc[-end:, :] + + @classmethod + def _execute_map(cls, ctx, op): + input_data = ctx[op.inputs[0].key] + limit = op.limit + axis = op.axis + method = op.method + + filled = input_data.fillna( + method=method, axis=axis, limit=limit, downcast=op.downcast + ) + ctx[op.outputs[0].key] = cls._get_first_slice(op, filled, 1) + del filled + + @classmethod + def _execute_combine(cls, ctx, op): + axis = op.axis + method = op.method + limit = op.limit + + input_data = ctx[op.inputs[0].key] + if limit is not None: + n_summaries = (len(op.inputs) - 1) // 2 + summaries = [ctx[inp.key] for inp in op.inputs[1 : 1 + n_summaries]] + else: + summaries = [ctx[inp.key] for inp in op.inputs[1:]] + + if not summaries: + ctx[op.outputs[0].key] = input_data.fillna( + method=method, axis=axis, limit=limit, downcast=op.downcast + ) + return + + valid_summary = cls._get_first_slice( + op, pd.concat(summaries, axis=axis).fillna(method=method, axis=axis), 1 + ) + + if method == "bfill": + concat_df = pd.concat([input_data, valid_summary], axis=axis) + else: + concat_df = pd.concat([valid_summary, input_data], axis=axis) + + concat_df.fillna( + method=method, axis=axis, inplace=True, limit=limit, downcast=op.downcast + ) + ctx[op.outputs[0].key] = cls._get_first_slice(op, concat_df, -1) + + @classmethod + def 
execute(cls, ctx, op): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + input_data = ctx[op.inputs[0].key] + value = getattr(op, "value", None) + if isinstance(op.value, ENTITY_TYPE): + value = ctx[op.value.key] + if not isinstance(input_data, pd.Index): + ctx[op.outputs[0].key] = input_data.fillna( + value=value, + method=op.method, + axis=op.axis, + limit=op.limit, + downcast=op.downcast, + ) + else: + ctx[op.outputs[0].key] = input_data.fillna( + value=value, downcast=op.downcast + ) + finally: + pd.reset_option("mode.use_inf_as_na") + + @classmethod + def _tile_one_by_one(cls, op): + in_df = op.inputs[0] + in_value_df = op.value if isinstance(op.value, ENTITY_TYPE) else None + df = op.outputs[0] + + new_chunks = [] + for c in in_df.chunks: + inputs = [c] if in_value_df is None else [c, in_value_df.chunks[0]] + kw = c.params + new_op = op.copy().reset_key() + new_chunks.append(new_op.new_chunk(inputs, **kw)) + + kw = df.params.copy() + kw.update(dict(chunks=new_chunks, nsplits=in_df.nsplits)) + new_op = op.copy().reset_key() + return new_op.new_tileables(op.inputs, **kw) + + @classmethod + def _build_combine(cls, op, input_chunks, summary_chunks, idx, is_forward=True): + c = input_chunks[idx] + + summaries_to_concat = [] + + idx_range = list( + range(idx) if is_forward else range(idx + 1, len(summary_chunks)) + ) + for i in idx_range: + summaries_to_concat.append(summary_chunks[i]) + + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.combine + + chunks_to_concat = [c] + summaries_to_concat + return new_chunk_op.new_chunk(chunks_to_concat, **c.params) + + @classmethod + def _tile_directional_dataframe(cls, op): + in_df = op.inputs[0] + df = op.outputs[0] + is_forward = op.method == "ffill" + + n_rows, n_cols = in_df.chunk_shape + + # map to get individual results and summaries + src_chunks = np.empty(in_df.chunk_shape, dtype=object) + summary_chunks = np.empty(in_df.chunk_shape, dtype=object) + for c in in_df.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + if op.axis == 1: + summary_shape = (c.shape[0], 1) + else: + summary_shape = (1, c.shape[1]) + src_chunks[c.index] = c + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=summary_shape, dtypes=df.dtypes + ) + + # combine summaries into results + output_chunk_array = np.empty(in_df.chunk_shape, dtype=object) + if op.axis == 1: + for row in range(n_rows): + row_src = src_chunks[row, :] + row_summaries = summary_chunks[row, :] + for col in range(n_cols): + output_chunk_array[row, col] = cls._build_combine( + op, row_src, row_summaries, col, is_forward + ) + else: + for col in range(n_cols): + col_src = src_chunks[:, col] + col_summaries = summary_chunks[:, col] + for row in range(n_rows): + output_chunk_array[row, col] = cls._build_combine( + op, col_src, col_summaries, row, is_forward + ) + + output_chunks = list(output_chunk_array.reshape((n_rows * n_cols,))) + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_df.shape, + nsplits=in_df.nsplits, + chunks=output_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_directional_series(cls, op): + in_series = op.inputs[0] + series = op.outputs[0] + forward = op.method == "ffill" + + # map to get individual results and summaries + summary_chunks = 
np.empty(in_series.chunk_shape, dtype=object) + for c in in_series.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=(1,), dtype=series.dtype + ) + + # combine summaries into results + output_chunks = [ + cls._build_combine(op, in_series.chunks, summary_chunks, i, forward) + for i in range(len(in_series.chunks)) + ] + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_series.shape, + nsplits=in_series.nsplits, + chunks=output_chunks, + dtype=series.dtype, + index_value=series.index_value, + ) + + @classmethod + def _tile_both_dataframes(cls, op): + in_df = op.inputs[0] + in_value = op.inputs[1] + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_dataframe( + in_df, in_value + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape[0])) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + out_chunk_indexes, left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([left_chunk, right_chunk], shape=(np.nan, np.nan), index=idx) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_dataframe_series(cls, op): + left, right = op.inputs[0], op.inputs[1] + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series( + left, right, axis=1 + ) + out_chunk_indexes = itertools.product(*(range(s) for s in out_shape)) + + out_chunks = [] + for out_idx, df_chunk in zip(out_chunk_indexes, left_chunks): + series_chunk = right_chunks[out_idx[1]] + kw = dict( + shape=(nsplits[0][out_idx[0]], nsplits[1][out_idx[1]]), + index_value=df_chunk.index_value, + columns_value=df_chunk.columns_value, + ) + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([df_chunk, series_chunk], index=out_idx, **kw) + ) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + return new_op.new_dataframes( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_both_series(cls, op): + left, right = op.inputs[0], op.inputs[1] + df = op.outputs[0] + + nsplits, out_shape, left_chunks, right_chunks = align_series_series(left, right) + + out_chunks = [] + for idx, left_chunk, right_chunk in zip( + range(out_shape[0]), left_chunks, right_chunks + ): + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + [left_chunk, right_chunk], + index_value=left_chunk.index_value, + shape=(np.nan,), + index=(idx,), + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + df.shape, + nsplits=tuple(tuple(ns) for ns in nsplits), + chunks=out_chunks, + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + if len(in_df.chunks) == 1 and ( + not isinstance(op.value, ENTITY_TYPE) or len(op.value.chunks) == 1 + ): + return cls._tile_one_by_one(op) + elif op.method is not None: + if op.output_types[0] == OutputType.dataframe: + return cls._tile_directional_dataframe(op) + else: + return cls._tile_directional_series(op) + elif not isinstance(op.value, ENTITY_TYPE): + return 
cls._tile_one_by_one(op) + elif isinstance(op.value, DATAFRAME_TYPE): + return cls._tile_both_dataframes(op) + elif op.output_types[0] == OutputType.dataframe: + return cls._tile_dataframe_series(op) + else: + return cls._tile_both_series(op) + + def __call__(self, a, value_df=None): + method = getattr(self, "method", None) + if method == "backfill": + method = "bfill" + elif method == "pad": + method = "ffill" + self._method = method + axis = getattr(self, "axis", None) or 0 + self._axis = validate_axis(axis, a) + + inputs = [a] + if value_df is not None: + inputs.append(value_df) + if isinstance(a, DATAFRAME_TYPE): + return self.new_dataframe( + inputs, + shape=a.shape, + dtypes=a.dtypes, + index_value=a.index_value, + columns_value=a.columns_value, + ) + elif isinstance(a, SERIES_TYPE): + return self.new_series( + inputs, + shape=a.shape, + dtype=a.dtype, + index_value=a.index_value, + name=a.name, + ) + else: + return self.new_index( + inputs, + shape=a.shape, + dtype=a.dtype, + index_value=a.index_value, + name=a.name, + names=a.names, + ) + + +def fillna( + df, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None +): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use next valid observation to fill gap. + axis : {0 or 'index', 1 or 'columns'} + Axis along which to fill missing values. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + DataFrame or None + Object with missing values filled or None if ``inplace=True``. + + See Also + -------- + interpolate : Fill NaN values using interpolation. + reindex : Conform object to new index. + asfreq : Convert TimeSeries to specified frequency. + + Examples + -------- + >>> import mars.tensor as mt + >>> import mars.dataframe as md + >>> df = md.DataFrame([[mt.nan, 2, mt.nan, 0], + ... [3, 4, mt.nan, 1], + ... [mt.nan, mt.nan, mt.nan, 5], + ... [mt.nan, 3, mt.nan, 4]], + ... columns=list('ABCD')) + >>> df.execute() + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 NaN NaN NaN 5 + 3 NaN 3.0 NaN 4 + + Replace all NaN elements with 0s. + + >>> df.fillna(0).execute() + A B C D + 0 0.0 2.0 0.0 0 + 1 3.0 4.0 0.0 1 + 2 0.0 0.0 0.0 5 + 3 0.0 3.0 0.0 4 + + We can also propagate non-null values forward or backward. 
+ + >>> df.fillna(method='ffill').execute() + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 3.0 4.0 NaN 5 + 3 3.0 3.0 NaN 4 + + Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, + 2, and 3 respectively. + + >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3} + >>> df.fillna(value=values).execute() + A B C D + 0 0.0 2.0 2.0 0 + 1 3.0 4.0 2.0 1 + 2 0.0 1.0 2.0 5 + 3 0.0 3.0 2.0 4 + """ + if value is None and method is None: + raise ValueError("Must specify a fill 'value' or 'method'.") + elif value is not None and method is not None: + raise ValueError("Cannot specify both 'value' and 'method'.") + + if isinstance(df, SERIES_TYPE) and isinstance( + value, (DATAFRAME_TYPE, pd.DataFrame) + ): + raise ValueError( + '"value" parameter must be a scalar, dict or Series, but you passed a "%s"' + % type(value).__name__ + ) + + if downcast is not None: + raise NotImplementedError( + 'Currently argument "downcast" is not implemented yet' + ) + if limit is not None: + raise NotImplementedError('Currently argument "limit" is not implemented yet') + + if isinstance(value, ENTITY_TYPE): + value, value_df = None, value + else: + value_df = None + + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = FillNA( + value=value, + method=method, + axis=axis, + limit=limit, + downcast=downcast, + use_inf_as_na=use_inf_as_na, + output_types=get_output_types(df), + ) + out_df = op(df, value_df=value_df) + if inplace: + df.data = out_df.data + else: + return out_df + + +def ffill(df, axis=None, inplace=False, limit=None, downcast=None): + """ + Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + """ + return fillna( + df, method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) + + +def bfill(df, axis=None, inplace=False, limit=None, downcast=None): + """ + Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + + Returns + ------- + {klass} or None + Object with missing values filled or None if ``inplace=True``. + """ + return fillna( + df, method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) + + +def index_fillna(index, value=None, downcast=None): + """ + Fill NA/NaN values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + Index + + See Also + -------- + DataFrame.fillna : Fill NaN values of a DataFrame. + Series.fillna : Fill NaN Values of a Series. + """ + if isinstance(value, (list, pd.Series, SERIES_TYPE)): + raise ValueError("'value' must be a scalar, passed: %s" % type(value)) + + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = FillNA( + value=value, + downcast=downcast, + use_inf_as_na=use_inf_as_na, + output_types=get_output_types(index), + ) + return op(index) diff --git a/python/xorbits/_mars/dataframe/missing/replace.py b/python/xorbits/_mars/dataframe/missing/replace.py new file mode 100644 index 000000000..edac7bcbc --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/replace.py @@ -0,0 +1,637 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, FieldTypes, Int32Field, ListField +from ...utils import no_default +from ..operands import ( + SERIES_CHUNK_TYPE, + SERIES_TYPE, + DataFrameOperand, + DataFrameOperandMixin, +) +from ..utils import build_df, build_series, parse_index + + +class DataFrameReplace(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.REPLACE + + _to_replace = AnyField("to_replace") + _value = AnyField("value") + _limit = Int32Field("limit") + _regex = AnyField("regex") + _method = AnyField("method") + + _fill_chunks = ListField("fill_chunks", FieldTypes.key) + + def __init__( + self, + to_replace=None, + value=None, + limit=None, + regex=None, + method=None, + fill_chunks=None, + **kw + ): + super().__init__( + _to_replace=to_replace, + _value=value, + _limit=limit, + _regex=regex, + _method=method, + _fill_chunks=fill_chunks, + **kw + ) + + @property + def to_replace(self): + return self._to_replace + + @property + def value(self): + return self._value + + @property + def limit(self): + return self._limit + + @property + def regex(self): + return self._regex + + @property + def method(self): + return self._method + + @property + def fill_chunks(self): + return self._fill_chunks + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs) + next(input_iter) + if isinstance(self.to_replace, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + self._to_replace = next(input_iter) + if isinstance(self.value, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + self._value = next(input_iter) + self._fill_chunks = list(input_iter) + + def __call__(self, df_or_series): + inputs = [df_or_series] + mock_obj = ( + build_df(df_or_series) + if df_or_series.ndim == 2 + else build_series(df_or_series) + ) + + if isinstance(self.to_replace, SERIES_TYPE): + mock_to_replace = build_series(self.to_replace) + inputs.append(self.to_replace) + else: + mock_to_replace = self.to_replace + + if isinstance(self.value, SERIES_TYPE): + mock_value = build_series(self.value) + inputs.append(self.value) + else: + mock_value = self.value + + mock_result = mock_obj.replace( + mock_to_replace, mock_value, regex=self.regex, method=self.method + ) + + if df_or_series.ndim == 2: + return self.new_dataframe( + inputs, + shape=df_or_series.shape, + dtypes=mock_result.dtypes, + index_value=df_or_series.index_value, + columns_value=df_or_series.columns_value, + ) + else: + return self.new_series( + inputs, + shape=df_or_series.shape, + dtype=mock_result.dtype, + index_value=df_or_series.index_value, + ) + + @classmethod + def _build_result_chunk( + cls, op: "DataFrameReplace", in_chunks, with_fill=False, stage=None + ): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + in_chunk = in_chunks[0] + + kw = in_chunk.params + new_shape = list(in_chunk.shape) + if with_fill: + new_shape[0] = 1 + + if 
in_obj.ndim == 2: + new_dtypes = out_obj.dtypes[in_chunk.dtypes.index] + kw.update( + dict( + dtypes=new_dtypes, + shape=tuple(new_shape), + column_values=parse_index(new_dtypes.index), + ) + ) + else: + kw.update(dict(dtype=out_obj.dtype, shape=tuple(new_shape))) + + new_op = op.copy().reset_key() + new_op.stage = stage + return new_op.new_chunk(in_chunks, **kw) + + @classmethod + def tile(cls, op: "DataFrameReplace"): + in_obj = op.inputs[0] + out_obj = op.outputs[0] + + chunk_inputs_ex = [] + tileable_inputs_ex = [] + to_replace = op.to_replace + if isinstance(to_replace, SERIES_TYPE): + to_replace = yield from recursive_tile( + to_replace.rechunk((to_replace.shape[0],)) + ) + chunk_inputs_ex.append(to_replace.chunks[0]) + tileable_inputs_ex.append(to_replace) + value = op.value + if isinstance(value, SERIES_TYPE): + value = yield from recursive_tile(value.rechunk((value.shape[0],))) + chunk_inputs_ex.append(value.chunks[0]) + tileable_inputs_ex.append(value) + + # fill methods only available when `to_replace` is a scalar, list or tuple + # and `value` is no_default. + with_fill = ( + op.value is no_default + and not isinstance(op.to_replace, dict) + and op.method is not None + ) + + chunks = [] + if not with_fill: + for in_chunk in in_obj.chunks: + inputs = [in_chunk] + chunk_inputs_ex + chunks.append( + cls._build_result_chunk( + op, inputs, with_fill, OperandStage.map if with_fill else None + ) + ) + else: + map_array = np.empty(out_obj.shape, dtype=object) + for in_chunk in in_obj.chunks: + inputs = [in_chunk] + chunk_inputs_ex + map_array[in_chunk.index] = cls._build_result_chunk( + op, inputs, with_fill, OperandStage.map if with_fill else None + ) + + for in_chunk in in_obj.chunks: + if op.method in (no_default, "pad", "ffill"): + slc = slice(0, in_chunk.index[0]) + else: + slc = slice(in_chunk.index[0] + 1, in_obj.chunk_shape[0]) + + if in_chunk.ndim == 2: + append_chunks = list(map_array[slc, in_chunk.index[1]]) + else: + append_chunks = list(map_array[slc]) + + inputs = [in_chunk] + chunk_inputs_ex + append_chunks + chunks.append( + cls._build_result_chunk(op, inputs, False, OperandStage.combine) + ) + + inputs = [in_obj] + tileable_inputs_ex + new_op = op.copy().reset_key() + return new_op.new_tileables( + inputs, chunks=chunks, nsplits=in_obj.nsplits, **out_obj.params + ) + + @classmethod + def execute(cls, ctx, op: "DataFrameReplace"): + in_data = ctx[op.inputs[0].key] + to_replace = op.to_replace + if isinstance(to_replace, SERIES_CHUNK_TYPE): + to_replace = ctx[to_replace.key] + value = op.value + if isinstance(value, SERIES_CHUNK_TYPE): + value = ctx[value.key] + + if not op.fill_chunks: + concat_data = in_data + else: + to_concat = [ctx[c.key] for c in op.fill_chunks] + if op.method in (no_default, "pad", "ffill"): + to_concat += [in_data] + else: + to_concat = [in_data] + to_concat + concat_data = pd.concat(to_concat) + + replace_args = (to_replace,) + if value is not no_default: + replace_args += (value,) + replace_kwargs = dict(regex=op.regex, method=op.method, limit=op.limit) + replace_kwargs = { + k: v for k, v in replace_kwargs.items() if v is not no_default + } + + result = concat_data.replace(*replace_args, **replace_kwargs) + del concat_data + + if op.stage == OperandStage.map: + to_slice = op.outputs[0].shape[0] + if op.method in (no_default, "pad", "ffill"): + result = result.iloc[-to_slice:] + else: + result = result.iloc[:to_slice] + else: + to_remove = len(result) - len(in_data) + if to_remove > 0: + if op.method in (no_default, "pad", "ffill"): + result = 
result.iloc[to_remove:] + else: + result = result.iloc[:-to_remove] + ctx[op.outputs[0].key] = result + + +_fun_doc = """ +Replace values given in `to_replace` with `value`. + +Values of the #obj_type# are replaced with other values dynamically. +This differs from updating with ``.loc`` or ``.iloc``, which require +you to specify a location to update with some value. + +Parameters +---------- +to_replace : str, regex, list, dict, Series, int, float, or None + How to find the values that will be replaced. + + * numeric, str or regex: + + - numeric: numeric values equal to `to_replace` will be + replaced with `value` + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + + * list of str, regex, or numeric: + + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. + - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str, regex and numeric rules apply as above. + + * dict: + + - Dicts can be used to specify different replacement values + for different existing values. For example, + ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and + 'y' with 'z'. To use a dict in this way the `value` + parameter should be `None`. + - For a DataFrame a dict can specify that different values + should be replaced in different columns. For example, + ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a' + and the value 'z' in column 'b' and replaces these values + with whatever is specified in `value`. The `value` parameter + should not be ``None`` in this case. You can treat this as a + special case of passing two lists except that you are + specifying the column to search in. + - For a DataFrame nested dictionaries, e.g., + ``{'a': {'b': np.nan}}``, are read as follows: look in column + 'a' for the value 'b' and replace it with NaN. The `value` + parameter should be ``None`` to use a nested dict in this + way. You can nest regular expressions as well. Note that + column names (the top-level dictionary keys in a nested + dictionary) **cannot** be regular expressions. + + * None: + + - This means that the `regex` argument must be a string, + compiled regular expression, or list, dict, ndarray or + Series of such elements. If `value` is also ``None`` then + this **must** be a nested dictionary or Series. + + See the examples section for examples of each of these. +value : scalar, dict, list, str, regex, default None + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. +inplace : bool, default False + If True, in place. Note: this will modify any + other views on this object (e.g. a column from a DataFrame). + Returns the caller if this is True. +limit : int, default None + Maximum size gap to forward or backward fill. +regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. 
Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + `to_replace` must be ``None``. +method : {'pad', 'ffill', 'bfill', `None`} + The method to use when for replacement, when `to_replace` is a + scalar, list or tuple and `value` is ``None``. + +Returns +------- +#obj_type# + Object after replacement. + +Raises +------ +AssertionError + * If `regex` is not a ``bool`` and `to_replace` is not + ``None``. +TypeError + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. + * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to `to_replace` does not match the type of the + value being replaced +ValueError + * If a ``list`` or an ``ndarray`` is passed to `to_replace` and + `value` but they are not the same length. + +See Also +-------- +#obj_type#.fillna : Fill NA values. +#obj_type#.where : Replace values based on boolean condition. +Series.str.replace : Simple string replacement. + +Notes +----- +* Regex substitution is performed under the hood with ``re.sub``. The + rules for substitution for ``re.sub`` are the same. +* Regular expressions will only substitute on strings, meaning you + cannot provide, for example, a regular expression matching floating + point numbers and expect the columns in your frame that have a + numeric dtype to be matched. However, if those floating point + numbers *are* strings, then you can do this. +* This method has *a lot* of options. You are encouraged to experiment + and play with this method to gain intuition about how it works. +* When dict is used as the `to_replace` value, it is like + key(s) in the dict are the to_replace part and + value(s) in the dict are the value parameter. + +Examples +-------- + +**Scalar `to_replace` and `value`** + +>>> import mars.tensor as mt +>>> import mars.dataframe as md +>>> s = md.Series([0, 1, 2, 3, 4]) +>>> s.replace(0, 5).execute() +0 5 +1 1 +2 2 +3 3 +4 4 +dtype: int64 + +>>> df = md.DataFrame({'A': [0, 1, 2, 3, 4], +... 'B': [5, 6, 7, 8, 9], +... 'C': ['a', 'b', 'c', 'd', 'e']}) +>>> df.replace(0, 5).execute() + A B C +0 5 5 a +1 1 6 b +2 2 7 c +3 3 8 d +4 4 9 e + +**List-like `to_replace`** + +>>> df.replace([0, 1, 2, 3], 4).execute() + A B C +0 4 5 a +1 4 6 b +2 4 7 c +3 4 8 d +4 4 9 e + +>>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]).execute() + A B C +0 4 5 a +1 3 6 b +2 2 7 c +3 1 8 d +4 4 9 e + +>>> s.replace([1, 2], method='bfill').execute() +0 0 +1 3 +2 3 +3 3 +4 4 +dtype: int64 + +**dict-like `to_replace`** + +>>> df.replace({0: 10, 1: 100}).execute() + A B C +0 10 5 a +1 100 6 b +2 2 7 c +3 3 8 d +4 4 9 e + +>>> df.replace({'A': 0, 'B': 5}, 100).execute() + A B C +0 100 100 a +1 1 6 b +2 2 7 c +3 3 8 d +4 4 9 e + +>>> df.replace({'A': {0: 100, 4: 400}}).execute() + A B C +0 100 5 a +1 1 6 b +2 2 7 c +3 3 8 d +4 400 9 e + +**Regular expression `to_replace`** + +>>> df = md.DataFrame({'A': ['bat', 'foo', 'bait'], +... 
'B': ['abc', 'bar', 'xyz']}) +>>> df.replace(to_replace=r'^ba.$', value='new', regex=True).execute() + A B +0 new abc +1 foo new +2 bait xyz + +>>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True).execute() + A B +0 new abc +1 foo bar +2 bait xyz + +>>> df.replace(regex=r'^ba.$', value='new').execute() + A B +0 new abc +1 foo new +2 bait xyz + +>>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}).execute() + A B +0 new abc +1 xyz new +2 bait xyz + +>>> df.replace(regex=[r'^ba.$', 'foo'], value='new').execute() + A B +0 new abc +1 new new +2 bait xyz + +Note that when replacing multiple ``bool`` or ``datetime64`` objects, +the data types in the `to_replace` parameter must match the data +type of the value being replaced: + +>>> df = md.DataFrame({'A': [True, False, True], +... 'B': [False, True, False]}) +>>> df.replace({'a string': 'new value', True: False}) # raises.execute() +Traceback (most recent call last): + ....execute() +TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' + +This raises a ``TypeError`` because one of the ``dict`` keys is not of +the correct type for replacement. + +Compare the behavior of ``s.replace({'a': None})`` and +``s.replace('a', None)`` to understand the peculiarities +of the `to_replace` parameter: + +>>> s = md.Series([10, 'a', 'a', 'b', 'a']) + +When one uses a dict as the `to_replace` value, it is like the +value(s) in the dict are equal to the `value` parameter. +``s.replace({'a': None})`` is equivalent to +``s.replace(to_replace={'a': None}, value=None, method=None)``: + +>>> s.replace({'a': None}).execute() +0 10 +1 None +2 None +3 b +4 None +dtype: object + +When ``value=None`` and `to_replace` is a scalar, list or +tuple, `replace` uses the method parameter (default 'pad') to do the +replacement. So this is why the 'a' values are being replaced by 10 +in rows 1 and 2 and 'b' in row 4 in this case. 
+The command ``s.replace('a', None)`` is actually equivalent to +``s.replace(to_replace='a', value=None, method='pad')``: + +>>> s.replace('a', None).execute() +0 10 +1 10 +2 10 +3 b +4 b +dtype: object +""" + + +def _replace( + df_or_series, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method=no_default, +): + if not isinstance(to_replace, dict) and value is no_default and limit is not None: + raise NotImplementedError("fill with limit not supported when value is None") + + if not isinstance(regex, bool): + to_replace = regex + regex = True + op = DataFrameReplace( + to_replace=to_replace, value=value, limit=limit, regex=regex, method=method + ) + ret = op(df_or_series) + if inplace: + df_or_series.data = ret.data + else: + return ret + + +def df_replace( + df, + to_replace=no_default, + value=no_default, + inplace=False, + limit=None, + regex=False, + method=no_default, +): + return _replace( + df, + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + +def series_replace( + series, + to_replace=no_default, + value=no_default, + inplace=False, + limit=None, + regex=False, + method=no_default, +): + return _replace( + series, + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + +df_replace.__doc__ = _fun_doc.replace("#obj_type#", "DataFrame") +series_replace.__doc__ = _fun_doc.replace("#obj_type#", "Series") diff --git a/python/xorbits/_mars/dataframe/missing/tests/__init__.py b/python/xorbits/_mars/dataframe/missing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/missing/tests/test_missing.py b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py new file mode 100644 index 000000000..0c4806192 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py @@ -0,0 +1,438 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....core import tile +from ....core.operand import OperandStage +from ....utils import pd_release_version + +_drop_na_enable_no_default = pd_release_version[:2] >= (1, 5) + + +def test_fill_na(): + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(20): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + value_df_raw = pd.DataFrame( + np.random.randint(0, 100, (10, 7)).astype(np.float32), columns=list("ABCDEFG") + ) + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(3): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + value_series_raw = pd.Series( + np.random.randint(0, 100, (10,)).astype(np.float32), index=list("ABCDEFGHIJ") + ) + + df = md.DataFrame(df_raw) + series = md.Series(series_raw) + + # when nothing supplied, raise + with pytest.raises(ValueError): + df.fillna() + # when both values and methods supplied, raises + with pytest.raises(ValueError): + df.fillna(value=1, method="ffill") + # when call on series, cannot supply DataFrames + with pytest.raises(ValueError): + series.fillna(value=df) + with pytest.raises(ValueError): + series.fillna(value=df_raw) + with pytest.raises(NotImplementedError): + series.fillna(value=series_raw, downcast="infer") + with pytest.raises(NotImplementedError): + series.ffill(limit=1) + + df2 = tile(df.fillna(value_series_raw)) + assert len(df2.chunks) == 1 + assert df2.chunks[0].shape == df2.shape + assert df2.chunks[0].op.stage is None + + series2 = tile(series.fillna(value_series_raw)) + assert len(series2.chunks) == 1 + assert series2.chunks[0].shape == series2.shape + assert series2.chunks[0].op.stage is None + + df = md.DataFrame(df_raw, chunk_size=5) + df2 = tile(df.fillna(value_series_raw)) + assert len(df2.chunks) == 8 + assert df2.chunks[0].shape == (5, 5) + assert df2.chunks[0].op.stage is None + + series = md.Series(series_raw, chunk_size=5) + series2 = tile(series.fillna(value_series_raw)) + assert len(series2.chunks) == 4 + assert series2.chunks[0].shape == (5,) + assert series2.chunks[0].op.stage is None + + df2 = tile(df.ffill(axis="columns")) + assert len(df2.chunks) == 8 + assert df2.chunks[0].shape == (5, 5) + assert df2.chunks[0].op.axis == 1 + assert df2.chunks[0].op.stage == OperandStage.combine + assert df2.chunks[0].op.method == "ffill" + assert df2.chunks[0].op.limit is None + + series2 = tile(series.bfill()) + assert len(series2.chunks) == 4 + assert series2.chunks[0].shape == (5,) + assert series2.chunks[0].op.stage == OperandStage.combine + assert series2.chunks[0].op.method == "bfill" + assert series2.chunks[0].op.limit is None + + value_df = md.DataFrame(value_df_raw, chunk_size=7) + value_series = md.Series(value_series_raw, chunk_size=7) + + df2 = tile(df.fillna(value_df)) + assert df2.shape == df.shape + assert df2.chunks[0].op.stage is None + + df2 = tile(df.fillna(value_series)) + assert df2.shape == df.shape + assert df2.chunks[0].op.stage is None + + value_series_raw.index = list(range(10)) + value_series = md.Series(value_series_raw) + series2 = tile(series.fillna(value_series)) + assert series2.shape == series.shape + assert series2.chunks[0].op.stage is None + + +def test_drop_na(): + # dataframe cases + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 
10): + df_raw.iloc[row, idx] = random.randint(0, 99) + + # not supporting drop with axis=1 + with pytest.raises(NotImplementedError): + md.DataFrame(df_raw).dropna(axis=1) + + if _drop_na_enable_no_default: + with pytest.raises(TypeError): + md.DataFrame(df_raw).dropna(how="any", thresh=0) + + # only one chunk in columns, can run dropna directly + r = tile(md.DataFrame(df_raw, chunk_size=(4, 10)).dropna()) + assert r.shape == (np.nan, 10) + assert r.nsplits == ((np.nan,) * 5, (10,)) + for c in r.chunks: + assert isinstance(c.op, type(r.op)) + assert len(c.inputs) == 1 + assert len(c.inputs[0].inputs) == 0 + assert c.shape == (np.nan, 10) + + # multiple chunks in columns, count() will be called first + r = tile(md.DataFrame(df_raw, chunk_size=4).dropna()) + assert r.shape == (np.nan, 10) + assert r.nsplits == ((np.nan,) * 5, (4, 4, 2)) + for c in r.chunks: + assert isinstance(c.op, type(r.op)) + assert len(c.inputs) == 2 + assert len(c.inputs[0].inputs) == 0 + assert c.inputs[1].op.stage == OperandStage.agg + assert np.isnan(c.shape[0]) + + # series cases + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + + r = tile(md.Series(series_raw, chunk_size=4).dropna()) + assert r.shape == (np.nan,) + assert r.nsplits == ((np.nan,) * 5,) + for c in r.chunks: + assert isinstance(c.op, type(r.op)) + assert len(c.inputs) == 1 + assert len(c.inputs[0].inputs) == 0 + assert c.shape == (np.nan,) + + +def test_replace(): + # dataframe cases + df_raw = pd.DataFrame(-1, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 10): + df_raw.iloc[row, idx] = random.randint(0, 99) + + # not supporting fill with limit + df = md.DataFrame(df_raw, chunk_size=4) + with pytest.raises(NotImplementedError): + df.replace(-1, method="ffill", limit=5) + + r = tile(df.replace(-1, method="ffill")) + assert len(r.chunks) == 15 + assert r.chunks[0].shape == (4, 4) + assert r.chunks[0].op.stage == OperandStage.combine + assert r.chunks[0].op.method == "ffill" + assert r.chunks[0].op.limit is None + assert r.chunks[-1].inputs[-1].shape == (1, 2) + assert r.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert r.chunks[-1].inputs[-1].op.method == "ffill" + assert r.chunks[-1].inputs[-1].op.limit is None + + r = tile(df.replace(-1, 99)) + assert len(r.chunks) == 15 + assert r.chunks[0].shape == (4, 4) + assert r.chunks[0].op.stage is None + assert r.chunks[0].op.limit is None + + # series cases + series_raw = pd.Series(-1, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + series = md.Series(series_raw, chunk_size=4) + + r = tile(series.replace(-1, method="ffill")) + assert len(r.chunks) == 5 + assert r.chunks[0].shape == (4,) + assert r.chunks[0].op.stage == OperandStage.combine + assert r.chunks[0].op.method == "ffill" + assert r.chunks[0].op.limit is None + assert r.chunks[-1].inputs[-1].shape == (1,) + assert r.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert r.chunks[-1].inputs[-1].op.method == "ffill" + assert r.chunks[-1].inputs[-1].op.limit is None + + r = tile(series.replace(-1, 99)) + assert len(r.chunks) == 5 + assert r.chunks[0].shape == (4,) + assert r.chunks[0].op.stage is None + assert r.chunks[0].op.limit is None + + +@pytest.mark.parametrize("inf_as_na", [True, 
False]) +def test_isna(setup, inf_as_na): + from ....config import options + from ..checkna import isna + + old_mars_inf_as_na = options.dataframe.mode.use_inf_as_na + options.dataframe.mode.use_inf_as_na = inf_as_na + # this option could be changed by mars execution. + old_pd_inf_as_na = pd.get_option("mode.use_inf_as_na") + pd.options.mode.use_inf_as_na = inf_as_na + + # scalars + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + assert isna("dog") == pd.isna("dog") + assert isna(None) == pd.isna(None) + assert isna(md.NA) == pd.isna(pd.NA) + assert isna(md.NaT) == pd.isna(pd.NaT) + assert isna(mt.NaN) == pd.isna(np.NaN) + assert isna(type) == pd.isna(type) + + # multi index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + with pytest.raises(NotImplementedError): + midx = md.MultiIndex() + isna(midx) + + # list + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + l = [1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT] + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + actual = isna(l).execute().fetch() + expected = pd.isna(l) + np.testing.assert_array_equal(expected, actual) + + # tuple + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + t = (1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT) + assert not isna(t) + + # numpy ndarray + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + narr = np.array((1, 2, 3, np.Inf, np.NaN)) + actual = isna(narr).execute().fetch() + expected = pd.isna(narr) + np.testing.assert_array_equal(expected, actual) + + # pandas index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pi = pd.Index((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = isna(pi).execute().fetch() + expected = pd.isna(pi) + np.testing.assert_array_equal(expected, actual) + + # pandas series + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ps = pd.Series((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = isna(ps).execute().fetch() + expected = pd.isna(ps) + pd.testing.assert_series_equal(expected, actual) + + # pandas dataframe + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pdf = pd.DataFrame( + {"foo": (1, 2, 3, np.Inf, pd.NA), "bar": (4, 5, 6, np.NaN, pd.NaT)} + ) + actual = isna(pdf).execute().fetch() + expected = pd.isna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + # mars tensor + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + marr = mt.tensor(narr) + actual = isna(marr).execute().fetch() + expected = pd.isna(narr) + np.testing.assert_array_equal(expected, actual) + + # mars index + from ...datasource.index import from_pandas as from_pandas_index + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mi = from_pandas_index(pi) + actual = isna(mi).execute().fetch() + expected = pd.isna(pi) + np.testing.assert_array_equal(expected, actual) + + # mars series + from ...datasource.series import from_pandas as from_pandas_series + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ms = from_pandas_series(ps) + actual = isna(ms).execute().fetch() + expected = pd.isna(ps) + pd.testing.assert_series_equal(expected, actual) + + # mars dataframe + from ...datasource.dataframe import from_pandas as from_pandas_df + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mdf = from_pandas_df(pdf) + actual = isna(mdf).execute().fetch() + expected = pd.isna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + options.dataframe.mode.use_inf_as_na = old_mars_inf_as_na + pd.options.mode.use_inf_as_na = old_pd_inf_as_na + + +@pytest.mark.parametrize("inf_as_na", [True, False]) +def 
test_notna(setup, inf_as_na): + from ....config import options + from ..checkna import notna + + old_mars_inf_as_na = options.dataframe.mode.use_inf_as_na + options.dataframe.mode.use_inf_as_na = inf_as_na + # this option could be changed by mars execution. + old_pd_inf_as_na = pd.get_option("mode.use_inf_as_na") + pd.options.mode.use_inf_as_na = inf_as_na + + # scalars + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + assert notna("dog") == pd.notna("dog") + assert notna(None) == pd.notna(None) + assert notna(md.NA) == pd.notna(pd.NA) + assert notna(md.NaT) == pd.notna(pd.NaT) + assert notna(mt.NaN) == pd.notna(np.NaN) + assert notna(type) == pd.notna(type) + + # multi index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + with pytest.raises(NotImplementedError): + midx = md.MultiIndex() + notna(midx) + + # list + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + l = [1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT] + actual = notna(l).execute().fetch() + expected = pd.notna(l) + np.testing.assert_array_equal(expected, actual) + + # tuple + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + t = (1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT) + assert notna(t) + + # numpy ndarray + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + narr = np.array((1, 2, 3, np.Inf, np.NaN)) + actual = notna(narr).execute().fetch() + expected = pd.notna(narr) + np.testing.assert_array_equal(expected, actual) + + # pandas index + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pi = pd.Index((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = notna(pi).execute().fetch() + expected = pd.notna(pi) + np.testing.assert_array_equal(expected, actual) + + # pandas series + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ps = pd.Series((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) + actual = notna(ps).execute().fetch() + expected = pd.notna(ps) + pd.testing.assert_series_equal(expected, actual) + + # pandas dataframe + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + pdf = pd.DataFrame( + {"foo": (1, 2, 3, np.Inf, pd.NA), "bar": (4, 5, 6, np.NaN, pd.NaT)} + ) + actual = notna(pdf).execute().fetch() + expected = pd.notna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + # mars tensor + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + marr = mt.tensor(narr) + actual = notna(marr).execute().fetch() + expected = pd.notna(narr) + np.testing.assert_array_equal(expected, actual) + + # mars index + from ...datasource.index import from_pandas as from_pandas_index + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mi = from_pandas_index(pi) + actual = notna(mi).execute().fetch() + expected = pd.notna(pi) + np.testing.assert_array_equal(expected, actual) + + # mars series + from ...datasource.series import from_pandas as from_pandas_series + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + ms = from_pandas_series(ps) + actual = notna(ms).execute().fetch() + expected = pd.notna(ps) + pd.testing.assert_series_equal(expected, actual) + + # mars dataframe + from ...datasource.dataframe import from_pandas as from_pandas_df + + assert pd.get_option("mode.use_inf_as_na") == inf_as_na + mdf = from_pandas_df(pdf) + actual = notna(mdf).execute().fetch() + expected = pd.notna(pdf) + pd.testing.assert_frame_equal(expected, actual) + + options.dataframe.mode.use_inf_as_na = old_mars_inf_as_na + pd.options.mode.use_inf_as_na = old_pd_inf_as_na diff --git a/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py 
b/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py new file mode 100644 index 000000000..cc19fb284 --- /dev/null +++ b/python/xorbits/_mars/dataframe/missing/tests/test_missing_execution.py @@ -0,0 +1,333 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import re +import string + +import numpy as np +import pandas as pd + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .... import dataframe as md + + +def test_check_na_execution(setup): + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(20): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + + df = md.DataFrame(df_raw, chunk_size=4) + + pd.testing.assert_frame_equal(df.isna().execute().fetch(), df_raw.isna()) + pd.testing.assert_frame_equal(df.notna().execute().fetch(), df_raw.notna()) + + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(3): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + + series = md.Series(series_raw, chunk_size=4) + + pd.testing.assert_series_equal(series.isna().execute().fetch(), series_raw.isna()) + pd.testing.assert_series_equal(series.notna().execute().fetch(), series_raw.notna()) + + idx_data = np.array([np.nan] * 20) + for _ in range(3): + idx_data[random.randint(0, 19)] = random.randint(0, 99) + idx_raw = pd.Index(idx_data) + + idx = md.Index(idx_raw, chunk_size=4) + + np.testing.assert_array_equal(idx.isna().execute().fetch(), idx_raw.isna()) + np.testing.assert_array_equal(idx.notna().execute().fetch(), idx_raw.notna()) + + +def test_dataframe_fill_na_execution(setup): + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(20): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + value_df_raw = pd.DataFrame( + np.random.randint(0, 100, (10, 7)).astype(np.float32), columns=list("ABCDEFG") + ) + df = md.DataFrame(df_raw) + + # test DataFrame single chunk with numeric fill + r = df.fillna(1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(1)) + + # test DataFrame single chunk with value as single chunk + value_df = md.DataFrame(value_df_raw) + r = df.fillna(value_df) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(value_df_raw)) + + df = md.DataFrame(df_raw, chunk_size=3) + + # test chunked with numeric fill + r = df.fillna(1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(1)) + + # test forward fill in axis=0 without limit + r = df.fillna(method="pad") + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="pad")) + + # test backward fill in axis=0 without limit + r = df.fillna(method="backfill") + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="backfill")) + + # test forward fill in axis=1 without limit + r = df.ffill(axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.ffill(axis=1)) + + # test 
backward fill in axis=1 without limit + r = df.bfill(axis=1) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.bfill(axis=1)) + + # test fill with dataframe + value_df = md.DataFrame(value_df_raw, chunk_size=4) + r = df.fillna(value_df) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(value_df_raw)) + + # test fill with series + value_series_raw = pd.Series( + np.random.randint(0, 100, (10,)).astype(np.float32), index=list("ABCDEFGHIJ") + ) + value_series = md.Series(value_series_raw, chunk_size=4) + r = df.fillna(value_series) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(value_series_raw)) + + # test inplace tile + df.fillna(1, inplace=True) + pd.testing.assert_frame_equal(df.execute().fetch(), df_raw.fillna(1)) + + +def test_series_fill_na_execution(setup): + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(3): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + value_series_raw = pd.Series(np.random.randint(0, 100, (10,)).astype(np.float32)) + + # test single chunk + series = md.Series(series_raw) + + r = series.fillna(1) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.fillna(1)) + + # test single chunk with value as single chunk + value_series = md.Series(value_series_raw) + r = series.fillna(value_series) + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.fillna(value_series_raw) + ) + + series = md.Series(series_raw, chunk_size=3) + + # test chunked with numeric fill + r = series.fillna(1) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.fillna(1)) + + # test forward fill in axis=0 without limit + r = series.fillna(method="pad") + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.fillna(method="pad")) + + # test backward fill in axis=0 without limit + r = series.fillna(method="backfill") + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.fillna(method="backfill") + ) + + # test fill with series + value_df = md.Series(value_series_raw, chunk_size=4) + r = series.fillna(value_df) + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.fillna(value_series_raw) + ) + + # test inplace tile + series.fillna(1, inplace=True) + pd.testing.assert_series_equal(series.execute().fetch(), series_raw.fillna(1)) + + +def test_index_fill_na_execution(setup): + idx_data = np.array([np.nan] * 20) + for _ in range(10): + idx_data[random.randint(0, 19)] = random.randint(0, 99) + idx_raw = pd.Index(idx_data) + + # test single chunk + idx = md.Index(idx_raw) + + r = idx.fillna(1) + pd.testing.assert_index_equal(r.execute().fetch(), idx_raw.fillna(1)) + + idx = md.Index(idx_raw, chunk_size=3) + + # test chunked with numeric fill + r = idx.fillna(1) + pd.testing.assert_index_equal(r.execute().fetch(), idx_raw.fillna(1)) + + +def test_drop_na_execution(setup): + # dataframe cases + df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 10): + df_raw.iloc[row, idx] = random.randint(0, 99) + + # only one chunk in columns, can run dropna directly + r = md.DataFrame(df_raw, chunk_size=(4, 10)).dropna() + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna()) + + # multiple chunks in columns, count() will be called first + r = md.DataFrame(df_raw, chunk_size=4).dropna() + 
pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna()) + + r = md.DataFrame(df_raw, chunk_size=4).dropna(how="all") + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna(how="all")) + + r = md.DataFrame(df_raw, chunk_size=4).dropna(subset=list("ABFI")) + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.dropna(subset=list("ABFI")) + ) + + r = md.DataFrame(df_raw, chunk_size=4).dropna(how="all", subset=list("BDHJ")) + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.dropna(how="all", subset=list("BDHJ")) + ) + + r = md.DataFrame(df_raw, chunk_size=4) + r.dropna(how="all", inplace=True) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.dropna(how="all")) + + # series cases + series_raw = pd.Series(np.nan, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + + r = md.Series(series_raw, chunk_size=4).dropna() + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.dropna()) + + r = md.Series(series_raw, chunk_size=4) + r.dropna(inplace=True) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.dropna()) + + # index cases + idx_data = np.array([np.nan] * 20) + for _ in range(10): + idx_data[random.randint(0, 19)] = random.randint(0, 99) + idx_raw = pd.Index(idx_data) + + r = md.Index(idx_raw, chunk_size=4).dropna() + pd.testing.assert_index_equal(r.execute().fetch(), idx_raw.dropna()) + + +def test_replace_execution(setup): + # dataframe cases + df_raw = pd.DataFrame(-1, index=range(0, 20), columns=list("ABCDEFGHIJ")) + for _ in range(30): + df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) + for rowid in range(random.randint(1, 5)): + row = random.randint(0, 19) + for idx in range(0, 10): + df_raw.iloc[row, idx] = random.randint(0, 99) + df = md.DataFrame(df_raw, chunk_size=4) + + r = df.replace(-1, method="ffill") + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.replace(-1, method="ffill") + ) + + r = df.replace(-1, method="bfill") + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.replace(-1, method="bfill") + ) + + r = df.replace(-1, 999) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.replace(-1, 999)) + + if pd.__version__ >= "1.4.4": + r = df.replace({-1: 999}) + pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.replace({-1: 999})) + + raw_to_replace = pd.Series([-1, 1, 2]) + to_replace_series = md.Series(raw_to_replace) + raw_value = pd.Series([2, 3, -1]) + value_series = md.Series(raw_value) + r = df.replace(to_replace_series, value_series) + pd.testing.assert_frame_equal( + r.execute().fetch(), df_raw.replace(raw_to_replace, raw_value) + ) + + df.replace({"A": -1}, {"A": 9}, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), df_raw.replace({"A": -1}, {"A": 9}) + ) + + if pd.__version__ >= "1.4.4": + df.replace({"A": {-1: 9}}, inplace=True) + pd.testing.assert_frame_equal( + df.execute().fetch(), df_raw.replace({"A": {-1: 9}}) + ) + + # series cases + series_raw = pd.Series(-1, index=range(20)) + for _ in range(10): + series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) + series = md.Series(series_raw, chunk_size=4) + + if pd.__version__ >= "1.4.4": + r = series.replace(-1) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.replace(-1)) + + r = series.replace(-1, method="ffill") + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.replace(-1, method="ffill") + ) + + r = series.replace(-1, method="bfill") + 
pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.replace(-1, method="bfill") + ) + + r = series.replace(-1, 999) + pd.testing.assert_series_equal(r.execute().fetch(), series_raw.replace(-1, 999)) + + # str series cases + tmpl_chars = list(string.ascii_letters + string.digits) + random.shuffle(tmpl_chars) + + def _rand_slice(): + lb = random.randint(0, len(tmpl_chars) - 1) + rb = random.randint(lb, len(tmpl_chars) - 1) + return "".join(tmpl_chars[lb : rb + 1]) + + series_raw = pd.Series([_rand_slice() for _ in range(20)]) + series = md.Series(series_raw, chunk_size=4) + + regs = [ + re.compile(r".A.", flags=re.IGNORECASE), + re.compile(r".B.", flags=re.IGNORECASE), + re.compile(r".C.", flags=re.IGNORECASE), + re.compile(r".D.", flags=re.IGNORECASE), + ] + r = series.replace(regex=regs, value="new") + pd.testing.assert_series_equal( + r.execute().fetch(), series_raw.replace(regex=regs, value="new") + ) diff --git a/python/xorbits/_mars/dataframe/operands.py b/python/xorbits/_mars/dataframe/operands.py new file mode 100644 index 000000000..760a8be44 --- /dev/null +++ b/python/xorbits/_mars/dataframe/operands.py @@ -0,0 +1,482 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from functools import reduce + +import numpy as np +import pandas as pd + +from ..core import ENTITY_TYPE, FuseChunk, FuseChunkData, OutputType +from ..core.operand import ( + Fuse, + FuseChunkMixin, + Operand, + ShuffleProxy, + TileableOperandMixin, +) +from ..tensor.core import TENSOR_TYPE +from ..tensor.datasource import tensor as astensor +from ..tensor.operands import TensorOperandMixin +from ..utils import calc_nsplits +from .core import ( + CATEGORICAL_TYPE, + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_TYPE, + DATAFRAME_TYPE, + INDEX_CHUNK_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_GROUPBY_TYPE, + SERIES_TYPE, +) +from .utils import parse_index + + +class DataFrameOperandMixin(TileableOperandMixin): + __slots__ = () + _op_module_ = "dataframe" + + def new_dataframes( + self, + inputs, + shape=None, + dtypes=None, + index_value=None, + columns_value=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.dataframe]) + return self.new_tileables( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_dataframe( + self, + inputs, + shape=None, + dtypes=None, + index_value=None, + columns_value=None, + **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new DataFrame with more than 1 outputs") + + return self.new_dataframes( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + **kw + )[0] + + def new_seriess( + self, + inputs, + shape=None, + dtype=None, + index_value=None, + name=None, + chunks=None, + nsplits=None, + output_limit=None, + 
kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.series]) + return self.new_tileables( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=name, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_series( + self, inputs, shape=None, dtype=None, index_value=None, name=None, **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new Series with more than 1 outputs") + + return self.new_seriess( + inputs, shape=shape, dtype=dtype, index_value=index_value, name=name, **kw + )[0] + + def new_df_or_series(self, inputs, **kw): + setattr(self, "_output_types", [OutputType.df_or_series]) + return self.new_tileables(inputs, **kw)[0] + + def new_indexes( + self, + inputs, + shape=None, + dtype=None, + index_value=None, + name=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.index]) + return self.new_tileables( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=name, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_index( + self, inputs, shape=None, dtype=None, index_value=None, name=None, **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new Index with more than 1 outputs") + + return self.new_indexes( + inputs, shape=shape, dtype=dtype, index_value=index_value, name=name, **kw + )[0] + + def new_scalars( + self, inputs, dtype=None, chunks=None, output_limit=None, kws=None, **kw + ): + setattr(self, "_output_types", [OutputType.scalar]) + return self.new_tileables( + inputs, + shape=(), + dtype=dtype, + chunks=chunks, + nsplits=(), + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_scalar(self, inputs, dtype=None, **kw): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new tensor with more than 1 outputs") + + return self.new_scalars(inputs, dtype=dtype, **kw)[0] + + def new_categoricals( + self, + inputs, + shape=None, + dtype=None, + categories_value=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + setattr(self, "_output_types", [OutputType.categorical]) + return self.new_tileables( + inputs, + shape=shape, + dtype=dtype, + categories_value=categories_value, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + **kw + ) + + def new_categorical( + self, inputs, shape=None, dtype=None, categories_value=None, **kw + ): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new Categorical with more than 1 outputs") + + return self.new_categoricals( + inputs, shape=shape, dtype=dtype, categories_value=categories_value, **kw + )[0] + + @classmethod + def _process_groupby_params(cls, groupby_params): + new_groupby_params = groupby_params.copy() + if isinstance(groupby_params["by"], list): + by = [] + for v in groupby_params["by"]: + if isinstance(v, ENTITY_TYPE): + by.append(cls.concat_tileable_chunks(v).chunks[0]) + else: + by.append(v) + new_groupby_params["by"] = by + return new_groupby_params + + @classmethod + def _get_groupby_inputs(cls, groupby, groupby_params): + inputs = [groupby] + chunk_inputs = list(groupby.chunks) + if isinstance(groupby_params["by"], list): + for chunk_v, v in zip( + groupby_params["by"], groupby.op.groupby_params["by"] + ): + if isinstance(v, ENTITY_TYPE): + inputs.append(v) + chunk_inputs.append(chunk_v) + return inputs, chunk_inputs + + @classmethod + def concat_tileable_chunks(cls, 
tileable): + from .merge.concat import DataFrameConcat, GroupByConcat + + df = tileable + assert not df.is_coarse() + + if isinstance(df, DATAFRAME_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.dataframe]).new_chunk( + df.chunks, + shape=df.shape, + index=(0, 0), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + return DataFrameConcat(output_types=[OutputType.dataframe]).new_dataframe( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + elif isinstance(df, SERIES_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.series]).new_chunk( + df.chunks, + shape=df.shape, + index=(0,), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + return DataFrameConcat(output_types=[OutputType.series]).new_series( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + elif isinstance(df, INDEX_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.index]).new_chunk( + df.chunks, + shape=df.shape, + index=(0,), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + return DataFrameConcat(output_types=[OutputType.index]).new_index( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtype=df.dtype, + index_value=df.index_value, + name=df.name, + ) + elif isinstance(df, (DATAFRAME_GROUPBY_TYPE, SERIES_GROUPBY_TYPE)): + output_type = ( + OutputType.dataframe_groupby + if isinstance(df, DATAFRAME_GROUPBY_TYPE) + else OutputType.series_groupby + ) + groupby_params = cls._process_groupby_params(df.op.groupby_params) + inputs, chunk_inputs = cls._get_groupby_inputs(df, groupby_params) + chunk = GroupByConcat( + groups=df.chunks, + groupby_params=groupby_params, + output_types=[output_type], + ).new_chunk(chunk_inputs, **df.params) + return GroupByConcat( + groups=[df], + groupby_params=df.op.groupby_params, + output_types=[output_type], + ).new_tileable(inputs, chunks=[chunk], **df.params) + elif isinstance(df, CATEGORICAL_TYPE): + chunk = DataFrameConcat(output_types=[OutputType.categorical]).new_chunk( + df.chunks, + shape=df.shape, + index=(0,), + dtype=df.dtype, + categories_value=df.categories_value, + ) + return DataFrameConcat( + output_types=[OutputType.categorical] + ).new_categorical( + [df], + shape=df.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in df.shape), + dtype=df.dtype, + categories_value=df.categories_value, + ) + elif isinstance(df, TENSOR_TYPE): + return TensorOperandMixin.concat_tileable_chunks(tileable) + else: + raise NotImplementedError + + @classmethod + def create_tileable_from_chunks(cls, chunks, inputs=None, **kw): + ndim = chunks[0].ndim + index_min, index_max = [None] * ndim, [None] * ndim + for c in chunks: + for ax, i in enumerate(c.index): + if index_min[ax] is None: + index_min[ax] = i + else: + index_min[ax] = min(i, index_min[ax]) + if index_max[ax] is None: + index_max[ax] = i + else: + index_max[ax] = max(i, index_max[ax]) + + # gen {chunk index -> shape} + chunk_index_to_shape = OrderedDict() + chunk_index_to_chunk = dict() + for c in chunks: + new_index = [] + for ax, i in enumerate(c.index): + new_index.append(i - index_min[ax]) + chunk_index_to_shape[tuple(new_index)] = c.shape + chunk_index_to_chunk[tuple(new_index)] = c + + nsplits = calc_nsplits(chunk_index_to_shape) + shape = tuple(sum(ns) for ns in nsplits) + 
chunk_shape = tuple(len(ns) for ns in nsplits)
+        op = chunks[0].op.copy().reset_key()
+        if isinstance(chunks[0], DATAFRAME_CHUNK_TYPE):
+            params = cls._calc_dataframe_params(chunk_index_to_chunk, chunk_shape)
+            params.update(kw)
+            return op.new_dataframe(
+                inputs, shape=shape, chunks=chunks, nsplits=nsplits, **params
+            )
+        elif isinstance(chunks[0], SERIES_CHUNK_TYPE):
+            params = cls._calc_series_index_params(chunks)
+            params.update(kw)
+            return op.new_series(
+                inputs, shape=shape, chunks=chunks, nsplits=nsplits, **params
+            )
+        else:
+            assert isinstance(chunks[0], INDEX_CHUNK_TYPE)
+            params = cls._calc_series_index_params(chunks)
+            params.update(kw)
+            return op.new_index(
+                inputs, shape=shape, chunks=chunks, nsplits=nsplits, **params
+            )
+
+    @classmethod
+    def _calc_dataframe_params(cls, chunk_index_to_chunks, chunk_shape):
+        dtypes = pd.concat(
+            [
+                chunk_index_to_chunks[0, i].dtypes
+                for i in range(chunk_shape[1])
+                if (0, i) in chunk_index_to_chunks
+            ]
+        )
+        columns_value = parse_index(dtypes.index, store_data=True)
+        pd_indexes = [
+            chunk_index_to_chunks[i, 0].index_value.to_pandas()
+            for i in range(chunk_shape[0])
+            if (i, 0) in chunk_index_to_chunks
+        ]
+        pd_index = reduce(lambda x, y: x.append(y), pd_indexes)
+        index_value = parse_index(pd_index)
+        return {
+            "dtypes": dtypes,
+            "columns_value": columns_value,
+            "index_value": index_value,
+        }
+
+    @classmethod
+    def _calc_series_index_params(cls, chunks):
+        pd_indexes = [c.index_value.to_pandas() for c in chunks]
+        pd_index = reduce(lambda x, y: x.append(y), pd_indexes)
+        index_value = parse_index(pd_index)
+        return {"dtype": chunks[0].dtype, "index_value": index_value}
+
+    def get_fuse_op_cls(self, _):
+        return DataFrameFuseChunk
+
+    @staticmethod
+    def _process_input(x):
+        from .initializer import DataFrame, Series
+
+        if isinstance(x, (DATAFRAME_TYPE, SERIES_TYPE)) or pd.api.types.is_scalar(x):
+            return x
+        elif isinstance(x, pd.Series):
+            return Series(x)
+        elif isinstance(x, pd.DataFrame):
+            return DataFrame(x)
+        elif isinstance(x, (list, tuple, np.ndarray, TENSOR_TYPE)):
+            return astensor(x)
+        raise NotImplementedError
+
+
+DataFrameOperand = Operand
+
+
+class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperandMixin):
+    def __init__(self, sparse=None, output_types=None, **kwargs):
+        super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
+
+    @classmethod
+    def execute(cls, ctx, op):
+        pass
+
+
+class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperandMixin):
+    __slots__ = ()
+
+    def _create_chunk(self, output_idx, index, **kw):
+        data = FuseChunkData(_index=index, _shape=kw.pop("shape", None), _op=self, **kw)
+
+        return FuseChunk(data)
+
+
+class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
+    @property
+    def output_types(self):
+        return self.outputs[-1].chunk.op.output_types
diff --git a/python/xorbits/_mars/dataframe/plotting/__init__.py b/python/xorbits/_mars/dataframe/plotting/__init__.py
new file mode 100644
index 000000000..29371e4f1
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _install():
+    import pandas as pd
+
+    from ..base.accessor import CachedAccessor
+    from ..core import DATAFRAME_TYPE, GROUPBY_TYPE, SERIES_TYPE
+    from .core import PlotAccessor
+
+    for t in DATAFRAME_TYPE + SERIES_TYPE + GROUPBY_TYPE:
+        t.plot = CachedAccessor("plot", PlotAccessor)
+
+    for method in dir(pd.DataFrame.plot):
+        if not method.startswith("_"):
+            PlotAccessor._register(method)
+
+    PlotAccessor.__doc__ = pd.DataFrame.plot.__doc__.replace("pd.", "md.")
+
+
+_install()
+del _install
diff --git a/python/xorbits/_mars/dataframe/plotting/core.py b/python/xorbits/_mars/dataframe/plotting/core.py
new file mode 100644
index 000000000..66c7acc59
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/core.py
@@ -0,0 +1,69 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+
+import pandas as pd
+
+from ...core import ENTITY_TYPE, ExecutableTuple
+from ...utils import adapt_mars_docstring
+
+
+class PlotAccessor:
+    def __init__(self, obj):
+        self._obj = obj
+
+    def __call__(self, kind="line", session=None, **kwargs):
+        to_executes = OrderedDict()
+        to_executes["__object__"] = self._obj
+
+        for k, v in kwargs.items():
+            if isinstance(v, ENTITY_TYPE):
+                to_executes[k] = v
+
+        result = dict()
+        executed = ExecutableTuple(to_executes.values()).execute().fetch()
+        for p, v in zip(to_executes, executed):
+            result[p] = v
+
+        data = result.pop("__object__")
+        pd_kwargs = kwargs.copy()
+        pd_kwargs["kind"] = kind
+        pd_kwargs.update(result)
+
+        return data.plot(**pd_kwargs)
+
+    @classmethod
+    def _gen_func(cls, name, doc):
+        def _inner(self, *args, **kwargs):
+            return self(kind=name, *args, **kwargs)
+
+        _inner.__name__ = name
+        _inner.__doc__ = doc
+
+        return _inner
+
+    @classmethod
+    def _register(cls, method):
+        doc = getattr(pd.DataFrame.plot, method).__doc__
+        new_doc = adapt_mars_docstring(doc)
+        if method == "hexbin":
+            # make doc pass
+            new_doc = new_doc.replace(
+                "reduce_C_function=mt.sum", "reduce_C_function=sum"
+            )
+        elif method == "line":
+            new_doc = new_doc.replace("s.plot.line().execute()", "s.plot.line()")
+            new_doc = new_doc.replace("type(axes).execute()", "type(axes)")
+        setattr(cls, method, cls._gen_func(method, new_doc))
diff --git a/python/xorbits/_mars/dataframe/plotting/tests/__init__.py b/python/xorbits/_mars/dataframe/plotting/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/dataframe/plotting/tests/test_plot.py b/python/xorbits/_mars/dataframe/plotting/tests/test_plot.py
new file mode 100644
index 000000000..9badb2fc9
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/plotting/tests/test_plot.py
@@ -0,0 +1,118 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import warnings
+
+import numpy as np
+import pandas as pd
+import pytest
+
+try:
+    import matplotlib
+except ImportError:  # pragma: no cover
+    matplotlib = None
+
+from .... import dataframe as md
+from .... import tensor as mt
+
+
+def close(fignum=None):  # pragma: no cover
+    from matplotlib.pyplot import close as _close
+    from matplotlib.pyplot import get_fignums
+
+    if fignum is None:
+        for fignum in get_fignums():
+            _close(fignum)
+    else:
+        _close(fignum)
+
+
+def assert_is_valid_plot_return_object(objs):  # pragma: no cover
+    import matplotlib.pyplot as plt
+
+    if isinstance(objs, (pd.Series, np.ndarray)):
+        for el in objs.ravel():
+            msg = (
+                "one of 'objs' is not a matplotlib Axes instance, "
+                f"type encountered {type(el).__name__}"
+            )
+            assert isinstance(el, (plt.Axes, dict)), msg
+    else:
+        msg = (
+            "objs is neither an ndarray of Artist instances nor a single "
+            f"Artist instance, tuple, or dict, 'objs' is a {type(objs).__name__}"
+        )
+        assert isinstance(objs, (plt.Artist, tuple, dict)), msg
+
+
+def _check_plot_works(f, filterwarnings="always", **kwargs):  # pragma: no cover
+    import matplotlib.pyplot as plt
+
+    ret = None
+    with warnings.catch_warnings():
+        warnings.simplefilter(filterwarnings)
+        try:
+            try:
+                fig = kwargs["figure"]
+            except KeyError:
+                fig = plt.gcf()
+
+            plt.clf()
+
+            kwargs.get("ax", fig.add_subplot(211))
+            ret = f(**kwargs)
+
+            assert_is_valid_plot_return_object(ret)
+
+            if f is pd.plotting.bootstrap_plot:
+                assert "ax" not in kwargs
+            else:
+                kwargs["ax"] = fig.add_subplot(212)
+
+            ret = f(**kwargs)
+            assert_is_valid_plot_return_object(ret)
+
+            with tempfile.TemporaryFile() as path:
+                plt.savefig(path)
+        finally:
+            close(fig)
+
+    return ret
+
+
+@pytest.mark.skipif(matplotlib is None, reason="matplotlib is not installed")
+def test_plot(setup):
+    raw = pd.DataFrame(
+        {
+            "a": ["s" + str(i) for i in range(10)],
+            "b": np.random.RandomState(0).randint(10, size=10),
+        }
+    )
+    df = md.DataFrame(raw, chunk_size=3)
+
+    _check_plot_works(df.plot, x="a", y="b")
+    _check_plot_works(df.plot, x="a", y=mt.tensor("b"))
+
_check_plot_works(df.plot.line) + + raw = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + df = md.DataFrame(raw, chunk_size=3) + _check_plot_works(df.groupby("A").plot) diff --git a/python/xorbits/_mars/dataframe/reduction/__init__.py b/python/xorbits/_mars/dataframe/reduction/__init__.py new file mode 100644 index 000000000..a8ccc76a5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/__init__.py @@ -0,0 +1,111 @@ +# isort: skip_file +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import CustomReduction +from .aggregation import DataFrameAggregate + +from .sum import DataFrameSum +from .prod import DataFrameProd +from .max import DataFrameMax +from .min import DataFrameMin +from .count import DataFrameCount +from .mean import DataFrameMean +from .var import DataFrameVar +from .all import DataFrameAll +from .any import DataFrameAny +from .skew import DataFrameSkew +from .kurtosis import DataFrameKurtosis +from .sem import DataFrameSem +from .reduction_size import DataFrameSize +from .str_concat import DataFrameStrConcat, build_str_concat_object +from .custom_reduction import DataFrameCustomReduction + +from .cummax import DataFrameCummax +from .cummin import DataFrameCummin +from .cumprod import DataFrameCumprod +from .cumsum import DataFrameCumsum + +from .nunique import DataFrameNunique +from .unique import DataFrameUnique, unique + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE + from .aggregation import aggregate + from .sum import sum_series, sum_dataframe + from .prod import prod_series, prod_dataframe + from .max import max_series, max_dataframe, max_index + from .min import min_series, min_dataframe, min_index + from .count import count_series, count_dataframe + from .mean import mean_series, mean_dataframe + from .var import var_series, var_dataframe + from .std import std_series, std_dataframe + from .all import all_series, all_dataframe, all_index + from .any import any_series, any_dataframe, any_index + from .cummax import cummax + from .cummin import cummin + from .cumprod import cumprod + from .cumsum import cumsum + from .nunique import nunique_dataframe, nunique_series + from .sem import sem_dataframe, sem_series + from .skew import skew_dataframe, skew_series + from .kurtosis import kurt_dataframe, kurt_series + from .reduction_size import size_dataframe, size_series + + funcs = [ + ("sum", sum_series, sum_dataframe), + ("prod", prod_series, prod_dataframe), + ("product", prod_series, prod_dataframe), + ("max", max_series, max_dataframe), + ("min", min_series, min_dataframe), + ("count", count_series, count_dataframe), + ("mean", mean_series, mean_dataframe), + ("var", var_series, var_dataframe), + ("std", std_series, std_dataframe), + ("all", all_series, all_dataframe), + ("any", any_series, any_dataframe), + ("cummax", 
cummax, cummax), + ("cummin", cummin, cummin), + ("cumprod", cumprod, cumprod), + ("cumsum", cumsum, cumsum), + ("agg", aggregate, aggregate), + ("aggregate", aggregate, aggregate), + ("nunique", nunique_series, nunique_dataframe), + ("sem", sem_series, sem_dataframe), + ("skew", skew_series, skew_dataframe), + ("kurt", kurt_series, kurt_dataframe), + ("kurtosis", kurt_series, kurt_dataframe), + ("unique", unique, None), + ("_reduction_size", size_dataframe, size_series), + ] + for func_name, series_func, df_func in funcs: + if df_func is not None: # pragma: no branch + for t in DATAFRAME_TYPE: + setattr(t, func_name, df_func) + if series_func is not None: # pragma: no branch + for t in SERIES_TYPE: + setattr(t, func_name, series_func) + + for t in INDEX_TYPE: + setattr(t, "agg", aggregate) + setattr(t, "aggregate", aggregate) + setattr(t, "all", all_index) + setattr(t, "any", any_index) + setattr(t, "min", min_index) + setattr(t, "max", max_index) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/reduction/aggregation.py b/python/xorbits/_mars/dataframe/reduction/aggregation.py new file mode 100644 index 000000000..4a0882f41 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/aggregation.py @@ -0,0 +1,1037 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import functools +import itertools +from collections import OrderedDict +from collections.abc import Iterable +from typing import Dict, List + +import numpy as np +import pandas as pd + +from ... import opcodes +from ... 
import tensor as mars_tensor +from ...config import options +from ...core import ENTITY_TYPE, OutputType, enter_mode, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + ListField, +) +from ...utils import ceildiv, enter_current_session, lazy_import, pd_release_version +from ..core import INDEX_CHUNK_TYPE +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_empty_df, build_series, parse_index, validate_axis +from .core import CustomReduction, ReductionAggStep, ReductionCompiler, ReductionSteps + +cp = lazy_import("cupy", rename="cp") +cudf = lazy_import("cudf") + +_agg_size_as_series = pd_release_version >= (1, 3, 0) + + +def where_function(cond, var1, var2): + if hasattr(var1, "ndim") and var1.ndim >= 1: + return var1.where(cond, var2) + elif isinstance(var1, ENTITY_TYPE): + return mars_tensor.where(cond, var1, var2) + else: + return np.where(cond, var1, var2).item() + + +_agg_functions = { + "sum": lambda x, skipna=True: x.sum(skipna=skipna), + "prod": lambda x, skipna=True: x.prod(skipna=skipna), + "product": lambda x, skipna=True: x.product(skipna=skipna), + "min": lambda x, skipna=True: x.min(skipna=skipna), + "max": lambda x, skipna=True: x.max(skipna=skipna), + "all": lambda x, skipna=True: x.all(skipna=skipna), + "any": lambda x, skipna=True: x.any(skipna=skipna), + "count": lambda x: x.count(), + "size": lambda x: x._reduction_size(), + "mean": lambda x, skipna=True: x.mean(skipna=skipna), + "var": lambda x, skipna=True, ddof=1: x.var(skipna=skipna, ddof=ddof), + "std": lambda x, skipna=True, ddof=1: x.std(skipna=skipna, ddof=ddof), + "sem": lambda x, skipna=True, ddof=1: x.sem(skipna=skipna, ddof=ddof), + "skew": lambda x, skipna=True, bias=False: x.skew(skipna=skipna, bias=bias), + "kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias), + "kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias), + "nunique": lambda x: x.nunique(), +} + + +class DataFrameAggregate(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.AGGREGATE + + raw_func = AnyField("raw_func") + raw_func_kw = DictField("raw_func_kw") + func = AnyField("func") + func_rename = ListField("func_rename") + axis = AnyField("axis") + numeric_only = BoolField("numeric_only") + bool_only = BoolField("bool_only") + use_inf_as_na = BoolField("use_inf_as_na") + + combine_size = Int32Field("combine_size") + pre_funcs = ListField("pre_funcs") + agg_funcs = ListField("agg_funcs") + post_funcs = ListField("post_funcs") + + @staticmethod + def _filter_dtypes(op: "DataFrameAggregate", dtypes): + if not op.numeric_only and not op.bool_only: + return dtypes + empty_df = build_empty_df(dtypes) + return empty_df.select_dtypes( + [np.number, np.bool_] if op.numeric_only else [np.bool_] + ).dtypes + + def _calc_result_shape(self, df): + if df.ndim == 2: + if self.numeric_only: + df = df.select_dtypes([np.number, np.bool_]) + elif self.bool_only: + df = df.select_dtypes([np.bool_]) + + if self.output_types[0] == OutputType.dataframe: + test_obj = build_df(df, size=[2, 2], fill_value=[1, 2], ensure_string=True) + else: + test_obj = build_series( + df, size=[2, 2], fill_value=[1, 2], name=df.name, ensure_string=True + ) + + result_df = test_obj.agg(self.raw_func, axis=self.axis, **self.raw_func_kw) + + if isinstance(result_df, pd.DataFrame): + 
self.output_types = [OutputType.dataframe] + return result_df.dtypes, result_df.index + elif isinstance(result_df, pd.Series): + self.output_types = [OutputType.series] + return pd.Series([result_df.dtype], index=[result_df.name]), result_df.index + else: + self.output_types = [OutputType.scalar] + return np.array(result_df).dtype, None + + def __call__(self, df, output_type=None, dtypes=None, index=None): + self._output_types = df.op.output_types + normalize_reduction_funcs(self, ndim=df.ndim) + if output_type is None or dtypes is None: + with enter_mode(kernel=False, build=False): + dtypes, index = self._calc_result_shape(df) + else: + self.output_types = [output_type] + + if self.output_types[0] == OutputType.dataframe: + if self.axis == 0: + new_shape = (len(index), len(dtypes)) + new_index = parse_index(index, store_data=True) + else: + new_shape = (df.shape[0], len(dtypes)) + new_index = df.index_value + return self.new_dataframe( + [df], + shape=new_shape, + dtypes=dtypes, + index_value=new_index, + columns_value=parse_index(dtypes.index, store_data=True), + ) + elif self.output_types[0] == OutputType.series: + if df.ndim == 1: + new_shape = (len(index),) + new_index = parse_index(index, store_data=True) + elif self.axis == 0: + new_shape = (len(index),) + new_index = parse_index(index, store_data=True) + else: + new_shape = (df.shape[0],) + new_index = df.index_value + return self.new_series( + [df], + shape=new_shape, + dtype=dtypes[0], + name=dtypes.index[0], + index_value=new_index, + ) + elif self.output_types[0] == OutputType.tensor: + return self.new_tileable([df], dtype=dtypes, shape=(np.nan,)) + else: + return self.new_scalar([df], dtype=dtypes) + + @staticmethod + def _safe_append(d, key, val): + if key not in d: + d[key] = [] + if val not in d[key]: + d[key].append(val) + + @classmethod + def _gen_map_chunks( + cls, + op, + in_df, + out_df, + func_infos: List[ReductionSteps], + input_index_to_output: Dict[int, int], + ): + axis = op.axis + + if axis == 0: + agg_chunks_shape = ( + (in_df.chunk_shape[0], len(func_infos)) + if len(in_df.chunk_shape) == 2 + else (in_df.chunk_shape[0], 1) + ) + else: + agg_chunks_shape = (len(func_infos), in_df.chunk_shape[1]) + + agg_chunks = np.empty(agg_chunks_shape, dtype=object) + dtypes_cache = dict() + for chunk in in_df.chunks: + input_index = chunk.index[1 - axis] if len(chunk.index) > 1 else 0 + if input_index not in input_index_to_output: + continue + map_op = op.copy().reset_key() # type: "DataFrameAggregate" + new_axis_index = input_index_to_output[input_index] + func_info = func_infos[new_axis_index] + # force as_index=True for map phase + map_op.output_types = ( + [OutputType.dataframe] if chunk.ndim == 2 else [OutputType.series] + ) + map_op.stage = OperandStage.map + map_op.pre_funcs = func_info.pre_funcs + map_op.agg_funcs = func_info.agg_funcs + + if axis == 0: + new_index = ( + (chunk.index[0], new_axis_index) + if len(chunk.index) == 2 + else (chunk.index[0], 0) + ) + else: + new_index = (new_axis_index, chunk.index[1]) + + if map_op.output_types[0] == OutputType.dataframe: + if axis == 0: + shape = (1, out_df.shape[-1]) + if out_df.ndim == 2: + columns_value = out_df.columns_value + index_value = out_df.index_value + else: + columns_value = out_df.index_value + index_value = parse_index(pd.Index([0]), out_df.key) + + try: + dtypes = dtypes_cache[chunk.index[1]] + except KeyError: + dtypes = chunk.dtypes.reindex( + columns_value.to_pandas() + ).dropna() + dtypes_cache[chunk.index[1]] = dtypes + + agg_chunk = 
map_op.new_chunk( + [chunk], + shape=shape, + index=new_index, + dtypes=dtypes, + columns_value=columns_value, + index_value=index_value, + ) + else: + shape = (out_df.shape[0], 1) + columns_value = parse_index( + pd.Index([0]), out_df.key, store_data=True + ) + index_value = out_df.index_value + + agg_chunk = map_op.new_chunk( + [chunk], + shape=shape, + index=new_index, + columns_value=columns_value, + index_value=index_value, + ) + else: + agg_chunk = map_op.new_chunk([chunk], shape=(1,), index=new_index) + agg_chunks[agg_chunk.index] = agg_chunk + return agg_chunks + + @classmethod + def _tile_single_chunk(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunk_op = op.copy().reset_key() + if op.output_types[0] == OutputType.dataframe: + chunk = chunk_op.new_chunk( + in_df.chunks, + index=(0, 0), + shape=out_df.shape, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + dtypes=out_df.dtypes, + ) + elif op.output_types[0] == OutputType.series: + chunk = chunk_op.new_chunk( + in_df.chunks, + index=(0,), + shape=out_df.shape, + dtype=out_df.dtype, + index_value=out_df.index_value, + name=out_df.name, + ) + elif op.output_types[0] == OutputType.tensor: + chunk = chunk_op.new_chunk( + in_df.chunks, index=(0,), dtype=out_df.dtype, shape=(np.nan,) + ) + else: + chunk = chunk_op.new_chunk( + in_df.chunks, dtype=out_df.dtype, index=(), shape=() + ) + + tileable_op = op.copy().reset_key() + kw = out_df.params.copy() + kw.update(dict(chunks=[chunk], nsplits=tuple((x,) for x in out_df.shape))) + return tileable_op.new_tileables([in_df], **kw) + + @classmethod + def _tile_size(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + out_df = op.outputs[0] + + chunks = [] + for c in in_df.chunks: + chunk_op = op.copy().reset_key() + chunks.append( + chunk_op.new_chunk( + [c], + index=c.index, + shape=(1,) * len(in_df.shape), + dtype=out_df.dtype, + ) + ) + + tileable_op = op.copy().reset_key() + nsplits = tuple((1,) * s for s in in_df.chunk_shape) + tileable = tileable_op.new_tileable( + out_df.inputs, + chunks=chunks, + nsplits=nsplits, + shape=in_df.chunk_shape, + dtype=out_df.dtype, + ) + ret = yield from recursive_tile(tileable.sum()) + return [ret] + + @staticmethod + def _add_functions( + op: "DataFrameAggregate", compiler: ReductionCompiler, cols=None + ): + if isinstance(op.func, list): + func_iter = ((None, f) for f in op.func) + cols_set = set(cols) if cols is not None else None + else: + assert cols is not None + cols_set = set(cols) & set(op.func.keys()) + if len(cols_set) == 0: + return False + func_iter = ((col, f) for col, funcs in op.func.items() for f in funcs) + + func_renames = ( + op.func_rename + if getattr(op, "func_rename", None) is not None + else itertools.repeat(None) + ) + for func_rename, (col, f) in zip(func_renames, func_iter): + if cols_set is not None and col is not None and col not in cols_set: + continue + func_name = None + if isinstance(f, str): + f, func_name = _agg_functions[f], f + if func_rename is not None: + func_name = func_rename + ndim = 1 if cols is None else 2 + func_cols = [col] if col is not None else None + compiler.add_function(f, ndim, cols=func_cols, func_name=func_name) + return True + + @classmethod + def _tile_tree(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + out_df = op.outputs[0] + combine_size = op.combine_size + axis = op.axis + + input_index_to_output = dict() + output_index_to_input = [] + axis_func_infos = [] + dtypes_list = [] + if len(in_df.chunk_shape) > 1: + for 
col_idx in range(in_df.chunk_shape[1 - axis]): + compiler = ReductionCompiler(axis=op.axis) + idx_chunk = ( + in_df.cix[0, col_idx] if axis == 0 else in_df.cix[col_idx, 0] + ) + new_dtypes = cls._filter_dtypes(op, idx_chunk.dtypes) + if not cls._add_functions(op, compiler, cols=list(new_dtypes.index)): + continue + input_index_to_output[col_idx] = len(axis_func_infos) + output_index_to_input.append(col_idx) + axis_func_infos.append(compiler.compile()) + dtypes_list.append(new_dtypes) + else: + compiler = ReductionCompiler(axis=op.axis) + cls._add_functions(op, compiler) + input_index_to_output[0] = 0 + axis_func_infos.append(compiler.compile()) + + chunks = cls._gen_map_chunks( + op, in_df, out_df, axis_func_infos, input_index_to_output + ) + while chunks.shape[axis] > combine_size: + if axis == 0: + new_chunks_shape = ( + ceildiv(chunks.shape[0], combine_size), + chunks.shape[1], + ) + else: + new_chunks_shape = ( + chunks.shape[0], + ceildiv(chunks.shape[1], combine_size), + ) + + new_chunks = np.empty(new_chunks_shape, dtype=object) + for idx0, i in enumerate(range(0, chunks.shape[axis], combine_size)): + for idx1 in range(chunks.shape[1 - axis]): + func_info = axis_func_infos[idx1] + if axis == 0: + chks = chunks[i : i + combine_size, idx1] + chunk_index = (idx0, idx1) + if chks[0].ndim == 1: + concat_shape = (len(chks),) + agg_shape = (1,) + else: + concat_shape = (len(chks), chks[0].shape[1]) + agg_shape = (chks[0].shape[1], 1) + else: + chks = chunks[idx1, i : i + combine_size] + chunk_index = (idx1, idx0) + concat_shape = (chks[0].shape[0], len(chks)) + agg_shape = (chks[0].shape[0], 1) + + chks = chks.reshape((chks.shape[0],)).tolist() + if len(chks) == 1: + chk = chks[0] + else: + concat_op = DataFrameConcat( + output_types=[OutputType.dataframe], axis=axis + ) + # Change index for concatenate + for j, c in enumerate(chks): + c._index = (j, 0) if axis == 0 else (0, j) + chk = concat_op.new_chunk( + chks, + dtypes=dtypes_list[idx1] if dtypes_list else None, + shape=concat_shape, + index_value=chks[0].index_value, + ) + chunk_op = op.copy().reset_key() + chunk_op.output_types = [OutputType.dataframe] + chunk_op.stage = OperandStage.combine + chunk_op.agg_funcs = func_info.agg_funcs + + if axis == 0: + new_chunks[chunk_index] = chunk_op.new_chunk( + [chk], + index=chunk_index, + shape=agg_shape, + index_value=chks[0].index_value, + ) + else: + new_chunks[chunk_index] = chunk_op.new_chunk( + [chk], + index=chunk_index, + shape=agg_shape, + index_value=chks[0].columns_value, + ) + chunks = new_chunks + + agg_chunks = [] + for idx in range(chunks.shape[1 - axis]): + func_info = axis_func_infos[idx] + + concat_op = DataFrameConcat(output_types=[OutputType.dataframe], axis=axis) + if axis == 0: + chks = chunks[:, idx] + if chks[0].ndim == 1: + concat_shape = (len(chks),) + else: + concat_shape = (len(chks), chks[0].shape[1]) + else: + chks = chunks[idx, :] + concat_shape = (chks[0].shape[0], len(chks)) + chks = chks.reshape((chks.shape[0],)).tolist() + chk = concat_op.new_chunk( + chks, + dtypes=dtypes_list[idx] if dtypes_list else None, + shape=concat_shape, + index_value=chks[0].index_value, + ) + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.agg + chunk_op.agg_funcs = func_info.agg_funcs + chunk_op.post_funcs = func_info.post_funcs + + kw = out_df.params.copy() + if op.output_types[0] == OutputType.dataframe: + if axis == 0: + src_col_chunk = in_df.cix[0, output_index_to_input[idx]] + valid_cols = [ + c for pre in func_info.pre_funcs for c in pre.columns or () + ] + 
if not valid_cols: + columns_value = src_col_chunk.columns_value + shape_len = src_col_chunk.shape[1] + else: + col_index = pd.Index(valid_cols).unique() + columns_value = parse_index(col_index, store_data=True) + shape_len = len(col_index) + kw.update( + dict( + shape=(out_df.shape[0], shape_len), + columns_value=columns_value, + index=(0, idx), + dtypes=out_df.dtypes[columns_value.to_pandas()], + ) + ) + else: + src_col_chunk = in_df.cix[output_index_to_input[idx], 0] + kw.update( + dict( + index=(idx, 0), + index_value=src_col_chunk.index_value, + shape=(src_col_chunk.shape[0], out_df.shape[1]), + dtypes=out_df.dtypes, + ) + ) + else: + if op.output_types[0] == OutputType.series: + if in_df.ndim == 1: + index_value, shape = out_df.index_value, out_df.shape + elif axis == 0: + out_dtypes = dtypes_list[idx] + index_value = parse_index(out_dtypes.index, store_data=True) + shape = (len(out_dtypes),) + else: + src_chunk = in_df.cix[output_index_to_input[idx], 0] + index_value, shape = ( + src_chunk.index_value, + (src_chunk.shape[0],), + ) + kw.update( + dict( + name=out_df.name, + dtype=out_df.dtype, + index=(idx,), + index_value=index_value, + shape=shape, + ) + ) + elif op.output_types[0] == OutputType.tensor: + kw.update(dict(index=(0,), shape=(np.nan,), dtype=out_df.dtype)) + else: + kw.update(dict(index=(), shape=(), dtype=out_df.dtype)) + agg_chunks.append(chunk_op.new_chunk([chk], **kw)) + + new_op = op.copy() + if op.output_types[0] == OutputType.dataframe: + if axis == 0: + nsplits = ((out_df.shape[0],), tuple(c.shape[1] for c in agg_chunks)) + else: + nsplits = (tuple(c.shape[0] for c in agg_chunks), (out_df.shape[1],)) + return new_op.new_tileables( + op.inputs, + chunks=agg_chunks, + nsplits=nsplits, + dtypes=out_df.dtypes, + shape=out_df.shape, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + ) + elif op.output_types[0] == OutputType.series: + nsplits = (tuple(c.shape[0] for c in agg_chunks),) + return new_op.new_tileables( + op.inputs, + chunks=agg_chunks, + nsplits=nsplits, + dtype=out_df.dtype, + shape=out_df.shape, + index_value=out_df.index_value, + name=out_df.name, + ) + elif op.output_types[0] == OutputType.tensor: # unique + return new_op.new_tileables( + op.inputs, + chunks=agg_chunks, + dtype=out_df.dtype, + shape=out_df.shape, + nsplits=((np.nan,),), + ) + else: # scalar + return new_op.new_tileables( + op.inputs, chunks=agg_chunks, dtype=out_df.dtype, shape=(), nsplits=() + ) + + @classmethod + def tile(cls, op: "DataFrameAggregate"): + in_df = op.inputs[0] + + if len(in_df.chunks) == 1: + return cls._tile_single_chunk(op) + elif not _agg_size_as_series and in_df.ndim == 2 and op.raw_func == "size": + return (yield from cls._tile_size(op)) + else: + return cls._tile_tree(op) + + @classmethod + def _wrap_df(cls, op, value, index=None): + xdf = cudf if op.gpu else pd + axis = op.axis + ndim = op.inputs[0].ndim + + if ndim == 2: + dtype = None + if isinstance(value, (np.generic, int, float, complex)): + value = xdf.DataFrame([value], columns=index) + elif not isinstance(value, xdf.DataFrame): + new_index = None if not op.gpu else getattr(value, "index", None) + dtype = getattr(value, "dtype", None) + if xdf is pd: + value = xdf.DataFrame(value, columns=index, index=new_index) + else: # pragma: no cover + value = xdf.DataFrame(value) + value.index = new_index + value.columns = index + else: + return value + + value = value.T if axis == 0 else value + if ( + dtype == np.dtype("O") + and getattr(op.outputs[0], "dtypes", None) is not None + ): + 
value = value.astype(op.outputs[0].dtypes) + return value + else: + if isinstance(value, (np.generic, int, float, complex)): + value = xdf.Series([value], index=index) + elif isinstance(value, np.ndarray): + # assert value.ndim == 0 + value = xdf.Series(value.tolist(), index=index) + return value + + @staticmethod + def _pack_inputs(agg_funcs: List[ReductionAggStep], in_data): + pos = 0 + out_dict = dict() + for step in agg_funcs: + if step.custom_reduction is None: + out_dict[step.output_key] = in_data[pos] + else: + out_dict[step.output_key] = tuple( + in_data[pos : pos + step.output_limit] + ) + pos += step.output_limit + return out_dict + + @classmethod + def _do_predefined_agg(cls, op: "DataFrameAggregate", input_obj, func_name, kwds): + if func_name == "size": + return input_obj.agg(lambda x: x.size, axis=op.axis) + elif func_name == "str_concat": + ret = input_obj.agg(lambda x: x.str.cat(**kwds), axis=op.axis) + if isinstance(ret, str): + ret = pd.Series([ret]) + return ret + else: + if op.gpu: + if kwds.pop("numeric_only", None): + raise NotImplementedError("numeric_only not implemented under cudf") + if isinstance(input_obj, pd.Index): + kwds.pop("skipna", None) + return getattr(input_obj, func_name)(**kwds) + + @classmethod + def _select_dtypes(cls, in_data, op: "DataFrameAggregate"): + if in_data.ndim == 2: + if op.numeric_only: + in_data = in_data.select_dtypes([np.number, np.bool_]) + elif op.bool_only: + in_data = in_data.select_dtypes([np.bool_]) + return in_data + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameAggregate"): + in_data = ctx[op.inputs[0].key] + axis_index = op.outputs[0].index[op.axis] + in_data = cls._select_dtypes(in_data, op) + + # map according to map groups + ret_map_dfs = dict() + in_cols_set = set(in_data.columns) if in_data.ndim == 2 else None + for input_key, output_key, cols, func in op.pre_funcs: + if cols and in_cols_set == set(cols): + cols = None + + src_df = in_data if cols is None else in_data[cols] + if input_key == output_key: + ret_map_dfs[output_key] = src_df + else: + ret_map_dfs[output_key] = func(src_df, gpu=op.is_gpu()) + + agg_dfs = [] + for ( + input_key, + _, + map_func_name, + _agg_func_name, + custom_reduction, + _output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = ret_map_dfs[input_key] + if map_func_name == "custom_reduction": + pre_result = custom_reduction.pre(input_obj) + if not isinstance(pre_result, tuple): + pre_result = (pre_result,) + + if custom_reduction.pre_with_agg: + # when custom_reduction.pre already aggregates, skip + agg_result = pre_result + else: + agg_result = custom_reduction.agg(*pre_result) + if not isinstance(agg_result, tuple): + agg_result = (agg_result,) + + agg_dfs.extend( + [cls._wrap_df(op, r, index=[axis_index]) for r in agg_result] + ) + else: + agg_dfs.append( + cls._wrap_df( + op, + cls._do_predefined_agg(op, input_obj, map_func_name, kwds), + index=[axis_index], + ) + ) + ctx[op.outputs[0].key] = tuple(agg_dfs) + + @classmethod + def _execute_combine(cls, ctx, op: "DataFrameAggregate"): + in_data = ctx[op.inputs[0].key] + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data) + axis = op.axis + axis_index = op.outputs[0].index[axis] + + combines = [] + for ( + _input_key, + _, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = in_data_dict[output_key] + if agg_func_name == "custom_reduction": + agg_result = custom_reduction.agg(*input_obj) + if not isinstance(agg_result, tuple): + 
agg_result = (agg_result,) + combines.extend( + [cls._wrap_df(op, r, index=[axis_index]) for r in agg_result] + ) + else: + combines.append( + cls._wrap_df( + op, + cls._do_predefined_agg(op, input_obj, agg_func_name, kwds), + index=[axis_index], + ) + ) + ctx[op.outputs[0].key] = tuple(combines) + + @classmethod + def _execute_agg(cls, ctx, op: "DataFrameAggregate"): + xdf = cudf if op.gpu else pd + xp = cp if op.gpu else np + + out = op.outputs[0] + in_data = ctx[op.inputs[0].key] + in_data_dict = cls._pack_inputs(op.agg_funcs, in_data) + axis = op.axis + + # perform agg + for ( + _input_key, + _, + _map_func_name, + agg_func_name, + custom_reduction, + output_key, + _output_limit, + kwds, + ) in op.agg_funcs: + input_obj = in_data_dict[output_key] + if agg_func_name == "custom_reduction": + agg_result = custom_reduction.agg(*input_obj) + if not isinstance(agg_result, tuple): + agg_result = (agg_result,) + in_data_dict[output_key] = custom_reduction.post(*agg_result) + else: + in_data_dict[output_key] = cls._do_predefined_agg( + op, input_obj, agg_func_name, kwds + ) + + aggs = [] + # perform post op + for input_keys, _output_key, func_name, cols, func in op.post_funcs: + if cols is None: + func_inputs = [in_data_dict[k] for k in input_keys] + else: + func_inputs = [in_data_dict[k][cols] for k in input_keys] + + agg_series = func(*func_inputs, gpu=op.is_gpu()) + agg_series_ndim = getattr(agg_series, "ndim", 0) + + ser_index = None + if agg_series_ndim < out.ndim: + ser_index = [func_name] + aggs.append(cls._wrap_df(op, agg_series, index=ser_index)) + + # concatenate to produce final result + concat_df = xdf.concat(aggs, axis=axis) + if op.output_types[0] == OutputType.series: + if concat_df.ndim > 1: + if op.inputs[0].ndim == 2: + if axis == 0: + concat_df = concat_df.iloc[0, :] + else: + concat_df = concat_df.iloc[:, 0] + else: + concat_df = concat_df.iloc[:, 0] + concat_df.name = op.outputs[0].name + + concat_df = concat_df.astype(op.outputs[0].dtype, copy=False) + elif op.output_types[0] == OutputType.scalar: + concat_df = concat_df.iloc[0] + try: + concat_df = concat_df.astype(op.outputs[0].dtype) + except AttributeError: + # concat_df may be a string and has no `astype` method + pass + elif op.output_types[0] == OutputType.tensor: + concat_df = xp.array(concat_df).astype(dtype=out.dtype) + else: + if axis == 0: + concat_df = concat_df.reindex(op.outputs[0].index_value.to_pandas()) + else: + concat_df = concat_df[op.outputs[0].columns_value.to_pandas()] + + concat_df = concat_df.astype(op.outputs[0].dtypes, copy=False) + ctx[op.outputs[0].key] = concat_df + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "DataFrameAggregate"): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_agg(ctx, op) + elif not _agg_size_as_series and op.raw_func == "size": + xp = cp if op.gpu else np + ctx[op.outputs[0].key] = xp.array( + ctx[op.inputs[0].key].agg(op.raw_func, axis=op.axis) + ).reshape(op.outputs[0].shape) + else: + xp = cp if op.gpu else np + in_obj = op.inputs[0] + in_data = ctx[in_obj.key] + in_data = cls._select_dtypes(in_data, op) + if isinstance(in_obj, INDEX_CHUNK_TYPE): + result = op.func[0](in_data) + elif ( + op.output_types[0] == OutputType.scalar + and in_data.shape == (0,) + and callable(op.func[0]) + ): + result = op.func[0](in_data) + else: 
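`execute` above runs every stage inside a pandas option scope so that infinities can be treated as missing values while the reduction runs, and the option is always restored in the `finally` block. A small sketch of the effect (illustrative data; note that `mode.use_inf_as_na` is deprecated in recent pandas releases):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.inf, 3.0])
try:
    pd.set_option("mode.use_inf_as_na", True)
    assert s.count() == 2                 # inf is now treated like NaN
    assert s.sum(skipna=True) == 4.0      # and skipped by skipna reductions
finally:
    pd.reset_option("mode.use_inf_as_na")
```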
+ result = in_data.agg(op.raw_func, axis=op.axis) + if op.outputs[0].ndim == 1: + result = result.astype(op.outputs[0].dtype, copy=False) + + if op.output_types[0] == OutputType.tensor: + result = xp.array(result) + ctx[op.outputs[0].key] = result + finally: + pd.reset_option("mode.use_inf_as_na") + + +def is_funcs_aggregate(func, func_kw=None, ndim=2): + func_kw = func_kw or dict() + if ndim == 1 and func is None: + func, func_kw = func_kw, dict() + + to_check = [] + if func is not None: + if isinstance(func, (list, tuple)): + to_check.extend(func) + elif isinstance(func, dict): + if ndim == 2: + for f in func.values(): + if isinstance(f, Iterable) and not isinstance(f, str): + to_check.extend(f) + else: + to_check.append(f) + else: + if any(isinstance(v, tuple) for v in func.values()): + raise TypeError("nested renamer is not supported") + to_check.extend(func.values()) + else: + to_check.append(func) + else: + for v in func_kw.values(): + if ( + not isinstance(v, tuple) + or len(v) != 2 + or (not isinstance(v[1], str) and not callable(v[1])) + ): + raise TypeError("Must provide 'func' or tuples of (column, aggfunc).") + else: + to_check.append(v[1]) + + compiler = ReductionCompiler() + for f in to_check: + if f in _agg_functions: + continue + elif callable(f): + try: + if ndim == 2: + compiler.add_function(f, 2, cols=["A", "B"]) + else: + compiler.add_function(f, 1) + except ValueError: + return False + else: + return False + return True + + +def normalize_reduction_funcs(op, ndim=None): + raw_func = op.raw_func + if ndim == 1 and raw_func is None: + raw_func = op.raw_func_kw + + if raw_func is not None: + if isinstance(raw_func, dict): + if ndim == 2: + new_func = OrderedDict() + for k, v in raw_func.items(): + if isinstance(v, str) or callable(v): + new_func[k] = [v] + else: + new_func[k] = v + op.func = new_func + else: + op.func = list(raw_func.values()) + op.func_rename = list(raw_func.keys()) + elif isinstance(raw_func, Iterable) and not isinstance(raw_func, str): + op.func = list(raw_func) + else: + op.func = [raw_func] + else: + new_func = OrderedDict() + new_func_names = OrderedDict() + for k, v in op.raw_func_kw.items(): + try: + col_funcs = new_func[v[0]] + col_func_names = new_func_names[v[0]] + except KeyError: + col_funcs = new_func[v[0]] = [] + col_func_names = new_func_names[v[0]] = [] + col_funcs.append(v[1]) + col_func_names.append(k) + op.func = new_func + op.func_rename = functools.reduce( + lambda a, b: a + b, new_func_names.values(), [] + ) + + custom_idx = 0 + if isinstance(op.func, list): + custom_iter = (f for f in op.func if isinstance(f, CustomReduction)) + else: + custom_iter = (f for f in op.func.values() if isinstance(f, CustomReduction)) + for r in custom_iter: + if r.name == "": + r.name = f"" + custom_idx += 1 + + +def aggregate(df, func=None, axis=0, **kw): + axis = validate_axis(axis, df) + use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + if ( + df.ndim == 2 + and isinstance(func, dict) + and (df.op.output_types[0] == OutputType.series or axis == 1) + ): + raise NotImplementedError( + "Currently cannot aggregate dicts over axis=1 on %s" % type(df).__name__ + ) + combine_size = kw.pop("_combine_size", None) or options.combine_size + numeric_only = kw.pop("_numeric_only", None) + bool_only = kw.pop("_bool_only", None) + + output_type = kw.pop("_output_type", None) + dtypes = kw.pop("_dtypes", None) + index = kw.pop("_index", None) + + if not is_funcs_aggregate(func, func_kw=kw, ndim=df.ndim): + return df.transform(func, 
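`is_funcs_aggregate` and `normalize_reduction_funcs` above turn the many accepted `agg` spellings — a single name, a list, a per-column dict, or named-aggregation kwargs of `(column, func)` tuples — into one canonical per-column mapping of function lists. A rough illustration of that normalization in plain Python (not the operand attributes themselves):

```python
from collections import OrderedDict

# dict spec: expand every column's entry into a list of functions
raw_func = {"a": "sum", "b": ["min", "max"]}
new_func = OrderedDict(
    (col, [f] if isinstance(f, str) or callable(f) else list(f))
    for col, f in raw_func.items()
)
assert new_func == {"a": ["sum"], "b": ["min", "max"]}

# named-aggregation spec: kwargs of (column, func) tuples are regrouped per column
raw_func_kw = {"total": ("a", "sum"), "low": ("b", "min")}
by_col = OrderedDict()
for out_name, (col, f) in raw_func_kw.items():
    by_col.setdefault(col, []).append(f)
assert by_col == {"a": ["sum"], "b": ["min"]}
```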
axis=axis, _call_agg=True) + + op = DataFrameAggregate( + raw_func=copy.deepcopy(func), + raw_func_kw=copy.deepcopy(kw), + axis=axis, + combine_size=combine_size, + numeric_only=numeric_only, + bool_only=bool_only, + use_inf_as_na=use_inf_as_na, + ) + + return op(df, output_type=output_type, dtypes=dtypes, index=index) diff --git a/python/xorbits/_mars/dataframe/reduction/all.py b/python/xorbits/_mars/dataframe/reduction/all.py new file mode 100644 index 000000000..f01db9b81 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/all.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import ( + DATAFRAME_TYPE, + DataFrameReductionMixin, + DataFrameReductionOperand, + recursive_tile, +) + + +class DataFrameAll(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.ALL + _func_name = "all" + + @property + def is_atomic(self): + return True + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + if op.axis is None and isinstance(in_df, DATAFRAME_TYPE): + dtypes = pd.Series([out_df.dtype]) + index = in_df.dtypes.index + out_df = yield from recursive_tile( + in_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.series, + _dtypes=dtypes, + _index=index, + ) + ) + out_df = yield from recursive_tile( + out_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.scalar, + _dtypes=out_df.dtype, + _index=None, + ) + ) + return [out_df] + else: + return (yield from super().tile(op)) + + def __call__(self, df): + if self.axis is None and isinstance(df, DATAFRAME_TYPE): + return self.new_scalar([df], np.dtype("bool")) + else: + return super().__call__(df) + + +def all_series( + series, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameAll( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def all_dataframe( + df, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + output_types = [OutputType.series] if axis is not None else [OutputType.scalar] + op = DataFrameAll( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=output_types, + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def all_index(idx): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op 
= DataFrameAll(output_types=[OutputType.scalar], use_inf_as_na=use_inf_as_na) + return op(idx) diff --git a/python/xorbits/_mars/dataframe/reduction/any.py b/python/xorbits/_mars/dataframe/reduction/any.py new file mode 100644 index 000000000..36bece7dc --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/any.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import ( + DATAFRAME_TYPE, + DataFrameReductionMixin, + DataFrameReductionOperand, + recursive_tile, +) + + +class DataFrameAny(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.ANY + _func_name = "any" + + @property + def is_atomic(self): + return True + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + if op.axis is None and isinstance(in_df, DATAFRAME_TYPE): + dtypes = pd.Series([out_df.dtype]) + index = in_df.dtypes.index + out_df = yield from recursive_tile( + in_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.series, + _dtypes=dtypes, + _index=index, + ) + ) + out_df = yield from recursive_tile( + out_df.agg( + cls.get_reduction_callable(op), + axis=0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=OutputType.scalar, + _dtypes=out_df.dtype, + _index=None, + ) + ) + return [out_df] + else: + return (yield from super().tile(op)) + + def __call__(self, df): + if self.axis is None and isinstance(df, DATAFRAME_TYPE): + return self.new_scalar([df], np.dtype("bool")) + else: + return super().__call__(df) + + +def any_series( + series, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameAny( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def any_dataframe( + df, + axis=0, + bool_only=None, + skipna=True, + level=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + output_types = [OutputType.series] if axis is not None else [OutputType.scalar] + op = DataFrameAny( + axis=axis, + skipna=skipna, + level=level, + bool_only=bool_only, + combine_size=combine_size, + output_types=output_types, + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def any_index(index): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameAny(output_types=[OutputType.scalar], use_inf_as_na=use_inf_as_na) + return op(index) diff --git a/python/xorbits/_mars/dataframe/reduction/core.py b/python/xorbits/_mars/dataframe/reduction/core.py new file mode 
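For `all`/`any` on a DataFrame with `axis=None`, the `tile` implementations above chain two aggregations: a column-wise series reduction followed by a scalar reduction over that intermediate series. The equivalent in plain pandas (illustrative frame):

```python
import pandas as pd

df = pd.DataFrame({"a": [True, True], "b": [True, False]})

per_column = df.all(axis=0)    # first pass: OutputType.series
overall = per_column.all()     # second pass: OutputType.scalar
assert overall == df.values.all()
```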
100644 index 000000000..283b13538 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/core.py @@ -0,0 +1,1251 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import inspect +from collections import OrderedDict +from typing import Any, Callable, Dict, List, NamedTuple, Optional + +import numpy as np +import pandas as pd + +from ...core import ( + ENTITY_TYPE, + OutputType, + enter_mode, + is_build_mode, + is_kernel_mode, + recursive_tile, +) +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DataTypeField, + Int32Field, + StringField, +) +from ...utils import pd_release_version, tokenize +from ..core import SERIES_TYPE +from ..operands import DATAFRAME_TYPE, DataFrameOperand, DataFrameOperandMixin +from ..utils import ( + build_df, + build_empty_df, + build_empty_series, + build_series, + parse_index, + validate_axis, +) + +# in pandas<1.3, when aggregating with multiple levels and numeric_only is True, +# object cols not ignored with min-max funcs +_level_reduction_keep_object = pd_release_version[:2] < (1, 3) +# in pandas>=1.3, when dataframes are reduced into series, mixture of float and bool +# results in object. +_reduce_bool_as_object = pd_release_version[:2] != (1, 2) + + +class DataFrameReductionOperand(DataFrameOperand): + _axis = AnyField("axis") + _skipna = BoolField("skipna") + _level = AnyField("level") + _numeric_only = BoolField("numeric_only") + _bool_only = BoolField("bool_only") + _min_count = Int32Field("min_count") + _use_inf_as_na = BoolField("use_inf_as_na") + _method = StringField("method") + + _dtype = DataTypeField("dtype") + _combine_size = Int32Field("combine_size") + + def __init__( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + bool_only=None, + min_count=None, + dtype=None, + combine_size=None, + gpu=None, + sparse=None, + output_types=None, + use_inf_as_na=None, + method=None, + **kw, + ): + super().__init__( + _axis=axis, + _skipna=skipna, + _level=level, + _numeric_only=numeric_only, + _bool_only=bool_only, + _min_count=min_count, + _dtype=dtype, + _combine_size=combine_size, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + _use_inf_as_na=use_inf_as_na, + _method=method, + **kw, + ) + + @property + def axis(self): + return self._axis + + @property + def skipna(self): + return self._skipna + + @property + def level(self): + return self._level + + @property + def numeric_only(self): + return self._numeric_only + + @property + def bool_only(self): + return self._bool_only + + @property + def min_count(self): + return self._min_count + + @property + def dtype(self): + return self._dtype + + @property + def combine_size(self): + return self._combine_size + + @property + def use_inf_as_na(self): + return self._use_inf_as_na + + @property + def is_atomic(self): + return False + + @property + def method(self): + return self._method + + def get_reduction_args(self, axis=None): + args = 
dict(skipna=self.skipna) + if self.inputs and self.inputs[0].ndim > 1: + args["axis"] = axis + if self.numeric_only is not None: + args["numeric_only"] = self.numeric_only + if self.bool_only is not None: + args["bool_only"] = self.bool_only + return {k: v for k, v in args.items() if v is not None} + + +class DataFrameCumReductionOperand(DataFrameOperand): + _axis = AnyField("axis") + _skipna = BoolField("skipna") + _use_inf_as_na = BoolField("use_inf_as_na") + + _dtype = DataTypeField("dtype") + + def __init__( + self, + axis=None, + skipna=None, + dtype=None, + gpu=None, + sparse=None, + output_types=None, + use_inf_as_na=None, + **kw, + ): + super().__init__( + _axis=axis, + _skipna=skipna, + _dtype=dtype, + gpu=gpu, + sparse=sparse, + _output_types=output_types, + _use_inf_as_na=use_inf_as_na, + **kw, + ) + + @property + def axis(self): + return self._axis + + @property + def skipna(self): + return self._skipna + + @property + def dtype(self): + return self._dtype + + @property + def use_inf_as_na(self): + return self._use_inf_as_na + + +def _default_agg_fun(value, func_name=None, **kw): + if value.ndim == 1: + kw.pop("bool_only", None) + kw.pop("numeric_only", None) + return getattr(value, func_name)(**kw) + else: + return getattr(value, func_name)(**kw) + + +@functools.lru_cache(100) +def _get_series_reduction_dtype( + dtype, + func_name, + axis=None, + bool_only=False, + skipna=True, + numeric_only=False, +): + test_series = build_series(dtype=dtype, ensure_string=True) + if func_name == "count": + reduced = test_series.count() + elif func_name == "nunique": + reduced = test_series.nunique() + elif func_name in ("all", "any"): + reduced = getattr(test_series, func_name)(axis=axis, bool_only=bool_only) + elif func_name == "size": + reduced = test_series.size + elif func_name == "str_concat": + reduced = pd.Series([test_series.str.cat()]) + else: + reduced = getattr(test_series, func_name)( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + return pd.Series(reduced).dtype + + +@functools.lru_cache(100) +def _get_df_reduction_dtype( + dtype, func_name, axis=None, bool_only=False, skipna=False, numeric_only=False +): + test_df = build_series(dtype=dtype, ensure_string=True).to_frame() + if func_name == "count": + reduced = getattr(test_df, func_name)(axis=axis, numeric_only=numeric_only) + elif func_name == "nunique": + reduced = getattr(test_df, func_name)(axis=axis) + elif func_name in ("all", "any"): + reduced = getattr(test_df, func_name)(axis=axis, bool_only=bool_only) + elif func_name == "str_concat": + reduced = test_df.apply(lambda s: s.str.cat(), axis=axis) + else: + reduced = getattr(test_df, func_name)( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + if len(reduced) == 0: + return None + return reduced.dtype + + +class DataFrameReductionMixin(DataFrameOperandMixin): + @classmethod + def get_reduction_callable(cls, op): + func_name = getattr(op, "_func_name") + kw = dict( + skipna=op.skipna, numeric_only=op.numeric_only, bool_only=op.bool_only + ) + kw = {k: v for k, v in kw.items() if v is not None} + fun = functools.partial(_default_agg_fun, func_name=func_name, **kw) + fun.__name__ = func_name + return fun + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + out_df = op.outputs[0] + + if isinstance(out_df, SERIES_TYPE): + output_type = OutputType.series + dtypes = pd.Series([out_df.dtype], index=[out_df.name]) + index = out_df.index_value.to_pandas() + elif out_df.ndim == 1: + output_type = OutputType.tensor + dtypes, index = out_df.dtype, 
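The cached `_get_series_reduction_dtype`/`_get_df_reduction_dtype` helpers above infer the output dtype by running the reduction on a tiny probe object built from the input dtype, instead of touching real data. A simplified sketch of that trick (`reduction_dtype` is a hypothetical stand-in; the real helpers also thread `axis`, `skipna`, `bool_only` and `numeric_only` through):

```python
import functools

import numpy as np
import pandas as pd


@functools.lru_cache(maxsize=100)
def reduction_dtype(dtype_str: str, func_name: str) -> np.dtype:
    # probe a tiny series of the input dtype rather than the real (possibly huge) data
    probe = pd.Series([1, 2]).astype(dtype_str)
    reduced = getattr(probe, func_name)()
    return pd.Series([reduced]).dtype


assert reduction_dtype("int64", "mean") == np.dtype("float64")
assert reduction_dtype("int64", "sum") == np.dtype("int64")
```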
None + else: + output_type = OutputType.scalar + dtypes, index = out_df.dtype, None + + out_df = yield from recursive_tile( + in_df.agg( + cls.get_reduction_callable(op), + axis=op.axis or 0, + _numeric_only=op.numeric_only, + _bool_only=op.bool_only, + _combine_size=op.combine_size, + _output_type=output_type, + _dtypes=dtypes, + _index=index, + ) + ) + return [out_df] + + def _call_groupby_level(self, df, level): + return df.groupby(level=level).agg( + self.get_reduction_callable(self), method=self.method + ) + + def _call_dataframe(self, df): + axis = getattr(self, "axis", None) or 0 + level = getattr(self, "level", None) + skipna = getattr(self, "skipna", True) + numeric_only = getattr(self, "numeric_only", None) + bool_only = getattr(self, "bool_only", None) + self._axis = axis = validate_axis(axis, df) + func_name = getattr(self, "_func_name") + + if level is not None and axis == 1: + raise NotImplementedError("Not support specify level for axis==1") + + if func_name == "size": + reduced = pd.Series( + np.zeros(df.shape[1 - axis]), + index=df.dtypes.index if axis == 0 else None, + ) + reduced_cols = list(reduced.index) + reduced_dtype = reduced.dtype + elif func_name == "custom_reduction": + empty_df = build_df(df, ensure_string=True) + reduced = getattr(self, "custom_reduction").__call_agg__(empty_df) + reduced_cols = list(reduced.index) + reduced_dtype = reduced.dtype + else: + reduced_cols, dtypes = [], [] + for col, src_dt in df.dtypes.items(): + dt = _get_df_reduction_dtype( + src_dt, + func_name, + axis=axis, + bool_only=bool_only, + skipna=skipna, + numeric_only=numeric_only, + ) + if dt is not None: + reduced_cols.append(col) + dtypes.append(dt) + elif ( + _level_reduction_keep_object + and numeric_only + and level is not None + and func_name in ("min", "max") + and src_dt == np.dtype(object) + ): # pragma: no cover + reduced_cols.append(col) + dtypes.append(np.dtype(object)) + if len(dtypes) == 0: + reduced_dtype = np.dtype("O") + elif all(dt == dtypes[0] for dt in dtypes): + reduced_dtype = dtypes[0] + else: + # as we already bypassed dtypes with same values, + # when has_mixed_bool is True, there are other dtypes + # other than bool. + has_mixed_bool = any(dt == np.dtype(bool) for dt in dtypes) + if _reduce_bool_as_object and has_mixed_bool: + reduced_dtype = np.dtype("O") + elif not all(isinstance(dt, np.dtype) for dt in dtypes): + # todo currently we return mixed dtypes as np.dtype('O'). + # handle pandas Dtypes in the future more carefully. 
+ reduced_dtype = np.dtype("O") + else: + reduced_dtype = np.find_common_type(dtypes, []) + + if level is not None: + return self._call_groupby_level(df[reduced_cols], level) + + if axis == 0: + reduced_shape = (len(reduced_cols),) + reduced_index_value = parse_index(pd.Index(reduced_cols), store_data=True) + else: + reduced_shape = (df.shape[0],) + reduced_index_value = parse_index(pd.RangeIndex(-1)) + + return self.new_series( + [df], + shape=reduced_shape, + dtype=reduced_dtype, + index_value=reduced_index_value, + ) + + def _call_series(self, series): + level = getattr(self, "level", None) + axis = getattr(self, "axis", None) + skipna = getattr(self, "skipna", True) + numeric_only = getattr(self, "numeric_only", None) + bool_only = getattr(self, "bool_only", None) + self._axis = axis = validate_axis(axis or 0, series) + func_name = getattr(self, "_func_name") + + if level is not None: + return self._call_groupby_level(series, level) + + if func_name == "custom_reduction": + empty_series = build_series(series, ensure_string=True) + result_scalar = getattr(self, "custom_reduction").__call_agg__(empty_series) + if hasattr(result_scalar, "to_pandas"): # pragma: no cover + result_scalar = result_scalar.to_pandas() + result_dtype = pd.Series(result_scalar).dtype + else: + result_dtype = _get_series_reduction_dtype( + series.dtype, + func_name, + axis=axis, + bool_only=bool_only, + numeric_only=numeric_only, + skipna=skipna, + ) + return self.new_scalar([series], dtype=result_dtype) + + def __call__(self, a): + if is_kernel_mode() and not getattr(self, "is_atomic", False): + return self.get_reduction_callable(self)(a) + + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a) + else: + return self._call_series(a) + + +class DataFrameCumReductionMixin(DataFrameOperandMixin): + @classmethod + def _tile_one_chunk(cls, op): + df = op.outputs[0] + params = df.params.copy() + + chk = op.inputs[0].chunks[0] + chunk_params = {k: v for k, v in chk.params.items() if k in df.params} + chunk_params["shape"] = df.shape + chunk_params["index"] = chk.index + new_chunk_op = op.copy().reset_key() + chunk = new_chunk_op.new_chunk(op.inputs[0].chunks, kws=[chunk_params]) + + new_op = op.copy() + nsplits = tuple((s,) for s in chunk.shape) + params["chunks"] = [chunk] + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _build_combine(cls, op, input_chunks, summary_chunks, idx): + c = input_chunks[idx] + to_concat_chunks = [c] + for j in range(idx): + to_concat_chunks.append(summary_chunks[j]) + + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.combine + return new_chunk_op.new_chunk(to_concat_chunks, **c.params) + + @classmethod + def _tile_dataframe(cls, op): + in_df = op.inputs[0] + df = op.outputs[0] + + n_rows, n_cols = in_df.chunk_shape + + # map to get individual results and summaries + src_chunks = np.empty(in_df.chunk_shape, dtype=object) + summary_chunks = np.empty(in_df.chunk_shape, dtype=object) + for c in in_df.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + if op.axis == 1: + summary_shape = (c.shape[0], 1) + else: + summary_shape = (1, c.shape[1]) + src_chunks[c.index] = c + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=summary_shape, dtypes=df.dtypes + ) + + # combine summaries into results + output_chunk_array = np.empty(in_df.chunk_shape, dtype=object) + if op.axis == 1: + for row in range(n_rows): + row_src = src_chunks[row, :] + row_summaries = 
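The dtype-merging logic above has to special-case mixtures of bool and other dtypes because, on the pandas versions covered by `_reduce_bool_as_object` (everything except 1.2.x), reducing a frame with both kinds of columns yields an object series rather than a common numeric dtype. A quick check of that behaviour (illustrative frame):

```python
import pandas as pd

df = pd.DataFrame({"flag": [True, False], "x": [1.5, 2.5]})
reduced = df.max()      # one bool column, one float column
print(reduced.dtype)    # object on the affected pandas versions; pandas 1.2 reported a numeric dtype
```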
summary_chunks[row, :] + for col in range(n_cols): + output_chunk_array[row, col] = cls._build_combine( + op, row_src, row_summaries, col + ) + else: + for col in range(n_cols): + col_src = src_chunks[:, col] + col_summaries = summary_chunks[:, col] + for row in range(n_rows): + output_chunk_array[row, col] = cls._build_combine( + op, col_src, col_summaries, row + ) + + output_chunks = list(output_chunk_array.reshape((n_rows * n_cols,))) + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_df.shape, + nsplits=in_df.nsplits, + chunks=output_chunks, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + @classmethod + def _tile_series(cls, op): + in_series = op.inputs[0] + series = op.outputs[0] + + # map to get individual results and summaries + summary_chunks = np.empty(in_series.chunk_shape, dtype=object) + for c in in_series.chunks: + new_chunk_op = op.copy().reset_key() + new_chunk_op.stage = OperandStage.map + summary_chunks[c.index] = new_chunk_op.new_chunk( + [c], shape=(1,), dtype=series.dtype + ) + + # combine summaries into results + output_chunks = [ + cls._build_combine(op, in_series.chunks, summary_chunks, i) + for i in range(len(in_series.chunks)) + ] + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, + shape=in_series.shape, + nsplits=in_series.nsplits, + chunks=output_chunks, + dtype=series.dtype, + index_value=series.index_value, + name=series.name, + ) + + @classmethod + def tile(cls, op): + in_df = op.inputs[0] + if len(in_df.chunks) == 1: + return cls._tile_one_chunk(op) + if isinstance(in_df, DATAFRAME_TYPE): + return cls._tile_dataframe(op) + else: + return cls._tile_series(op) + + @staticmethod + def _get_last_slice(op, df, start): + if op.output_types[0] == OutputType.series: + return df.iloc[start:] + else: + if op.axis == 1: + return df.iloc[:, start:] + else: + return df.iloc[start:, :] + + @classmethod + def _execute_map(cls, ctx, op): + in_data = ctx[op.inputs[0].key] + kwargs = dict() + if op.axis is not None: + kwargs["axis"] = op.axis + if op.skipna is not None: + kwargs["skipna"] = op.skipna + partial = getattr(in_data, getattr(cls, "_func_name"))(**kwargs) + if op.skipna: + partial.fillna(method="ffill", axis=op.axis, inplace=True) + ctx[op.outputs[0].key] = cls._get_last_slice(op, partial, -1) + + @classmethod + def _execute_combine(cls, ctx, op): + kwargs = dict() + if op.axis is not None: + kwargs["axis"] = op.axis + if op.skipna is not None: + kwargs["skipna"] = op.skipna + + if len(op.inputs) > 1: + ref_datas = [ctx[inp.key] for inp in op.inputs[1:]] + concat_df = getattr( + pd.concat(ref_datas, axis=op.axis), getattr(cls, "_func_name") + )(**kwargs) + if op.skipna: + concat_df.fillna(method="ffill", axis=op.axis, inplace=True) + + in_data = ctx[op.inputs[0].key] + concat_df = pd.concat( + [cls._get_last_slice(op, concat_df, -1), in_data], axis=op.axis + ) + result = getattr(concat_df, getattr(cls, "_func_name"))(**kwargs) + ctx[op.outputs[0].key] = cls._get_last_slice(op, result, 1) + else: + ctx[op.outputs[0].key] = getattr( + ctx[op.inputs[0].key], getattr(cls, "_func_name") + )(**kwargs) + + @classmethod + def execute(cls, ctx, op): + try: + pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + else: + return cls._execute_combine(ctx, op) + finally: + pd.reset_option("mode.use_inf_as_na") + + def _call_dataframe(self, df): + axis = getattr(self, "axis", None) or 0 + self._axis = 
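The cumulative-reduction tiling above gives every output chunk its own input chunk plus one-row "summary" chunks from all predecessors, so each chunk can finish its cumulation locally. A simplified, series-level sketch of the same idea for `cumsum` (plain pandas; the real combine stage works on the pre-cumulated summaries rather than re-deriving them):

```python
import pandas as pd

chunks = [pd.Series([1, 2]), pd.Series([3, 4]), pd.Series([5])]

# map stage: keep only the last value of each chunk's local cumsum as a summary
summaries = [c.cumsum().iloc[-1:] for c in chunks]

# combine stage: chunk i prepends the summaries of chunks 0..i-1, cumulates,
# then drops the borrowed summary rows again
outputs = []
for i, c in enumerate(chunks):
    combined = pd.concat(summaries[:i] + [c]).cumsum()
    outputs.append(combined.iloc[i:])          # i one-row summaries were prepended

assert pd.concat(outputs).tolist() == pd.concat(chunks).cumsum().tolist()
```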
axis = validate_axis(axis, df) + + empty_df = build_empty_df(df.dtypes) + reduced_df = getattr(empty_df, getattr(self, "_func_name"))(axis=axis) + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=reduced_df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + def _call_series(self, series): + axis = getattr(self, "axis", None) or 0 + if axis == "index": + axis = 0 + self._axis = axis + + return self.new_series( + [series], + shape=series.shape, + dtype=series.dtype, + name=series.name, + index_value=series.index_value, + ) + + def __call__(self, a): + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a) + else: + return self._call_series(a) + + +class CustomReduction: + name: Optional[str] + output_limit: Optional[int] + kwds: Dict + + # set to True when pre() already performs aggregation + pre_with_agg = False + + def __init__(self, name=None, is_gpu=None): + self.name = name or "" + self.output_limit = 1 + self._is_gpu = is_gpu + + @property + def __name__(self): + return self.name + + def __call__(self, value): + if isinstance(value, ENTITY_TYPE): + from .custom_reduction import build_custom_reduction_result + + return build_custom_reduction_result(value, self) + return self.__call_agg__(value) + + def __call_agg__(self, value): + r = self.pre(value) + if not isinstance(r, tuple): + r = (r,) + # update output limit into actual size + self.output_limit = len(r) + + # only perform aggregation when pre() does not perform aggregation + if not self.pre_with_agg: + r = self.agg(*r) + if not isinstance(r, tuple): + r = (r,) + + r = self.post(*r) + return r + + def is_gpu(self): + return self._is_gpu if not is_build_mode() else False + + def pre(self, value): # noqa: R0201 # pylint: disable=no-self-use + return (value,) + + def agg(self, *values): # noqa: R0201 # pylint: disable=no-self-use + raise NotImplementedError + + def post(self, *value): # noqa: R0201 # pylint: disable=no-self-use + assert len(value) == 1 + return value[0] + + def __mars_tokenize__(self): + import cloudpickle + + return cloudpickle.dumps(self) + + +class ReductionPreStep(NamedTuple): + input_key: str + output_key: str + columns: Optional[List[str]] + func: Callable + + +class ReductionAggStep(NamedTuple): + input_key: str + raw_func_name: Optional[str] + map_func_name: Optional[str] + agg_func_name: Optional[str] + custom_reduction: Optional[CustomReduction] + output_key: str + output_limit: int + kwds: Dict[str, Any] + + +class ReductionPostStep(NamedTuple): + input_keys: List[str] + output_key: str + func_name: str + columns: Optional[List[str]] + func: Callable + + +class ReductionSteps(NamedTuple): + pre_funcs: List[ReductionPreStep] + agg_funcs: List[ReductionAggStep] + post_funcs: List[ReductionPostStep] + + +# lookup table for numpy arithmetic operands in pandas +_func_name_converts = dict( + greater="gt", + greater_equal="ge", + less="lt", + less_equal="le", + equal="eq", + not_equal="ne", + true_divide="truediv", + floor_divide="floordiv", + power="pow", +) +_func_name_to_op = dict( + greater=">", + gt=">", + greater_equal=">=", + ge=">", + less="<", + lt="<", + less_equal="<=", + le="<=", + equal="==", + eq="==", + not_equal="!=", + ne="!=", + bitwise_and="&", + __and__="&", + bitwise_or="|", + __or__="|", + bitwise_xor="^", + __xor__="^", + add="+", + subtract="-", + sub="-", + multiply="*", + mul="*", + true_divide="/", + truediv="/", + floor_divide="//", + floordiv="//", + power="**", + pow="**", + mod="%", +) +_func_compile_cache = dict() # type: 
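`CustomReduction` above splits a user-defined reduction into `pre` (per-chunk partials), `agg` (merge partials across chunks) and `post` (finalize). A hedged sketch of the protocol for a mean (`ChunkMean` is a hypothetical example; the exact container types `agg` receives depend on the surrounding map/combine stages):

```python
# CustomReduction is the base class defined in this core module
from xorbits._mars.dataframe.reduction.core import CustomReduction


class ChunkMean(CustomReduction):
    def pre(self, value):
        # per-chunk partial results
        return value.sum(), value.count()

    def agg(self, sums, counts):
        # merge partials coming from different chunks
        return sums.sum(), counts.sum()

    def post(self, total, count):
        # derive the final statistic from the merged partials
        return total / count
```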
Dict[str, ReductionSteps] + + +class ReductionCompiler: + def __init__(self, axis=0, store_source=False): + self._axis = axis + self._store_source = store_source + + self._key_to_tileable = dict() + self._output_tileables = [] + self._lambda_counter = 0 + self._custom_counter = 0 + self._func_cache = dict() + + self._compiled_funcs = [] + self._output_key_to_pre_steps = dict() + self._output_key_to_pre_cols = dict() + self._output_key_to_agg_steps = dict() + self._output_key_to_post_steps = dict() + self._output_key_to_post_cols = dict() + + @classmethod + def _check_function_valid(cls, func): + if isinstance(func, functools.partial): + return cls._check_function_valid(func.func) + elif isinstance(func, CustomReduction): + return + + func_code = func.__code__ + func_vars = {n: func.__globals__.get(n) for n in func_code.co_names} + if func.__closure__: + func_vars.update( + { + n: cell.cell_contents + for n, cell in zip(func_code.co_freevars, func.__closure__) + } + ) + # external Mars objects shall not be referenced + for var_name, val in func_vars.items(): + if isinstance(val, ENTITY_TYPE): + raise ValueError( + f"Variable {var_name} used by {func.__name__} " + "cannot be a Mars object" + ) + + @staticmethod + def _update_col_dict(col_dict: Dict, key: str, cols: List): + if key in col_dict: + existing_cols = col_dict[key] + if existing_cols is not None: + existing_col_set = set(existing_cols) + col_dict[key].extend([c for c in cols if c not in existing_col_set]) + else: + col_dict[key] = list(cols) if cols is not None else None + + def add_function(self, func, ndim, cols=None, func_name=None): + from .aggregation import _agg_functions + + cols = cols if cols is not None and self._axis == 0 else None + + func_name = func_name or getattr(func, "__name__", None) + if func_name == "" or func_name is None: + func_name = f"" + self._lambda_counter += 1 + if func_name == "" or func_name is None: + func_name = f"" + self._custom_counter += 1 + + if inspect.isbuiltin(func): + raw_func_name = getattr(func, "__name__", "N/A") + if raw_func_name in _agg_functions: + func = _agg_functions[raw_func_name] + else: + raise ValueError(f"Unexpected built-in function {raw_func_name}") + + compile_result = self._compile_function(func, func_name, ndim=ndim) + self._compiled_funcs.append(compile_result) + + for step in compile_result.pre_funcs: + self._output_key_to_pre_steps[step.output_key] = step + self._update_col_dict(self._output_key_to_pre_cols, step.output_key, cols) + + for step in compile_result.agg_funcs: + self._output_key_to_agg_steps[step.output_key] = step + + for step in compile_result.post_funcs: + self._output_key_to_post_steps[step.output_key] = step + self._update_col_dict(self._output_key_to_post_cols, step.output_key, cols) + + def _compile_expr_function(self, py_src: str, local_consts: dict): + from ... 
import dataframe, tensor + + result_store = dict() + global_vars = globals().copy() + global_vars.update(local_consts) + global_vars.update(dict(mt=tensor, md=dataframe, array=np.array, nan=np.nan)) + exec( + py_src, global_vars, result_store + ) # noqa: W0122 # nosec # pylint: disable=exec-used + fun = result_store["expr_function"] + if self._store_source: + fun.__source__ = py_src + return fun + + @staticmethod + def _build_mock_return_object(func, input_dtype, ndim): + from ..initializer import DataFrame as MarsDataFrame + from ..initializer import Series as MarsSeries + + if ndim == 1: + mock_series = build_empty_series(np.dtype(input_dtype)) + mock_obj = MarsSeries(mock_series) + else: + mock_df = build_empty_df( + pd.Series([np.dtype(input_dtype)] * 2, index=["A", "B"]) + ) + mock_obj = MarsDataFrame(mock_df) + + # calc target tileable to generate DAG + with enter_mode(kernel=True, build=False): + return func(mock_obj) + + @enter_mode(build=True) + def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps: + from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp + from ...tensor.base import TensorWhere + from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp + from ..datasource.dataframe import DataFrameDataSource + from ..datasource.series import SeriesDataSource + from ..indexing.where import DataFrameWhere + + func_token = tokenize(func, self._axis, func_name, ndim) + if func_token in _func_compile_cache: + return _func_compile_cache[func_token] + custom_reduction = func if isinstance(func, CustomReduction) else None + + self._check_function_valid(func) + + try: + func_ret = self._build_mock_return_object(func, float, ndim=ndim) + except (TypeError, AttributeError): + # we may encounter lambda x: x.str.cat(...), use an object series to test + func_ret = self._build_mock_return_object(func, object, ndim=1) + output_limit = getattr(func, "output_limit", None) or 1 + + if not isinstance(func_ret, ENTITY_TYPE): + raise ValueError( + f"Custom function should return a Mars object, not {type(func_ret)}" + ) + if func_ret.ndim >= ndim: + raise ValueError("Function not a reduction") + + agg_graph = func_ret.build_graph() + agg_tileables = set(t for t in agg_graph if getattr(t.op, "is_atomic", False)) + # check operands before aggregation + for t in agg_graph.dfs( + list(agg_tileables), visit_predicate="all", reverse=True + ): + if t not in agg_tileables and not isinstance( + t.op, + ( + DataFrameUnaryOp, + DataFrameBinOp, + TensorUnaryOp, + TensorBinOp, + TensorWhere, + DataFrameWhere, + DataFrameDataSource, + SeriesDataSource, + ), + ): + raise ValueError(f"Cannot support operand {type(t.op)} in aggregation") + # check operands after aggregation + for t in agg_graph.dfs(list(agg_tileables), visit_predicate="all"): + if t not in agg_tileables and not isinstance( + t.op, + ( + DataFrameUnaryOp, + DataFrameBinOp, + TensorWhere, + DataFrameWhere, + TensorUnaryOp, + TensorBinOp, + ), + ): + raise ValueError(f"Cannot support operand {type(t.op)} in aggregation") + + pre_funcs, agg_funcs, post_funcs = [], [], [] + visited_inputs = set() + # collect aggregations and their inputs + for t in agg_tileables: + agg_input_key = t.inputs[0].key + + # collect agg names + step_func_name = getattr(t.op, "_func_name") + if step_func_name in ("count", "size"): + map_func_name, agg_func_name = step_func_name, "sum" + else: + map_func_name, agg_func_name = step_func_name, step_func_name + + # build agg description + agg_funcs.append( + ReductionAggStep( + agg_input_key, + 
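`_compile_expr_function` above turns generated source into a callable by `exec`-ing it against a controlled globals dict carrying numpy, the dataframe/tensor modules and any captured constants. A stripped-down stand-in showing the mechanism (the source string here is a hand-written example of the `invar*`/`var*`/`_const_*` naming used by the compiler, not actual compiler output):

```python
import numpy as np


def compile_expr_function(py_src: str, local_consts: dict):
    # exec the generated source in a controlled namespace and pull the function out
    result_store = {}
    global_vars = {"np": np, "nan": np.nan, **local_consts}
    exec(py_src, global_vars, result_store)  # nosec - source is generated, not user input
    return result_store["expr_function"]


src = (
    "def expr_function(invar0, gpu=None):\n"
    "    var0 = invar0 + _const_0\n"
    "    return var0\n"
)
fn = compile_expr_function(src, {"_const_0": 1})
assert fn(41) == 42
```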
func_name, + map_func_name, + agg_func_name, + custom_reduction, + t.key, + output_limit, + t.op.get_reduction_args(axis=self._axis), + ) + ) + # collect agg input and build function + if agg_input_key not in visited_inputs: + visited_inputs.add(agg_input_key) + initial_inputs = list(t.inputs[0].build_graph().iter_indep()) + assert len(initial_inputs) == 1 + input_key = initial_inputs[0].key + + func_str, _, local_consts = self._generate_function_str(t.inputs[0]) + pre_funcs.append( + ReductionPreStep( + input_key, + agg_input_key, + None, + self._compile_expr_function(func_str, local_consts), + ) + ) + # collect function output after agg + func_str, input_keys, local_consts = self._generate_function_str(func_ret) + post_funcs.append( + ReductionPostStep( + input_keys, + func_ret.key, + func_name, + None, + self._compile_expr_function(func_str, local_consts), + ) + ) + if len(_func_compile_cache) > 100: # pragma: no cover + _func_compile_cache.pop(next(iter(_func_compile_cache.keys()))) + result = _func_compile_cache[func_token] = ReductionSteps( + pre_funcs, agg_funcs, post_funcs + ) + return result + + def _generate_function_str(self, out_tileable): + """ + Generate python code from tileable DAG + """ + from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp + from ...tensor.base import TensorWhere + from ...tensor.datasource import Scalar + from ..arithmetic.core import ( + DataFrameBinOp, + DataFrameUnaryOp, + DataFrameUnaryUfunc, + ) + from ..datasource.dataframe import DataFrameDataSource + from ..datasource.series import SeriesDataSource + from ..indexing.where import DataFrameWhere + + input_key_to_var = OrderedDict() + local_key_to_var = dict() + local_consts_to_val = dict() + ref_counts = dict() + ref_visited = set() + local_lines = [] + + input_op_types = ( + DataFrameDataSource, + SeriesDataSource, + DataFrameReductionOperand, + ) + + def _calc_ref_counts(t): + # calculate object refcount for t, this reduces memory usage in functions + if t.key in ref_visited: + return + ref_visited.add(t.key) + for inp in t.inputs: + _calc_ref_counts(inp) + + if not isinstance(inp.op, input_op_types): + if inp.key not in ref_counts: + ref_counts[inp.key] = 0 + ref_counts[inp.key] += 1 + + def _gen_expr_str(t): + # generate code for t + if t.key in local_key_to_var: + return + + if isinstance(t.op, input_op_types): + # tileable is an input arg, build a function variable + if t.key not in input_key_to_var: # pragma: no branch + input_key_to_var[t.key] = local_key_to_var[ + t.key + ] = f"invar{len(input_key_to_var)}" + else: + keys_to_del = [] + for inp in t.inputs: + _gen_expr_str(inp) + + if inp.key in ref_counts: + ref_counts[inp.key] -= 1 + if ref_counts[inp.key] == 0: + # the input is no longer referenced, a del statement will be produced + keys_to_del.append(inp.key) + + var_name = local_key_to_var[t.key] = f"var{len(local_key_to_var)}" + keys_to_vars = {inp.key: local_key_to_var[inp.key] for inp in t.inputs} + + def _interpret_var(v): + # get representation for variables + if hasattr(v, "key"): + return keys_to_vars[v.key] + elif isinstance(v, (int, bool, str, bytes, np.integer, np.bool_)): + return repr(v) + else: + const_name = f"_const_{len(local_consts_to_val)}" + local_consts_to_val[const_name] = v + return const_name + + func_name = func_name_raw = getattr(t.op, "_func_name", None) + rfunc_name = getattr(t.op, "_rfunc_name", func_name) + + if func_name is None: + func_name = func_name_raw = getattr(t.op, "_bit_func_name", None) + rfunc_name = getattr(t.op, "_bit_rfunc_name", 
func_name) + + # handle function name differences between numpy and pandas arithmetic ops + if func_name in _func_name_converts: + func_name = _func_name_converts[func_name] + if rfunc_name in _func_name_converts: + rfunc_name = "r" + _func_name_converts[rfunc_name] + + # build given different op types + if isinstance(t.op, (DataFrameUnaryOp, TensorUnaryOp)): + val = _interpret_var(t.inputs[0]) + if isinstance(t.op, DataFrameUnaryUfunc): + statements = [f"{var_name} = np.{func_name_raw}({val})"] + else: + statements = [ + f"try:", + f" {var_name} = {val}.{func_name}()", + f"except AttributeError:", + f" {var_name} = np.{func_name_raw}({val})", + ] + elif isinstance(t.op, (DataFrameBinOp, TensorBinOp)): + lhs, rhs = t.op.lhs, t.op.rhs + op_axis = ( + 1 - self._axis + if hasattr(lhs, "ndim") + and hasattr(rhs, "ndim") + and lhs.ndim != rhs.ndim + else None + ) + lhs = _interpret_var(lhs) + rhs = _interpret_var(rhs) + axis_expr = f"axis={op_axis!r}, " if op_axis is not None else "" + op_str = _func_name_to_op[func_name] + if t.op.lhs is t.inputs[0]: + statements = [ + f"try:", + f" {var_name} = {lhs}.{func_name}({rhs}, {axis_expr})", + f"except AttributeError:", + f" {var_name} = {lhs} {op_str} {rhs}", + ] + else: + statements = [ + f"try:", + f" {var_name} = {rhs}.{rfunc_name}({lhs}, {axis_expr})", + f"except AttributeError:", + f" {var_name} = {rhs} {op_str} {lhs}", + ] + elif isinstance(t.op, TensorWhere): + cond = _interpret_var(t.op.condition) + x = _interpret_var(t.op.x) + y = _interpret_var(t.op.y) + statements = [ + f"if not gpu:", + f" {var_name} = np.where({cond}, {x}, {y})", + f"else:", # there is a bug with cudf.where + f" {var_name} = {x}", + ] + elif isinstance(t.op, DataFrameWhere): + func_name = "mask" if t.op.replace_true else "where" + inp = _interpret_var(t.op.input) + cond = _interpret_var(t.op.cond) + other = _interpret_var(t.op.other) + statements = [ + f"if not gpu:", + f" {var_name} = {inp}.{func_name}({cond}, {other}, " + f"axis={t.op.axis!r}, level={t.op.level!r})", + f"else:", # there is a bug with cudf.where + f" {var_name} = {inp}", + ] + elif isinstance(t.op, Scalar): + # for scalar inputs of other operands + data = _interpret_var(t.op.data) + statements = [f"{var_name} = {data}"] + else: # pragma: no cover + raise NotImplementedError( + f"Does not support aggregating on {type(t.op)}" + ) + + # append del statements for used inputs + for key in keys_to_del: + statements.append(f"del {local_key_to_var[key]}") + + local_lines.extend(statements) + + _calc_ref_counts(out_tileable) + _gen_expr_str(out_tileable) + + args_str = ", ".join(input_key_to_var.values()) + lines_str = "\n ".join(local_lines) + return ( + f"def expr_function({args_str}, gpu=None):\n" + f" {lines_str}\n" + f" return {local_key_to_var[out_tileable.key]}", + list(input_key_to_var.keys()), + local_consts_to_val, + ) + + def compile(self) -> ReductionSteps: + pre_funcs, agg_funcs, post_funcs = [], [], [] + referred_cols = set() + for key, step in self._output_key_to_pre_steps.items(): + cols = self._output_key_to_pre_cols[key] + if cols: + referred_cols.update(cols) + pre_funcs.append( + ReductionPreStep(step.input_key, step.output_key, cols, step.func) + ) + + for step in self._output_key_to_agg_steps.values(): + agg_funcs.append(step) + + for key, step in self._output_key_to_post_steps.items(): + cols = self._output_key_to_post_cols[key] + if cols and set(cols) == set(referred_cols): + post_cols = None + else: + post_cols = cols + + func_name = step.func_name + if self._lambda_counter == 1 and 
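The binary-op statements generated above wrap each pandas flex-method call in `try`/`except AttributeError`, so the same source still works when an operand has already degraded to a numpy array or scalar that only supports the plain operator. The pattern in isolation (hypothetical `greater` helper):

```python
import numpy as np
import pandas as pd


def greater(lhs, rhs, axis=0):
    # prefer the pandas method, which accepts an axis for broadcasting,
    # and fall back to the operator for numpy/scalar operands
    try:
        return lhs.gt(rhs, axis=axis)
    except AttributeError:
        return lhs > rhs


df = pd.DataFrame({"a": [1, 5], "b": [3, 2]})
print(greater(df, pd.Series([2, 4], index=df.index), axis=0))  # column-wise broadcast
print(greater(np.array([1, 5]), 3))                            # operator fallback
```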
step.func_name == "": + func_name = "" + if self._custom_counter == 1 and step.func_name == "": + func_name = "" + + post_funcs.append( + ReductionPostStep( + step.input_keys, + step.output_key, + func_name, + post_cols, + step.func, + ) + ) + + return ReductionSteps(pre_funcs, agg_funcs, post_funcs) diff --git a/python/xorbits/_mars/dataframe/reduction/count.py b/python/xorbits/_mars/dataframe/reduction/count.py new file mode 100644 index 000000000..3b4643fe2 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/count.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameCount(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.COUNT + _func_name = "count" + + @property + def is_atomic(self): + return True + + @classmethod + def get_reduction_callable(cls, op): + skipna, numeric_only = op.skipna, op.numeric_only + + def count(value): + if value.ndim == 1: + return value.count() + return value.count(skipna=skipna, numeric_only=numeric_only) + + return count + + +def count_series(series, level=None, combine_size=None, **kw): + use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + method = kw.pop("method", None) + op = DataFrameCount( + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def count_dataframe( + df, axis=0, level=None, numeric_only=False, combine_size=None, **kw +): + use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) + method = kw.pop("method", None) + op = DataFrameCount( + axis=axis, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cummax.py b/python/xorbits/_mars/dataframe/reduction/cummax.py new file mode 100644 index 000000000..b48976c27 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cummax.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCummax(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMMAX + _func_name = "cummax" + + +def cummax(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCummax( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cummin.py b/python/xorbits/_mars/dataframe/reduction/cummin.py new file mode 100644 index 000000000..502a59749 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cummin.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCummin(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMMIN + _func_name = "cummin" + + +def cummin(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCummin( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cumprod.py b/python/xorbits/_mars/dataframe/reduction/cumprod.py new file mode 100644 index 000000000..22b3d99f9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cumprod.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCumprod(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMPROD + _func_name = "cumprod" + + +def cumprod(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCumprod( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cumsum.py b/python/xorbits/_mars/dataframe/reduction/cumsum.py new file mode 100644 index 000000000..964e721c3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/cumsum.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand + + +class DataFrameCumsum(DataFrameCumReductionOperand, DataFrameCumReductionMixin): + _op_type_ = OperandDef.CUMSUM + _func_name = "cumsum" + + +def cumsum(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameCumsum( + axis=axis, + skipna=skipna, + output_types=df.op.output_types, + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/custom_reduction.py b/python/xorbits/_mars/dataframe/reduction/custom_reduction.py new file mode 100644 index 000000000..e4e66afa9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/custom_reduction.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import AnyField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameCustomReduction(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.CUSTOM_REDUCTION + _func_name = "custom_reduction" + + custom_reduction = AnyField("custom_reduction") + + @property + def is_atomic(self): + return True + + def get_reduction_args(self, axis=None): + return dict() + + +def build_custom_reduction_result(df, custom_reduction_obj, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + output_type = OutputType.series if df.ndim == 2 else OutputType.scalar + op = DataFrameCustomReduction( + custom_reduction=custom_reduction_obj, + output_types=[output_type], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/kurtosis.py b/python/xorbits/_mars/dataframe/reduction/kurtosis.py new file mode 100644 index 000000000..b8e66d02f --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/kurtosis.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...config import options +from ...core import ENTITY_TYPE, OutputType +from ...serialization.serializables import BoolField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameKurtosis(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.KURTOSIS + _func_name = "kurt" + + _bias = BoolField("bias") + _fisher = BoolField("fisher") + + def __init__(self, bias=None, fisher=None, **kw): + super().__init__(_bias=bias, _fisher=fisher, **kw) + + @property + def bias(self): + return self._bias + + @property + def fisher(self): + return self._fisher + + @classmethod + def get_reduction_callable(cls, op): + from .aggregation import where_function + + skipna, bias, fisher = op.skipna, op.bias, op.fisher + + def kurt(x): + cnt = x.count() + mean = x.mean(skipna=skipna) + divided = ( + (x**4).mean(skipna=skipna) + - 4 * (x**3).mean(skipna=skipna) * mean + + 6 * (x**2).mean(skipna=skipna) * mean**2 + - 3 * mean**4 + ) + var = x.var(skipna=skipna, ddof=0) + if isinstance(var, ENTITY_TYPE) or var > 0: + val = where_function(var > 0, divided / var**2, np.nan) + else: + val = np.nan + if not bias: + val = where_function( + (var > 0) & (cnt > 3), + (val * (cnt**2 - 1) - 3 * (cnt - 1) ** 2) / (cnt - 2) / (cnt - 3), + np.nan, + ) + if not fisher: + val += 3 + return val + + return kurt + + +def kurt_series( + df, + axis=None, + skipna=True, + level=None, + combine_size=None, + bias=False, + fisher=True, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameKurtosis( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + bias=bias, + fisher=fisher, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def kurt_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + bias=False, + fisher=True, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameKurtosis( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + bias=bias, + fisher=fisher, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/max.py b/python/xorbits/_mars/dataframe/reduction/max.py new file mode 100644 index 000000000..6a6e38ad7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/max.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameMax(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.MAX + _func_name = "max" + + @property + def is_atomic(self): + return True + + +def max_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMax( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def max_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMax( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def max_index(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMax( + axis=axis, + skipna=skipna, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/mean.py b/python/xorbits/_mars/dataframe/reduction/mean.py new file mode 100644 index 000000000..72a9196ba --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/mean.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameMean(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.MEAN + _func_name = "mean" + + @classmethod + def get_reduction_callable(cls, op): + skipna = op.skipna + + def mean(x): + return x.sum(skipna=skipna) / x.count() + + return mean + + +def mean_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMean( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def mean_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMean( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/min.py b/python/xorbits/_mars/dataframe/reduction/min.py new file mode 100644 index 000000000..908b5a479 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/min.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameMin(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.MIN + _func_name = "min" + + @property + def is_atomic(self): + return True + + +def min_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMin( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def min_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMin( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def min_index(df, axis=None, skipna=True): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameMin( + axis=axis, + skipna=skipna, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/nunique.py b/python/xorbits/_mars/dataframe/reduction/nunique.py new file mode 100644 index 000000000..0ccd6472a --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/nunique.py @@ -0,0 +1,240 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import BoolField +from ...utils import lazy_import +from ..arrays import ArrowListArray, ArrowListDtype +from .core import CustomReduction, DataFrameReductionMixin, DataFrameReductionOperand + +cudf = lazy_import("cudf") + + +class NuniqueReduction(CustomReduction): + pre_with_agg = True + + def __init__( + self, name="unique", axis=0, dropna=True, use_arrow_dtype=False, is_gpu=False + ): + super().__init__(name, is_gpu=is_gpu) + self._axis = axis + self._dropna = dropna + self._use_arrow_dtype = use_arrow_dtype + + @staticmethod + def _drop_duplicates_to_arrow(v, explode=False): + if explode: + v = v.explode() + try: + return ArrowListArray([v.drop_duplicates().to_numpy()]) + except pa.ArrowInvalid: + # fallback due to diverse dtypes + return [v.drop_duplicates().to_list()] + + def pre(self, in_data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + if isinstance(in_data, xdf.Series): + unique_values = in_data.drop_duplicates() + return xdf.Series(unique_values, name=in_data.name) + else: + if self._axis == 0: + data = dict() + for d, v in in_data.iteritems(): + if not self._use_arrow_dtype or xdf is cudf: + data[d] = [v.drop_duplicates().to_list()] + else: + data[d] = self._drop_duplicates_to_arrow(v) + df = xdf.DataFrame(data) + else: + df = xdf.DataFrame(columns=[0]) + for d, v in in_data.iterrows(): + if not self._use_arrow_dtype or xdf is cudf: + df.loc[d] = [v.drop_duplicates().to_list()] + else: + df.loc[d] = self._drop_duplicates_to_arrow(v) + return df + + def agg(self, in_data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + if isinstance(in_data, xdf.Series): + unique_values = in_data.explode().drop_duplicates() + return xdf.Series(unique_values, name=in_data.name) + else: + if self._axis == 0: + data = dict() + for d, v in in_data.iteritems(): + if not self._use_arrow_dtype or xdf is cudf: + data[d] = [v.explode().drop_duplicates().to_list()] + else: + v = pd.Series(v.to_numpy()) + data[d] = self._drop_duplicates_to_arrow(v, explode=True) + df = xdf.DataFrame(data) + else: + df = xdf.DataFrame(columns=[0]) + for d, v in in_data.iterrows(): + if not self._use_arrow_dtype or xdf is cudf: + df.loc[d] = [v.explode().drop_duplicates().to_list()] + else: + df.loc[d] = self._drop_duplicates_to_arrow(v, explode=True) + return df + + def post(self, in_data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + if isinstance(in_data, xdf.Series): + return in_data.explode().nunique(dropna=self._dropna) + else: + in_data_iter = ( + in_data.iteritems() if self._axis == 0 else in_data.iterrows() + ) + data = dict() + for d, v in in_data_iter: + if isinstance(v.dtype, ArrowListDtype): + v = xdf.Series(v.to_numpy()) + data[d] = v.explode().nunique(dropna=self._dropna) + return xdf.Series(data) + + +class DataFrameNunique(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.NUNIQUE + _func_name = "nunique" + + _dropna = BoolField("dropna") + _use_arrow_dtype = BoolField("use_arrow_dtype") + + def __init__(self, dropna=None, use_arrow_dtype=None, **kw): + super().__init__(_dropna=dropna, _use_arrow_dtype=use_arrow_dtype, **kw) + + @property + def dropna(self): + return self._dropna + + @property + def use_arrow_dtype(self): + return self._use_arrow_dtype + + @classmethod + def get_reduction_callable(cls, op): + return 
NuniqueReduction( + name=cls._func_name, + axis=op.axis, + dropna=op.dropna, + use_arrow_dtype=op.use_arrow_dtype, + is_gpu=op.is_gpu(), + ) + + +def nunique_dataframe(df, axis=0, dropna=True, combine_size=None): + """ + Count distinct observations over requested axis. + + Return Series with number of distinct observations. Can ignore NaN + values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. + combine_size : int, optional + The number of chunks to combine. + + Returns + ------- + Series + + See Also + -------- + Series.nunique: Method nunique for Series. + DataFrame.count: Count non-NA cells for each column or row. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) + >>> df.nunique().execute() + A 3 + B 1 + dtype: int64 + + >>> df.nunique(axis=1).execute() + 0 1 + 1 2 + 2 2 + dtype: int64 + """ + op = DataFrameNunique( + axis=axis, + dropna=dropna, + combine_size=combine_size, + output_types=[OutputType.series], + use_arrow_dtype=options.dataframe.use_arrow_dtype, + ) + return op(df) + + +def nunique_series(series, dropna=True, combine_size=None): + """ + Return number of unique elements in the object. + + Excludes NA values by default. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the count. + combine_size : int, optional + The number of chunks to combine. + + Returns + ------- + int + + See Also + -------- + DataFrame.nunique: Method nunique for DataFrame. + Series.count: Count non-NA/null observations in the Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 3, 5, 7, 7]) + >>> s.execute() + 0 1 + 1 3 + 2 5 + 3 7 + 4 7 + dtype: int64 + + >>> s.nunique().execute() + 4 + """ + op = DataFrameNunique( + dropna=dropna, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_arrow_dtype=options.dataframe.use_arrow_dtype, + ) + return op(series) diff --git a/python/xorbits/_mars/dataframe/reduction/prod.py b/python/xorbits/_mars/dataframe/reduction/prod.py new file mode 100644 index 000000000..ca1e9caee --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/prod.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...config import options +from ...core import OutputType +from .aggregation import where_function +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameProd(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.PROD + _func_name = "prod" + + @property + def is_atomic(self): + return self.min_count == 0 + + @classmethod + def get_reduction_callable(cls, op): + skipna, min_count = op.skipna, op.min_count + + def prod(value): + if min_count == 0: + return value.prod(skipna=skipna) + else: + return where_function( + value.count() >= min_count, value.prod(skipna=skipna), np.nan + ) + + return prod + + +def prod_series( + df, axis=None, skipna=True, level=None, min_count=0, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameProd( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def prod_dataframe( + df, + axis=None, + skipna=True, + level=None, + min_count=0, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameProd( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/reduction_size.py b/python/xorbits/_mars/dataframe/reduction/reduction_size.py new file mode 100644 index 000000000..3aa4dfbb5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/reduction_size.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSize(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.REDUCTION_SIZE + _func_name = "size" + + @property + def is_atomic(self): + return True + + +def size_series(df): + op = DataFrameSize(output_types=[OutputType.scalar]) + return op(df) + + +def size_dataframe(df): + op = DataFrameSize(output_types=[OutputType.series]) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/sem.py b/python/xorbits/_mars/dataframe/reduction/sem.py new file mode 100644 index 000000000..a11c73105 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/sem.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import Int32Field +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSem(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.SEM + _func_name = "sem" + + _ddof = Int32Field("ddof") + + def __init__(self, ddof=None, **kw): + super().__init__(_ddof=ddof, **kw) + + @property + def ddof(self): + return self._ddof + + @classmethod + def get_reduction_callable(cls, op): + skipna, ddof = op.skipna, op.ddof + + def sem(x): + var = x.var(skipna=skipna, ddof=ddof) + cnt = x.count() + return (var / cnt) ** 0.5 + + return sem + + +def sem_series( + series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def sem_dataframe( + df, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/skew.py b/python/xorbits/_mars/dataframe/reduction/skew.py new file mode 100644 index 000000000..5609535ea --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/skew.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes +from ...config import options +from ...core import ENTITY_TYPE, OutputType +from ...serialization.serializables import BoolField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSkew(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.SKEW + _func_name = "skew" + + _bias = BoolField("bias") + + def __init__(self, bias=None, **kw): + super().__init__(_bias=bias, **kw) + + @property + def bias(self): + return self._bias + + @classmethod + def get_reduction_callable(cls, op): + from .aggregation import where_function + + skipna, bias = op.skipna, op.bias + + def skew(x): + cnt = x.count() + mean = x.mean(skipna=skipna) + divided = ( + (x**3).mean(skipna=skipna) + - 3 * (x**2).mean(skipna=skipna) * mean + + 2 * mean**3 + ) + var = x.var(skipna=skipna, ddof=0) + if isinstance(var, ENTITY_TYPE) or var > 0: + val = where_function(var > 0, divided / var**1.5, np.nan) + else: + val = np.nan + if not bias: + val = where_function( + (var > 0) & (cnt > 2), + val * ((cnt * (cnt - 1)) ** 0.5 / (cnt - 2)), + np.nan, + ) + return val + + return skew + + +def skew_series( + df, axis=None, skipna=True, level=None, combine_size=None, bias=False, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSkew( + axis=axis, + skipna=skipna, + level=level, + combine_size=combine_size, + bias=bias, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def skew_dataframe( + df, + axis=None, + skipna=True, + level=None, + numeric_only=None, + combine_size=None, + bias=False, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSkew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + bias=bias, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/std.py b/python/xorbits/_mars/dataframe/reduction/std.py new file mode 100644 index 000000000..a2d446acf --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/std.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...tensor.arithmetic import sqrt +from .var import var_dataframe, var_series + + +def std_dataframe( + df, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + combine_size=None, + method=None, +): + ret = sqrt( + var_dataframe( + df, + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + combine_size=combine_size, + method=method, + ) + ) + return ret + + +def std_series( + series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None +): + ret = sqrt( + var_series( + series, + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + combine_size=combine_size, + method=method, + ) + ) + return ret diff --git a/python/xorbits/_mars/dataframe/reduction/str_concat.py b/python/xorbits/_mars/dataframe/reduction/str_concat.py new file mode 100644 index 000000000..2e2c82835 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/str_concat.py @@ -0,0 +1,59 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import StringField +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameStrConcat(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.STR_CONCAT + _func_name = "str_concat" + + _sep = StringField("sep") + _na_rep = StringField("na_rep") + + def __init__(self, sep=None, na_rep=None, **kw): + super().__init__(_sep=sep, _na_rep=na_rep, **kw) + + @property + def sep(self): + return self._sep + + @property + def na_rep(self): + return self._na_rep + + def get_reduction_args(self, axis=None): + return dict(sep=self._sep, na_rep=self._na_rep) + + @property + def is_atomic(self): + return True + + @classmethod + def get_reduction_callable(cls, op): + sep, na_rep = op.sep, op.na_rep + + def str_concat(obj): + return build_str_concat_object(obj, sep=sep, na_rep=na_rep) + + return str_concat + + +def build_str_concat_object(df, sep=None, na_rep=None): + output_type = OutputType.series if df.ndim == 2 else OutputType.scalar + op = DataFrameStrConcat(sep=sep, na_rep=na_rep, output_types=[output_type]) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/sum.py b/python/xorbits/_mars/dataframe/reduction/sum.py new file mode 100644 index 000000000..7dc431574 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/sum.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...config import options +from ...core import OutputType +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameSum(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = opcodes.SUM + _func_name = "sum" + + @property + def is_atomic(self): + return self.min_count == 0 + + @classmethod + def get_reduction_callable(cls, op): + from .aggregation import where_function + + skipna, min_count = op.skipna, op.min_count + + def sum_(value): + if min_count == 0: + return value.sum(skipna=skipna) + else: + return where_function( + value.count() >= min_count, value.sum(skipna=skipna), np.nan + ) + + return sum_ + + +def sum_series( + df, axis=None, skipna=True, level=None, min_count=0, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSum( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) + + +def sum_dataframe( + df, + axis=None, + skipna=True, + level=None, + min_count=0, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameSum( + axis=axis, + skipna=skipna, + level=level, + min_count=min_count, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/tests/__init__.py b/python/xorbits/_mars/dataframe/reduction/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/reduction/tests/test_reduction.py b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction.py new file mode 100644 index 000000000..4c5c92224 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction.py @@ -0,0 +1,625 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import operator +from functools import reduce +from typing import NamedTuple + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from ....core import tile +from ....core.operand import OperandStage +from ....tensor import Tensor +from ...core import DataFrame, IndexValue, OutputType, Series +from ...datasource.dataframe import from_pandas as from_pandas_df +from ...datasource.series import from_pandas as from_pandas_series +from ...merge import DataFrameConcat +from .. import ( + CustomReduction, + DataFrameAggregate, + DataFrameAll, + DataFrameAny, + DataFrameCount, + DataFrameCummax, + DataFrameCummin, + DataFrameCumprod, + DataFrameCumsum, + DataFrameKurtosis, + DataFrameMax, + DataFrameMean, + DataFrameMin, + DataFrameNunique, + DataFrameProd, + DataFrameSem, + DataFrameSkew, + DataFrameSum, + DataFrameVar, +) +from ..aggregation import where_function +from ..core import ReductionCompiler + +pytestmark = pytest.mark.pd_compat + + +class FunctionOptions(NamedTuple): + has_skipna: bool = True + has_numeric_only: bool = True + has_bool_only: bool = False + + +reduction_functions = [ + ("sum", DataFrameSum, FunctionOptions()), + ("prod", DataFrameProd, FunctionOptions()), + ("min", DataFrameMin, FunctionOptions()), + ("max", DataFrameMax, FunctionOptions()), + ("count", DataFrameCount, FunctionOptions(has_skipna=False)), + ("mean", DataFrameMean, FunctionOptions()), + ("var", DataFrameVar, FunctionOptions()), + ("skew", DataFrameSkew, FunctionOptions()), + ("kurt", DataFrameKurtosis, FunctionOptions()), + ("sem", DataFrameSem, FunctionOptions()), + ("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)), + ("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)), +] + + +@pytest.mark.parametrize("func_name,op,func_opts", reduction_functions) +def test_series_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.Series(range(20), index=[str(i) for i in range(20)]) + series = getattr(from_pandas_series(data, chunk_size=3), func_name)() + + assert isinstance(series, Tensor) + assert isinstance(series.op, op) + assert series.shape == () + + series = tile(series) + + assert len(series.chunks) == 1 + assert isinstance(series.chunks[0].op, DataFrameAggregate) + assert isinstance(series.chunks[0].inputs[0].op, DataFrameConcat) + assert len(series.chunks[0].inputs[0].inputs) == 2 + + data = pd.Series(np.random.rand(25), name="a") + if func_opts.has_skipna: + kwargs = dict(axis="index", skipna=False) + else: + kwargs = dict() + series = getattr(from_pandas_series(data, chunk_size=7), func_name)(**kwargs) + + assert isinstance(series, Tensor) + assert series.shape == () + + series = tile(series) + + assert len(series.chunks) == 1 + assert isinstance(series.chunks[0].op, DataFrameAggregate) + assert isinstance(series.chunks[0].inputs[0].op, DataFrameConcat) + assert len(series.chunks[0].inputs[0].inputs) == 4 + + +@pytest.mark.parametrize("func_name,op,func_opts", reduction_functions) +def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.DataFrame( + {"a": list(range(20)), "b": list(range(20, 0, -1))}, + index=[str(i) for i in range(20)], + ) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, Series) + assert isinstance(reduction_df.op, op) + assert isinstance(reduction_df.index_value._index_value, IndexValue.Index) + assert reduction_df.shape == (2,) + + reduction_df = 
tile(reduction_df) + + assert len(reduction_df.chunks) == 1 + assert isinstance(reduction_df.chunks[0].op, DataFrameAggregate) + assert isinstance(reduction_df.chunks[0].inputs[0].op, DataFrameConcat) + assert len(reduction_df.chunks[0].inputs[0].inputs) == 2 + + data = pd.DataFrame(np.random.rand(20, 10)) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, Series) + assert isinstance( + reduction_df.index_value._index_value, + (IndexValue.RangeIndex, IndexValue.Int64Index), + ) + assert reduction_df.shape == (10,) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 4 + assert reduction_df.nsplits == ((3, 3, 3, 1),) + assert isinstance(reduction_df.chunks[0].op, DataFrameAggregate) + assert isinstance(reduction_df.chunks[0].inputs[0].op, DataFrameConcat) + assert len(reduction_df.chunks[0].inputs[0].inputs) == 2 + + data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)]) + reduction_df = getattr(from_pandas_df(data, chunk_size=4), func_name)( + axis="columns" + ) + + assert reduction_df.shape == (20,) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 5 + assert reduction_df.nsplits == ((4,) * 5,) + assert isinstance(reduction_df.chunks[0].op, DataFrameAggregate) + assert isinstance(reduction_df.chunks[0].inputs[0].op, DataFrameConcat) + assert len(reduction_df.chunks[0].inputs[0].inputs) == 2 + + with pytest.raises(NotImplementedError): + getattr(from_pandas_df(data, chunk_size=3), func_name)(level=0, axis=1) + + +cum_reduction_functions = [ + ("cummin", DataFrameCummin, FunctionOptions()), + ("cummax", DataFrameCummax, FunctionOptions()), + ("cumprod", DataFrameCumprod, FunctionOptions()), + ("cumsum", DataFrameCumsum, FunctionOptions()), +] + + +@pytest.mark.parametrize("func_name,op,func_opts", cum_reduction_functions) +def test_cum_series_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.Series({"a": list(range(20))}, index=[str(i) for i in range(20)]) + series = getattr(from_pandas_series(data, chunk_size=3), func_name)() + + assert isinstance(series, Series) + assert series.shape == (20,) + + series = tile(series) + + assert len(series.chunks) == 7 + assert isinstance(series.chunks[0].op, op) + assert series.chunks[0].op.stage == OperandStage.combine + assert isinstance(series.chunks[-1].inputs[-1].op, op) + assert series.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(series.chunks[-1].inputs) == 7 + + data = pd.Series(np.random.rand(25), name="a") + if func_opts.has_skipna: + kwargs = dict(axis="index", skipna=False) + else: + kwargs = dict() + series = getattr(from_pandas_series(data, chunk_size=7), func_name)(**kwargs) + + assert isinstance(series, Series) + assert series.shape == (25,) + + series = tile(series) + + assert len(series.chunks) == 4 + assert isinstance(series.chunks[0].op, op) + assert series.chunks[0].op.stage == OperandStage.combine + assert isinstance(series.chunks[-1].inputs[-1].op, op) + assert series.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(series.chunks[-1].inputs) == 4 + + +@pytest.mark.parametrize("func_name,op,func_opts", cum_reduction_functions) +def test_cum_dataframe_reduction(func_name, op, func_opts: FunctionOptions): + data = pd.DataFrame( + {"a": list(range(20)), "b": list(range(20, 0, -1))}, + index=[str(i) for i in range(20)], + ) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, DataFrame) + assert 
isinstance(reduction_df.index_value._index_value, IndexValue.Index) + assert reduction_df.shape == (20, 2) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 7 + assert isinstance(reduction_df.chunks[0].op, op) + assert reduction_df.chunks[0].op.stage == OperandStage.combine + assert isinstance(reduction_df.chunks[-1].inputs[-1].op, op) + assert reduction_df.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(reduction_df.chunks[-1].inputs) == 7 + + data = pd.DataFrame(np.random.rand(20, 10)) + reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)() + + assert isinstance(reduction_df, DataFrame) + assert isinstance(reduction_df.index_value._index_value, IndexValue.RangeIndex) + assert reduction_df.shape == (20, 10) + + reduction_df = tile(reduction_df) + + assert len(reduction_df.chunks) == 28 + assert reduction_df.nsplits == ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1)) + assert reduction_df.chunks[0].op.stage == OperandStage.combine + assert isinstance(reduction_df.chunks[-1].inputs[-1].op, op) + assert reduction_df.chunks[-1].inputs[-1].op.stage == OperandStage.map + assert len(reduction_df.chunks[-1].inputs) == 7 + + +def test_nunique(): + data = pd.DataFrame( + np.random.randint(0, 6, size=(20, 10)), + columns=["c" + str(i) for i in range(10)], + ) + df = from_pandas_df(data, chunk_size=3) + result = df.nunique() + + assert result.shape == (10,) + assert result.op.output_types[0] == OutputType.series + assert isinstance(result.op, DataFrameNunique) + + tiled = tile(result) + assert tiled.shape == (10,) + assert len(tiled.chunks) == 4 + assert tiled.nsplits == ((3, 3, 3, 1),) + assert tiled.chunks[0].op.stage == OperandStage.agg + assert isinstance(tiled.chunks[0].op, DataFrameAggregate) + + data2 = data.copy() + df2 = from_pandas_df(data2, chunk_size=3) + result2 = df2.nunique(axis=1) + + assert result2.shape == (20,) + assert result2.op.output_types[0] == OutputType.series + assert isinstance(result2.op, DataFrameNunique) + + tiled = tile(result2) + assert tiled.shape == (20,) + assert len(tiled.chunks) == 7 + assert tiled.nsplits == ((3, 3, 3, 3, 3, 3, 2),) + assert tiled.chunks[0].op.stage == OperandStage.agg + assert isinstance(tiled.chunks[0].op, DataFrameAggregate) + + +def test_dataframe_aggregate(): + data = pd.DataFrame(np.random.rand(20, 19)) + agg_funcs = [ + "sum", + "min", + "max", + "mean", + "var", + "std", + "all", + "any", + "skew", + "kurt", + "sem", + ] + + df = from_pandas_df(data) + result = tile(df.agg(agg_funcs)) + assert len(result.chunks) == 1 + assert result.shape == (len(agg_funcs), data.shape[1]) + assert list(result.columns_value.to_pandas()) == list(range(19)) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func == agg_funcs + + df = from_pandas_df(data, chunk_size=(3, 4)) + + result = tile(df.agg("sum")) + assert len(result.chunks) == 5 + assert result.shape == (data.shape[1],) + assert list(result.index_value.to_pandas()) == list(range(data.shape[1])) + assert result.op.output_types[0] == OutputType.series + assert result.op.func == ["sum"] + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (4,) + assert list(agg_chunk.index_value.to_pandas()) == list(range(4)) + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg("sum", axis=1)) + assert len(result.chunks) == 7 + assert result.shape == (data.shape[0],) + assert list(result.index_value.to_pandas()) == list(range(data.shape[0])) + assert 
result.op.output_types[0] == OutputType.series + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (3,) + assert list(agg_chunk.index_value.to_pandas()) == list(range(3)) + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg("var", axis=1)) + assert len(result.chunks) == 7 + assert result.shape == (data.shape[0],) + assert list(result.index_value.to_pandas()) == list(range(data.shape[0])) + assert result.op.output_types[0] == OutputType.series + assert result.op.func == ["var"] + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (3,) + assert list(agg_chunk.index_value.to_pandas()) == list(range(3)) + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg(agg_funcs)) + assert len(result.chunks) == 5 + assert result.shape == (len(agg_funcs), data.shape[1]) + assert list(result.columns_value.to_pandas()) == list(range(data.shape[1])) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func == agg_funcs + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (len(agg_funcs), 4) + assert list(agg_chunk.columns_value.to_pandas()) == list(range(4)) + assert list(agg_chunk.index_value.to_pandas()) == agg_funcs + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(df.agg(agg_funcs, axis=1)) + assert len(result.chunks) == 7 + assert result.shape == (data.shape[0], len(agg_funcs)) + assert list(result.columns_value.to_pandas()) == agg_funcs + assert list(result.index_value.to_pandas()) == list(range(data.shape[0])) + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func == agg_funcs + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (3, len(agg_funcs)) + assert list(agg_chunk.columns_value.to_pandas()) == agg_funcs + assert list(agg_chunk.index_value.to_pandas()) == list(range(3)) + assert agg_chunk.op.stage == OperandStage.agg + + dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]} + all_cols = set( + reduce( + operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()] + ) + ) + result = tile(df.agg(dict_fun)) + assert len(result.chunks) == 2 + assert result.shape == (len(all_cols), len(dict_fun)) + assert set(result.columns_value.to_pandas()) == set(dict_fun.keys()) + assert set(result.index_value.to_pandas()) == all_cols + assert result.op.output_types[0] == OutputType.dataframe + assert result.op.func[0] == [dict_fun[0]] + assert result.op.func[2] == dict_fun[2] + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (len(all_cols), 2) + assert list(agg_chunk.columns_value.to_pandas()) == [0, 2] + assert set(agg_chunk.index_value.to_pandas()) == all_cols + assert agg_chunk.op.stage == OperandStage.agg + + with pytest.raises(TypeError): + df.agg(sum_0="sum", mean_0="mean") + with pytest.raises(NotImplementedError): + df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1) + + +def test_series_aggregate(): + data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)], name="a") + agg_funcs = [ + "sum", + "min", + "max", + "mean", + "var", + "std", + "all", + "any", + "skew", + "kurt", + "sem", + ] + + series = from_pandas_series(data) + + result = tile(series.agg(agg_funcs)) + assert len(result.chunks) == 1 + assert result.shape == (len(agg_funcs),) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.series + assert result.op.func == agg_funcs + + series = from_pandas_series(data, chunk_size=3) + + 
result = tile(series.agg("sum")) + assert len(result.chunks) == 1 + assert result.shape == () + assert result.op.output_types[0] == OutputType.scalar + agg_chunk = result.chunks[0] + assert agg_chunk.shape == () + assert agg_chunk.op.stage == OperandStage.agg + + result = tile(series.agg(agg_funcs)) + assert len(result.chunks) == 1 + assert result.shape == (len(agg_funcs),) + assert list(result.index_value.to_pandas()) == agg_funcs + assert result.op.output_types[0] == OutputType.series + assert result.op.func == agg_funcs + agg_chunk = result.chunks[0] + assert agg_chunk.shape == (len(agg_funcs),) + assert list(agg_chunk.index_value.to_pandas()) == agg_funcs + assert agg_chunk.op.stage == OperandStage.agg + + with pytest.raises(TypeError): + series.agg(sum_0=(0, "sum"), mean_0=(0, "mean")) + + +def test_compile_function(): + compiler = ReductionCompiler() + ms = md.Series([1, 2, 3]) + # no Mars objects inside closures + with pytest.raises(ValueError): + compiler.add_function(functools.partial(lambda x: (x + ms).sum()), ndim=2) + # function should return a Mars object + with pytest.raises(ValueError): + compiler.add_function(lambda x: x is not None, ndim=2) + # function should perform some sort of reduction in dimensionality + with pytest.raises(ValueError): + compiler.add_function(lambda x: x, ndim=2) + # function should only contain acceptable operands + with pytest.raises(ValueError): + compiler.add_function(lambda x: x.sort_values().max(), ndim=1) + with pytest.raises(ValueError): + compiler.add_function(lambda x: x.max().shift(1), ndim=2) + + # test agg for all data + for ndim in [1, 2]: + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: (x**2).count() + 1, ndim=ndim) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert "pow" in result.pre_funcs[0].func.__source__ + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "count" + assert result.agg_funcs[0].agg_func_name == "sum" + # check post_funcs + assert len(result.post_funcs) == 1 + assert result.post_funcs[0].func_name == "" + assert "add" in result.post_funcs[0].func.__source__ + + compiler.add_function( + lambda x: -x.prod() ** 2 + (1 + (x**2).count()), ndim=ndim + ) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 2 + assert ( + "pow" in result.pre_funcs[0].func.__source__ + or "pow" in result.pre_funcs[1].func.__source__ + ) + assert ( + "pow" not in result.pre_funcs[0].func.__source__ + or "pow" not in result.pre_funcs[1].func.__source__ + ) + # check agg_funcs + assert len(result.agg_funcs) == 2 + assert set(result.agg_funcs[i].map_func_name for i in range(2)) == { + "count", + "prod", + } + assert set(result.agg_funcs[i].agg_func_name for i in range(2)) == { + "sum", + "prod", + } + # check post_funcs + assert len(result.post_funcs) == 2 + assert result.post_funcs[0].func_name == "" + assert "add" in result.post_funcs[0].func.__source__ + assert "add" in result.post_funcs[1].func.__source__ + + compiler = ReductionCompiler(store_source=True) + compiler.add_function( + lambda x: where_function(x.all(), x.count(), 0), ndim=ndim + ) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert result.pre_funcs[0].input_key == result.pre_funcs[0].output_key + # check agg_funcs + assert len(result.agg_funcs) == 2 + assert set(result.agg_funcs[i].map_func_name for i in range(2)) == { + "all", + "count", + } + assert 
set(result.agg_funcs[i].agg_func_name for i in range(2)) == { + "sum", + "all", + } + # check post_funcs + assert len(result.post_funcs) == 1 + if ndim == 1: + assert "np.where" in result.post_funcs[0].func.__source__ + else: + assert "np.where" not in result.post_funcs[0].func.__source__ + assert ".where" in result.post_funcs[0].func.__source__ + + # check boolean expressions + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: (x == "1").sum(), ndim=ndim) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert "eq" in result.pre_funcs[0].func.__source__ + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "sum" + assert result.agg_funcs[0].agg_func_name == "sum" + + # test agg for specific columns + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: 1 + x.sum(), ndim=2, cols=["a", "b"]) + compiler.add_function(lambda x: -1 + x.sum(), ndim=2, cols=["b", "c"]) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert set(result.pre_funcs[0].columns) == set("abc") + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "sum" + assert result.agg_funcs[0].agg_func_name == "sum" + # check post_funcs + assert len(result.post_funcs) == 2 + assert set("".join(sorted(result.post_funcs[i].columns)) for i in range(2)) == { + "ab", + "bc", + } + + # test agg for multiple columns + compiler = ReductionCompiler(store_source=True) + compiler.add_function(lambda x: x.sum(), ndim=2, cols=["a"]) + compiler.add_function(lambda x: x.sum(), ndim=2, cols=["b"]) + compiler.add_function(lambda x: x.min(), ndim=2, cols=["c"]) + result = compiler.compile() + # check pre_funcs + assert len(result.pre_funcs) == 1 + assert set(result.pre_funcs[0].columns) == set("abc") + # check agg_funcs + assert len(result.agg_funcs) == 2 + assert result.agg_funcs[0].map_func_name == "sum" + assert result.agg_funcs[0].agg_func_name == "sum" + # check post_funcs + assert len(result.post_funcs) == 2 + assert set(result.post_funcs[0].columns) == set("ab") + + +def test_custom_aggregation(): + class MockReduction1(CustomReduction): + def agg(self, v1): + return v1.sum() + + class MockReduction2(CustomReduction): + def pre(self, value): + return value + 1, value**2 + + def agg(self, v1, v2): + return v1.sum(), v2.prod() + + def post(self, v1, v2): + return v1 + v2 + + for ndim in [1, 2]: + compiler = ReductionCompiler() + compiler.add_function(MockReduction1(), ndim=ndim) + result = compiler.compile() + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "custom_reduction" + assert result.agg_funcs[0].agg_func_name == "custom_reduction" + assert isinstance(result.agg_funcs[0].custom_reduction, MockReduction1) + assert result.agg_funcs[0].output_limit == 1 + + compiler = ReductionCompiler() + compiler.add_function(MockReduction2(), ndim=ndim) + result = compiler.compile() + # check agg_funcs + assert len(result.agg_funcs) == 1 + assert result.agg_funcs[0].map_func_name == "custom_reduction" + assert result.agg_funcs[0].agg_func_name == "custom_reduction" + assert isinstance(result.agg_funcs[0].custom_reduction, MockReduction2) + assert result.agg_funcs[0].output_limit == 2 diff --git a/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py new file mode 100644 index 
000000000..3cedf6106 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py @@ -0,0 +1,1062 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import NamedTuple + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .... import dataframe as md +from ....config import option_context +from ....deploy.oscar.session import get_default_session +from ....tests.core import require_cudf, require_cupy +from ....utils import lazy_import, pd_release_version +from ... import CustomReduction, NamedAgg +from ...base import to_gpu + +pytestmark = pytest.mark.pd_compat + +cp = lazy_import("cupy", rename="cp") +_agg_size_as_series = pd_release_version >= (1, 3) +_support_kw_agg = pd_release_version >= (1, 1) + + +@pytest.fixture +def check_ref_counts(): + yield + + import gc + + gc.collect() + + sess = get_default_session() + assert len(sess._get_ref_counts()) == 0 + + +class FunctionOptions(NamedTuple): + has_min_count: bool = False + + +reduction_functions = [ + ("sum", FunctionOptions(has_min_count=True)), + ("prod", FunctionOptions(has_min_count=True)), + ("min", FunctionOptions()), + ("max", FunctionOptions()), + ("mean", FunctionOptions()), + ("var", FunctionOptions()), + ("std", FunctionOptions()), + ("sem", FunctionOptions()), + ("skew", FunctionOptions()), + ("kurt", FunctionOptions()), +] + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_series_reduction( + setup, check_ref_counts, func_name, func_opts: FunctionOptions +): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.Series( + rs.randint(0, 8, (10,)), index=[str(i) for i in range(10)], name="a" + ) + r = compute(md.Series(data)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=6)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=4), axis="index") + assert pytest.approx(compute(data, axis="index")) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=4), axis="index") + assert pytest.approx(compute(data, axis="index")) == r.execute().fetch() + + data = pd.Series(rs.rand(20), name="a") + data[0] = 0.1 # make sure not all elements are NAN + data[data > 0.5] = np.nan + r = compute(md.Series(data, chunk_size=3)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3), skipna=False) + assert np.isnan(r.execute().fetch()) + + if func_opts.has_min_count: + r = compute(md.Series(data, chunk_size=3), skipna=False, min_count=2) + assert np.isnan(r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=3), min_count=1) + assert pytest.approx(compute(data, min_count=1)) == 
r.execute().fetch() + + reduction_df5 = compute(md.Series(data, chunk_size=3), min_count=21) + assert np.isnan(reduction_df5.execute().fetch()) + + # test reduction on empty series + data = pd.Series([], dtype=float, name="a") + r = compute(md.Series(data)) + np.testing.assert_equal(r.execute().fetch(), compute(data)) + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_series_level_reduction(setup, func_name, func_opts: FunctionOptions): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.Series(rs.randint(0, 8, size=(100,)), index=idx) + + r = compute(md.Series(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + # test null + data = pd.Series(rs.rand(100), name="a", index=idx) + idx_df = idx.to_frame() + data[data > 0.5] = np.nan + data[int(idx_df[idx_df.b == "A"].iloc[0, 0])] = 0.1 + data[int(idx_df[idx_df.b == "B"].iloc[0, 0])] = 0.1 + + r = compute(md.Series(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.Series(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + if func_opts.has_min_count: + r = compute(md.Series(data, chunk_size=13), min_count=1, level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, min_count=1, level=1).sort_index(), + r.execute().fetch().sort_index(), + ) + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_dataframe_reduction( + setup, check_ref_counts, func_name, func_opts: FunctionOptions +): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.DataFrame(rs.rand(20, 10)) + r = compute(md.DataFrame(data)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=6), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1) + pd.testing.assert_series_equal(compute(data, axis=1), r.execute().fetch()) + + # test null + np_data = rs.rand(20, 10) + np_data[np_data > 0.6] = np.nan + data = pd.DataFrame(np_data) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + if func_opts.has_min_count: + r = compute(md.DataFrame(data, chunk_size=3), min_count=15) + pd.testing.assert_series_equal(compute(data, min_count=15), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), min_count=3) + pd.testing.assert_series_equal(compute(data, min_count=3), r.execute().fetch()) + + r = 
compute(md.DataFrame(data, chunk_size=3), axis=1, min_count=3) + pd.testing.assert_series_equal( + compute(data, axis=1, min_count=3), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1, min_count=8) + pd.testing.assert_series_equal( + compute(data, axis=1, min_count=8), r.execute().fetch() + ) + + # test numeric_only + data = pd.DataFrame( + rs.rand(10, 10), + index=rs.randint(-100, 100, size=(10,)), + columns=[rs.bytes(10) for _ in range(10)], + ) + r = compute(md.DataFrame(data, chunk_size=2)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=6), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis="columns") + pd.testing.assert_series_equal(compute(data, axis="columns"), r.execute().fetch()) + + data_dict = dict((str(i), rs.rand(10)) for i in range(10)) + data_dict["string"] = pd.Series([str(i) for i in range(10)]).radd("O") + data_dict["bool"] = rs.choice([True, False], (10,)) + data = pd.DataFrame(data_dict) + r = compute(md.DataFrame(data, chunk_size=3), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + r = compute(md.DataFrame(data), axis="index", numeric_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", numeric_only=True), r.execute().fetch() + ) + + data1 = pd.DataFrame(rs.rand(10, 10), columns=[str(i) for i in range(10)]) + data2 = pd.DataFrame(rs.rand(10, 10), columns=[str(i) for i in range(10)]) + df = md.DataFrame(data1, chunk_size=5) + md.DataFrame(data2, chunk_size=6) + r = compute(df) + pd.testing.assert_series_equal( + compute(data1 + data2).sort_index(), r.execute().fetch().sort_index() + ) + + +@pytest.mark.parametrize("func_name,func_opts", reduction_functions) +def test_dataframe_level_reduction( + setup, check_ref_counts, func_name, func_opts: FunctionOptions +): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.DataFrame(rs.rand(100, 10), index=idx) + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute( + md.DataFrame(data, chunk_size=13), level=1, numeric_only=True, method="tree" + ) + pd.testing.assert_frame_equal( + compute(data, numeric_only=True, level=1).sort_index(), + r.execute().fetch().sort_index(), + ) + + # test null + data = pd.DataFrame(rs.rand(100, 10), index=idx) + data[data > 0.6] = np.nan + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.DataFrame(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + if func_opts.has_min_count: + r = compute( + md.DataFrame(data, chunk_size=13), level=1, min_count=10, method="tree" + ) + pd.testing.assert_frame_equal( + compute(data, level=1, min_count=10).sort_index(), + r.execute().fetch().sort_index(), + ) + + # behavior of 'skew', 'kurt' 
differs for cases with and without level + skip_funcs = ("skew", "kurt") + if pd_release_version <= (1, 2, 0): + # fails under pandas 1.2. see pandas-dev/pandas#38774 for more details + skip_funcs += ("sem",) + + if func_name not in skip_funcs: + data_dict = dict((str(i), rs.rand(100)) for i in range(10)) + data_dict["string"] = ["O" + str(i) for i in range(100)] + data_dict["bool"] = rs.choice([True, False], (100,)) + data = pd.DataFrame(data_dict, index=idx) + + r = compute( + md.DataFrame(data, chunk_size=13), level=1, numeric_only=True, method="tree" + ) + pd.testing.assert_frame_equal( + compute(data, level=1, numeric_only=True).sort_index(), + r.execute().fetch().sort_index(), + ) + + +@require_cudf +@require_cupy +def test_gpu_execution(setup_gpu, check_ref_counts): + df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list("abc")) + df = to_gpu(md.DataFrame(df_raw, chunk_size=6)) + + r = df.sum() + res = r.execute().fetch() + pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum()) + + r = df.kurt() + res = r.execute().fetch() + pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt()) + + r = df.agg(["sum", "var"]) + res = r.execute().fetch() + pd.testing.assert_frame_equal(res.to_pandas(), df_raw.agg(["sum", "var"])) + + s_raw = pd.Series(np.random.rand(30)) + s = to_gpu(md.Series(s_raw, chunk_size=6)) + + r = s.sum() + res = r.execute().fetch() + assert pytest.approx(res) == s_raw.sum() + + r = s.kurt() + res = r.execute().fetch() + assert pytest.approx(res) == s_raw.kurt() + + r = s.agg(["sum", "var"]) + res = r.execute().fetch() + pd.testing.assert_series_equal(res.to_pandas(), s_raw.agg(["sum", "var"])) + + s_raw = pd.Series( + np.random.randint(0, 3, size=(30,)) * np.random.randint(0, 5, size=(30,)) + ) + s = to_gpu(md.Series(s_raw, chunk_size=6)) + + r = s.unique() + res = r.execute().fetch() + np.testing.assert_array_equal(cp.asnumpy(res).sort(), s_raw.unique().sort()) + + +bool_reduction_functions = ["all", "any"] + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_series_bool_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.Series(rs.rand(10) > 0.5, index=[str(i) for i in range(10)], name="a") + r = compute(md.Series(data)) + assert compute(data) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=6)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3)) + assert pytest.approx(compute(data)) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=4), axis="index") + assert pytest.approx(compute(data, axis="index")) == r.execute().fetch() + + # test null + data = pd.Series(rs.rand(20), name="a") + data[0] = 0.1 # make sure not all elements are NAN + data[data > 0.5] = np.nan + r = compute(md.Series(data, chunk_size=3)) + assert compute(data) == r.execute().fetch() + + r = compute(md.Series(data, chunk_size=3), skipna=False) + assert r.execute().fetch() is True + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_series_bool_level_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.Series(rs.randint(0, 8, size=(100,)), index=idx) + + r = compute(md.Series(data, chunk_size=13), 
level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + # test null + data = pd.Series(rs.rand(100), name="a", index=idx) + idx_df = idx.to_frame() + data[data > 0.5] = np.nan + data[int(idx_df[idx_df.b == "A"].iloc[0, 0])] = 0.1 + data[int(idx_df[idx_df.b == "B"].iloc[0, 0])] = 0.1 + + r = compute(md.Series(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.Series(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_series_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_dataframe_bool_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + data = pd.DataFrame(rs.rand(20, 10)) + data.iloc[:, :5] = data.iloc[:, :5] > 0.5 + r = compute(md.DataFrame(data)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute( + md.DataFrame(data, chunk_size=6), axis="index", bool_only=True, method="tree" + ) + pd.testing.assert_series_equal( + compute(data, axis="index", bool_only=True), + r.execute(extra_config={"check_all": False}).fetch(), + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1) + pd.testing.assert_series_equal(compute(data, axis=1), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), axis=None) + assert compute(data, axis=None) == r.execute().fetch() + + # test null + np_data = rs.rand(20, 10) + np_data[np_data > 0.6] = np.nan + data = pd.DataFrame(np_data) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + # test bool_only + data = pd.DataFrame( + rs.rand(10, 10), + index=rs.randint(-100, 100, size=(10,)), + columns=[rs.bytes(10) for _ in range(10)], + ) + data.iloc[:, :5] = data.iloc[:, :5] > 0.5 + data.iloc[:5, 5:] = data.iloc[:5, 5:] > 0.5 + r = compute(md.DataFrame(data, chunk_size=2)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=6), axis="index", bool_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", bool_only=True), r.execute().fetch() + ) + + r = compute(md.DataFrame(data, chunk_size=3), axis="columns") + pd.testing.assert_series_equal(compute(data, axis="columns"), r.execute().fetch()) + + data_dict = dict((str(i), rs.rand(10)) for i in range(10)) + data_dict["string"] = [str(i) for i in range(10)] + data_dict["bool"] = rs.choice([True, False], (10,)) + data = pd.DataFrame(data_dict) + r = compute(md.DataFrame(data, chunk_size=3), axis="index", bool_only=True) + pd.testing.assert_series_equal( + compute(data, axis="index", bool_only=True), r.execute().fetch() + ) + + +@pytest.mark.parametrize("func_name", bool_reduction_functions) +def test_dataframe_bool_level_reduction(setup, check_ref_counts, 
func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + rs = np.random.RandomState(0) + idx = pd.MultiIndex.from_arrays( + [[str(i) for i in range(100)], rs.choice(["A", "B"], size=(100,))], + names=["a", "b"], + ) + data = pd.DataFrame(rs.rand(100, 10), index=idx) + data.iloc[:, :5] = data.iloc[:, :5] > 0.5 + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + # test null + data = pd.DataFrame(rs.rand(100, 10), index=idx) + data[data > 0.6] = np.nan + + r = compute(md.DataFrame(data, chunk_size=13), level=1, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1).sort_index(), r.execute().fetch().sort_index() + ) + + r = compute(md.DataFrame(data, chunk_size=13), level=1, skipna=False, method="tree") + pd.testing.assert_frame_equal( + compute(data, level=1, skipna=False).sort_index(), + r.execute().fetch().sort_index(), + ) + + # test bool_only + # bool_only not supported when level specified + + +def test_series_count(setup, check_ref_counts): + array = np.random.rand(10) + array[[2, 7, 9]] = np.nan + data = pd.Series(array) + series = md.Series(data) + + result = series.count().execute().fetch() + expected = data.count() + assert result == expected + + series2 = md.Series(data, chunk_size=1) + + result = series2.count().execute().fetch() + expected = data.count() + assert result == expected + + series2 = md.Series(data, chunk_size=3) + + result = series2.count().execute().fetch() + expected = data.count() + assert result == expected + + +def test_dataframe_count(setup, check_ref_counts): + data = pd.DataFrame( + { + "Person": ["John", "Myla", "Lewis", "John", "Myla"], + "Age": [24.0, np.nan, 21.0, 33, 26], + "Single": [False, True, True, True, False], + } + ) + df = md.DataFrame(data) + + result = df.count().execute().fetch() + expected = data.count() + pd.testing.assert_series_equal(result, expected) + + result = df.count(axis="columns").execute().fetch() + expected = data.count(axis="columns") + pd.testing.assert_series_equal(result, expected) + + df2 = md.DataFrame(data, chunk_size=2) + + result = df2.count().execute().fetch() + expected = data.count() + pd.testing.assert_series_equal(result, expected) + + result = df2.count(axis="columns").execute().fetch() + expected = data.count(axis="columns") + pd.testing.assert_series_equal(result, expected) + + df3 = md.DataFrame(data, chunk_size=3) + + result = df3.count(numeric_only=True).execute().fetch() + expected = data.count(numeric_only=True) + pd.testing.assert_series_equal(result, expected) + + result = df3.count(axis="columns", numeric_only=True).execute().fetch() + expected = data.count(axis="columns", numeric_only=True) + pd.testing.assert_series_equal(result, expected) + + +def test_nunique(setup, check_ref_counts): + data1 = pd.Series(np.random.randint(0, 5, size=(20,))) + + series = md.Series(data1) + result = series.nunique().execute().fetch() + expected = data1.nunique() + assert result == expected + + series = md.Series(data1, chunk_size=6) + result = series.nunique().execute().fetch() + expected = data1.nunique() + assert result == expected + + # test dropna + data2 = data1.copy() + data2[[2, 9, 18]] = np.nan + + series = md.Series(data2) + result = series.nunique().execute().fetch() + expected = data2.nunique() + assert result == expected + + series = md.Series(data2, chunk_size=3) + result = series.nunique(dropna=False).execute().fetch() + 
expected = data2.nunique(dropna=False) + assert result == expected + + # test dataframe + data1 = pd.DataFrame( + np.random.randint(0, 6, size=(20, 20)), + columns=["c" + str(i) for i in range(20)], + ) + df = md.DataFrame(data1) + result = df.nunique().execute().fetch() + expected = data1.nunique() + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1, chunk_size=6) + result = df.nunique().execute().fetch() + expected = data1.nunique() + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1) + result = df.nunique(axis=1).execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1, chunk_size=3) + result = df.nunique(axis=1).execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + # test dropna + data2 = data1.copy() + data2.iloc[[2, 9, 18], [2, 9, 18]] = np.nan + + df = md.DataFrame(data2) + result = df.nunique().execute().fetch() + expected = data2.nunique() + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data2, chunk_size=3) + result = df.nunique(dropna=False).execute().fetch() + expected = data2.nunique(dropna=False) + pd.testing.assert_series_equal(result, expected) + + df = md.DataFrame(data1, chunk_size=3) + result = df.nunique(axis=1).execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_use_arrow_dtype_nunique(setup, check_ref_counts): + with option_context({"dataframe.use_arrow_dtype": True, "combine_size": 2}): + rs = np.random.RandomState(0) + data1 = pd.DataFrame( + {"a": rs.random(10), "b": [f"s{i}" for i in rs.randint(100, size=10)]} + ) + data1["c"] = data1["b"].copy() + data1["d"] = data1["b"].copy() + data1["e"] = data1["b"].copy() + + df = md.DataFrame(data1, chunk_size=(3, 2)) + r = df.nunique(axis=0) + result = r.execute().fetch() + expected = data1.nunique(axis=0) + pd.testing.assert_series_equal(result, expected) + + r = df.nunique(axis=1) + result = r.execute().fetch() + expected = data1.nunique(axis=1) + pd.testing.assert_series_equal(result, expected) + + +def test_unique(setup, check_ref_counts): + data1 = pd.Series(np.random.randint(0, 5, size=(20,))) + + series = md.Series(data1) + result = series.unique().execute().fetch() + expected = data1.unique() + np.testing.assert_array_equal(result, expected) + + series = md.Series(data1, chunk_size=6) + result = series.unique().execute().fetch() + expected = data1.unique() + np.testing.assert_array_equal(result, expected) + + data2 = pd.Series( + [pd.Timestamp("20200101", tz="US/Eastern")] * 5 + + [pd.Timestamp("20200202")] + + [pd.Timestamp("20020101")] * 9 + ) + series = md.Series(data2) + result = series.unique().execute().fetch() + expected = data2.unique() + np.testing.assert_array_equal(result, expected) + + series = md.Series(data2, chunk_size=6) + result = series.unique().execute().fetch() + expected = data2.unique() + np.testing.assert_array_equal(result, expected) + + # test md.unique + result = md.unique(data2).execute().fetch() + expected = pd.unique(data2) + np.testing.assert_array_equal(result, expected) + + raw_list = list("baabc") + result = md.unique(raw_list).execute().fetch() + expected = pd.unique(raw_list) + np.testing.assert_array_equal(result, expected) + + data1 = pd.Series(np.random.randint(0, 5, size=(20,))) + result = md.unique(data1).execute().fetch() + expected = 
pd.unique(data1) + np.testing.assert_array_equal(result, expected) + + +def test_index_reduction(setup, check_ref_counts): + rs = np.random.RandomState(0) + data = pd.Index(rs.randint(0, 5, (100,))) + data2 = pd.Index(rs.randint(1, 6, (100,))) + + for method in ["min", "max", "all", "any"]: + idx = md.Index(data) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data, method)() + + idx = md.Index(data, chunk_size=10) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data, method)() + + idx = md.Index(data2) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data2, method)() + + idx = md.Index(data2, chunk_size=10) + result = getattr(idx, method)().execute().fetch() + assert result == getattr(data2, method)() + + +cum_reduction_functions = ["cummax", "cummin", "cumprod", "cumsum"] + + +@pytest.mark.parametrize("func_name", cum_reduction_functions) +def test_series_cum_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)], name="a") + r = compute(md.Series(data)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=6)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=4), axis="index") + pd.testing.assert_series_equal(compute(data, axis="index"), r.execute().fetch()) + + data = pd.Series(np.random.rand(20), name="a") + data[0] = 0.1 # make sure not all elements are NAN + data[data > 0.5] = np.nan + r = compute(md.Series(data, chunk_size=3)) + pd.testing.assert_series_equal(compute(data), r.execute().fetch()) + + r = compute(md.Series(data, chunk_size=3), skipna=False) + pd.testing.assert_series_equal(compute(data, skipna=False), r.execute().fetch()) + + +@pytest.mark.parametrize("func_name", cum_reduction_functions) +def test_dataframe_cum_reduction(setup, check_ref_counts, func_name): + def compute(data, **kwargs): + return getattr(data, func_name)(**kwargs) + + data = pd.DataFrame(np.random.rand(20, 10)) + r = compute(md.DataFrame(data)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), axis=1) + pd.testing.assert_frame_equal(compute(data, axis=1), r.execute().fetch()) + + # test null + np_data = np.random.rand(20, 10) + np_data[np_data > 0.6] = np.nan + data = pd.DataFrame(np_data) + + r = compute(md.DataFrame(data, chunk_size=3)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_frame_equal(compute(data, skipna=False), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), skipna=False) + pd.testing.assert_frame_equal(compute(data, skipna=False), r.execute().fetch()) + + # test numeric_only + data = pd.DataFrame( + np.random.rand(10, 10), + index=np.random.randint(-100, 100, size=(10,)), + columns=[np.random.bytes(10) for _ in range(10)], + ) + r = compute(md.DataFrame(data, chunk_size=2)) + pd.testing.assert_frame_equal(compute(data), r.execute().fetch()) + + r = compute(md.DataFrame(data, chunk_size=3), axis="columns") + 
pd.testing.assert_frame_equal(compute(data, axis="columns"), r.execute().fetch()) + + +def test_dataframe_aggregate(setup, check_ref_counts): + all_aggs = [ + "sum", + "prod", + "min", + "max", + "count", + "size", + "mean", + "var", + "std", + "sem", + "skew", + "kurt", + ] + data = pd.DataFrame(np.random.rand(20, 20)) + + df = md.DataFrame(data) + result = df.agg(all_aggs) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(all_aggs)) + + result = df.agg("size") + if _agg_size_as_series: + pd.testing.assert_series_equal(result.execute().fetch(), data.agg("size")) + else: + assert result.execute().fetch() == data.agg("size") + + for func in (a for a in all_aggs if a != "size"): + result = df.agg(func) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func)) + + result = df.agg(func, axis=1) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func, axis=1)) + + df = md.DataFrame(data, chunk_size=3) + + # will redirect to transform + result = df.agg(["cumsum", "cummax"]) + pd.testing.assert_frame_equal( + result.execute().fetch(), data.agg(["cumsum", "cummax"]) + ) + + result = df.agg("size") + if _agg_size_as_series: + pd.testing.assert_series_equal(result.execute().fetch(), data.agg("size")) + else: + assert result.execute().fetch() == data.agg("size") + + for func in (a for a in all_aggs if a != "size"): + result = df.agg(func) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func)) + + result = df.agg(func, axis=1) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(func, axis=1)) + + result = df.agg(["sum"]) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(["sum"])) + + result = df.agg([sum]) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg([sum])) + + result = df.agg(all_aggs) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(all_aggs)) + + result = df.agg(all_aggs, axis=1) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(all_aggs, axis=1)) + + result = df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}) + pd.testing.assert_frame_equal( + result.execute().fetch(), + data.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}), + ) + + result = df.agg({0: [sum, min, max]}) + pd.testing.assert_frame_equal( + result.execute().fetch(), + data.agg({0: [sum, min, max]}), + ) + + if _support_kw_agg: + agg_kw = dict( + sum_0=NamedAgg(0, "sum"), + min_0=NamedAgg(0, "min"), + mean_9=NamedAgg(9, "mean"), + ) + result = df.agg(**agg_kw) + pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(**agg_kw)) + + +def test_series_aggregate(setup, check_ref_counts): + all_aggs = [ + "sum", + "prod", + "min", + "max", + "count", + "size", + "mean", + "var", + "std", + "sem", + "skew", + "kurt", + ] + data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)], name="a") + series = md.Series(data) + + result = series.agg(all_aggs) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(all_aggs)) + + for func in all_aggs: + result = series.agg(func) + assert pytest.approx(result.execute().fetch()) == data.agg(func) + + series = md.Series(data, chunk_size=3) + + for func in all_aggs: + result = series.agg(func) + assert pytest.approx(result.execute().fetch()) == data.agg(func) + + result = series.agg(all_aggs) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(all_aggs)) + + result = series.agg({"col_sum": "sum", "col_count": "count"}) + pd.testing.assert_series_equal( + 
result.execute().fetch(), data.agg({"col_sum": "sum", "col_count": "count"}) + ) + + result = series.agg({"col_sum": sum, "col_count": "count"}) + pd.testing.assert_series_equal( + result.execute().fetch(), data.agg({"col_sum": sum, "col_count": "count"}) + ) + + if _support_kw_agg: + result = series.agg(col_var="var", col_skew="skew") + pd.testing.assert_series_equal( + result.execute().fetch(), data.agg(col_var="var", col_skew="skew") + ) + + +def test_aggregate_str_cat(setup, check_ref_counts): + agg_fun = lambda x: x.str.cat(sep="_", na_rep="NA") + + rs = np.random.RandomState(0) + raw_df = pd.DataFrame( + { + "a": rs.choice(["A", "B", "C"], size=(100,)), + "b": rs.choice([None, "alfa", "bravo", "charlie"], size=(100,)), + } + ) + + mdf = md.DataFrame(raw_df, chunk_size=13) + + r = mdf.agg(agg_fun) + pd.testing.assert_series_equal(r.execute().fetch(), raw_df.agg(agg_fun)) + + raw_series = pd.Series(rs.choice([None, "alfa", "bravo", "charlie"], size=(100,))) + + ms = md.Series(raw_series, chunk_size=13) + + r = ms.agg(agg_fun) + assert r.execute().fetch() == raw_series.agg(agg_fun) + + +class MockReduction1(CustomReduction): + def agg(self, v1): + return v1.sum() + + +class MockReduction2(CustomReduction): + def pre(self, value): + return value + 1, value**2 + + def agg(self, v1, v2): + return v1.sum(), v2.prod() + + def post(self, v1, v2): + return v1 + v2 + + +def test_custom_dataframe_aggregate(setup, check_ref_counts): + rs = np.random.RandomState(0) + data = pd.DataFrame(rs.rand(30, 20)) + + df = md.DataFrame(data) + result = df.agg(MockReduction1()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction1())) + + result = df.agg(MockReduction2()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction2())) + + df = md.DataFrame(data, chunk_size=5) + result = df.agg(MockReduction2()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction2())) + + result = df.agg(MockReduction2()) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(MockReduction2())) + + +def test_custom_series_aggregate(setup, check_ref_counts): + rs = np.random.RandomState(0) + data = pd.Series(rs.rand(20)) + + s = md.Series(data) + result = s.agg(MockReduction1()) + assert result.execute().fetch() == data.agg(MockReduction1()) + + result = s.agg(MockReduction2()) + assert result.execute().fetch() == data.agg(MockReduction2()) + + s = md.Series(data, chunk_size=5) + result = s.agg(MockReduction2()) + assert pytest.approx(result.execute().fetch()) == data.agg(MockReduction2()) + + result = s.agg(MockReduction2()) + assert pytest.approx(result.execute().fetch()) == data.agg(MockReduction2()) diff --git a/python/xorbits/_mars/dataframe/reduction/unique.py b/python/xorbits/_mars/dataframe/reduction/unique.py new file mode 100644 index 000000000..790862ffc --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/unique.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
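+# Implementation note: ``md.unique`` is built on the reduction framework below.
+# Each chunk first reduces to its own distinct values (``UniqueReduction.agg``),
+# the per-chunk results are combined tree-wise, and ``UniqueReduction.post``
+# applies a final ``unique()`` to the concatenated partials, conceptually:
+#     pd.concat([pd.Series(c.unique()) for c in chunks]).unique()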
+ + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, OutputType +from ...tensor.core import TensorOrder +from ...utils import lazy_import +from ..initializer import Series as asseries +from .core import CustomReduction, DataFrameReductionMixin, DataFrameReductionOperand + +cudf = lazy_import("cudf") + + +class UniqueReduction(CustomReduction): + def agg(self, data): # noqa: W0221 # pylint: disable=arguments-differ + xdf = cudf if self.is_gpu() else pd + # convert to series data + return xdf.Series(data.unique()) + + def post(self, data): # noqa: W0221 # pylint: disable=arguments-differ + return data.unique() + + +class DataFrameUnique(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.UNIQUE + _func_name = "unique" + + @classmethod + def get_reduction_callable(cls, op): + return UniqueReduction(name=cls._func_name, is_gpu=op.is_gpu()) + + @classmethod + def tile(cls, op): + if op.method == "tree": + return (yield from super().tile(op)) + else: + raise NotImplementedError(f"Method {op.method} hasn't been supported") + + def __call__(self, a): + if not isinstance(a, ENTITY_TYPE): + a = asseries(a) + self.output_types = [OutputType.tensor] + return self.new_tileables( + [a], shape=(np.nan,), dtype=a.dtype, order=TensorOrder.C_ORDER + )[0] + + +def unique(values, method="tree"): + """ + Uniques are returned in order of appearance. This does NOT sort. + + Parameters + ---------- + values : 1d array-like + method : 'shuffle' or 'tree', 'tree' method provide a better performance, 'shuffle' + is recommended if the number of unique values is very large. + See Also + -------- + Index.unique + Series.unique + + Examples + -------- + >>> import mars.dataframe as md + >>> import pandas as pd + >>> md.unique(md.Series([2, 1, 3, 3])).execute() + array([2, 1, 3]) + + >>> md.unique(md.Series([2] + [1] * 5)).execute() + array([2, 1]) + + >>> md.unique(md.Series([pd.Timestamp('20160101'), + ... pd.Timestamp('20160101')])).execute() + array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + + >>> md.unique(md.Series([pd.Timestamp('20160101', tz='US/Eastern'), + ... pd.Timestamp('20160101', tz='US/Eastern')])).execute() + array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], + dtype=object) + """ + op = DataFrameUnique(method=method) + return op(values) diff --git a/python/xorbits/_mars/dataframe/reduction/var.py b/python/xorbits/_mars/dataframe/reduction/var.py new file mode 100644 index 000000000..35cc28ef5 --- /dev/null +++ b/python/xorbits/_mars/dataframe/reduction/var.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
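+# Implementation note: variance is assembled from chunk-level aggregates rather
+# than computed directly.  With ``n = x.count()``, the callable returned by
+# ``get_reduction_callable`` evaluates
+#     var(x) = ((x**2).sum() - x.sum()**2 / n) / (n - ddof)
+# (and the mean-of-squares form ``(x**2).mean() - x.mean()**2`` when
+# ``ddof == 0``), which is algebraically equal to the usual
+# ``sum((x - x.mean())**2) / (n - ddof)`` definition used by pandas.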
import opcodes as OperandDef +from ...config import options +from ...core import OutputType +from ...serialization.serializables import Int32Field +from .core import DataFrameReductionMixin, DataFrameReductionOperand + + +class DataFrameVar(DataFrameReductionOperand, DataFrameReductionMixin): + _op_type_ = OperandDef.VAR + _func_name = "var" + + _ddof = Int32Field("ddof") + + def __init__(self, ddof=None, **kw): + super().__init__(_ddof=ddof, **kw) + + @property + def ddof(self): + return self._ddof + + @classmethod + def get_reduction_callable(cls, op): + skipna, ddof = op.skipna, op.ddof + + def var(x): + cnt = x.count() + if ddof == 0: + return (x**2).mean(skipna=skipna) - (x.mean(skipna=skipna)) ** 2 + return ((x**2).sum(skipna=skipna) - x.sum(skipna=skipna) ** 2 / cnt) / ( + cnt - ddof + ) + + return var + + +def var_series( + series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameVar( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + combine_size=combine_size, + output_types=[OutputType.scalar], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(series) + + +def var_dataframe( + df, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + combine_size=None, + method=None, +): + use_inf_as_na = options.dataframe.mode.use_inf_as_na + op = DataFrameVar( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + combine_size=combine_size, + output_types=[OutputType.series], + use_inf_as_na=use_inf_as_na, + method=method, + ) + return op(df) diff --git a/python/xorbits/_mars/dataframe/sort/__init__.py b/python/xorbits/_mars/dataframe/sort/__init__.py new file mode 100644 index 000000000..cc92657fa --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .sort_index import DataFrameSortIndex +from .sort_values import DataFrameSortValues + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + from .sort_index import sort_index + from .sort_values import dataframe_sort_values, series_sort_values + + for cls in DATAFRAME_TYPE: + setattr(cls, "sort_values", dataframe_sort_values) + setattr(cls, "sort_index", sort_index) + + for cls in SERIES_TYPE: + setattr(cls, "sort_values", series_sort_values) + setattr(cls, "sort_index", sort_index) + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/sort/core.py b/python/xorbits/_mars/dataframe/sort/core.py new file mode 100644 index 000000000..a1f2aa879 --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/core.py @@ -0,0 +1,121 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ...config import options +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + Int64Field, + ListField, + StringField, +) +from ...utils import ceildiv +from ..operands import DataFrameOperand +from ..utils import parse_index + + +class DataFrameSortOperand(DataFrameOperand): + axis = Int32Field("axis") + ascending = AnyField("ascending") + inplace = BoolField("inplace") + kind = StringField("kind") + na_position = StringField("na_position") + ignore_index = BoolField("ignore_index") + parallel_kind = StringField("parallel_kind") + psrs_kinds = ListField("psrs_kinds", FieldTypes.string) + nrows = Int64Field("nrows", default=None) + + @classmethod + def _tile_head(cls, op: "DataFrameSortOperand"): + from ..merge import DataFrameConcat + + inp = op.inputs[0] + out = op.outputs[0] + axis = op.axis + assert axis == 0 + pd_index = out.index_value.to_pandas() + combine_size = options.combine_size + + if inp.ndim == 2: + if inp.chunk_shape[1 - axis] > 1: # pragma: no cover + if any(pd.isna(s) for s in inp.nsplits[1 - axis]): + yield + inp = yield from recursive_tile( + inp.rechunk({1 - axis: inp.shape[1 - axis]}) + ) + + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_params = c.params + chunk_params["index_value"] = parse_index(pd_index, c) + out_chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + + while True: + chunk_size = ceildiv(len(out_chunks), combine_size) + combine_chunks = [] + for i in range(chunk_size): + chunk_index = (i,) if inp.ndim == 1 else (i, 0) + + to_combine_chunks = out_chunks[ + i * combine_size : (i + 1) * combine_size + ] + concat_params = to_combine_chunks[0].params + concat_params["index"] = chunk_index + shape = list(to_combine_chunks[0].shape) + shape[0] = sum(c.shape[0] for c in to_combine_chunks) + shape = tuple(shape) + concat_params["shape"] = shape + if len(to_combine_chunks) == 1: + c = to_combine_chunks[0].copy() + c._index = chunk_index + else: + c = DataFrameConcat( + axis=axis, output_types=op.output_types + ).new_chunk(to_combine_chunks, kws=[concat_params]) + chunk_op = op.copy().reset_key() + chunk_op.stage = ( + OperandStage.combine if chunk_size > 1 else OperandStage.agg + ) + chunk_params = c.params + chunk_params["index_value"] = parse_index(pd_index, c) + chunk_params["shape"] = (min(shape[0], op.nrows),) + shape[1:] + combine_chunks.append(chunk_op.new_chunk([c], kws=[chunk_params])) + out_chunks = combine_chunks + if chunk_size == 1: + break + + new_op = op.copy() + params = out.params + params["nsplits"] = tuple((s,) for s in out.shape) + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _tile(cls, op): # pragma: no cover + raise NotImplementedError + + @classmethod + def tile(cls, op: "DataFrameSortOperand"): + if op.nrows is not None: + return (yield from cls._tile_head(op)) + else: + return (yield from cls._tile(op)) diff --git 
a/python/xorbits/_mars/dataframe/sort/psrs.py b/python/xorbits/_mars/dataframe/sort/psrs.py new file mode 100644 index 000000000..b91c4762c --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/psrs.py @@ -0,0 +1,729 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core.operand import MapReduceOperand, OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + Int32Field, + ListField, + StringField, +) +from ...tensor.base.psrs import PSRSOperandMixin +from ...utils import calc_nsplits, lazy_import +from ..core import IndexValue, OutputType +from ..operands import DataFrameOperand, DataFrameOperandMixin, DataFrameShuffleProxy +from ..utils import is_cudf, parse_index, standardize_range_index + +cudf = lazy_import("cudf") + +_PSRS_DISTINCT_COL = "__PSRS_TMP_DISTINCT_COL" + + +class _Largest: + """ + This util class resolve TypeError when + comparing strings with None values + """ + + def __lt__(self, other): + return False + + def __gt__(self, other): + return self is not other + + +_largest = _Largest() + + +class _ReversedValue: + def __init__(self, value): + self._value = value + + def __lt__(self, other): + if type(other) is _ReversedValue: + # may happen when call searchsorted + return self._value >= other._value + return self._value >= other + + def __gt__(self, other): + return self._value <= other + + def __repr__(self): + return repr(self._value) + + +class DataFramePSRSOperandMixin(DataFrameOperandMixin, PSRSOperandMixin): + @classmethod + def _collect_op_properties(cls, op): + from .sort_values import DataFrameSortValues + + if isinstance(op, DataFrameSortValues): + properties = dict( + sort_type="sort_values", + axis=op.axis, + by=op.by, + ascending=op.ascending, + inplace=op.inplace, + na_position=op.na_position, + gpu=op.is_gpu(), + ) + else: + properties = dict( + sort_type="sort_index", + axis=op.axis, + level=op.level, + ascending=op.ascending, + inplace=op.inplace, + na_position=op.na_position, + sort_remaining=op.sort_remaining, + gpu=op.is_gpu(), + ) + return properties + + @classmethod + def local_sort_and_regular_sample( + cls, op, in_data, axis_chunk_shape, axis_offsets, out_idx + ): + # stage 1: local sort and regular samples collected + sorted_chunks, indices_chunks, sampled_chunks = [], [], [] + for i in range(axis_chunk_shape): + in_chunk = in_data.chunks[i] + kind = None if op.psrs_kinds is None else op.psrs_kinds[0] + chunk_op = DataFramePSRSSortRegularSample( + kind=kind, + n_partition=axis_chunk_shape, + output_types=op.output_types, + **cls._collect_op_properties(op) + ) + kws = [] + sort_shape = in_chunk.shape + kws.append( + { + "shape": sort_shape, + "index_value": in_chunk.index_value, + "index": in_chunk.index, + } + ) + if chunk_op.sort_type == "sort_values": + sampled_shape = ( + (axis_chunk_shape, len(op.by)) if op.by else (axis_chunk_shape,) + ) + else: + sampled_shape = ( + 
(axis_chunk_shape, sort_shape[1]) + if len(sort_shape) == 2 + else (axis_chunk_shape,) + ) + kws.append( + { + "shape": sampled_shape, + "index_value": in_chunk.index_value, + "index": (i,), + "type": "regular_sampled", + } + ) + if op.outputs[0].ndim == 2: + kws[0].update( + {"columns_value": in_chunk.columns_value, "dtypes": in_chunk.dtypes} + ) + kws[1].update( + {"columns_value": in_chunk.columns_value, "dtypes": in_chunk.dtypes} + ) + else: + kws[0].update(({"dtype": in_chunk.dtype, "name": in_chunk.name})) + kws[1].update({"dtype": in_chunk.dtype}) + + chunks = chunk_op.new_chunks([in_chunk], kws=kws, output_limit=len(kws)) + sort_chunk, sampled_chunk = chunks + sorted_chunks.append(sort_chunk) + sampled_chunks.append(sampled_chunk) + return sorted_chunks, indices_chunks, sampled_chunks + + @classmethod + def concat_and_pivot( + cls, op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ): + from .sort_values import DataFrameSortValues + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + kind = None if op.psrs_kinds is None else op.psrs_kinds[1] + if isinstance(op, DataFrameSortValues): + output_types = op.output_types + else: + output_types = [OutputType.index] + concat_pivot_op = DataFramePSRSConcatPivot( + kind=kind, + n_partition=axis_chunk_shape, + output_types=output_types, + **cls._collect_op_properties(op) + ) + concat_pivot_shape = ( + sorted_chunks[0].shape[: op.axis] + + (axis_chunk_shape - 1,) + + sorted_chunks[0].shape[op.axis + 1 :] + ) + concat_pivot_index = out_idx[: op.axis] + (0,) + out_idx[op.axis :] + concat_pivot_chunk = concat_pivot_op.new_chunk( + sampled_chunks, + shape=concat_pivot_shape, + index=concat_pivot_index, + ) + return concat_pivot_chunk + + @classmethod + def partition_local_data( + cls, op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ): + # stage 3: Local data is partitioned + partition_chunks = [] + length = len(sorted_chunks) + for i in range(length): + chunk_inputs = [sorted_chunks[i], concat_pivot_chunk] + partition_shuffle_map = DataFramePSRSShuffle( + n_partition=axis_chunk_shape, + stage=OperandStage.map, + output_types=op.output_types, + **cls._collect_op_properties(op) + ) + if isinstance(chunk_inputs[0].index_value.value, IndexValue.RangeIndex): + index_value = parse_index(pd.Index([], dtype=np.int64)) + else: + index_value = chunk_inputs[0].index_value + kw = dict( + shape=chunk_inputs[0].shape, + index=chunk_inputs[0].index, + index_value=index_value, + ) + if op.outputs[0].ndim == 2: + kw.update( + dict( + columns_value=chunk_inputs[0].columns_value, + dtypes=chunk_inputs[0].dtypes, + ) + ) + else: + kw.update(dict(dtype=chunk_inputs[0].dtype, name=chunk_inputs[0].name)) + partition_chunk = partition_shuffle_map.new_chunk(chunk_inputs, **kw) + partition_chunks.append(partition_chunk) + return partition_chunks + + @classmethod + def partition_merge_data( + cls, op, need_align, return_value, partition_chunks, proxy_chunk + ): + # stage 4: all *ith* classes are gathered and merged + partition_sort_chunks, partition_indices_chunks, sort_info_chunks = [], [], [] + for i, partition_chunk in enumerate(partition_chunks): + kind = None if op.psrs_kinds is None else op.psrs_kinds[2] + partition_shuffle_reduce = DataFramePSRSShuffle( + stage=OperandStage.reduce, + kind=kind, + reducer_index=(i,), + n_reducers=len(partition_chunks), + output_types=op.output_types, + **cls._collect_op_properties(op) + ) + chunk_shape = list(partition_chunk.shape) + chunk_shape[op.axis] = np.nan + + kw = 
dict( + shape=tuple(chunk_shape), + index=partition_chunk.index, + index_value=partition_chunk.index_value, + ) + if op.outputs[0].ndim == 2: + kw.update( + dict( + columns_value=partition_chunk.columns_value, + dtypes=partition_chunk.dtypes, + ) + ) + else: + kw.update(dict(dtype=partition_chunk.dtype, name=partition_chunk.name)) + cs = partition_shuffle_reduce.new_chunks([proxy_chunk], **kw) + + partition_sort_chunks.append(cs[0]) + return partition_sort_chunks, partition_indices_chunks, sort_info_chunks + + @classmethod + def _tile_psrs(cls, op, in_data): + out = op.outputs[0] + in_df, axis_chunk_shape, _, _ = yield from cls.preprocess(op, in_data=in_data) + + # stage 1: local sort and regular samples collected + sorted_chunks, _, sampled_chunks = cls.local_sort_and_regular_sample( + op, in_df, axis_chunk_shape, None, None + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_chunk = cls.concat_and_pivot( + op, + axis_chunk_shape, + (0,) if in_df.ndim == 2 else (), + sorted_chunks, + sampled_chunks, + ) + + # stage 3: Local data is partitioned + partition_chunks = cls.partition_local_data( + op, axis_chunk_shape, sorted_chunks, None, concat_pivot_chunk + ) + + proxy_chunk = DataFrameShuffleProxy(output_types=op.output_types).new_chunk( + partition_chunks, shape=() + ) + + # stage 4: all *ith* classes are gathered and merged + partition_sort_chunks = cls.partition_merge_data( + op, False, None, partition_chunks, proxy_chunk + )[0] + + if op.ignore_index: + yield partition_sort_chunks + chunks = standardize_range_index(partition_sort_chunks, axis=op.axis) + else: + chunks = partition_sort_chunks + + nsplits = calc_nsplits({c.index: c.shape for c in chunks}) + if op.outputs[0].ndim == 2: + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out.shape, + chunks=chunks, + nsplits=nsplits, + index_value=out.index_value, + columns_value=out.columns_value, + dtypes=out.dtypes, + ) + else: + new_op = op.copy() + return new_op.new_seriess( + op.inputs, + shape=out.shape, + chunks=chunks, + nsplits=nsplits, + index_value=out.index_value, + dtype=out.dtype, + name=out.name, + ) + + +def execute_sort_values(data, op, inplace=None, by=None): + if inplace is None: + inplace = op.inplace + # ignore_index is new in Pandas version 1.0.0. + ignore_index = getattr(op, "ignore_index", False) + if isinstance(data, (pd.DataFrame, pd.Series)): + kwargs = dict( + axis=op.axis, + ascending=op.ascending, + ignore_index=ignore_index, + na_position=op.na_position, + kind=op.kind, + ) + if isinstance(data, pd.DataFrame): + kwargs["by"] = by if by is not None else op.by + if inplace: + kwargs["inplace"] = True + try: + data.sort_values(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + data.sort_values(**kwargs) + return data + else: + try: + return data.sort_values(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + return data.sort_values(**kwargs) + + else: # pragma: no cover + # cudf doesn't support axis and kind + if isinstance(data, cudf.DataFrame): + return data.sort_values( + op.by, ascending=op.ascending, na_position=op.na_position + ) + else: + return data.sort_values(ascending=op.ascending, na_position=op.na_position) + + +def execute_sort_index(data, op, inplace=None): + if inplace is None: + inplace = op.inplace + # ignore_index is new in Pandas version 1.0.0. 
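+    # On older pandas releases the keyword is rejected with a TypeError, which
+    # is why the calls below retry after dropping ``ignore_index`` from kwargs.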
+ ignore_index = getattr(op, "ignore_index", False) + if isinstance(data, (pd.DataFrame, pd.Series)): + kwargs = dict( + level=op.level, + ascending=op.ascending, + ignore_index=ignore_index, + na_position=op.na_position, + kind=op.kind, + sort_remaining=op.sort_remaining, + ) + if inplace: + kwargs["inplace"] = True + try: + data.sort_index(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + data.sort_index(**kwargs) + return data + else: + try: + return data.sort_index(**kwargs) + except TypeError: # pragma: no cover + kwargs.pop("ignore_index", None) + return data.sort_index(**kwargs) + + else: # pragma: no cover + # cudf only support ascending + return data.sort_index(ascending=op.ascending) + + +class DataFramePSRSChunkOperand(DataFrameOperand): + # sort type could be 'sort_values' or 'sort_index' + sort_type = StringField("sort_type") + + axis = Int32Field("axis") + by = ListField("by", default=None) + ascending = AnyField("ascending") + inplace = BoolField("inplace") + kind = StringField("kind") + na_position = StringField("na_position") + + # for sort_index + level = ListField("level") + sort_remaining = BoolField("sort_remaining") + + n_partition = Int32Field("n_partition") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + +class DataFramePSRSSortRegularSample(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.PSRS_SORT_REGULAR_SMAPLE + + @property + def output_limit(self): + return 2 + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + xdf = pd if isinstance(a, (pd.DataFrame, pd.Series)) else cudf + + if len(a) == 0: + # when chunk is empty, return the empty chunk itself + ctx[op.outputs[0].key] = ctx[op.outputs[-1].key] = a + return + + if op.sort_type == "sort_values": + ctx[op.outputs[0].key] = res = execute_sort_values(a, op) + else: + ctx[op.outputs[0].key] = res = execute_sort_index(a, op) + + by = op.by + add_distinct_col = bool(int(os.environ.get("PSRS_DISTINCT_COL", "0"))) + if ( + add_distinct_col + and isinstance(a, xdf.DataFrame) + and op.sort_type == "sort_values" + ): + # when running under distributed mode, we introduce an extra column + # to make sure pivots are distinct + chunk_idx = op.inputs[0].index[0] + distinct_col = ( + _PSRS_DISTINCT_COL + if a.columns.nlevels == 1 + else (_PSRS_DISTINCT_COL,) + ("",) * (a.columns.nlevels - 1) + ) + res[distinct_col] = np.arange( + chunk_idx << 32, (chunk_idx << 32) + len(a), dtype=np.int64 + ) + by = list(by) + [distinct_col] + + n = op.n_partition + if op.sort_type == "sort_values" and a.shape[op.axis] < n: + num = n // a.shape[op.axis] + 1 + res = execute_sort_values(xdf.concat([res] * num), op, by=by) + + w = res.shape[op.axis] * 1.0 / (n + 1) + slc = np.linspace( + max(w - 1, 0), res.shape[op.axis] - 1, num=n, endpoint=False + ).astype(int) + if op.axis == 1: + slc = (slice(None), slc) + if op.sort_type == "sort_values": + # do regular sample + if op.by is not None: + ctx[op.outputs[-1].key] = res[by].iloc[slc] + else: + ctx[op.outputs[-1].key] = res.iloc[slc] + else: + # do regular sample + ctx[op.outputs[-1].key] = res.iloc[slc] + + +class DataFramePSRSConcatPivot(DataFramePSRSChunkOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.PSRS_CONCAT_PIVOT + + @property + def output_limit(self): + return 1 + + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[c.key] for c in op.inputs if len(ctx[c.key]) > 0] + if len(inputs) == 0: + # corner case: nothing sampled, we need 
to do nothing + ctx[op.outputs[-1].key] = ctx[op.inputs[0].key] + return + + xdf = pd if isinstance(inputs[0], (pd.DataFrame, pd.Series)) else cudf + + a = xdf.concat(inputs, axis=op.axis) + p = len(inputs) + assert a.shape[op.axis] == p * len(op.inputs) + + slc = np.linspace( + p - 1, a.shape[op.axis] - 1, num=len(op.inputs) - 1, endpoint=False + ).astype(int) + if op.axis == 1: + slc = (slice(None), slc) + if op.sort_type == "sort_values": + a = execute_sort_values(a, op, inplace=False) + ctx[op.outputs[-1].key] = a.iloc[slc] + else: + a = execute_sort_index(a, op, inplace=False) + ctx[op.outputs[-1].key] = a.index[slc] + + +class DataFramePSRSShuffle(MapReduceOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.PSRS_SHUFFLE + + sort_type = StringField("sort_type") + + # for shuffle map + axis = Int32Field("axis") + by = ListField("by") + ascending = AnyField("ascending") + inplace = BoolField("inplace") + na_position = StringField("na_position") + n_partition = Int32Field("n_partition") + + # for sort_index + level = ListField("level") + sort_remaining = BoolField("sort_remaining") + + # for shuffle reduce + kind = StringField("kind") + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @property + def output_limit(self): + return 1 + + @staticmethod + def _calc_poses(src_cols, pivots, ascending=True): + # The pivots are immutable if it is got from shared memory, e.g. Ray object store. + # Pandas < 1.4 has item setting bug and pandas >= 1.4 has fixed it. + # + # Here, almost all the cases that the pivots are got from shared memory. + # + # `pivots[col] = -pivots[col]` will automatically replace the col with a new copy + # `-pivots[col]` in pandas >= 1.4, but it will try to inplace set col in pandas < 1.4 + # + # So, we use assign here to walk around incorrect inplace set item bug in pandas < 1.4. + # Please refer to: https://github.com/mars-project/mars/issues/3215 + # related issue: https://github.com/pandas-dev/pandas/pull/43406 + copy_cols = {} + if isinstance(ascending, list): + for asc, col in zip(ascending, pivots.columns): + # Make pivots available to use ascending order when mixed order specified + if not asc: + if pd.api.types.is_numeric_dtype(pivots.dtypes[col]): + # for numeric dtypes, convert to negative is more efficient + copy_cols[col] = -pivots[col] + src_cols[col] = -src_cols[col] + else: + # for other types, convert to ReversedValue + copy_cols[col] = pivots[col].map( + lambda x: x + if type(x) is _ReversedValue + else _ReversedValue(x) + ) + ascending = True + + if copy_cols: + pivots = pivots.assign(**copy_cols) + + records = src_cols.to_records(index=False) + p_records = pivots.to_records(index=False) + if ascending: + poses = records.searchsorted(p_records, side="right") + else: + poses = len(records) - records[::-1].searchsorted(p_records, side="right") + del records, p_records + return poses + + @classmethod + def _execute_dataframe_map(cls, ctx, op): + a, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + if len(a) == 0: + # when the chunk is empty, no slices can be produced + for i in range(op.n_partition): + ctx[out.key, (i,)] = a + return + + # use numpy.searchsorted to find split positions. 
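+        # Illustrative example (assumed values, not part of the operator): with
+        # sorted keys ``records = np.array([1, 3, 3, 7, 9])`` and pivots
+        # ``np.array([3, 8])``, ``records.searchsorted(pivots, side="right")``
+        # returns ``array([3, 4])``, so rows ``[0:3]``, ``[3:4]`` and ``[4:]``
+        # are routed to reducers 0, 1 and 2 respectively.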
+ by = op.by + + distinct_col = ( + _PSRS_DISTINCT_COL + if a.columns.nlevels == 1 + else (_PSRS_DISTINCT_COL,) + ("",) * (a.columns.nlevels - 1) + ) + if distinct_col in a.columns: + by = list(by) + [distinct_col] + + try: + poses = cls._calc_poses(a[by], pivots, op.ascending) + except TypeError: + poses = cls._calc_poses( + a[by].fillna(_largest), pivots.fillna(_largest), op.ascending + ) + + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + values = a.iloc[poses[i] : poses[i + 1]] + if is_cudf(values): # pragma: no cover + values = values.copy() + ctx[out.key, (i,)] = values + + @classmethod + def _calc_series_poses(cls, s, pivots, ascending=True): + if ascending: + poses = s.searchsorted(pivots, side="right") + else: + poses = len(s) - s.iloc[::-1].searchsorted(pivots, side="right") + return poses + + @classmethod + def _execute_series_map(cls, ctx, op): + a, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + if len(a) == 0: + # when the chunk is empty, no slices can be produced + for i in range(op.n_partition): + ctx[out.key, (i,)] = a + return + + if isinstance(a, pd.Series): + try: + poses = cls._calc_series_poses(a, pivots, ascending=op.ascending) + except TypeError: + filled_a = a.fillna(_largest) + filled_pivots = pivots.fillna(_largest) + poses = cls._calc_series_poses( + filled_a, filled_pivots, ascending=op.ascending + ) + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + values = a.iloc[poses[i] : poses[i + 1]] + ctx[out.key, (i,)] = values + + @classmethod + def _execute_sort_index_map(cls, ctx, op): + a, pivots = [ctx[c.key] for c in op.inputs] + out = op.outputs[0] + + if op.ascending: + poses = a.index.searchsorted(list(pivots), side="right") + else: + poses = len(a) - a.index[::-1].searchsorted(list(pivots), side="right") + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + values = a.iloc[poses[i] : poses[i + 1]] + ctx[out.key, (i,)] = values + + @classmethod + def _execute_map(cls, ctx, op): + a = [ctx[c.key] for c in op.inputs][0] + if op.sort_type == "sort_values": + if len(a.shape) == 2: + # DataFrame type + cls._execute_dataframe_map(ctx, op) + else: + # Series type + cls._execute_series_map(ctx, op) + else: + cls._execute_sort_index_map(ctx, op) + + @classmethod + def _execute_reduce(cls, ctx, op: "DataFramePSRSShuffle"): + out_chunk = op.outputs[0] + raw_inputs = list(op.iter_mapper_data(ctx, pop=False)) + + xdf = pd if isinstance(raw_inputs[0], (pd.DataFrame, pd.Series)) else cudf + if xdf is pd: + concat_values = xdf.concat(raw_inputs, axis=op.axis, copy=False) + else: + concat_values = xdf.concat(raw_inputs, axis=op.axis) + del raw_inputs[:] + + if isinstance(concat_values, xdf.DataFrame): + concat_values.drop( + _PSRS_DISTINCT_COL, axis=1, inplace=True, errors="ignore" + ) + + col_index_dtype = out_chunk.columns_value.to_pandas().dtype + if concat_values.columns.dtype != col_index_dtype: + concat_values.columns = concat_values.columns.astype(col_index_dtype) + + if op.sort_type == "sort_values": + ctx[op.outputs[0].key] = execute_sort_values(concat_values, op) + else: + ctx[op.outputs[0].key] = execute_sort_index(concat_values, op) + + @classmethod + def estimate_size(cls, ctx, op): + super().estimate_size(ctx, op) + result = ctx[op.outputs[0].key] + if op.stage == OperandStage.reduce: + ctx[op.outputs[0].key] = (result[0], result[1] * 1.5) + else: + ctx[op.outputs[0].key] = result + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + 
cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) diff --git a/python/xorbits/_mars/dataframe/sort/sort_index.py b/python/xorbits/_mars/dataframe/sort/sort_index.py new file mode 100644 index 000000000..f7f0874be --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/sort_index.py @@ -0,0 +1,245 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType, recursive_tile +from ...serialization.serializables import BoolField, ListField +from ...tensor.base.sort import _validate_sort_psrs_kinds +from ...utils import calc_nsplits +from ..operands import DATAFRAME_TYPE +from ..utils import ( + build_concatenated_rows_frame, + parse_index, + standardize_range_index, + validate_axis, +) +from .core import DataFrameSortOperand +from .psrs import DataFramePSRSOperandMixin, execute_sort_index + + +class DataFrameSortIndex(DataFrameSortOperand, DataFramePSRSOperandMixin): + _op_type_ = OperandDef.SORT_INDEX + + level = ListField("level", default=None) + sort_remaining = BoolField("sort_remaining", default=None) + + @classmethod + def _tile(cls, op): + df = op.inputs[0] + + if op.axis == 0: + if df.chunk_shape[op.axis] == 1: + if op.output_types[0] == OutputType.dataframe: + df = build_concatenated_rows_frame(df) + out_chunks = [] + for chunk in df.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + index_value=chunk.index_value, + columns_value=chunk.columns_value, + dtypes=chunk.dtypes, + ) + ) + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = df.nsplits + kws["chunks"] = out_chunks + return new_op.new_dataframes(op.inputs, **kws) + else: + out_chunks = [] + for chunk in df.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + index_value=chunk.index_value, + name=chunk.name, + dtype=chunk.dtype, + ) + ) + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = df.nsplits + kws["chunks"] = out_chunks + return new_op.new_seriess(op.inputs, **kws) + else: + if op.output_types[0] == OutputType.dataframe: + df = build_concatenated_rows_frame(df) + if op.na_position != "last": # pragma: no cover + raise NotImplementedError("Only support puts NaNs at the end.") + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op, df)) + else: + assert op.axis == 1 + + sorted_columns = list( + df.columns_value.to_pandas().sort_values(ascending=op.ascending) + ) + r = [(yield from recursive_tile(df[sorted_columns]))] + if op.ignore_index: + chunks = r[0].chunks + yield chunks + out = op.outputs[0] + chunks = standardize_range_index(chunks, axis=0) + new_op = op.copy() + return new_op.new_dataframes( + op.inputs, + shape=out.shape, + chunks=chunks, + nsplits=calc_nsplits({c.index: c.shape for c in chunks}), + index_value=out.index_value, + 
columns_value=out.columns_value, + dtypes=out.dtypes, + ) + return r + + @classmethod + def execute(cls, ctx, op: "DataFrameSortIndex"): + in_data = ctx[op.inputs[0].key] + result = execute_sort_index(in_data, op) + if op.nrows is not None: + result = result.head(op.nrows) + ctx[op.outputs[0].key] = result + + def _call_dataframe(self, df): + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(df.shape[0])) + else: + index_value = df.index_value + if self.axis == 0: + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=df.dtypes, + index_value=index_value, + columns_value=df.columns_value, + ) + else: + dtypes = df.dtypes.sort_index(ascending=self.ascending) + columns_value = parse_index(dtypes.index, store_data=True) + return self.new_dataframe( + [df], + shape=df.shape, + dtypes=dtypes, + index_value=index_value, + columns_value=columns_value, + ) + + def _call_series(self, series): + if self.axis != 0: # pragma: no cover + raise TypeError(f"Invalid axis: {self.axis}") + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(series.shape[0])) + else: + index_value = series.index_value + + return self.new_series( + [series], + shape=series.shape, + dtype=series.dtype, + index_value=index_value, + name=series.name, + ) + + def __call__(self, a): + if isinstance(a, DATAFRAME_TYPE): + self.output_types = [OutputType.dataframe] + return self._call_dataframe(a) + else: + self.output_types = [OutputType.series] + return self._call_series(a) + + +def sort_index( + a, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ignore_index: bool = False, + parallel_kind="PSRS", + psrs_kinds=None, +): + """ + Sort object by labels (along an axis). + + Parameters + ---------- + a : Input DataFrame or Series. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + parallel_kind: {'PSRS'}, optional. + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + psrs_kinds: Sorting algorithms during PSRS algorithm. + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted index if inplace=False, None otherwise. 
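+
+    Examples
+    --------
+    A minimal illustration; the semantics follow ``pandas.DataFrame.sort_index``:
+
+    >>> import mars.dataframe as md
+    >>> df = md.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], columns=['A'])
+    >>> df.sort_index().execute()
+         A
+    1    4
+    29   2
+    100  1
+    150  5
+    234  3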
+ """ + if na_position not in ["last", "first"]: # pragma: no cover + raise TypeError(f"Invalid na_position: {na_position}") + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + axis = validate_axis(axis, a) + level = level if isinstance(level, (list, tuple)) else [level] + op = DataFrameSortIndex( + level=level, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + gpu=a.op.is_gpu(), + ) + sorted_a = op(a) + if inplace: + a.data = sorted_a.data + else: + return sorted_a diff --git a/python/xorbits/_mars/dataframe/sort/sort_values.py b/python/xorbits/_mars/dataframe/sort/sort_values.py new file mode 100644 index 000000000..49174c836 --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/sort_values.py @@ -0,0 +1,387 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import OutputType +from ...serialization.serializables import ListField +from ...tensor.base.sort import _validate_sort_psrs_kinds +from ..core import IndexValue +from ..utils import build_concatenated_rows_frame, parse_index, validate_axis +from .core import DataFrameSortOperand +from .psrs import DataFramePSRSOperandMixin, execute_sort_values + + +class DataFrameSortValues(DataFrameSortOperand, DataFramePSRSOperandMixin): + _op_type_ = OperandDef.SORT_VALUES + + by = ListField("by", default=None) + + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + + @classmethod + def _tile_dataframe(cls, op): + df = build_concatenated_rows_frame(op.inputs[0]) + + if df.chunk_shape[op.axis] == 1: + out_chunks = [] + for chunk in df.chunks: + chunk_op = op.copy().reset_key() + out_chunks.append( + chunk_op.new_chunk( + [chunk], + shape=chunk.shape, + index=chunk.index, + index_value=op.outputs[0].index_value, + columns_value=chunk.columns_value, + dtypes=chunk.dtypes, + ) + ) + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = df.nsplits + kws["chunks"] = out_chunks + return new_op.new_dataframes(op.inputs, **kws) + else: + if op.na_position != "last": # pragma: no cover + raise NotImplementedError("Only support puts NaNs at the end.") + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op, df)) + + @classmethod + def _tile_series(cls, op): + series = op.inputs[0] + if len(series.chunks) == 1: + chunk = series.chunks[0] + chunk_op = op.copy().reset_key() + out_chunks = [ + chunk_op.new_chunk( + series.chunks, + shape=chunk.shape, + index=chunk.index, + index_value=op.outputs[0].index_value, + dtype=chunk.dtype, + name=chunk.name, + ) + ] + new_op = op.copy() + kws = op.outputs[0].params.copy() + kws["nsplits"] = series.nsplits + kws["chunks"] = out_chunks + return new_op.new_seriess(op.inputs, **kws) + else: + if op.na_position != "last": # pragma: no 
cover + raise NotImplementedError("Only support puts NaNs at the end.") + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op, series)) + + @classmethod + def _tile(cls, op): + inp = op.inputs[0] + if inp.shape[op.axis] == 0: + # if the length is zero, return input directly + return inp + if inp.ndim == 2: + return (yield from cls._tile_dataframe(op)) + else: + return (yield from cls._tile_series(op)) + + @classmethod + def execute(cls, ctx, op: "DataFrameSortValues"): + in_data = ctx[op.inputs[0].key] + result = execute_sort_values(in_data, op) + if op.nrows is not None: + result = result.head(op.nrows) + ctx[op.outputs[0].key] = result + + def __call__(self, a): + assert self.axis == 0 + if self.ignore_index: + index_value = parse_index(pd.RangeIndex(a.shape[0])) + else: + if isinstance(a.index_value.value, IndexValue.RangeIndex): + index_value = parse_index(pd.Index([], dtype=np.int64)) + else: + index_value = a.index_value + if a.ndim == 2: + return self.new_dataframe( + [a], + shape=a.shape, + dtypes=a.dtypes, + index_value=index_value, + columns_value=a.columns_value, + ) + else: + return self.new_series( + [a], shape=a.shape, dtype=a.dtype, index_value=index_value, name=a.name + ) + + +def dataframe_sort_values( + df, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + parallel_kind="PSRS", + psrs_kinds=None, +): + """ + Sort by the values along either axis. + + Parameters + ---------- + df : Mars DataFrame + Input dataframe. + by : str + Name or list of names to sort by. + axis : %(axes_single_arg)s, default 0 + Axis to be sorted. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of + the by. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the + end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + parallel_kind : {'PSRS'}, default 'PSRS' + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted values if inplace=False, None otherwise. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame({ + ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], + ... 'col2': [2, 1, 9, 8, 7, 4], + ... 'col3': [0, 1, 9, 4, 2, 3], + ... 
}) + >>> df.execute() + col1 col2 col3 + 0 A 2 0 + 1 A 1 1 + 2 B 9 9 + 3 NaN 8 4 + 4 D 7 2 + 5 C 4 3 + + Sort by col1 + + >>> df.sort_values(by=['col1']).execute() + col1 col2 col3 + 0 A 2 0 + 1 A 1 1 + 2 B 9 9 + 5 C 4 3 + 4 D 7 2 + 3 NaN 8 4 + + Sort by multiple columns + + >>> df.sort_values(by=['col1', 'col2']).execute() + col1 col2 col3 + 1 A 1 1 + 0 A 2 0 + 2 B 9 9 + 5 C 4 3 + 4 D 7 2 + 3 NaN 8 4 + + Sort Descending + + >>> df.sort_values(by='col1', ascending=False).execute() + col1 col2 col3 + 4 D 7 2 + 5 C 4 3 + 2 B 9 9 + 0 A 2 0 + 1 A 1 1 + 3 NaN 8 4 + + Putting NAs first + + >>> df.sort_values(by='col1', ascending=False, na_position='first').execute() + col1 col2 col3 + 3 NaN 8 4 + 4 D 7 2 + 5 C 4 3 + 2 B 9 9 + 0 A 2 0 + 1 A 1 1 + """ + + if na_position not in ["last", "first"]: # pragma: no cover + raise TypeError(f"invalid na_position: {na_position}") + axis = validate_axis(axis, df) + if axis != 0: + raise NotImplementedError("Only support sort on axis 0") + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + by = by if isinstance(by, (list, tuple)) else [by] + if isinstance(ascending, list): # pragma: no cover + if all(ascending): + # all are True, convert to True + ascending = True + elif not any(ascending): + # all are False, convert to False + ascending = False + op = DataFrameSortValues( + by=by, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + gpu=df.op.is_gpu(), + output_types=[OutputType.dataframe], + ) + sorted_df = op(df) + if inplace: + df.data = sorted_df.data + else: + return sorted_df + + +def series_sort_values( + series, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + parallel_kind="PSRS", + psrs_kinds=None, +): + """ + Sort by the values. + + Sort a Series in ascending or descending order by some + criterion. + + Parameters + ---------- + series : input Series. + axis : {0 or 'index'}, default 0 + Axis to direct sorting. The value 'index' is accepted for + compatibility with DataFrame.sort_values. + ascending : bool, default True + If True, sort values in ascending order, otherwise descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort' or 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. 'mergesort' is the only stable algorithm. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series + Series ordered by values. 
+ + Examples + -------- + >>> import mars.dataframe as md + >>> raw = pd.Series([np.nan, 1, 3, 10, 5]) + >>> s = md.Series(raw) + >>> s.execute() + 0 NaN + 1 1.0 + 2 3.0 + 3 10.0 + 4 5.0 + dtype: float64 + + Sort values ascending order (default behaviour) + + >>> s.sort_values(ascending=True).execute() + 1 1.0 + 2 3.0 + 4 5.0 + 3 10.0 + 0 NaN + dtype: float64 + + Sort values descending order + + >>> s.sort_values(ascending=False).execute() + 3 10.0 + 4 5.0 + 2 3.0 + 1 1.0 + 0 NaN + dtype: float64 + + Sort values inplace + + >>> s.sort_values(ascending=False, inplace=True) + >>> s.execute() + 3 10.0 + 4 5.0 + 2 3.0 + 1 1.0 + 0 NaN + dtype: float64 + + Sort values putting NAs first + """ + if na_position not in ["last", "first"]: # pragma: no cover + raise TypeError(f"invalid na_position: {na_position}") + axis = validate_axis(axis, series) + if axis != 0: + raise NotImplementedError("Only support sort on axis 0") + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + op = DataFrameSortValues( + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + output_types=[OutputType.series], + gpu=series.op.is_gpu(), + ) + sorted_series = op(series) + if inplace: + series.data = sorted_series.data + else: + return sorted_series diff --git a/python/xorbits/_mars/dataframe/sort/tests/__init__.py b/python/xorbits/_mars/dataframe/sort/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/sort/tests/test_sort.py b/python/xorbits/_mars/dataframe/sort/tests/test_sort.py new file mode 100644 index 000000000..f66b33cde --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/tests/test_sort.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +from ....core import tile +from ....core.operand import OperandStage +from ...indexing.getitem import DataFrameIndex +from ...initializer import DataFrame +from ..sort_index import DataFrameSortIndex, sort_index +from ..sort_values import DataFrameSortValues, dataframe_sort_values + + +def test_sort_values(): + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [np.random.bytes(10) for _ in range(10)], + "e": [pd.Timestamp(f"201{i}") for i in range(10)], + "f": [pd.Timedelta(f"{i} days") for i in range(10)], + }, + ) + df = DataFrame(raw) + sorted_df = dataframe_sort_values(df, by="c") + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortValues) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 1 + assert isinstance(tiled.chunks[0].op, DataFrameSortValues) + + df = DataFrame(raw, chunk_size=6) + sorted_df = dataframe_sort_values(df, by="c") + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortValues) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 2 + assert tiled.chunks[0].op.stage == OperandStage.reduce + + df = DataFrame(raw, chunk_size=3) + sorted_df = dataframe_sort_values(df, by=["a", "c"]) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortValues) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 3 + assert tiled.chunks[0].op.stage == OperandStage.reduce + pd.testing.assert_series_equal(tiled.chunks[0].dtypes, raw.dtypes) + assert tiled.chunks[1].op.stage == OperandStage.reduce + pd.testing.assert_series_equal(tiled.chunks[1].dtypes, raw.dtypes) + assert tiled.chunks[2].op.stage == OperandStage.reduce + pd.testing.assert_series_equal(tiled.chunks[2].dtypes, raw.dtypes) + + +def test_sort_index(): + raw = pd.DataFrame( + np.random.rand(10, 10), columns=np.random.rand(10), index=np.random.rand(10) + ) + df = DataFrame(raw) + sorted_df = sort_index(df) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 1 + assert isinstance(tiled.chunks[0].op, DataFrameSortIndex) + + df = DataFrame(raw, chunk_size=6) + sorted_df = sort_index(df) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 2 + assert tiled.chunks[0].op.stage == OperandStage.reduce + + df = DataFrame(raw, chunk_size=3) + sorted_df = sort_index(df) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert len(tiled.chunks) == 3 + assert tiled.chunks[0].op.stage == OperandStage.reduce + assert tiled.chunks[1].op.stage == OperandStage.reduce + assert tiled.chunks[2].op.stage == OperandStage.reduce + + # support on axis 1 + df = DataFrame(raw, chunk_size=4) + sorted_df = sort_index(df, axis=1) + + assert sorted_df.shape == raw.shape + assert isinstance(sorted_df.op, DataFrameSortIndex) + + tiled = tile(sorted_df) + + assert all(isinstance(c.op, DataFrameIndex) for c in tiled.chunks) is True diff --git a/python/xorbits/_mars/dataframe/sort/tests/test_sort_execution.py b/python/xorbits/_mars/dataframe/sort/tests/test_sort_execution.py new file mode 100644 index 000000000..8c04fe8de --- /dev/null +++ b/python/xorbits/_mars/dataframe/sort/tests/test_sort_execution.py @@ -0,0 +1,423 @@ +# Copyright 2022-2023 
XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import numpy as np +import pandas as pd +import pytest + +from ....tests.core import require_cudf +from ... import ArrowStringDtype, DataFrame, Series + + +@pytest.mark.parametrize( + "distinct_opt", ["0"] if sys.platform.lower().startswith("win") else ["0", "1"] +) +def test_sort_values_execution(setup, distinct_opt): + ns = np.random.RandomState(0) + os.environ["PSRS_DISTINCT_COL"] = distinct_opt + df = pd.DataFrame(ns.rand(100, 10), columns=["a" + str(i) for i in range(10)]) + + # test one chunk + mdf = DataFrame(df) + result = mdf.sort_values("a0").execute().fetch() + expected = df.sort_values("a0") + + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a6", "a7"], ascending=False).execute().fetch() + expected = df.sort_values(["a6", "a7"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + # test psrs + mdf = DataFrame(df, chunk_size=10) + result = mdf.sort_values("a0").execute().fetch() + expected = df.sort_values("a0") + + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a3", "a4"]).execute().fetch() + expected = df.sort_values(["a3", "a4"]) + + pd.testing.assert_frame_equal(result, expected) + + # test ascending=False + result = mdf.sort_values(["a0", "a1"], ascending=False).execute().fetch() + expected = df.sort_values(["a0", "a1"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a7"], ascending=False).execute().fetch() + expected = df.sort_values(["a7"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + # test ascending is a list + result = ( + mdf.sort_values(["a3", "a4", "a5", "a6"], ascending=[False, True, True, False]) + .execute() + .fetch() + ) + expected = df.sort_values( + ["a3", "a4", "a5", "a6"], ascending=[False, True, True, False] + ) + pd.testing.assert_frame_equal(result, expected) + + in_df = pd.DataFrame( + { + "col1": ns.choice([f"a{i}" for i in range(5)], size=(100,)), + "col2": ns.choice([f"b{i}" for i in range(5)], size=(100,)), + "col3": ns.choice([f"c{i}" for i in range(5)], size=(100,)), + "col4": ns.randint(10, 20, size=(100,)), + } + ) + mdf = DataFrame(in_df, chunk_size=10) + result = ( + mdf.sort_values( + ["col1", "col4", "col3", "col2"], ascending=[False, False, True, False] + ) + .execute() + .fetch() + ) + expected = in_df.sort_values( + ["col1", "col4", "col3", "col2"], ascending=[False, False, True, False] + ) + pd.testing.assert_frame_equal(result, expected) + + # test multiindex + df2 = df.copy(deep=True) + df2.columns = pd.MultiIndex.from_product([list("AB"), list("CDEFG")]) + mdf = DataFrame(df2, chunk_size=5) + + result = mdf.sort_values([("A", "C")]).execute().fetch() + expected = df2.sort_values([("A", "C")]) + + pd.testing.assert_frame_equal(result, expected) + + # test rechunk + mdf = DataFrame(df, chunk_size=3) + result = mdf.sort_values("a0").execute().fetch() + expected = df.sort_values("a0") + + 
pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a3", "a4"]).execute().fetch() + expected = df.sort_values(["a3", "a4"]) + + pd.testing.assert_frame_equal(result, expected) + + # test other types + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [np.random.bytes(10) for _ in range(10)], + "e": [pd.Timestamp(f"201{i}") for i in range(10)], + "f": [pd.Timedelta(f"{i} days") for i in range(10)], + }, + ) + mdf = DataFrame(raw, chunk_size=3) + + for label in raw.columns: + result = mdf.sort_values(label).execute().fetch() + expected = raw.sort_values(label) + pd.testing.assert_frame_equal(result, expected) + + result = mdf.sort_values(["a", "b", "e"], ascending=False).execute().fetch() + expected = raw.sort_values(["a", "b", "e"], ascending=False) + + pd.testing.assert_frame_equal(result, expected) + + # test nan + df = pd.DataFrame( + { + "col1": ["A", "A", "B", "B", "D", "C"], + "col2": [2, 1, 9, np.nan, 7, 4], + "col3": [0, 1, 9, 4, 2, 3], + } + ) + mdf = DataFrame(df) + result = mdf.sort_values(["col2"]).execute().fetch() + expected = df.sort_values(["col2"]) + + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(df, chunk_size=3) + result = mdf.sort_values(["col2"]).execute().fetch() + expected = df.sort_values(["col2"]) + + pd.testing.assert_frame_equal(result, expected) + + # test None (issue #1885) + df = pd.DataFrame(np.random.rand(1000, 10)) + + df[0][df[0] < 0.5] = "A" + df[0][df[0] != "A"] = None + + mdf = DataFrame(df) + result = mdf.sort_values([0, 1]).execute().fetch() + expected = df.sort_values([0, 1]) + + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(df, chunk_size=100) + result = mdf.sort_values([0, 1]).execute().fetch() + expected = df.sort_values([0, 1]) + + pd.testing.assert_frame_equal(result, expected) + + # test ignore_index + df = pd.DataFrame(np.random.rand(10, 3), columns=["a" + str(i) for i in range(3)]) + + mdf = DataFrame(df, chunk_size=3) + result = mdf.sort_values(["a0", "a1"], ignore_index=True).execute().fetch() + try: # for python3.5 + expected = df.sort_values(["a0", "a1"], ignore_index=True) + except TypeError: + expected = df.sort_values(["a0", "a1"]) + expected.index = pd.RangeIndex(len(expected)) + + pd.testing.assert_frame_equal(result, expected) + + # test inplace + mdf = DataFrame(df) + mdf.sort_values("a0", inplace=True) + result = mdf.execute().fetch() + df.sort_values("a0", inplace=True) + + pd.testing.assert_frame_equal(result, df) + + # test unknown shape + df = pd.DataFrame({"a": list(range(10)), "b": np.random.random(10)}) + mdf = DataFrame(df, chunk_size=4) + filtered = mdf[mdf["a"] > 2] + result = filtered.sort_values(by="b").execute().fetch() + + pd.testing.assert_frame_equal(result, df[df["a"] > 2].sort_values(by="b")) + + # test empty dataframe + df = pd.DataFrame({"a": list(range(10)), "b": np.random.random(10)}) + mdf = DataFrame(df, chunk_size=4) + filtered = mdf[mdf["b"] > 100] + result = filtered.sort_values(by="b").execute().fetch() + + pd.testing.assert_frame_equal(result, df[df["b"] > 100].sort_values(by="b")) + + # test chunks with zero length + df = pd.DataFrame({"a": list(range(10)), "b": np.random.random(10)}) + df.iloc[4:8, 1] = 0 + + mdf = DataFrame(df, chunk_size=4) + filtered = mdf[mdf["b"] != 0] + result = filtered.sort_values(by="b").execute().fetch() + + pd.testing.assert_frame_equal(result, df[df["b"] != 0].sort_values(by="b")) + + # test Series.sort_values + raw = 
pd.Series(np.random.rand(10)) + series = Series(raw) + result = series.sort_values().execute().fetch() + expected = raw.sort_values() + + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=3) + result = series.sort_values().execute().fetch() + expected = raw.sort_values() + + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=2) + result = series.sort_values(ascending=False).execute().fetch() + expected = raw.sort_values(ascending=False) + + pd.testing.assert_series_equal(result, expected) + + # test empty series + series = pd.Series(list(range(10)), name="a") + mseries = Series(series, chunk_size=4) + filtered = mseries[mseries > 100] + result = filtered.sort_values().execute().fetch() + + pd.testing.assert_series_equal(result, series[series > 100].sort_values()) + + # test series with None + series = pd.Series(np.arange(1000)) + + series[series < 500] = "A" + series[series != "A"] = None + + mseries = Series(series, chunk_size=100) + result = mseries.sort_values().execute().fetch() + expected = series.sort_values() + pd.testing.assert_series_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + # test for empty input(#GH 2649) + pd_df = pd.DataFrame(np.random.rand(10, 3), columns=["col1", "col2", "col3"]) + df = DataFrame(pd_df, chunk_size=4) + df = df[df["col2"] > 1].execute() + result = df.sort_values(by="col1").execute().fetch() + expected = pd_df[pd_df["col2"] > 1].sort_values(by="col1") + pd.testing.assert_frame_equal(result, expected) + + pd_s = pd.Series(np.random.rand(10)) + s = Series(pd_s, chunk_size=4) + s = s[s > 1].execute() + result = s.sort_values().execute().fetch() + expected = pd_s[pd_s > 1].sort_values() + pd.testing.assert_series_equal(result, expected) + + +def test_sort_index_execution(setup): + raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100)) + + mdf = DataFrame(raw) + result = mdf.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw) + mdf.sort_index(inplace=True) + result = mdf.execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=30) + result = mdf.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=20) + result = mdf.sort_index(ascending=False).execute().fetch() + expected = raw.sort_index(ascending=False) + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=10) + result = mdf.sort_index(ignore_index=True).execute().fetch() + try: # for python3.5 + expected = raw.sort_index(ignore_index=True) + except TypeError: + expected = raw.sort_index() + expected.index = pd.RangeIndex(len(expected)) + pd.testing.assert_frame_equal(result, expected) + + # test axis=1 + raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10)) + + mdf = DataFrame(raw) + result = mdf.sort_index(axis=1).execute().fetch() + expected = raw.sort_index(axis=1) + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=3) + result = mdf.sort_index(axis=1).execute().fetch() + expected = raw.sort_index(axis=1) + pd.testing.assert_frame_equal(result, expected) + + mdf = DataFrame(raw, chunk_size=4) + result = mdf.sort_index(axis=1, ascending=False).execute().fetch() + expected = raw.sort_index(axis=1, ascending=False) + pd.testing.assert_frame_equal(result, 
expected) + + mdf = DataFrame(raw, chunk_size=4) + + result = mdf.sort_index(axis=1, ignore_index=True).execute().fetch() + try: # for python3.5 + expected = raw.sort_index(axis=1, ignore_index=True) + except TypeError: + expected = raw.sort_index(axis=1) + expected.index = pd.RangeIndex(len(expected)) + pd.testing.assert_frame_equal(result, expected) + + # test series + raw = pd.Series(np.random.rand(10), index=np.random.rand(10)) + + series = Series(raw) + result = series.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=2) + result = series.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_series_equal(result, expected) + + series = Series(raw, chunk_size=3) + result = series.sort_index(ascending=False).execute().fetch() + expected = raw.sort_index(ascending=False) + pd.testing.assert_series_equal(result, expected) + + +def test_arrow_string_sort_values(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + {"a": rs.rand(10), "b": [f"s{rs.randint(1000)}" for _ in range(10)]} + ) + raw["b"] = raw["b"].astype(ArrowStringDtype()) + mdf = DataFrame(raw, chunk_size=3) + + df = mdf.sort_values(by="b") + result = df.execute().fetch() + expected = raw.sort_values(by="b") + pd.testing.assert_frame_equal(result, expected) + + +@require_cudf +def test_gpu_execution(setup_gpu): + # test sort_values + rs = np.random.RandomState(0) + distinct_opts = ["0"] if sys.platform.lower().startswith("win") else ["0", "1"] + for add_distinct in distinct_opts: + os.environ["PSRS_DISTINCT_COL"] = add_distinct + + # test dataframe + raw = pd.DataFrame(rs.rand(100, 10), columns=["a" + str(i) for i in range(10)]) + mdf = DataFrame(raw, chunk_size=30).to_gpu() + + result = mdf.sort_values(by="a0").execute().fetch() + expected = raw.sort_values(by="a0") + pd.testing.assert_frame_equal(result.to_pandas(), expected) + + # test series + raw = pd.Series(rs.rand(10)) + series = Series(raw).to_gpu() + + result = series.sort_values().execute().fetch() + expected = raw.sort_values() + pd.testing.assert_series_equal(result.to_pandas(), expected) + + # test DataFrame.sort_index + raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10)) + mdf = DataFrame(raw).to_gpu() + + result = mdf.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_frame_equal(result.to_pandas(), expected) + + # test Series.sort_index + raw = pd.Series( + np.random.rand(10), + index=np.random.rand(10), + ) + series = Series(raw).to_gpu() + + result = series.sort_index().execute().fetch() + expected = raw.sort_index() + pd.testing.assert_series_equal(result.to_pandas(), expected) diff --git a/python/xorbits/_mars/dataframe/statistics/__init__.py b/python/xorbits/_mars/dataframe/statistics/__init__.py new file mode 100644 index 000000000..c994509f9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .quantile import DataFrameQuantile + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + from .corr import df_corr, df_corrwith, series_autocorr, series_corr + from .quantile import quantile_dataframe, quantile_series + + for t in SERIES_TYPE: + t.quantile = quantile_series + t.corr = series_corr + t.autocorr = series_autocorr + + for t in DATAFRAME_TYPE: + t.quantile = quantile_dataframe + t.corr = df_corr + t.corrwith = df_corrwith + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/statistics/corr.py b/python/xorbits/_mars/dataframe/statistics/corr.py new file mode 100644 index 000000000..accf53bd1 --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/corr.py @@ -0,0 +1,423 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import opcodes +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, KeyField +from ...tensor.utils import filter_inputs +from ...utils import has_unknown_shape +from ..core import DATAFRAME_TYPE, SERIES_TYPE +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, parse_index, validate_axis + + +class DataFrameCorr(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.CORR + + other = KeyField("other", default=None) + method = AnyField("method", default=None) + min_periods = Int32Field("min_periods", default=None) + axis = Int32Field("axis", default=None) + drop = BoolField("drop", default=None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + next(inputs_iter) + if isinstance(self.other, ENTITY_TYPE): + self.other = next(inputs_iter) + + def __call__(self, df_or_series): + if isinstance(df_or_series, SERIES_TYPE): + inputs = filter_inputs([df_or_series, self.other]) + return self.new_scalar(inputs, dtype=np.dtype(np.float_)) + else: + + def _filter_numeric(obj): + if not isinstance(obj, DATAFRAME_TYPE): + return obj + num_dtypes = build_empty_df(obj.dtypes)._get_numeric_data().dtypes + if len(num_dtypes) != len(obj.dtypes): + return obj[list(num_dtypes.index)] + return obj + + df_or_series = _filter_numeric(df_or_series) + self.other = _filter_numeric(self.other) + + inputs = filter_inputs([df_or_series, self.other]) + if self.axis is None: + dtypes = pd.Series( + [np.dtype(np.float_)] * len(df_or_series.dtypes), + index=df_or_series.dtypes.index, + ) + return self.new_dataframe( + inputs, + shape=(df_or_series.shape[1],) * 2, + dtypes=dtypes, + index_value=df_or_series.columns_value, + columns_value=df_or_series.columns_value, + ) + else: + new_index_value = df_or_series.axes[1 - self.axis].index_value + if isinstance(self.other, DATAFRAME_TYPE): + align_dtypes = pd.concat( + [self.other.dtypes, df_or_series.dtypes], axis=1 + ) + align_shape = 
(np.nan, align_dtypes.shape[0]) + new_index_value = parse_index(align_dtypes.index) + else: + align_shape = df_or_series.shape + + shape = (np.nan,) if self.drop else (align_shape[1 - self.axis],) + return self.new_series( + inputs, + shape=shape, + dtype=np.dtype(np.float_), + index_value=new_index_value, + ) + + @classmethod + def _tile_single(cls, op: "DataFrameCorr"): + out = op.outputs[0] + + new_op = op.copy().reset_key() + chunk = new_op.new_chunk( + [inp.chunks[0] for inp in op.inputs], + index=(0,) * len(out.shape), + **out.params, + ) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, chunks=[chunk], nsplits=((s,) for s in out.shape), **out.params + ) + + @staticmethod + def _tile_pearson_cross(left, right, min_periods): + left_tensor, right_tensor = ( + left.fillna(0).to_tensor(), + right.fillna(0).to_tensor(), + ) + + nna_left = left.notna().to_tensor().astype(np.float_) + nna_right = right.notna().to_tensor().astype(np.float_) + + sum_left = left_tensor.T.dot(nna_right) + sum_right = right_tensor.T.dot(nna_left) + sum_left2 = (left_tensor.T**2).dot(nna_right) + sum_right2 = (right_tensor.T**2).dot(nna_left) + sum_mul = left_tensor.T.dot(right_tensor) + data_count = nna_left.T.dot(nna_right) + + divisor = np.sqrt(data_count * sum_left2 - sum_left * sum_left).T * np.sqrt( + data_count * sum_right2 - sum_right * sum_right + ) + + result = (data_count * sum_mul - sum_left * sum_right.T) / divisor + if min_periods is not None: + result = np.where(data_count >= min_periods, result, np.nan) + return result + + @classmethod + def _tile_pearson_align(cls, left, right, axis): + if left.ndim == right.ndim: + left, right = yield from recursive_tile(left.align(right)) + else: + left, right = yield from recursive_tile(left.align(right, axis=axis)) + if has_unknown_shape(left, right): + yield left.chunks + right.chunks + [left, right] + + nna_left = left.notna().astype(np.float_) + nna_right = right.notna().astype(np.float_) + + left, right = left.fillna(0), right.fillna(0) + + sum_left = left.mul(nna_right, axis=axis).sum(axis=axis) + sum_right = nna_left.mul(right, axis=axis).sum(axis=axis) + sum_left2 = (left**2).mul(nna_right, axis=axis).sum(axis=axis) + sum_right2 = nna_left.mul(right**2, axis=axis).sum(axis=axis) + sum_mul = left.mul(right, axis=axis).sum(axis=axis) + data_count = nna_left.mul(nna_right, axis=axis).sum(axis=axis) + + divisor = np.sqrt(data_count * sum_left2 - sum_left * sum_left) * np.sqrt( + data_count * sum_right2 - sum_right * sum_right + ) + return (data_count * sum_mul - sum_left * sum_right) / divisor + + @classmethod + def _tile_series(cls, op: "DataFrameCorr"): + left = op.inputs[0] + right = op.other + + _check_supported_methods(op.method) + return [ + ( + yield from recursive_tile( + cls._tile_pearson_cross(left, right, min_periods=op.min_periods) + ) + ) + ] + + @classmethod + def _tile_dataframe_cross(cls, op: "DataFrameCorr"): + from ..initializer import DataFrame as MarsDataFrame + + left = op.inputs[0] + right = op.other if op.other is not None else op.inputs[0] + + _check_supported_methods(op.method) + + result = cls._tile_pearson_cross(left, right, min_periods=op.min_periods) + result = MarsDataFrame( + result, index=left.dtypes.index, columns=right.dtypes.index + ) + return [(yield from recursive_tile(result))] + + @classmethod + def _tile_dataframe_align(cls, op: "DataFrameCorr"): + left = op.inputs[0] + right = op.other + + _check_supported_methods(op.method) + result = yield from cls._tile_pearson_align(left, right, 
axis=op.axis) + if op.drop: + result = result.dropna(axis=op.axis) + return [(yield from recursive_tile(result))] + + @classmethod + def tile(cls, op: "DataFrameCorr"): + inp = op.inputs[0] + if len(inp.chunks) == 1 and (op.other is None or len(op.other.chunks) == 1): + return cls._tile_single(op) + elif isinstance(inp, SERIES_TYPE): + return (yield from cls._tile_series(op)) + elif op.axis is None: + return (yield from cls._tile_dataframe_cross(op)) + else: + return (yield from cls._tile_dataframe_align(op)) + + @classmethod + def execute(cls, ctx, op: "DataFrameCorr"): + inp = op.inputs[0] + out = op.outputs[0] + inp_data = ctx[inp.key] + + if inp.ndim == 1: + ctx[out.key] = inp_data.corr( + ctx[op.other.key], method=op.method, min_periods=op.min_periods + ) + elif op.axis is None: + ctx[out.key] = inp_data.corr(method=op.method, min_periods=op.min_periods) + else: + ctx[out.key] = inp_data.corrwith( + ctx[op.other.key], method=op.method, axis=op.axis, drop=op.drop + ) + + +def _check_supported_methods(method): + if method != "pearson": + raise NotImplementedError(f"Correlation method {method!r} not supported") + + +def df_corr(df, method="pearson", min_periods=1): + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. Note that the returned matrix from corr + will have 1 along the diagonals and will be symmetric + regardless of the callable's behavior. + + .. note:: + kendall, spearman and callables not supported on multiple chunks yet. + + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. Currently only available for Pearson + and Spearman correlation. + + Returns + ------- + DataFrame + Correlation matrix. + + See Also + -------- + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Series.corr : Compute the correlation between two Series. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], + ... columns=['dogs', 'cats']) + >>> df.corr(method='pearson').execute() + dogs cats + dogs 1.000000 -0.851064 + cats -0.851064 1.000000 + """ + op = DataFrameCorr(method=method, min_periods=min_periods) + return op(df) + + +def df_corrwith(df, other, axis=0, drop=False, method="pearson"): + """ + Compute pairwise correlation. + + Pairwise correlation is computed between rows or columns of + DataFrame with rows or columns of Series or DataFrame. DataFrames + are first aligned along both axes before computing the + correlations. + + Parameters + ---------- + other : DataFrame, Series + Object with which to compute correlations. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for + row-wise. + drop : bool, default False + Drop missing indices from result. + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. + + .. 
note::
+            kendall, spearman and callables not supported on multiple chunks yet.
+
+    Returns
+    -------
+    Series
+        Pairwise correlations.
+
+    See Also
+    --------
+    DataFrame.corr : Compute pairwise correlation of columns.
+    """
+    axis = validate_axis(axis, df)
+    if drop:
+        # TODO implement with df.align(method='inner')
+        raise NotImplementedError("drop=True not implemented")
+    op = DataFrameCorr(other=other, method=method, axis=axis, drop=drop)
+    return op(df)
+
+
+def series_corr(series, other, method="pearson", min_periods=None):
+    """
+    Compute correlation with `other` Series, excluding missing values.
+
+    Parameters
+    ----------
+    other : Series
+        Series with which to compute the correlation.
+    method : {'pearson', 'kendall', 'spearman'} or callable
+        Method used to compute correlation:
+
+        - pearson : Standard correlation coefficient
+        - kendall : Kendall Tau correlation coefficient
+        - spearman : Spearman rank correlation
+        - callable: Callable with input two 1d ndarrays and returning a float.
+
+        .. note::
+            kendall, spearman and callables not supported on multiple chunks yet.
+
+    min_periods : int, optional
+        Minimum number of observations needed to have a valid result.
+
+    Returns
+    -------
+    float
+        Correlation with other.
+
+    See Also
+    --------
+    DataFrame.corr : Compute pairwise correlation between columns.
+    DataFrame.corrwith : Compute pairwise correlation with another
+        DataFrame or Series.
+
+    Examples
+    --------
+    >>> import mars.dataframe as md
+    >>> s1 = md.Series([.2, .0, .6, .2])
+    >>> s2 = md.Series([.3, .6, .0, .1])
+    >>> s1.corr(s2, method='pearson').execute()
+    -0.8510644963469898
+    """
+    op = DataFrameCorr(other=other, method=method, min_periods=min_periods)
+    return op(series)
+
+
+def series_autocorr(series, lag=1):
+    """
+    Compute the lag-N autocorrelation.
+
+    This method computes the Pearson correlation between
+    the Series and its shifted self.
+
+    Parameters
+    ----------
+    lag : int, default 1
+        Number of lags to apply before performing autocorrelation.
+
+    Returns
+    -------
+    float
+        The Pearson correlation between self and self.shift(lag).
+
+    See Also
+    --------
+    Series.corr : Compute the correlation between two Series.
+    Series.shift : Shift index by desired number of periods.
+    DataFrame.corr : Compute pairwise correlation of columns.
+    DataFrame.corrwith : Compute pairwise correlation between rows or
+        columns of two DataFrame objects.
+
+    Notes
+    -----
+    If the Pearson correlation is not well defined, return 'NaN'.
+
+    Examples
+    --------
+    >>> import mars.dataframe as md
+    >>> s = md.Series([0.25, 0.5, 0.2, -0.05])
+    >>> s.autocorr().execute()  # doctest: +ELLIPSIS
+    0.10355...
+    >>> s.autocorr(lag=2).execute()  # doctest: +ELLIPSIS
+    -0.99999...
+
+    If the Pearson correlation is not well defined, then 'NaN' is returned.
+
+    >>> s = md.Series([1, 0, 0, 0])
+    >>> s.autocorr().execute()
+    nan
+    """
+    op = DataFrameCorr(other=series.shift(lag), method="pearson")
+    return op(series)
diff --git a/python/xorbits/_mars/dataframe/statistics/quantile.py b/python/xorbits/_mars/dataframe/statistics/quantile.py
new file mode 100644
index 000000000..4caa0fc26
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/statistics/quantile.py
@@ -0,0 +1,483 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + DataTypeField, + Int32Field, + KeyField, + StringField, +) +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ...tensor.datasource import empty +from ...tensor.datasource import from_dataframe as tensor_from_dataframe +from ...tensor.datasource import from_series as tensor_from_series +from ...tensor.datasource import tensor as astensor +from ...tensor.statistics.quantile import quantile as tensor_quantile +from ..core import DATAFRAME_TYPE +from ..datasource.from_tensor import dataframe_from_tensor, series_from_tensor +from ..initializer import DataFrame as create_df +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_empty_df, find_common_type, parse_index, validate_axis + + +class DataFrameQuantile(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = OperandDef.QUANTILE + + _input = KeyField("input") + _q = AnyField("q") + _axis = Int32Field("axis") + _numeric_only = BoolField("numeric_only") + _interpolation = StringField("interpolation") + + _dtype = DataTypeField("dtype") + + def __init__( + self, + q=None, + interpolation=None, + axis=None, + numeric_only=None, + dtype=None, + gpu=None, + output_types=None, + **kw + ): + super().__init__( + _q=q, + _interpolation=interpolation, + _axis=axis, + _numeric_only=numeric_only, + _dtype=dtype, + _output_types=output_types, + gpu=gpu, + **kw + ) + + @property + def input(self): + return self._input + + @property + def q(self): + return self._q + + @property + def interpolation(self): + return self._interpolation + + @property + def axis(self): + return self._axis + + @property + def numeric_only(self): + return self._numeric_only + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if isinstance(self._q, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._q = self._inputs[-1] + + def _calc_dtype_on_axis_1(self, a, dtypes): + quantile_dtypes = [] + for name in dtypes.index: + dt = tensor_quantile( + tensor_from_series(a[name]), + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + quantile_dtypes.append(dt) + return find_common_type(quantile_dtypes) + + def _call_dataframe(self, a, inputs): + if self._numeric_only: + empty_df = build_empty_df(a.dtypes) + dtypes = empty_df._get_numeric_data().dtypes + else: + dtypes = a.dtypes + if isinstance(self._q, TENSOR_TYPE): + q_val = self._q + pd_index = pd.Index([], dtype=q_val.dtype) + name = None + store_index_value = False + else: + q_val = np.asanyarray(self._q) + pd_index = pd.Index(q_val) + name = self._q if q_val.size == 1 else None + store_index_value = True + tokenize_objects = (a, q_val, self._interpolation, type(self).__name__) + + if q_val.ndim == 0 and self._axis == 0: + index_value = parse_index(dtypes.index, store_data=store_index_value) + shape = (len(dtypes),) + # calc dtype + dtype = 
self._calc_dtype_on_axis_1(a, dtypes) + return self.new_series( + inputs, + shape=shape, + dtype=dtype, + index_value=index_value, + name=name or dtypes.index.name, + ) + elif q_val.ndim == 0 and self._axis == 1: + index_value = a.index_value + shape = (len(a),) + # calc dtype + dt = tensor_quantile( + empty(a.shape[1], dtype=find_common_type(list(dtypes))), + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + return self.new_series( + inputs, + shape=shape, + dtype=dt, + index_value=index_value, + name=name or index_value.name, + ) + elif q_val.ndim == 1 and self._axis == 0: + shape = (len(q_val), len(dtypes)) + index_value = parse_index( + pd_index, *tokenize_objects, store_data=store_index_value + ) + dtype_list = [] + for name in dtypes.index: + dtype_list.append( + tensor_quantile( + tensor_from_series(a[name]), + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + ) + dtypes = pd.Series(dtype_list, index=dtypes.index) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index(dtypes.index, store_data=True), + ) + else: + assert q_val.ndim == 1 and self._axis == 1 + shape = (len(q_val), a.shape[0]) + index_value = parse_index( + pd_index, *tokenize_objects, store_data=store_index_value + ) + pd_columns = a.index_value.to_pandas() + dtype_list = np.full(len(pd_columns), self._calc_dtype_on_axis_1(a, dtypes)) + dtypes = pd.Series(dtype_list, index=pd_columns) + return self.new_dataframe( + inputs, + shape=shape, + dtypes=dtypes, + index_value=index_value, + columns_value=parse_index( + dtypes.index, store_data=True, key=a.index_value.key + ), + ) + + def _call_series(self, a, inputs): + if isinstance(self._q, TENSOR_TYPE): + q_val = self._q + index_val = pd.Index([], dtype=q_val.dtype) + store_index_value = False + else: + q_val = np.asanyarray(self._q) + index_val = pd.Index(q_val) + store_index_value = True + + # get dtype by tensor + a_t = astensor(a) + self._dtype = dtype = tensor_quantile( + a_t, + self._q, + interpolation=self._interpolation, + handle_non_numeric=not self._numeric_only, + ).dtype + + if q_val.ndim == 0: + return self.new_scalar(inputs, dtype=dtype) + else: + return self.new_series( + inputs, + shape=q_val.shape, + dtype=dtype, + index_value=parse_index( + index_val, + a, + q_val, + self._interpolation, + type(self).__name__, + store_data=store_index_value, + ), + name=a.name, + ) + + def __call__(self, a, q_input=None): + inputs = [a] + if q_input is not None: + inputs.append(q_input) + if isinstance(a, DATAFRAME_TYPE): + return self._call_dataframe(a, inputs) + else: + return self._call_series(a, inputs) + + @classmethod + def _tile_dataframe(cls, op): + from ...tensor.merge.stack import TensorStack + + df = op.outputs[0] + if df.ndim == 1: + if op.axis == 0: + ts = [] + for name in df.index_value.to_pandas(): + a = tensor_from_series(op.input[name]) + t = tensor_quantile( + a, + op.q, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + ts.append(t) + try: + dtype = np.result_type(*[it.dtype for it in ts]) + except TypeError: + dtype = np.dtype(object) + stack_op = TensorStack(axis=0, dtype=dtype) + tr = stack_op(ts) + r = series_from_tensor( + tr, index=df.index_value.to_pandas(), name=ts[0].op.q.item() + ) + else: + assert op.axis == 1 + empty_df = build_empty_df(op.input.dtypes) + fields = empty_df._get_numeric_data().columns.tolist() + t = 
tensor_from_dataframe(op.input[fields]) + tr = tensor_quantile( + t, + op.q, + axis=1, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + r = series_from_tensor(tr, index=op.input.index, name=tr.op.q.item()) + else: + assert df.ndim == 2 + if op.axis == 0: + d = OrderedDict() + for name in df.dtypes.index: + a = tensor_from_series(op.input[name]) + t = tensor_quantile( + a, + op.q, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + d[name] = t + r = create_df(d, index=op.q) + else: + assert op.axis == 1 + empty_df = build_empty_df(op.input.dtypes) + fields = empty_df._get_numeric_data().columns.tolist() + t = tensor_from_dataframe(op.input[fields]) + tr = tensor_quantile( + t, + op.q, + axis=1, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + if not op.input.index_value.has_value(): + raise NotImplementedError + # TODO(xuye.qin): use index=op.input.index when we support DataFrame.index + r = dataframe_from_tensor( + tr, index=op.q, columns=op.input.index_value.to_pandas() + ) + + return (yield from recursive_tile(r)) + + @classmethod + def _tile_series(cls, op): + a = tensor_from_series(op.input) + t = tensor_quantile( + a, + op.q, + interpolation=op.interpolation, + handle_non_numeric=not op.numeric_only, + ) + if isinstance(op.outputs[0], TENSOR_TYPE): + r = t + else: + r = series_from_tensor(t, index=op.q, name=op.outputs[0].name) + r = yield from recursive_tile(r) + return [r] + + @classmethod + def tile(cls, op): + if isinstance(op.input, DATAFRAME_TYPE): + tiled = yield from cls._tile_dataframe(op) + else: + tiled = yield from cls._tile_series(op) + return tiled + + +def quantile_series(series, q=0.5, interpolation="linear"): + """ + Return value at the given quantile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + 0 <= q <= 1, the quantile(s) to compute. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + + Returns + ------- + float or Series + If ``q`` is an array or a tensor, a Series will be returned where the + index is ``q`` and the values are the quantiles, otherwise + a float will be returned. + + See Also + -------- + core.window.Rolling.quantile + numpy.percentile + + Examples + -------- + >>> import mars.dataframe as md + >>> s = md.Series([1, 2, 3, 4]) + >>> s.quantile(.5).execute() + 2.5 + >>> s.quantile([.25, .5, .75]).execute() + 0.25 1.75 + 0.50 2.50 + 0.75 3.25 + dtype: float64 + """ + + if isinstance(q, ENTITY_TYPE): + q = astensor(q) + q_input = q + else: + q_input = None + + op = DataFrameQuantile(q=q, interpolation=interpolation, gpu=series.op.gpu) + return op(series, q_input=q_input) + + +def quantile_dataframe(df, q=0.5, axis=0, numeric_only=True, interpolation="linear"): + """ + Return values at the given quantile over requested axis. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value between 0 <= q <= 1, the quantile(s) to compute. + axis : {0, 1, 'index', 'columns'} (default 0) + Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 
+ numeric_only : bool, default True + If False, the quantile of datetime and timedelta data will be + computed as well. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + + Returns + ------- + Series or DataFrame + If ``q`` is an array or a tensor, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + + See Also + -------- + core.window.Rolling.quantile: Rolling quantile. + numpy.percentile: Numpy function to compute the percentile. + + Examples + -------- + >>> import mars.dataframe as md + >>> df = md.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1).execute() + a 1.3 + b 3.7 + Name: 0.1, dtype: float64 + + >>> df.quantile([.1, .5]).execute() + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + Specifying `numeric_only=False` will also compute the quantile of + datetime and timedelta data. + + >>> df = md.DataFrame({'A': [1, 2], + ... 'B': [md.Timestamp('2010'), + ... md.Timestamp('2011')], + ... 'C': [md.Timedelta('1 days'), + ... md.Timedelta('2 days')]}) + >>> df.quantile(0.5, numeric_only=False).execute() + A 1.5 + B 2010-07-02 12:00:00 + C 1 days 12:00:00 + Name: 0.5, dtype: object + """ + if isinstance(q, ENTITY_TYPE): + q = astensor(q) + q_input = q + else: + q_input = None + axis = validate_axis(axis, df) + + op = DataFrameQuantile( + q=q, + interpolation=interpolation, + axis=axis, + numeric_only=numeric_only, + gpu=df.op.gpu, + ) + return op(df, q_input=q_input) diff --git a/python/xorbits/_mars/dataframe/statistics/tests/__init__.py b/python/xorbits/_mars/dataframe/statistics/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/statistics/tests/test_statistics.py b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics.py new file mode 100644 index 000000000..d183f09ce --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ....core import tile +from ....tensor import Tensor +from ...core import DataFrame, Series +from ...datasource.dataframe import from_pandas as df_from_pandas +from ...datasource.series import from_pandas as series_from_pandas + + +def test_series_quantile(): + raw = pd.Series(np.random.rand(10)) + s = series_from_pandas(raw, chunk_size=3) + + r = s.quantile() + assert isinstance(r, Tensor) + tile(r) + + s = series_from_pandas(raw, chunk_size=3) + + r = s.quantile([0.3, 0.7]) + assert isinstance(r, Series) + assert r.shape == (2,) + pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.Index([0.3, 0.7])) + tile(r) + + +def test_dataframe_quantile(): + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": [np.random.bytes(5) for _ in range(10)], + } + ) + s = df_from_pandas(raw, chunk_size=7) + + # q = 0.3, axis = 0 + r = s.quantile(0.3) + e = raw.quantile(0.3) + assert isinstance(r, Series) + assert r.shape == (2,) + assert r.dtype == e.dtype + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + + tile(r) + + # q = 0.3, axis = 1 + r = s.quantile(0.3, axis=1) + e = raw.quantile(0.3, axis=1) + assert isinstance(r, Series) + assert r.shape == e.shape + assert r.dtype == e.dtype + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + + tile(r) + + # q = [0.3, 0.7], axis = 0 + r = s.quantile([0.3, 0.7]) + e = raw.quantile([0.3, 0.7]) + assert isinstance(r, DataFrame) + assert r.shape == e.shape + pd.testing.assert_series_equal(r.dtypes, e.dtypes) + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), e.columns) + + tile(r) + + # q = [0.3, 0.7], axis = 1 + r = s.quantile([0.3, 0.7], axis=1) + e = raw.quantile([0.3, 0.7], axis=1) + assert isinstance(r, DataFrame) + assert r.shape == e.shape + pd.testing.assert_series_equal(r.dtypes, e.dtypes) + pd.testing.assert_index_equal(r.index_value.to_pandas(), e.index) + pd.testing.assert_index_equal(r.columns_value.to_pandas(), e.columns) + + tile(r) diff --git a/python/xorbits/_mars/dataframe/statistics/tests/test_statistics_execution.py b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics_execution.py new file mode 100644 index 000000000..c6a6bc02b --- /dev/null +++ b/python/xorbits/_mars/dataframe/statistics/tests/test_statistics_execution.py @@ -0,0 +1,262 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
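+
+# An illustrative sketch of the pattern the execution tests below follow:
+# build a chunked Mars object from raw pandas data, compute the same
+# statistic eagerly with pandas, then execute, fetch and compare. The
+# `setup` fixture used by each test is assumed to come from the shared
+# test configuration.
+#
+#     raw = pd.Series(np.random.rand(10))
+#     s = Series(raw, chunk_size=3)                  # Mars series in 3-row chunks
+#     result = s.quantile(0.3).execute().fetch()     # distributed result
+#     assert np.isclose(result, raw.quantile(0.3))   # compare against pandas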
+ +import numpy as np +import pandas as pd +import pytest + +from ....tensor import tensor +from ... import DataFrame, Series + + +def test_series_quantile_execution(setup): + raw = pd.Series(np.random.rand(10), name="a") + a = Series(raw, chunk_size=3) + + # q = 0.5, scalar + r = a.quantile() + result = r.execute().fetch() + expected = raw.quantile() + + assert result == expected + + # q is a list + r = a.quantile([0.3, 0.7]) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_series_equal(result, expected) + + # test interpolation + r = a.quantile([0.3, 0.7], interpolation="midpoint") + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7], interpolation="midpoint") + + pd.testing.assert_series_equal(result, expected) + + q = tensor([0.3, 0.7]) + + # q is a tensor + r = a.quantile(q) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_series_equal(result, expected) + + +def test_dataframe_quantile_execution(setup): + raw = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [np.random.bytes(10) for _ in range(10)], + "e": [pd.Timestamp(f"201{i}") for i in range(10)], + "f": [pd.Timedelta(f"{i} days") for i in range(10)], + }, + index=pd.RangeIndex(1, 11), + ) + df = DataFrame(raw, chunk_size=3) + + # q = 0.5, axis = 0, series + r = df.quantile() + result = r.execute().fetch() + expected = raw.quantile() + + pd.testing.assert_series_equal(result, expected) + + # q = 0.5, axis = 1, series + r = df.quantile(axis=1) + result = r.execute().fetch() + expected = raw.quantile(axis=1) + + pd.testing.assert_series_equal(result, expected) + + # q is a list, axis = 0, dataframe + r = df.quantile([0.3, 0.7]) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_frame_equal(result, expected) + + # q is a list, axis = 1, dataframe + r = df.quantile([0.3, 0.7], axis=1) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7], axis=1) + + pd.testing.assert_frame_equal(result, expected) + + # test interpolation + r = df.quantile([0.3, 0.7], interpolation="midpoint") + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7], interpolation="midpoint") + + pd.testing.assert_frame_equal(result, expected) + + q = tensor([0.3, 0.7]) + + # q is a tensor + r = df.quantile(q) + result = r.execute().fetch() + expected = raw.quantile([0.3, 0.7]) + + pd.testing.assert_frame_equal(result, expected) + + # test numeric_only + raw2 = pd.DataFrame( + { + "a": np.random.rand(10), + "b": np.random.randint(1000, size=10), + "c": np.random.rand(10), + "d": [pd.Timestamp(f"201{i}") for i in range(10)], + }, + index=pd.RangeIndex(1, 11), + ) + df2 = DataFrame(raw2, chunk_size=3) + + r = df2.quantile([0.3, 0.7], numeric_only=False) + result = r.execute().fetch() + expected = raw2.quantile([0.3, 0.7], numeric_only=False) + + pd.testing.assert_frame_equal(result, expected) + + r = df2.quantile(numeric_only=False) + result = r.execute().fetch() + expected = raw2.quantile(numeric_only=False) + + pd.testing.assert_series_equal(result, expected) + + +def test_dataframe_corr(setup): + rs = np.random.RandomState(0) + raw = rs.rand(20, 10) + raw = pd.DataFrame(np.where(raw > 0.4, raw, np.nan), columns=list("ABCDEFGHIJ")) + raw["k"] = pd.Series(["aaa"] * 20) + + df = DataFrame(raw) + + result = df.corr() + pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr()) + + result = df.corr(method="kendall") + 
pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr(method="kendall")) + + df = DataFrame(raw, chunk_size=6) + + with pytest.raises(Exception): + df.corr(method="kendall").execute() + + result = df.corr() + pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr()) + + result = df.corr(min_periods=7) + pd.testing.assert_frame_equal(result.execute().fetch(), raw.corr(min_periods=7)) + + +@pytest.mark.skip_ray_dag # https://github.com/mars-project/mars/issues/3247 +def test_dataframe_corr_with(setup): + rs = np.random.RandomState(0) + raw_df = rs.rand(20, 10) + raw_df = pd.DataFrame( + np.where(raw_df > 0.4, raw_df, np.nan), columns=list("ABCDEFGHIJ") + ) + raw_df2 = rs.rand(20, 10) + raw_df2 = pd.DataFrame( + np.where(raw_df2 > 0.4, raw_df2, np.nan), columns=list("ACDEGHIJKL") + ) + raw_s = rs.rand(20) + raw_s = pd.Series(np.where(raw_s > 0.4, raw_s, np.nan)) + raw_s2 = rs.rand(10) + raw_s2 = pd.Series(np.where(raw_s2 > 0.4, raw_s2, np.nan), index=raw_df2.columns) + + df = DataFrame(raw_df) + df2 = DataFrame(raw_df2) + + result = df.corrwith(df2) + pd.testing.assert_series_equal(result.execute().fetch(), raw_df.corrwith(raw_df2)) + + result = df.corrwith(df2, axis=1) + pd.testing.assert_series_equal( + result.execute().fetch(), raw_df.corrwith(raw_df2, axis=1) + ) + + result = df.corrwith(df2, method="kendall") + pd.testing.assert_series_equal( + result.execute().fetch(), raw_df.corrwith(raw_df2, method="kendall") + ) + + df = DataFrame(raw_df, chunk_size=4) + df2 = DataFrame(raw_df2, chunk_size=6) + s = Series(raw_s, chunk_size=5) + s2 = Series(raw_s2, chunk_size=5) + + with pytest.raises(Exception): + df.corrwith(df2, method="kendall").execute() + + result = df.corrwith(df2) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), raw_df.corrwith(raw_df2).sort_index() + ) + + result = df.corrwith(df2, axis=1) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), + raw_df.corrwith(raw_df2, axis=1).sort_index(), + ) + + result = df.corrwith(s) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), raw_df.corrwith(raw_s).sort_index() + ) + + result = df.corrwith(s2, axis=1) + pd.testing.assert_series_equal( + result.execute().fetch().sort_index(), + raw_df.corrwith(raw_s2, axis=1).sort_index(), + ) + + +def test_series_corr(setup): + rs = np.random.RandomState(0) + raw = rs.rand(20) + raw = pd.Series(np.where(raw > 0.4, raw, np.nan)) + raw2 = rs.rand(20) + raw2 = pd.Series(np.where(raw2 > 0.4, raw2, np.nan)) + + s = Series(raw) + s2 = Series(raw2) + + result = s.corr(s2) + assert result.execute().fetch() == raw.corr(raw2) + + result = s.corr(s2, method="kendall") + assert result.execute().fetch() == raw.corr(raw2, method="kendall") + + result = s.autocorr(2) + assert result.execute().fetch() == raw.autocorr(2) + + s = Series(raw, chunk_size=6) + s2 = Series(raw2, chunk_size=4) + + with pytest.raises(Exception): + s.corr(s2, method="kendall").execute() + + result = s.corr(s2) + assert pytest.approx(result.execute().fetch()) == raw.corr(raw2) + + result = s.corr(s2, min_periods=7) + assert pytest.approx(result.execute().fetch()) == raw.corr(raw2, min_periods=7) + + result = s.autocorr(2) + assert pytest.approx(result.execute().fetch()) == raw.autocorr(2) diff --git a/python/xorbits/_mars/dataframe/tests/__init__.py b/python/xorbits/_mars/dataframe/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/tests/test_arrays.py b/python/xorbits/_mars/dataframe/tests/test_arrays.py new file mode 100644 index 000000000..fdd521054 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_arrays.py @@ -0,0 +1,482 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: + pa = None + +from ...config import option_context +from ...core import enter_mode +from .. import ArrowListArray, ArrowListDtype, ArrowStringArray, ArrowStringDtype +from ..arrays import _use_bool_any_all +from ..utils import arrow_table_to_pandas_dataframe + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_dtype(): + s = pa.array(["a", "b"]) + assert list(ArrowStringDtype().__from_arrow__(s)) == list(ArrowStringArray(s)) + + assert ArrowStringDtype() == ArrowStringDtype.construct_from_string("Arrow[string]") + + assert ArrowListDtype( + ArrowListDtype("string") + ) == ArrowListDtype.construct_from_string("Arrow[List[string]]") + + assert repr(ArrowListDtype(np.int8)) == "Arrow[List[int8]]" + + with pytest.raises(TypeError): + ArrowListDtype.construct_from_string("Arrow[string]") + + assert ArrowListDtype.is_dtype("Arrow[List[uint8]]") is True + assert ArrowListDtype.is_dtype("List[int8]") is False + assert ArrowListDtype.is_dtype(ArrowStringDtype()) is False + + assert ArrowListDtype(np.int8) != ArrowStringDtype() + assert ArrowListDtype(np.int8).kind == np.dtype(object).kind + + assert ArrowListDtype(np.int8).arrow_type == pa.list_(pa.int8()) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_string_array_creation(): + # create from pandas Series + series = pd.Series(["a", "bc", "de"]) + array = ArrowStringArray(series) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + if pd.__version__ >= "1.0.0": + # test create from StringArray which occurs in pandas 1.0 + s = pd.arrays.StringArray(np.array(["a", "bc", "de"], dtype=object)) + array = ArrowStringArray(s) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from list + lst = ["a", "bc", "de"] + array = ArrowStringArray(lst) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from pyarrow Array + a = pa.array(["a", "bc", "de"]) + array = ArrowStringArray(a) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # 
create from ArrowStringArray + array2 = ArrowStringArray(array) + assert isinstance(array2._arrow_array, pa.ChunkedArray) + + # test copy + arrow_array = array2._arrow_array + array3 = ArrowStringArray(arrow_array, copy=True) + assert array3._arrow_array is not arrow_array + + # test from_scalars + array = ArrowStringArray.from_scalars([1, 2]) + assert isinstance(array._arrow_array, pa.ChunkedArray) + assert isinstance(array._arrow_array.chunks[0], pa.StringArray) + + # test _from_sequence + array = ArrowStringArray._from_sequence(["a", "b", "cc"]) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # test _from_sequence_of_strings + array = ArrowStringArray._from_sequence_of_strings(["a", "b"]) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_list_array_creation(): + # create from pandas Series + series = pd.Series([["a", "b"], ["c"], ["d", "e"]]) + array = ArrowListArray(series) + assert isinstance(array.dtype, ArrowListDtype) + assert isinstance(array.dtype.value_type, ArrowStringDtype) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from list + lst = [["a"], ["b", "c"], ["d", "e"]] + array = ArrowListArray(lst) + assert isinstance(array.dtype, ArrowListDtype) + assert isinstance(array.dtype.value_type, ArrowStringDtype) + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from pyarrow Array + a = pa.array([[1.0], [2.0, 3.0], [4.0]]) + array = ArrowListArray(a) + assert isinstance(array.dtype, ArrowListDtype) + assert array.dtype.value_type == np.float64 + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # create from ArrowListArray + array2 = ArrowListArray(array) + assert isinstance(array2._arrow_array, pa.ChunkedArray) + + # test _from_sequence + array = ArrowListArray._from_sequence([[1, 2], [3, 4], [5]]) + assert isinstance(array.dtype, ArrowListDtype) + assert array.dtype.value_type == np.int64 + assert isinstance(array._arrow_array, pa.ChunkedArray) + + # test pandas_only + with option_context({"dataframe.arrow_array.pandas_only": True}): + array = ArrowListArray._from_sequence([[1, 2], [3, 4], [5]]) + assert isinstance(array.dtype, ArrowListDtype) + assert isinstance(array._ndarray, np.ndarray) + + # test pandas_only and in kernel mode + with enter_mode(kernel=True), option_context( + {"dataframe.arrow_array.pandas_only": True} + ), pytest.raises(ImportError): + ArrowListArray._from_sequence([[1, 2], [3, 4], [5]]) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_string_array_functions(): + lst = np.array(["abc", "de", "eee", "中文"], dtype=object) + # leverage string array to get the right answer + string_array = pd.arrays.StringArray(lst) + has_na_arrow_array = ArrowStringArray(["abc", None, "eee", "中文"]) + has_na_string_array = pd.arrays.StringArray( + np.array(["abc", pd.NA, "eee", "中文"], dtype=object) + ) + + for pandas_only in [False, True]: + with option_context({"dataframe.arrow_array.pandas_only": pandas_only}): + arrow_array = ArrowStringArray(lst) + + # getitem, scalar + assert arrow_array[1] == string_array[1] + assert arrow_array[-1] == string_array[-1] + # getitem, slice + assert list(arrow_array[:2]) == list(string_array[:2]) + assert list(arrow_array[1:-1]) == list(string_array[1:-1]) + assert list(arrow_array[::2]) == list(string_array[::2]) + # getitem, boolean index + cond = np.array([len(c) > 2 for c in lst]) + assert list(arrow_array[cond]) == list(string_array[cond]) + # 
getitem, fancy index + selection = [3, 1, 2] + assert list(arrow_array[selection]) == list(string_array[selection]) + selection = [3, -1, 2, -4] + assert list(arrow_array[selection]) == list(string_array[selection]) + selection = np.array([3, -1, 2, -4]) + assert list(arrow_array[selection]) == list(string_array[selection]) + + # setitem + arrow_array2 = arrow_array.copy() + string_array2 = string_array.copy() + arrow_array2[0] = "ss" + string_array2[0] = "ss" + assert list(arrow_array2) == list(string_array2) + arrow_array2[1:3] = ["ss1", "ss2"] + string_array2[1:3] = ["ss1", "ss2"] + assert list(arrow_array2) == list(string_array2) + arrow_array2[1:3] = arrow_array2[2:4] + string_array2[1:3] = string_array2[2:4] + assert list(arrow_array2) == list(string_array2) + arrow_array2[2:] = pd.Series(["ss3", "ss4"]) + string_array2[2:] = pd.Series(["ss3", "ss4"]) + assert list(arrow_array2) == list(string_array2) + with pytest.raises(ValueError): + arrow_array2[0] = ["a", "b"] + arrow_array2[-1] = None + string_array2[-1] = None + assert list(arrow_array2)[:-1] == list(string_array2)[:-1] + assert pd.isna(list(arrow_array2)[-1]) is True + with pytest.raises(ValueError): + arrow_array2[0] = 2 + with pytest.raises(ValueError): + arrow_array2[:2] = [1, 2] + + # test to_numpy + np.testing.assert_array_equal( + arrow_array.to_numpy(), string_array.to_numpy() + ) + np.testing.assert_array_equal( + arrow_array.to_numpy(copy=True), string_array.to_numpy(copy=True) + ) + np.testing.assert_array_equal( + has_na_arrow_array.to_numpy(copy=True, na_value="ss"), + has_na_string_array.to_numpy(copy=True, na_value="ss"), + ) + + # test fillna + arrow_array3 = has_na_arrow_array.fillna("filled") + string_array3 = has_na_string_array.fillna("filled") + assert list(arrow_array3) == list(string_array3) + + # test astype + arrow_array4 = ArrowStringArray(["1", "10", "100"]) + # leverage string array to get the right answer + string_array4 = pd.arrays.StringArray( + np.array(["1", "10", "100"], dtype=object) + ) + np.testing.assert_array_equal( + arrow_array4.astype(np.int64), string_array4.astype(np.int64) + ) + np.testing.assert_almost_equal( + arrow_array4.astype(float), string_array4.astype(float) + ) + assert list(arrow_array4.astype(ArrowStringDtype(), copy=False)) == list( + string_array4.astype(pd.StringDtype(), copy=False) + ) + assert list(arrow_array4.astype(ArrowStringDtype(), copy=True)) == list( + string_array4.astype(pd.StringDtype(), copy=True) + ) + + # test factorize + codes, unique = arrow_array.factorize() + codes2, unique2 = string_array.factorize() + assert list(codes) == list(codes2) + assert list(unique) == list(unique2) + + # test nbytes + assert arrow_array.nbytes < pd.Series( + string_array.astype(object) + ).memory_usage(deep=True, index=False) + + # test memory_usage + if pandas_only: + assert arrow_array.memory_usage(deep=False) == pd.Series( + string_array + ).memory_usage(index=False) + else: + assert arrow_array.memory_usage(deep=True) == arrow_array.nbytes + + # test unique + assert arrow_array.unique() == pd.Series(string_array).unique() + arrow_array2 = arrow_array.copy() + arrow_array2._force_use_pandas = True + assert arrow_array2.unique() == pd.Series(string_array).unique() + + # test isna + np.testing.assert_array_equal( + has_na_arrow_array.isna(), has_na_string_array.isna() + ) + has_na_arrow_array2 = has_na_arrow_array.copy() + has_na_arrow_array2._force_use_pandas = True + np.testing.assert_array_equal( + has_na_arrow_array2.isna(), has_na_string_array.isna() + ) + + # test 
take + assert list(arrow_array.take([1, 2, -1])) == list( + string_array.take([1, 2, -1]) + ) + assert list( + arrow_array.take([1, 2, -1], allow_fill=True).fillna("aa") + ) == list(string_array.take([1, 2, -1], allow_fill=True).fillna("aa")) + assert list( + arrow_array.take([1, 2, -1], allow_fill=True, fill_value="aa") + ) == list(string_array.take([1, 2, -1], allow_fill=True, fill_value="aa")) + + # test shift + assert list(arrow_array.shift(2, fill_value="aa")) == list( + string_array.shift(2, fill_value="aa") + ) + + # test value_counts + assert list(arrow_array.value_counts()) == list(string_array.value_counts()) + assert list(has_na_arrow_array.value_counts(dropna=True)) == list( + has_na_string_array.value_counts(dropna=True) + ) + + # test all any + assert arrow_array.all() == string_array.all() + assert arrow_array.any() == string_array.any() + + # test arithmetic + assert list(arrow_array + "s") == list(string_array + "s") + assert list((arrow_array + has_na_arrow_array).fillna("ss")) == list( + (string_array + has_na_string_array).fillna("ss") + ) + + # test comparison + np.testing.assert_array_equal(arrow_array < "s", string_array < "s") + pd.testing.assert_series_equal( + pd.Series(arrow_array < has_na_arrow_array), + pd.Series(string_array < has_na_string_array), + ) + + # test repr + assert "ArrowStringArray" in repr(arrow_array) + + # test concat empty + arrow_array5 = ArrowStringArray(pa.chunked_array([], type=pa.string())) + concatenated = ArrowStringArray._concat_same_type( + [arrow_array5, arrow_array5] + ) + if not pandas_only: + assert len(concatenated._arrow_array.chunks) == 1 + pd.testing.assert_series_equal( + pd.Series(arrow_array5), pd.Series(concatenated) + ) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_arrow_list_functions(): + lst = np.array([["a, bc"], ["de"], ["e", "ee"], ["中文", "中文2"]], dtype=object) + has_na_lst = lst.copy() + has_na_lst[1] = None + + for pandas_only in [False, True]: + with option_context({"dataframe.arrow_array.pandas_only": pandas_only}): + arrow_array = ArrowListArray(lst) + has_na_arrow_array = ArrowListArray(has_na_lst) + + # getitem, scalar + assert arrow_array[1] == lst[1] + assert list(arrow_array[-1]) == lst[-1] + # getitem, slice + np.testing.assert_array_equal(arrow_array[:2].to_numpy(), lst[:2]) + + # setitem + arrow_array2 = arrow_array.copy() + lst2 = lst.copy() + for s in [["ss"], pd.Series(["ss"])]: + arrow_array2[0] = s + lst2[0] = ["ss"] + np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2) + arrow_array2[0] = None + lst2[0] = None + np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2) + with pytest.raises(ValueError): + # must set list like object + arrow_array2[0] = "ss" + + # test to_numpy + np.testing.assert_array_equal(arrow_array.to_numpy(), lst) + np.testing.assert_array_equal(arrow_array.to_numpy(copy=True), lst) + np.testing.assert_array_equal( + has_na_arrow_array.to_numpy(na_value=1), + pd.Series(has_na_lst).fillna(1).to_numpy(), + ) + + # test fillna + if not pandas_only: + arrow_array3 = has_na_arrow_array.fillna(lst[1]) + np.testing.assert_array_equal(arrow_array3.to_numpy(), lst) + + # test astype + with pytest.raises(TypeError): + arrow_array.astype(np.int64) + with pytest.raises(TypeError): + arrow_array.astype(ArrowListDtype(np.int64)) + arrow_array4 = ArrowListArray([[1, 2], [3]]) + expected = np.array([["1", "2"], ["3"]], dtype=object) + np.testing.assert_array_equal( + arrow_array4.astype(ArrowListDtype(str)), expected + ) + 
np.testing.assert_array_equal( + arrow_array4.astype(ArrowListDtype(arrow_array4.dtype)), arrow_array4 + ) + np.testing.assert_array_equal( + arrow_array4.astype(ArrowListDtype(arrow_array4.dtype), copy=False), + arrow_array4, + ) + + # test nbytes + assert arrow_array.nbytes < pd.Series(lst).memory_usage(deep=True) + + # test memory_usage + if not pandas_only: + assert arrow_array.memory_usage(deep=True) == arrow_array.nbytes + + # test isna + np.testing.assert_array_equal( + has_na_arrow_array.isna(), pd.Series(has_na_lst).isna() + ) + + # test take + assert list(arrow_array.take([1, 2, -1])) == list( + pd.Series(lst).take([1, 2, -1]) + ) + + # test shift + assert ( + list(arrow_array.shift(2, fill_value=["aa"])) + == [["aa"]] * 2 + lst[:-2].tolist() + ) + + # test all any + if _use_bool_any_all: + assert arrow_array.all() == pd.array(lst).all() + assert arrow_array.any() == pd.array(lst).any() + else: + assert arrow_array.all() == lst.all() + assert arrow_array.any() == lst.any() + + # test repr + assert "ArrowListArray" in repr(arrow_array) + + # test concat empty + arrow_array5 = ArrowListArray( + pa.chunked_array([], type=pa.list_(pa.string())) + ) + concatenated = ArrowListArray._concat_same_type( + [arrow_array5, arrow_array5] + ) + if not pandas_only: + assert len(concatenated._arrow_array.chunks) == 1 + pd.testing.assert_series_equal( + pd.Series(arrow_array5), pd.Series(concatenated) + ) + + +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") +def test_to_pandas(): + rs = np.random.RandomState(0) + df = pd.DataFrame( + { + "a": rs.rand(100), + "b": ["s" + str(i) for i in rs.randint(100, size=100)], + "c": [["ss0" + str(i), "ss1" + str(i)] for i in rs.randint(100, size=100)], + } + ) + + batch_size = 15 + n_batch = len(df) // 15 + 1 + batches = [ + pa.RecordBatch.from_pandas(df[i * batch_size : (i + 1) * batch_size]) + for i in range(n_batch) + ] + table = pa.Table.from_batches(batches) + + df1 = arrow_table_to_pandas_dataframe(table, use_arrow_dtype=False) + assert df1.dtypes.iloc[1] == np.dtype("O") + assert df1.dtypes.iloc[2] == np.dtype("O") + + df2 = arrow_table_to_pandas_dataframe(table) + assert df2.dtypes.iloc[1] == ArrowStringDtype() + assert df2.dtypes.iloc[2] == ArrowListDtype(str) + assert df2.memory_usage(deep=True).sum() < df.memory_usage(deep=True).sum() + + # test df method + df4 = df2.groupby("b").sum() + df4.index = df4.index.astype(object) + expected = df.groupby("b").sum() + pd.testing.assert_frame_equal(df4, expected) + + s = ("s" + df2["b"]).astype("string") + expected = ("s" + df["b"]).astype("string") + pd.testing.assert_series_equal(s, expected) + + s2 = df2["b"].str[:2] + expected = df["b"].astype("string").str[:2] + pd.testing.assert_series_equal(s2, expected) diff --git a/python/xorbits/_mars/dataframe/tests/test_core.py b/python/xorbits/_mars/dataframe/tests/test_core.py new file mode 100644 index 000000000..53c01789c --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_core.py @@ -0,0 +1,394 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ...core import tile +from ...lib.groupby_wrapper import wrapped_groupby +from ...utils import pd_release_version +from .. import cut +from ..initializer import DataFrame, Index, Series + +_with_inclusive_bounds = pd_release_version >= (1, 3, 0) + + +def test_dataframe_params(): + raw = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame(raw) + df = df[df["a"] < 2] + df = tile(df) + c = df.chunks[0] + + assert any(np.isnan(s) for s in c.params["shape"]) + assert np.isnan(c.params["index_value"].min_val) + c.params = c.get_params_from_data(raw[raw["a"] < 2]) + # shape and index_value updated + assert not any(np.isnan(s) for s in c.params["shape"]) + assert not np.isnan(c.params["index_value"].min_val) + + params = c.params.copy() + params.pop("index", None) + df.params = params + assert np.prod(df.shape) > 0 + df.refresh_params() + + +def test_series_params(): + raw = pd.Series([1, 2, 3], name="a") + series = Series(raw) + series = series[series < 2] + series = tile(series) + c = series.chunks[0] + + assert series.T is series + + assert any(np.isnan(s) for s in c.params["shape"]) + assert np.isnan(c.params["index_value"].min_val) + c.params = c.get_params_from_data(raw[raw < 2]) + # shape and index_value updated + assert not any(np.isnan(s) for s in c.params["shape"]) + assert not np.isnan(c.params["index_value"].min_val) + + params = c.params.copy() + params.pop("index", None) + series.params = params + assert np.prod(series.shape) > 0 + series.refresh_params() + + +def test_index_params(): + raw = pd.Series([1, 2, 3], name="a") + raw.index.name = "b" + series = Series(raw) + series = series[series < 2] + index = series.index + index = tile(index) + c = index.chunks[0] + + assert index.T is index + + assert any(np.isnan(s) for s in c.params["shape"]) + assert np.isnan(c.params["index_value"].min_val) + c.params = c.get_params_from_data(raw[raw < 2].index) + # shape and index_value updated + assert not any(np.isnan(s) for s in c.params["shape"]) + assert not np.isnan(c.params["index_value"].min_val) + + params = c.params.copy() + params.pop("index", None) + index.params = params + assert np.prod(index.shape) > 0 + index.refresh_params() + + +def test_categorical_params(): + raw = np.random.rand(10) + cate = cut(raw, [0.3, 0.5, 0.7]) + cate = tile(cate) + c = cate.chunks[0] + + c.params = c.get_params_from_data(pd.cut(raw, [0.3, 0.5, 0.7])) + assert len(c.params["categories_value"].to_pandas()) > 0 + + params = c.params.copy() + params.pop("index", None) + cate.params = params + assert len(cate.params["categories_value"].to_pandas()) > 0 + cate.refresh_params() + + +def test_groupby_params(): + raw = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame(raw) + grouped = df.groupby("a") + grouped = tile(grouped) + c = grouped.chunks[0] + + c.params = c.get_params_from_data(wrapped_groupby(raw, by="a")) + params = c.params.copy() + params.pop("index", None) + grouped.params = params + + raw = pd.Series([1, 2, 3], name="a") + series = Series(raw) + grouped = series.groupby(level=0) + grouped = tile(grouped) + c = grouped.chunks[0] + + c.params = c.get_params_from_data(wrapped_groupby(raw, level=0)) + params = c.params.copy() + params.pop("index", None) + grouped.params = params + grouped.refresh_params() + + +def test_dataframe_dir(): + df = DataFrame(pd.DataFrame(np.random.rand(4, 3), columns=list("ABC"))) + dir_result = set(dir(df)) + 
for c in df.dtypes.index: + assert c in dir_result + + +def test_to_frame_or_series(setup): + raw = pd.Series(np.random.rand(10), name="col") + series = Series(raw) + + r = series.to_frame() + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(), result) + + r = series.to_frame(name="new_name") + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(name="new_name"), result) + + series = series[series > 0.1] + r = series.to_frame(name="new_name") + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw[raw > 0.1].to_frame(name="new_name"), result) + + raw = pd.Index(np.random.rand(10), name="col") + index = Index(raw) + + r = index.to_frame() + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(), result) + + r = index.to_frame(index=False) + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(index=False), result) + + r = index.to_frame(name="new_name") + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(name="new_name"), result) + + r = index.to_series() + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(), result) + + r = index.to_series(index=pd.RangeIndex(0, 10)) + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(index=pd.RangeIndex(0, 10)), result) + + r = index.to_series(name="new_name") + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(name="new_name"), result) + + raw = pd.MultiIndex.from_tuples([("A", "E"), ("B", "F"), ("C", "G")]) + index = Index(raw, tupleize_cols=True) + + r = index.to_frame() + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(), result) + + with pytest.raises(TypeError): + index.to_frame(name="XY") + + with pytest.raises(ValueError): + index.to_frame(name=["X", "Y", "Z"]) + + r = index.to_frame(name=["X", "Y"]) + result = r.execute().fetch() + pd.testing.assert_frame_equal(raw.to_frame(name=["X", "Y"]), result) + + r = index.to_series(name="new_name") + result = r.execute().fetch() + pd.testing.assert_series_equal(raw.to_series(name="new_name"), result) + + +def test_to_frame_or_series_apply(setup): + df1 = DataFrame(pd.DataFrame([[0, 1], [2, 3]], columns=["col1", "col2"])) + df2 = df1.append(DataFrame(pd.DataFrame(columns=["col1", "col2"]))) + pd_df2 = df2.apply( + lambda row: pd.Series([1, 2], index=["c", "d"]), axis=1 + ).to_pandas() + assert pd_df2.columns.tolist() == ["c", "d"] + + def f(df): + df["col3"] = df["col2"] + return df + + pd_df3 = df2.groupby(["col1"]).apply(f).to_pandas() + assert pd_df3.columns.tolist() == ["col1", "col2", "col3"] + + pd_df4 = df2.map_chunk( + lambda chunk_df: chunk_df.apply( + lambda row: pd.Series([1, 2], index=["c", "d"]), axis=1 + ) + ).to_pandas() + assert pd_df4.columns.tolist() == ["c", "d"] + + ser1 = Series(pd.Series(data={"a": 1, "b": 2, "c": 3}, index=["a", "b", "c"])) + ser2 = ser1.append(Series(pd.Series(dtype=np.int64))) + pd_ser2 = ser2.apply(lambda v: str(v)).execute() + assert pd_ser2.dtype == object + + ser3 = ser2.map_chunk( + lambda chunk_series: chunk_series.apply(lambda x: float(x)) + ).execute() + + def check_dtype(s): + assert s.dtypes == np.float64 + return s + + ser3.map_chunk(check_dtype).execute() + + +def test_assign(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"A": rs.rand(10), "B": rs.rand(10)}) + + df = DataFrame(raw, chunk_size=5) + result = df.assign(C=df.B / df.A).execute().fetch() + expected = raw.assign(C=raw.B / raw.A) + 
pd.testing.assert_frame_equal(result, expected) + + # lambda syntax + result = df.assign(C=lambda x: x.B / x.A).execute().fetch() + expected = raw.assign(C=lambda x: x.B / x.A) + pd.testing.assert_frame_equal(result, expected) + + # Non-Series array-like + row_list = rs.rand(10).tolist() + result = df.assign(C=row_list).execute().fetch() + expected = raw.assign(C=row_list) + pd.testing.assert_frame_equal(result, expected) + + # multiple + row_list = rs.rand(10).tolist() + result = df.assign(C=row_list, D=df.A, E=lambda x: x.B) + result["C"] = result["C"].astype("int64") + expected = raw.assign(C=row_list, D=raw.A, E=lambda x: x.B) + expected["C"] = expected["C"].astype("int64") + pd.testing.assert_frame_equal(result.execute().fetch(), expected) + + +def test_key_value(setup): + raw = pd.DataFrame(np.random.rand(4, 3), columns=list("ABC")) + df = DataFrame(raw) + + result = df.values.execute().fetch() + np.testing.assert_array_equal(result, raw.values) + + result = df.keys().execute().fetch() + pd.testing.assert_index_equal(result, raw.keys()) + + raw = pd.Series(np.random.rand(10)) + s = Series(raw) + + result = s.values.execute().fetch() + np.testing.assert_array_equal(result, raw.values) + + result = s.keys().execute().fetch() + pd.testing.assert_index_equal(result, raw.keys()) + + raw = pd.Index(np.random.rand(10)) + idx = Index(raw) + + result = idx.values.execute().fetch() + np.testing.assert_array_equal(result, raw.values) + + +@pytest.mark.pd_compat +def test_between(setup): + pd_series = pd.Series(pd.date_range("1/1/2000", periods=10)) + pd_left, pd_right = pd_series[3], pd_series[7] + series = Series(pd_series, chunk_size=5) + left, right = series.iloc[3], series.iloc[7] + + result = series.between(left, right).execute().fetch() + expected = pd_series.between(pd_left, pd_right) + pd.testing.assert_series_equal(result, expected) + + if _with_inclusive_bounds: + result = series.between(left, right, inclusive="both").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="both") + pd.testing.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="left").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="left") + pd.testing.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="right").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="right") + pd.testing.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="neither").execute().fetch() + expected = pd_series.between(pd_left, pd_right, inclusive="neither") + pd.testing.assert_series_equal(result, expected) + + with pytest.raises(ValueError): + series = Series(pd.date_range("1/1/2000", periods=10), chunk_size=5) + series.between(left, right, inclusive="yes").execute().fetch() + + # test_between_datetime_values + pd_series = pd.Series(pd.bdate_range("1/1/2000", periods=20).astype(object)) + pd_series[::2] = np.nan + + series = Series(pd_series, chunk_size=5) + result = series[series.between(series[3], series[17])].execute().fetch() + expected = pd_series[3:18].dropna() + pd.testing.assert_series_equal(result, expected) + + result = ( + series[series.between(series[3], series[17], inclusive="neither")] + .execute() + .fetch() + ) + expected = pd_series[5:16].dropna() + pd.testing.assert_series_equal(result, expected) + + # test_between_period_values + pd_series = pd.Series(pd.period_range("2000-01-01", periods=10, freq="D")) + pd_left, pd_right = 
pd_series[2], pd_series[7] + + series = Series(pd_series, chunk_size=5) + left, right = series[2], series[7] + + result = series.between(left, right).execute().fetch() + expected = pd_series.between(pd_left, pd_right) + pd.testing.assert_series_equal(result, expected) + + +def test_series_median(setup): + raw = pd.Series(np.random.rand(10), name="col") + series = Series(raw) + + r = series.median() + result = r.execute().fetch() + assert np.isclose(raw.median(), result) + + raw = pd.Series(np.random.rand(100), name="col") + series = Series(raw) + + r = series.median() + result = r.execute().fetch() + assert np.isclose(raw.median(), result) + + raw = pd.Series(np.random.rand(10), name="col") + raw[np.random.randint(0, 10)] = None + series = Series(raw) + + r = series.median() + result = r.execute().fetch() + assert np.isclose(raw.median(), result) + + raw = pd.Series(np.random.rand(10), name="col") + raw[np.random.randint(0, 10)] = None + series = Series(raw) + + r = series.median(skipna=False) + result = r.execute().fetch() + assert np.isnan(raw.median(skipna=False)) and np.isnan(result) diff --git a/python/xorbits/_mars/dataframe/tests/test_initializer.py b/python/xorbits/_mars/dataframe/tests/test_initializer.py new file mode 100644 index 000000000..2117acc40 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_initializer.py @@ -0,0 +1,209 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ... import dataframe as md +from ... 
import tensor as mt +from ...tests.core import require_cudf, require_cupy +from ...utils import lazy_import + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") + + +def test_dataframe_initializer(setup): + # from tensor + raw = np.random.rand(100, 10) + tensor = mt.tensor(raw, chunk_size=7) + r = md.DataFrame(tensor) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw)) + + r = md.DataFrame(tensor, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw)) + + # from Mars dataframe + raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ")) + df = md.DataFrame(raw, chunk_size=15) * 2 + r = md.DataFrame(df, num_partitions=11) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, raw * 2) + + # from tileable dict + raw_dict = { + "C": np.random.choice(["u", "v", "w"], size=(100,)), + "A": pd.Series(np.random.rand(100)), + "B": np.random.randint(0, 10, size=(100,)), + } + m_dict = raw_dict.copy() + m_dict["A"] = md.Series(m_dict["A"]) + m_dict["B"] = mt.tensor(m_dict["B"]) + r = md.DataFrame(m_dict, columns=list("ABC")) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw_dict, columns=list("ABC"))) + + r = md.DataFrame({"a": [mt.tensor([1, 2, 3]).sum() + 1]}) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame({"a": [7]})) + + # from tileable list + raw_list = [ + np.random.choice(["u", "v", "w"], size=(3,)), + pd.Series(np.random.rand(3)), + np.random.randint(0, 10, size=(3,)), + ] + m_list = raw_list.copy() + m_list[1] = md.Series(m_list[1]) + m_list[2] = mt.tensor(m_list[2]) + r = md.DataFrame(m_list, columns=list("ABC")) + result = r.execute(extra_config={"check_dtypes": False}).fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw_list, columns=list("ABC"))) + + # from raw pandas initializer + raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ")) + r = md.DataFrame(raw, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, raw) + + # from mars series + raw_s = np.random.rand(100) + s = md.Series(raw_s, chunk_size=20) + r = md.DataFrame(s, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result, pd.DataFrame(raw_s)) + + # test check instance + r = r * 2 + assert isinstance(r, md.DataFrame) + + +@require_cudf +@require_cupy +def test_dataframe_gpu_initializer(setup_gpu): + # from raw cudf initializer + raw = cudf.DataFrame(cupy.random.rand(100, 10), columns=list("ABCDEFGHIJ")) + r = md.DataFrame(raw, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_frame_equal(result.to_pandas(), raw.to_pandas()) + + raw = cupy.random.rand(100, 10) + r = md.DataFrame(raw, columns=list("ABCDEFGHIJ"), chunk_size=13) + result = r.execute().fetch() + expected = cudf.DataFrame(raw, columns=list("ABCDEFGHIJ")) + pd.testing.assert_frame_equal(result.to_pandas(), expected.to_pandas()) + + +def test_series_initializer(setup): + # from tensor + raw = np.random.rand(100) + tensor = mt.tensor(raw, chunk_size=7) + r = md.Series(tensor) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.Series(raw)) + + r = md.Series(tensor, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, pd.Series(raw)) + + # from index + raw = np.arange(100) + np.random.shuffle(raw) + raw = pd.Index(raw, name="idx_name") + idx = md.Index(raw, chunk_size=7) + r = md.Series(idx) + result = 
r.execute().fetch() + pd.testing.assert_series_equal(result, pd.Series(raw)) + + # from Mars series + raw = pd.Series(np.random.rand(100), name="series_name") + ms = md.Series(raw, chunk_size=15) * 2 + r = md.Series(ms, num_partitions=11) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, raw * 2) + + # from raw pandas initializer + raw = pd.Series(np.random.rand(100), name="series_name") + r = md.Series(raw, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_series_equal(result, raw) + + # test check instance + r = r * 2 + assert isinstance(r, md.Series) + + +@require_cudf +@require_cupy +def test_series_gpu_initializer(setup_gpu): + # from raw cudf initializer + raw = cudf.Series(cupy.random.rand(100), name="a") + r = md.Series(raw, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_series_equal(result.to_pandas(), raw.to_pandas()) + + raw = cupy.random.rand(100) + r = md.Series(raw, name="a", chunk_size=13) + result = r.execute().fetch() + expected = cudf.Series(raw, name="a") + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_index_initializer(setup): + # from tensor + raw = np.arange(100) + np.random.shuffle(raw) + tensor = mt.tensor(raw) + r = md.Index(tensor, chunk_size=7) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw)) + + # from Mars index + raw = np.arange(100) + np.random.shuffle(raw) + idx = md.Index(raw, chunk_size=7) + r = md.Index(idx, num_partitions=11) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw)) + + # from pandas initializer + raw = np.arange(100) + np.random.shuffle(raw) + raw_ser = pd.Series(raw, name="series_name") + r = md.Index(raw_ser, chunk_size=7) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw_ser)) + + raw_idx = pd.Index(raw, name="idx_name") + r = md.Index(raw_idx, num_partitions=10) + result = r.execute().fetch() + pd.testing.assert_index_equal(result, pd.Index(raw_idx)) + + +@require_cudf +@require_cupy +def test_index_gpu_initializer(setup_gpu): + # from raw cudf initializer + raw = cudf.Index(cupy.random.rand(100), name="a") + r = md.Index(raw, chunk_size=13) + result = r.execute().fetch() + pd.testing.assert_index_equal(result.to_pandas(), raw.to_pandas()) + + raw = cupy.random.rand(100) + r = md.Index(raw, name="a", chunk_size=13) + result = r.execute().fetch() + expected = cudf.Index(raw, name="a") + pd.testing.assert_index_equal(result.to_pandas(), expected.to_pandas()) diff --git a/python/xorbits/_mars/dataframe/tests/test_utils.py b/python/xorbits/_mars/dataframe/tests/test_utils.py new file mode 100644 index 000000000..58df53090 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tests/test_utils.py @@ -0,0 +1,689 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
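+
+# An illustrative sketch of the invariant the chunk-size tests below assert,
+# using the same inputs as the first case: the concrete split values are up
+# to the planner, so only the per-axis sums are checked.
+#
+#     memory_usage = pd.Series([8, 22.2, 4, 2, 11.2], index=list("abcde"))
+#     nsplit = decide_dataframe_chunk_sizes((10, 5), None, memory_usage)
+#     # one tuple of split sizes per axis; each axis must sum back to the shape
+#     assert tuple(sum(ns) for ns in nsplit) == (10, 5)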
+ +import operator +from collections import OrderedDict +from numbers import Integral +from typing import Dict, List + +import numpy as np +import pandas as pd +import pytest + +from ...config import option_context +from ...core import tile +from ...utils import Timer +from ..core import IndexValue +from ..initializer import DataFrame, Index, Series +from ..utils import ( + auto_merge_chunks, + build_concatenated_rows_frame, + build_split_idx_to_origin_idx, + decide_dataframe_chunk_sizes, + decide_series_chunk_size, + fetch_corner_data, + filter_index_value, + infer_dtypes, + infer_index_value, + make_dtypes, + merge_index_value, + parse_index, + split_monotonic_index_min_max, + validate_axis, + whether_to_clean_up, +) + + +def test_decide_dataframe_chunks(): + with option_context() as options: + options.chunk_store_limit = 64 + + memory_usage = pd.Series([8, 22.2, 4, 2, 11.2], index=list("abcde")) + + shape = (10, 5) + nsplit = decide_dataframe_chunk_sizes(shape, None, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, {0: 4}, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (2, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (10, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + options.chunk_store_limit = 20 + + shape = (10, 5) + nsplit = decide_dataframe_chunk_sizes(shape, None, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, {1: 3}, memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (2, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + nsplit = decide_dataframe_chunk_sizes(shape, (10, 3), memory_usage) + for ns in nsplit: + assert all(isinstance(i, Integral) for i in ns) is True + assert shape == tuple(sum(ns) for ns in nsplit) + + +def test_decide_series_chunks(): + with option_context() as options: + options.chunk_store_limit = 64 + + s = pd.Series(np.empty(50, dtype=np.int64)) + nsplit = decide_series_chunk_size( + s.shape, None, s.memory_usage(index=False, deep=True) + ) + assert len(nsplit) == 1 + assert sum(nsplit[0]) == 50 + assert nsplit[0][0] == 8 + + +def test_parse_index(): + index = pd.Index([], dtype=np.int64) + parsed_index = parse_index(index) + assert isinstance(parsed_index.value, IndexValue.Int64Index) + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + index = pd.Index([1, 2], dtype=np.int64) + parsed_index = parse_index(index) # not parse data + assert isinstance(parsed_index.value, IndexValue.Int64Index) + with pytest.raises(AssertionError): + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + parsed_index = parse_index(index, store_data=True) # parse data + assert isinstance(parsed_index.value, IndexValue.Int64Index) + 
pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + index = pd.RangeIndex(0, 10, 3) + parsed_index = parse_index(index) + assert isinstance(parsed_index.value, IndexValue.RangeIndex) + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + index = pd.MultiIndex.from_arrays([[0, 1], ["a", "b"], ["X", "Y"]]) + parsed_index = parse_index(index) # not parse data + assert isinstance(parsed_index.value, IndexValue.MultiIndex) + with pytest.raises(AssertionError): + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + parsed_index = parse_index(index, store_data=True) # parse data + assert isinstance(parsed_index.value, IndexValue.MultiIndex) + pd.testing.assert_index_equal(index, parsed_index.to_pandas()) + + +def test_split_monotonic_index_min_max(): + left_min_max = [[0, True, 3, True], [3, False, 5, False]] + right_min_max = [[1, False, 3, True], [4, False, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + assert right_splits == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + left_splits, right_splits = split_monotonic_index_min_max( + right_min_max, False, left_min_max, False + ) + assert list(reversed(left_splits)) == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + assert list(reversed(right_splits)) == [ + [(0, True, 1, True), (1, False, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + ] + + left_min_max = [[2, True, 4, True], [8, True, 9, False]] + right_min_max = [[1, False, 3, True], [4, False, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)], + [(4, False, 6, True), (8, True, 9, False)], + ] + assert right_splits == [ + [(1, False, 2, False), (2, True, 3, True)], + [(3, False, 4, True), (4, False, 6, True), (8, True, 9, False)], + ] + + left_min_max = [ + [1, False, 3, True], + [4, False, 6, True], + [10, True, 12, False], + [13, True, 14, False], + ] + right_min_max = [[2, True, 4, True], [5, True, 7, False]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(1, False, 2, False), (2, True, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + [(6, False, 7, False), (10, True, 12, False)], + [(13, True, 14, False)], + ] + assert right_splits == [ + [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)], + [ + (4, False, 5, False), + (5, True, 6, True), + (6, False, 7, False), + (10, True, 12, False), + (13, True, 14, False), + ], + ] + left_splits, right_splits = split_monotonic_index_min_max( + right_min_max, True, left_min_max, True + ) + assert left_splits == [ + [(1, False, 2, False), (2, True, 3, True), (3, False, 4, True)], + [ + (4, False, 5, False), + (5, True, 6, True), + (6, False, 7, False), + (10, True, 12, False), + (13, True, 14, False), + ], + ] + assert right_splits == [ + [(1, False, 2, False), (2, True, 3, True)], + [(3, False, 4, True), (4, False, 5, False), (5, True, 6, True)], + [(6, False, 7, False), (10, True, 12, False)], + [(13, True, 14, False)], + ] + + # left 
min_max like ([.., .., 4 True], [4, False, ..., ...] + # right min_max like ([..., ..., 4 False], [4, True, ..., ...] + left_min_max = [[1, False, 4, True], [4, False, 6, True]] + right_min_max = [[1, False, 4, False], [4, True, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [ + [(1, False, 4, False), (4, True, 4, True)], + [(4, False, 6, True)], + ] + assert right_splits == [ + [(1, False, 4, False)], + [(4, True, 4, True), (4, False, 6, True)], + ] + + # identical index + left_min_max = [[1, False, 3, True], [4, False, 6, True]] + right_min_max = [[1, False, 3, True], [4, False, 6, True]] + left_splits, right_splits = split_monotonic_index_min_max( + left_min_max, True, right_min_max, True + ) + assert left_splits == [[tuple(it)] for it in left_min_max] + assert right_splits == [[tuple(it)] for it in left_min_max] + + +def test_build_split_idx_to_origin_idx(): + splits = [[(1, False, 2, False), (2, True, 3, True)], [(5, False, 6, True)]] + res = build_split_idx_to_origin_idx(splits) + + assert res == {0: (0, 0), 1: (0, 1), 2: (1, 0)} + + splits = [[(5, False, 6, True)], [(1, False, 2, False), (2, True, 3, True)]] + res = build_split_idx_to_origin_idx(splits, increase=False) + + assert res == {0: (1, 0), 1: (1, 1), 2: (0, 0)} + + +def test_filter_index_value(): + pd_index = pd.RangeIndex(10) + index_value = parse_index(pd_index) + + min_max = (0, True, 9, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist() + ) + + min_max = (0, False, 9, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 0) & (pd_index < 9)].tolist() + ) + + pd_index = pd.RangeIndex(1, 11, 3) + index_value = parse_index(pd_index) + + min_max = (2, True, 10, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist() + ) + + min_max = (2, False, 10, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 2) & (pd_index < 10)].tolist() + ) + + pd_index = pd.RangeIndex(9, -1, -1) + index_value = parse_index(pd_index) + + min_max = (0, True, 9, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist() + ) + + min_max = (0, False, 9, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 0) & (pd_index < 9)].tolist() + ) + + pd_index = pd.RangeIndex(10, 0, -3) + index_value = parse_index(pd_index, store_data=False) + + min_max = (2, True, 10, True) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist() + ) + + min_max = (2, False, 10, False) + assert ( + filter_index_value(index_value, min_max).to_pandas().tolist() + == pd_index[(pd_index > 2) & (pd_index < 10)].tolist() + ) + + pd_index = pd.Index([0, 3, 8], dtype=np.int64) + index_value = parse_index(pd_index, store_data=True) + + min_max = (2, True, 8, False) + assert ( + filter_index_value(index_value, min_max, store_data=True).to_pandas().tolist() + == pd_index[(pd_index >= 2) & (pd_index < 8)].tolist() + ) + + index_value = parse_index(pd_index) + + min_max = (2, True, 8, False) + filtered = filter_index_value(index_value, min_max) + assert len(filtered.to_pandas().tolist()) == 0 + assert 
isinstance(filtered.value, IndexValue.Int64Index) + + +def test_merge_index_value(): + with Timer() as timer: + index_values = {i: parse_index(pd.RangeIndex(1e7)) for i in range(20)} + index_value = merge_index_value(index_values) + pd.testing.assert_index_equal( + index_value.to_pandas(), pd.Index([], dtype=np.int64) + ) + assert index_value.min_val == 0 + assert index_value.max_val == 1e7 - 1 + + # range indexes that are continuous + index_values = { + i: parse_index(pd.RangeIndex(i * 1e7, (i + 1) * 1e7)) for i in range(20) + } + index_value = merge_index_value(index_values) + pd.testing.assert_index_equal(index_value.to_pandas(), pd.RangeIndex(1e7 * 20)) + assert index_value.min_val == 0 + assert index_value.max_val == 1e7 * 20 - 1 + assert timer.duration < 1 + + +def test_infer_dtypes(): + data1 = pd.DataFrame([[1, "a", False]], columns=[2.0, 3.0, 4.0]) + data2 = pd.DataFrame([[1, 3.0, "b"]], columns=[1, 2, 3]) + + pd.testing.assert_series_equal( + infer_dtypes(data1.dtypes, data2.dtypes, operator.add), (data1 + data2).dtypes + ) + + +def test_infer_index_value(): + # same range index + index1 = pd.RangeIndex(1, 3) + index2 = pd.RangeIndex(1, 3) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert oival.key == ival1.key + assert oival.key == ival2.key + + # different range index + index1 = pd.RangeIndex(1, 3) + index2 = pd.RangeIndex(2, 4) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # same int64 index, all unique + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([1, 2], dtype=np.int64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key == ival1.key + assert oival.key == ival2.key + + # same int64 index, not all unique + index1 = pd.Index([1, 2, 2], dtype=np.int64) + index2 = pd.Index([1, 2, 2], dtype=np.int64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # different int64 index + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([2, 3], dtype=np.int64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Int64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # different index type + index1 = pd.Index([1, 2], dtype=np.int64) + index2 = pd.Index([2.0, 3.0], dtype=np.float64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Float64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + # range index and other index + index1 = pd.RangeIndex(1, 4) + index2 = pd.Index([2, 3, 4], dtype=np.float64) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, ival2) + + assert isinstance(oival.value, IndexValue.Float64Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + index1 = pd.DatetimeIndex([]) + index2 = pd.RangeIndex(2) + + ival1 = parse_index(index1) + ival2 = parse_index(index2) + oival = infer_index_value(ival1, 
ival2) + + assert isinstance(oival.value, IndexValue.Index) + assert oival.key != ival1.key + assert oival.key != ival2.key + + +def test_index_inferred_type(): + assert Index(pd.Index([1, 2, 3, 4])).inferred_type == "integer" + assert Index(pd.Index([1, 2, 3, 4]).astype("uint32")).inferred_type == "integer" + assert Index(pd.Index([1.2, 2.3, 4.5])).inferred_type == "floating" + assert ( + Index(pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])).inferred_type + == "interval" + ) + assert ( + Index(pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])).inferred_type == "mixed" + ) + + +def test_validate_axis(): + df = DataFrame(pd.DataFrame(np.random.rand(4, 3))) + + assert validate_axis(0, df) == 0 + assert validate_axis("index", df) == 0 + assert validate_axis(1, df) == 1 + assert validate_axis("columns", df) == 1 + + with pytest.raises(ValueError): + validate_axis("unknown index", df) + + with pytest.raises(ValueError): + validate_axis(object(), df) + + with pytest.raises(ValueError): + validate_axis(-1, df) + + with pytest.raises(ValueError): + validate_axis(2, df) + + df2 = df[df[0] < 0.5] # create unknown shape + assert validate_axis(0, df2) == 0 + + +def test_dataframe_dir(): + df = DataFrame(pd.DataFrame(np.random.rand(4, 3), columns=list("ABC"))) + dir_result = set(dir(df)) + for c in df.dtypes.index: + assert c in dir_result + + +def test_fetch_dataframe_corner_data(setup): + max_rows = pd.get_option("display.max_rows") + try: + min_rows = pd.get_option("display.min_rows") + except KeyError: # pragma: no cover + min_rows = max_rows + + for row in ( + 5, + max_rows - 2, + max_rows - 1, + max_rows, + max_rows + 1, + max_rows + 2, + max_rows + 3, + ): + pdf = pd.DataFrame(np.random.rand(row, 5)) + df = DataFrame(pdf, chunk_size=max_rows // 2) + df.execute() + + corner = fetch_corner_data(df) + assert corner.shape[0] <= max_rows + 2 + corner_max_rows = max_rows if row <= max_rows else corner.shape[0] - 1 + assert corner.to_string( + max_rows=corner_max_rows, min_rows=min_rows + ) == pdf.to_string(max_rows=max_rows, min_rows=min_rows) + + +def test_make_dtypes(): + s = make_dtypes([int, float, np.dtype(int)]) + pd.testing.assert_series_equal( + s, pd.Series([np.dtype(int), np.dtype(float), np.dtype(int)]) + ) + + s = make_dtypes(OrderedDict([("a", int), ("b", float), ("c", np.dtype(int))])) + pd.testing.assert_series_equal( + s, pd.Series([np.dtype(int), np.dtype(float), np.dtype(int)], index=list("abc")) + ) + + s = make_dtypes(pd.Series([int, float, np.dtype(int)])) + pd.testing.assert_series_equal( + s, pd.Series([np.dtype(int), np.dtype(float), np.dtype(int)]) + ) + + assert make_dtypes(None) is None + + +@pytest.mark.parametrize( + "columns", + [ + pd.RangeIndex(8), + pd.MultiIndex.from_product([list("AB"), list("CDEF")]), + ], +) +def test_build_concatenated_rows_frame(setup, columns): + df = pd.DataFrame(np.random.rand(16, 8), columns=columns) + + # single chunk + mdf = tile(DataFrame(df, chunk_size=8)) + concatenated = build_concatenated_rows_frame(mdf) + assert len(concatenated.chunks) == 2 + pd.testing.assert_frame_equal(concatenated.execute().fetch(), df) + + # multiple chunks + mdf = tile(DataFrame(df, chunk_size=5)) + concatenated = build_concatenated_rows_frame(mdf) + assert len(concatenated.chunks) == 4 + for i in range(4): + pd.testing.assert_index_equal( + concatenated.chunks[i].columns_value.to_pandas(), df.columns + ) + pd.testing.assert_frame_equal(concatenated.execute().fetch(), df) + + +def test_auto_merge_chunks(): + from ..merge import DataFrameConcat + + pdf = 
pd.DataFrame(np.random.rand(16, 4), columns=list("abcd")) + memory_size = pdf.iloc[:4].memory_usage().sum() + + class FakeContext: + def __init__(self, retval=True): + self._retval = retval + + def get_chunks_meta(self, data_keys: List[str], **_) -> List[Dict]: + if self._retval: + return [{"memory_size": memory_size}] * len(data_keys) + else: + return [None] * len(data_keys) + + df = tile(DataFrame(pdf, chunk_size=4)) + df2 = auto_merge_chunks(FakeContext(), df, 2 * memory_size) + assert len(df2.chunks) == 2 + assert isinstance(df2.chunks[0].op, DataFrameConcat) + assert len(df2.chunks[0].op.inputs) == 2 + assert isinstance(df2.chunks[1].op, DataFrameConcat) + assert len(df2.chunks[1].op.inputs) == 2 + + df2 = auto_merge_chunks(FakeContext(), df, 3 * memory_size) + assert len(df2.chunks) == 2 + assert isinstance(df2.chunks[0].op, DataFrameConcat) + assert len(df2.chunks[0].op.inputs) == 3 + assert not isinstance(df2.chunks[1].op, DataFrameConcat) + assert len(df2.chunks[1].op.inputs) == 0 + assert df2.chunks[1].shape == df.chunks[-1].shape + assert df2.chunks[1].index == (1, 0) + + # mock situation that df not executed + df2 = auto_merge_chunks(FakeContext(False), df, 3 * memory_size) + assert df2 is df + + # number of chunks on columns > 1 + df3 = tile(DataFrame(pdf, chunk_size=2)) + df4 = auto_merge_chunks(FakeContext(), df3, 2 * memory_size) + assert df4 is df3 + + # each chunk's size is greater than limit + df5 = auto_merge_chunks(FakeContext(), df, memory_size / 5) + assert all((c1.shape == c2.shape) for c1, c2 in zip(df.chunks, df5.chunks)) + + # test series + ps = pdf.loc[:, "a"] + memory_size = ps.iloc[:4].memory_usage() + s = tile(Series(ps, chunk_size=4)) + s2 = auto_merge_chunks(FakeContext(), s, 2 * memory_size) + assert len(s2.chunks) == 2 + assert isinstance(s2.chunks[0].op, DataFrameConcat) + assert s2.chunks[0].name == "a" + assert len(s2.chunks[0].op.inputs) == 2 + assert isinstance(s2.chunks[1].op, DataFrameConcat) + assert s2.chunks[1].name == "a" + assert len(s2.chunks[1].op.inputs) == 2 + + +@pytest.mark.parametrize("multiplier_and_expected", [(1, False), (3, True), (4, True)]) +def test_whether_to_clean_up(multiplier_and_expected): + threshold = 10**4 + multiplier, expected = multiplier_and_expected + + class FakeOperandwithClosure: + def __init__(self, func): + self.func = func + self.need_clean_up_func = False + + @property + def need_clean_up_func(self): + return self._need_clean_up_func + + @need_clean_up_func.setter + def need_clean_up_func(self, need_clean_up_func: bool): + self._need_clean_up_func = need_clean_up_func + + class FakeCallable: + __slots__ = "df", "__dict__" + + def __init__(self, multiplier): + self.list = [ + ["This is a string.", 1.2, range(10)], + [ + bytes("This is a byte message.", "utf-8"), + bytearray("This is a byte array.", "utf-8"), + ], + ] + self.dic = {"one": pd.Series([i for i in range(10**multiplier)])} + self.df = pd.DataFrame(self.dic) + self.ds = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, z): + pass + + op = FakeOperandwithClosure(func=FakeCallable(multiplier=multiplier)) + result = whether_to_clean_up(op=op, threshold=threshold) + assert result is expected + assert op.need_clean_up_func is expected diff --git a/python/xorbits/_mars/dataframe/tseries/__init__.py b/python/xorbits/_mars/dataframe/tseries/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/tseries/tests/__init__.py b/python/xorbits/_mars/dataframe/tseries/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/tseries/tests/test_tseries.py b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries.py new file mode 100644 index 000000000..3f857faa9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries.py @@ -0,0 +1,30 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +from .... import dataframe as md + + +def test_to_datetime(): + wrong_args = [pd.DataFrame({"a": [1, 2]}), {"a": [1, 2]}] + + for arg in wrong_args: + with pytest.raises(ValueError) as cm: + md.to_datetime(arg) + assert "[year, month, day]" in str(cm.value) + + with pytest.raises(TypeError): + md.to_datetime([[1, 2], [3, 4]]) diff --git a/python/xorbits/_mars/dataframe/tseries/tests/test_tseries_execution.py b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries_execution.py new file mode 100644 index 000000000..0be314ac7 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/tests/test_tseries_execution.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from .... import dataframe as md +from ....tensor import tensor +from ....tests.core import require_cudf +from ... import DataFrame, Index, Series, to_datetime + + +def test_to_datetime_execution(setup): + # scalar + r = to_datetime(1490195805, unit="s") + + result = r.execute().fetch( + extra_config={"check_dtypes": False, "check_shape": False} + ) + expected = pd.to_datetime(1490195805, unit="s") + assert pd.to_datetime(result) == expected + + # test list like + raw = ["3/11/2000", "3/12/2000", "3/13/2000"] + t = tensor(raw, chunk_size=2) + r = to_datetime(t, infer_datetime_format=True) + + result = r.execute().fetch() + expected = pd.to_datetime(raw, infer_datetime_format=True) + pd.testing.assert_index_equal(result, expected) + + # test series + raw_series = pd.Series(raw) + s = Series(raw_series, chunk_size=2) + r = to_datetime(s) + + result = r.execute().fetch() + expected = pd.to_datetime(raw_series) + pd.testing.assert_series_equal(result, expected) + + # test DataFrame + raw_df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) + df = DataFrame(raw_df, chunk_size=(1, 2)) + r = to_datetime(df) + + result = r.execute().fetch() + expected = pd.to_datetime(raw_df) + pd.testing.assert_series_equal(result, expected) + + # test Index + raw_index = pd.Index([1, 2, 3]) + s = Index(raw_index, chunk_size=2) + r = to_datetime(s) + + result = r.execute().fetch() + expected = pd.to_datetime(raw_index) + pd.testing.assert_index_equal(result, expected) + + # test raises == 'ignore' + raw = ["13000101"] + r = to_datetime(raw, format="%Y%m%d", errors="ignore") + result = r.execute().fetch() + expected = pd.to_datetime(raw, format="%Y%m%d", errors="ignore") + pd.testing.assert_index_equal(result, expected) + + # test unit + r = to_datetime([1490195805], unit="s") + result = r.execute().fetch() + expected = pd.to_datetime([1490195805], unit="s") + pd.testing.assert_index_equal(result, expected) + + # test origin + r = to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) + result = r.execute().fetch() + expected = pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) + pd.testing.assert_index_equal(result, expected) + + +@require_cudf +def test_to_datetime_gpu_execution(setup_gpu): + s = md.Series(["3/11/2000", "3/12/2000", "3/13/2000"]).to_gpu() + r = to_datetime(s, format="%m/%d/%Y") + result = r.execute().fetch().to_pandas() + expected = pd.to_datetime( + pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"]), format="%m/%d/%Y" + ) + pd.testing.assert_series_equal(result, expected) diff --git a/python/xorbits/_mars/dataframe/tseries/to_datetime.py b/python/xorbits/_mars/dataframe/tseries/to_datetime.py new file mode 100644 index 000000000..8ff68c9d4 --- /dev/null +++ b/python/xorbits/_mars/dataframe/tseries/to_datetime.py @@ -0,0 +1,370 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
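(Editorial aside, not part of the diff.) The execution tests above mirror pandas semantics: md.to_datetime accepts scalars, list-likes, Series, Index objects and DataFrames with year/month/day columns, and the fetched result should match pd.to_datetime on the same input. A small sketch of the DataFrame-assembly case, assuming the relative imports in the tests resolve to the vendored xorbits._mars package:

import pandas as pd

from xorbits._mars import dataframe as md  # assumed import path for the vendored Mars

raw = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
df = md.DataFrame(raw, chunk_size=1)             # split into row chunks; columns are re-merged at tile time
r = md.to_datetime(df)                           # lazily assembles one datetime per row
result = r.execute().fetch()
pd.testing.assert_series_equal(result, pd.to_datetime(raw))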
+ +from functools import partial +from typing import Any + +import numpy as np +import pandas as pd +from pandas.api.types import is_dict_like, is_scalar + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import AnyField, BoolField, KeyField, StringField +from ...tensor import tensor as astensor +from ...tensor.core import TENSOR_CHUNK_TYPE +from ..core import DATAFRAME_TYPE, INDEX_CHUNK_TYPE, INDEX_TYPE, SERIES_TYPE +from ..initializer import DataFrame as asdataframe +from ..initializer import Index as asindex +from ..initializer import Series as asseries +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import lazy_import, parse_index + +cudf = lazy_import("cudf") + + +class DataFrameToDatetime(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.TO_DATETIME + + arg = KeyField("arg") + errors = StringField("errors", default=None) + dayfirst = BoolField("dayfirst", default=None) + yearfirst = BoolField("yearfirst", default=None) + utc = BoolField("utc", default=None) + format = StringField("format", default=None) + exact = BoolField("exact", default=None) + unit = StringField("unit", default=None) + infer_datetime_format = BoolField("infer_datetime_format", default=None) + origin = AnyField("origin", default=None) + cache = BoolField("cache", default=None) + + @property + def _params(self): + return tuple( + getattr(self, k) + for k in self._keys_ + if k not in self._no_copy_attrs_ and k != "arg" and hasattr(self, k) + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.arg = self._inputs[0] + + def __call__(self, arg): + if is_scalar(arg): + ret = pd.to_datetime( + arg, + errors=self.errors, + dayfirst=self.dayfirst, + yearfirst=self.yearfirst, + utc=self.utc, + format=self.format, + exact=self.exact, + unit=self.unit, + infer_datetime_format=self.infer_datetime_format, + origin=self.origin, + cache=self.cache, + ) + return astensor(ret) + + dtype = np.datetime64(1, "ns").dtype + if isinstance(arg, (pd.Series, SERIES_TYPE)): + arg = asseries(arg) + return self.new_series( + [arg], + shape=arg.shape, + dtype=dtype, + index_value=arg.index_value, + name=arg.name, + ) + if is_dict_like(arg) or isinstance(arg, DATAFRAME_TYPE): + arg = asdataframe(arg) + columns = arg.columns_value.to_pandas().tolist() + if sorted(columns) != sorted(["year", "month", "day"]): + missing = ",".join( + c for c in ["day", "month", "year"] if c not in columns + ) + raise ValueError( + "to assemble mappings requires at least " + f"that [year, month, day] be specified: [{missing}] is missing" + ) + return self.new_series( + [arg], shape=(arg.shape[0],), dtype=dtype, index_value=arg.index_value + ) + elif isinstance(arg, (pd.Index, INDEX_TYPE)): + arg = asindex(arg) + return self.new_index( + [arg], + shape=arg.shape, + dtype=dtype, + index_value=parse_index(pd.Index([], dtype=dtype), self._params, arg), + name=arg.name, + ) + else: + arg = astensor(arg) + if arg.ndim != 1: + raise TypeError( + "arg must be a string, datetime, " + "list, tuple, 1-d tensor, or Series" + ) + return self.new_index( + [arg], + shape=arg.shape, + dtype=dtype, + index_value=parse_index(pd.Index([], dtype=dtype), self._params, arg), + ) + + @classmethod + def tile(cls, op: "DataFrameToDatetime"): + out = op.outputs[0] + arg = op.arg + + if isinstance(arg, DATAFRAME_TYPE): + if np.isnan(arg.shape[0]) or any( + np.isnan(s) for s in arg.nsplits[1] + ): # pragma: no cover + yield + + arg = yield from recursive_tile(arg.rechunk({1: 
arg.shape[1]})) + + out_chunks = [] + for chunk in arg.chunks: + chunk_op = op.copy().reset_key() + if isinstance(chunk, (TENSOR_CHUNK_TYPE, INDEX_CHUNK_TYPE)): + chunk_index_value = parse_index( + pd.Index([], dtype=out.dtype), op._params, chunk + ) + else: + chunk_index_value = chunk.index_value + + out_chunk = chunk_op.new_chunk( + [chunk], + shape=(chunk.shape[0],), + dtype=out.dtype, + index_value=chunk_index_value, + name=out.name, + index=(chunk.index[0],), + ) + out_chunks.append(out_chunk) + + params = out.params + params["nsplits"] = (arg.nsplits[0],) + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "DataFrameToDatetime"): + arg = ctx[op.arg.key] + + unit = op.unit + if cudf and op.gpu: + func = cudf.to_datetime + if unit is None: + unit = "ns" + else: + func = pd.to_datetime + + call = partial( + func, + errors=op.errors, + dayfirst=op.dayfirst, + yearfirst=op.yearfirst, + utc=op.utc, + format=op.format, + exact=op.exact, + unit=unit, + infer_datetime_format=op.infer_datetime_format, + origin=op.origin, + cache=op.cache, + ) + + try: + ctx[op.outputs[0].key] = call(arg) + except ValueError: # pragma: no cover + ctx[op.outputs[0].key] = call(arg.copy()) + + +def to_datetime( + arg, + errors: str = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool = None, + format: str = None, + exact: bool = True, + unit: str = None, + infer_datetime_format: bool = False, + origin: Any = "unix", + cache: bool = True, +): + """ + Convert argument to datetime. + + Parameters + ---------- + arg : int, float, str, datetime, list, tuple, 1-d array, Series DataFrame/dict-like + The object to convert to a datetime. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. + dayfirst : bool, default False + Specify a date parse order if `arg` is str or its list-likes. + If True, parses dates with the day first, eg 10/11/12 is parsed as + 2012-11-10. + Warning: dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug, based on dateutil behavior). + yearfirst : bool, default False + Specify a date parse order if `arg` is str or its list-likes. + + - If True parses dates with the year first, eg 10/11/12 is parsed as + 2010-11-12. + - If both dayfirst and yearfirst are True, yearfirst is preceded (same + as dateutil). + + Warning: yearfirst=True is not strict, but will prefer to parse + with year first (this is a known bug, based on dateutil behavior). + utc : bool, default None + Return UTC DatetimeIndex if True (converting any tz-aware + datetime.datetime objects as well). + format : str, default None + The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + all the way up to nanoseconds. + See strftime documentation for more information on choices: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + exact : bool, True by default + Behaves as: + - If True, require an exact format match. + - If False, allow the format to match anywhere in the target string. + + unit : str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an + integer or float number. This will be based off the origin. 
+ Example, with unit='ms' and origin='unix' (the default), this + would calculate the number of milliseconds to the unix epoch start. + infer_datetime_format : bool, default False + If True and no `format` is given, attempt to infer the format of the + datetime strings, and if it can be inferred, switch to a faster + method of parsing them. In some cases this can increase the parsing + speed by ~5-10x. + origin : scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by `unit`) since this reference date. + + - If 'unix' (or POSIX) time; origin is set to 1970-01-01. + - If 'julian', unit must be 'D', and origin is set to beginning of + Julian Calendar. Julian day number 0 is assigned to the day starting + at noon on January 1, 4713 BC. + - If Timestamp convertible, origin is set to Timestamp identified by + origin. + cache : bool, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. The cache is only + used when there are at least 50 values. The presence of out-of-bounds + values will render the cache unusable and may slow down parsing. + + Returns + ------- + datetime + If parsing succeeded. + Return type depends on input: + + - list-like: DatetimeIndex + - Series: Series of datetime64 dtype + - scalar: Timestamp + + In case when it is not possible to return designated types (e.g. when + any element of input is before Timestamp.min or after Timestamp.max) + return will have datetime.datetime type (or corresponding + array/Series). + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_timedelta : Convert argument to timedelta. + convert_dtypes : Convert dtypes. + + Examples + -------- + Assembling a datetime from multiple columns of a DataFrame. The keys can be + common abbreviations like ['year', 'month', 'day', 'minute', 'second', + 'ms', 'us', 'ns']) or plurals of the same + + >>> import mars.dataframe as md + + >>> df = md.DataFrame({'year': [2015, 2016], + ... 'month': [2, 3], + ... 'day': [4, 5]}) + >>> md.to_datetime(df).execute() + 0 2015-02-04 + 1 2016-03-05 + dtype: datetime64[ns] + + If a date does not meet the `timestamp limitations + `_, passing errors='ignore' + will return the original input instead of raising any exception. + + Passing errors='coerce' will force an out-of-bounds date to NaT, + in addition to forcing non-dates (or non-parseable dates) to NaT. + + >>> md.to_datetime('13000101', format='%Y%m%d', errors='ignore').execute() + datetime.datetime(1300, 1, 1, 0, 0) + >>> md.to_datetime('13000101', format='%Y%m%d', errors='coerce').execute() + NaT + + Passing infer_datetime_format=True can often-times speedup a parsing + if its not an ISO8601 format exactly, but in a regular format. + + >>> s = md.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) + >>> s.head().execute() + 0 3/11/2000 + 1 3/12/2000 + 2 3/13/2000 + 3 3/11/2000 + 4 3/12/2000 + dtype: object + + Using a unix epoch time + + >>> md.to_datetime(1490195805, unit='s').execute() + Timestamp('2017-03-22 15:16:45') + >>> md.to_datetime(1490195805433502912, unit='ns').execute() + Timestamp('2017-03-22 15:16:45.433502912') + + .. warning:: For float arg, precision rounding might happen. To prevent + unexpected behavior use a fixed-width exact type. + + Using a non-unix epoch origin + + >>> md.to_datetime([1, 2, 3], unit='D', + ... 
origin=md.Timestamp('1960-01-01')).execute() + DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], \ +dtype='datetime64[ns]', freq=None) + """ + op = DataFrameToDatetime( + errors=errors, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + format=format, + exact=exact, + unit=unit, + infer_datetime_format=infer_datetime_format, + origin=origin, + cache=cache, + ) + return op(arg) diff --git a/python/xorbits/_mars/dataframe/ufunc/__init__.py b/python/xorbits/_mars/dataframe/ufunc/__init__.py new file mode 100644 index 000000000..d49856b7c --- /dev/null +++ b/python/xorbits/_mars/dataframe/ufunc/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from ..core import DataFrame, Series + from .tensor import _tensor_ufunc + from .ufunc import _array_ufunc + + for Entity in (DataFrame, Series): + Entity.__array_ufunc__ = _array_ufunc + Entity.__tensor_ufunc__ = _tensor_ufunc + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/ufunc/tensor.py b/python/xorbits/_mars/dataframe/ufunc/tensor.py new file mode 100644 index 000000000..2d91fa942 --- /dev/null +++ b/python/xorbits/_mars/dataframe/ufunc/tensor.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import classproperty + +_tensor_op_to_df_op = dict() + + +def register_tensor_ufunc(op): + _tensor_op_to_df_op[op.tensor_op_type] = op + + +def get_tensor_ufunc_implementation(tensor_op): + if tensor_op in _tensor_op_to_df_op: + return _tensor_op_to_df_op[tensor_op] + + +class TensorUfuncMixin: + __slots__ = () + + @classproperty + def tensor_op_type(self): + raise NotImplementedError + + @classmethod + def ufunc_call(cls, tensor_op, inputs, out, where, **kw): + if out is not None: + return NotImplemented + if where is not None: + raise NotImplementedError + + try: + op = _tensor_op_to_df_op[tensor_op](**kw) + return op(*inputs) + except (KeyError, TypeError): + return NotImplemented + + +def _tensor_ufunc(_, tensor_op, inputs, out, where, **kw): + op = get_tensor_ufunc_implementation(tensor_op) + if op is not None: + return op.ufunc_call(tensor_op, inputs, out, where, **kw) + return NotImplemented diff --git a/python/xorbits/_mars/dataframe/ufunc/ufunc.py b/python/xorbits/_mars/dataframe/ufunc/ufunc.py new file mode 100644 index 000000000..7f4a7fe61 --- /dev/null +++ b/python/xorbits/_mars/dataframe/ufunc/ufunc.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Number + +from ...tensor import tensor as astensor +from ...tensor.ufunc.ufunc import UFUNC_TO_TENSOR_FUNCS +from ..core import DATAFRAME_TYPE, SERIES_TYPE + + +def _check_arg(arg): + if isinstance(arg, Number): + return True + + if isinstance(arg, (DATAFRAME_TYPE, SERIES_TYPE)): + return True + + try: + astensor(arg) + return True + except ValueError: + return False + + +def _array_ufunc(_, ufunc, method, *inputs, **kwargs): + out = kwargs.get("out", tuple()) + for x in inputs + out: + if not _check_arg(x): + return NotImplemented + + if ufunc.signature is not None: + return NotImplemented + if ufunc not in UFUNC_TO_TENSOR_FUNCS: + return NotImplemented + + # we delegate numpy ufunc to tensor ufunc, + # tensor ufunc will handle Mars DataFrame properly. + try: + tensor_func = getattr(UFUNC_TO_TENSOR_FUNCS[ufunc], method) + return tensor_func(*inputs, **kwargs) + except (AttributeError, NotImplementedError): + return NotImplemented diff --git a/python/xorbits/_mars/dataframe/utils.py b/python/xorbits/_mars/dataframe/utils.py new file mode 100644 index 000000000..4a4bdfb8a --- /dev/null +++ b/python/xorbits/_mars/dataframe/utils.py @@ -0,0 +1,1579 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
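(Editorial aside, not part of the diff.) The ufunc package added above wires NumPy's __array_ufunc__ protocol into Mars DataFrames and Series: _install() attaches _array_ufunc, which looks the ufunc up in UFUNC_TO_TENSOR_FUNCS and delegates to the Mars tensor implementation, returning NotImplemented for anything it cannot handle (generalized ufuncs, out= arguments, unregistered ufuncs). A sketch of the effect, assuming np.sqrt is among the registered ufuncs and the vendored package imports as xorbits._mars:

import numpy as np
import pandas as pd

from xorbits._mars import dataframe as md  # assumed import path for the vendored Mars

raw = pd.DataFrame(np.random.rand(4, 3), columns=list("abc"))
df = md.DataFrame(raw, chunk_size=2)

# NumPy cannot operate on df directly, so it calls df.__array_ufunc__ (the hook
# installed by _install above); the hook maps np.sqrt to the Mars tensor ufunc
# and returns a lazy Mars DataFrame instead of an ndarray.
r = np.sqrt(df)
pd.testing.assert_frame_equal(r.execute().fetch(), np.sqrt(raw))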
+ +import functools +import inspect +import itertools +import logging +import operator +import os +import sys +from contextlib import contextmanager +from numbers import Integral +from typing import Any, List, Union + +import cloudpickle +import numpy as np +import pandas as pd +from pandas.api.extensions import ExtensionDtype +from pandas.api.types import is_string_dtype +from pandas.core.dtypes.cast import find_common_type + +from ..config import options +from ..core import Entity, ExecutableTuple +from ..core.context import Context, get_context +from ..lib.mmh3 import hash as mmh_hash +from ..tensor.utils import dictify_chunk_size, normalize_chunk_sizes +from ..typing import ChunkType, TileableType +from ..utils import ( + ModulePlaceholder, + is_full_slice, + is_on_ray, + lazy_import, + parse_readable_size, + parse_version, + sbytes, + tokenize, +) + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = ModulePlaceholder("pyarrow") + +cudf = lazy_import("cudf", rename="cudf") +vineyard = lazy_import("vineyard") +try: + import ray + + ray_release_version = parse_version(ray.__version__).release + ray_deprecate_ml_dataset = ray_release_version[:2] >= (2, 0) +except ImportError: + ray_release_version = None + ray_deprecate_ml_dataset = None +logger = logging.getLogger(__name__) + + +def hash_index(index, size): + def func(x, size): + return mmh_hash(sbytes(x)) % size + + f = functools.partial(func, size=size) + idx_to_grouped = index.groupby(index.map(f)) + return [idx_to_grouped.get(i, list()) for i in range(size)] + + +def hash_dataframe_on(df, on, size, level=None): + if on is None: + idx = df.index + if level is not None: + idx = idx.to_frame(False)[level] + if cudf and isinstance(idx, cudf.Index): # pragma: no cover + idx = idx.to_pandas() + hashed_label = pd.util.hash_pandas_object(idx, categorize=False) + elif callable(on): + # todo optimization can be added, if ``on`` is a numpy ufunc or sth can be vectorized + hashed_label = pd.util.hash_pandas_object(df.index.map(on), categorize=False) + else: + if isinstance(on, list): + to_concat = [] + for v in on: + if isinstance(v, pd.Series): + to_concat.append(v) + else: + to_concat.append(df[v]) + data = pd.concat(to_concat, axis=1) + else: + data = df[on] + hashed_label = pd.util.hash_pandas_object(data, index=False, categorize=False) + idx_to_grouped = pd.RangeIndex(0, len(hashed_label)).groupby(hashed_label % size) + return [idx_to_grouped.get(i, pd.Index([])) for i in range(size)] + + +def hash_dtypes(dtypes, size): + hashed_indexes = hash_index(dtypes.index, size) + return [dtypes[index] for index in hashed_indexes] + + +def sort_dataframe_inplace(df, *axis): + for ax in axis: + df.sort_index(axis=ax, inplace=True) + return df + + +@functools.lru_cache(1) +def _get_range_index_type(): + if cudf is not None: + return pd.RangeIndex, cudf.RangeIndex + else: + return pd.RangeIndex + + +@functools.lru_cache(1) +def _get_multi_index_type(): + if cudf is not None: + return pd.MultiIndex, cudf.MultiIndex + else: + return pd.MultiIndex + + +def _get_range_index_start(pd_range_index): + try: + return pd_range_index.start + except AttributeError: # pragma: no cover + return pd_range_index._start + + +def _get_range_index_stop(pd_range_index): + try: + return pd_range_index.stop + except AttributeError: # pragma: no cover + return pd_range_index._stop + + +def _get_range_index_step(pd_range_index): + try: + return pd_range_index.step + except AttributeError: # pragma: no cover + pass + try: # pragma: no cover + return 
pd_range_index._step + except AttributeError: # pragma: no cover + return 1 # cudf does not support step arg + + +def is_pd_range_empty(pd_range_index): + start, stop, step = ( + _get_range_index_start(pd_range_index), + _get_range_index_stop(pd_range_index), + _get_range_index_step(pd_range_index), + ) + return (start >= stop and step >= 0) or (start <= stop and step < 0) + + +def decide_dataframe_chunk_sizes(shape, chunk_size, memory_usage): + """ + Decide how a given DataFrame can be split into chunk. + + :param shape: DataFrame's shape + :param chunk_size: if dict provided, it's dimension id to chunk size; + if provided, it's the chunk size for each dimension. + :param memory_usage: pandas Series in which each column's memory usage + :type memory_usage: pandas.Series + :return: the calculated chunk size for each dimension + :rtype: tuple + """ + chunk_size = dictify_chunk_size(shape, chunk_size) + average_memory_usage = memory_usage / shape[0] + + nleft = len(shape) - len(chunk_size) + if nleft < 0: + raise ValueError("chunks have more than two dimensions") + if nleft == 0: + return normalize_chunk_sizes( + shape, tuple(chunk_size[j] for j in range(len(shape))) + ) + + max_chunk_size = options.chunk_store_limit + + # for the row side, along axis 0 + if 0 not in chunk_size: + row_chunk_size = [] + row_left_size = shape[0] + else: + row_chunk_size = normalize_chunk_sizes((shape[0],), (chunk_size[0],))[0] + row_left_size = -1 + # for the column side, along axis 1 + if 1 not in chunk_size: + col_chunk_size = [] + col_chunk_store = [] + col_left_size = shape[1] + else: + col_chunk_size = normalize_chunk_sizes((shape[1],), (chunk_size[1],))[0] + acc = [0] + np.cumsum(col_chunk_size).tolist() + col_chunk_store = [ + average_memory_usage[acc[i] : acc[i + 1]].sum() + for i in range(len(col_chunk_size)) + ] + col_left_size = -1 + + while True: + nbytes_occupied = np.prod( + [max(it) for it in (row_chunk_size, col_chunk_store) if it] + ) + dim_size = np.maximum( + int(np.power(max_chunk_size / nbytes_occupied, 1 / float(nleft))), 1 + ) + + if col_left_size == 0 and not col_chunk_size: + col_chunk_size.append(0) + + if row_left_size == 0 and not row_chunk_size: + row_chunk_size.append(0) + + # check col first + if col_left_size > 0: + cs = min(col_left_size, dim_size) + col_chunk_size.append(cs) + start = int(np.sum(col_chunk_size[:-1])) + col_chunk_store.append(average_memory_usage.iloc[start : start + cs].sum()) + col_left_size -= cs + if row_left_size > 0: + if col_chunk_store: + max_col_chunk_store = max(col_chunk_store) + cs = min(row_left_size, int(max_chunk_size / max_col_chunk_store)) + else: + cs = row_left_size + row_chunk_size.append(cs) + row_left_size -= cs + + if col_left_size <= 0 and row_left_size <= 0: + break + + return tuple(row_chunk_size), tuple(col_chunk_size) + + +def decide_series_chunk_size(shape, chunk_size, memory_usage): + chunk_size = dictify_chunk_size(shape, chunk_size) + average_memory_usage = memory_usage / shape[0] if shape[0] != 0 else memory_usage + + if len(chunk_size) == len(shape): + return normalize_chunk_sizes(shape, chunk_size[0]) + + if all(s == 0 for s in shape): + # skip when shape is 0 + return tuple((s,) for s in shape) + + max_chunk_size = options.chunk_store_limit + series_chunk_size = max_chunk_size / average_memory_usage + return normalize_chunk_sizes(shape, int(series_chunk_size)) + + +def parse_index(index_value, *args, store_data=False, key=None): + from .core import IndexValue + + def _extract_property(index, tp, ret_data): + kw = { + 
"_min_val": _get_index_min(index), + "_max_val": _get_index_max(index), + "_min_val_close": True, + "_max_val_close": True, + "_key": key or _tokenize_index(index, *args), + } + if ret_data: + kw["_data"] = index.values + for field in tp._FIELDS: + if field in kw or field == "_data": + continue + val = getattr(index, field.lstrip("_"), None) + if val is not None: + kw[field] = val + return kw + + def _tokenize_index(index, *token_objects): + if not index.empty: + return tokenize(index) + else: + return tokenize(index, *token_objects) + + def _get_index_min(index): + try: + return index.min() + except (ValueError, AttributeError): + if isinstance(index, pd.IntervalIndex): + return None + raise + except TypeError: + return None + + def _get_index_max(index): + try: + return index.max() + except (ValueError, AttributeError): + if isinstance(index, pd.IntervalIndex): + return None + raise + except TypeError: + return None + + def _serialize_index(index): + tp = getattr(IndexValue, type(index).__name__) + properties = _extract_property(index, tp, store_data) + properties["_name"] = index.name + return tp(**properties) + + def _serialize_range_index(index): + if is_pd_range_empty(index): + properties = { + "_is_monotonic_increasing": True, + "_is_monotonic_decreasing": False, + "_is_unique": True, + "_min_val": _get_index_min(index), + "_max_val": _get_index_max(index), + "_min_val_close": True, + "_max_val_close": False, + "_key": key or _tokenize_index(index, *args), + "_name": index.name, + "_dtype": index.dtype, + } + else: + properties = _extract_property(index, IndexValue.RangeIndex, False) + return IndexValue.RangeIndex( + _slice=slice( + _get_range_index_start(index), + _get_range_index_stop(index), + _get_range_index_step(index), + ), + **properties, + ) + + def _serialize_multi_index(index): + kw = _extract_property(index, IndexValue.MultiIndex, store_data) + kw["_sortorder"] = index.sortorder + kw["_dtypes"] = [lev.dtype for lev in index.levels] + return IndexValue.MultiIndex(**kw) + + if index_value is None: + return IndexValue( + _index_value=IndexValue.Index( + _is_monotonic_increasing=False, + _is_monotonic_decreasing=False, + _is_unique=False, + _min_val=None, + _max_val=None, + _min_val_close=True, + _max_val_close=True, + _key=key or tokenize(*args), + ) + ) + if hasattr(index_value, "to_pandas"): # pragma: no cover + # convert cudf.Index to pandas + index_value = index_value.to_pandas() + + if isinstance(index_value, _get_range_index_type()): + return IndexValue(_index_value=_serialize_range_index(index_value)) + elif isinstance(index_value, _get_multi_index_type()): + return IndexValue(_index_value=_serialize_multi_index(index_value)) + else: + return IndexValue(_index_value=_serialize_index(index_value)) + + +def gen_unknown_index_value(index_value, *args): + pd_index = index_value.to_pandas() + if isinstance(pd_index, pd.RangeIndex): + return parse_index(pd.RangeIndex(-1), *args) + elif not isinstance(pd_index, pd.MultiIndex): + return parse_index(pd.Index([], dtype=pd_index.dtype), *args) + else: + i = pd.MultiIndex.from_arrays( + [c[:0] for c in pd_index.levels], names=pd_index.names + ) + return parse_index(i, *args) + + +def split_monotonic_index_min_max( + left_min_max, left_increase, right_min_max, right_increase +): + """ + Split the original two min_max into new min_max. Each min_max should be a list + in which each item should be a 4-tuple indicates that this chunk's min value, + whether the min value is close, the max value, and whether the max value is close. 
+ The return value would be a nested list, each item is a list + indicates that how this chunk should be split into. + + :param left_min_max: the left min_max + :param left_increase: if the original data of left is increased + :param right_min_max: the right min_max + :param right_increase: if the original data of right is increased + :return: nested list in which each item indicates how min_max is split + + >>> left_min_max = [(0, True, 3, True), (4, True, 8, True), (12, True, 18, True), + ... (20, True, 22, True)] + >>> right_min_max = [(2, True, 6, True), (7, True, 9, True), (10, True, 14, True), + ... (18, True, 19, True)] + >>> l, r = split_monotonic_index_min_max(left_min_max, True, right_min_max, True) + >>> l + [[(0, True, 2, False), (2, True, 3, True)], [(3, False, 4, False), (4, True, 6, True), (6, False, 7, False), + (7, True, 8, True)], [(8, False, 9, True), (10, True, 12, False), (12, True, 14, True), (14, False, 18, False), + (18, True, 18, True)], [(18, False, 19, True), [20, True, 22, True]]] + >>> r + [[(0, True, 2, False), (2, True, 3, True), (3, False, 4, False), (4, True, 6, True)], + [(6, False, 7, False), (7, True, 8, True), (8, False, 9, True)], [(10, True, 12, False), (12, True, 14, True)], + [(14, False, 18, False), (18, True, 18, True), (18, False, 19, True), [20, True, 22, True]]] + """ + left_idx_to_min_max = [[] for _ in left_min_max] + right_idx_to_min_max = [[] for _ in right_min_max] + left_curr_min_max = list(left_min_max[0]) + right_curr_min_max = list(right_min_max[0]) + left_curr_idx = right_curr_idx = 0 + left_terminate = right_terminate = False + + while not left_terminate or not right_terminate: + if left_terminate: + left_idx_to_min_max[left_curr_idx].append(tuple(right_curr_min_max)) + right_idx_to_min_max[right_curr_idx].append(tuple(right_curr_min_max)) + if right_curr_idx + 1 >= len(right_min_max): + right_terminate = True + else: + right_curr_idx += 1 + right_curr_min_max = list(right_min_max[right_curr_idx]) + elif right_terminate: + right_idx_to_min_max[right_curr_idx].append(tuple(left_curr_min_max)) + left_idx_to_min_max[left_curr_idx].append(tuple(left_curr_min_max)) + if left_curr_idx + 1 >= len(left_min_max): + left_terminate = True + else: + left_curr_idx += 1 + left_curr_min_max = list(left_min_max[left_curr_idx]) + elif left_curr_min_max[0] < right_curr_min_max[0]: + # left min < right min + right_min = [right_curr_min_max[0], not right_curr_min_max[1]] + max_val = min(left_curr_min_max[2:], right_min) + assert len(max_val) == 2 + min_max = ( + left_curr_min_max[0], + left_curr_min_max[1], + max_val[0], + max_val[1], + ) + left_idx_to_min_max[left_curr_idx].append(min_max) + right_idx_to_min_max[right_curr_idx].append(min_max) + if left_curr_min_max[2:] == max_val: + # left max < right min + if left_curr_idx + 1 >= len(left_min_max): + left_terminate = True + else: + left_curr_idx += 1 + left_curr_min_max = list(left_min_max[left_curr_idx]) + else: + # from left min(left min close) to right min(exclude right min close) + left_curr_min_max[:2] = right_curr_min_max[:2] + elif left_curr_min_max[0] > right_curr_min_max[0]: + # left min > right min + left_min = [left_curr_min_max[0], not left_curr_min_max[1]] + max_val = min(right_curr_min_max[2:], left_min) + min_max = ( + right_curr_min_max[0], + right_curr_min_max[1], + max_val[0], + max_val[1], + ) + left_idx_to_min_max[left_curr_idx].append(min_max) + right_idx_to_min_max[right_curr_idx].append(min_max) + if right_curr_min_max[2:] == max_val: + # right max < left min + if 
right_curr_idx + 1 >= len(right_min_max): + right_terminate = True + else: + right_curr_idx += 1 + right_curr_min_max = list(right_min_max[right_curr_idx]) + else: + # from left min(left min close) to right min(exclude right min close) + right_curr_min_max[:2] = left_curr_min_max[:2] + else: + # left min == right min + max_val = min(left_curr_min_max[2:], right_curr_min_max[2:]) + assert len(max_val) == 2 + min_max = ( + left_curr_min_max[0], + left_curr_min_max[1], + max_val[0], + max_val[1], + ) + left_idx_to_min_max[left_curr_idx].append(min_max) + right_idx_to_min_max[right_curr_idx].append(min_max) + if max_val == left_curr_min_max[2:]: + if left_curr_idx + 1 >= len(left_min_max): + left_terminate = True + else: + left_curr_idx += 1 + left_curr_min_max = list(left_min_max[left_curr_idx]) + else: + left_curr_min_max[:2] = max_val[0], not max_val[1] + if max_val == right_curr_min_max[2:]: + if right_curr_idx + 1 >= len(right_min_max): + right_terminate = True + else: + right_curr_idx += 1 + right_curr_min_max = list(right_min_max[right_curr_idx]) + else: + right_curr_min_max[:2] = max_val[0], not max_val[1] + + if left_increase is False: + left_idx_to_min_max = list(reversed(left_idx_to_min_max)) + if right_increase is False: + right_idx_to_min_max = list(reversed(right_idx_to_min_max)) + + return left_idx_to_min_max, right_idx_to_min_max + + +def build_split_idx_to_origin_idx(splits, increase=True): + # splits' len is equal to the original chunk size on a specified axis, + # splits is sth like [[(0, True, 2, True), (2, False, 3, True)]] + # which means there is one input chunk, and will be split into 2 out chunks + # in this function, we want to build a new dict from the out chunk index to + # the original chunk index and the inner position, like {0: (0, 0), 1: (0, 1)} + if increase is False: + splits = list(reversed(splits)) + out_idx = itertools.count(0) + res = dict() + for origin_idx, _ in enumerate(splits): + for pos in range(len(splits[origin_idx])): + if increase is False: + o_idx = len(splits) - origin_idx - 1 + else: + o_idx = origin_idx + res[next(out_idx)] = o_idx, pos + return res + + +def _generate_value(dtype, fill_value): + # special handle for datetime64 and timedelta64 + dispatch = { + np.datetime64: pd.Timestamp, + np.timedelta64: pd.Timedelta, + pd.CategoricalDtype.type: lambda x: pd.CategoricalDtype([x]), + # for object, we do not know the actual dtype, + # just convert to str for common usage + np.object_: lambda x: str(fill_value), + } + # otherwise, just use dtype.type itself to convert + convert = dispatch.get(dtype.type, dtype.type) + return convert(fill_value) + + +def build_empty_df(dtypes, index=None): + columns = dtypes.index + length = len(index) if index is not None else 0 + record = [[_generate_value(dtype, 1) for dtype in dtypes]] * max(1, length) + + # duplicate column may exist, + # so use RangeIndex first + df = pd.DataFrame(record, columns=range(len(dtypes)), index=index) + for i, dtype in enumerate(dtypes): + s = df.iloc[:, i] + if not pd.api.types.is_dtype_equal(s.dtype, dtype): + df.iloc[:, i] = s.astype(dtype) + + df.columns = columns + return df[:length] if len(df) > length else df + + +def build_df(df_obj, fill_value=1, size=1, ensure_string=False): + dfs = [] + if not isinstance(size, (list, tuple)): + sizes = [size] + else: + sizes = size + + if not isinstance(fill_value, (list, tuple)): + fill_values = [fill_value] + else: + fill_values = fill_value + + for size, fill_value in zip(sizes, fill_values): + dtypes = df_obj.dtypes + record = 
[[_generate_value(dtype, fill_value) for dtype in dtypes]] * size + df = pd.DataFrame(record) + df.columns = dtypes.index + + if len(record) != 0: # columns is empty in some cases + target_index = df_obj.index_value.to_pandas() + if isinstance(target_index, pd.MultiIndex): + index_val = tuple( + _generate_value(level.dtype, fill_value) + for level in target_index.levels + ) + df.index = pd.MultiIndex.from_tuples( + [index_val] * size, names=target_index.names + ) + else: + index_val = _generate_value(target_index.dtype, fill_value) + df.index = pd.Index([index_val] * size, name=target_index.name) + + # make sure dtypes correct + for i, dtype in enumerate(dtypes): + s = df.iloc[:, i] + if not pd.api.types.is_dtype_equal(s.dtype, dtype): + df[df.columns[i]] = s.astype(dtype) + dfs.append(df) + if len(dfs) == 1: + ret_df = dfs[0] + else: + ret_df = pd.concat(dfs) + + if ensure_string: + obj_dtypes = df_obj.dtypes[df_obj.dtypes == np.dtype("O")] + ret_df[obj_dtypes.index] = ret_df[obj_dtypes.index].radd("O") + return ret_df + + +def build_empty_series(dtype, index=None, name=None): + length = len(index) if index is not None else 0 + return pd.Series( + [_generate_value(dtype, 1) for _ in range(length)], + dtype=dtype, + index=index, + name=name, + ) + + +def build_series( + series_obj=None, + fill_value=1, + size=1, + name=None, + ensure_string=False, + dtype=None, + index=None, +): + seriess = [] + if not isinstance(size, (list, tuple)): + sizes = [size] + else: + sizes = size + + if not isinstance(fill_value, (list, tuple)): + fill_values = [fill_value] + else: + fill_values = fill_value + + if series_obj is not None: + dtype = series_obj.dtype + try: + series_index = series_obj.index_value.to_pandas()[:0] + except AttributeError: + series_index = series_obj.index[:0] + else: + series_index = index[:0] if index is not None else None + + for size, fill_value in zip(sizes, fill_values): + empty_series = build_empty_series(dtype, name=name, index=series_index) + record = _generate_value(dtype, fill_value) + if isinstance(empty_series.index, pd.MultiIndex): + index = tuple( + _generate_value(level.dtype, fill_value) + for level in empty_series.index.levels + ) + empty_series = empty_series.reindex( + index=pd.MultiIndex.from_tuples([index], names=empty_series.index.names) + ) + empty_series.iloc[0] = record + else: + if isinstance(empty_series.index.dtype, pd.CategoricalDtype): + index = None + else: + index = _generate_value(empty_series.index.dtype, fill_value) + empty_series.loc[index] = record + + empty_series = pd.concat([empty_series] * size) + # make sure dtype correct for MultiIndex + empty_series = empty_series.astype(dtype, copy=False) + seriess.append(empty_series) + + if len(seriess) == 1: + ret_series = seriess[0] + else: + ret_series = pd.concat(seriess) + + if ensure_string and dtype == np.dtype("O"): + ret_series = ret_series.radd("O") + return ret_series + + +def concat_index_value(index_values, store_data=False): + if not isinstance(index_values, (list, tuple)): + index_values = [index_values] + result = index_values[0] + if not isinstance(result, pd.Index): + result = result.to_pandas() + for index_value in index_values[1:]: + if isinstance(index_value, pd.Index): + result = result.append(index_value) + else: + result = result.append(index_value.to_pandas()) + return parse_index(result, store_data=store_data) + + +def build_concatenated_rows_frame(df): + from ..core import OutputType + from .merge.concat import DataFrameConcat + + # When the df isn't split along the column 
axis, return the df directly. + if df.chunk_shape[1] == 1: + return df + + columns = concat_index_value( + [df.cix[0, idx].columns_value for idx in range(df.chunk_shape[1])], + store_data=True, + ) + columns_size = columns.to_pandas().size + + out_chunks = [] + for idx in range(df.chunk_shape[0]): + out_chunk = DataFrameConcat( + axis=1, output_types=[OutputType.dataframe] + ).new_chunk( + [df.cix[idx, k] for k in range(df.chunk_shape[1])], + index=(idx, 0), + shape=(df.cix[idx, 0].shape[0], columns_size), + dtypes=df.dtypes, + index_value=df.cix[idx, 0].index_value, + columns_value=columns, + ) + out_chunks.append(out_chunk) + + return DataFrameConcat(axis=1, output_types=[OutputType.dataframe]).new_dataframe( + [df], + chunks=out_chunks, + nsplits=(tuple(chunk.shape[0] for chunk in out_chunks), (df.shape[1],)), + shape=df.shape, + dtypes=df.dtypes, + index_value=df.index_value, + columns_value=df.columns_value, + ) + + +def is_index_value_identical(left: TileableType, right: TileableType) -> bool: + if ( + left.index_value.key == right.index_value.key + and not np.isnan(sum(left.nsplits[0])) + and not np.isnan(sum(right.nsplits[0])) + and left.nsplits[0] == right.nsplits[0] + ): + is_identical = True + else: + target_chunk_index_values = [ + c.index_value for c in left.chunks if len(c.index) <= 1 or c.index[1] == 0 + ] + value_chunk_index_values = [v.index_value for v in right.chunks] + is_identical = len(target_chunk_index_values) == len( + value_chunk_index_values + ) and all( + c.key == v.key + for c, v in zip(target_chunk_index_values, value_chunk_index_values) + ) + return is_identical + + +def _filter_range_index(pd_range_index, min_val, min_val_close, max_val, max_val_close): + if is_pd_range_empty(pd_range_index): + return pd_range_index + + raw_min, raw_max, step = ( + pd_range_index.min(), + pd_range_index.max(), + _get_range_index_step(pd_range_index), + ) + + # seek min range + greater_func = operator.gt if min_val_close else operator.ge + actual_min = raw_min + while greater_func(min_val, actual_min): + actual_min += abs(step) + if step < 0: + actual_min += step # on the right side + + # seek max range + less_func = operator.lt if max_val_close else operator.le + actual_max = raw_max + while less_func(max_val, actual_max): + actual_max -= abs(step) + if step > 0: + actual_max += step # on the right side + + if step > 0: + return pd.RangeIndex(actual_min, actual_max, step) + return pd.RangeIndex(actual_max, actual_min, step) + + +def infer_index_value(left_index_value, right_index_value): + from .core import IndexValue + + if isinstance(left_index_value.value, IndexValue.RangeIndex) and isinstance( + right_index_value.value, IndexValue.RangeIndex + ): + if left_index_value.value.slice == right_index_value.value.slice: + return left_index_value + return parse_index( + pd.Index([], dtype=np.int64), left_index_value, right_index_value + ) + + # when left index and right index is identical, and both of them are elements unique, + # we can infer that the out index should be identical also + if ( + left_index_value.is_unique + and right_index_value.is_unique + and left_index_value.key == right_index_value.key + ): + return left_index_value + + left_index = left_index_value.to_pandas() + right_index = right_index_value.to_pandas() + out_index = pd.Index( + [], dtype=find_common_type([left_index.dtype, right_index.dtype]) + ) + return parse_index(out_index, left_index_value, right_index_value) + + +def filter_index_value(index_value, min_max, store_data=False): + from .core import 
IndexValue + + min_val, min_val_close, max_val, max_val_close = min_max + + pd_index = index_value.to_pandas() + + if isinstance(index_value.value, IndexValue.RangeIndex): + pd_filtered_index = _filter_range_index( + pd_index, min_val, min_val_close, max_val, max_val_close + ) + return parse_index(pd_filtered_index, store_data=store_data) + + if min_val_close: + f = pd_index >= min_val + else: + f = pd_index > min_val + if max_val_close: + f = f & (pd_index <= max_val) + else: + f = f & (pd_index < max_val) + + return parse_index(pd_index[f], store_data=store_data) + + +def indexing_index_value(index_value, indexes, store_data=False, rechunk=False): + pd_index = index_value.to_pandas() + # when rechunk is True, the output index shall be treated + # different from the input one + if not rechunk and isinstance(indexes, slice) and is_full_slice(indexes): + return index_value + elif not index_value.has_value(): + new_index_value = parse_index(pd_index, indexes, store_data=store_data) + new_index_value._index_value._min_val = index_value.min_val + new_index_value._index_value._min_val_close = index_value.min_val_close + new_index_value._index_value._max_val = index_value.max_val + new_index_value._index_value._max_val_close = index_value.max_val_close + return new_index_value + else: + if isinstance(indexes, Integral): + return parse_index(pd_index[[indexes]], store_data=store_data) + elif isinstance(indexes, Entity): + if isinstance(pd_index, pd.RangeIndex): + return parse_index( + pd.RangeIndex(-1), indexes, index_value, store_data=False + ) + else: + return parse_index( + type(pd_index)([]), indexes, index_value, store_data=False + ) + if isinstance(indexes, tuple): + return parse_index(pd_index[list(indexes)], store_data=store_data) + else: + return parse_index(pd_index[indexes], store_data=store_data) + + +def merge_index_value(to_merge_index_values: dict, store_data: bool = False): + """ + Merge index value according to their chunk index. 
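+    Index values are visited in ascending chunk-index order; their pandas
+    indexes are appended when data is stored or when both are contiguous
+    ``RangeIndex`` pieces, otherwise only the overall min/max bounds are
+    carried over to the merged index value.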
+ + Parameters + ---------- + to_merge_index_values : dict + index to index_value + store_data : bool + store data in index_value + + Returns + ------- + merged_index_value + """ + + pd_index = None + min_val, min_val_close, max_val, max_val_close = None, None, None, None + for _, chunk_index_value in sorted(to_merge_index_values.items()): + if pd_index is None: + pd_index = chunk_index_value.to_pandas() + min_val, min_val_close, max_val, max_val_close = ( + chunk_index_value.min_val, + chunk_index_value.min_val_close, + chunk_index_value.max_val, + chunk_index_value.max_val_close, + ) + else: + cur_pd_index = chunk_index_value.to_pandas() + if store_data or ( + isinstance(pd_index, pd.RangeIndex) + and isinstance(cur_pd_index, pd.RangeIndex) + and cur_pd_index.step == pd_index.step + and cur_pd_index.start == pd_index.stop + ): + # range index that is continuous + pd_index = pd_index.append(cur_pd_index) + else: + pd_index = pd.Index([], dtype=pd_index.dtype) + if chunk_index_value.min_val is not None: + try: + if min_val is None or min_val > chunk_index_value.min_val: + min_val = chunk_index_value.min_val + min_val_close = chunk_index_value.min_val_close + except TypeError: + # min_value has different types that cannot compare + # just stop compare + continue + if chunk_index_value.max_val is not None: + if max_val is None or max_val < chunk_index_value.max_val: + max_val = chunk_index_value.max_val + max_val_close = chunk_index_value.max_val_close + + index_value = parse_index(pd_index, store_data=store_data) + if not index_value.has_value(): + index_value._index_value._min_val = min_val + index_value._index_value._min_val_close = min_val_close + index_value._index_value._max_val = max_val + index_value._index_value._max_val_close = max_val_close + return index_value + + +def infer_dtypes(left_dtypes, right_dtypes, operator): + left = build_empty_df(left_dtypes) + right = build_empty_df(right_dtypes) + return operator(left, right).dtypes + + +@functools.lru_cache(100) +def infer_dtype(left_dtype, right_dtype, operator): + left = build_empty_series(left_dtype) + right = build_empty_series(right_dtype) + return operator(left, right).dtype + + +def filter_dtypes(dtypes, column_min_max): + left_filter = operator.ge if column_min_max[1] else operator.gt + left = left_filter(dtypes.index, column_min_max[0]) + right_filter = operator.le if column_min_max[3] else operator.lt + right = right_filter(dtypes.index, column_min_max[2]) + return dtypes[left & right] + + +def in_range_index(i, pd_range_index): + """ + Check whether the input `i` is within `pd_range_index` which is a pd.RangeIndex. 
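+    # Illustrative example (hypothetical values): for pd.RangeIndex(0, 10, 2)
+    # the members are 0, 2, 4, 6 and 8, so ``in_range_index(4, idx)`` is True
+    # while ``in_range_index(5, idx)`` and ``in_range_index(10, idx)`` are False.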
+ """ + start, stop, step = ( + _get_range_index_start(pd_range_index), + _get_range_index_stop(pd_range_index), + _get_range_index_step(pd_range_index), + ) + if step > 0 and start <= i < stop and (i - start) % step == 0: + return True + if step < 0 and start >= i > stop and (start - i) % step == 0: + return True + return False + + +def wrap_notimplemented_exception(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except NotImplementedError: + return NotImplemented + + return wrapper + + +def validate_axis(axis, tileable=None): + if axis == "index": + axis = 0 + elif axis == "columns": + axis = 1 + + illegal = False + try: + axis = operator.index(axis) + if axis < 0 or (tileable is not None and axis >= tileable.ndim): + illegal = True + except TypeError: + illegal = True + + if illegal: + raise ValueError(f"No axis named {axis} for object type {type(tileable)}") + return axis + + +def validate_axis_style_args( + data, args, kwargs, arg_name, method_name +): # pragma: no cover + """Argument handler for mixed index, columns / axis functions + + In an attempt to handle both `.method(index, columns)`, and + `.method(arg, axis=.)`, we have to do some bad things to argument + parsing. This translates all arguments to `{index=., columns=.}` style. + + Parameters + ---------- + data : DataFrame + args : tuple + All positional arguments from the user + kwargs : dict + All keyword arguments from the user + arg_name, method_name : str + Used for better error messages + + Returns + ------- + kwargs : dict + A dictionary of keyword arguments. Doesn't modify ``kwargs`` + inplace, so update them with the return value here. + """ + out = {} + # Goal: fill 'out' with index/columns-style arguments + # like out = {'index': foo, 'columns': bar} + + # Start by validating for consistency + axes_names = ["index"] if data.ndim == 1 else ["index", "columns"] + if "axis" in kwargs and any(x in kwargs for x in axes_names): + msg = "Cannot specify both 'axis' and any of 'index' or 'columns'." + raise TypeError(msg) + + # First fill with explicit values provided by the user... + if arg_name in kwargs: + if args: + msg = f"{method_name} got multiple values for argument '{arg_name}'" + raise TypeError(msg) + + axis = axes_names[validate_axis(kwargs.get("axis", 0), data)] + out[axis] = kwargs[arg_name] + + # More user-provided arguments, now from kwargs + for k, v in kwargs.items(): + try: + ax = axes_names[validate_axis(k, data)] + except ValueError: + pass + else: + out[ax] = v + + # All user-provided kwargs have been handled now. + # Now we supplement with positional arguments, emitting warnings + # when there's ambiguity and raising when there's conflicts + + if len(args) == 0: + pass # It's up to the function to decide if this is valid + elif len(args) == 1: + axis = axes_names[validate_axis(kwargs.get("axis", 0), data)] + out[axis] = args[0] + elif len(args) == 2: + if "axis" in kwargs: + # Unambiguously wrong + msg = "Cannot specify both 'axis' and any of 'index' or 'columns'" + raise TypeError(msg) + + msg = ( + "Interpreting call\n\t'.{method_name}(a, b)' as " + "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " + "arguments to remove any ambiguity." + ) + raise TypeError(msg.format(method_name=method_name)) + else: + msg = f"Cannot specify all of '{arg_name}', 'index', 'columns'." 
+ raise TypeError(msg) + return out + + +def validate_output_types(**kwargs): + from ..core import OutputType + + output_type = kwargs.pop("object_type", None) or kwargs.pop("output_type", None) + output_types = kwargs.pop("output_types", None) or ( + [output_type] if output_type is not None else None + ) + return ( + [ + getattr(OutputType, v.lower()) if isinstance(v, str) else v + for v in output_types + ] + if output_types + else None + ) + + +def standardize_range_index(chunks: List[ChunkType], axis: int = 0): + from .base.standardize_range_index import ChunkStandardizeRangeIndex + + row_chunks = dict( + (k, next(v)) for k, v in itertools.groupby(chunks, key=lambda x: x.index[axis]) + ) + row_chunks = [row_chunks[i] for i in range(len(row_chunks))] + + out_chunks = [] + for c in chunks: + prev_chunks = row_chunks[: c.index[axis]] + op = ChunkStandardizeRangeIndex( + prev_shapes=[p.shape for p in prev_chunks], axis=axis + ) + op.output_types = c.op.output_types + params = c.params.copy() + start_pos = sum(p.shape[axis] for p in prev_chunks) + end_pos = start_pos + c.shape[axis] + index = pd.RangeIndex(start_pos, end_pos) + if axis == 0: + params["index_value"] = parse_index(index) + else: + dtypes = params["dtypes"] + dtypes.index = index + params["dtypes"] = dtypes + params["columns_value"] = parse_index(dtypes.index, store_data=True) + out_chunks.append(op.new_chunk([c], kws=[params])) + + return out_chunks + + +def fetch_corner_data(df_or_series, session=None) -> pd.DataFrame: + """ + Fetch corner DataFrame or Series for repr usage. + + :param df_or_series: DataFrame or Series + :return: corner DataFrame + """ + from .indexing.iloc import iloc + + max_rows = pd.get_option("display.max_rows") + try: + min_rows = pd.get_option("display.min_rows") + min_rows = min(min_rows, max_rows) + except KeyError: # pragma: no cover + # display.min_rows is introduced in pandas 0.25 + min_rows = max_rows + + index_size = None + if ( + df_or_series.shape[0] > max_rows + and df_or_series.shape[0] > min_rows // 2 * 2 + 2 + ): + # for pandas, greater than max_rows + # will display min_rows + # thus we fetch min_rows + 2 lines + index_size = min_rows // 2 + 1 + + if index_size is None: + return df_or_series._fetch(session=session) + else: + head = iloc(df_or_series)[:index_size] + tail = iloc(df_or_series)[-index_size:] + head_data, tail_data = ExecutableTuple([head, tail]).fetch(session=session) + xdf = cudf if head.op.is_gpu() else pd + return xdf.concat([head_data, tail_data], axis="index") + + +class ReprSeries(pd.Series): + def __init__(self, corner_data, real_shape): + super().__init__(corner_data) + self._real_shape = real_shape + + def __len__(self): + # As we only fetch corner data to repr, + # the length would be wrong and we have no way to control, + # thus we just overwrite the length to show the real one + return self._real_shape[0] + + +def filter_dtypes_by_index(dtypes, index): + try: + new_dtypes = dtypes.loc[index].dropna() + except KeyError: + dtypes_idx = ( + dtypes.index.to_frame() + .merge(index.to_frame()) + .set_index(list(range(dtypes.index.nlevels))) + .index + ) + new_dtypes = dtypes.loc[dtypes_idx] + new_dtypes.index.names = dtypes.index.names + return new_dtypes + + +@contextmanager +def create_sa_connection(con, **kwargs): + import sqlalchemy as sa + from sqlalchemy.engine import Connection, Engine + + # process con + engine = None + if isinstance(con, Connection): + # connection create by user + close = False + dispose = False + elif isinstance(con, Engine): + con = 
con.connect() + close = True + dispose = False + else: + engine = sa.create_engine(con, **kwargs) + con = engine.connect() + close = True + dispose = True + + try: + yield con + finally: + if close: + con.close() + if dispose: + engine.dispose() + + +def arrow_table_to_pandas_dataframe(arrow_table, use_arrow_dtype=True, **kw): + if not use_arrow_dtype: + # if not use arrow string, just return + return arrow_table.to_pandas(**kw) + + from .arrays import ArrowListArray, ArrowStringArray + + table: pa.Table = arrow_table + schema: pa.Schema = arrow_table.schema + + arrow_field_names = list() + arrow_arrays = list() + arrow_indexes = list() + other_field_names = list() + other_arrays = list() + for i, arrow_type in enumerate(schema.types): + if arrow_type == pa.string() or isinstance(arrow_type, pa.ListType): + arrow_field_names.append(schema.names[i]) + arrow_indexes.append(i) + arrow_arrays.append(table.columns[i]) + else: + other_field_names.append(schema.names[i]) + other_arrays.append(table.columns[i]) + + df: pd.DataFrame = pa.Table.from_arrays( + other_arrays, names=other_field_names + ).to_pandas(**kw) + for arrow_index, arrow_name, arrow_array in zip( + arrow_indexes, arrow_field_names, arrow_arrays + ): + if arrow_array.type == pa.string(): + series = pd.Series(ArrowStringArray(arrow_array)) + else: + assert isinstance(arrow_array.type, pa.ListType) + series = pd.Series(ArrowListArray(arrow_array)) + df.insert(arrow_index, arrow_name, series) + + return df + + +def contain_arrow_dtype(dtypes): + from .arrays import ArrowStringDtype + + return any(isinstance(dtype, ArrowStringDtype) for dtype in dtypes) + + +def to_arrow_dtypes(dtypes, test_df=None): + from .arrays import ArrowStringDtype + + new_dtypes = dtypes.copy() + for i in range(len(dtypes)): + dtype = dtypes.iloc[i] + if is_string_dtype(dtype): + if test_df is not None: + series = test_df.iloc[:, i] + # check value + non_na_series = series[series.notna()] + if len(non_na_series) > 0: + first_value = non_na_series.iloc[0] + if isinstance(first_value, str): + new_dtypes.iloc[i] = ArrowStringDtype() + else: # pragma: no cover + # empty, set arrow string dtype + new_dtypes.iloc[i] = ArrowStringDtype() + else: + # empty, set arrow string dtype + new_dtypes.iloc[i] = ArrowStringDtype() + return new_dtypes + + +def make_dtype(dtype): + if isinstance(dtype, (np.dtype, ExtensionDtype)): + return dtype + return np.dtype(dtype) if dtype is not None else None + + +def make_dtypes(dtypes): + if dtypes is None: + return None + if not isinstance(dtypes, pd.Series): + dtypes = pd.Series(dtypes) + return dtypes.apply(make_dtype) + + +def is_dataframe(x): + if cudf is not None: # pragma: no cover + if isinstance(x, cudf.DataFrame): + return True + return isinstance(x, pd.DataFrame) + + +def is_series(x): + if cudf is not None: # pragma: no cover + if isinstance(x, cudf.Series): + return True + return isinstance(x, pd.Series) + + +def is_index(x): + if cudf is not None: # pragma: no cover + if isinstance(x, cudf.Index): + return True + return isinstance(x, pd.Index) + + +def get_xdf(x): + if cudf is not None: # pragma: no cover + if isinstance(x, (cudf.DataFrame, cudf.Series, cudf.Index)): + return cudf + return pd + + +def is_cudf(x): + if cudf is not None: # pragma: no cover + if isinstance(x, (cudf.DataFrame, cudf.Series, cudf.Index)): + return True + return False + + +def auto_merge_chunks( + ctx: Context, + df_or_series: TileableType, + merged_file_size: Union[int, float, str] = None, +) -> TileableType: + from .merge import DataFrameConcat 
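+    # Merge strategy (summary of the code below): per-chunk memory sizes are read
+    # from the chunk metas; chunks are then scanned in order and packed into groups
+    # by accumulated size against ``merged_file_size`` (``options.chunk_store_limit``
+    # when not given), and each group is fused into a single chunk via DataFrameConcat.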
+ + if df_or_series.ndim == 2 and df_or_series.chunk_shape[1] > 1: + # skip auto merge optimization for DataFrame + # that has more than 1 chunks on columns axis + return df_or_series + + metas = ctx.get_chunks_meta( + [c.key for c in df_or_series.chunks], fields=["memory_size"], error="ignore" + ) + memory_sizes = [meta["memory_size"] if meta is not None else None for meta in metas] + if any(size is None for size in memory_sizes): + # has not been executed before, cannot get accurate memory size, skip auto merge + return df_or_series + + def _concat_chunks(merge_chunks: List[ChunkType], output_index: int): + chunk_size = sum(c.shape[0] for c in merge_chunks) + concat_op = DataFrameConcat(output_types=df_or_series.op.output_types) + if df_or_series.ndim == 1: + kw = dict( + dtype=df_or_series.dtype, + index_value=merge_index_value( + {c.index: c.index_value for c in merge_chunks} + ), + shape=(chunk_size,), + index=(output_index,), + name=df_or_series.name, + ) + else: + kw = dict( + dtypes=merge_chunks[0].dtypes, + index_value=merge_index_value( + {c.index: c.index_value for c in merge_chunks} + ), + columns_value=merge_chunks[0].columns_value, + shape=(chunk_size, merge_chunks[0].shape[1]), + index=(output_index, 0), + ) + return concat_op.new_chunk(merge_chunks, **kw) + + to_merge_size = ( + parse_readable_size(merged_file_size)[0] + if merged_file_size is not None + else options.chunk_store_limit + ) + to_merge_chunks = [] + acc_memory_size = 0 + n_split = [] + out_chunks = [] + last_idx = len(memory_sizes) - 1 + for idx, (chunk, chunk_memory_size) in enumerate( + zip(df_or_series.chunks, memory_sizes) + ): + to_merge_chunks.append(chunk) + acc_memory_size += chunk_memory_size + if ( + acc_memory_size + chunk_memory_size > to_merge_size + and len(to_merge_chunks) > 0 + ) or idx == last_idx: + # adding current chunk would exceed the maximum, + # concat previous chunks + if len(to_merge_chunks) == 1: + # do not generate concat op for 1 input. + c = to_merge_chunks[0].copy() + c._index = ( + (len(n_split),) if df_or_series.ndim == 1 else (len(n_split), 0) + ) + out_chunks.append(c) + n_split.append(c.shape[0]) + else: + merged_chunk = _concat_chunks(to_merge_chunks, len(n_split)) + out_chunks.append(merged_chunk) + n_split.append(merged_chunk.shape[0]) + # reset + acc_memory_size = 0 + to_merge_chunks = [] + # process the last chunk + assert len(to_merge_chunks) == 0 + new_op = df_or_series.op.copy() + params = df_or_series.params.copy() + params["chunks"] = out_chunks + if df_or_series.ndim == 1: + params["nsplits"] = (tuple(n_split),) + else: + params["nsplits"] = (tuple(n_split), df_or_series.nsplits[1]) + return new_op.new_tileable(df_or_series.op.inputs, kws=[params]) + + +# TODO: clean_up_func, is_on_ray and restore_func functions may be +# removed or refactored in the future to calculate func size +# with more accuracy as well as address some serialization issues. +def clean_up_func(op): + threshold = int(os.getenv("MARS_CLOSURE_CLEAN_UP_BYTES_THRESHOLD", 10**4)) + if threshold == -1: # pragma: no cover + return + ctx = get_context() + if ctx is None: + return + + # Note: op.func_key is set only when func was put into storage. + # Under ray backend, func will be put into storage. + # While under mars backend, since storage service is empty on supervisor, + # func won't be put into storage but serialized in advance to reduce upcoming + # expenses brought by serializations and deserializations during subtask transmission. 
+ if whether_to_clean_up(op, threshold) is True: + assert ( + op.logic_key is not None + ), f"Logic key of {op} wasn't calculated before cleaning up func." + logger.info("%s is cleaning up func %s.", op, op.func) + if is_on_ray(ctx): + import ray + + op.func_key = ray.put(op.func) + logger.info("%s func %s is replaced by %s.", op, op.func, op.func_key) + op.func = None + else: + op.func = cloudpickle.dumps(op.func) + + +def whether_to_clean_up(op, threshold): + func = op.func + counted_bytes = 0 + max_recursion_depth = 2 + + from collections import deque + from numbers import Number + + BYPASS_CLASSES = (str, bytes, Number, range, bytearray, pd.DataFrame, pd.Series) + + class GetSizeEarlyStopException(Exception): + pass + + def check_exceed_threshold(): + nonlocal threshold, counted_bytes + if counted_bytes >= threshold: + raise GetSizeEarlyStopException() + + def getsize(obj_outer): + _seen_obj_ids = set() + + def inner_count(obj, recursion_depth): + obj_id = id(obj) + if obj_id in _seen_obj_ids or recursion_depth > max_recursion_depth: + return 0 + _seen_obj_ids.add(obj_id) + recursion_depth += 1 + size = sys.getsizeof(obj) + if isinstance(obj, BYPASS_CLASSES): + return size + elif isinstance(obj, (tuple, list, set, deque)): + size += sum(inner_count(i, recursion_depth) for i in obj) + elif hasattr(obj, "items"): + size += sum( + inner_count(k, recursion_depth) + inner_count(v, recursion_depth) + for k, v in getattr(obj, "items")() + ) + if hasattr(obj, "__dict__"): + size += inner_count(vars(obj), recursion_depth) + if hasattr(obj, "__slots__"): + size += sum( + inner_count(getattr(obj, s), recursion_depth) + for s in obj.__slots__ + if hasattr(obj, s) + ) + return size + + return inner_count(obj_outer, 0) + + try: + # Note: In most cases, func is just a function with closure, while chances are that + # func is a callable that doesn't have __closure__ attribute. + if inspect.isclass(func): + pass + elif hasattr(func, "__closure__") and func.__closure__ is not None: + for cell in func.__closure__: + counted_bytes += getsize(cell.cell_contents) + check_exceed_threshold() + elif callable(func): + if hasattr(func, "__dict__"): + for k, v in func.__dict__.items(): + counted_bytes += sum([getsize(k), getsize(v)]) + check_exceed_threshold() + if hasattr(func, "__slots__"): + for slot in func.__slots__: + counted_bytes += ( + getsize(getattr(func, slot)) if hasattr(func, slot) else 0 + ) + check_exceed_threshold() + except GetSizeEarlyStopException: + logger.debug("Func needs cleanup.") + op.need_clean_up_func = True + else: + assert op.need_clean_up_func is False + logger.debug("Func doesn't need cleanup.") + + return op.need_clean_up_func + + +def restore_func(ctx: Context, op): + if op.need_clean_up_func and ctx is not None: + logger.info("%s is restoring func from %s.", op, op.func_key) + if is_on_ray(ctx): + import ray + + op.func = ray.get(op.func_key) + logger.info("%s func %s is restored.", op, op.func) + else: + op.func = cloudpickle.loads(op.func) + + +def concat_on_columns(objs: List) -> Any: + xdf = get_xdf(objs[0]) + # In cudf, concat with axis=1 and ignore_index=False by default behaves opposite to pandas. + # Cudf would reset the index when axis=1 and ignore_index=False, which does not match with its document. + # Therefore, we deal with this case specially. 
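+    # e.g. for two aligned inputs, pandas keeps the shared index after the
+    # axis=1 concat, while for cudf the original index is re-assigned below.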
+ result = xdf.concat(objs, axis=1) + if xdf is cudf: + result.index = objs[0].index + return result diff --git a/python/xorbits/_mars/dataframe/window/__init__.py b/python/xorbits/_mars/dataframe/window/__init__.py new file mode 100644 index 000000000..9be1d5efa --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + from ..core import DATAFRAME_TYPE, SERIES_TYPE + from .ewm.aggregation import DataFrameEwmAgg + from .ewm.core import ewm + from .expanding.aggregation import DataFrameExpandingAgg + from .expanding.core import expanding + from .rolling.aggregation import DataFrameRollingAgg + from .rolling.core import rolling + + for t in DATAFRAME_TYPE + SERIES_TYPE: + t.rolling = rolling + t.expanding = expanding + t.ewm = ewm + + +_install() +del _install diff --git a/python/xorbits/_mars/dataframe/window/aggregation.py b/python/xorbits/_mars/dataframe/window/aggregation.py new file mode 100644 index 000000000..1019ac6bc --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/aggregation.py @@ -0,0 +1,632 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict, namedtuple +from collections.abc import Iterable +from typing import Dict + +import numpy as np +import pandas as pd + +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + Int64Field, + StringField, +) +from ...utils import tokenize +from ..core import DATAFRAME_TYPE +from ..merge import DataFrameConcat +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_df, build_empty_series, filter_dtypes_by_index, parse_index + +_stage_info = namedtuple( + "_stage_info", + ( + "map_groups", + "map_sources", + "combine_sources", + "combine_columns", + "combine_funcs", + "key_to_funcs", + "valid_columns", + "min_periods_func_name", + ), +) + + +class BaseDataFrameExpandingAgg(DataFrameOperand, DataFrameOperandMixin): + _min_periods = Int64Field("min_periods") + _axis = Int32Field("axis") + _func = AnyField("func") + + # always treat count as valid. 
this behavior is cancelled in pandas 1.0 + _count_always_valid = BoolField("count_always_valid") + # True if function name is treated as new index + _append_index = BoolField("append_index") + + # chunk params + _output_agg = BoolField("output_agg") + + _map_groups = DictField("map_groups") + _map_sources = DictField("map_sources") + _combine_sources = DictField("combine_sources") + _combine_columns = DictField("combine_columns") + _combine_funcs = DictField("combine_funcs") + _key_to_funcs = DictField("keys_to_funcs") + + _min_periods_func_name = StringField("min_periods_func_name") + + def __init__( + self, + min_periods=None, + axis=None, + func=None, + count_always_valid=None, + append_index=None, + output_agg=False, + map_groups=None, + map_sources=None, + combine_sources=None, + combine_columns=None, + combine_funcs=None, + key_to_funcs=None, + min_periods_func_name=None, + **kw + ): + super().__init__( + _min_periods=min_periods, + _axis=axis, + _func=func, + _count_always_valid=count_always_valid, + _append_index=append_index, + _output_agg=output_agg, + _map_groups=map_groups, + _map_sources=map_sources, + _combine_sources=combine_sources, + _combine_columns=combine_columns, + _combine_funcs=combine_funcs, + _key_to_funcs=key_to_funcs, + _min_periods_func_name=min_periods_func_name, + **kw + ) + + @property + def min_periods(self) -> int: + return self._min_periods + + @property + def axis(self) -> int: + return self._axis + + @property + def func(self): + return self._func + + @property + def count_always_valid(self): + return self._count_always_valid + + @property + def append_index(self): + return self._append_index + + @property + def output_agg(self): + return self._output_agg + + @property + def map_groups(self) -> Dict: + return self._map_groups + + @property + def map_sources(self) -> Dict: + return self._map_sources + + @property + def combine_sources(self) -> Dict: + return self._combine_sources + + @property + def combine_columns(self) -> Dict: + return self._combine_columns + + @property + def combine_funcs(self) -> Dict: + return self._combine_funcs + + @property + def key_to_funcs(self) -> Dict: + return self._key_to_funcs + + @property + def min_periods_func_name(self) -> str: + return self._min_periods_func_name + + @property + def output_limit(self): + return 2 if self.output_agg else 1 + + def __call__(self, expanding): + inp = expanding.input + raw_func = self.func + self._normalize_funcs() + + if isinstance(inp, DATAFRAME_TYPE): + empty_df = build_df(inp) + for c, t in empty_df.dtypes.items(): + if t == np.dtype("O"): + empty_df[c] = "O" + + test_df = expanding(empty_df).agg(raw_func) + if self._axis == 0: + index_value = inp.index_value + else: + index_value = parse_index( + test_df.index, expanding.params, inp, store_data=False + ) + self._append_index = test_df.columns.nlevels != empty_df.columns.nlevels + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_df.shape[1]), + dtypes=test_df.dtypes, + index_value=index_value, + columns_value=parse_index(test_df.columns, store_data=True), + ) + else: + pd_index = inp.index_value.to_pandas() + empty_series = build_empty_series( + inp.dtype, index=pd_index[:0], name=inp.name + ) + test_obj = expanding(empty_series).agg(raw_func) + if isinstance(test_obj, pd.DataFrame): + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_obj.shape[1]), + dtypes=test_obj.dtypes, + index_value=inp.index_value, + columns_value=parse_index(test_obj.dtypes.index, store_data=True), + ) + else: + return 
self.new_series( + [inp], + shape=inp.shape, + dtype=test_obj.dtype, + index_value=inp.index_value, + name=test_obj.name, + ) + + def _normalize_funcs(self): + if isinstance(self._func, dict): + new_func = OrderedDict() + for k, v in self._func.items(): + if isinstance(v, str) or callable(v): + new_func[k] = [v] + else: + new_func[k] = v + self._func = new_func + elif isinstance(self._func, Iterable) and not isinstance(self._func, str): + self._func = list(self._func) + + @staticmethod + def _safe_append(d, key, val): + if key not in d: + d[key] = [] + if val not in d[key]: + d[key].append(val) + + @classmethod + def _get_stage_functions(cls, op: "BaseDataFrameExpandingAgg", func): + raise NotImplementedError + + @classmethod + def _gen_chunk_stage_info( + cls, op: "BaseDataFrameExpandingAgg", chunk_cols=None, min_periods=1 + ): + map_groups = OrderedDict() + map_sources = OrderedDict() + combine_sources = OrderedDict() + combine_columns = OrderedDict() + combine_funcs = OrderedDict() + key_to_funcs = OrderedDict() + valid_columns = [] + min_periods_func_name = None + + def _clean_dict(d): + return OrderedDict( + (k, sorted(v) if v != [None] else None) for k, v in d.items() + ) + + def _fun_to_str(fun): + if isinstance(fun, str): + return fun + fun_str = tokenize(fun) + key_to_funcs[fun_str] = fun + return fun if isinstance(fun, str) else tokenize(fun) + + def _add_column_to_functions(col, fun_name, mappers, aggregator): + sources = [] + for mapper in mappers: + mapper_str = _fun_to_str(mapper) + cls._safe_append(map_groups, mapper_str, col) + sources.append(mapper_str) + + combine_sources[fun_name] = sources + cls._safe_append(combine_columns, fun_name, col) + combine_funcs[fun_name] = _fun_to_str(aggregator) + + chunk_cols = set(chunk_cols) if chunk_cols is not None else None + if isinstance(op.func, list): + op_func = {None: op.func} + elif isinstance(op.func, str): + op_func = {None: [op.func]} + else: + op_func = op.func + + for col, funcs in op_func.items(): + if col is not None: + if chunk_cols is not None and col not in chunk_cols: + continue + valid_columns.append(col) + + if min_periods > 1: + min_periods_func_name = tokenize(chunk_cols, "min_periods") + _add_column_to_functions( + col, + min_periods_func_name, + *cls._get_stage_functions(op, "_data_count") + ) + + for func in funcs: + mapper_funcs, combine_func = cls._get_stage_functions(op, func) + _add_column_to_functions(col, func, mapper_funcs, combine_func) + + return _stage_info( + map_groups=_clean_dict(map_groups), + map_sources=map_sources, + combine_sources=combine_sources, + combine_columns=_clean_dict(combine_columns), + combine_funcs=combine_funcs, + key_to_funcs=key_to_funcs, + valid_columns=valid_columns or None, + min_periods_func_name=min_periods_func_name, + ) + + @classmethod + def _remap_dtypes(cls, in_df, out_df): + if in_df.ndim == 1: + if out_df.ndim == 2: + return ( + {0: (0, out_df.dtypes)}, + (in_df.nsplits[0], (len(out_df.dtypes),)), + ) + return None, in_df.nsplits + + axis = out_df.op.axis + chunk_idx_to_dtypes = dict() + new_dtypes_sizes = [] + for c in in_df.cix[0, :]: + columns = c.columns_value.to_pandas() + dtypes = filter_dtypes_by_index(out_df.dtypes, columns) + + if len(dtypes): + chunk_idx_to_dtypes[c.index[1]] = (len(chunk_idx_to_dtypes), dtypes) + new_dtypes_sizes.append(len(dtypes)) + new_nsplits = list(in_df.nsplits) + new_nsplits[1 - axis] = tuple(new_dtypes_sizes) + return chunk_idx_to_dtypes, tuple(new_nsplits) + + @classmethod + def _tile_single(cls, op: "BaseDataFrameExpandingAgg"): 
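+        # Single-chunk case along the aggregation axis: dtypes are remapped per
+        # input chunk and each chunk is rewritten with a copied op, so no
+        # map/combine stages are generated.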
+ in_df = op.inputs[0] + out_df = op.outputs[0] + + chunk_idx_to_dtypes, new_nsplits = cls._remap_dtypes(in_df, out_df) + + chunks = [] + for c in in_df.chunks: + try: + if out_df.ndim == 2: + new_axis_idx, new_dtypes = chunk_idx_to_dtypes[ + c.index[1] if c.ndim > 1 else 0 + ] + else: + new_axis_idx, new_dtypes = None, None + except KeyError: + continue + + chunk_op = op.copy().reset_key() + + if out_df.ndim == 2: + chunks.append( + chunk_op.new_chunk( + [in_df.chunks[0]], + dtypes=new_dtypes, + index=(c.index[0], new_axis_idx), + shape=(c.shape[0], len(new_dtypes)), + index_value=c.index_value, + columns_value=parse_index(new_dtypes.index, store_data=True), + ) + ) + else: + params = c.params.copy() + params["dtype"] = out_df.dtype + chunks.append(chunk_op.new_chunk([in_df.chunks[0]], **params)) + + tileable_op = op.copy().reset_key() + params = out_df.params.copy() + params["chunks"] = chunks + if new_nsplits: + params["nsplits"] = new_nsplits + return tileable_op.new_tileables([in_df], **params) + + @classmethod + def tile(cls, op: "BaseDataFrameExpandingAgg"): + axis = op.axis + + in_df = op.inputs[0] + out_df = op.outputs[0] + + if in_df.chunk_shape[op.axis] == 1: + return cls._tile_single(op) + + dtypes_mapping, new_nsplits = cls._remap_dtypes(in_df, out_df) + new_chunk_shape = tuple(len(split) for split in new_nsplits) + + data_chunks = [] + summary_chunks = np.empty(new_chunk_shape, dtype=object) + stage_info_dict = dict() + for c in in_df.chunks: + try: + if out_df.ndim == 2: + new_axis_idx, new_dtypes = dtypes_mapping[ + c.index[1] if c.ndim > 1 else 0 + ] + else: + new_axis_idx, new_dtypes = None, None + except KeyError: + continue + + new_index = (c.index[0], new_axis_idx) + + try: + stage_info = stage_info_dict[new_index[1]] + except KeyError: + cols = c.dtypes.index if c.ndim == 2 else None + stage_info = stage_info_dict[new_index[1]] = cls._gen_chunk_stage_info( + op, cols, min_periods=op.min_periods + ) + + chunk_op = op.copy().reset_key() + chunk_op._output_agg = c.index[axis] != in_df.chunk_shape[axis] - 1 + chunk_op.stage = OperandStage.map + chunk_op._map_sources = stage_info.map_sources + chunk_op._map_groups = stage_info.map_groups + chunk_op._key_to_funcs = stage_info.key_to_funcs + + if out_df.ndim == 2: + kw0 = dict( + dtypes=new_dtypes, + index=new_index, + shape=(c.shape[0], len(new_dtypes)), + index_value=c.index_value, + columns_value=parse_index(new_dtypes.index, store_data=True), + ) + kw1 = kw0.copy() + kw1["shape"] = (1, len(new_dtypes)) if axis == 0 else (c.shape[0], 1) + else: + kw0 = dict( + dtype=out_df.dtype, + index=c.index, + shape=c.shape, + name=c.name, + index_value=c.index_value, + ) + kw1 = kw0.copy() + kw1["shape"] = (1,) + out_chunks = chunk_op.new_chunks([c], [kw0, kw1]) + data_chunks.append(out_chunks[0]) + if chunk_op.output_agg: + summary_chunks[new_index] = out_chunks[1] + + chunks = [] + for c in data_chunks: + stage_info = stage_info_dict[c.index[1] if c.ndim > 1 else None] + + chunk_op = op.copy().reset_key() + chunk_op._output_agg = False + chunk_op.stage = OperandStage.combine + chunk_op._map_groups = stage_info.map_groups + chunk_op._combine_sources = stage_info.combine_sources + chunk_op._combine_columns = stage_info.combine_columns + chunk_op._combine_funcs = stage_info.combine_funcs + chunk_op._key_to_funcs = stage_info.key_to_funcs + chunk_op._min_periods_func_name = stage_info.min_periods_func_name + + params = c.params.copy() + if c.ndim == 2: + summary_inputs = list(summary_chunks[: c.index[0], c.index[1]]) + else: + 
summary_inputs = list(summary_chunks[: c.index[0]]) + + if len(summary_inputs) > 1: + concat_op = DataFrameConcat( + output_types=out_df.op.output_types, axis=op.axis + ) + concat_summary = concat_op.new_chunk(summary_inputs) + chunks.append(chunk_op.new_chunk([c, concat_summary], **params)) + elif len(summary_inputs) == 1: + chunks.append(chunk_op.new_chunk([c, summary_inputs[0]], **params)) + else: + chunks.append(chunk_op.new_chunk([c], **params)) + + df_op = op.copy().reset_key() + params = out_df.params.copy() + params.update(dict(chunks=chunks, nsplits=new_nsplits)) + return df_op.new_tileables([in_df], **params) + + @classmethod + def _execute_map_function(cls, op: "BaseDataFrameExpandingAgg", func, in_data): + raise NotImplementedError + + @classmethod + def _execute_map(cls, ctx, op: "BaseDataFrameExpandingAgg"): + in_data = ctx[op.inputs[0].key] + + # map according to map groups + map_results = [] + summary_results = [] + for map_func_str, cols in op.map_groups.items(): + if cols is None: + src_df = in_data + else: + src_df = in_data[cols] + + result, summary = cls._execute_map_function(op, map_func_str, src_df) + map_results.append(result) + if op.output_agg: + summary_results.append(summary) + + if op.output_agg: + summary_results.append( + pd.Series([len(in_data)], index=summary_results[0].index) + ) + + ctx[op.outputs[0].key] = tuple(map_results) + if op.output_agg: + ctx[op.outputs[1].key] = tuple(summary_results) + + @classmethod + def _append_func_name_index(cls, op: "BaseDataFrameExpandingAgg", df, func_name): + if not op.append_index: + return + + col_frame = df.columns.to_frame().copy() + col_frame[len(col_frame.columns)] = func_name + df.columns = pd.MultiIndex.from_frame( + col_frame, names=tuple(df.columns.names) + (None,) + ) + + @classmethod + def _execute_combine_function( + cls, op: "BaseDataFrameExpandingAgg", func, pred_inputs, local_inputs, func_cols + ): + raise NotImplementedError + + @classmethod + def _execute_combine(cls, ctx, op: "BaseDataFrameExpandingAgg"): + out_df = op.outputs[0] + local_data = ctx[op.inputs[0].key] + local_data_dict = dict(zip(op.map_groups.keys(), local_data)) + + func_to_aggs = OrderedDict() + + if len(op.inputs) == 1: + pred_record_count = 0 + for func_name, func_sources in op.combine_sources.items(): + func_str = op.combine_funcs[func_name] + func_cols = op.combine_columns[func_name] + if func_cols is None: + local_inputs = [local_data_dict[src] for src in func_sources] + else: + local_inputs = [ + local_data_dict[src][func_cols] for src in func_sources + ] + + func = op.key_to_funcs[func_str] + func_to_aggs[func_name] = cls._execute_combine_function( + op, func, None, local_inputs, func_cols + ) + else: + pred_data = ctx[op.inputs[1].key] + pred_record_count = pred_data[-1].sum() + pred_data_dict = dict(zip(op.map_groups.keys(), pred_data)) + + for func_name, func_sources in op.combine_sources.items(): + func_str = op.combine_funcs[func_name] + func_cols = op.combine_columns[func_name] + if func_cols is None: + local_inputs = [local_data_dict[src] for src in func_sources] + pred_inputs = [pred_data_dict[src] for src in func_sources] + else: + local_inputs = [ + local_data_dict[src][func_cols] for src in func_sources + ] + pred_inputs = [ + pred_data_dict[src][func_cols] for src in func_sources + ] + + func = op.key_to_funcs[func_str] + func_to_aggs[func_name] = cls._execute_combine_function( + op, func, pred_inputs, local_inputs, func_cols + ) + + if op.min_periods_func_name is not None: + valid_counts = 
func_to_aggs.pop(op.min_periods_func_name) + invalid_poses = valid_counts < op.min_periods + for func_name in func_to_aggs.keys(): + if func_name == "count": + if ( + not op.count_always_valid + and pred_record_count < op.min_periods - 1 + ): + try: + func_to_aggs[func_name].iloc[ + : op.min_periods - pred_record_count - 1 + ] = np.nan + except ValueError: + func_to_aggs[func_name] = func_to_aggs[func_name].copy() + func_to_aggs[func_name].iloc[ + : op.min_periods - pred_record_count - 1 + ] = np.nan + else: + func_to_aggs[func_name][invalid_poses] = np.nan + + for func_name, agg_df in func_to_aggs.items(): + if out_df.ndim == 2 and agg_df.ndim == 1: + agg_df.name = func_name + agg_df = func_to_aggs[func_name] = pd.DataFrame(agg_df) + cls._append_func_name_index(op, agg_df, func_name) + + if len(func_to_aggs) == 1: + val = list(func_to_aggs.values())[0] + else: + out_df = op.outputs[0] + val = pd.concat(list(func_to_aggs.values()), axis=1 - op.axis) + + if out_df.ndim > 1: + val = val.reindex( + out_df.columns_value.to_pandas(), axis=1 - op.axis, copy=False + ) + else: + val.name = out_df.name + ctx[op.outputs[0].key] = val + + @classmethod + def _execute_raw_function(cls, op: "BaseDataFrameExpandingAgg", in_data): + raise NotImplementedError + + @classmethod + def execute(cls, ctx, op: "BaseDataFrameExpandingAgg"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + in_data = ctx[op.inputs[0].key] + r = cls._execute_raw_function(op, in_data) + ctx[op.outputs[0].key] = r diff --git a/python/xorbits/_mars/dataframe/window/core.py b/python/xorbits/_mars/dataframe/window/core.py new file mode 100644 index 000000000..5ab6aaee6 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/core.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...serialization.serializables import KeyField, Serializable + + +class Window(Serializable): + _input = KeyField("input") + + def __init__(self, input=None, **kw): # pylint: disable=redefined-builtin + super().__init__(_input=input, **kw) + + @property + def input(self): + return self._input + + @property + def params(self): + raise NotImplementedError + + def _repr(self, params): + kvs = [f"{k}={v}" for k, v in params.items() if v is not None] + return "{} [{}]".format(self._repr_name(), ",".join(kvs)) + + def _repr_name(self): + return type(self).__name__ + + def __repr__(self): + return self._repr(self.params) + + def __getitem__(self, item): + columns = self.input.dtypes.index + if isinstance(item, (list, tuple)): + item = list(item) + for col in item: + if col not in columns: + raise KeyError(f"Column not found: {col}") + else: + if item not in columns: + raise KeyError(f"Column not found: {item}") + + return type(self)(input=self.input[item], **self.params) + + def __getattr__(self, item): + try: + return super().__getattribute__(item) + except AttributeError: + if self.input.ndim == 2 and item in self.input.dtypes: + return self[item] + else: + raise + + def __dir__(self): + result = list(super().__dir__()) + if self.input.ndim == 1: + return result + else: + return sorted( + result + + [ + k + for k in self.input.dtypes.index + if isinstance(k, str) and k.isidentifier() + ] + ) diff --git a/python/xorbits/_mars/dataframe/window/ewm/__init__.py b/python/xorbits/_mars/dataframe/window/ewm/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/ewm/aggregation.py b/python/xorbits/_mars/dataframe/window/ewm/aggregation.py new file mode 100644 index 000000000..31122b2a3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/aggregation.py @@ -0,0 +1,491 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +import numpy as np +import pandas as pd + +from .... 
import opcodes +from ....serialization.serializables import BoolField, Float64Field +from ..aggregation import BaseDataFrameExpandingAgg + +_stage_info = namedtuple( + "_stage_info", + ( + "map_groups", + "map_sources", + "combine_sources", + "combine_columns", + "combine_funcs", + "key_to_funcs", + "valid_columns", + "min_periods_func_name", + ), +) + +_cum_alpha_coeff_func = "_cum_alpha_coeff" +_cum_square_alpha_coeff_func = "_cum_square_alpha_coeff" + + +def _add_pred_results( + pred_results, + local_results, + axis=0, + alpha=None, + order=1, + alpha_ignore_na=False, + pred_exponent=None, + alpha_data=None, +): + if pred_results[0].ndim == 1: + df_filler = 0 + else: + df_filler = pred_results[0].iloc[-1, :].dropna() + df_filler[:] = 0 + + new_locals = [] + combine_axis = pred_results[0].ndim - axis - 1 + weight = (1 - alpha) ** order + pred_coeff = weight**pred_exponent + for idx, (pred_result, local_result) in enumerate(zip(pred_results, local_results)): + local_result.fillna(df_filler, inplace=True) + pred_result = pred_result.mul(pred_coeff).sum(axis=axis) + + if alpha_ignore_na: + pred_df = pred_result * weight ** alpha_data.notna().cumsum() + else: + weights = np.arange(1, len(local_result) + 1) + if local_result.ndim == 2: + weights_df = pd.DataFrame( + np.repeat( + weights.reshape((len(local_result), 1)), + len(local_result.columns), + axis=1, + ), + columns=local_result.columns, + index=local_result.index, + ) + else: + weights_df = pd.Series(weights, index=local_result.index) + weights_df[alpha_data.isna()] = np.nan + weights_df.ffill(inplace=True) + weights_df.fillna(0, inplace=True) + + weights_df = weight**weights_df + pred_df = weights_df.mul(pred_result, axis=combine_axis) + + new_locals.append(local_result.add(pred_df, axis=combine_axis)) + return new_locals + + +def _combine_mean( + pred_results, + local_results, + axis=0, + alpha=None, + alpha_ignore_na=False, + pred_exponent=None, +): + if pred_results is None: + return (local_results[0] / local_results[1]).ffill() + + alpha_data = local_results[1] + local_results[0] = local_results[0].ffill() + local_results[1] = alpha_data.ffill() + + local_sum_data, local_count_data = local_results + + if pred_results is not None: + local_sum_data, local_count_data = _add_pred_results( + pred_results, + local_results, + axis=axis, + alpha=alpha, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + alpha_data=alpha_data, + ) + return local_sum_data / local_count_data + + +def _combine_var( + pred_results, + local_results, + axis=0, + alpha=None, + alpha_ignore_na=False, + pred_exponent=None, +): + local_results[0] = local_results[0].ffill() + alpha_data = local_results[1] + local_results[1] = alpha_data.ffill() + + local_results[2] = local_results[2].ffill() + alpha2_data = local_results[3] + local_results[3] = alpha2_data.ffill() + + ( + local_sum_data, + local_count_data, + local_sum_square, + local_count2_data, + ) = local_results + if pred_results is None: + return (local_sum_square - local_sum_data**2 / local_count_data) / ( + local_count_data - local_count2_data / local_count_data + ) + + pred_sum_data, pred_count_data, pred_sum_square, pred_count2_data = pred_results + + (local_count2_data,) = _add_pred_results( + [pred_count2_data], + [local_count2_data], + axis=axis, + alpha=alpha, + order=2, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + alpha_data=alpha_data, + ) + + local_sum_square, local_sum_data, local_count_data = _add_pred_results( + [pred_sum_square, pred_sum_data, 
pred_count_data], + [local_sum_square, local_sum_data, local_count_data], + axis=axis, + alpha=alpha, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + alpha_data=alpha_data, + ) + + return (local_sum_square - local_sum_data**2 / local_count_data) / ( + local_count_data - local_count2_data / local_count_data + ) + + +def _combine_std( + pred_results, + local_results, + axis=0, + alpha=None, + alpha_ignore_na=False, + pred_exponent=None, +): + return np.sqrt( + _combine_var( + pred_results, + local_results, + axis=axis, + alpha=alpha, + alpha_ignore_na=alpha_ignore_na, + pred_exponent=pred_exponent, + ) + ) + + +def _combine_data_count(pred_results, local_results, axis=0, **__): + if pred_results is None: + return local_results[0] + return local_results[0].add( + pred_results[0].sum(), axis=pred_results[0].ndim - axis - 1 + ) + + +class DataFrameEwmAgg(BaseDataFrameExpandingAgg): + _op_type_ = opcodes.EWM_AGG + + _alpha = Float64Field("alpha") + _adjust = BoolField("adjust") + _alpha_ignore_na = BoolField("alpha_ignore_na") + + _validate_columns = BoolField("_validate_columns") + + _exec_cache = dict() + + def __init__( + self, alpha=None, adjust=None, alpha_ignore_na=None, validate_columns=None, **kw + ): + super().__init__( + _alpha=alpha, + _adjust=adjust, + _alpha_ignore_na=alpha_ignore_na, + _validate_columns=validate_columns, + **kw + ) + + @property + def alpha(self) -> float: + return self._alpha + + @property + def adjust(self) -> bool: + return self._adjust + + @property + def alpha_ignore_na(self) -> bool: + return self._alpha_ignore_na + + @property + def validate_columns(self) -> bool: + return self._validate_columns + + @classmethod + def _get_stage_functions(cls, op: "DataFrameEwmAgg", func): + if func == "_data_count": + return ["_data_count"], _combine_data_count + elif func == "mean": + return ["cumsum", _cum_alpha_coeff_func], _combine_mean + elif func in {"var", "std"}: + return ( + [ + "cumsum", + _cum_alpha_coeff_func, + "cumsum2", + _cum_square_alpha_coeff_func, + ], + _combine_var if func == "var" else _combine_std, + ) + else: # pragma: no cover + raise NotImplementedError + + @classmethod + def _calc_data_alphas(cls, op: "DataFrameEwmAgg", in_data, order): + exec_cache = cls._exec_cache[op.key] + cache_key = ("_calc_data_alphas", order, id(in_data)) + try: + return exec_cache[cache_key] + except KeyError: + pass + + cum_df = in_data.copy() + cum_df[cum_df.notna()] = 1 + if not op.alpha_ignore_na: + cum_df.ffill(inplace=True) + cum_df = cum_df.cumsum(axis=op.axis) - 1 + if not op.alpha_ignore_na: + cum_df[in_data.isna()] = np.nan + + result = exec_cache[cache_key] = (1 - op.alpha) ** (order * cum_df) + return result + + @classmethod + def _execute_cum_alpha_coeff( + cls, op: "DataFrameEwmAgg", in_data, order, final=True + ): + exec_cache = cls._exec_cache[op.key] + cache_key = ("cum_alpha_coeff", order, id(in_data)) + summary = None + + try: + result = exec_cache[cache_key] + except KeyError: + alphas = cls._calc_data_alphas(op, in_data, order) + result = alphas.cumsum() + exec_cache[cache_key] = result + + if final: + if op.output_agg: + summary = result.ffill()[-1:] + return result, summary + + @classmethod + def _execute_cumsum(cls, op: "DataFrameEwmAgg", in_data): + exec_cache = cls._exec_cache[op.key] + cache_key = ("cumsum", id(in_data)) + summary = None + + try: + result = exec_cache[cache_key] + except KeyError: + min_periods = 1 if op.min_periods > 0 else 0 + + try: + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + 
adjust=op.adjust, + min_periods=min_periods, + ).mean() + except ValueError: + in_data = in_data.copy() + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + adjust=op.adjust, + min_periods=min_periods, + ).mean() + + alpha_sum, _ = op._execute_cum_alpha_coeff(op, in_data, 1, final=False) + result = exec_cache[cache_key] = data * alpha_sum + + if op.output_agg: + summary = result.ffill()[-1:] + return result, summary + + @classmethod + def _execute_cumsum2(cls, op: "DataFrameEwmAgg", in_data): + summary = None + min_periods = 1 if op.min_periods > 0 else 0 + + try: + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + adjust=op.adjust, + min_periods=min_periods, + ).var(bias=True) + except ValueError: + in_data = in_data.copy() + data = in_data.ewm( + alpha=op.alpha, + ignore_na=op.alpha_ignore_na, + adjust=op.adjust, + min_periods=min_periods, + ).var(bias=True) + + alpha_sum, _ = op._execute_cum_alpha_coeff(op, in_data, 1) + cumsum, _ = op._execute_cumsum(op, in_data) + result = alpha_sum * data + cumsum**2 / alpha_sum + + if op.output_agg: + summary = result.ffill()[-1:] + + return result, summary + + @classmethod + def _execute_map_function(cls, op: "DataFrameEwmAgg", func, in_data): + in_data = in_data._get_numeric_data() + + summary = None + min_periods = 1 if op.min_periods > 0 else 0 + if func == "_data_count": + result = in_data.expanding(min_periods=min_periods).count() + elif func in (_cum_alpha_coeff_func, _cum_square_alpha_coeff_func): + order = 1 if func == _cum_alpha_coeff_func else 2 + result, summary = cls._execute_cum_alpha_coeff(op, in_data, order) + elif func == "cumsum": + result, summary = cls._execute_cumsum(op, in_data) + elif func == "cumsum2": + result, summary = cls._execute_cumsum2(op, in_data) + else: # pragma: no cover + raise ValueError("Map function %s not supported") + + if op.output_agg: + summary = summary if summary is not None else result.iloc[-1:] + else: + summary = None + return result, summary + + @classmethod + def _execute_map(cls, ctx, op: "DataFrameEwmAgg"): + try: + cls._exec_cache[op.key] = dict() + + super()._execute_map(ctx, op) + if op.output_agg: + in_data = ctx[op.inputs[0].key] + summaries = list(ctx[op.outputs[1].key]) + + if op.alpha_ignore_na: + in_count = in_data.count() + if not isinstance(in_count, pd.Series): + in_count = pd.Series([in_count]) + summary = in_count + if in_data.ndim == 2: + summary = in_count.to_frame().T + summary.index = summaries[-1].index + else: + remain_counts = in_data.notna()[::-1].to_numpy().argmax(axis=0) + if in_data.ndim > 1: + remain_counts = remain_counts.reshape((1, len(in_data.columns))) + summary = pd.DataFrame( + remain_counts, + columns=in_data.columns, + index=summaries[-1].index, + ) + else: + summary = pd.Series(remain_counts, index=summaries[-1].index) + summaries.insert(-1, summary) + + ctx[op.outputs[1].key] = tuple(summaries) + finally: + cls._exec_cache.pop(op.key, None) + + @classmethod + def _execute_combine_function( + cls, op: "DataFrameEwmAgg", func, prev_inputs, local_inputs, func_cols + ): + exec_cache = cls._exec_cache[op.key] + pred_exponent = exec_cache.get("pred_exponent") + if func_cols and pred_exponent is not None: + pred_exponent = ( + pred_exponent[func_cols] if pred_exponent is not None else None + ) + return func( + prev_inputs, + local_inputs, + axis=op.axis, + alpha=op.alpha, + alpha_ignore_na=op.alpha_ignore_na, + pred_exponent=pred_exponent, + ) + + @classmethod + def _execute_combine(cls, ctx, op: "DataFrameEwmAgg"): + try: + 
cls._exec_cache[op.key] = dict() + + if len(op.inputs) != 1: + pred_data = ctx[op.inputs[1].key] + + if op.alpha_ignore_na: + pred_exponent = ( + pred_data[-2].shift(-1)[::-1].cumsum()[::-1].fillna(0) + ) + else: + succ_counts = pred_data[-1].shift(-1) + succ_counts.iloc[-1] = 0 + pred_exponent = pred_data[-2].add( + succ_counts[::-1].cumsum()[::-1], axis=op.axis + ) + + cls._exec_cache[op.key]["pred_exponent"] = pred_exponent + + super()._execute_combine(ctx, op) + finally: + cls._exec_cache.pop(op.key, None) + + @classmethod + def _execute_raw_function(cls, op: "DataFrameEwmAgg", in_data): + for _ in range(2): + ewm = in_data.ewm( + alpha=op.alpha, + min_periods=op.min_periods, + adjust=op.adjust, + ignore_na=op.alpha_ignore_na, + ) + try: + val = ewm.agg(op.func) + if ( + in_data.ndim == 2 + and op.validate_columns + and len(val.columns) != len(op.outputs[0].columns_value.to_pandas()) + ): + raise ValueError("Columns not consistent") + return val + except ValueError: + in_data = in_data.copy() + else: # pragma: no cover + raise ValueError diff --git a/python/xorbits/_mars/dataframe/window/ewm/core.py b/python/xorbits/_mars/dataframe/window/ewm/core.py new file mode 100644 index 000000000..ce56da10c --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/core.py @@ -0,0 +1,288 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
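For reference, a minimal standalone sketch (separate from this patch) of the chunk-combine idea implemented by DataFrameEwmAgg above: each chunk's map step keeps a running weighted sum and a running weight total, and the combine step folds the previous chunk's final totals in after scaling them by (1 - alpha) raised to the number of local observations seen so far. The helper name below is made up for the illustration, and it assumes adjust=True, min_periods=1 and no missing values.

import numpy as np
import pandas as pd

def chunk_state(values, alpha):
    # Map step: running weighted sum and running weight total, where each earlier
    # observation is discounted by (1 - alpha) once per later observation in the chunk.
    sums = np.empty(len(values))
    weights = np.empty(len(values))
    s = w = 0.0
    for j, v in enumerate(values):
        s = (1 - alpha) * s + v
        w = (1 - alpha) * w + 1.0
        sums[j] = s
        weights[j] = w
    return sums, weights

alpha = 0.3
data = np.random.RandomState(0).rand(10)
left, right = data[:6], data[6:]

l_sums, l_weights = chunk_state(left, alpha)
r_sums, r_weights = chunk_state(right, alpha)

# Combine step: discount the left chunk's final totals by (1 - alpha) ** k,
# where k is the number of observations already seen in the right chunk.
decay = (1 - alpha) ** np.arange(1, len(right) + 1)
combined = (r_sums + decay * l_sums[-1]) / (r_weights + decay * l_weights[-1])

expected = pd.Series(data).ewm(alpha=alpha, adjust=True).mean().to_numpy()[6:]
assert np.allclose(combined, expected)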
+ +import math +from collections import OrderedDict + +from ....serialization.serializables import ( + BoolField, + Float64Field, + Int32Field, + Int64Field, + StringField, +) +from ....utils import pd_release_version +from ...utils import validate_axis +from ..core import Window + +_default_min_period_1 = pd_release_version >= (1, 1, 0) +_pd_1_3_repr = pd_release_version >= (1, 3, 0) +_window_has_method = pd_release_version >= (1, 4, 0) + + +class EWM(Window): + _alpha = Float64Field("alpha") + _min_periods = Int64Field("min_periods") + _adjust = BoolField("adjust") + _ignore_na = BoolField("ignore_na") + _axis = Int32Field("axis") + _method = StringField("method") + + def __init__( + self, + alpha=None, + min_periods=None, + adjust=None, + ignore_na=None, + axis=None, + method=None, + **kw + ): + super().__init__( + _alpha=alpha, + _min_periods=min_periods, + _adjust=adjust, + _ignore_na=ignore_na, + _axis=axis, + _method=method or "single", + **kw + ) + + @property + def alpha(self): + return self._alpha + + @property + def min_periods(self): + return self._min_periods + + @property + def adjust(self): + return self._adjust + + @property + def ignore_na(self): + return self._ignore_na + + @property + def axis(self): + return self._axis + + @property + def method(self): + return self._method + + @property + def params(self): + p = OrderedDict() + + if not _window_has_method: # pragma: no cover + args = ["alpha", "min_periods", "adjust", "ignore_na", "axis"] + else: + args = ["alpha", "min_periods", "adjust", "ignore_na", "axis", "method"] + + for k in args: + p[k] = getattr(self, k) + return p + + def __call__(self, df): + return df.ewm(**self.params) + + def _repr(self, params): + com = 1.0 / params.pop("alpha") - 1 + params["com"] = int(com) if _pd_1_3_repr and com == math.floor(com) else com + try: + params.move_to_end("com", last=False) + except AttributeError: # pragma: no cover + pass + return super()._repr(params) + + def _repr_name(self): + try: + from pandas.core.window import ExponentialMovingWindow # noqa: F401 + + return "ExponentialMovingWindow" + except ImportError: # pragma: no cover + return "EWM" + + def aggregate(self, func): + from .aggregation import DataFrameEwmAgg + + params = self.params + params["alpha_ignore_na"] = params.pop("ignore_na", False) + params["validate_columns"] = False + op = DataFrameEwmAgg(func=func, **params) + return op(self) + + agg = aggregate + + def mean(self): + return self.aggregate("mean") + + def var(self): + return self.aggregate("var") + + def std(self): + return self.aggregate("std") + + +def ewm( + obj, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, +): + r""" + Provide exponential weighted functions. + + Parameters + ---------- + com : float, optional + Specify decay in terms of center of mass, + :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. + span : float, optional + Specify decay in terms of span, + :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. + halflife : float, optional + Specify decay in terms of half-life, + :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. + alpha : float, optional + Specify smoothing factor :math:`\alpha` directly, + :math:`0 < \alpha \leq 1`. + min_periods : int, default 0 + Minimum number of observations in window required to have a value + (otherwise result is NA). 
+ adjust : bool, default True + Divide by decaying adjustment factor in beginning periods to account + for imbalance in relative weightings + (viewing EWMA as a moving average). + ignore_na : bool, default False + Ignore missing values when calculating weights; + specify True to reproduce pre-0.15.0 behavior. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. The value 0 identifies the rows, and 1 + identifies the columns. + + Returns + ------- + DataFrame + A Window sub-classed for the particular operation. + + See Also + -------- + rolling : Provides rolling window calculations. + expanding : Provides expanding transformations. + + Notes + ----- + Exactly one of center of mass, span, half-life, and alpha must be provided. + + Allowed values and relationship between the parameters are specified in the + parameter descriptions above; see the link at the end of this section for + a detailed explanation. + + When adjust is True (default), weighted averages are calculated using + weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. + + When adjust is False, weighted averages are calculated recursively as: + + weighted_average[0] = arg[0]; + weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. + + When ignore_na is False (default), weights are based on absolute positions. + For example, the weights of x and y used in calculating the final weighted + average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and + (1-alpha)**2 and alpha (if adjust is False). + + When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of x and y used in + calculating the final weighted average of [x, None, y] are 1-alpha and 1 + (if adjust is True), and 1-alpha and alpha (if adjust is False). 
+ + More details can be found at + https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df.execute() + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + >>> df.ewm(com=0.5).mean().execute() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + """ + axis = validate_axis(axis, obj) + + decay_count = 0 + for arg in (com, span, halflife, alpha): + if arg is not None: + decay_count += 1 + + if decay_count == 0: + raise ValueError("Must pass one of comass, span, halflife, or alpha") + if decay_count > 1: + raise ValueError("comass, span, halflife, and alpha are mutually exclusive") + + if com is not None: + if com < 0: + raise ValueError("comass must satisfy: comass >= 0") + alpha = 1.0 / (1 + com) + elif span is not None: + if span < 1: + raise ValueError("span must satisfy: span >= 1") + alpha = 2.0 / (1 + span) + elif halflife is not None: + if halflife <= 0: + raise ValueError("halflife must satisfy: halflife > 0") + alpha = 1.0 - math.exp(math.log(0.5) / halflife) + if alpha <= 0 or alpha > 1: + raise ValueError("alpha must satisfy: 0 < alpha <= 1") + + if not adjust and not ignore_na: + raise NotImplementedError( + "adjust == False when ignore_na == False not implemented" + ) + if axis == 1: + raise NotImplementedError("axis other than 0 is not supported") + + if alpha == 1: + return obj.expanding(min_periods=min_periods, axis=axis) + + if _default_min_period_1: + min_periods = min_periods or 1 + + return EWM( + input=obj, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + ) diff --git a/python/xorbits/_mars/dataframe/window/ewm/tests/__init__.py b/python/xorbits/_mars/dataframe/window/ewm/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm.py b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm.py new file mode 100644 index 000000000..4ecd0b5c8 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
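As a quick standalone check (separate from this patch) of the decay-parameter conversions performed by ewm() above, the alpha derived from com, span or halflife should give the same result as passing the equivalent parameter to pandas directly; the values below are arbitrary examples.

import math

import numpy as np
import pandas as pd

com, span, halflife = 2.0, 5.0, 3.0
alpha_from_com = 1.0 / (1.0 + com)                              # requires com >= 0
alpha_from_span = 2.0 / (1.0 + span)                            # requires span >= 1
alpha_from_halflife = 1.0 - math.exp(math.log(0.5) / halflife)  # requires halflife > 0

s = pd.Series(np.arange(8, dtype=float))
pd.testing.assert_series_equal(
    s.ewm(com=com).mean(), s.ewm(alpha=alpha_from_com).mean()
)
pd.testing.assert_series_equal(
    s.ewm(span=span).mean(), s.ewm(alpha=alpha_from_span).mean()
)
pd.testing.assert_series_equal(
    s.ewm(halflife=halflife).mean(), s.ewm(alpha=alpha_from_halflife).mean()
)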
+ +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from .....core import tile + + +def test_ewm(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df) + + with pytest.raises(NotImplementedError): + _ = df2.ewm(2, adjust=False, ignore_na=False) + + with pytest.raises(ValueError): + _ = df2.ewm() + + with pytest.raises(ValueError): + _ = df2.ewm(com=2, alpha=0.3) + + assert pytest.approx(df2.ewm(com=1).alpha) == 0.5 + with pytest.raises(ValueError): + _ = df2.ewm(com=-1) + + assert pytest.approx(df2.ewm(span=3).alpha) == 0.5 + with pytest.raises(ValueError): + _ = df2.ewm(span=0) + + assert pytest.approx(df2.ewm(halflife=1).alpha) == 0.5 + with pytest.raises(ValueError): + _ = df2.ewm(halflife=-1) + + with pytest.raises(ValueError): + _ = df2.ewm(alpha=2) + + r = df2.ewm(3) + expected = df.ewm(3) + assert repr(r) == repr(expected) + + r = df2.ewm(alpha=1) + assert type(r).__name__ == "Expanding" + + +def test_ewm_agg(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df, chunk_size=3) + + with pytest.raises(NotImplementedError): + _ = df2.ewm(span=3, axis=1).agg("mean") + + r = df2.ewm(span=3).agg("mean") + expected = df.ewm(span=3).agg("mean") + + assert r.shape == df.shape + assert r.index_value is df2.index_value + pd.testing.assert_index_equal(r.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(r.dtypes, df2.dtypes) + + r = tile(r) + for c in r.chunks: + assert c.shape == c.inputs[0].shape + assert c.index_value is c.inputs[0].index_value + pd.testing.assert_index_equal(c.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(c.dtypes, expected.dtypes) + + aggs = ["mean", "var", "std"] + for a in aggs: + r = getattr(df2.ewm(span=3), a)() + assert r.op.func == a diff --git a/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm_execution.py b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm_execution.py new file mode 100644 index 000000000..97147f6a3 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/ewm/tests/test_ewm_execution.py @@ -0,0 +1,137 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd + +from ..... 
import dataframe as md + + +def test_dataframe_ewm_agg(setup): + np.random.seed(0) + + raw = pd.DataFrame( + { + "a": np.random.randint(100, size=(10,)), + "b": np.random.rand(10), + "c": np.random.randint(100, size=(10,)), + "d": ["c" * i for i in np.random.randint(4, size=10)], + } + ) + raw.b[0:3] = np.nan + raw.b[5:7] = np.nan + raw.b[9] = np.nan + + df = md.DataFrame(raw, chunk_size=(10, 3)) + + r = df.ewm(alpha=0.5).agg("mean") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.5).agg("mean")) + + r = df.ewm(alpha=0.5).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.5).agg(["mean"])) + + df = md.DataFrame(raw, chunk_size=(3, 3)) + + aggs = ["mean", "var", "std"] + + for fun_name in aggs: + r = df.ewm(alpha=0.3).agg(fun_name) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3).agg(fun_name) + ) + + r = df.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + ) + + r = df.ewm(alpha=0.3).agg("mean") + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg("mean")) + + r = df.ewm(alpha=0.3).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(["mean"])) + + r = df.ewm(alpha=0.3).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(aggs)) + + agg_dict = {"c": "mean"} + r = df.ewm(alpha=0.3).agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(agg_dict)) + + agg_dict = OrderedDict([("a", ["mean", "var"]), ("b", "var")]) + r = df.ewm(alpha=0.3).agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(agg_dict)) + + r = df.ewm(alpha=0.3, min_periods=0).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=0).agg(aggs) + ) + + r = df.ewm(alpha=0.3, min_periods=2).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=2).agg(aggs) + ) + + agg_dict = OrderedDict([("a", ["mean", "var"]), ("b", "var"), ("c", "mean")]) + r = df.ewm(alpha=0.3, min_periods=2).agg(agg_dict) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=2).agg(agg_dict) + ) + + +def test_series_expanding_agg(setup): + raw = pd.Series(np.random.rand(10), name="a") + raw[:3] = np.nan + raw[5:10:2] = np.nan + + series = md.Series(raw, chunk_size=10) + + r = series.ewm(alpha=0.3).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(["mean"])) + + r = series.ewm(alpha=0.3).agg("mean") + pd.testing.assert_series_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg("mean")) + + series = md.Series(raw, chunk_size=3) + + aggs = ["mean", "var", "std"] + + for fun_name in aggs: + r = series.ewm(alpha=0.3).agg(fun_name) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.ewm(alpha=0.3).agg(fun_name) + ) + + r = series.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name) + ) + + r = series.ewm(alpha=0.3).agg(["mean"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(["mean"])) + + r = series.ewm(alpha=0.3).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.ewm(alpha=0.3).agg(aggs)) + + r = series.ewm(alpha=0.3, min_periods=0).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, 
min_periods=0).agg(aggs) + ) + + r = series.ewm(alpha=0.3, min_periods=2).agg(aggs) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.ewm(alpha=0.3, min_periods=2).agg(aggs) + ) diff --git a/python/xorbits/_mars/dataframe/window/expanding/__init__.py b/python/xorbits/_mars/dataframe/window/expanding/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/expanding/aggregation.py b/python/xorbits/_mars/dataframe/window/expanding/aggregation.py new file mode 100644 index 000000000..502ef3c3c --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/aggregation.py @@ -0,0 +1,177 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple +from functools import partial + +import numpy as np +import pandas as pd + +from .... 
import opcodes +from ....serialization.serializables import BoolField +from ..aggregation import BaseDataFrameExpandingAgg + +_stage_info = namedtuple( + "_stage_info", + ( + "map_groups", + "map_sources", + "combine_sources", + "combine_columns", + "combine_funcs", + "key_to_funcs", + "valid_columns", + "min_periods_func_name", + ), +) + +_cum_alpha_coeff_func = "_cum_alpha_coeff" +_cum_square_alpha_coeff_func = "_cum_square_alpha_coeff" + + +def _add_pred_results(pred_results, local_results, axis=0): + if pred_results[0].ndim == 1: + df_filler = 0 + else: + df_filler = pred_results[0].iloc[-1, :].dropna() + df_filler[:] = 0 + + new_locals = [] + combine_axis = pred_results[0].ndim - axis - 1 + for pred_result, local_result in zip(pred_results, local_results): + local_result = local_result.fillna(df_filler, axis=axis) + new_locals.append( + local_result.add(pred_result.sum(axis=axis), axis=combine_axis) + ) + return new_locals + + +def _combine_arithmetic(pred_results, local_results, axis=0): + if pred_results is None: + return local_results[0] + return _add_pred_results(pred_results, local_results, axis=axis)[0] + + +def _combine_minmax(pred_results, local_results, axis=0, fun_name=None): + if pred_results is None: + return local_results[0] + + pred_size = len(pred_results[0]) + con = pd.concat([pred_results[0], local_results[0]], axis=axis) + result = con.expanding(axis=axis).agg(fun_name) + if result.ndim == 2: + return result.iloc[pred_size:, :] if axis == 0 else result.iloc[:, pred_size:] + else: + return result.iloc[pred_size:] + + +def _combine_mean(pred_results, local_results, axis=0): + local_sum_data, local_count_data = local_results + + if pred_results is not None: + local_sum_data, local_count_data = _add_pred_results( + pred_results, local_results, axis=axis + ) + return local_sum_data / local_count_data + + +def _combine_var(pred_results, local_results, axis=0): + local_sum_data, local_count_data, local_var_data = local_results + if pred_results is None: + return local_var_data * local_count_data / (local_count_data - 1) + + pred_sum_data, pred_count_data, pred_var_data = pred_results + + local_sum_square = ( + local_count_data * local_var_data + local_sum_data**2 / local_count_data + ) + pred_sum_square = ( + pred_count_data * pred_var_data + pred_sum_data**2 / pred_count_data + ) + + local_sum_square, local_sum_data, local_count_data = _add_pred_results( + [pred_sum_square, pred_sum_data, pred_count_data], + [local_sum_square, local_sum_data, local_count_data], + axis=axis, + ) + + return (local_sum_square - local_sum_data**2 / local_count_data) / ( + local_count_data - 1 + ) + + +def _combine_std(pred_results, local_results, axis=0): + return np.sqrt(_combine_var(pred_results, local_results, axis=axis)) + + +class DataFrameExpandingAgg(BaseDataFrameExpandingAgg): + _op_type_ = opcodes.EXPANDING_AGG + + _center = BoolField("center") + + def __init__(self, center=None, **kw): + super().__init__(_center=center, **kw) + + @property + def center(self): + return self._center + + @classmethod + def _get_stage_functions(cls, op: "DataFrameExpandingAgg", func): + if func == "_data_count": + return ["count"], _combine_arithmetic + elif func in ("sum", "prod", "count"): + return [func], _combine_arithmetic + elif func in ("min", "max"): + return [func], partial(_combine_minmax, fun_name=func) + elif func == "mean": + return ["sum", "count"], _combine_mean + elif func in {"var", "std"}: + return ( + ["sum", "count", "var"], + _combine_var if func == "var" else _combine_std, + ) + 
else: # pragma: no cover + raise NotImplementedError + + @classmethod + def _execute_map_function(cls, op: "DataFrameExpandingAgg", func, in_data): + min_periods = 1 if op.min_periods > 0 else 0 + + expanding = in_data.expanding( + min_periods=min_periods, center=op.center, axis=op.axis + ) + if func == "var": + result = expanding.var(ddof=0) + else: + result = expanding.agg(func) + + if op.output_agg: + summary = result.iloc[len(result) - 1 : len(result)] + else: + summary = None + return result, summary + + @classmethod + def _execute_combine_function( + cls, op: "DataFrameExpandingAgg", func, pred_inputs, local_inputs, func_cols + ): + return func(pred_inputs, local_inputs, axis=op.axis) + + @classmethod + def _execute_raw_function(cls, op: "DataFrameExpandingAgg", in_data): + expanding = in_data.expanding( + min_periods=op.min_periods, center=op.center, axis=op.axis + ) + return expanding.agg(op.func) diff --git a/python/xorbits/_mars/dataframe/window/expanding/core.py b/python/xorbits/_mars/dataframe/window/expanding/core.py new file mode 100644 index 000000000..e74c3e692 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/core.py @@ -0,0 +1,161 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +from ....serialization.serializables import ( + BoolField, + Int32Field, + Int64Field, + StringField, +) +from ....utils import pd_release_version +from ...utils import validate_axis +from ..core import Window + +_window_has_method = pd_release_version >= (1, 3, 0) + + +class Expanding(Window): + _min_periods = Int64Field("min_periods") + _axis = Int32Field("axis") + _center = BoolField("center") + _method = StringField("method") + + def __init__(self, min_periods=None, axis=None, center=None, method=None, **kw): + super().__init__( + _min_periods=min_periods, _axis=axis, _center=center, _method=method, **kw + ) + + @property + def min_periods(self): + return self._min_periods + + @property + def axis(self): + return self._axis + + @property + def center(self): + return self._center + + @property + def method(self): + return self._method or "single" + + def __call__(self, df): + return df.expanding(**self.params) + + @property + def params(self): + p = OrderedDict() + + if not _window_has_method: # pragma: no cover + args = ["min_periods", "center", "axis"] + else: + args = ["min_periods", "center", "axis", "method"] + + for k in args: + p[k] = getattr(self, k) + return p + + def aggregate(self, func, **kwargs): + from .aggregation import DataFrameExpandingAgg + + count_always_valid = kwargs.pop("_count_always_valid", False) + + op = DataFrameExpandingAgg( + func=func, count_always_valid=count_always_valid, **self.params + ) + return op(self) + + agg = aggregate + + def sum(self): + return self.aggregate("sum") + + def count(self): + return self.aggregate("count") + + def min(self): + return self.aggregate("min") + + def max(self): + return self.aggregate("max") + + def mean(self): + return 
self.aggregate("mean") + + def var(self): + return self.aggregate("var") + + def std(self): + return self.aggregate("std") + + +def expanding(obj, min_periods=1, center=False, axis=0): + """ + Provide expanding transformations. + + Parameters + ---------- + min_periods : int, default 1 + Minimum number of observations in window required to have a value + (otherwise result is NA). + center : bool, default False + Set the labels at the center of the window. + axis : int or str, default 0 + + Returns + ------- + a Window sub-classed for the particular operation + + See Also + -------- + rolling : Provides rolling window calculations. + ewm : Provides exponential weighted functions. + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df.execute() + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + >>> df.expanding(2).sum().execute() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + """ + axis = validate_axis(axis, obj) + + if center: + raise NotImplementedError("center == True is not supported") + if axis == 1: + raise NotImplementedError("axis other than 0 is not supported") + + return Expanding(input=obj, min_periods=min_periods, center=center, axis=axis) diff --git a/python/xorbits/_mars/dataframe/window/expanding/tests/__init__.py b/python/xorbits/_mars/dataframe/window/expanding/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding.py b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding.py new file mode 100644 index 000000000..a1ddedf5c --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ..... 
import dataframe as md +from .....core import tile + + +def test_expanding(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df) + + with pytest.raises(NotImplementedError): + _ = df2.expanding(3, center=True) + + with pytest.raises(NotImplementedError): + _ = df2.expanding(3, axis=1) + + r = df2.expanding(3, center=False) + expected = df.expanding(3, center=False) + assert repr(r) == repr(expected) + + assert "b" in dir(r) + + with pytest.raises(AttributeError): + _ = r.d + + with pytest.raises(KeyError): + _ = r["d"] + + with pytest.raises(KeyError): + _ = r["a", "d"] + + assert "a" not in dir(r.a) + assert "c" not in dir(r["a", "b"]) + + +def test_expanding_agg(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df, chunk_size=3) + + r = df2.expanding(3).agg("max") + expected = df.expanding(3).agg("max") + + assert r.shape == df.shape + assert r.index_value is df2.index_value + pd.testing.assert_index_equal(r.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(r.dtypes, df2.dtypes) + + r = tile(r) + for c in r.chunks: + assert c.shape == c.inputs[0].shape + assert c.index_value is c.inputs[0].index_value + pd.testing.assert_index_equal(c.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(c.dtypes, expected.dtypes) + + aggs = ["sum", "count", "min", "max", "mean", "var", "std"] + for a in aggs: + r = getattr(df2.expanding(3), a)() + assert r.op.func == a diff --git a/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding_execution.py b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding_execution.py new file mode 100644 index 000000000..e1cf46529 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/expanding/tests/test_expanding_execution.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd + +from ..... 
import dataframe as md + + +def test_dataframe_expanding_agg(setup): + raw = pd.DataFrame( + { + "a": np.random.randint(100, size=(10,)), + "b": np.random.rand(10), + "c": np.random.randint(100, size=(10,)), + "d": ["c" * i for i in np.random.randint(4, size=10)], + } + ) + raw.b[:3] = np.nan + raw.b[5:7] = np.nan + + df = md.DataFrame(raw, chunk_size=(10, 3)) + + r = df.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + df = md.DataFrame(raw, chunk_size=(3, 2)) + + aggs = ["sum", "count", "min", "max", "mean", "var", "std"] + + for fun_name in aggs: + r = df.expanding().agg(fun_name) + pd.testing.assert_frame_equal( + r.execute().fetch(), raw.expanding().agg(fun_name) + ) + + r = df.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + r = df.expanding().agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(aggs)) + + agg_dict = {"c": "sum"} + r = df.expanding().agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(agg_dict)) + + agg_dict = OrderedDict([("a", ["sum", "var"]), ("b", "var")]) + r = df.expanding().agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(agg_dict)) + + r = df.expanding(0).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(0).agg(aggs)) + + r = df.expanding(2).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(2).agg(aggs)) + + agg_dict = OrderedDict([("a", ["min", "max"]), ("b", "max"), ("c", "sum")]) + r = df.expanding(2).agg(agg_dict) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(2).agg(agg_dict)) + + +def test_series_expanding_agg(setup): + raw = pd.Series(np.random.rand(10), name="a") + raw[:3] = np.nan + raw[5:7] = np.nan + + series = md.Series(raw, chunk_size=10) + + r = series.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + r = series.expanding().agg("sum") + pd.testing.assert_series_equal(r.execute().fetch(), raw.expanding().agg("sum")) + + series = md.Series(raw, chunk_size=3) + + aggs = ["sum", "count", "min", "max", "mean", "var", "std"] + + for fun_name in aggs: + r = series.expanding().agg(fun_name) + pd.testing.assert_series_equal( + r.execute().fetch(), raw.expanding().agg(fun_name) + ) + + r = series.expanding().agg(["sum"]) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(["sum"])) + + r = series.expanding().agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding().agg(aggs)) + + r = series.expanding(2).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(2).agg(aggs)) + + r = series.expanding(0).agg(aggs) + pd.testing.assert_frame_equal(r.execute().fetch(), raw.expanding(0).agg(aggs)) diff --git a/python/xorbits/_mars/dataframe/window/rolling/__init__.py b/python/xorbits/_mars/dataframe/window/rolling/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/rolling/aggregation.py b/python/xorbits/_mars/dataframe/window/rolling/aggregation.py new file mode 100644 index 000000000..e8d93f285 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/aggregation.py @@ -0,0 +1,488 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from .... import opcodes +from ....core import recursive_tile +from ....serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Int32Field, + Int64Field, + KeyField, + ListField, + StringField, + TupleField, +) +from ....utils import calc_nsplits, has_unknown_shape, lazy_import, pd_release_version +from ...core import DATAFRAME_TYPE +from ...operands import DataFrameOperand, DataFrameOperandMixin +from ...utils import build_empty_df, build_empty_series, parse_index + +cudf = lazy_import("cudf") +_with_pandas_issue_38908 = pd_release_version == (1, 2, 0) + + +class DataFrameRollingAgg(DataFrameOperand, DataFrameOperandMixin): + _op_type_ = opcodes.ROLLING_AGG + + _input = KeyField("input") + _window = AnyField("window") + _min_periods = Int64Field("min_periods") + _center = BoolField("center") + _win_type = StringField("win_type") + _on = StringField("on") + _axis = Int32Field("axis") + _closed = StringField("closed") + _func = AnyField("func") + _func_args = TupleField("func_args") + _func_kwargs = DictField("func_kwargs") + # for chunks + _preds = ListField("preds", FieldTypes.key) + _succs = ListField("succs", FieldTypes.key) + + def __init__( + self, + input=None, + window=None, + min_periods=None, + center=None, # pylint: disable=redefined-builtin + win_type=None, + on=None, + axis=None, + closed=None, + func=None, + func_args=None, + func_kwargs=None, + output_types=None, + preds=None, + succs=None, + **kw + ): + super().__init__( + _input=input, + _window=window, + _min_periods=min_periods, + _center=center, + _win_type=win_type, + _on=on, + _axis=axis, + _closed=closed, + _func=func, + _func_args=func_args, + _func_kwargs=func_kwargs, + _output_types=output_types, + _preds=preds, + _succs=succs, + **kw + ) + + @property + def input(self): + return self._input + + @property + def window(self): + return self._window + + @property + def min_periods(self): + return self._min_periods + + @property + def center(self): + return self._center + + @property + def win_type(self): + return self._win_type + + @property + def on(self): + return self._on + + @property + def axis(self): + return self._axis + + @property + def closed(self): + return 
self._closed + + @property + def func(self): + return self._func + + @property + def func_args(self): + return self._func_args + + @property + def func_kwargs(self): + return self._func_kwargs + + @property + def preds(self): + return self._preds if self._preds is not None else [] + + @property + def succs(self): + return self._succs if self._succs is not None else [] + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(self._inputs) + self._input = next(input_iter) + if self._preds is not None: + self._preds = [next(input_iter) for _ in self._preds] + if self._succs is not None: + self._succs = [next(input_iter) for _ in self._succs] + + def __call__(self, rolling): + inp = rolling.input + + if isinstance(inp, DATAFRAME_TYPE): + pd_index = inp.index_value.to_pandas() + empty_df = build_empty_df(inp.dtypes, index=pd_index[:0]) + params = rolling.params.copy() + if params["win_type"] == "freq": + params["win_type"] = None + if self._func != "count": + empty_df = empty_df._get_numeric_data() + test_df = empty_df.rolling(**params).agg(self._func) + if self._axis == 0: + index_value = inp.index_value + else: + index_value = parse_index( + test_df.index, rolling.params, inp, store_data=False + ) + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_df.shape[1]), + dtypes=test_df.dtypes, + index_value=index_value, + columns_value=parse_index(test_df.columns, store_data=True), + ) + else: + pd_index = inp.index_value.to_pandas() + empty_series = build_empty_series( + inp.dtype, index=pd_index[:0], name=inp.name + ) + test_obj = empty_series.rolling(**rolling.params).agg(self._func) + if isinstance(test_obj, pd.DataFrame): + return self.new_dataframe( + [inp], + shape=(inp.shape[0], test_obj.shape[1]), + dtypes=test_obj.dtypes, + index_value=inp.index_value, + columns_value=parse_index(test_obj.dtypes.index, store_data=True), + ) + else: + return self.new_series( + [inp], + shape=inp.shape, + dtype=test_obj.dtype, + index_value=inp.index_value, + name=test_obj.name, + ) + + @classmethod + def _check_can_be_tiled(cls, op, is_window_int): + inp = op.input + axis = op.axis + + if axis == 0 and inp.ndim == 2: + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + + if is_window_int: + # if window is integer + if any(np.isnan(ns) for ns in inp.nsplits[op.axis]): + yield + else: + # if window is offset + # must be aware of index's meta including min and max + for i in range(inp.chunk_shape[axis]): + chunk_index = [0, 0] + chunk_index[axis] = i + chunk = inp.cix[tuple(chunk_index)] + + if axis == 0: + index_value = chunk.index_value + else: + index_value = chunk.columns_value + if pd.isnull(index_value.min_val) or pd.isnull(index_value.max_val): + yield + + return inp + + @classmethod + def _find_extra_chunks_for_int_window(cls, op, inp, cur_chunk_index): + from ...indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + + axis = op.axis + window = op.window + center = op.center + + # find prev chunks + i = cur_chunk_index[axis] + rest = window if not center else window // 2 + prev_chunks = [] + while i > 0 and rest > 0: + prev_chunk_index = list(cur_chunk_index) + prev_chunk_index[axis] = i - 1 + prev_chunk_index = tuple(prev_chunk_index) + + prev_chunk = inp.cix[prev_chunk_index] + size = prev_chunk.shape[axis] + if size <= rest: + prev_chunks.insert(0, prev_chunk) + rest -= size + else: + if prev_chunk.ndim == 1: + slice_prev_chunk_op = SeriesIlocGetItem( + indexes=[slice(-rest, None)] + ) + else: + 
slices = [slice(None)] * 2 + slices[axis] = slice(-rest, None) + slice_prev_chunk_op = DataFrameIlocGetItem(indexes=slices) + slice_prev_chunk = slice_prev_chunk_op.new_chunk([prev_chunk]) + prev_chunks.insert(0, slice_prev_chunk) + rest = 0 + + i -= 1 + + # find succ chunks + j = cur_chunk_index[axis] + rest = 0 if not center else window - window // 2 - 1 + chunk_size = inp.chunk_shape[axis] + succ_chunks = [] + while j < chunk_size - 1 and rest > 0: + succ_chunk_index = list(cur_chunk_index) + succ_chunk_index[axis] = j + 1 + succ_chunk_index = tuple(succ_chunk_index) + + succ_chunk = inp.cix[succ_chunk_index] + size = succ_chunk.shape[axis] + if size <= rest: + succ_chunks.append(succ_chunk) + rest -= size + else: + if succ_chunk.ndim == 1: + slice_succ_chunk_op = SeriesIlocGetItem(indexes=[slice(rest)]) + else: + slices = [slice(None)] * 2 + slices[axis] = slice(rest) + slice_succ_chunk_op = DataFrameIlocGetItem(indexes=slices) + slice_succ_chunk = slice_succ_chunk_op.new_chunk([succ_chunk]) + succ_chunks.append(slice_succ_chunk) + rest = 0 + + j += 1 + + return prev_chunks, succ_chunks + + @classmethod + def _find_extra_chunks_for_offset_window(cls, op, inp, cur_chunk_index): + from ...indexing.loc import DataFrameLocGetItem + + # when window is offset, center=True is not supported + assert not op.center + + axis = op.axis + window = pd.Timedelta(op.window) + ndim = inp.ndim + + # find prev chunks + i = cur_chunk_index[axis] + prev_chunks = [] + cur_index_min = inp.cix[cur_chunk_index].index_value.min_val + start = cur_index_min - window + assert cur_chunk_index is not None + while i > 0: + prev_chunk_index = list(cur_chunk_index) + prev_chunk_index[axis] = i - 1 + prev_chunk_index = tuple(prev_chunk_index) + + prev_chunk = inp.cix[prev_chunk_index] + prev_index_max = prev_chunk.index_value.max_val + if prev_index_max >= start: + slices = [slice(None)] * ndim + slices[axis] = slice(start, None) + prev_chunk_op = DataFrameLocGetItem( + indexes=slices, output_types=prev_chunk.op.output_types + ) + slice_prev_chunk = prev_chunk_op.new_chunk([prev_chunk]) + prev_chunks.insert(0, slice_prev_chunk) + else: + # index max < start, break + break + + i -= 1 + + return prev_chunks, [] + + @classmethod + def tile(cls, op): + inp = op.input + out = op.outputs[0] + is_window_int = op.win_type != "freq" + axis = op.axis + input_ndim = inp.ndim + output_ndim = out.ndim + + # check if can be tiled + inp = yield from cls._check_can_be_tiled(op, is_window_int) + + if inp.ndim == 1 and out.ndim == 1: + # input series, output series + other_iter = [None] + elif inp.ndim == 1: + # input series, output dataframe + other_iter = [0] + else: + other_iter = range(inp.chunk_shape[1 - axis]) + + out_chunks = [] + for i in other_iter: + for j in range(inp.chunk_shape[axis]): + chunk_op = op.copy().reset_key() + + if inp.ndim == 1: + chunk_index = (j,) + else: + chunk_index = [None, None] + chunk_index[1 - axis] = i + chunk_index[axis] = j + chunk_index = tuple(chunk_index) + + inp_chunk = inp.cix[chunk_index] + if is_window_int: + pred_chunks, succ_chunks = cls._find_extra_chunks_for_int_window( + op, inp, chunk_index + ) + else: + pred_chunks, succ_chunks = cls._find_extra_chunks_for_offset_window( + op, inp, chunk_index + ) + + out_chunk_index = [None] * output_ndim + out_chunk_index[axis] = j + if output_ndim == 2: + out_chunk_index[1 - axis] = i + out_chunk_index = tuple(out_chunk_index) + + chunk_params = {"index": out_chunk_index} + if input_ndim == 1 and output_ndim == 1: + chunk_params["shape"] = 
inp_chunk.shape + chunk_params["dtype"] = out.dtype + chunk_params["index_value"] = inp_chunk.index_value + chunk_params["name"] = inp_chunk.name + elif input_ndim == 1 and output_ndim == 2: + chunk_params["shape"] = (inp_chunk.shape[0], out.shape[1]) + chunk_params["dtypes"] = out.dtypes + chunk_params["index_value"] = inp_chunk.index_value + chunk_params["columns_value"] = out.columns_value + else: + if axis == 0: + out_shape = list(out.shape) + out_shape[axis] = inp_chunk.shape[axis] + chunk_params["shape"] = tuple(out_shape) + else: + chunk_params["shape"] = inp_chunk.shape + chunk_params["index_value"] = ( + inp_chunk.index_value if axis == 0 else out.index_value + ) + chunk_params["dtypes"] = ( + out.dtypes if axis == 0 else inp_chunk.dtypes + ) + chunk_params["columns_value"] = ( + out.columns_value if axis == 0 else inp_chunk.columns_value + ) + + if len(pred_chunks) > 0: + chunk_op._preds = pred_chunks + if len(succ_chunks) > 0: + chunk_op._succs = succ_chunks + out_chunk = chunk_op.new_chunk( + [inp_chunk] + pred_chunks + succ_chunks, kws=[chunk_params] + ) + out_chunks.append(out_chunk) + + params = out.params + params["chunks"] = out_chunks + if out.ndim == 1: + params["shape"] = (inp.shape[0],) + else: + params["shape"] = (inp.shape[0], params["shape"][1]) + params["nsplits"] = calc_nsplits({c.index: c.shape for c in out_chunks}) + new_op = op.copy() + return new_op.new_tileables([inp], kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + axis = op.axis + win_type = op.win_type + window = op.window + if win_type == "freq": + win_type = None + window = pd.Timedelta(window) + + preds = [ctx[pred.key] for pred in op.preds] + pred_size = sum(pred.shape[axis] for pred in preds) + succs = [ctx[succ.key] for succ in op.succs] + succ_size = sum(succ.shape[axis] for succ in succs) + + xdf = pd if isinstance(inp, (pd.DataFrame, pd.Series)) else cudf + + if pred_size > 0 or succ_size > 0: + data = xdf.concat(preds + [inp] + succs, axis=axis) + else: + data = inp + + # fix for pandas 1.2.0 + # see: https://github.com/pandas-dev/pandas/issues/38908 + # df.rolling().aggregate('skew') modified original data + # so we copy it first for skew only + if ( + _with_pandas_issue_38908 + and op.func in ["skew", "kurt"] + and op.outputs[0].index[0] == 0 + ): + data = data.copy() + + r = data.rolling( + window=window, + min_periods=op.min_periods, + center=op.center, + win_type=win_type, + on=op.on, + axis=axis, + closed=op.closed, + ) + result = r.aggregate(op.func, *op.func_args, **op.func_kwargs) + + if pred_size > 0 or succ_size > 0: + slc = [slice(None)] * result.ndim + slc[axis] = slice(pred_size, result.shape[axis] - succ_size) + result = result.iloc[tuple(slc)] + + ctx[op.outputs[0].key] = result diff --git a/python/xorbits/_mars/dataframe/window/rolling/core.py b/python/xorbits/_mars/dataframe/window/rolling/core.py new file mode 100644 index 000000000..05933f1a9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/core.py @@ -0,0 +1,354 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +from ....serialization.serializables import ( + AnyField, + BoolField, + Int32Field, + Int64Field, + StringField, +) +from ....utils import pd_release_version +from ...core import DATAFRAME_TYPE +from ...utils import build_empty_df, build_empty_series, validate_axis +from ..core import Window + +_window_has_method = pd_release_version >= (1, 3, 0) + + +class Rolling(Window): + _window = AnyField("window") + _min_periods = Int64Field("min_periods") + _center = BoolField("center") + _win_type = StringField("win_type") + _on = StringField("on") + _axis = Int32Field("axis") + _closed = StringField("closed") + _method = StringField("method") + + def __init__( + self, + window=None, + min_periods=None, + center=None, + win_type=None, + on=None, + axis=None, + closed=None, + method=None, + **kw + ): + super().__init__( + _window=window, + _min_periods=min_periods, + _center=center, + _win_type=win_type, + _on=on, + _axis=axis, + _closed=closed, + _method=method, + **kw + ) + + @property + def window(self): + return self._window + + @property + def min_periods(self): + return self._min_periods + + @property + def center(self): + return self._center + + @property + def win_type(self): + return self._win_type + + @property + def on(self): + return self._on + + @property + def axis(self): + return self._axis + + @property + def closed(self): + return self._closed + + @property + def method(self): + return self._method or "single" + + @property + def params(self): + p = OrderedDict() + + if not _window_has_method: # pragma: no cover + args = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + ] + else: + args = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + "method", + ] + + for attr in args: + p[attr] = getattr(self, attr) + return p + + def _repr_name(self): + return "Rolling" if self.win_type is None else "Window" + + def validate(self): + # leverage pandas itself to do validation + pd_index = self._input.index_value.to_pandas() + if isinstance(self._input, DATAFRAME_TYPE): + empty_obj = build_empty_df(self._input.dtypes, index=pd_index[:0]) + else: + empty_obj = build_empty_series( + self._input.dtype, index=pd_index[:0], name=self._input.name + ) + pd_rolling = empty_obj.rolling(**self.params) + for k in self.params: + # update value according to pandas rolling + setattr(self, "_" + k, getattr(pd_rolling, k)) + + def aggregate(self, func, *args, **kwargs): + from .aggregation import DataFrameRollingAgg + + op = DataFrameRollingAgg( + func=func, func_args=args, func_kwargs=kwargs, **self.params + ) + return op(self) + + def agg(self, func, *args, **kwargs): + return self.aggregate(func, *args, **kwargs) + + def count(self): + return self.aggregate("count") + + def sum(self, *args, **kwargs): + return self.aggregate("sum", *args, **kwargs) + + def mean(self, *args, **kwargs): + return self.aggregate("mean", *args, **kwargs) + + def median(self, **kwargs): + return self.aggregate("median", **kwargs) + + def var(self, ddof=1, *args, **kwargs): + return self.aggregate("var", ddof=ddof, *args, **kwargs) + + def std(self, ddof=1, *args, **kwargs): + return self.aggregate("std", ddof=ddof, *args, **kwargs) + + def min(self, *args, **kwargs): + return self.aggregate("min", *args, **kwargs) + + def max(self, *args, **kwargs): + return self.aggregate("max", *args, **kwargs) + + def 
skew(self, **kwargs): + return self.aggregate("skew", **kwargs) + + def kurt(self, **kwargs): + return self.aggregate("kurt", **kwargs) + + +def rolling( + obj, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, +): + """ + Provide rolling window calculations. + + Parameters + ---------- + window : int, or offset + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. This is + new in 0.19.0 + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). For a window that is specified by an offset, + `min_periods` will default to 1. Otherwise, `min_periods` will default + to the size of the window. + center : bool, default False + Set the labels at the center of the window. + win_type : str, default None + Provide a window type. If ``None``, all points are evenly weighted. + See the notes below for further information. + on : str, optional + For a DataFrame, a datetime-like column on which to calculate the rolling + window, rather than the DataFrame's index. Provided integer column is + ignored and excluded from result since an integer index is not used to + calculate the rolling window. + axis : int or str, default 0 + closed : str, default None + Make the interval closed on the 'right', 'left', 'both' or + 'neither' endpoints. + For offset-based windows, it defaults to 'right'. + For fixed windows, defaults to 'both'. Remaining cases not implemented + for fixed windows. + + Returns + ------- + a Window or Rolling sub-classed for the particular operation + + See Also + -------- + expanding : Provides expanding transformations. + ewm : Provides exponential weighted functions. + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + To learn more about the offsets & frequency strings, please see `this link + `__. + + The recognized win_types are: + * ``boxcar`` + * ``triang`` + * ``blackman`` + * ``hamming`` + * ``bartlett`` + * ``parzen`` + * ``bohman`` + * ``blackmanharris`` + * ``nuttall`` + * ``barthann`` + * ``kaiser`` (needs beta) + * ``gaussian`` (needs std) + * ``general_gaussian`` (needs power, width) + * ``slepian`` (needs width) + * ``exponential`` (needs tau), center is set to None. + + If ``win_type=None`` all points are evenly weighted. To learn more about + different window types see `scipy.signal window functions + `__. + + Examples + -------- + >>> import numpy as np + >>> import mars.dataframe as md + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df.execute() + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + Rolling sum with a window length of 2, using the 'triang' + window type. + + >>> df.rolling(2, win_type='triang').sum().execute() + B + 0 NaN + 1 0.5 + 2 1.5 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, min_periods defaults + to the window length. 
+ + >>> df.rolling(2).sum().execute() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Same as above, but explicitly set the min_periods + + >>> df.rolling(2, min_periods=1).sum().execute() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + A ragged (meaning not-a-regular frequency), time-indexed DataFrame + + >>> df = md.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + >>> index = [md.Timestamp('20130101 09:00:00'), + >>> md.Timestamp('20130101 09:00:02'), + >>> md.Timestamp('20130101 09:00:03'), + >>> md.Timestamp('20130101 09:00:05'), + >>> md.Timestamp('20130101 09:00:06')]) + >>> df.execute() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + Contrasting to an integer rolling window, this will roll a variable + length window corresponding to the time period. + The default for min_periods is 1. + + >>> df.rolling('2s').sum().execute() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + """ + axis = validate_axis(axis, obj) + r = Rolling( + input=obj, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + r.validate() + return r diff --git a/python/xorbits/_mars/dataframe/window/rolling/tests/__init__.py b/python/xorbits/_mars/dataframe/window/rolling/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling.py b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling.py new file mode 100644 index 000000000..31535c1fe --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from ..... 
import dataframe as md +from .....core import tile + + +def test_rolling(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df) + + r = df2.rolling(3, min_periods=1, center=True, win_type="triang", closed="both") + expected = df.rolling( + 3, min_periods=1, center=True, win_type="triang", closed="both" + ) + assert repr(r) == repr(expected) + + assert "b" in dir(r) + + with pytest.raises(AttributeError): + _ = r.d + + with pytest.raises(KeyError): + _ = r["d"] + + with pytest.raises(KeyError): + _ = r["a", "d"] + + assert "a" not in dir(r.a) + assert "c" not in dir(r["a", "b"]) + + +def test_rolling_agg(): + df = pd.DataFrame(np.random.rand(4, 3), columns=list("abc")) + df2 = md.DataFrame(df, chunk_size=3) + + r = df2.rolling(3).agg("max") + expected = df.rolling(3).agg("max") + + assert r.shape == df.shape + assert r.index_value is df2.index_value + pd.testing.assert_index_equal(r.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(r.dtypes, df2.dtypes) + + r = tile(r) + for c in r.chunks: + assert c.shape == c.inputs[0].shape + assert c.index_value is c.inputs[0].index_value + pd.testing.assert_index_equal(c.columns_value.to_pandas(), expected.columns) + pd.testing.assert_series_equal(c.dtypes, expected.dtypes) diff --git a/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling_execution.py b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling_execution.py new file mode 100644 index 000000000..ac1bb79ac --- /dev/null +++ b/python/xorbits/_mars/dataframe/window/rolling/tests/test_rolling_execution.py @@ -0,0 +1,145 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ..... 
import dataframe as md + + +def test_rolling_agg_execution(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "a": rs.randint(100, size=(10,)), + "b": rs.rand(10), + "c": rs.randint(100, size=(10,)), + "d": ["c" * i for i in rs.randint(4, size=10)], + } + ) + raw.iloc[1, ::4] = np.nan + s = raw.iloc[:, 1] + + dfs = [ + md.DataFrame(raw, chunk_size=10), # 1 chunk + md.DataFrame(raw, chunk_size=3), # multiple chunks on each axis + ] + funcs = ["min", ["max", "mean"], {"c": ["std"], "b": ["count", "min"]}] + + df2 = dfs[0].rolling(3).agg(funcs[2]) + + # test 1 chunk + result = df2.execute().fetch() + expected = raw.rolling(3).agg(funcs[2]) + pd.testing.assert_frame_equal(result, expected) + + for window in [2, 5]: + for center in [True, False]: + for func in funcs: + df2 = dfs[1].rolling(window, center=center).agg(func) + + result = df2.execute().fetch() + expected = raw.rolling(window, center=center).agg(func) + pd.testing.assert_frame_equal(result, expected) + + # test min_periods and win_type + df2 = dfs[1].rolling(3, min_periods=1, win_type="triang").agg("sum") + + result = df2.execute().fetch() + expected = raw.rolling(3, min_periods=1, win_type="triang").agg("sum") + pd.testing.assert_frame_equal(result, expected) + + # test rolling getitem, series + df2 = dfs[1].rolling(3)["b"].agg("sum") + + result = df2.execute().fetch() + expected = raw.rolling(3)["b"].agg("sum") + pd.testing.assert_series_equal(result, expected) + + # test rolling getitem, dataframe + df2 = dfs[1].rolling(3)["c", "b"].agg("sum") + + result = df2.execute().fetch() + expected = raw.rolling(3)["c", "b"].agg("sum") + pd.testing.assert_frame_equal(result, expected) + + # test axis=1 + df2 = dfs[1].rolling(3, axis=1).agg("sum") + + result = df2.execute( + extra_config=dict(check_all=False, check_nsplits=False) + ).fetch() + expected = raw.rolling(3, axis=1).agg("sum") + pd.testing.assert_frame_equal(result, expected) + + # test window which is offset + raw2 = raw.copy() + raw2.reset_index(inplace=True, drop=True) + raw2.index = pd.date_range("2020-2-25", periods=10) + + df = md.DataFrame(raw2, chunk_size=3) + for func in funcs: + df2 = df.rolling("2d").agg(func) + + result = df2.execute().fetch() + expected = raw2.rolling("2d").agg(func) + pd.testing.assert_frame_equal(result, expected) + + series = [md.Series(s, chunk_size=10), md.Series(s, chunk_size=4)] + + funcs = ["min", ["max", "mean"], {"c": "std", "b": "count"}] + + for series in series: + for window in [2, 3, 5]: + for center in [True, False]: + for func in funcs: + series2 = series.rolling(window, center=center).agg(func) + + result = series2.execute().fetch() + expected = s.rolling(window, center=center).agg(func) + if isinstance(expected, pd.Series): + pd.testing.assert_series_equal(result, expected) + else: + pd.testing.assert_frame_equal(result, expected) + + df = md.DataFrame(raw, chunk_size=3) + df = df[df.a > 0.5] + r = df.rolling(3).agg("max") + + result = r.execute().fetch() + expected = raw[raw.a > 0.5].rolling(3).agg("max") + pd.testing.assert_frame_equal(result, expected) + + series = md.Series(s, chunk_size=3) + series = series[series > 0.5] + r = series.rolling(3).agg("max") + + result = r.execute().fetch() + expected = s[s > 0.5].rolling(3).agg("max") + pd.testing.assert_series_equal(result, expected) + + # test agg functions + df = md.DataFrame(raw, chunk_size=3) + for func in ["count", "sum", "mean", "median", "min", "max", "skew", "kurt"]: + r = getattr(df.rolling(4), func)() + + result = r.execute().fetch() + expected = 
getattr(raw.rolling(4), func)() + pd.testing.assert_frame_equal(result, expected) + for func in ["std", "var"]: + r = getattr(df.rolling(4), func)(ddof=0) + + result = r.execute().fetch() + expected = getattr(raw.rolling(4), func)(ddof=0) + pd.testing.assert_frame_equal(result, expected) diff --git a/python/xorbits/_mars/deploy/__init__.py b/python/xorbits/_mars/deploy/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/kubedl/__init__.py b/python/xorbits/_mars/deploy/kubedl/__init__.py new file mode 100644 index 000000000..4f8329168 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubedl/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .client import KubeDLClusterClient, new_cluster diff --git a/python/xorbits/_mars/deploy/kubedl/client.py b/python/xorbits/_mars/deploy/kubedl/client.py new file mode 100644 index 000000000..e1ec944ac --- /dev/null +++ b/python/xorbits/_mars/deploy/kubedl/client.py @@ -0,0 +1,372 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
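+
+# Minimal usage sketch for this module (the image, SLB endpoint and worker
+# count below are placeholder values; the kubernetes API client is assumed to
+# come from ``kubernetes.config.new_client_from_config()``):
+#
+#     from kubernetes import config as kube_config
+#     from xorbits._mars.deploy.kubedl import new_cluster
+#
+#     client = new_cluster(
+#         kube_config.new_client_from_config(),
+#         image="registry.example.com/mars:latest",
+#         slb_endpoint="https://slb.example.com",
+#         worker_num=2,
+#     )
+#     print(client.endpoint)    # Mars web endpoint exposed behind the SLB
+#     client.stop(wait=True)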
+ +import logging +import time +import warnings + +import requests + +from ...session import new_session +from .config import ( + MarsJobConfig, + MarsSchedulerSpecConfig, + MarsWebSpecConfig, + MarsWorkerSpecConfig, +) + +try: + from kubernetes.client.rest import ApiException as K8SApiException +except ImportError: # pragma: no cover + K8SApiException = None + +KUBEDL_API_VERSION = "kubedl.io/v1alpha1" +KUBEDL_MARS_PLURAL = "marsjobs" + + +logger = logging.getLogger(__name__) + + +class KubeDLClusterClient: + def __init__(self, cluster): + self._cluster = cluster + self._endpoint = None + self._session = None + + @property + def endpoint(self): + return self._endpoint + + @property + def namespace(self): + return self._cluster.namespace + + @property + def session(self): + return self._session + + def start(self): + self._endpoint = self._cluster.start() + self._session = new_session(self._endpoint, verify_ssl=self._cluster.verify_ssl) + + def stop(self, wait=False, timeout=0): + self._cluster.stop(wait=wait, timeout=timeout) + + +class KubeDLCluster: + def __init__( + self, + kube_api_client=None, + image=None, + job_name=None, + namespace=None, + scheduler_num=1, + scheduler_cpu=None, + scheduler_mem=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + web_num=1, + web_cpu=None, + web_mem=None, + slb_endpoint=None, + verify_ssl=True, + timeout=None, + **kwargs, + ): + from kubernetes import client as kube_client + + self._kube_api_client = kube_api_client + self._custom_api = kube_client.CustomObjectsApi(kube_api_client) + + self._slb_endpoint = slb_endpoint.rstrip("/") + self._verify_ssl = verify_ssl + + self._job_name = job_name + self._mars_endpoint = None + self._namespace = namespace or "default" + self._image = image + self._timeout = timeout + self._extra_volumes = kwargs.pop("extra_volumes", ()) + self._pre_stop_command = kwargs.pop("pre_stop_command", None) + self._log_when_fail = kwargs.pop("log_when_fail", False) + self._node_selectors = kwargs.pop("node_selectors", None) + + extra_modules = kwargs.pop("extra_modules", None) or [] + extra_modules = ( + extra_modules.split(",") + if isinstance(extra_modules, str) + else extra_modules + ) + extra_envs = kwargs.pop("extra_env", None) or dict() + + if not verify_ssl: + extra_envs["KUBE_VERIFY_SSL"] = "0" + + def _override_modules(updates): + modules = set(extra_modules) + updates = updates.split(",") if isinstance(updates, str) else updates + modules.update(updates) + return sorted(modules) + + def _override_envs(updates): + ret = extra_envs.copy() + ret.update(updates) + return ret + + self._scheduler_num = scheduler_num + self._scheduler_cpu = scheduler_cpu + self._scheduler_mem = scheduler_mem + self._scheduler_extra_modules = _override_modules( + kwargs.pop("scheduler_extra_modules", []) + ) + self._scheduler_extra_env = _override_envs( + kwargs.pop("scheduler_extra_env", None) or dict() + ) + + self._worker_num = worker_num + self._worker_cpu = worker_cpu + self._worker_mem = worker_mem + self._worker_spill_paths = worker_spill_paths + self._worker_cache_mem = worker_cache_mem + self._min_worker_num = min_worker_num or worker_num + self._worker_extra_modules = _override_modules( + kwargs.pop("worker_extra_modules", []) + ) + self._worker_extra_env = _override_envs( + kwargs.pop("worker_extra_env", None) or dict() + ) + + self._web_num = web_num + self._web_cpu = web_cpu + self._web_mem = web_mem + self._web_extra_modules = 
_override_modules(kwargs.pop("web_extra_modules", [])) + self._web_extra_env = _override_envs( + kwargs.pop("web_extra_env", None) or dict() + ) + + @property + def verify_ssl(self): + return self._verify_ssl + + def _check_if_exist(self): + if self._job_name is None: + return False + try: + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + service_obj = self._custom_api.get_namespaced_custom_object_status( + api, version, self._namespace, KUBEDL_MARS_PLURAL, self._job_name + ) + if len(service_obj.get("status", dict()).get("conditions", [])) > 0: + status = service_obj["status"]["conditions"][-1]["type"] + if status == "Running" or status == "Created": + logger.warning(f"Reusing cluster: {self._job_name}") + return True + else: + return False + else: + return False + except K8SApiException: + return False + + def _create_service(self): + scheduler_cfg = MarsSchedulerSpecConfig( + self._image, + self._scheduler_num, + cpu=self._scheduler_cpu, + memory=self._scheduler_mem, + node_selectors=self._node_selectors, + modules=self._scheduler_extra_modules, + ) + scheduler_cfg.add_simple_envs(self._scheduler_extra_env) + + worker_cfg = MarsWorkerSpecConfig( + self._image, + self._worker_num, + cpu=self._worker_cpu, + memory=self._worker_mem, + cache_mem=self._worker_cache_mem, + spill_dirs=self._worker_spill_paths, + node_selectors=self._node_selectors, + modules=self._worker_extra_modules, + ) + worker_cfg.add_simple_envs(self._worker_extra_env) + + web_cfg = MarsWebSpecConfig( + self._image, + self._web_num, + cpu=self._web_cpu, + memory=self._web_mem, + node_selectors=self._node_selectors, + modules=self._web_extra_modules, + ) + web_cfg.add_simple_envs(self._web_extra_env) + + job_cfg = MarsJobConfig( + job_name=self._job_name, + scheduler_config=scheduler_cfg, + worker_config=worker_cfg, + web_config=web_cfg, + web_host=self._slb_endpoint, + ) + + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + + cfg_json = job_cfg.build() + cfg_json["apiVersion"] = KUBEDL_API_VERSION + + response = self._custom_api.create_namespaced_custom_object( + api, version, self._namespace, KUBEDL_MARS_PLURAL, cfg_json + ) + self._job_name = response["metadata"]["name"] + + def _wait_service_ready(self): + self._mars_endpoint = ( + f"{self._slb_endpoint}/mars/{self._namespace}/{self._job_name}-webservice-0" + ) + logger.warning(f"Kubedl job name: {self._job_name}") + check_start_time = time.time() + worker_count_url = self._mars_endpoint + "/api/worker?action=count" + while True: + try: + if self._timeout and time.time() - check_start_time > self._timeout: + raise TimeoutError("Check Mars service start timeout") + + if not self._verify_ssl: + try: + import urllib3 + + urllib3.disable_warnings( + urllib3.exceptions.InsecureRequestWarning + ) + except ImportError: # pragma: no cover + pass + + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + service_obj = self._custom_api.get_namespaced_custom_object_status( + api, version, self._namespace, KUBEDL_MARS_PLURAL, self._job_name + ) + if len(service_obj.get("status", dict()).get("conditions", [])) > 0: + if service_obj["status"]["conditions"][-1]["type"] == "Failed": + raise SystemError( + service_obj["status"]["conditions"][-1]["message"] + ) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="Unverified HTTPS request" + ) + resp = requests.get( + worker_count_url, timeout=1, verify=self._verify_ssl + ) + + if int(resp.text) >= self._min_worker_num: + logger.warning(f"Web endpoint started at {self._mars_endpoint}") + break + except 
(requests.Timeout, ValueError) as ex: + if not isinstance(ex, requests.Timeout): + time.sleep(0.1) + pass + + def start(self): + try: + if not self._check_if_exist(): + self._create_service() + self._wait_service_ready() + return self._mars_endpoint + except: # noqa: E722 + self.stop() + raise + + def stop(self, wait=False, timeout=0): + from kubernetes import client as kube_client + + custom_api = kube_client.CustomObjectsApi(self._kube_api_client) + api, version = KUBEDL_API_VERSION.rsplit("/", 1) + custom_api.delete_namespaced_custom_object( + api, version, self._namespace, KUBEDL_MARS_PLURAL, self._job_name + ) + + if wait: + start_time = time.time() + while True: + try: + custom_api.get_namespaced_custom_object( + api, + version, + self._namespace, + KUBEDL_MARS_PLURAL, + self._job_name, + ) + except K8SApiException as ex: + if ex.status != 404: # pragma: no cover + raise + break + else: + time.sleep(1) + if ( + timeout and time.time() - start_time > timeout + ): # pragma: no cover + raise TimeoutError("Check Mars service stop timeout") + + +def new_cluster( + kube_api_client=None, + image=None, + scheduler_num=1, + scheduler_cpu=2, + scheduler_mem=4 * 1024**3, + worker_num=1, + worker_cpu=8, + worker_mem=32 * 1024**3, + worker_spill_paths=None, + worker_cache_mem="45%", + min_worker_num=None, + web_num=1, + web_cpu=1, + web_mem=4 * 1024**3, + slb_endpoint=None, + verify_ssl=True, + job_name=None, + timeout=None, + **kwargs, +): + worker_spill_paths = worker_spill_paths or ["/tmp/spill-dir"] + cluster = KubeDLCluster( + kube_api_client, + image=image, + scheduler_num=scheduler_num, + scheduler_cpu=scheduler_cpu, + scheduler_mem=scheduler_mem, + worker_num=worker_num, + worker_cpu=worker_cpu, + worker_mem=worker_mem, + worker_spill_paths=worker_spill_paths, + worker_cache_mem=worker_cache_mem, + min_worker_num=min_worker_num, + web_num=web_num, + web_cpu=web_cpu, + web_mem=web_mem, + slb_endpoint=slb_endpoint, + verify_ssl=verify_ssl, + job_name=job_name, + timeout=timeout, + **kwargs, + ) + client = KubeDLClusterClient(cluster) + client.start() + return client diff --git a/python/xorbits/_mars/deploy/kubedl/config.py b/python/xorbits/_mars/deploy/kubedl/config.py new file mode 100644 index 000000000..73425709f --- /dev/null +++ b/python/xorbits/_mars/deploy/kubedl/config.py @@ -0,0 +1,268 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
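+
+# The builders in this module produce the ``MarsJob`` custom-resource body
+# submitted by ``KubeDLCluster._create_service`` in client.py. A minimal
+# sketch of how they compose (image name and resource figures are
+# placeholders):
+#
+#     scheduler = MarsSchedulerSpecConfig("mars:latest", 1, cpu=2, memory=4 * 1024**3)
+#     worker = MarsWorkerSpecConfig(
+#         "mars:latest", 2, cpu=8, memory=32 * 1024**3, cache_mem="45%"
+#     )
+#     web = MarsWebSpecConfig("mars:latest", 1, cpu=1, memory=4 * 1024**3)
+#     job = MarsJobConfig(
+#         job_name=None,                  # None lets KubeDL generate a job name
+#         scheduler_config=scheduler,
+#         worker_config=worker,
+#         web_config=web,
+#         web_host="https://slb.example.com",
+#     )
+#     body = job.build()                  # plain dict for the custom objects API
+#     body["apiVersion"] = "kubedl.io/v1alpha1"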
+ +from urllib.parse import urlparse + +from ...utils import calc_size_by_str, parse_readable_size +from ..kubernetes.config import ContainerEnvConfig + +DEFAULT_SERVICE_ACCOUNT_NAME = "kubedl-sa" + + +def _remove_nones(cfg): + return dict((k, v) for k, v in cfg.items() if v is not None) + + +class ResourceConfig: + """ + Configuration builder for Kubernetes computation resources + """ + + def __init__(self, cpu, memory): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + def build(self): + return { + "cpu": str(self._cpu), + "memory": str(int(self._memory)), + } + + +class ReplicaSpecConfig: + """ + Base configuration builder for Kubernetes replication controllers + """ + + container_name = "mars" + + def __init__( + self, + name, + image, + replicas, + resource_request=None, + resource_limit=None, + node_selectors=None, + ): + self._name = name + self._image = image + self._replicas = replicas + self._envs = dict() + self._node_selectors = node_selectors + + self.add_default_envs() + + self._resource_request = resource_request + self._resource_limit = resource_limit + + def add_env(self, name, value=None, field_path=None): + self._envs[name] = ContainerEnvConfig(name, value=value, field_path=field_path) + + def add_simple_envs(self, envs): + for k, v in envs.items() or (): + self.add_env(k, v) + + def add_default_envs(self): + pass # pragma: no cover + + def build_container_command(self): + raise NotImplementedError + + def build_container(self): + resources_dict = { + "requests": self._resource_request.build() + if self._resource_request + else None, + "limits": self._resource_limit.build() if self._resource_limit else None, + } + return _remove_nones( + { + "imagePullPolicy": "Always", + "command": self.build_container_command(), + "env": [env.build() for env in self._envs.values()] or None, + "image": self._image, + "name": self.container_name, + "resources": dict((k, v) for k, v in resources_dict.items() if v) + or None, + } + ) + + def build_template_spec(self): + return _remove_nones( + { + "serviceAccountName": DEFAULT_SERVICE_ACCOUNT_NAME, + "nodeSelector": self._node_selectors, + "containers": [self.build_container()], + } + ) + + def build(self): + return { + "replicas": int(self._replicas), + "restartPolicy": "Never", + "template": { + "metadata": { + "labels": {"mars/service-type": self._name}, + }, + "spec": self.build_template_spec(), + }, + } + + +class MarsReplicaSpecConfig(ReplicaSpecConfig): + service_name = None + service_label = None + + def __init__( + self, + image, + replicas, + cpu=None, + memory=None, + limit_resources_ratio=1.2, + memory_limit_ratio=2, + modules=None, + node_selectors=None, + ): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + if isinstance(modules, str): + self._modules = modules.split(",") + else: + self._modules = modules + + res_request = ResourceConfig(cpu, memory) if cpu or memory else None + memory_limit_ratio = ( + memory_limit_ratio + if memory_limit_ratio is not None + else limit_resources_ratio + ) + res_limit = ( + ResourceConfig(cpu * limit_resources_ratio, memory * memory_limit_ratio) + if cpu or memory + else None + ) + super().__init__( + self.service_label, + image, + replicas, + resource_request=res_request, + resource_limit=res_limit, + node_selectors=node_selectors, + ) + + def build_container_command(self): + cmd = [ + "/srv/entrypoint.sh", + 
f"mars.deploy.kubernetes.{self.service_name}", + ] + return cmd + + def add_default_envs(self): + if self._cpu: + self.add_env("MARS_CPU_TOTAL", str(self._cpu)) + + if self._memory: + self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory))) + + if self._modules: + self.add_env("MARS_LOAD_MODULES", ",".join(self._modules)) + + +class MarsSchedulerSpecConfig(MarsReplicaSpecConfig): + service_name = "scheduler" + service_label = "marsscheduler" + + +class MarsWorkerSpecConfig(MarsReplicaSpecConfig): + service_name = "worker" + service_label = "marsworker" + + def __init__(self, *args, **kwargs): + cache_mem = kwargs.pop("cache_mem", None) + self._spill_dirs = kwargs.pop("spill_dirs", None) or () + # set limits as 2*requests for worker replica defaulted. + kwargs["limit_resources_ratio"] = kwargs.get("limit_resources_ratio", 1.2) + super().__init__(*args, **kwargs) + self._cache_mem = calc_size_by_str(cache_mem, self._memory) + self.add_env("MARS_CACHE_MEM_SIZE", self._cache_mem) + + @property + def spill_dirs(self): + return self._spill_dirs + + @property + def cache_mem(self): + return self._cache_mem + + def add_default_envs(self): + super().add_default_envs() + if self._spill_dirs: + self.add_env("MARS_SPILL_DIRS", ":".join(self._spill_dirs)) + + +class MarsWebSpecConfig(MarsReplicaSpecConfig): + service_name = "web" + service_label = "marsweb" + + +class MarsJobConfig: + def __init__( + self, job_name, scheduler_config, worker_config, web_config, web_host=None + ): + self._job_name = job_name + self._scheduler_config = scheduler_config + self._worker_config = worker_config + self._web_config = web_config + self._web_host = web_host + + def build(self): + if self._job_name is None: + metadata = {"generateName": "mars-job-"} + else: + metadata = {"name": self._job_name} + + web_host = self._web_host + if web_host is not None and "://" in web_host: + web_host = urlparse(web_host).netloc + + return { + "kind": "MarsJob", + "metadata": metadata, + "spec": _remove_nones( + { + "workerMemoryTuningPolicy": _remove_nones( + { + "spillDirs": self._worker_config.spill_dirs, + "workerCacheSize": self._worker_config.cache_mem, + } + ), + "cleanPodPolicy": "None", + "webHost": web_host, + "marsReplicaSpecs": { + "Worker": self._worker_config.build(), + "Scheduler": self._scheduler_config.build(), + "WebService": self._web_config.build(), + }, + } + ), + } diff --git a/python/xorbits/_mars/deploy/kubernetes/__init__.py b/python/xorbits/_mars/deploy/kubernetes/__init__.py new file mode 100644 index 000000000..346d3e035 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .client import KubernetesClusterClient, new_cluster diff --git a/python/xorbits/_mars/deploy/kubernetes/client.py b/python/xorbits/_mars/deploy/kubernetes/client.py new file mode 100644 index 000000000..19e35698b --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/client.py @@ -0,0 +1,480 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +import logging +import random +import time +import uuid +from urllib.parse import urlparse + +from ...lib.aio import new_isolation, stop_isolation +from ...services.cluster.api import WebClusterAPI +from ...session import new_session +from ...utils import calc_size_by_str +from ..utils import wait_services_ready +from .config import ( + MarsSupervisorsConfig, + MarsWorkersConfig, + NamespaceConfig, + RoleBindingConfig, + RoleConfig, + ServiceConfig, +) + +try: + from kubernetes.client.rest import ApiException as K8SApiException +except ImportError: # pragma: no cover + K8SApiException = None + +logger = logging.getLogger(__name__) + + +class KubernetesClusterClient: + def __init__(self, cluster): + self._cluster = cluster + self._endpoint = None + self._session = None + + @property + def endpoint(self): + return self._endpoint + + @property + def namespace(self): + return self._cluster.namespace + + @property + def session(self): + return self._session + + def start(self): + try: + self._endpoint = self._cluster.start() + self._session = new_session(self._endpoint) + except: # noqa: E722 # nosec # pylint: disable=bare-except + self.stop() + raise + + def stop(self, wait=False, timeout=0): + self._cluster.stop(wait=wait, timeout=timeout) + + +class KubernetesCluster: + _supervisor_config_cls = MarsSupervisorsConfig + _worker_config_cls = MarsWorkersConfig + _default_service_port = 7103 + _default_web_port = 7104 + + def __init__( + self, + kube_api_client=None, + image=None, + namespace=None, + supervisor_num=1, + supervisor_cpu=1, + supervisor_mem="4G", + supervisor_mem_limit_ratio=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + worker_min_cache_mem=None, + worker_mem_limit_ratio=None, + web_port=None, + service_name=None, + service_type=None, + timeout=None, + **kwargs, + ): + from kubernetes import client as kube_client + + if worker_cpu is None or worker_mem is None: # pragma: no cover + raise TypeError("`worker_cpu` and `worker_mem` must be specified") + + self._api_client = kube_api_client + self._core_api = kube_client.CoreV1Api(kube_api_client) + + self._namespace = namespace + self._image = image + self._timeout = timeout + self._service_name = service_name or "marsservice" + self._service_type = service_type or "NodePort" + self._extra_volumes = kwargs.pop("extra_volumes", ()) + self._pre_stop_command = kwargs.pop("pre_stop_command", None) + self._log_when_fail = kwargs.pop("log_when_fail", False) + + extra_modules = kwargs.pop("extra_modules", None) or [] + extra_modules = ( + 
extra_modules.split(",") + if isinstance(extra_modules, str) + else extra_modules + ) + extra_envs = kwargs.pop("extra_env", None) or dict() + extra_labels = kwargs.pop("extra_labels", None) or dict() + service_port = kwargs.pop("service_port", None) or self._default_service_port + + def _override_modules(updates): + modules = set(extra_modules) + updates = updates.split(",") if isinstance(updates, str) else updates + modules.update(updates) + return sorted(modules) + + def _override_dict(d, updates): + updates = updates or dict() + ret = d.copy() + ret.update(updates) + return ret + + _override_envs = functools.partial(_override_dict, extra_envs) + _override_labels = functools.partial(_override_dict, extra_labels) + + self._supervisor_num = supervisor_num + self._supervisor_cpu = supervisor_cpu + self._supervisor_mem = calc_size_by_str(supervisor_mem, None) + self._supervisor_mem_limit_ratio = supervisor_mem_limit_ratio + self._supervisor_extra_modules = _override_modules( + kwargs.pop("supervisor_extra_modules", []) + ) + self._supervisor_extra_env = _override_envs( + kwargs.pop("supervisor_extra_env", None) + ) + self._supervisor_extra_labels = _override_labels( + kwargs.pop("supervisor_extra_labels", None) + ) + self._supervisor_service_port = ( + kwargs.pop("supervisor_service_port", None) or service_port + ) + self._web_port = web_port or self._default_web_port + self._external_web_endpoint = None + + self._worker_num = worker_num + self._worker_cpu = worker_cpu + self._worker_mem = calc_size_by_str(worker_mem, None) + self._worker_mem_limit_ratio = worker_mem_limit_ratio + self._worker_spill_paths = worker_spill_paths + self._worker_cache_mem = worker_cache_mem + self._worker_min_cache_men = worker_min_cache_mem + self._min_worker_num = min_worker_num + self._worker_extra_modules = _override_modules( + kwargs.pop("worker_extra_modules", []) + ) + self._worker_extra_env = _override_envs(kwargs.pop("worker_extra_env", None)) + self._worker_extra_labels = _override_labels( + kwargs.pop("worker_extra_labels", None) + ) + self._worker_service_port = ( + kwargs.pop("worker_service_port", None) or service_port + ) + + @property + def namespace(self): + return self._namespace + + def _get_free_namespace(self): + while True: + namespace = "mars-ns-" + str(uuid.uuid4().hex) + try: + self._core_api.read_namespace(namespace) + except K8SApiException as ex: + if ex.status != 404: # pragma: no cover + raise + return namespace + + def _create_kube_service(self): + if self._service_type != "NodePort": # pragma: no cover + raise NotImplementedError( + f"Service type {self._service_type} not supported" + ) + + service_config = ServiceConfig( + self._service_name, + service_type="NodePort", + port=self._web_port, + selector={"mars/service-type": MarsSupervisorsConfig.rc_name}, + ) + self._core_api.create_namespaced_service( + self._namespace, service_config.build() + ) + + def _get_ready_pod_count(self, label_selector): + query = self._core_api.list_namespaced_pod( + namespace=self._namespace, label_selector=label_selector + ).to_dict() + cnt = 0 + for el in query["items"]: + if el["status"]["phase"] in ("Error", "Failed"): + logger.warning( + "Error in starting pod, message: %s", el["status"]["message"] + ) + continue + if "status" not in el or "conditions" not in el["status"]: + cnt += 1 + elif any( + cond["type"] == "Ready" and cond["status"] == "True" + for cond in el["status"].get("conditions") or () + ): + cnt += 1 + return cnt + + def _create_namespace(self): + if self._namespace is None: + 
namespace = self._namespace = self._get_free_namespace() + else: + namespace = self._namespace + + self._core_api.create_namespace(NamespaceConfig(namespace).build()) + + def _create_roles_and_bindings(self): + # create role and binding + role_config = RoleConfig( + "mars-pod-operator", + self._namespace, + api_groups="", + resources="pods,endpoints,services", + verbs="get,watch,list,patch", + ) + role_config.create_namespaced(self._api_client, self._namespace) + role_binding_config = RoleBindingConfig( + "mars-pod-operator-binding", self._namespace, "mars-pod-operator", "default" + ) + role_binding_config.create_namespaced(self._api_client, self._namespace) + + def _create_supervisors(self): + supervisors_config = self._supervisor_config_cls( + self._supervisor_num, + image=self._image, + cpu=self._supervisor_cpu, + memory=self._supervisor_mem, + memory_limit_ratio=self._supervisor_mem_limit_ratio, + modules=self._supervisor_extra_modules, + volumes=self._extra_volumes, + service_name=self._service_name, + service_port=self._supervisor_service_port, + web_port=self._web_port, + pre_stop_command=self._pre_stop_command, + ) + supervisors_config.add_simple_envs(self._supervisor_extra_env) + supervisors_config.add_labels(self._supervisor_extra_labels) + supervisors_config.create_namespaced(self._api_client, self._namespace) + + def _create_workers(self): + workers_config = self._worker_config_cls( + self._worker_num, + image=self._image, + cpu=self._worker_cpu, + memory=self._worker_mem, + memory_limit_ratio=self._worker_mem_limit_ratio, + spill_volumes=self._worker_spill_paths, + modules=self._worker_extra_modules, + volumes=self._extra_volumes, + worker_cache_mem=self._worker_cache_mem, + min_cache_mem=self._worker_min_cache_men, + service_name=self._service_name, + service_port=self._worker_service_port, + pre_stop_command=self._pre_stop_command, + supervisor_web_port=self._web_port, + ) + workers_config.add_simple_envs(self._worker_extra_env) + workers_config.add_labels(self._worker_extra_labels) + workers_config.create_namespaced(self._api_client, self._namespace) + + def _create_services(self): + self._create_supervisors() + self._create_workers() + + def _wait_services_ready(self): + min_worker_num = int(self._min_worker_num or self._worker_num) + limits = [self._supervisor_num, min_worker_num] + selectors = [ + "mars/service-type=" + MarsSupervisorsConfig.rc_name, + "mars/service-type=" + MarsWorkersConfig.rc_name, + ] + start_time = time.time() + logger.debug("Start waiting pods to be ready") + wait_services_ready( + selectors, + limits, + lambda sel: self._get_ready_pod_count(sel), + timeout=self._timeout, + ) + logger.info("All service pods ready.") + if self._timeout is not None: # pragma: no branch + self._timeout -= time.time() - start_time + + def _get_web_address(self): + svc_data = self._core_api.read_namespaced_service( + "marsservice", self._namespace + ).to_dict() + node_port = svc_data["spec"]["ports"][0]["node_port"] + + # docker desktop use a VM to hold docker processes, hence + # we need to use API address instead + desktop_nodes = self._core_api.list_node( + field_selector="metadata.name=docker-desktop" + ).to_dict() + if desktop_nodes["items"]: # pragma: no cover + host_ip = urlparse( + self._core_api.api_client.configuration.host + ).netloc.split(":", 1)[0] + else: + web_pods = self._core_api.list_namespaced_pod( + self._namespace, + label_selector="mars/service-type=" + MarsSupervisorsConfig.rc_name, + ).to_dict() + host_ip = 
random.choice(web_pods["items"])["status"]["host_ip"] + return f"http://{host_ip}:{node_port}" + + def _wait_web_ready(self): + loop = new_isolation().loop + + async def get_supervisors(): + start_time = time.time() + while True: + try: + cluster_api = WebClusterAPI(self._external_web_endpoint) + supervisors = await cluster_api.get_supervisors() + + if len(supervisors) == self._supervisor_num: + break + except: # noqa: E722 # nosec # pylint: disable=bare-except # pragma: no cover + if ( + self._timeout is not None + and time.time() - start_time > self._timeout + ): + logger.exception("Error when fetching supervisors") + raise TimeoutError( + "Wait for kubernetes cluster timed out" + ) from None + + asyncio.run_coroutine_threadsafe(get_supervisors(), loop).result() + + def _load_cluster_logs(self): + log_dict = dict() + pod_items = self._core_api.list_namespaced_pod(self._namespace).to_dict() + for item in pod_items["items"]: + log_dict[item["metadata"]["name"]] = self._core_api.read_namespaced_pod_log( + name=item["metadata"]["name"], namespace=self._namespace + ) + return log_dict + + def start(self): + try: + self._create_namespace() + self._create_roles_and_bindings() + + self._create_services() + self._create_kube_service() + + self._wait_services_ready() + + self._external_web_endpoint = self._get_web_address() + self._wait_web_ready() + return self._external_web_endpoint + except: # noqa: E722 + if self._log_when_fail: # pargma: no cover + logger.error("Error when creating cluster") + for name, log in self._load_cluster_logs().items(): + logger.error("Error logs for %s:\n%s", name, log) + self.stop() + raise + + def stop(self, wait=False, timeout=0): + # stop isolation + stop_isolation() + + from kubernetes.client import CoreV1Api + + api = CoreV1Api(self._api_client) + api.delete_namespace(self._namespace) + if wait: + start_time = time.time() + while True: + try: + api.read_namespace(self._namespace) + except K8SApiException as ex: + if ex.status != 404: # pragma: no cover + raise + break + else: + time.sleep(1) + if ( + timeout and time.time() - start_time > timeout + ): # pragma: no cover + raise TimeoutError + + +def new_cluster( + kube_api_client=None, + image=None, + supervisor_num=1, + supervisor_cpu=None, + supervisor_mem=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + web_num=1, + web_cpu=None, + web_mem=None, + service_type=None, + timeout=None, + **kwargs, +): + """ + :param kube_api_client: Kubernetes API client, can be created with ``new_client_from_config`` + :param image: Docker image to use, ``marsproject/mars:`` by default + :param supervisor_num: Number of supervisors in the cluster, 1 by default + :param supervisor_cpu: Number of CPUs for every supervisor + :param supervisor_mem: Memory size for every supervisor + :param worker_num: Number of workers in the cluster, 1 by default + :param worker_cpu: Number of CPUs for every worker + :param worker_mem: Memory size for every worker + :param worker_spill_paths: Spill paths for worker pods on hosts + :param worker_cache_mem: Size or ratio of cache memory for every worker + :param min_worker_num: Minimal ready workers + :param web_num: Number of web services in the cluster, 1 by default + :param web_cpu: Number of CPUs for every web service + :param web_mem: Memory size for every web service + :param service_type: Type of Kubernetes Service, currently only ``NodePort`` supported + :param timeout: Timeout when creating clusters + 
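+    :return: a ``KubernetesClusterClient`` whose cluster has already been started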
""" + cluster_cls = kwargs.pop("cluster_cls", KubernetesCluster) + cluster = cluster_cls( + kube_api_client, + image=image, + supervisor_num=supervisor_num, + supervisor_cpu=supervisor_cpu, + supervisor_mem=supervisor_mem, + worker_num=worker_num, + worker_cpu=worker_cpu, + worker_mem=worker_mem, + worker_spill_paths=worker_spill_paths, + worker_cache_mem=worker_cache_mem, + min_worker_num=min_worker_num, + web_num=web_num, + web_cpu=web_cpu, + web_mem=web_mem, + service_type=service_type, + timeout=timeout, + **kwargs, + ) + client = KubernetesClusterClient(cluster) + client.start() + return client diff --git a/python/xorbits/_mars/deploy/kubernetes/config.py b/python/xorbits/_mars/deploy/kubernetes/config.py new file mode 100644 index 000000000..34a8f8717 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/config.py @@ -0,0 +1,673 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import functools +import math +import re + +from ... import __version__ as mars_version +from ...utils import calc_size_by_str, parse_readable_size + +DEFAULT_IMAGE = "marsproject/mars:v" + mars_version +DEFAULT_WORKER_CACHE_MEM = "40%" + + +def _remove_nones(cfg): + return dict((k, v) for k, v in cfg.items() if v is not None) + + +_kube_api_mapping = { + "v1": "CoreV1Api", + "apps/v1": "AppsV1Api", + "rbac.authorization.k8s.io/v1": "RbacAuthorizationV1Api", +} + + +@functools.lru_cache(10) +def _get_k8s_api(api_version, k8s_api_client): + from kubernetes import client as kube_client + + return getattr(kube_client, _kube_api_mapping[api_version])(k8s_api_client) + + +@functools.lru_cache(10) +def _camel_to_underline(name): + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() + + +class KubeConfig(abc.ABC): + api_version = "v1" + + def create_namespaced(self, k8s_api_client, namespace): + api = _get_k8s_api(self.api_version, k8s_api_client) + config = self.build() + method_name = f'create_namespaced_{_camel_to_underline(config["kind"])}' + return getattr(api, method_name)(namespace, config) + + @abc.abstractmethod + def build(self): + """Build config dict of the object""" + + +class RoleConfig(KubeConfig): + """ + Configuration builder for Kubernetes RBAC roles + """ + + api_version = "rbac.authorization.k8s.io/v1" + + def __init__(self, name, namespace, api_groups, resources, verbs): + self._name = name + self._namespace = namespace + self._api_groups = api_groups.split(",") + self._resources = resources.split(",") + self._verbs = verbs.split(",") + + def build(self): + return { + "kind": "Role", + "metadata": {"name": self._name, "namespace": self._namespace}, + "rules": [ + { + "apiGroups": self._api_groups, + "resources": self._resources, + "verbs": self._verbs, + } + ], + } + + +class RoleBindingConfig(KubeConfig): + """ + Configuration builder for Kubernetes RBAC role bindings + """ + + api_version = "rbac.authorization.k8s.io/v1" + + def __init__(self, name, namespace, role_name, 
service_account_name): + self._name = name + self._namespace = namespace + self._role_name = role_name + self._service_account_name = service_account_name + + def build(self): + return { + "kind": "RoleBinding", + "metadata": {"name": self._name, "namespace": self._namespace}, + "roleRef": { + "apiGroup": "rbac.authorization.k8s.io", + "kind": "Role", + "name": self._role_name, + }, + "subjects": [ + { + "kind": "ServiceAccount", + "name": self._service_account_name, + "namespace": self._namespace, + } + ], + } + + +class NamespaceConfig(KubeConfig): + """ + Configuration builder for Kubernetes namespaces + """ + + def __init__(self, name): + self._name = name + + def build(self): + return { + "kind": "Namespace", + "metadata": { + "name": self._name, + "labels": { + "name": self._name, + }, + }, + } + + +class ServiceConfig(KubeConfig): + """ + Configuration builder for Kubernetes services + """ + + def __init__( + self, name, service_type, selector, port, target_port=None, protocol=None + ): + self._name = name + self._type = service_type + self._protocol = protocol or "TCP" + self._selector = selector + self._port = port + self._target_port = target_port + + def build(self): + return { + "kind": "Service", + "metadata": { + "name": self._name, + "labels": { + "mars/service-name": self._name, + }, + }, + "spec": _remove_nones( + { + "type": self._type, + "selector": self._selector, + "ports": [ + _remove_nones( + { + "protocol": self._protocol, + "port": self._port, + "targetPort": self._target_port, + } + ), + ], + } + ), + } + + +class ResourceConfig: + """ + Configuration builder for Kubernetes computation resources + """ + + def __init__(self, cpu, memory): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + @property + def cpu(self): + return self._cpu + + @property + def memory(self): + return self._memory + + def build(self): + return _remove_nones( + { + "cpu": f"{int(self._cpu * 1000)}m" if self._cpu else None, + "memory": str(int(self._memory)) if self._memory else None, + } + ) + + +class PortConfig: + """ + Configuration builder for Kubernetes ports definition for containers + """ + + def __init__(self, container_port): + self._container_port = int(container_port) + + def build(self): + return { + "containerPort": self._container_port, + } + + +class VolumeConfig(abc.ABC): + """ + Base configuration builder for Kubernetes volumes + """ + + def __init__(self, name, mount_path): + self.name = name + self.mount_path = mount_path + + @abc.abstractmethod + def build(self): + """Build volume config""" + + def build_mount(self): + return { + "name": self.name, + "mountPath": self.mount_path, + } + + +class HostPathVolumeConfig(VolumeConfig): + """ + Configuration builder for Kubernetes host volumes + """ + + def __init__(self, name, mount_path, host_path, volume_type=None): + super().__init__(name, mount_path) + self._host_path = host_path + self._volume_type = volume_type or "DirectoryOrCreate" + + def build(self): + return { + "name": self.name, + "hostPath": {"path": self._host_path, "type": self._volume_type}, + } + + +class EmptyDirVolumeConfig(VolumeConfig): + """ + Configuration builder for Kubernetes empty-dir volumes + """ + + def __init__(self, name, mount_path, use_memory=True, size_limit=None): + super().__init__(name, mount_path) + self._medium = "Memory" if use_memory else None + self._size_limit = size_limit + + def build(self): + result = {"name": self.name, "emptyDir": {}} + if 
self._medium: + result["emptyDir"]["medium"] = self._medium + if self._size_limit: + result["emptyDir"]["sizeLimit"] = str(int(self._size_limit)) + return result + + +class ContainerEnvConfig: + """ + Configuration builder for Kubernetes container environments + """ + + def __init__(self, name, value=None, field_path=None): + self._name = name + self._value = value + self._field_path = field_path + + def build(self): + result = dict(name=self._name) + if self._value is not None: + result["value"] = str(self._value) + elif self._field_path is not None: # pragma: no branch + result["valueFrom"] = {"fieldRef": {"fieldPath": self._field_path}} + return result + + +class ProbeConfig: + """ + Base configuration builder for Kubernetes liveness and readiness probes + """ + + def __init__( + self, + initial_delay=5, + period=1, + timeout=None, + success_thresh=None, + failure_thresh=None, + ): + self._initial_delay = initial_delay + self._period = period + self._timeout = timeout + self._success_thresh = success_thresh + self._failure_thresh = failure_thresh + + def build(self): + return _remove_nones( + { + "initialDelaySeconds": self._initial_delay, + "periodSeconds": self._period, + "timeoutSeconds": self._timeout, + "successThreshold": self._success_thresh, + "failureThreshold": self._failure_thresh, + } + ) + + +class TcpSocketProbeConfig(ProbeConfig): + """ + Configuration builder for TCP liveness and readiness probes + """ + + def __init__(self, port: int, **kwargs): + super().__init__(**kwargs) + self._port = port + + def build(self): + ret = super().build() + ret["tcpSocket"] = {"port": self._port} + return ret + + +class ReplicationConfig(KubeConfig): + """ + Base configuration builder for Kubernetes replication controllers + """ + + _default_kind = "Deployment" + + def __init__( + self, + name, + image, + replicas, + resource_request=None, + resource_limit=None, + liveness_probe=None, + readiness_probe=None, + pre_stop_command=None, + kind=None, + ): + self._name = name + self._kind = kind or self._default_kind + self._image = image + self._replicas = replicas + self._ports = [] + self._volumes = [] + self._envs = dict() + self._labels = dict() + + self.add_default_envs() + + self._resource_request = resource_request + self._resource_limit = resource_limit + + self._liveness_probe = liveness_probe + self._readiness_probe = readiness_probe + + self._pre_stop_command = pre_stop_command + + @property + def api_version(self): + return "apps/v1" if self._kind in ("Deployment", "ReplicaSet") else "v1" + + def add_env(self, name, value=None, field_path=None): + self._envs[name] = ContainerEnvConfig(name, value=value, field_path=field_path) + + def remove_env(self, name): # pragma: no cover + self._envs.pop(name, None) + + def add_simple_envs(self, envs): + for k, v in envs.items() or (): + self.add_env(k, v) + + def add_labels(self, labels): + self._labels.update(labels) + + def add_port(self, container_port): + self._ports.append(PortConfig(container_port)) + + def add_default_envs(self): + pass # pragma: no cover + + def add_volume(self, vol): + self._volumes.append(vol) + + @abc.abstractmethod + def build_container_command(self): + """Output container command""" + + def build_container(self): + resources_dict = { + "requests": self._resource_request.build() + if self._resource_request + else None, + "limits": self._resource_limit.build() if self._resource_limit else None, + } + lifecycle_dict = _remove_nones( + { + "preStop": { + "exec": {"command": self._pre_stop_command}, + } + if 
self._pre_stop_command + else None, + } + ) + return _remove_nones( + { + "command": self.build_container_command(), + "env": [env.build() for env in self._envs.values()] or None, + "image": self._image, + "name": self._name, + "resources": dict((k, v) for k, v in resources_dict.items() if v) + or None, + "ports": [p.build() for p in self._ports] or None, + "volumeMounts": [vol.build_mount() for vol in self._volumes] or None, + "livenessProbe": self._liveness_probe.build() + if self._liveness_probe + else None, + "readinessProbe": self._readiness_probe.build() + if self._readiness_probe + else None, + "lifecycle": lifecycle_dict or None, + } + ) + + def build_template_spec(self): + result = { + "containers": [self.build_container()], + "volumes": [vol.build() for vol in self._volumes], + } + return dict((k, v) for k, v in result.items() if v) + + def build(self): + return { + "kind": self._kind, + "metadata": { + "name": self._name, + }, + "spec": { + "replicas": int(self._replicas), + "template": { + "metadata": { + "labels": _remove_nones(self._labels) or None, + }, + "spec": self.build_template_spec(), + }, + }, + } + + +class MarsReplicationConfig(ReplicationConfig, abc.ABC): + """ + Base configuration builder for replication controllers for Mars + """ + + rc_name = None + default_readiness_port = 15031 + + def __init__( + self, + replicas, + cpu=None, + memory=None, + limit_resources=False, + memory_limit_ratio=None, + image=None, + modules=None, + volumes=None, + service_name=None, + service_port=None, + **kwargs, + ): + self._cpu = cpu + self._memory, ratio = ( + parse_readable_size(memory) if memory is not None else (None, False) + ) + assert not ratio + + if isinstance(modules, str): + self._modules = modules.split(",") + else: + self._modules = modules + + req_res = ResourceConfig(cpu, memory) if cpu or memory else None + limit_res = ( + ResourceConfig(req_res.cpu, req_res.memory * (memory_limit_ratio or 1)) + if req_res and memory + else None + ) + + self._service_name = service_name + self._service_port = service_port + + super().__init__( + self.rc_name, + image or DEFAULT_IMAGE, + replicas, + resource_request=req_res, + resource_limit=limit_res if limit_resources else None, + readiness_probe=self.config_readiness_probe(), + **kwargs, + ) + if service_port: + self.add_port(service_port) + + for vol in volumes or (): + self.add_volume(vol) + + self.add_labels({"mars/service-type": self.rc_name}) + + def add_default_envs(self): + self.add_env("MARS_K8S_POD_NAME", field_path="metadata.name") + self.add_env("MARS_K8S_POD_NAMESPACE", field_path="metadata.namespace") + self.add_env("MARS_K8S_POD_IP", field_path="status.podIP") + + if self._service_name: + self.add_env("MARS_K8S_SERVICE_NAME", str(self._service_name)) + if self._service_port: + self.add_env("MARS_K8S_SERVICE_PORT", str(self._service_port)) + + self.add_env("MARS_CONTAINER_IP", field_path="status.podIP") + + if self._cpu: + self.add_env("MKL_NUM_THREADS", str(self._cpu)) + self.add_env("MARS_CPU_TOTAL", str(self._cpu)) + if getattr(self, "stat_type", "cgroup") == "cgroup": + self.add_env("MARS_USE_CGROUP_STAT", "1") + + if self._memory: + self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory))) + + if self._modules: + self.add_env("MARS_LOAD_MODULES", ",".join(self._modules)) + + def config_readiness_probe(self): + raise NotImplementedError + + @staticmethod + def get_local_app_module(mod_name): + return __name__.rsplit(".", 1)[0] + "." 
+ mod_name + + def build(self): + result = super().build() + if self._kind in ("Deployment", "ReplicaSet"): + result["spec"]["selector"] = { + "matchLabels": {"mars/service-type": self.rc_name} + } + else: + result["spec"]["selector"] = {"mars/service-type": self.rc_name} + return result + + +class MarsSupervisorsConfig(MarsReplicationConfig): + """ + Configuration builder for Mars supervisor service + """ + + rc_name = "marssupervisor" + + def __init__(self, *args, **kwargs): + self._web_port = kwargs.pop("web_port", None) + self._readiness_port = kwargs.pop("readiness_port", self.default_readiness_port) + super().__init__(*args, **kwargs) + if self._web_port: + self.add_port(self._web_port) + + def config_readiness_probe(self): + return TcpSocketProbeConfig(self._readiness_port, timeout=60, failure_thresh=10) + + def build_container_command(self): + cmd = [ + "/srv/entrypoint.sh", + self.get_local_app_module("supervisor"), + ] + if self._service_port: + cmd += ["-p", str(self._service_port)] + if self._web_port: + cmd += ["-w", str(self._web_port)] + if self._cpu: + cmd += ["--n-process", str(int(math.ceil(self._cpu)))] + return cmd + + +class MarsWorkersConfig(MarsReplicationConfig): + """ + Configuration builder for Mars worker service + """ + + rc_name = "marsworker" + + def __init__(self, *args, **kwargs): + spill_volumes = kwargs.pop("spill_volumes", None) or () + mount_shm = kwargs.pop("mount_shm", True) + self._limit_resources = kwargs["limit_resources"] = kwargs.get( + "limit_resources", True + ) + worker_cache_mem = ( + kwargs.pop("worker_cache_mem", None) or DEFAULT_WORKER_CACHE_MEM + ) + min_cache_mem = kwargs.pop("min_cache_mem", None) + self._readiness_port = kwargs.pop("readiness_port", self.default_readiness_port) + supervisor_web_port = kwargs.pop("supervisor_web_port", None) + + super().__init__(*args, **kwargs) + + self._spill_volumes = [] + for idx, vol in enumerate(spill_volumes): + if isinstance(vol, str): + path = f"/mnt/hostpath{idx}" + self.add_volume(HostPathVolumeConfig(f"host-path-vol-{idx}", path, vol)) + self._spill_volumes.append(path) + else: + self.add_volume(vol) + self._spill_volumes.append(vol.mount_path) + if self._spill_volumes: + self.add_env("MARS_SPILL_DIRS", ":".join(self._spill_volumes)) + + if self._memory: + size_limit = calc_size_by_str(worker_cache_mem, self._memory) + self.add_env("MARS_CACHE_MEM_SIZE", worker_cache_mem) + else: + size_limit = None + + if mount_shm: + self.add_volume( + EmptyDirVolumeConfig("mars-shared", "/dev/shm", size_limit=size_limit) + ) + + if min_cache_mem: + self.add_env("MARS_MIN_CACHE_MEM_SIZE", min_cache_mem) + if supervisor_web_port: + self.add_env("MARS_K8S_SUPERVISOR_WEB_PORT", supervisor_web_port) + + def config_readiness_probe(self): + return TcpSocketProbeConfig(self._readiness_port, timeout=60, failure_thresh=10) + + def build_container_command(self): + cmd = [ + "/srv/entrypoint.sh", + self.get_local_app_module("worker"), + ] + if self._service_port: + cmd += ["-p", str(self._service_port)] + return cmd diff --git a/python/xorbits/_mars/deploy/kubernetes/config.yml b/python/xorbits/_mars/deploy/kubernetes/config.yml new file mode 100644 index 000000000..127ff0ec7 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/config.yml @@ -0,0 +1,7 @@ +"@inherits": ../oscar/base_config.yml +cluster: + backend: k8s +storage: + backends: [plasma] + plasma: + store_memory: 20% diff --git a/python/xorbits/_mars/deploy/kubernetes/core.py b/python/xorbits/_mars/deploy/kubernetes/core.py new file mode 100644 index 
000000000..0eaca00fd --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/core.py @@ -0,0 +1,222 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +from typing import AsyncGenerator, Dict, List, Optional, TypeVar + +from ...services.cluster import WebClusterAPI +from ...services.cluster.backends import ( + AbstractClusterBackend, + register_cluster_backend, +) +from ...services.cluster.core import NodeRole +from ..utils import next_in_thread, wait_all_supervisors_ready +from .config import MarsReplicationConfig + +logger = logging.getLogger(__name__) +RetType = TypeVar("RetType") + + +@register_cluster_backend +class K8SClusterBackend(AbstractClusterBackend): + name = "k8s" + + def __init__( + self, node_role=None, pool_address=None, k8s_config=None, k8s_namespace=None + ): + from kubernetes import client + + self._node_role = node_role + self._pool_address = pool_address + self._k8s_config = k8s_config + + verify_ssl = bool(int(os.environ.get("KUBE_VERIFY_SSL", "1"))) + if not verify_ssl: + c = client.Configuration() + c.verify_ssl = False + client.Configuration.set_default(c) + + self._k8s_namespace = ( + k8s_namespace or os.environ.get("MARS_K8S_POD_NAMESPACE") or "default" + ) + self._service_name = os.environ.get("MARS_K8S_SERVICE_NAME") + self._full_label_selector = None + self._client = client.CoreV1Api(client.ApiClient(self._k8s_config)) + + self._pod_to_ep = dict() + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str + ) -> "AbstractClusterBackend": + from kubernetes import client, config + + if lookup_address is None: + k8s_namespace = None + k8s_config = config.load_incluster_config() + else: + address_parts = lookup_address.rsplit("?", 1) + k8s_namespace = None if len(address_parts) == 1 else address_parts[1] + + k8s_config = client.Configuration() + if "://" in address_parts[0]: + k8s_config.host = address_parts[0] + else: + config.load_kube_config( + address_parts[0], client_configuration=k8s_config + ) + return cls(node_role, pool_address, k8s_config, k8s_namespace) + + def __reduce__(self): + return ( + type(self), + ( + self._node_role, + self._pool_address, + self._k8s_config, + self._k8s_namespace, + ), + ) + + @staticmethod + def _format_endpoint_query_result(result: Dict, filter_ready: bool = True): + port = os.environ["MARS_K8S_SERVICE_PORT"] + endpoints = [ + f"{addr['ip']}:{port}" for addr in result["subsets"][0]["addresses"] or [] + ] + if not filter_ready: + endpoints = [ + f"{addr['ip']}:{port}" + for addr in result["subsets"][0]["not_ready_addresses"] or [] + ] + return endpoints + + def _get_web_cluster_api(self): + supervisor_web_port = os.environ["MARS_K8S_SUPERVISOR_WEB_PORT"] + web_url = ( + f"http://{self._service_name}.{self._k8s_namespace}:{supervisor_web_port}" + ) + api = WebClusterAPI(web_url) + return api + + async def _watch_supervisors_by_service_api( + self, + ) -> AsyncGenerator[List[str], None]: + from 
kubernetes.watch import Watch as K8SWatch + from urllib3.exceptions import ReadTimeoutError + + w = K8SWatch() + + while True: + streamer = w.stream( + self._client.list_namespaced_endpoints, + namespace=self._k8s_namespace, + label_selector=f"mars/service-name={self._service_name}", + timeout_seconds=60, + ) + while True: + try: + event = await next_in_thread(streamer) + obj_dict = event["object"].to_dict() + yield self._format_endpoint_query_result(obj_dict) + except (ReadTimeoutError, StopAsyncIteration): + break + except: # noqa: E722 # pragma: no cover # pylint: disable=bare-except + logger.exception("Unexpected error when watching on kubernetes") + break + + async def _watch_supervisors_by_cluster_web_api(self): + while True: + try: + api = self._get_web_cluster_api() + async for supervisors in api.watch_supervisors(): + yield supervisors + except (OSError, asyncio.TimeoutError): + pass + + async def _get_supervisors_by_service_api( + self, filter_ready: bool = True + ) -> List[str]: + result = ( + await asyncio.to_thread( + self._client.read_namespaced_endpoints, + name=self._service_name, + namespace=self._k8s_namespace, + ) + ).to_dict() + return self._format_endpoint_query_result(result, filter_ready=filter_ready) + + async def _get_supervisors_by_cluster_web_api(self, filter_ready: bool = True): + api = self._get_web_cluster_api() + try: + supervisors = await api.get_supervisors(filter_ready=filter_ready) + return supervisors + except (OSError, asyncio.TimeoutError): # pragma: no cover + return [] + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + if self._node_role == NodeRole.SUPERVISOR: + return await self._get_supervisors_by_service_api(filter_ready) + else: + return await self._get_supervisors_by_cluster_web_api(filter_ready) + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + if self._node_role == NodeRole.SUPERVISOR: + watch_fun = self._watch_supervisors_by_service_api + else: + watch_fun = self._watch_supervisors_by_cluster_web_api + + try: + async for supervisors in watch_fun(): + yield supervisors + except asyncio.CancelledError: + pass + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + raise NotImplementedError + + async def release_worker(self, address: str): + raise NotImplementedError + + async def reconstruct_worker(self, address: str): + raise NotImplementedError + + +class K8SServiceMixin: + @staticmethod + def write_pid_file(): + with open("/tmp/mars-service.pid", "w") as pid_file: + pid_file.write(str(os.getpid())) + + async def wait_all_supervisors_ready(self): + """ + Wait till all containers are ready + """ + await wait_all_supervisors_ready(self.args.endpoint) + + async def start_readiness_server(self): + readiness_port = os.environ.get( + "MARS_K8S_READINESS_PORT", MarsReplicationConfig.default_readiness_port + ) + self._readiness_server = await asyncio.start_server( + lambda r, w: None, port=readiness_port + ) + + async def stop_readiness_server(self): + self._readiness_server.close() + await self._readiness_server.wait_closed() diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile new file mode 100644 index 000000000..2675538ab --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile @@ -0,0 +1,23 @@ +ARG BASE_CONTAINER=marsproject/mars-base +FROM ${BASE_CONTAINER} + +COPY . 
/opt/mars/ + +RUN apt-get -yq update --allow-releaseinfo-change \ + && apt-get -yq install gcc g++ \ + && curl -fsSL https://deb.nodesource.com/setup_14.x | sudo -E bash - \ + && sudo apt-get install -y nodejs \ + && /opt/conda/bin/pip install -e /opt/mars \ + && apt-get -yq remove gcc g++ nodejs \ + && apt-get -yq autoremove \ + && apt-get -yq clean \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /usr/local/lib/node_modules +RUN mkdir -p /srv +WORKDIR /srv + +RUN cp /opt/mars/mars/deploy/oscar/file-logging.conf /srv/logging.conf \ + && cp /opt/mars/mars/deploy/kubernetes/docker/entrypoint.sh /srv/entrypoint.sh \ + && chmod a+x /srv/*.sh + +ENTRYPOINT [ "/srv/entrypoint.sh" ] diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile.base b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile.base new file mode 100644 index 000000000..2e43d968d --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/Dockerfile.base @@ -0,0 +1,32 @@ +ARG BASE_CONTAINER=continuumio/miniconda3:4.9.2 +FROM ${BASE_CONTAINER} + +COPY retry.sh /srv/retry.sh + +RUN /srv/retry.sh 3 /opt/conda/bin/conda install \ + cloudpickle \ + cython \ + greenlet \ + mkl \ + numba \ + numexpr \ + numpy\>=1.14.0 \ + pandas\>=1.0.0 \ + psutil \ + scikit-learn \ + scipy \ + sqlalchemy \ + tornado \ + lz4 \ + && /srv/retry.sh 3 /opt/conda/bin/conda install -c conda-forge \ + libiconv \ + pyarrow\>=1.0 \ + tiledb-py \ + python-kubernetes \ + uvloop \ + && /opt/conda/bin/conda clean --all -f -y + +RUN apt-get -yq update --allow-releaseinfo-change \ + && apt-get -yq install curl sudo procps \ + && apt-get -yq clean \ + && rm -rf /var/lib/apt/lists/* \ diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/entrypoint.sh b/python/xorbits/_mars/deploy/kubernetes/docker/entrypoint.sh new file mode 100755 index 000000000..97eb6b0cb --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +if [[ "$1" == *"/"* ]]; then + $@ +else + /opt/conda/bin/python -m "$1" ${@:2} +fi diff --git a/python/xorbits/_mars/deploy/kubernetes/docker/retry.sh b/python/xorbits/_mars/deploy/kubernetes/docker/retry.sh new file mode 100755 index 000000000..ca87a335e --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/docker/retry.sh @@ -0,0 +1,13 @@ +#!/bin/bash +RETRIES=$1 +shift +for (( RETRY=1; RETRY <= $RETRIES ; RETRY++ )); do + "$@" + EXIT=$? + if [[ $EXIT != 0 ]]; then + echo "Command attempt $RETRY failed" + else + exit 0 + fi +done +exit $EXIT diff --git a/python/xorbits/_mars/deploy/kubernetes/supervisor.py b/python/xorbits/_mars/deploy/kubernetes/supervisor.py new file mode 100644 index 000000000..de15b3e66 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/supervisor.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
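+# Kubernetes-specific supervisor entrypoint: extends the generic supervisor command runner and serves the TCP readiness probe provided by K8SServiceMixin.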
+ +from ..oscar.supervisor import SupervisorCommandRunner +from .core import K8SServiceMixin + + +class K8SSupervisorCommandRunner(K8SServiceMixin, SupervisorCommandRunner): + async def start_services(self): + await super().start_services() + await self.start_readiness_server() + + async def stop_services(self): + await self.stop_readiness_server() + await super().stop_services() + + +main = K8SSupervisorCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/Dockerfile.test b/python/xorbits/_mars/deploy/kubernetes/tests/Dockerfile.test new file mode 100644 index 000000000..2634fd3a7 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/Dockerfile.test @@ -0,0 +1,18 @@ +ARG BASE_CONTAINER=marsproject/mars-base +FROM ${BASE_CONTAINER} + +RUN /srv/retry.sh 3 /opt/conda/bin/conda install -c pkgs/main \ + coverage\>=5.0 cloudpickle \ + && conda clean --all -f -y + +RUN apt-get -yq update --allow-releaseinfo-change +RUN apt-get -yq install git gcc g++ + +COPY docker-logging.conf /srv/logging.conf +COPY build_ext.sh /srv/build_ext.sh +COPY entrypoint.sh /srv/entrypoint.sh +COPY graceful_stop.sh /srv/graceful_stop.sh + +RUN echo "import coverage; coverage.process_startup()" > \ + $(/opt/conda/bin/python -c "import site; print(site.getsitepackages()[-1])")/coverage.pth +RUN chmod a+x /srv/*.sh diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/__init__.py b/python/xorbits/_mars/deploy/kubernetes/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/build_ext.sh b/python/xorbits/_mars/deploy/kubernetes/tests/build_ext.sh new file mode 100644 index 000000000..073b2b3ad --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/build_ext.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +cd /mnt/mars +/opt/conda/bin/python setup.py build_ext -i diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/docker-logging.conf b/python/xorbits/_mars/deploy/kubernetes/tests/docker-logging.conf new file mode 100644 index 000000000..320ca6cb3 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/docker-logging.conf @@ -0,0 +1,50 @@ +[loggers] +keys=root,main,deploy,services,oscar,tornado + +[handlers] +keys=stream_handler + +[formatters] +keys=formatter + +[logger_root] +level=WARN +handlers=stream_handler + +[logger_main] +level=DEBUG +handlers=stream_handler +qualname=__main__ +propagate=0 + +[logger_deploy] +level=DEBUG +handlers=stream_handler +qualname=mars.deploy +propagate=0 + +[logger_oscar] +level=DEBUG +handlers=stream_handler +qualname=mars.oscar +propagate=0 + +[logger_services] +level=DEBUG +handlers=stream_handler +qualname=mars.services +propagate=0 + +[logger_tornado] +level=WARN +handlers=stream_handler +qualname=tornado +propagate=0 + +[handler_stream_handler] +class=StreamHandler +formatter=formatter +args=(sys.stderr,) + +[formatter_formatter] +format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/entrypoint.sh b/python/xorbits/_mars/deploy/kubernetes/tests/entrypoint.sh new file mode 100755 index 000000000..918d87bfd --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/entrypoint.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +cd /mnt/mars +/opt/conda/bin/pip install -e ".[dev,extra]" + +mkdir -p .dist-coverage +export COVERAGE_FILE=.dist-coverage/.coverage + +COV_RUNNER="/opt/conda/bin/coverage run" + +if [[ $1 == *"supervisor"* ]]; then + $COV_RUNNER -m "$1" --log-conf /srv/logging.conf ${@:2} +elif [[ $1 == *"worker"* ]]; then + $COV_RUNNER -m "$1" --log-conf /srv/logging.conf ${@:2} +else + $COV_RUNNER -m "$1" --log-conf /srv/logging.conf ${@:2} +fi +while [[ -f /tmp/stopping.tmp ]]; do + sleep 1 +done diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/graceful_stop.sh b/python/xorbits/_mars/deploy/kubernetes/tests/graceful_stop.sh new file mode 100644 index 000000000..5153fc3c2 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/graceful_stop.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -e +touch /tmp/stopping.tmp +if [[ -f /tmp/mars-service.pid ]]; then + SERVICE_PID="$(cat /tmp/mars-service.pid)" + kill -INT "$SERVICE_PID" || true + CNT=0 + while kill -0 "$SERVICE_PID"; do + sleep 0.5 + CNT=$((CNT+1)) + if [[ $CNT -gt 10 ]]; then + break + fi + done + kill -INT "$SERVICE_PID" || true +fi diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/test_config.py b/python/xorbits/_mars/deploy/kubernetes/tests/test_config.py new file mode 100644 index 000000000..c4494fca6 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/test_config.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..config import ( + EmptyDirVolumeConfig, + MarsSupervisorsConfig, + MarsWorkersConfig, + NamespaceConfig, + RoleBindingConfig, + RoleConfig, + ServiceConfig, +) + + +def test_simple_objects(): + ns_config_dict = NamespaceConfig("ns_name").build() + assert ns_config_dict["metadata"]["name"] == "ns_name" + + role_config_dict = RoleConfig( + "mars-pod-reader", "ns_name", "", "pods", "get,watch,list" + ).build() + assert role_config_dict["metadata"]["name"] == "mars-pod-reader" + assert "get" in role_config_dict["rules"][0]["verbs"] + + role_binding_config_dict = RoleBindingConfig( + "mars-pod-reader-binding", "ns_name", "mars-pod-reader", "default" + ).build() + assert role_binding_config_dict["metadata"]["name"] == "mars-pod-reader-binding" + + service_config_dict = ServiceConfig( + "mars-test-service", "NodePort", "mars/service-type=marssupervisor", 7103, 7103 + ).build() + assert service_config_dict["metadata"]["name"] == "mars-test-service" + + +def test_supervisor_object(): + supervisor_config = MarsSupervisorsConfig( + 1, cpu=2, memory="10g", limit_resources=False, modules=["mars.test_mod"] + ) + supervisor_config.add_simple_envs(dict(TEST_ENV="test_val")) + + supervisor_config_dict = supervisor_config.build() + assert supervisor_config_dict["metadata"]["name"] == "marssupervisor" + assert supervisor_config_dict["spec"]["replicas"] == 1 + + container_dict = supervisor_config_dict["spec"]["template"]["spec"]["containers"][0] + assert int(container_dict["resources"]["requests"]["memory"]) == 10 * 1024**3 + + container_envs = dict((p["name"], p) for p in container_dict["env"]) + assert container_envs["TEST_ENV"]["value"] == "test_val" + assert container_envs["MKL_NUM_THREADS"]["value"] == "2" + assert container_envs["MARS_CPU_TOTAL"]["value"] == "2" + assert int(container_envs["MARS_MEMORY_TOTAL"]["value"]) == 10 * 1024**3 + assert container_envs["MARS_LOAD_MODULES"]["value"] == "mars.test_mod" + + +def test_worker_object(): + worker_config_dict = MarsWorkersConfig( + 4, + cpu=2, + memory=10 * 1024**3, + limit_resources=True, + memory_limit_ratio=2, + spill_volumes=[ + "/tmp/spill_vol", + EmptyDirVolumeConfig("empty-dir", "/tmp/empty"), + ], + worker_cache_mem="20%", + min_cache_mem="10%", + modules="mars.test_mod", + mount_shm=True, + ).build() + assert worker_config_dict["metadata"]["name"] == "marsworker" + assert worker_config_dict["spec"]["replicas"] == 4 + + container_dict = worker_config_dict["spec"]["template"]["spec"]["containers"][0] + assert int(container_dict["resources"]["requests"]["memory"]) == 10 * 1024**3 + assert int(container_dict["resources"]["limits"]["memory"]) == 20 * 1024**3 + + container_envs = dict((p["name"], p) for p in container_dict["env"]) + assert container_envs["MKL_NUM_THREADS"]["value"] == "2" + assert container_envs["MARS_CPU_TOTAL"]["value"] == "2" + assert int(container_envs["MARS_MEMORY_TOTAL"]["value"]) == 10 * 1024**3 + assert container_envs["MARS_LOAD_MODULES"]["value"] == "mars.test_mod" + assert set(container_envs["MARS_SPILL_DIRS"]["value"].split(":")) == { + "/tmp/empty", + "/mnt/hostpath0", + } + assert 
container_envs["MARS_CACHE_MEM_SIZE"]["value"] == "20%" + + volume_list = worker_config_dict["spec"]["template"]["spec"]["volumes"] + volume_envs = dict((v["name"], v) for v in volume_list) + assert "empty-dir" in volume_envs + assert volume_envs["host-path-vol-0"]["hostPath"]["path"] == "/tmp/spill_vol" + + volume_mounts = dict((v["name"], v) for v in container_dict["volumeMounts"]) + assert volume_mounts["empty-dir"]["mountPath"] == "/tmp/empty" + assert volume_mounts["host-path-vol-0"]["mountPath"] == "/mnt/hostpath0" + + worker_config_dict = MarsWorkersConfig( + 4, + cpu=2, + memory=10 * 1024**3, + limit_resources=False, + spill_volumes=[ + "/tmp/spill_vol", + EmptyDirVolumeConfig("empty-dir", "/tmp/empty"), + ], + modules="mars.test_mod", + mount_shm=False, + ).build() + + volume_list = worker_config_dict["spec"]["template"]["spec"]["volumes"] + assert "shm-volume" not in volume_list + + container_dict = worker_config_dict["spec"]["template"]["spec"]["containers"][0] + volume_mounts = dict((v["name"], v) for v in container_dict["volumeMounts"]) + assert "shm-volume" not in volume_mounts diff --git a/python/xorbits/_mars/deploy/kubernetes/tests/test_kubernetes.py b/python/xorbits/_mars/deploy/kubernetes/tests/test_kubernetes.py new file mode 100644 index 000000000..03eee880d --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/tests/test_kubernetes.py @@ -0,0 +1,284 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os +import shutil +import subprocess +import tempfile +import uuid +from contextlib import contextmanager +from distutils.spawn import find_executable + +import numpy as np +import pytest + +from .... import tensor as mt +from ....tests.core import mock +from .. 
import new_cluster +from ..config import HostPathVolumeConfig + +try: + from kubernetes import client as k8s_client + from kubernetes import config as k8s_config +except ImportError: + k8s_client = k8s_config = None + +MARS_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(mt.__file__))) +TEST_ROOT = os.path.dirname(os.path.abspath(__file__)) +DOCKER_ROOT = os.path.join(os.path.dirname(TEST_ROOT), "docker") + +kube_available = ( + find_executable("kubectl") is not None + and find_executable("docker") is not None + and k8s_config is not None +) + + +def _collect_coverage(): + dist_coverage_path = os.path.join(MARS_ROOT, ".dist-coverage") + if os.path.exists(dist_coverage_path): + # change ownership of coverage files + if find_executable("sudo"): + proc = subprocess.Popen( + [ + "sudo", + "-n", + "chown", + "-R", + f"{os.geteuid()}:{os.getegid()}", + dist_coverage_path, + ], + shell=False, + ) + proc.wait() + + # rewrite paths in coverage result files + for fn in glob.glob(os.path.join(dist_coverage_path, ".coverage.*")): + if "COVERAGE_FILE" in os.environ: + new_cov_file = os.environ["COVERAGE_FILE"] + os.path.basename( + fn + ).replace(".coverage", "") + else: + new_cov_file = fn.replace(".dist-coverage" + os.sep, "") + shutil.copyfile(fn, new_cov_file) + shutil.rmtree(dist_coverage_path) + + +def _build_docker_images(use_test_docker_file=True): + image_name = "mars-test-image:" + uuid.uuid1().hex + try: + if use_test_docker_file: + proc = subprocess.Popen( + ["docker", "build", "-f", "Dockerfile.test", "-t", image_name, "."], + cwd=TEST_ROOT, + ) + else: + proc = subprocess.Popen( + [ + "docker", + "build", + "-f", + os.path.join(DOCKER_ROOT, "Dockerfile"), + "-t", + image_name, + ".", + ], + cwd=MARS_ROOT, + ) + if proc.wait() != 0: + raise SystemError("Executing docker build failed.") + + if use_test_docker_file: + proc = subprocess.Popen( + [ + "docker", + "run", + "-v", + MARS_ROOT + ":/mnt/mars", + image_name, + "/srv/build_ext.sh", + ] + ) + if proc.wait() != 0: + raise SystemError("Executing docker run failed.") + except: # noqa: E722 + _remove_docker_image(image_name) + raise + return image_name + + +def _remove_docker_image(image_name, raises=True): + if "CI" not in os.environ: + # delete image iff in CI environment + return + proc = subprocess.Popen(["docker", "rmi", "-f", image_name]) + if proc.wait() != 0 and raises: + raise SystemError("Executing docker rmi failed.") + + +def _load_docker_env(): + if os.path.exists("/var/run/docker.sock") or not shutil.which("minikube"): + return + + proc = subprocess.Popen(["minikube", "docker-env"], stdout=subprocess.PIPE) + proc.wait(30) + for line in proc.stdout: + line = line.decode().split("#", 1)[0] + line = line.strip() # type: str | bytes + export_pos = line.find("export") + if export_pos < 0: + continue + line = line[export_pos + 6 :].strip() + var, value = line.split("=", 1) + os.environ[var] = value.strip('"') + + +@contextmanager +def _start_kube_cluster(use_test_docker_file=True, **kwargs): + _load_docker_env() + image_name = _build_docker_images(use_test_docker_file=use_test_docker_file) + + temp_spill_dir = tempfile.mkdtemp(prefix="test-mars-k8s-") + api_client = k8s_config.new_client_from_config() + kube_api = k8s_client.CoreV1Api(api_client) + + cluster_client = None + try: + if use_test_docker_file: + extra_volumes = [ + HostPathVolumeConfig("mars-src-path", "/mnt/mars", MARS_ROOT) + ] + pre_stop_command = ["rm", "/tmp/stopping.tmp"] + else: + extra_volumes = [] + pre_stop_command = None + + cluster_client = 
new_cluster( + api_client, + image=image_name, + worker_spill_paths=[temp_spill_dir], + extra_volumes=extra_volumes, + pre_stop_command=pre_stop_command, + timeout=600, + log_when_fail=True, + **kwargs, + ) + + assert cluster_client.endpoint is not None + + pod_items = kube_api.list_namespaced_pod(cluster_client.namespace).to_dict() + + log_processes = [] + for item in pod_items["items"]: + log_processes.append( + subprocess.Popen( + [ + "kubectl", + "logs", + "-f", + "-n", + cluster_client.namespace, + item["metadata"]["name"], + ] + ) + ) + + yield + + if use_test_docker_file: + # turn off service processes with grace to get coverage data + procs = [] + pod_items = kube_api.list_namespaced_pod(cluster_client.namespace).to_dict() + for item in pod_items["items"]: + p = subprocess.Popen( + [ + "kubectl", + "exec", + "-n", + cluster_client.namespace, + item["metadata"]["name"], + "--", + "/srv/graceful_stop.sh", + ] + ) + procs.append(p) + for p in procs: + p.wait() + + [p.terminate() for p in log_processes] + finally: + shutil.rmtree(temp_spill_dir) + if cluster_client: + try: + cluster_client.stop(wait=True, timeout=20) + except TimeoutError: + pass + _collect_coverage() + _remove_docker_image(image_name, False) + + +@pytest.mark.parametrize("use_test_docker_file", [True, False]) +@pytest.mark.skipif(not kube_available, reason="Cannot run without kubernetes") +def test_run_in_kubernetes(use_test_docker_file): + with _start_kube_cluster( + supervisor_cpu=0.5, + supervisor_mem="1G", + worker_cpu=0.5, + worker_mem="1G", + worker_cache_mem="64m", + extra_labels={"mars-test/group": "test-label-name"}, + extra_env={"MARS_K8S_GROUP_LABELS": "mars-test/group"}, + use_test_docker_file=use_test_docker_file, + ): + a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 + b = mt.ones((100, 100), chunk_size=20) * 2 * 1 + 1 + c = (a * b * 2 + 1).sum() + r = c.execute().fetch() + + expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1 + np.testing.assert_array_equal(r, expected.sum()) + + +@pytest.mark.skipif(not kube_available, reason="Cannot run without kubernetes") +@mock.patch( + "kubernetes.client.CoreV1Api.create_namespaced_replication_controller", + new=lambda *_, **__: None, +) +@mock.patch( + "kubernetes.client.AppsV1Api.create_namespaced_deployment", + new=lambda *_, **__: None, +) +def test_create_timeout(): + _load_docker_env() + api_client = k8s_config.new_client_from_config() + + cluster = None + try: + extra_vol_config = HostPathVolumeConfig("mars-src-path", "/mnt/mars", MARS_ROOT) + with pytest.raises(TimeoutError): + cluster = new_cluster( + api_client, + image="pseudo_image", + supervisor_cpu=0.5, + supervisor_mem="1G", + worker_cpu=0.5, + worker_mem="1G", + extra_volumes=[extra_vol_config], + timeout=1, + ) + finally: + if cluster: + cluster.stop(wait=True) + _collect_coverage() diff --git a/python/xorbits/_mars/deploy/kubernetes/worker.py b/python/xorbits/_mars/deploy/kubernetes/worker.py new file mode 100644 index 000000000..f2e41cb05 --- /dev/null +++ b/python/xorbits/_mars/deploy/kubernetes/worker.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +from ..oscar.worker import WorkerCommandRunner +from .core import K8SServiceMixin + +logger = logging.getLogger(__name__) + + +class K8SWorkerCommandRunner(K8SServiceMixin, WorkerCommandRunner): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + async def start_services(self): + from ...services.cluster import ClusterAPI + from ..oscar.worker import start_worker + + self.write_pid_file() + await start_worker( + self.pool.external_address, + self.args.supervisors, + self.band_to_resource, + list(self.args.load_modules), + self.config, + mark_ready=False, + ) + await self.wait_all_supervisors_ready() + + cluster_api = await ClusterAPI.create(self.args.endpoint) + await cluster_api.mark_node_ready() + + await self.start_readiness_server() + + async def stop_services(self): + await self.stop_readiness_server() + await super().stop_services() + + +main = K8SWorkerCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/oscar/__init__.py b/python/xorbits/_mars/deploy/oscar/__init__.py new file mode 100644 index 000000000..775a30345 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .ray import RayClusterBackend, new_cluster_in_ray, new_ray_session diff --git a/python/xorbits/_mars/deploy/oscar/base_config.yml b/python/xorbits/_mars/deploy/oscar/base_config.yml new file mode 100644 index 000000000..21f7a2c0d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/base_config.yml @@ -0,0 +1,86 @@ +services: + - cluster + - session + - storage + - meta + - lifecycle + - scheduling + - subtask + - task + - mutable +cluster: + backend: fixed + node_timeout: 120 + node_check_interval: 1 + log_dir: null +session: + custom_log_dir: null +storage: + default_config: + transfer_block_size: 5 * 1024 ** 2 + plasma: + store_memory: 20% + "@overriding_fields": ["backends"] +meta: + store: dict +task: + default_config: + optimize_tileable_graph: yes + optimize_chunk_graph: yes + fuse_enabled: yes + initial_same_color_num: null + as_broadcaster_successor_num: null + execution_config: + backend: mars +scheduling: + autoscale: + enabled: false + min_workers: 1 # Must >=1, mars need at least 1 worker to fetch data + max_workers: 100 + scheduler_backlog_timeout: 60 + worker_idle_timeout: 120 + speculation: + # Enables (yes) or disables (no) speculative execution of subtasks. 
+ # If enabled, `initial_same_color_num` will be set to 1 to ensure there are enough homogeneous + # subtasks for calculating statistics + enabled: no + # If yes, detect slow subtasks but do not actually submit speculative runs for them + dry: no + # The interval, in seconds, between checks for speculative subtasks. + interval: 5 + # The percentage of subtasks that have not finished yet at which to start speculation. + threshold: 75% + # Minimum amount of time, in seconds, a task must run before being considered for speculation. + # This can be used to avoid launching speculative copies of tasks that are very short. + min_task_runtime: 3 + # How many times slower a task is than the median to be considered for speculation. + multiplier: 1.5 + # Max number of concurrent speculative runs for a subtask. + max_concurrent_run: 3 + subtask_cancel_timeout: 5 +metrics: + backend: console + # If the backend is prometheus, prometheus config can be added as follows: + # prometheus: + # port: 8988 +oscar: + numa: + # external address scheme, default null, + # available values: null, ucx + external_addr_scheme: null + # enable internal address for in-process communication + enable_internal_addr: yes + gpu: + # external address scheme, default null, + # available values: null, ucx + external_addr_scheme: null + # enable internal address for in-process communication + enable_internal_addr: yes + extra_conf: + ucx: + tcp: null + nvlink: null + infiniband: null + rdmacm: null + cuda-copy: null + create-cuda-contex: null diff --git a/python/xorbits/_mars/deploy/oscar/cmdline.py b/python/xorbits/_mars/deploy/oscar/cmdline.py new file mode 100644 index 000000000..10965fe07 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/cmdline.py @@ -0,0 +1,259 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
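+# Shared command-line runner for Mars services: parses common arguments, sets up logging and the event loop, and drives the actor pool; concrete supervisor/worker entrypoints implement create_actor_pool, start_services and stop_services.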
+ +import argparse +import asyncio +import faulthandler +import glob +import importlib +import json +import logging.config +import os +import sys +import tempfile +from typing import List + +import psutil + +from ...utils import ensure_coverage +from ..utils import get_third_party_modules_from_config, load_service_config_file + +logger = logging.getLogger(__name__) +_is_windows: bool = sys.platform.startswith("win") +ensure_coverage() + + +class OscarCommandRunner: + command_description = None + node_role = None + _port_file_prefix = "mars_service_process" + + def __init__(self): + faulthandler.enable() + + self.args = None + self.ports = None + self.config = {} + self.pool = None + + self.logging_conf = {} + + self._running = False + + def config_args(self, parser): + parser.add_argument("-e", "--endpoint", help="endpoint of the service") + parser.add_argument("-H", "--host", help="host name of the service") + parser.add_argument( + "-p", + "--ports", + help="ports of the service, must equal to num of processes", + ) + parser.add_argument("-c", "--config", help="service configuration") + parser.add_argument( + "-f", "--config-file", help="configuration file of the service" + ) + parser.add_argument( + "-s", + "--supervisors", + help="endpoint of supervisors, needed for workers and webs " + "when kv-store argument is not available, or when you " + "need to use multiple supervisors without kv-store", + ) + parser.add_argument("--log-level", help="log level") + parser.add_argument("--log-format", help="log format") + parser.add_argument( + "--log-conf", help="log config file, logging.conf by default" + ) + parser.add_argument("--load-modules", nargs="*", help="modules to import") + parser.add_argument( + "--use-uvloop", help="use uvloop, 'auto' by default. 
Use 'no' to disable" + ) + + def _set_log_dir(self): + cluster_config: dict = self.config.get("cluster") + if cluster_config is None: + raise KeyError('"cluster" key is missing!') + log_dir = cluster_config.get("log_dir") + self.logging_conf["log_dir"] = log_dir + self.logging_conf["from_cmd"] = True + + def _get_logging_config_paths(self): + log_conf = self.args.log_conf or "logging.conf" + + return [ + log_conf, + os.path.join(os.path.abspath("."), log_conf), + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "file-logging.conf" + ), + ] + + def config_logging(self): + self._set_log_dir() + + # get level and format cmd line config + log_level = self.args.log_level + level = log_level.upper() if log_level else None + self.logging_conf["level"] = level + formatter = self.args.log_format + if formatter: + self.logging_conf["format"] = formatter + + config_paths = self._get_logging_config_paths() + for i, conf_path in enumerate(config_paths): + if os.path.exists(conf_path): + self.logging_conf["file"] = conf_path + break + + @classmethod + def _build_endpoint_file_path(cls, pid: int = None, asterisk: bool = False): + pid = pid or os.getpid() + return os.path.join( + tempfile.gettempdir(), f'{cls._port_file_prefix}.{"*" if asterisk else pid}' + ) + + def _write_supervisor_endpoint_file(self, args): + file_name = self._build_endpoint_file_path() + with open(file_name, "w") as port_file: + port_file.write(args.endpoint) + return file_name + + def _collect_supervisors_from_dir(self): + endpoints = [] + for fn in glob.glob(self._build_endpoint_file_path(asterisk=True)): + _, pid_str = os.path.basename(fn).rsplit(".", 1) + # detect if process exists + if pid_str.isdigit() and not psutil.pid_exists(int(pid_str)): + continue + with open(fn, "r") as ep_file: + endpoints.append(ep_file.read().strip()) + return endpoints + + @classmethod + def get_default_config_file(cls): + mod_file_path = os.path.dirname( + importlib.import_module(cls.__module__).__file__ + ) + return os.path.join(mod_file_path, "config.yml") + + def parse_args(self, parser, argv, environ=None): + environ = environ or os.environ + args = parser.parse_args(argv) + + if args.endpoint is not None and args.host is not None: # pragma: no cover + raise ValueError("Cannot specify host and endpoint at the same time") + + if "MARS_TASK_DETAIL" in environ: + task_detail = json.loads(environ["MARS_TASK_DETAIL"]) + task_type, task_index = ( + task_detail["task"]["type"], + task_detail["task"]["index"], + ) + + args.host = args.host or task_detail["cluster"][task_type][task_index] + args.supervisors = args.supervisors or ",".join( + task_detail["cluster"]["supervisor"] + ) + + default_host = "0.0.0.0" if not _is_windows else "127.0.0.1" + env_host = os.environ.get( + "MARS_BIND_HOST", os.environ.get("MARS_CONTAINER_IP", default_host) + ) + args.host = args.host or env_host + + args.ports = args.ports or os.environ.get("MARS_BIND_PORT") + if args.ports is not None: + self.ports = [int(p) for p in args.ports.split(",")] + + if args.endpoint is None and len(self.ports or []) == 1: + args.endpoint = f"{args.host}:{self.ports[0]}" + self.ports = None + + args.use_uvloop = args.use_uvloop or "auto" + + if args.config is not None: + self.config = json.loads(args.config) + else: + if args.config_file is None: + args.config_file = self.get_default_config_file() + self.config = load_service_config_file(args.config_file) + + load_modules = [] + for mods in list(args.load_modules or ()) + get_third_party_modules_from_config( + self.config, 
self.node_role, environ + ): + load_modules.extend(mods.split(",") if mods else []) + args.load_modules = tuple(load_modules) + + if args.supervisors is None: + args.supervisors = ",".join(self._collect_supervisors_from_dir()) + + return args + + async def _main(self, argv): + self.config_logging() + + try: + pool = self.pool = await self.create_actor_pool() + + await self.start_services() + self._running = True + await pool.join() + except asyncio.CancelledError: + if self._running: # pragma: no branch + await self.stop_services() + if self.pool: # pragma: no branch + await self.pool.stop() + + async def create_actor_pool(self): + raise NotImplementedError + + async def start_services(self): + raise NotImplementedError + + async def stop_services(self): + raise NotImplementedError + + def create_loop(self): + use_uvloop = self.args.use_uvloop.strip() + if use_uvloop in ("0", "no"): + loop = asyncio.get_event_loop() + else: + try: + import uvloop + + loop = uvloop.new_event_loop() + asyncio.set_event_loop(loop) + except ImportError: + if use_uvloop == "auto": + loop = asyncio.get_event_loop() + else: # pragma: no cover + raise + return loop + + def __call__(self, argv: List[str] = None): + parser = argparse.ArgumentParser(description=self.command_description) + self.config_args(parser) + self.args = self.parse_args(parser, argv) + + loop = self.create_loop() + task = loop.create_task(self._main(argv)) + + try: + loop.run_until_complete(task) + except KeyboardInterrupt: + task.cancel() + loop.run_until_complete(task) + # avoid displaying exception-unhandled warnings + task.exception() diff --git a/python/xorbits/_mars/deploy/oscar/config.yml b/python/xorbits/_mars/deploy/oscar/config.yml new file mode 100644 index 000000000..fe6918a14 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/config.yml @@ -0,0 +1,3 @@ +"@inherits": base_config.yml +storage: + backends: [shared_memory] diff --git a/python/xorbits/_mars/deploy/oscar/file-logging.conf b/python/xorbits/_mars/deploy/oscar/file-logging.conf new file mode 100644 index 000000000..12b952719 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/file-logging.conf @@ -0,0 +1,74 @@ +[loggers] +keys=root,main,deploy,services,oscar,tornado,dataframe,learn,tensor + +[handlers] +keys=stream_handler,file_handler + +[formatters] +keys=formatter + +[logger_root] +level=WARN +handlers=stream_handler,file_handler + +[logger_main] +level=DEBUG +handlers=stream_handler,file_handler +qualname=__main__ +propagate=0 + +[logger_deploy] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.deploy +propagate=0 + +[logger_oscar] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.oscar +propagate=0 + +[logger_services] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.services +propagate=0 + +[logger_dataframe] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.dataframe +propagate=0 + +[logger_learn] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.learn +propagate=0 + +[logger_tensor] +level=DEBUG +handlers=stream_handler,file_handler +qualname=mars.tensor +propagate=0 + +[logger_tornado] +level=WARN +handlers=stream_handler,file_handler +qualname=tornado +propagate=0 + +[handler_stream_handler] +class=StreamHandler +formatter=formatter +level=DEBUG +args=(sys.stderr,) + +[handler_file_handler] +class=FileHandler +formatter=formatter +level=DEBUG + +[formatter_formatter] +format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s diff --git 
a/python/xorbits/_mars/deploy/oscar/local.py b/python/xorbits/_mars/deploy/oscar/local.py new file mode 100644 index 000000000..8c256509a --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/local.py @@ -0,0 +1,433 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import atexit +import logging +import os +import sys +from concurrent.futures import Future as SyncFuture +from typing import Dict, List, Union + +import numpy as np + +from ... import oscar as mo +from ...core.entrypoints import init_extension_entrypoints +from ...lib.aio import get_isolation +from ...metrics import init_metrics +from ...oscar.backends.router import Router +from ...resource import cpu_count, cuda_count, mem_total +from ...services import NodeRole +from ...services.task.execution.api import ExecutionConfig +from ...typing import ClientType, ClusterType +from ..utils import get_third_party_modules_from_config, load_config +from .pool import create_supervisor_actor_pool, create_worker_actor_pool +from .service import start_supervisor, start_worker, stop_supervisor, stop_worker +from .session import AbstractSession, _new_session, ensure_isolation_created + +logger = logging.getLogger(__name__) + +_is_exiting_future = SyncFuture() +atexit.register( + lambda: _is_exiting_future.set_result(0) if not _is_exiting_future.done() else None +) + +# The default config file. +DEFAULT_CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "config.yml" +) + +# the default times to retry subtask. +DEFAULT_SUBTASK_MAX_RETRIES = 3 +# the default time to cancel a subtask. 
+DEFAULT_SUBTASK_CANCEL_TIMEOUT = 5 + + +def _load_config(config: Union[str, Dict] = None): + return load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + + +async def new_cluster_in_isolation( + address: str = "0.0.0.0", + n_worker: int = 1, + n_cpu: Union[int, str] = "auto", + mem_bytes: Union[int, str] = "auto", + cuda_devices: Union[List[int], str] = "auto", + subprocess_start_method: str = None, + backend: str = None, + config: Union[str, Dict] = None, + web: bool = True, + timeout: float = None, + n_supervisor_process: int = 0, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: bool = None, + oscar_extra_conf: dict = None, + log_config: dict = None, +) -> ClientType: + cluster = LocalCluster( + address, + n_worker, + n_cpu, + mem_bytes, + cuda_devices, + subprocess_start_method, + backend, + config, + web, + n_supervisor_process, + numa_external_addr_scheme=numa_external_addr_scheme, + numa_enable_internal_addr=numa_enable_internal_addr, + gpu_external_addr_scheme=gpu_external_addr_scheme, + gpu_enable_internal_addr=gpu_enable_internal_addr, + external_addr_scheme=external_addr_scheme, + enable_internal_addr=enable_internal_addr, + oscar_extra_conf=oscar_extra_conf, + log_config=log_config, + ) + await cluster.start() + return await LocalClient.create(cluster, timeout) + + +async def new_cluster( + address: str = "0.0.0.0", + n_worker: int = 1, + n_cpu: Union[int, str] = "auto", + mem_bytes: Union[int, str] = "auto", + cuda_devices: Union[List[int], str] = "auto", + subprocess_start_method: str = None, + backend: str = None, + config: Union[str, Dict] = None, + web: bool = True, + loop: asyncio.AbstractEventLoop = None, + use_uvloop: Union[bool, str] = "auto", + n_supervisor_process: int = 0, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: bool = None, + oscar_extra_conf: dict = None, +) -> ClientType: + coro = new_cluster_in_isolation( + address, + n_worker=n_worker, + n_cpu=n_cpu, + mem_bytes=mem_bytes, + cuda_devices=cuda_devices, + subprocess_start_method=subprocess_start_method, + backend=backend, + config=config, + web=web, + n_supervisor_process=n_supervisor_process, + numa_external_addr_scheme=numa_external_addr_scheme, + numa_enable_internal_addr=numa_enable_internal_addr, + gpu_external_addr_scheme=gpu_external_addr_scheme, + gpu_enable_internal_addr=gpu_enable_internal_addr, + external_addr_scheme=external_addr_scheme, + enable_internal_addr=enable_internal_addr, + oscar_extra_conf=oscar_extra_conf, + ) + isolation = ensure_isolation_created(dict(loop=loop, use_uvloop=use_uvloop)) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + client = await asyncio.wrap_future(fut) + client.session.as_default() + return client + + +async def stop_cluster(cluster: ClusterType): + isolation = get_isolation() + coro = cluster.stop() + await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(coro, isolation.loop)) + Router.set_instance(None) + + +class LocalCluster: + def __init__( + self: ClusterType, + address: str = "0.0.0.0", + n_worker: int = 1, + n_cpu: Union[int, str] = "auto", + mem_bytes: Union[int, str] = "auto", + cuda_devices: Union[List[int], List[List[int]], str] = "auto", + subprocess_start_method: str = 
None, + backend: str = None, + config: Union[str, Dict] = None, + web: Union[bool, str] = "auto", + n_supervisor_process: int = 0, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: str = None, + oscar_extra_conf: dict = None, + log_config: dict = None, + ): + # load third party extensions. + init_extension_entrypoints() + # auto choose the subprocess_start_method. + if subprocess_start_method is None: + subprocess_start_method = ( + "spawn" if sys.platform == "win32" else "forkserver" + ) + self._address = address + self._n_worker = n_worker + self._n_cpu = cpu_count() if n_cpu == "auto" else n_cpu + self._mem_bytes = mem_total() if mem_bytes == "auto" else mem_bytes + self._cuda_devices = self._get_cuda_devices(cuda_devices, n_worker) + self._subprocess_start_method = subprocess_start_method + self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + execution_config = ExecutionConfig.from_config(self._config, backend=backend) + self._log_config = log_config + self._backend = execution_config.backend + self._web = web + self._n_supervisor_process = n_supervisor_process + + execution_config.merge_from( + ExecutionConfig.from_params( + backend=self._backend, + n_worker=self._n_worker, + n_cpu=self._n_cpu, + mem_bytes=self._mem_bytes, + cuda_devices=self._cuda_devices, + subtask_cancel_timeout=self._config.get("scheduling", {}).get( + "subtask_cancel_timeout", DEFAULT_SUBTASK_CANCEL_TIMEOUT + ), + subtask_max_retries=self._config.get("scheduling", {}).get( + "subtask_max_retries", DEFAULT_SUBTASK_MAX_RETRIES + ), + ) + ) + + # process oscar config + self._process_oscar_config( + numa_external_addr_scheme=numa_external_addr_scheme, + numa_enable_internal_addr=numa_enable_internal_addr, + gpu_external_addr_scheme=gpu_external_addr_scheme, + gpu_enable_internal_addr=gpu_enable_internal_addr, + external_addr_scheme=external_addr_scheme, + enable_internal_addr=enable_internal_addr, + oscar_extra_conf=oscar_extra_conf, + ) + + self._bands_to_resource = execution_config.get_deploy_band_resources() + self._supervisor_pool = None + self._worker_pools = [] + self._exiting_check_task = None + + self.supervisor_address = None + self.web_address = None + + def _process_oscar_config( + self, + numa_external_addr_scheme: str = None, + numa_enable_internal_addr: bool = None, + gpu_external_addr_scheme: str = None, + gpu_enable_internal_addr: bool = None, + external_addr_scheme: str = None, + enable_internal_addr: str = None, + oscar_extra_conf: dict = None, + ): + # process oscar config + assert "oscar" in self._config + oscar_config = self._config["oscar"] + numa_config = oscar_config["numa"] + numa_external_addr_scheme = ( + numa_external_addr_scheme + if numa_external_addr_scheme is not None + else external_addr_scheme + ) + if numa_external_addr_scheme: + numa_config["external_addr_scheme"] = numa_external_addr_scheme + numa_enable_internal_addr = ( + numa_enable_internal_addr + if numa_enable_internal_addr is not None + else enable_internal_addr + ) + if numa_enable_internal_addr is not None: + numa_config["enable_internal_addr"] = numa_enable_internal_addr + gpu_config = oscar_config["gpu"] + gpu_external_addr_scheme = ( + gpu_external_addr_scheme + if gpu_external_addr_scheme is not None + else external_addr_scheme + ) + if gpu_external_addr_scheme: + gpu_config["external_addr_scheme"] = 
gpu_external_addr_scheme + gpu_enable_internal_addr = ( + gpu_enable_internal_addr + if gpu_enable_internal_addr is not None + else enable_internal_addr + ) + if gpu_enable_internal_addr is not None: + gpu_config["enable_internal_addr"] = gpu_enable_internal_addr + if oscar_extra_conf is not None: + oscar_config["extra_conf"] = oscar_extra_conf + + @staticmethod + def _get_cuda_devices(cuda_devices, n_worker): + if cuda_devices == "auto": + total = cuda_count() + all_devices = np.arange(total) + return [list(arr) for arr in np.array_split(all_devices, n_worker)] + + else: + if not cuda_devices: + return [] + elif isinstance(cuda_devices[0], int): + assert n_worker == 1 + return [cuda_devices] + else: + assert len(cuda_devices) == n_worker + return cuda_devices + + @property + def backend(self): + return self._backend + + @property + def external_address(self): + return self._supervisor_pool.external_address + + async def start(self): + await self._start_supervisor_pool() + await self._start_worker_pools() + # start service + await self._start_service() + + # init metrics to guarantee metrics use in driver + metric_configs = self._config.get("metrics", {}) + metric_backend = metric_configs.get("backend") + init_metrics(metric_backend, config=metric_configs.get(metric_backend)) + + if self._web: + from ...services.web.supervisor import WebActor + + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=self.supervisor_address + ) + self.web_address = await web_actor.get_web_address() + logger.warning("Web service started at %s", self.web_address) + + self._exiting_check_task = asyncio.create_task(self._check_exiting()) + + async def _check_exiting(self): + await asyncio.wrap_future(_is_exiting_future) + await self.stop() + + async def _start_supervisor_pool(self): + supervisor_modules = get_third_party_modules_from_config( + self._config, NodeRole.SUPERVISOR + ) + self._supervisor_pool = await create_supervisor_actor_pool( + self._address, + n_process=self._n_supervisor_process, + modules=supervisor_modules, + subprocess_start_method=self._subprocess_start_method, + metrics=self._config.get("metrics", {}), + web=self._web, + # passing logging conf to config logging when create pools + logging_conf=self._log_config, + oscar_config=self._config.get("oscar"), + ) + self.supervisor_address = self._supervisor_pool.external_address + + async def _start_worker_pools(self): + worker_modules = get_third_party_modules_from_config( + self._config, NodeRole.WORKER + ) + for band_to_resource in self._bands_to_resource: + worker_pool = await create_worker_actor_pool( + self._address, + band_to_resource, + modules=worker_modules, + subprocess_start_method=self._subprocess_start_method, + metrics=self._config.get("metrics", {}), + web=self._web, + # passing logging conf to config logging when create pools + logging_conf=self._log_config, + oscar_config=self._config.get("oscar"), + ) + self._worker_pools.append(worker_pool) + + async def _start_service(self): + self._web = await start_supervisor( + self.supervisor_address, config=self._config, web=self._web + ) + for worker_pool, band_to_resource in zip( + self._worker_pools, self._bands_to_resource + ): + await start_worker( + worker_pool.external_address, + self.supervisor_address, + band_to_resource, + config=self._config, + ) + + async def stop(self): + from .session import SessionAPI + + # delete all sessions + session_api = await SessionAPI.create(self._supervisor_pool.external_address) + await session_api.delete_all_sessions() + + for 
worker_pool in self._worker_pools: + await stop_worker(worker_pool.external_address, self._config) + await stop_supervisor(self._supervisor_pool.external_address, self._config) + for worker_pool in self._worker_pools: + await worker_pool.stop() + await self._supervisor_pool.stop() + AbstractSession.reset_default() + self._exiting_check_task.cancel() + Router.set_instance(None) + + +class LocalClient: + def __init__(self: ClientType, cluster: ClusterType, session: AbstractSession): + self._cluster = cluster + self.session = session + + @classmethod + async def create( + cls, + cluster: LocalCluster, + timeout: float = None, + ) -> ClientType: + session = await _new_session( + cluster.external_address, + backend=cluster.backend, + default=True, + timeout=timeout, + ) + client = LocalClient(cluster, session) + session.client = client + return client + + @property + def web_address(self): + return self._cluster.web_address + + async def __aenter__(self): + return self + + async def __aexit__(self, *_): + await self.stop() + + async def stop(self): + await stop_cluster(self._cluster) diff --git a/python/xorbits/_mars/deploy/oscar/pool.py b/python/xorbits/_mars/deploy/oscar/pool.py new file mode 100644 index 000000000..67cfb0f7c --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/pool.py @@ -0,0 +1,271 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import configparser +import logging +import os +import sys +import tempfile +from typing import Dict, List, Optional, Tuple + +from ... import oscar as mo +from ...constants import MARS_LOG_PATH_KEY, MARS_LOG_PREFIX, MARS_TMP_DIR_PREFIX +from ...resource import Resource, cuda_count + +logger = logging.getLogger(__name__) + + +def _need_suspend_sigint() -> bool: + try: + from IPython import get_ipython + + return get_ipython() is not None + except ImportError: + return False + + +def _get_root_logger_level_and_format() -> Tuple[str, Optional[str]]: + root = logging.getLogger() + level = logging.getLevelName(root.getEffectiveLevel()) + if level.startswith("WARN"): + level = "WARN" + handler = root.handlers[0] if root.handlers else None + fmt = handler.formatter._fmt if handler else None + return level, fmt + + +def _parse_file_logging_config( + file_path: str, + log_path: str, + level: Optional[str], + formatter: Optional[str] = None, + from_cmd: bool = False, +) -> configparser.RawConfigParser: + """ + If env is ipython (from_cmd=False), the log level and format on the web follow our default configuration file, + and the level and format on the console use the user's configuration (logging.basicConfig) or keep the default. + + If env is cmd (from_cmd=True, e.g. user invokes `python -m mars.worker`), + the log level and format on the web and console follow user's config (--log-level and --log-format) + or our default configuration file. 
+ """ + config = configparser.RawConfigParser() + config.read(file_path) + logger_sections = [ + "logger_root", + "logger_main", + "logger_deploy", + "logger_oscar", + "logger_services", + "logger_dataframe", + "logger_learn", + "logger_tensor", + "handler_stream_handler", + "handler_file_handler", + ] + all_sections = config.sections() + for section in logger_sections: + if level and section in all_sections: + config[section]["level"] = level.upper() + + if "handler_file_handler" in config: + if sys.platform.startswith("win"): + log_path = log_path.replace("\\", "/") + config["handler_file_handler"]["args"] = rf"('{log_path}',)" + if formatter: + format_section = "formatter_formatter" + config[format_section]["format"] = formatter + + stream_handler_sec = "handler_stream_handler" + file_handler_sec = "handler_file_handler" + root_sec = "logger_root" + # If not from cmd (like ipython) and user uses its own config file, + # need to judge that whether handler_stream_handler section is in the config. + if not from_cmd and stream_handler_sec in all_sections: + # console and web log keeps the default config as root logger + root_level, root_fmt = _get_root_logger_level_and_format() + config[file_handler_sec]["level"] = root_level or "WARN" + config[stream_handler_sec]["level"] = root_level or "WARN" + config[root_sec]["level"] = root_level or "WARN" + if root_fmt: + config.add_section("formatter_console") + config["formatter_console"]["format"] = root_fmt + config["formatters"]["keys"] += ",console" + config[stream_handler_sec]["formatter"] = "console" + return config + + +def _config_logging(**kwargs) -> Optional[configparser.RawConfigParser]: + web: bool = kwargs.get("web", True) + # web=False usually means it is a test environment. + if not web: + return + if kwargs.get("logging_conf", None) is None: + return + config = kwargs["logging_conf"] + from_cmd = config.get("from_cmd", False) + log_dir = config.get("log_dir", None) + log_conf_file = config.get("file", None) + level = config.get("level", None) + formatter = config.get("formatter", None) + logging_config_path = log_conf_file or os.path.join( + os.path.dirname(os.path.abspath(__file__)), "file-logging.conf" + ) + # default config, then create a temp file + if (os.environ.get(MARS_LOG_PATH_KEY, None)) is None or ( + not os.path.exists(os.environ[MARS_LOG_PATH_KEY]) + ): + if log_dir is None: + mars_tmp_dir = tempfile.mkdtemp(prefix=MARS_TMP_DIR_PREFIX) + else: + mars_tmp_dir = os.path.join(log_dir, MARS_TMP_DIR_PREFIX) + os.makedirs(mars_tmp_dir, exist_ok=True) + _, file_path = tempfile.mkstemp(prefix=MARS_LOG_PREFIX, dir=mars_tmp_dir) + os.environ[MARS_LOG_PATH_KEY] = file_path + logging_conf = _parse_file_logging_config( + logging_config_path, file_path, level, formatter, from_cmd + ) + # bind user's level and format when using default log conf + logging.config.fileConfig( + logging_conf, + disable_existing_loggers=False, + ) + logger.debug("Use logging config file at %s", logging_config_path) + return logging_conf + else: + logging_conf = _parse_file_logging_config( + logging_config_path, + os.environ[MARS_LOG_PATH_KEY], + level, + formatter, + from_cmd, + ) + logging.config.fileConfig( + logging_conf, + os.environ[MARS_LOG_PATH_KEY], + disable_existing_loggers=False, + ) + logger.debug("Use logging config file at %s", logging_config_path) + return logging_conf + + +async def create_supervisor_actor_pool( + address: str, + n_process: int, + modules: List[str] = None, + ports: List[int] = None, + subprocess_start_method: str = None, + 
oscar_config: dict = None, + **kwargs, +): + logging_conf = _config_logging(**kwargs) + kwargs["logging_conf"] = logging_conf + if oscar_config: + numa_config = oscar_config.get("numa", dict()) + numa_external_address_scheme = numa_config.get("external_addr_scheme", None) + numa_enable_internal_address = numa_config.get("enable_internal_addr", True) + external_address_schemes = [numa_external_address_scheme] * (n_process + 1) + enable_internal_addresses = [numa_enable_internal_address] * (n_process + 1) + extra_conf = oscar_config["extra_conf"] + else: + external_address_schemes = enable_internal_addresses = extra_conf = None + return await mo.create_actor_pool( + address, + n_process=n_process, + ports=ports, + external_address_schemes=external_address_schemes, + enable_internal_addresses=enable_internal_addresses, + modules=modules, + subprocess_start_method=subprocess_start_method, + suspend_sigint=_need_suspend_sigint(), + extra_conf=extra_conf, + **kwargs, + ) + + +async def create_worker_actor_pool( + address: str, + band_to_resource: Dict[str, Resource], + n_io_process: int = 1, + modules: List[str] = None, + ports: List[int] = None, + cuda_devices: List[int] = None, + subprocess_start_method: str = None, + oscar_config: dict = None, + **kwargs, +): + logging_conf = _config_logging(**kwargs) + kwargs["logging_conf"] = logging_conf + # TODO: support NUMA when ready + n_process = sum( + int(resource.num_cpus) or int(resource.num_gpus) + for resource in band_to_resource.values() + ) + envs = [] + labels = ["main"] + + oscar_config = oscar_config or dict() + numa_config = oscar_config.get("numa", dict()) + numa_external_address_scheme = numa_config.get("external_addr_scheme") + numa_enable_internal_address = numa_config.get("enable_internal_addr") + gpu_config = oscar_config.get("gpu", dict()) + gpu_external_address_scheme = gpu_config.get("external_addr_scheme") + gpu_enable_internal_address = gpu_config.get("enable_internal_addr") + extra_conf = oscar_config.get("extra_conf", dict()) + + if cuda_devices is None: # pragma: no cover + env_devices = os.environ.get("CUDA_VISIBLE_DEVICES") + if not env_devices: + cuda_devices = list(range(cuda_count())) + else: + cuda_devices = [int(i) for i in env_devices.split(",")] + + external_address_schemes = [numa_external_address_scheme] + enable_internal_addresses = [numa_enable_internal_address] + i_gpu = iter(sorted(cuda_devices)) + for band, resource in band_to_resource.items(): + if band.startswith("gpu"): + idx = str(next(i_gpu)) + envs.append({"CUDA_VISIBLE_DEVICES": idx}) + labels.append(f"gpu-{idx}") + external_address_schemes.append(gpu_external_address_scheme) + enable_internal_addresses.append(gpu_enable_internal_address) + else: + assert band.startswith("numa") + num_cpus = int(resource.num_cpus) + if cuda_devices: + # if has cuda device, disable all cuda devices for numa processes + envs.extend([{"CUDA_VISIBLE_DEVICES": "-1"} for _ in range(num_cpus)]) + labels.extend([band] * num_cpus) + external_address_schemes.extend( + [numa_external_address_scheme for _ in range(num_cpus)] + ) + enable_internal_addresses.extend( + [numa_enable_internal_address for _ in range(num_cpus)] + ) + + return await mo.create_actor_pool( + address, + n_process=n_process, + ports=ports, + n_io_process=n_io_process, + labels=labels, + envs=envs, + modules=modules, + subprocess_start_method=subprocess_start_method, + suspend_sigint=_need_suspend_sigint(), + external_address_schemes=external_address_schemes, + 
enable_internal_addresses=enable_internal_addresses, + extra_conf=extra_conf, + **kwargs, + ) diff --git a/python/xorbits/_mars/deploy/oscar/ray.py b/python/xorbits/_mars/deploy/oscar/ray.py new file mode 100644 index 000000000..1dbcabba8 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/ray.py @@ -0,0 +1,680 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +import logging +import os +import time +from typing import AsyncGenerator, Dict, List, Optional, Union + +from ... import oscar as mo +from ...core.entrypoints import init_extension_entrypoints +from ...metrics import init_metrics +from ...oscar.backends.ray.driver import RayActorDriver +from ...oscar.backends.ray.pool import RayPoolState +from ...oscar.backends.ray.utils import ( + node_placement_to_address, + process_address_to_placement, + process_placement_to_address, +) +from ...oscar.backends.router import Router +from ...oscar.errors import ReconstructWorkerError +from ...resource import Resource +from ...services import NodeRole +from ...services.cluster.backends.base import ( + AbstractClusterBackend, + register_cluster_backend, +) +from ...services.task.execution.api import ExecutionConfig +from ...utils import lazy_import, retry_callable +from ..utils import get_third_party_modules_from_config, load_config +from .pool import create_supervisor_actor_pool, create_worker_actor_pool +from .service import start_supervisor, start_worker, stop_supervisor, stop_worker +from .session import ( + AbstractSession, + _new_session, + ensure_isolation_created, + new_session, +) + +ray = lazy_import("ray") +logger = logging.getLogger(__name__) + +# The default config file. +DEFAULT_CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "rayconfig.yml" +) +# The default value for supervisor standalone (not share node with worker). +DEFAULT_SUPERVISOR_STANDALONE = False +# The default value for supervisor sub pool count. 
+DEFAULT_SUPERVISOR_SUB_POOL_NUM = 0 + + +def _load_config(config: Union[str, Dict] = None): + return load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + + +@register_cluster_backend +class RayClusterBackend(AbstractClusterBackend): + name = "ray" + + def __init__(self, lookup_address: str, cluster_state_ref): + self._supervisors = [n.strip() for n in lookup_address.split(",")] + self._cluster_state_ref = cluster_state_ref + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: str, pool_address: str + ) -> "RayClusterBackend": + try: + ref = await mo.create_actor( + ClusterStateActor, + uid=ClusterStateActor.default_uid(), + address=lookup_address, + ) + except mo.ActorAlreadyExist: # pragma: no cover + ref = await mo.actor_ref( + ClusterStateActor.default_uid(), address=lookup_address + ) + return cls(lookup_address, ref) + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + yield self._supervisors + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + return self._supervisors + + async def new_worker(self, worker_address): + return await self._cluster_state_ref.new_worker(worker_address) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + return await self._cluster_state_ref.request_worker( + worker_cpu, worker_mem, timeout + ) + + async def release_worker(self, address: str): + return await self._cluster_state_ref.release_worker(address) + + async def reconstruct_worker(self, address: str): + return await self._cluster_state_ref.reconstruct_worker(address) + + def get_cluster_state_ref(self): + return self._cluster_state_ref + + +class ClusterStateActor(mo.StatelessActor): + def __init__(self): + self._worker_cpu, self._worker_mem, self._config = None, None, None + self._pg_name, self._band_to_resource, self._worker_modules = None, None, None + self._pg_counter = itertools.count() + self._worker_count = 0 + self._workers = {} + self._releasing_tasks = {} + self._reconstructing_tasks = {} + + async def __post_create__(self): + self._pg_name, _, _ = process_address_to_placement(self.address) + + def set_config(self, worker_cpu, worker_mem, config): + self._worker_cpu, self._worker_mem, self._config = ( + worker_cpu, + worker_mem, + config, + ) + # TODO(chaokunyang) Support gpu + self._band_to_resource = { + "numa-0": Resource(num_cpus=self._worker_cpu, mem_bytes=self._worker_mem) + } + self._worker_modules = get_third_party_modules_from_config( + self._config, NodeRole.WORKER + ) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> Optional[str]: + worker_cpu = worker_cpu or self._worker_cpu + worker_mem = worker_mem or self._worker_mem + bundle = { + "CPU": worker_cpu, + # "memory": worker_mem or self._worker_mem + } + band_to_resource = { + "numa-0": Resource(num_cpus=worker_cpu, mem_bytes=worker_mem) + } + start_time = time.time() + logger.info("Start to request worker with resource %s.", bundle) + # TODO rescale ray placement group instead of creating new placement group + pg_name = f"{self._pg_name}_{next(self._pg_counter)}" + pg = ray.util.placement_group(name=pg_name, bundles=[bundle], strategy="SPREAD") + create_pg_timeout = timeout or 120 + try: + await asyncio.wait_for(pg.ready(), timeout=create_pg_timeout) + except asyncio.CancelledError: # pragma: no cover + logger.warning( + "Request worker with placement group %s in %s seconds canceled.", + pg.bundle_specs, + 
create_pg_timeout, + ) + ray.util.remove_placement_group(pg) + return None + except asyncio.TimeoutError: + logger.warning( + "Request worker failed, " + "can not create placement group %s in %s seconds.", + pg.bundle_specs, + create_pg_timeout, + ) + ray.util.remove_placement_group(pg) + return None + logger.info( + "Creating placement group %s took %.4f seconds", + pg.bundle_specs, + time.time() - start_time, + ) + worker_address = process_placement_to_address(pg_name, 0, 0) + worker_pool = await self.create_worker(worker_address) + await self.start_worker(worker_address, band_to_resource=band_to_resource) + logger.info( + "Request worker %s succeeds in %.4f seconds", + worker_address, + time.time() - start_time, + ) + self._workers[worker_address] = (worker_pool, pg) + return worker_address + + async def create_worker(self, worker_address): + start_time = time.time() + worker_pool = await create_worker_actor_pool( + worker_address, + self._band_to_resource, + modules=self._worker_modules, + metrics=self._config.get("metrics", {}), + ) + logger.info( + "Create worker node %s succeeds in %.4f seconds.", + worker_address, + time.time() - start_time, + ) + return worker_pool + + async def start_worker(self, worker_address, band_to_resource=None): + self._worker_count += 1 + start_time = time.time() + band_to_resource = band_to_resource or self._band_to_resource + await start_worker( + worker_address, self.address, band_to_resource, config=self._config + ) + worker_pool = ray.get_actor(worker_address) + await worker_pool.mark_service_ready.remote() + logger.info( + "Start services on worker %s succeeds in %.4f seconds.", + worker_address, + time.time() - start_time, + ) + return worker_pool + + async def release_worker(self, address: str): + logger.info("Start to release worker %s", address) + task = self._reconstructing_tasks.get(address) + if task is not None: + task.cancel() + + task = self._releasing_tasks.get(address) + if task is not None: + logger.info("Waiting for releasing worker %s", address) + return await task + + async def _release_worker(): + await stop_worker(address, self._config) + pool, pg = self._workers.pop(address) + await pool.actor_pool.remote("stop") + if "COV_CORE_SOURCE" in os.environ: # pragma: no cover + try: + # must clean up first, or coverage info lost + await pool.cleanup.remote() + except: # noqa: E722 # nosec # pylint: disable=bare-except + pass + ray.kill(pool.main_pool) + ray.util.remove_placement_group(pg) + logger.info("Released worker %s", address) + + task = asyncio.create_task(_release_worker()) + task.add_done_callback(lambda _: self._releasing_tasks.pop(address, None)) + self._releasing_tasks[address] = task + return await task + + async def reconstruct_worker(self, address: str): + task = self._releasing_tasks.get(address) + if task is not None: + raise ReconstructWorkerError( + f"Can't reconstruct releasing worker {address}" + ) + + task = self._reconstructing_tasks.get(address) + if task is not None: + logger.info("Waiting for reconstruct worker %s", address) + return await task + + async def _reconstruct_worker(): + logger.info("Reconstruct worker %s", address) + actor = ray.get_actor(address) + # ray call will error when actor is restarting + state = await retry_callable( + actor.state.remote, ex_type=ray.exceptions.RayActorError, sync=False + )() + if state == RayPoolState.SERVICE_READY: + logger.info("Worker %s is service ready.") + return + + if state == RayPoolState.INIT: + await actor.start.remote() + else: + assert state == 
RayPoolState.POOL_READY + + start_time = time.time() + await start_worker( + address, self.address, self._band_to_resource, config=self._config + ) + await actor.mark_service_ready.remote() + logger.info( + "Start services on worker %s succeeds in %.4f seconds.", + address, + time.time() - start_time, + ) + + task = asyncio.create_task(_reconstruct_worker()) + task.add_done_callback(lambda _: self._reconstructing_tasks.pop(address, None)) + self._reconstructing_tasks[address] = task + return await task + + +async def new_cluster( + cluster_name: str = None, + supervisor_cpu: int = 1, + supervisor_mem: int = 1 * 1024**3, + worker_num: int = 1, + worker_cpu: int = 2, + worker_mem: int = 2 * 1024**3, + backend: str = None, + config: Union[str, Dict] = None, + **kwargs, +): + cluster_name = cluster_name or f"ray-cluster-{int(time.time())}" + if not ray.is_initialized(): + logger.warning("Ray is not started, start the local ray cluster by `ray.init`.") + # add 16 logical cpus for other computing in ray. + ray.init(num_cpus=16 + worker_num * worker_cpu) + ensure_isolation_created(kwargs) + if kwargs: # pragma: no cover + raise TypeError(f"new_cluster got unexpected arguments: {list(kwargs)}") + n_supervisor_process = kwargs.get( + "n_supervisor_process", DEFAULT_SUPERVISOR_SUB_POOL_NUM + ) + cluster = RayCluster( + cluster_name, + supervisor_cpu, + supervisor_mem, + worker_num, + worker_cpu, + worker_mem, + backend, + config, + n_supervisor_process=n_supervisor_process, + ) + try: + await cluster.start() + return await RayClient.create(cluster) + except Exception as ex: # pragma: no cover + # cleanup the cluster if failed. + try: + await cluster.stop() + except Exception as stop_ex: + raise stop_ex from ex + raise ex + + +def new_cluster_in_ray(**kwargs): + isolation = ensure_isolation_created(kwargs) + coro = new_cluster(**kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + client = fut.result() + client.session.as_default() + return client + + +new_cluster_in_ray.__doc__ = new_cluster.__doc__ + + +def new_ray_session( + address: str = None, + session_id: str = None, + backend: str = "mars", + default: bool = True, + **new_cluster_kwargs, +) -> AbstractSession: + """ + + Parameters + ---------- + address: str + mars web server address. + session_id: str + session id. If not specified, will be generated automatically. + backend: str + The executor backend. Available values are "mars" and "ray", default is "mars". + default: bool + whether set the session as default session. + new_cluster_kwargs: + See `new_cluster` arguments. + """ + client = None + if not address: + client = new_cluster_in_ray(backend=backend, **new_cluster_kwargs) + session_id = session_id or client.session.session_id + address = client.address + session = new_session( + address=address, session_id=session_id, backend=backend, default=default + ) + session.client = client + if default: + # SyncSession set isolated_session as default session instead. + AbstractSession.default.client = client + return session + + +class RayCluster: + _supervisor_pool: "ray.actor.ActorHandle" + _worker_pools: List["ray.actor.ActorHandle"] + + def __init__( + self, + cluster_name: str, + supervisor_cpu: Union[int, float] = 1, + supervisor_mem: int = 1 * 1024**3, + worker_num: int = 1, + worker_cpu: Union[int, float] = 2, + worker_mem: int = 2 * 1024**3, + backend: str = None, + config: Union[str, Dict] = None, + n_supervisor_process: int = DEFAULT_SUPERVISOR_SUB_POOL_NUM, + ): + # load third party extensions. 
+ init_extension_entrypoints() + self._cluster_name = cluster_name + self._supervisor_cpu = supervisor_cpu + self._supervisor_mem = supervisor_mem + self._n_supervisor_process = n_supervisor_process + self._worker_num = worker_num + self._worker_cpu = worker_cpu + self._worker_mem = worker_mem + self.backend = backend + # load config file to dict. + self._config = load_config(config, default_config_file=DEFAULT_CONFIG_FILE) + self.supervisor_address = None + # Hold actor handles to avoid being freed + self._supervisor_pool = None + self._worker_addresses = [] + self._worker_pools = [] + self._stopped = False + self._cluster_backend = None + self.web_address = None + + async def start(self): + try: + # Python 3.8 support force argument. + logging.basicConfig( + format=ray.ray_constants.LOGGER_FORMAT, level=logging.INFO, force=True + ) + except ValueError: # pragma: no cover + logging.basicConfig( + format=ray.ray_constants.LOGGER_FORMAT, level=logging.INFO + ) + execution_config = ExecutionConfig.from_config( + self._config, backend=self.backend + ) + self.backend = execution_config.backend + if self.backend == "mars": + await self.start_oscar( + self._n_supervisor_process, + self._supervisor_cpu, + self._supervisor_mem, + self._worker_num, + self._worker_cpu, + self._worker_mem, + ) + elif self.backend == "ray": + execution_config.merge_from( + ExecutionConfig.from_params( + backend=self.backend, + n_worker=self._worker_num, + n_cpu=self._worker_num * self._worker_cpu, + mem_bytes=self._worker_mem, + subtask_num_cpus=self._worker_cpu, + subtask_memory=self._worker_mem, + ) + ) + assert self._n_supervisor_process == 0, self._n_supervisor_process + await self.start_oscar( + self._n_supervisor_process, + self._supervisor_cpu, + self._supervisor_mem, + 0, + 0, + 0, + ) + else: + raise ValueError(f"Unsupported backend type: {self.backend}.") + + async def start_oscar( + self, + n_supervisor_process, + supervisor_cpu, + supervisor_mem, + worker_num, + worker_cpu, + worker_mem, + ): + logger.info("Start cluster with config %s", self._config) + # init metrics to guarantee metrics use in driver + metric_configs = self._config.get("metrics", {}) + metric_backend = metric_configs.get("backend") + init_metrics(metric_backend, config=metric_configs.get(metric_backend)) + address_to_resources = dict() + supervisor_standalone = ( + self._config.get("cluster", {}) + .get("ray", {}) + .get("supervisor", {}) + .get("standalone", DEFAULT_SUPERVISOR_STANDALONE) + ) + supervisor_sub_pool_num = ( + self._config.get("cluster", {}) + .get("ray", {}) + .get("supervisor", {}) + .get("sub_pool_num", n_supervisor_process) + ) + from ...storage.ray import support_specify_owner + + if not support_specify_owner(): # pragma: no cover + logger.warning( + "Current installed ray version does not support specify owner, " + "autoscale may not work." 
+ ) + # config['scheduling']['autoscale']['enabled'] = False + self.supervisor_address = process_placement_to_address(self._cluster_name, 0, 0) + if "cluster" not in self._config: # pragma: no cover + self._config["cluster"] = dict() + self._config["cluster"]["lookup_address"] = self.supervisor_address + address_to_resources[node_placement_to_address(self._cluster_name, 0)] = { + "CPU": supervisor_cpu, + # "memory": supervisor_mem, + } + worker_addresses = [] + if supervisor_standalone or worker_num == 0: + for worker_index in range(1, worker_num + 1): + worker_address = process_placement_to_address( + self._cluster_name, worker_index, 0 + ) + worker_addresses.append(worker_address) + worker_node_address = node_placement_to_address( + self._cluster_name, worker_index + ) + address_to_resources[worker_node_address] = { + "CPU": worker_cpu, + # "memory": self._worker_mem, + } + else: + for worker_index in range(worker_num): + worker_process_index = ( + supervisor_sub_pool_num + 1 if worker_index == 0 else 0 + ) + worker_address = process_placement_to_address( + self._cluster_name, worker_index, worker_process_index + ) + worker_addresses.append(worker_address) + worker_node_address = node_placement_to_address( + self._cluster_name, worker_index + ) + address_to_resources[worker_node_address] = { + "CPU": worker_cpu, + # "memory": self._worker_mem, + } + mo.setup_cluster(address_to_resources) + + # third party modules from config + supervisor_modules = get_third_party_modules_from_config( + self._config, NodeRole.SUPERVISOR + ) + + # set global router an empty one. + Router.set_instance(Router(list(), None)) + + # create supervisor actor pool + supervisor_pool_coro = asyncio.create_task( + create_supervisor_actor_pool( + self.supervisor_address, + n_process=supervisor_sub_pool_num, + main_pool_cpus=0, + sub_pool_cpus=0, + modules=supervisor_modules, + metrics=self._config.get("metrics", {}), + ) + ) + worker_pools = [ + asyncio.create_task( + create_worker_actor_pool( + addr, + { + "numa-0": Resource( + num_cpus=worker_cpu, mem_bytes=self._worker_mem + ) + }, + modules=get_third_party_modules_from_config( + self._config, NodeRole.WORKER + ), + metrics=self._config.get("metrics", {}), + ) + ) + for addr in worker_addresses + ] + self._supervisor_pool = await supervisor_pool_coro + logger.info("Create supervisor on node %s succeeds.", self.supervisor_address) + self._cluster_backend = await RayClusterBackend.create( + NodeRole.WORKER, self.supervisor_address, self.supervisor_address + ) + cluster_state_ref = self._cluster_backend.get_cluster_state_ref() + await self._cluster_backend.get_cluster_state_ref().set_config( + worker_cpu, self._worker_mem, self._config + ) + # start service + await start_supervisor(self.supervisor_address, config=self._config) + logger.info( + "Start services on supervisor %s succeeds.", self.supervisor_address + ) + await self._supervisor_pool.mark_service_ready.remote() + worker_pools = await asyncio.gather(*worker_pools) + logger.info("Create %s workers succeeds.", len(worker_pools)) + await asyncio.gather( + *[cluster_state_ref.start_worker(addr) for addr in worker_addresses] + ) + logger.info("Start services on %s workers succeeds.", len(worker_addresses)) + for worker_address, worker_pool in zip(worker_addresses, worker_pools): + self._worker_addresses.append(worker_address) + self._worker_pools.append(worker_pool) + + from ...services.web.supervisor import WebActor + + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=self.supervisor_address + ) 
+ self.web_address = await web_actor.get_web_address() + logger.warning("Web service started at %s", self.web_address) + + async def stop(self): + if not self._stopped: + try: + for worker_address in self._worker_addresses: + await stop_worker(worker_address, self._config) + for pool in self._worker_pools: + await pool.actor_pool.remote("stop") + if self._supervisor_pool is not None: + await stop_supervisor(self.supervisor_address, self._config) + await self._supervisor_pool.actor_pool.remote("stop") + finally: + AbstractSession.reset_default() + RayActorDriver.stop_cluster() + Router.set_instance(None) + self._stopped = True + + +class RayClient: + def __init__(self, cluster: RayCluster, session: AbstractSession): + self._cluster = cluster + self._address = cluster.supervisor_address + self._session = session + # hold ray cluster by client to avoid actor handle out-of-scope + session.client = self + + @classmethod + async def create(cls, cluster: RayCluster) -> "RayClient": + session = await _new_session( + cluster.supervisor_address, default=True, backend=cluster.backend + ) + client = RayClient(cluster, session) + AbstractSession.default.client = client + return client + + @property + def address(self): + return self._session.address + + @property + def session(self): + return self._session + + @property + def web_address(self): + return self._cluster.web_address + + async def __aenter__(self): + return self + + async def __aexit__(self, *_): + await self.stop() + + async def stop(self): + await self._cluster.stop() + AbstractSession.reset_default() diff --git a/python/xorbits/_mars/deploy/oscar/rayconfig.yml b/python/xorbits/_mars/deploy/oscar/rayconfig.yml new file mode 100644 index 000000000..6530466af --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/rayconfig.yml @@ -0,0 +1,20 @@ +"@inherits": base_config.yml +cluster: + backend: ray + ray: + supervisor: + standalone: no + sub_pool_num: 0 +session: + custom_log_dir: null +storage: + backends: [ray] +scheduling: + autoscale: + enabled: false + scheduler_backlog_timeout: 20 + worker_idle_timeout: 40 + subtask_max_retries: 3 + subtask_max_reschedules: 2 +metrics: + backend: ray diff --git a/python/xorbits/_mars/deploy/oscar/service.py b/python/xorbits/_mars/deploy/oscar/service.py new file mode 100644 index 000000000..fc50f9c2d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/service.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
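
Aside (not part of the patch): the snippet below sketches how the Ray entry points added in `ray.py` above are typically driven, using only names that appear in this diff (`new_ray_session`, the client's `web_address`). The worker counts and memory sizes are placeholder values, not recommendations.

```python
# Minimal usage sketch for the Ray deployment added above; values are examples.
from xorbits._mars.deploy.oscar.ray import new_ray_session

# With no `address`, new_ray_session() builds a RayCluster via new_cluster_in_ray()
# (ray.init is called internally if Ray is not yet initialized) and registers
# the resulting session as the default one.
session = new_ray_session(
    backend="mars",          # execution backend: "mars" or "ray"
    worker_num=2,            # forwarded to new_cluster(...)
    worker_cpu=2,
    worker_mem=2 * 1024**3,
)

# The RayClient created for the cluster is attached to the session and exposes
# the supervisor's web UI address.
print(session.client.web_address)
```
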
+ +import logging +from typing import Dict, List, Union + +from ...oscar import ServerClosed +from ...resource import Resource +from ...services import NodeRole, start_services, stop_services + +logger = logging.getLogger(__name__) + + +async def start_supervisor( + address: str, + lookup_address: str = None, + modules: Union[List, str, None] = None, + config: Dict = None, + web: Union[str, bool] = "auto", +): + logger.debug("Starting Mars supervisor at %s", address) + lookup_address = lookup_address or address + backend = config["cluster"].get("backend", "fixed") + if backend == "fixed" and config["cluster"].get("lookup_address") is None: + config["cluster"]["lookup_address"] = lookup_address + if web: + # try to append web to services + config["services"].append("web") + if modules: + config["modules"] = modules + try: + await start_services(NodeRole.SUPERVISOR, config, address=address) + logger.debug("Mars supervisor started at %s", address) + except ImportError: + if web == "auto": + config["services"] = [ + service for service in config["services"] if service != "web" + ] + await start_services(NodeRole.SUPERVISOR, config, address=address) + logger.debug("Mars supervisor started at %s", address) + return False + else: # pragma: no cover + raise + else: + return bool(web) + + +async def stop_supervisor(address: str, config: Dict = None): + try: + await stop_services(NodeRole.SUPERVISOR, address=address, config=config) + except (ConnectionRefusedError, ServerClosed): # pragma: no cover + pass + + +async def start_worker( + address: str, + lookup_address: str, + band_to_resource: Dict[str, Resource], + modules: Union[List, str, None] = None, + config: Dict = None, + mark_ready: bool = True, +): + logger.debug("Starting Mars worker at %s", address) + backend = config["cluster"].get("backend", "fixed") + if backend == "fixed" and config["cluster"].get("lookup_address") is None: + config["cluster"]["lookup_address"] = lookup_address + if config["cluster"].get("resource") is None: + config["cluster"]["resource"] = band_to_resource + if any( + band_name.startswith("gpu-") for band_name in band_to_resource + ): # pragma: no cover + if "cuda" not in config["storage"]["backends"]: + config["storage"]["backends"].append("cuda") + if modules: + config["modules"] = modules + await start_services( + NodeRole.WORKER, config, address=address, mark_ready=mark_ready + ) + logger.debug("Mars worker started at %s", address) + + +async def stop_worker(address: str, config: Dict = None): + try: + await stop_services(NodeRole.WORKER, address=address, config=config) + except (ConnectionRefusedError, ServerClosed): # pragma: no cover + pass diff --git a/python/xorbits/_mars/deploy/oscar/session.py b/python/xorbits/_mars/deploy/oscar/session.py new file mode 100644 index 000000000..4950477e2 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/session.py @@ -0,0 +1,2076 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
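
Aside (not part of the patch): `service.py` above only wires service start-up and shutdown; the sketch below mirrors the order used by the cluster classes in `local.py` and `ray.py` (supervisor first, then workers; shutdown in reverse). The addresses, the single `numa-0` band, and the `config` dict are assumed inputs for illustration.

```python
# Illustrative orchestration of the helpers defined in service.py; `config` is
# assumed to be a cluster config dict (with "cluster"/"services" sections) such
# as the one produced by load_config() in the deploy utilities.
from xorbits._mars.deploy.oscar.service import (
    start_supervisor,
    start_worker,
    stop_supervisor,
    stop_worker,
)
from xorbits._mars.resource import Resource


async def bring_up(supervisor_addr: str, worker_addr: str, config: dict) -> bool:
    band_to_resource = {"numa-0": Resource(num_cpus=4)}
    # Supervisor services come up first; returns whether the web service started.
    web_started = await start_supervisor(supervisor_addr, config=config, web="auto")
    # Workers register against the supervisor through its lookup address.
    await start_worker(worker_addr, supervisor_addr, band_to_resource, config=config)
    return web_started


async def tear_down(supervisor_addr: str, worker_addr: str, config: dict) -> None:
    # Shutdown runs in the reverse direction: workers first, then the supervisor.
    await stop_worker(worker_addr, config)
    await stop_supervisor(supervisor_addr, config)
```
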
+ +import asyncio +import concurrent.futures +import itertools +import json +import logging +import random +import string +import threading +import time +import warnings +from abc import ABC, ABCMeta, abstractmethod +from collections import defaultdict +from dataclasses import dataclass +from functools import wraps +from numbers import Integral +from typing import Any, Callable, Coroutine, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse +from weakref import WeakKeyDictionary, WeakSet, ref + +import numpy as np + +from ... import oscar as mo +from ...config import options +from ...core import ChunkType, TileableGraph, TileableType, enter_mode +from ...core.entrypoints import init_extension_entrypoints +from ...core.operand import Fetch +from ...lib.aio import ( + Isolation, + alru_cache, + get_isolation, + new_isolation, + stop_isolation, +) +from ...metrics import Metrics +from ...services.cluster import AbstractClusterAPI, ClusterAPI +from ...services.lifecycle import AbstractLifecycleAPI, LifecycleAPI +from ...services.meta import AbstractMetaAPI, MetaAPI +from ...services.mutable import MutableAPI, MutableTensor +from ...services.session import AbstractSessionAPI, SessionAPI +from ...services.storage import StorageAPI +from ...services.task import AbstractTaskAPI, TaskAPI, TaskResult +from ...services.task.execution.api import Fetcher +from ...services.web import OscarWebAPI +from ...typing import BandType, ClientType +from ...utils import ( + Timer, + build_fetch, + classproperty, + copy_tileables, + implements, + merge_chunks, + merged_chunk_as_tileable_type, + register_asyncio_task_timeout_detector, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class Progress: + value: float = 0.0 + + +@dataclass +class Profiling: + result: dict = None + + +class ExecutionInfo: + def __init__( + self, + aio_task: asyncio.Task, + progress: Progress, + profiling: Profiling, + loop: asyncio.AbstractEventLoop, + to_execute_tileables: List[TileableType], + ): + self._aio_task = aio_task + self._progress = progress + self._profiling = profiling + self._loop = loop + self._to_execute_tileables = [ref(t) for t in to_execute_tileables] + + self._future_local = threading.local() + + def _ensure_future(self): + try: + self._future_local.future + except AttributeError: + + async def wait(): + return await self._aio_task + + self._future_local.future = fut = asyncio.run_coroutine_threadsafe( + wait(), self._loop + ) + self._future_local.aio_future = asyncio.wrap_future(fut) + + @property + def loop(self): + return self._loop + + @property + def aio_task(self): + return self._aio_task + + def progress(self) -> float: + return self._progress.value + + @property + def to_execute_tileables(self) -> List[TileableType]: + return [t() for t in self._to_execute_tileables] + + def profiling_result(self) -> dict: + return self._profiling.result + + def result(self, timeout=None): + self._ensure_future() + return self._future_local.future.result(timeout=timeout) + + def cancel(self): + self._aio_task.cancel() + + def __getattr__(self, attr): + self._ensure_future() + return getattr(self._future_local.aio_future, attr) + + def __await__(self): + self._ensure_future() + return self._future_local.aio_future.__await__() + + def get_future(self): + self._ensure_future() + return self._future_local.aio_future + + +warning_msg = """ +No session found, local session \ +will be created in background, \ +it may take a while before execution. 
\ +If you want to new a local session by yourself, \ +run code below: + +``` +import mars + +mars.new_session() +``` +""" + + +class AbstractSession(ABC): + name = None + _default = None + _lock = threading.Lock() + + def __init__(self, address: str, session_id: str): + self._address = address + self._session_id = session_id + self._closed = False + + @property + def address(self): + return self._address + + @property + def session_id(self): + return self._session_id + + def __eq__(self, other): + return ( + isinstance(other, AbstractSession) + and self._address == other.address + and self._session_id == other.session_id + ) + + def __hash__(self): + return hash((AbstractSession, self._address, self._session_id)) + + def as_default(self) -> "AbstractSession": + """ + Mark current session as default session. + """ + AbstractSession._default = self + return self + + @classmethod + def reset_default(cls): + AbstractSession._default = None + + @classproperty + def default(self): + return AbstractSession._default + + +class AbstractAsyncSession(AbstractSession, metaclass=ABCMeta): + @classmethod + @abstractmethod + async def init( + cls, address: str, session_id: str, new: bool = True, **kwargs + ) -> "AbstractSession": + """ + Init a new session. + + Parameters + ---------- + address : str + Address. + session_id : str + Session ID. + new : bool + New a session. + kwargs + + Returns + ------- + session + """ + + async def destroy(self): + """ + Destroy a session. + """ + self.reset_default() + self._closed = True + + @abstractmethod + async def execute(self, *tileables, **kwargs) -> ExecutionInfo: + """ + Execute tileables. + + Parameters + ---------- + tileables + Tileables. + kwargs + """ + + @abstractmethod + async def fetch(self, *tileables, **kwargs) -> list: + """ + Fetch tileables' data. + + Parameters + ---------- + tileables + Tileables. + + Returns + ------- + data + """ + + @abstractmethod + async def _get_ref_counts(self) -> Dict[str, int]: + """ + Get all ref counts + + Returns + ------- + ref_counts + """ + + @abstractmethod + async def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + """ + Fetch logs given tileable op key. + + Parameters + ---------- + tileable_op_key : str + Tileable op key. + offsets + Chunk op key to offsets. + sizes + Chunk op key to sizes. + + Returns + ------- + chunk_key_to_logs + """ + + @abstractmethod + async def get_total_n_cpu(self): + """ + Get number of cluster cpus. + + Returns + ------- + number_of_cpu: int + """ + + @abstractmethod + async def get_cluster_versions(self) -> List[str]: + """ + Get versions used in current Mars cluster + + Returns + ------- + version_list : list + List of versions + """ + + @abstractmethod + async def get_web_endpoint(self) -> Optional[str]: + """ + Get web endpoint of current session + + Returns + ------- + web_endpoint : str + web endpoint + """ + + @abstractmethod + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + """ + Create remote object + + Parameters + ---------- + session_id : str + Session ID. + name : str + object_cls + args + kwargs + + Returns + ------- + actor_ref + """ + + @abstractmethod + async def get_remote_object(self, session_id: str, name: str): + """ + Get remote object. + + Parameters + ---------- + session_id : str + Session ID. 
+ name : str + + Returns + ------- + actor_ref + """ + + @abstractmethod + async def destroy_remote_object(self, session_id: str, name: str): + """ + Destroy remote object. + + Parameters + ---------- + session_id : str + Session ID. + name : str + """ + + @abstractmethod + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + """ + Create a mutable tensor. + + Parameters + ---------- + shape: tuple + Shape of the mutable tensor. + + dtype: np.dtype or str + Data type of the mutable tensor. + + name: str, optional + Name of the mutable tensor, a random name will be used if not specified. + + default_value: optional + Default value of the mutable tensor. Default is 0. + + chunk_size: int or tuple, optional + Chunk size of the mutable tensor. + + Returns + ------- + MutableTensor + """ + + @abstractmethod + async def get_mutable_tensor(self, name: str): + """ + Get a mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to get. + + Returns + ------- + MutableTensor + """ + + async def stop_server(self): + """ + Stop server. + """ + + +class AbstractSyncSession(AbstractSession, metaclass=ABCMeta): + @classmethod + @abstractmethod + def init( + cls, + address: str, + session_id: str, + backend: str = "mars", + new: bool = True, + **kwargs, + ) -> "AbstractSession": + """ + Init a new session. + + Parameters + ---------- + address : str + Address. + session_id : str + Session ID. + backend : str + Backend. + new : bool + New a session. + kwargs + + Returns + ------- + session + """ + + @abstractmethod + def execute( + self, tileable, *tileables, show_progress: Union[bool, str] = None, **kwargs + ) -> Union[List[TileableType], TileableType, ExecutionInfo]: + """ + Execute tileables. + + Parameters + ---------- + tileable + Tileable. + tileables + Tileables. + show_progress + If show progress. + kwargs + + Returns + ------- + result + """ + + @abstractmethod + def fetch(self, *tileables, **kwargs) -> list: + """ + Fetch tileables. + + Parameters + ---------- + tileables + Tileables. + kwargs + + Returns + ------- + fetched_data : list + """ + + @abstractmethod + def fetch_infos(self, *tileables, fields, **kwargs) -> list: + """ + Fetch infos of tileables. + + Parameters + ---------- + tileables + Tileables. + fields + List of fields + kwargs + + Returns + ------- + fetched_infos : list + """ + + @abstractmethod + def decref(self, *tileables_keys): + """ + Decref tileables. + + Parameters + ---------- + tileables_keys : list + Tileables' keys + """ + + @abstractmethod + def _get_ref_counts(self) -> Dict[str, int]: + """ + Get all ref counts + + Returns + ------- + ref_counts + """ + + @abstractmethod + def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + """ + Fetch logs given tileable op key. + + Parameters + ---------- + tileable_op_key : str + Tileable op key. + offsets + Chunk op key to offsets. + sizes + Chunk op key to sizes. + + Returns + ------- + chunk_key_to_logs + """ + + @abstractmethod + def get_total_n_cpu(self): + """ + Get number of cluster cpus. 
+ + Returns + ------- + number_of_cpu: int + """ + + @abstractmethod + def get_cluster_versions(self) -> List[str]: + """ + Get versions used in current Mars cluster + + Returns + ------- + version_list : list + List of versions + """ + + @abstractmethod + def get_web_endpoint(self) -> Optional[str]: + """ + Get web endpoint of current session + + Returns + ------- + web_endpoint : str + web endpoint + """ + + @abstractmethod + def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + """ + Create a mutable tensor. + + Parameters + ---------- + shape: tuple + Shape of the mutable tensor. + + dtype: np.dtype or str + Data type of the mutable tensor. + + name: str, optional + Name of the mutable tensor, a random name will be used if not specified. + + default_value: optional + Default value of the mutable tensor. Default is 0. + + chunk_size: int or tuple, optional + Chunk size of the mutable tensor. + + Returns + ------- + MutableTensor + """ + + @abstractmethod + def get_mutable_tensor(self, name: str): + """ + Get a mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to get. + + Returns + ------- + MutableTensor + """ + + def fetch_log( + self, + tileables: List[TileableType], + offsets: List[int] = None, + sizes: List[int] = None, + ): + from ...core.custom_log import fetch + + return fetch(tileables, self, offsets=offsets, sizes=sizes) + + +@dataclass +class ChunkFetchInfo: + tileable: TileableType + chunk: ChunkType + indexes: List[Union[int, slice]] + data: Any = None + + +_submitted_tileables = WeakSet() + + +@enter_mode(build=True, kernel=True) +def gen_submit_tileable_graph( + session: "AbstractSession", + result_tileables: List[TileableType], + warn_duplicated_execution: bool = False, +) -> Tuple[TileableGraph, List[TileableType]]: + tileable_to_copied = dict() + indexer = itertools.count() + result_to_index = {t: i for t, i in zip(result_tileables, indexer)} + result = list() + to_execute_tileables = list() + graph = TileableGraph(result) + + q = list(result_tileables) + while q: + tileable = q.pop() + if tileable in tileable_to_copied: + continue + if tileable.cache and tileable not in result_to_index: + result_to_index[tileable] = next(indexer) + outputs = tileable.op.outputs + inputs = tileable.inputs if session not in tileable._executed_sessions else [] + new_inputs = [] + all_inputs_processed = True + for inp in inputs: + if inp in tileable_to_copied: + new_inputs.append(tileable_to_copied[inp]) + elif session in inp._executed_sessions: + # executed, gen fetch + fetch_input = build_fetch(inp).data + tileable_to_copied[inp] = fetch_input + graph.add_node(fetch_input) + new_inputs.append(fetch_input) + else: + # some input not processed before + all_inputs_processed = False + # put back tileable + q.append(tileable) + q.append(inp) + break + if all_inputs_processed: + if isinstance(tileable.op, Fetch): + new_outputs = [tileable] + elif session in tileable._executed_sessions: + new_outputs = [] + for out in outputs: + fetch_out = tileable_to_copied.get(out, build_fetch(out).data) + new_outputs.append(fetch_out) + else: + new_outputs = [ + t.data for t in copy_tileables(outputs, inputs=new_inputs) + ] + for out, new_out in zip(outputs, new_outputs): + tileable_to_copied[out] = new_out + graph.add_node(new_out) + for new_inp in new_inputs: + graph.add_edge(new_inp, new_out) + + # process results + 
result.extend([None] * len(result_to_index)) + for t, i in result_to_index.items(): + result[i] = tileable_to_copied[t] + to_execute_tileables.append(t) + + if warn_duplicated_execution: + for n, c in tileable_to_copied.items(): + if not isinstance(c.op, Fetch) and n in _submitted_tileables: + warnings.warn( + f"Tileable {repr(n)} has been submitted before", RuntimeWarning + ) + # add all nodes into submitted tileables + _submitted_tileables.update( + n for n, c in tileable_to_copied.items() if not isinstance(c.op, Fetch) + ) + + return graph, to_execute_tileables + + +class _IsolatedSession(AbstractAsyncSession): + def __init__( + self, + address: str, + session_id: str, + backend: str, + session_api: AbstractSessionAPI, + meta_api: AbstractMetaAPI, + lifecycle_api: AbstractLifecycleAPI, + task_api: AbstractTaskAPI, + mutable_api: MutableAPI, + cluster_api: AbstractClusterAPI, + web_api: Optional[OscarWebAPI], + client: ClientType = None, + timeout: float = None, + request_rewriter: Callable = None, + ): + super().__init__(address, session_id) + self._backend = backend + self._session_api = session_api + self._task_api = task_api + self._meta_api = meta_api + self._lifecycle_api = lifecycle_api + self._mutable_api = mutable_api + self._cluster_api = cluster_api + self._web_api = web_api + self.client = client + self.timeout = timeout + self._request_rewriter = request_rewriter + + self._tileable_to_fetch = WeakKeyDictionary() + self._asyncio_task_timeout_detector_task = ( + register_asyncio_task_timeout_detector() + ) + + # add metrics + self._tileable_graph_gen_time = Metrics.gauge( + "mars.tileable_graph_gen_time_secs", + "Time consuming in seconds to generate a tileable graph", + ("address", "session_id"), + ) + + @classmethod + async def _init( + cls, + address: str, + session_id: str, + backend: str, + new: bool = True, + timeout: float = None, + ): + session_api = await SessionAPI.create(address) + if new: + # create new session + session_address = await session_api.create_session(session_id) + else: + session_address = await session_api.get_session_address(session_id) + lifecycle_api = await LifecycleAPI.create(session_id, session_address) + meta_api = await MetaAPI.create(session_id, session_address) + task_api = await TaskAPI.create(session_id, session_address) + mutable_api = await MutableAPI.create(session_id, session_address) + cluster_api = await ClusterAPI.create(session_address) + try: + web_api = await OscarWebAPI.create(session_address) + except mo.ActorNotExist: + web_api = None + return cls( + address, + session_id, + backend, + session_api, + meta_api, + lifecycle_api, + task_api, + mutable_api, + cluster_api, + web_api, + timeout=timeout, + ) + + @classmethod + @implements(AbstractAsyncSession.init) + async def init( + cls, + address: str, + session_id: str, + backend: str, + new: bool = True, + timeout: float = None, + **kwargs, + ) -> "AbstractAsyncSession": + init_local = kwargs.pop("init_local", False) + request_rewriter = kwargs.pop("request_rewriter", None) + if init_local: + from .local import new_cluster_in_isolation + + return ( + await new_cluster_in_isolation( + address, timeout=timeout, backend=backend, **kwargs + ) + ).session + + if kwargs: # pragma: no cover + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError( + f"Oscar session got unexpected arguments: {unexpected_keys}" + ) + + if urlparse(address).scheme == "http": + return await _IsolatedWebSession._init( + address, + session_id, + backend, + new=new, + timeout=timeout, + 
request_rewriter=request_rewriter, + ) + else: + return await cls._init( + address, + session_id, + backend, + new=new, + timeout=timeout, + ) + + async def _update_progress(self, task_id: str, progress: Progress): + zero_acc_time = 0 + delay = 0.5 + while True: + try: + last_progress_value = progress.value + progress.value = await self._task_api.get_task_progress(task_id) + if abs(progress.value - last_progress_value) < 1e-4: + # if percentage does not change, we add delay time by 0.5 seconds every time + zero_acc_time = min(5, zero_acc_time + 0.5) + delay = zero_acc_time + else: + # percentage changes, we use percentage speed to calc progress time + zero_acc_time = 0 + speed = abs(progress.value - last_progress_value) / delay + # one percent for one second + delay = 0.01 / speed + delay = max(0.5, min(delay, 5.0)) + await asyncio.sleep(delay) + except asyncio.CancelledError: + break + + async def _run_in_background( + self, + tileables: list, + task_id: str, + progress: Progress, + profiling: Profiling, + ): + with enter_mode(build=True, kernel=True): + # wait for task to finish + cancelled = False + progress_task = asyncio.create_task( + self._update_progress(task_id, progress) + ) + start_time = time.time() + task_result: Optional[TaskResult] = None + try: + if self.timeout is None: + check_interval = 30 + else: + elapsed = time.time() - start_time + check_interval = min(self.timeout - elapsed, 30) + + while True: + task_result = await self._task_api.wait_task( + task_id, timeout=check_interval + ) + if task_result is not None: + break + elif ( + self.timeout is not None + and time.time() - start_time > self.timeout + ): + raise TimeoutError( + f"Task({task_id}) running time > {self.timeout}" + ) + except asyncio.CancelledError: + # cancelled + cancelled = True + await self._task_api.cancel_task(task_id) + finally: + progress_task.cancel() + if task_result is not None: + progress.value = 1.0 + else: + # not finished, set progress + progress.value = await self._task_api.get_task_progress(task_id) + if task_result is not None: + profiling.result = task_result.profiling + if task_result.profiling: + logger.warning( + "Profile task %s execution result:\n%s", + task_id, + json.dumps(task_result.profiling, indent=4), + ) + if task_result.error: + raise task_result.error.with_traceback(task_result.traceback) + if cancelled: + return + fetch_tileables = await self._task_api.get_fetch_tileables(task_id) + assert len(tileables) == len(fetch_tileables) + + for tileable, fetch_tileable in zip(tileables, fetch_tileables): + self._tileable_to_fetch[tileable] = fetch_tileable + # update meta, e.g. 
unknown shape + tileable.params = fetch_tileable.params + + async def execute(self, *tileables, **kwargs) -> ExecutionInfo: + if self._closed: + raise RuntimeError("Session closed already") + fuse_enabled: bool = kwargs.pop("fuse_enabled", None) + extra_config: dict = kwargs.pop("extra_config", None) + warn_duplicated_execution: bool = kwargs.pop("warn_duplicated_execution", False) + if kwargs: # pragma: no cover + raise TypeError(f"run got unexpected key arguments {list(kwargs)!r}") + + tileables = [ + tileable.data if hasattr(tileable, "data") else tileable + for tileable in tileables + ] + + # build tileable graph + with Timer() as timer: + tileable_graph, to_execute_tileables = gen_submit_tileable_graph( + self, tileables, warn_duplicated_execution=warn_duplicated_execution + ) + + logger.info( + "Time consuming to generate a tileable graph is %ss with address %s, session id %s", + timer.duration, + self.address, + self._session_id, + ) + self._tileable_graph_gen_time.record( + timer.duration, {"address": self.address, "session_id": self._session_id} + ) + + # submit task + task_id = await self._task_api.submit_tileable_graph( + tileable_graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + + progress = Progress() + profiling = Profiling() + # create asyncio.Task + aio_task = asyncio.create_task( + self._run_in_background(to_execute_tileables, task_id, progress, profiling) + ) + return ExecutionInfo( + aio_task, + progress, + profiling, + asyncio.get_running_loop(), + to_execute_tileables, + ) + + def _get_to_fetch_tileable( + self, tileable: TileableType + ) -> Tuple[TileableType, List[Union[slice, Integral]]]: + from ...dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + from ...tensor.indexing import TensorIndex + + slice_op_types = TensorIndex, DataFrameIlocGetItem, SeriesIlocGetItem + + if hasattr(tileable, "data"): + tileable = tileable.data + + indexes = None + while tileable not in self._tileable_to_fetch: + # if tileable's op is slice, try to check input + if isinstance(tileable.op, slice_op_types): + indexes = tileable.op.indexes + tileable = tileable.inputs[0] + if not all(isinstance(index, (slice, Integral)) for index in indexes): + raise ValueError("Only support fetch data slices") + elif isinstance(tileable.op, Fetch): + break + else: + raise ValueError(f"Cannot fetch unexecuted tileable: {tileable!r}") + + if isinstance(tileable.op, Fetch): + return tileable, indexes + else: + return self._tileable_to_fetch[tileable], indexes + + @classmethod + def _calc_chunk_indexes( + cls, fetch_tileable: TileableType, indexes: List[Union[slice, Integral]] + ) -> Dict[ChunkType, List[Union[slice, int]]]: + from ...tensor.utils import slice_split + + axis_to_slices = { + axis: slice_split(ind, fetch_tileable.nsplits[axis]) + for axis, ind in enumerate(indexes) + } + result = dict() + for chunk_index in itertools.product( + *[v.keys() for v in axis_to_slices.values()] + ): + # slice_obj: use tuple, since numpy complains + # + # FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use + # `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array + # index, `arr[np.array(seq)]`, which will result either in an error or a different result. 
+ slice_obj = [ + axis_to_slices[axis][chunk_idx] + for axis, chunk_idx in enumerate(chunk_index) + ] + chunk = fetch_tileable.cix[chunk_index] + result[chunk] = slice_obj + return result + + def _process_result(self, tileable, result): # pylint: disable=no-self-use + return result + + @alru_cache(cache_exceptions=False) + async def _get_storage_api(self, band: BandType): + if urlparse(self.address).scheme == "http": + from ...services.storage.api import WebStorageAPI + + storage_api = WebStorageAPI( + self._session_id, self.address, band[1], self._request_rewriter + ) + else: + storage_api = await StorageAPI.create(self._session_id, band[0], band[1]) + return storage_api + + async def fetch(self, *tileables, **kwargs) -> list: + if kwargs: # pragma: no cover + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}") + + fetcher = Fetcher.create(self._backend, get_storage_api=self._get_storage_api) + + with enter_mode(build=True): + chunks = [] + get_chunk_metas = [] + fetch_infos_list = [] + for tileable in tileables: + fetch_tileable, indexes = self._get_to_fetch_tileable(tileable) + chunk_to_slice = None + if indexes is not None: + chunk_to_slice = self._calc_chunk_indexes(fetch_tileable, indexes) + fetch_infos = [] + for chunk in fetch_tileable.chunks: + if indexes and chunk not in chunk_to_slice: + continue + chunks.append(chunk) + get_chunk_metas.append( + self._meta_api.get_chunk_meta.delay( + chunk.key, + fields=fetcher.required_meta_keys, + ) + ) + indexes = ( + chunk_to_slice[chunk] if chunk_to_slice is not None else None + ) + fetch_infos.append( + ChunkFetchInfo(tileable=tileable, chunk=chunk, indexes=indexes) + ) + fetch_infos_list.append(fetch_infos) + + chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas) + for chunk, meta, fetch_info in zip( + chunks, chunk_metas, itertools.chain(*fetch_infos_list) + ): + await fetcher.append(chunk.key, meta, fetch_info.indexes) + fetched_data = await fetcher.get() + for fetch_info, data in zip( + itertools.chain(*fetch_infos_list), fetched_data + ): + fetch_info.data = data + + result = [] + for tileable, fetch_infos in zip(tileables, fetch_infos_list): + index_to_data = [ + (fetch_info.chunk.index, fetch_info.data) + for fetch_info in fetch_infos + ] + merged = merge_chunks(index_to_data) + merged = merged_chunk_as_tileable_type(merged, tileable) + result.append(self._process_result(tileable, merged)) + return result + + async def fetch_infos(self, *tileables, fields, **kwargs) -> list: + available_fields = { + "data_key", + "object_id", + "object_refs", + "level", + "memory_size", + "store_size", + "bands", + } + if fields is None: + fields = available_fields + else: + for field_name in fields: + if field_name not in available_fields: # pragma: no cover + raise TypeError( + f"`fetch_infos` got unexpected field name: {field_name}" + ) + fields = set(fields) + + if kwargs: # pragma: no cover + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}") + # following fields needs to access storage API to get the meta. 
+ _need_query_storage_fields = {"level", "memory_size", "store_size"} + _need_query_storage = bool(_need_query_storage_fields & fields) + with enter_mode(build=True): + chunk_to_bands, fetch_infos_list, result = await self._query_meta_service( + tileables, fields, _need_query_storage + ) + if not _need_query_storage: + assert result is not None + return result + storage_api_to_gets = defaultdict(list) + storage_api_to_fetch_infos = defaultdict(list) + for fetch_info in itertools.chain(*fetch_infos_list): + chunk = fetch_info.chunk + bands = chunk_to_bands[chunk] + storage_api = await self._get_storage_api(bands[0]) + storage_api_to_gets[storage_api].append( + storage_api.get_infos.delay(chunk.key) + ) + storage_api_to_fetch_infos[storage_api].append(fetch_info) + for storage_api in storage_api_to_gets: + fetched_data = await storage_api.get_infos.batch( + *storage_api_to_gets[storage_api] + ) + infos = storage_api_to_fetch_infos[storage_api] + for info, data in zip(infos, fetched_data): + info.data = data + + result = [] + for fetch_infos in fetch_infos_list: + fetched = defaultdict(list) + for fetch_info in fetch_infos: + bands = chunk_to_bands[fetch_info.chunk] + # Currently there's only one item in the returned List from storage_api.get_infos() + data = fetch_info.data[0] + if "data_key" in fields: + fetched["data_key"].append(fetch_info.chunk.key) + if "object_id" in fields: + fetched["object_id"].append(data.object_id) + if "level" in fields: + fetched["level"].append(data.level) + if "memory_size" in fields: + fetched["memory_size"].append(data.memory_size) + if "store_size" in fields: + fetched["store_size"].append(data.store_size) + # data.band misses ip info, e.g. 'numa-0' + # while band doesn't, e.g. (address0, 'numa-0') + if "bands" in fields: + fetched["bands"].append(bands) + result.append(fetched) + + return result + + async def _query_meta_service(self, tileables, fields, query_storage): + chunks = [] + get_chunk_metas = [] + fetch_infos_list = [] + for tileable in tileables: + fetch_tileable, _ = self._get_to_fetch_tileable(tileable) + fetch_infos = [] + for chunk in fetch_tileable.chunks: + chunks.append(chunk) + get_chunk_metas.append( + self._meta_api.get_chunk_meta.delay( + chunk.key, + fields=["bands"] if query_storage else fields - {"data_key"}, + ) + ) + fetch_infos.append( + ChunkFetchInfo(tileable=tileable, chunk=chunk, indexes=None) + ) + fetch_infos_list.append(fetch_infos) + chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas) + if not query_storage: + result = [] + chunk_to_meta = dict(zip(chunks, chunk_metas)) + for fetch_infos in fetch_infos_list: + fetched = defaultdict(list) + for fetch_info in fetch_infos: + if "data_key" in fields: + fetched["data_key"].append(fetch_info.chunk.key) + for field in fields - {"data_key"}: + fetched[field].append(chunk_to_meta[fetch_info.chunk][field]) + result.append(fetched) + return {}, fetch_infos_list, result + chunk_to_bands = { + chunk: meta["bands"] for chunk, meta in zip(chunks, chunk_metas) + } + return chunk_to_bands, fetch_infos_list, None + + async def decref(self, *tileable_keys): + logger.debug("Decref tileables on client: %s", tileable_keys) + return await self._lifecycle_api.decref_tileables(list(tileable_keys)) + + async def _get_ref_counts(self) -> Dict[str, int]: + return await self._lifecycle_api.get_all_chunk_ref_counts() + + async def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, 
int], + ) -> Dict: + return await self._session_api.fetch_tileable_op_logs( + self.session_id, tileable_op_key, offsets, sizes + ) + + async def get_total_n_cpu(self): + all_bands = await self._cluster_api.get_all_bands() + n_cpu = 0 + for band, resource in all_bands.items(): + _, band_name = band + if band_name.startswith("numa-"): + n_cpu += resource.num_cpus + return n_cpu + + async def get_cluster_versions(self) -> List[str]: + return list(await self._cluster_api.get_mars_versions()) + + async def get_web_endpoint(self) -> Optional[str]: + if self._web_api is None: + return None + return await self._web_api.get_web_address() + + async def destroy(self): + await super().destroy() + await self._session_api.delete_session(self._session_id) + self._tileable_to_fetch.clear() + if self._asyncio_task_timeout_detector_task: # pragma: no cover + self._asyncio_task_timeout_detector_task.cancel() + + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + return await self._session_api.create_remote_object( + session_id, name, object_cls, *args, **kwargs + ) + + async def get_remote_object(self, session_id: str, name: str): + return await self._session_api.get_remote_object(session_id, name) + + async def destroy_remote_object(self, session_id: str, name: str): + return await self._session_api.destroy_remote_object(session_id, name) + + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + tensor_info = await self._mutable_api.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + return tensor_info, self._mutable_api + + async def get_mutable_tensor(self, name: str): + tensor_info = await self._mutable_api.get_mutable_tensor(name) + return tensor_info, self._mutable_api + + async def stop_server(self): + if self.client: + await self.client.stop() + + +class _IsolatedWebSession(_IsolatedSession): + @classmethod + async def _init( + cls, + address: str, + session_id: str, + backend: str, + new: bool = True, + timeout: float = None, + request_rewriter: Callable = None, + ): + from ...services.cluster import WebClusterAPI + from ...services.lifecycle import WebLifecycleAPI + from ...services.meta import WebMetaAPI + from ...services.mutable import WebMutableAPI + from ...services.session import WebSessionAPI + from ...services.task import WebTaskAPI + + session_api = WebSessionAPI(address, request_rewriter) + if new: + # create new session + await session_api.create_session(session_id) + lifecycle_api = WebLifecycleAPI(session_id, address, request_rewriter) + meta_api = WebMetaAPI(session_id, address, request_rewriter) + task_api = WebTaskAPI(session_id, address, request_rewriter) + mutable_api = WebMutableAPI(session_id, address, request_rewriter) + cluster_api = WebClusterAPI(address, request_rewriter) + + return cls( + address, + session_id, + backend, + session_api, + meta_api, + lifecycle_api, + task_api, + mutable_api, + cluster_api, + None, + timeout=timeout, + request_rewriter=request_rewriter, + ) + + async def get_web_endpoint(self) -> Optional[str]: + return self.address + + +def _delegate_to_isolated_session(func: Union[Callable, Coroutine]): + if asyncio.iscoroutinefunction(func): + + @wraps(func) + async def inner(session: "AsyncSession", *args, **kwargs): + coro = getattr(session._isolated_session, func.__name__)(*args, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, 
session._loop) + return await asyncio.wrap_future(fut) + + else: + + @wraps(func) + def inner(session: "SyncSession", *args, **kwargs): + coro = getattr(session._isolated_session, func.__name__)(*args, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, session._loop) + return fut.result() + + return inner + + +class AsyncSession(AbstractAsyncSession): + def __init__( + self, + address: str, + session_id: str, + isolated_session: _IsolatedSession, + isolation: Isolation, + ): + super().__init__(address, session_id) + + self._isolated_session = _get_isolated_session(isolated_session) + self._isolation = isolation + self._loop = isolation.loop + + @classmethod + def from_isolated_session( + cls, isolated_session: _IsolatedSession + ) -> "AsyncSession": + return cls( + isolated_session.address, + isolated_session.session_id, + isolated_session, + get_isolation(), + ) + + @property + def client(self): + return self._isolated_session.client + + @client.setter + def client(self, client: ClientType): + self._isolated_session.client = client + + @classmethod + @implements(AbstractAsyncSession.init) + async def init( + cls, + address: str, + session_id: str, + backend: str = "mars", + new: bool = True, + **kwargs, + ) -> "AbstractSession": + isolation = ensure_isolation_created(kwargs) + coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + isolated_session = await asyncio.wrap_future(fut) + return AsyncSession(address, session_id, isolated_session, isolation) + + def as_default(self) -> AbstractSession: + AbstractSession._default = self._isolated_session + return self + + @implements(AbstractAsyncSession.destroy) + async def destroy(self): + coro = self._isolated_session.destroy() + await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(coro, self._loop)) + self.reset_default() + + @implements(AbstractAsyncSession.execute) + @_delegate_to_isolated_session + async def execute(self, *tileables, **kwargs) -> ExecutionInfo: + pass # pragma: no cover + + @implements(AbstractAsyncSession.fetch) + async def fetch(self, *tileables, **kwargs) -> list: + coro = _fetch(*tileables, session=self._isolated_session, **kwargs) + return await asyncio.wrap_future( + asyncio.run_coroutine_threadsafe(coro, self._loop) + ) + + @implements(AbstractAsyncSession._get_ref_counts) + @_delegate_to_isolated_session + async def _get_ref_counts(self) -> Dict[str, int]: + pass # pragma: no cover + + @implements(AbstractAsyncSession.fetch_tileable_op_logs) + @_delegate_to_isolated_session + async def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + pass # pragma: no cover + + @implements(AbstractAsyncSession.get_total_n_cpu) + @_delegate_to_isolated_session + async def get_total_n_cpu(self): + pass # pragma: no cover + + @implements(AbstractAsyncSession.get_cluster_versions) + @_delegate_to_isolated_session + async def get_cluster_versions(self) -> List[str]: + pass # pragma: no cover + + @implements(AbstractAsyncSession.create_remote_object) + @_delegate_to_isolated_session + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + pass # pragma: no cover + + @implements(AbstractAsyncSession.get_remote_object) + @_delegate_to_isolated_session + async def get_remote_object(self, session_id: str, name: str): + pass # pragma: no cover + + 
@implements(AbstractAsyncSession.destroy_remote_object) + @_delegate_to_isolated_session + async def destroy_remote_object(self, session_id: str, name: str): + pass # pragma: no cover + + @implements(AbstractAsyncSession.create_mutable_tensor) + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + tensor_info, mutable_api = await self._isolated_session.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + @implements(AbstractAsyncSession.get_mutable_tensor) + async def get_mutable_tensor(self, name: str): + tensor_info, mutable_api = await self._isolated_session.get_mutable_tensor(name) + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + @implements(AbstractAsyncSession.get_web_endpoint) + @_delegate_to_isolated_session + async def get_web_endpoint(self) -> Optional[str]: + pass # pragma: no cover + + @implements(AbstractAsyncSession.stop_server) + async def stop_server(self): + coro = self._isolated_session.stop_server() + await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(coro, self._loop)) + stop_isolation() + + +class ProgressBar: + def __init__(self, show_progress): + if not show_progress: + self.progress_bar = None + else: + try: + from tqdm.auto import tqdm + except ImportError: + if show_progress != "auto": # pragma: no cover + raise ImportError("tqdm is required to show progress") + else: + self.progress_bar = None + else: + self.progress_bar = tqdm( + total=100, + bar_format="{l_bar}{bar}| {n:6.2f}/{total_fmt} " + "[{elapsed}<{remaining}, {rate_fmt}{postfix}]", + ) + + self.last_progress: float = 0.0 + + @property + def show_progress(self) -> bool: + return self.progress_bar is not None + + def __enter__(self): + self.progress_bar.__enter__() + + def __exit__(self, *_): + self.progress_bar.__exit__(*_) + + def update(self, progress: float): + progress = min(progress, 100) + last_progress = self.last_progress + if self.progress_bar: + incr = max(progress - last_progress, 0) + self.progress_bar.update(incr) + self.last_progress = max(last_progress, progress) + + +class SyncSession(AbstractSyncSession): + _execution_pool = concurrent.futures.ThreadPoolExecutor(1) + + def __init__( + self, + address: str, + session_id: str, + isolated_session: _IsolatedSession, + isolation: Isolation, + ): + super().__init__(address, session_id) + + self._isolated_session = _get_isolated_session(isolated_session) + self._isolation = isolation + self._loop = isolation.loop + + @classmethod + def from_isolated_session(cls, isolated_session: _IsolatedSession) -> "SyncSession": + return cls( + isolated_session.address, + isolated_session.session_id, + isolated_session, + get_isolation(), + ) + + @classmethod + def init( + cls, + address: str, + session_id: str, + backend: str = "mars", + new: bool = True, + **kwargs, + ) -> "AbstractSession": + isolation = ensure_isolation_created(kwargs) + coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop) + isolated_session = fut.result() + return SyncSession(address, session_id, isolated_session, isolation) + + def as_default(self) -> AbstractSession: + AbstractSession._default = self._isolated_session + return self + + @property + def _session(self): + return self._isolated_session + + def _new_cancel_event(self): + 
async def new_event(): + return asyncio.Event() + + return asyncio.run_coroutine_threadsafe(new_event(), self._loop).result() + + @implements(AbstractSyncSession.execute) + def execute( + self, + tileable, + *tileables, + show_progress: Union[bool, str] = None, + warn_duplicated_execution: bool = None, + **kwargs, + ) -> Union[List[TileableType], TileableType, ExecutionInfo]: + wait = kwargs.get("wait", True) + if show_progress is None: + show_progress = options.show_progress + if warn_duplicated_execution is None: + warn_duplicated_execution = options.warn_duplicated_execution + to_execute_tileables = [] + for t in (tileable,) + tileables: + to_execute_tileables.extend(t.op.outputs) + + cancelled = kwargs.get("cancelled") + if cancelled is None: + cancelled = kwargs["cancelled"] = self._new_cancel_event() + + coro = _execute( + *set(to_execute_tileables), + session=self._isolated_session, + show_progress=show_progress, + warn_duplicated_execution=warn_duplicated_execution, + **kwargs, + ) + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + try: + execution_info: ExecutionInfo = fut.result( + timeout=self._isolated_session.timeout + ) + except KeyboardInterrupt: # pragma: no cover + logger.warning("Cancelling running task") + cancelled.set() + fut.result() + logger.warning("Cancel finished") + + if wait: + return tileable if len(tileables) == 0 else [tileable] + list(tileables) + else: + aio_task = execution_info.aio_task + + async def run(): + await aio_task + return tileable if len(tileables) == 0 else [tileable] + list(tileables) + + async def driver(): + return asyncio.create_task(run()) + + new_aio_task = asyncio.run_coroutine_threadsafe( + driver(), execution_info.loop + ).result() + new_execution_info = ExecutionInfo( + new_aio_task, + execution_info._progress, + execution_info._profiling, + execution_info.loop, + to_execute_tileables, + ) + return new_execution_info + + @implements(AbstractSyncSession.fetch) + def fetch(self, *tileables, **kwargs) -> list: + coro = _fetch(*tileables, session=self._isolated_session, **kwargs) + return asyncio.run_coroutine_threadsafe(coro, self._loop).result() + + @implements(AbstractSyncSession.fetch_infos) + def fetch_infos(self, *tileables, fields, **kwargs) -> list: + coro = _fetch_infos( + *tileables, fields=fields, session=self._isolated_session, **kwargs + ) + return asyncio.run_coroutine_threadsafe(coro, self._loop).result() + + @implements(AbstractSyncSession.decref) + @_delegate_to_isolated_session + def decref(self, *tileables_keys): + pass # pragma: no cover + + @implements(AbstractSyncSession._get_ref_counts) + @_delegate_to_isolated_session + def _get_ref_counts(self) -> Dict[str, int]: + pass # pragma: no cover + + @implements(AbstractSyncSession.fetch_tileable_op_logs) + @_delegate_to_isolated_session + def fetch_tileable_op_logs( + self, + tileable_op_key: str, + offsets: Union[Dict[str, List[int]], str, int], + sizes: Union[Dict[str, List[int]], str, int], + ) -> Dict: + pass # pragma: no cover + + @implements(AbstractSyncSession.get_total_n_cpu) + @_delegate_to_isolated_session + def get_total_n_cpu(self): + pass # pragma: no cover + + @implements(AbstractSyncSession.get_web_endpoint) + @_delegate_to_isolated_session + def get_web_endpoint(self) -> Optional[str]: + pass # pragma: no cover + + @implements(AbstractSyncSession.get_cluster_versions) + @_delegate_to_isolated_session + def get_cluster_versions(self) -> List[str]: + pass # pragma: no cover + + @implements(AbstractSyncSession.create_mutable_tensor) + def 
create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + coro = self._isolated_session.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + tensor_info, mutable_api = fut.result() + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + @implements(AbstractSyncSession.get_mutable_tensor) + def get_mutable_tensor(self, name: str): + coro = self._isolated_session.get_mutable_tensor(name) + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + tensor_info, mutable_api = fut.result() + return MutableTensor.create(tensor_info, mutable_api, self._loop) + + def destroy(self): + coro = self._isolated_session.destroy() + asyncio.run_coroutine_threadsafe(coro, self._loop).result() + self.reset_default() + + def stop_server(self, isolation=True): + try: + coro = self._isolated_session.stop_server() + future = asyncio.run_coroutine_threadsafe(coro, self._loop) + future.result(timeout=5) + finally: + self.reset_default() + if isolation: + stop_isolation() + + def close(self): + self.destroy() + + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() + + +async def _execute_with_progress( + execution_info: ExecutionInfo, + progress_bar: ProgressBar, + progress_update_interval: Union[int, float], + cancelled: asyncio.Event, +): + with progress_bar: + while not cancelled.is_set(): + done, _pending = await asyncio.wait( + [execution_info.get_future()], timeout=progress_update_interval + ) + if not done: + if not cancelled.is_set() and execution_info.progress() is not None: + progress_bar.update(execution_info.progress() * 100) + else: + # done + if not cancelled.is_set(): + progress_bar.update(100) + break + + +async def _execute( + *tileables: Tuple[TileableType], + session: _IsolatedSession = None, + wait: bool = True, + show_progress: Union[bool, str] = "auto", + progress_update_interval: Union[int, float] = 1, + cancelled: asyncio.Event = None, + **kwargs, +): + execution_info = await session.execute(*tileables, **kwargs) + + def _attach_session(future: asyncio.Future): + if future.exception() is None: + for t in execution_info.to_execute_tileables: + t._attach_session(session) + + execution_info.add_done_callback(_attach_session) + cancelled = cancelled or asyncio.Event() + + if wait: + progress_bar = ProgressBar(show_progress) + if progress_bar.show_progress: + await _execute_with_progress( + execution_info, progress_bar, progress_update_interval, cancelled + ) + else: + exec_task = asyncio.ensure_future(execution_info) + cancel_task = asyncio.ensure_future(cancelled.wait()) + await asyncio.wait( + [exec_task, cancel_task], return_when=asyncio.FIRST_COMPLETED + ) + if cancelled.is_set(): + execution_info.remove_done_callback(_attach_session) + execution_info.cancel() + else: + # set cancelled to avoid wait task leak + cancelled.set() + await execution_info + else: + return execution_info + + +def execute( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: SyncSession = None, + wait: bool = True, + new_session_kwargs: dict = None, + show_progress: Union[bool, str] = None, + progress_update_interval=1, + **kwargs, +): + if isinstance(tileable, (tuple, list)) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + if session is None: + session = get_default_or_create(**(new_session_kwargs or dict())) + session = 
_ensure_sync(session) + return session.execute( + tileable, + *tileables, + wait=wait, + show_progress=show_progress, + progress_update_interval=progress_update_interval, + **kwargs, + ) + + +async def _fetch( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: _IsolatedSession = None, + **kwargs, +): + if isinstance(tileable, tuple) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + session = _get_isolated_session(session) + data = await session.fetch(tileable, *tileables, **kwargs) + return data[0] if len(tileables) == 0 else data + + +async def _fetch_infos( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: _IsolatedSession = None, + fields: List[str] = None, + **kwargs, +): + if isinstance(tileable, tuple) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + session = _get_isolated_session(session) + data = await session.fetch_infos(tileable, *tileables, fields=fields, **kwargs) + return data[0] if len(tileables) == 0 else data + + +def fetch( + tileable: TileableType, + *tileables: Tuple[TileableType], + session: SyncSession = None, + **kwargs, +): + if isinstance(tileable, (tuple, list)) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + if session is None: + session = get_default_session() + if session is None: # pragma: no cover + raise ValueError("No session found") + + session = _ensure_sync(session) + return session.fetch(tileable, *tileables, **kwargs) + + +def fetch_infos( + tileable: TileableType, + *tileables: Tuple[TileableType], + fields: List[str], + session: SyncSession = None, + **kwargs, +): + if isinstance(tileable, tuple) and len(tileables) == 0: + tileable, tileables = tileable[0], tileable[1:] + if session is None: + session = get_default_session() + if session is None: # pragma: no cover + raise ValueError("No session found") + session = _ensure_sync(session) + return session.fetch_infos(tileable, *tileables, fields=fields, **kwargs) + + +def fetch_log(*tileables: TileableType, session: SyncSession = None, **kwargs): + if len(tileables) == 1 and isinstance(tileables[0], (list, tuple)): + tileables = tileables[0] + if session is None: + session = get_default_session() + if session is None: # pragma: no cover + raise ValueError("No session found") + session = _ensure_sync(session) + return session.fetch_log(list(tileables), **kwargs) + + +def ensure_isolation_created(kwargs): + loop = kwargs.pop("loop", None) + use_uvloop = kwargs.pop("use_uvloop", "auto") + + try: + return get_isolation() + except KeyError: + if loop is None: + if not use_uvloop: + loop = asyncio.new_event_loop() + else: + try: + import uvloop + + loop = uvloop.new_event_loop() + except ImportError: + if use_uvloop == "auto": + loop = asyncio.new_event_loop() + else: # pragma: no cover + raise + return new_isolation(loop=loop) + + +def _new_session_id(): + return "".join( + random.choice(string.ascii_letters + string.digits) for _ in range(24) + ) + + +async def _new_session( + address: str, + session_id: str = None, + backend: str = "mars", + default: bool = False, + **kwargs, +) -> AbstractSession: + if session_id is None: + session_id = _new_session_id() + + session = await AsyncSession.init( + address, session_id=session_id, backend=backend, new=True, **kwargs + ) + if default: + session.as_default() + return session + + +def new_session( + address: str = None, + session_id: str = None, + backend: str = "mars", + default: bool = True, + new: bool = True, + **kwargs, +) -> 
AbstractSession: + # load third party extensions. + init_extension_entrypoints() + ensure_isolation_created(kwargs) + + if address is None: + address = "127.0.0.1" + if "init_local" not in kwargs: + kwargs["init_local"] = True + + if session_id is None: + session_id = _new_session_id() + + session = SyncSession.init( + address, session_id=session_id, backend=backend, new=new, **kwargs + ) + if default: + session.as_default() + return session + + +def get_default_session() -> Optional[SyncSession]: + if AbstractSession.default is None: + return + return SyncSession.from_isolated_session(AbstractSession.default) + + +def clear_default_session(): + AbstractSession.reset_default() + + +def get_default_async_session() -> Optional[AsyncSession]: + if AbstractSession.default is None: + return + return AsyncSession.from_isolated_session(AbstractSession.default) + + +def get_default_or_create(**kwargs): + with AbstractSession._lock: + session = AbstractSession.default + if session is None: + # no session attached, try to create one + warnings.warn(warning_msg) + session = new_session("127.0.0.1", init_local=True, **kwargs) + session.as_default() + if isinstance(session, _IsolatedSession): + session = SyncSession.from_isolated_session(session) + return _ensure_sync(session) + + +def stop_server(): + if AbstractSession.default: + SyncSession.from_isolated_session(AbstractSession.default).stop_server() + + +def _get_isolated_session(session: AbstractSession) -> _IsolatedSession: + if hasattr(session, "_isolated_session"): + return session._isolated_session + return session + + +def _ensure_sync(session: AbstractSession) -> SyncSession: + if isinstance(session, SyncSession): + return session + isolated_session = _get_isolated_session(session) + return SyncSession.from_isolated_session(isolated_session) diff --git a/python/xorbits/_mars/deploy/oscar/supervisor.py b/python/xorbits/_mars/deploy/oscar/supervisor.py new file mode 100644 index 000000000..0e5c121c0 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/supervisor.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +from ... 
import oscar as mo +from ...services import NodeRole +from ...utils import get_next_port +from .cmdline import OscarCommandRunner +from .local import start_supervisor, stop_supervisor +from .pool import create_supervisor_actor_pool + +logger = logging.getLogger(__name__) + + +class SupervisorCommandRunner(OscarCommandRunner): + command_description = "Mars Supervisor" + node_role = NodeRole.SUPERVISOR + + def __init__(self): + super().__init__() + self._endpoint_file_name = None + + def config_args(self, parser): + super().config_args(parser) + parser.add_argument("-w", "--web-port", help="web port of the service") + parser.add_argument( + "--n-process", help="number of supervisor processes", default="1" + ) + + def parse_args(self, parser, argv, environ=None): + args = super().parse_args(parser, argv, environ=environ) + + if args.endpoint is None: + args.endpoint = f"{args.host}:{get_next_port()}" + self._endpoint_file_name = self._write_supervisor_endpoint_file(args) + + args.supervisors = f"{args.supervisors},{args.endpoint}".strip(",") + + web_config = self.config.get("web", {}) + if args.web_port is not None: + web_config["host"] = args.endpoint.split(":", 1)[0] + web_config["port"] = int(args.web_port) + self.config["web"] = web_config + + return args + + async def create_actor_pool(self): + return await create_supervisor_actor_pool( + self.args.endpoint, + n_process=int(self.args.n_process), + ports=self.ports, + modules=self.args.load_modules, + logging_conf=self.logging_conf, + subprocess_start_method="forkserver" if os.name != "nt" else "spawn", + metrics=self.config.get("metrics", {}), + oscar_config=self.config.get("oscar"), + ) + + async def start_services(self): + start_web = await start_supervisor( + self.pool.external_address, + self.args.supervisors, + self.args.load_modules, + self.config, + ) + if start_web: + from ...services.web.supervisor import WebActor + + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=self.pool.external_address + ) + web_address = await web_actor.get_web_address() + else: # pragma: no cover + web_address = "" + logger.warning( + "Supervisor started at %s, web address: %s", + self.pool.external_address, + web_address, + ) + + async def stop_services(self): + if self._endpoint_file_name is not None: # pragma: no branch + try: + os.unlink(self._endpoint_file_name) + except OSError: # pragma: no cover + pass + return await stop_supervisor(self.pool.external_address, self.config) + + +main = SupervisorCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/oscar/tests/__init__.py b/python/xorbits/_mars/deploy/oscar/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
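As a quick way to exercise the new supervisor and worker entry points, the sketch below mirrors the command lines driven by test_cmdline_run later in this diff. The module paths, the flags, and the new_session call are copied from that test; the concrete port numbers, the single worker, and the sleep-based wait are illustrative assumptions, not values fixed by this change.

import subprocess
import sys
import time

from xorbits._mars.session import new_session  # top-level re-export assumed from this diff's layout

supervisor_ep = "127.0.0.1:7103"  # assumed free port
web_port = "7104"                 # assumed free port

# One supervisor exposing a web UI, plus one worker attached to it.
sv_proc = subprocess.Popen(
    [
        sys.executable, "-m", "mars.deploy.oscar.supervisor",
        "-e", supervisor_ep,
        "-w", web_port,
        "--n-process=2",
        "--log-level=DEBUG",
    ]
)
worker_proc = subprocess.Popen(
    [
        sys.executable, "-m", "mars.deploy.oscar.worker",
        "-s", supervisor_ep,
        "--log-level=DEBUG",
    ]
)

time.sleep(5)  # crude wait, good enough for a manual check
session = new_session(f"http://127.0.0.1:{web_port}")

The tests themselves avoid the fixed sleep: _wait_supervisor_ready reads the endpoint file written by the supervisor process, and _wait_worker_ready polls ClusterAPI.get_nodes_info until both roles are registered.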
diff --git a/python/xorbits/_mars/deploy/oscar/tests/check_enabled_config.yml b/python/xorbits/_mars/deploy/oscar/tests/check_enabled_config.yml new file mode 100644 index 000000000..8f2c42f84 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/check_enabled_config.yml @@ -0,0 +1,7 @@ +"@inherits": '@default' +task: + default_config: + reserved_finish_tasks: 0 + task_preprocessor_cls: mars.services.task.supervisor.tests.CheckedTaskPreprocessor +subtask: + subtask_processor_cls: mars.services.subtask.worker.tests.CheckedSubtaskProcessor diff --git a/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config.yml b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config.yml new file mode 100644 index 000000000..efa1aafd1 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config.yml @@ -0,0 +1,3 @@ +"@inherits": '@default' +third_party_modules: + - mars.services.tests.fault_injection_patch diff --git a/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config_with_rerun.yml b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config_with_rerun.yml new file mode 100644 index 000000000..e65836240 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/fault_injection_config_with_rerun.yml @@ -0,0 +1,9 @@ +"@inherits": '@default' +third_party_modules: + - mars.services.tests.fault_injection_patch +scheduling: + subtask_max_retries: 2 + subtask_max_reschedules: 2 +storage: + # shared-memory38 may lose object if the process crash after put success. + backends: [plasma] diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_config.yml new file mode 100644 index 000000000..7a16779da --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_config.yml @@ -0,0 +1,7 @@ +"@inherits": '@default' +session: + custom_log_dir: auto + plasma: + store_memory: 32M +scheduling: + mem_hard_limit: 0 diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_with_ray_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_ray_config.yml new file mode 100644 index 000000000..3b7a9646e --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_ray_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@mars/deploy/oscar/rayconfig.yml' +session: + custom_log_dir: auto +scheduling: + subtask_max_retries: 0 + subtask_max_reschedules: 0 diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_with_third_parity_modules_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_third_parity_modules_config.yml new file mode 100644 index 000000000..be0d7ba38 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_third_parity_modules_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@default' +third_party_modules: + supervisor: + - mars.deploy.oscar.tests.modules.output_pid + worker: + - mars.deploy.oscar.tests.modules.output_pid diff --git a/python/xorbits/_mars/deploy/oscar/tests/local_test_with_vineyard_config.yml b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_vineyard_config.yml new file mode 100644 index 000000000..ee9e80890 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/local_test_with_vineyard_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@mars/deploy/oscar/base_config.yml' +session: + custom_log_dir: auto + +storage: + backends: [vineyard] diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/__init__.py b/python/xorbits/_mars/deploy/oscar/tests/modules/__init__.py new file mode 100644 index 
000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/check_ray_remote_function_options.py b/python/xorbits/_mars/deploy/oscar/tests/modules/check_ray_remote_function_options.py new file mode 100644 index 000000000..2e24192b9 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/check_ray_remote_function_options.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ray + +original_remote_function_options = ray.remote_function.RemoteFunction.options + + +def _wrap_original_remote_function_options(*args, **kwargs): + assert kwargs["num_cpus"] == 5, "expect num_cpus==5" + return original_remote_function_options(*args, **kwargs) + + +ray.remote_function.RemoteFunction.options = _wrap_original_remote_function_options diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/output_pid.py b/python/xorbits/_mars/deploy/oscar/tests/modules/output_pid.py new file mode 100644 index 000000000..0ec15a47d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/output_pid.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +output_dir = os.path.join(tempfile.gettempdir(), "test_inject_module_output") + +os.makedirs(output_dir, exist_ok=True) + +with open(os.path.join(output_dir, f"{os.getpid()}"), "w") as f: + f.write("") diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/replace_op.py b/python/xorbits/_mars/deploy/oscar/tests/modules/replace_op.py new file mode 100644 index 000000000..b52e1cc32 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/replace_op.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .....tensor.arithmetic.add import TensorAdd + + +def _replace_op(ctx, op): + # change the op from TensorAdd to TensorSubtract. + type(op)._func_name = "subtract" + executor = type(op).execute + return executor(ctx, op) + + +TensorAdd.register_executor(_replace_op) diff --git a/python/xorbits/_mars/deploy/oscar/tests/modules/utils.py b/python/xorbits/_mars/deploy/oscar/tests/modules/utils.py new file mode 100644 index 000000000..371f4fd6e --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/modules/utils.py @@ -0,0 +1,31 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile + +import pytest + + +@pytest.fixture +def cleanup_third_party_modules_output(): + output_dir = os.path.join(tempfile.gettempdir(), "test_inject_module_output") + shutil.rmtree(output_dir, ignore_errors=True) + yield + shutil.rmtree(output_dir, ignore_errors=True) + + +def get_output_filenames(): + return os.listdir(os.path.join(tempfile.gettempdir(), "test_inject_module_output")) diff --git a/python/xorbits/_mars/deploy/oscar/tests/ray_test_with_third_parity_modules_config.yml b/python/xorbits/_mars/deploy/oscar/tests/ray_test_with_third_parity_modules_config.yml new file mode 100644 index 000000000..0795a1b67 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/ray_test_with_third_parity_modules_config.yml @@ -0,0 +1,6 @@ +"@inherits": '@mars/deploy/oscar/rayconfig.yml' +third_party_modules: + supervisor: + - mars.deploy.oscar.tests.modules.output_pid + worker: + - mars.deploy.oscar.tests.modules.output_pid diff --git a/python/xorbits/_mars/deploy/oscar/tests/session.py b/python/xorbits/_mars/deploy/oscar/tests/session.py new file mode 100644 index 000000000..2226db362 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/session.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
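+# Helpers for result-checked test sessions:
+#  - CheckedSession extends _IsolatedSession and, unless disabled through
+#    extra_config, verifies each fetched result against its tileable's
+#    metadata via ObjectCheckMixin.assert_object_consistent.
+#  - new_test_session synchronously creates such a session, optionally
+#    starting a LocalCluster configured with check_enabled_config.yml so
+#    the checked task preprocessor and subtask processor are enabled.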
+ +import asyncio +import inspect +import os +import uuid + +from ....core import OBJECT_TYPE +from ....deploy.oscar.local import LocalClient, LocalCluster +from ....tests.core import ObjectCheckMixin, _check_args +from ..session import ( + AbstractSession, + AsyncSession, + _ensure_sync, + _IsolatedSession, + ensure_isolation_created, +) + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "check_enabled_config.yml") + + +class CheckedSession(ObjectCheckMixin, _IsolatedSession): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._tileable_checked = dict() + + check_options = dict() + for key in _check_args: + check_options[key] = kwargs.get(key, True) + self._check_options = check_options + + @staticmethod + def _extract_check_options(extra_config): + check_options = dict() + for key in _check_args: + check_options[key] = extra_config.pop(key, True) + return check_options + + def _process_result(self, tileable, result): + if self._check_options.get("check_all", True): + if ( + not isinstance(tileable, OBJECT_TYPE) + and tileable.key not in self._tileable_checked + ): + self.assert_object_consistent(tileable, result) + return super()._process_result(tileable, result) + + async def fetch(self, *tileables, **kwargs): + extra_config = kwargs.pop("extra_config", dict()) + if kwargs: + unexpected_keys = ", ".join(list(kwargs.keys())) + raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}") + + self._check_options = self._extract_check_options(extra_config) + results = await super().fetch(*tileables) + return results + + +async def _new_test_session( + address: str, + session_id: str = None, + backend: str = None, + default: bool = False, + new: bool = True, + timeout: float = None, + **kwargs, +) -> AbstractSession: + if session_id is None: + session_id = str(uuid.uuid4()) + + async def _get_checked_session(_address): + session = AsyncSession.from_isolated_session( + await CheckedSession.init( + _address, + session_id=session_id, + backend=backend, + new=new, + timeout=timeout, + **kwargs, + ) + ) + if default: + session.as_default() + return session + + async def _new_test_cluster_in_isolation(**new_cluster_kwargs): + cluster = LocalCluster(**new_cluster_kwargs) + await cluster.start() + session = await _get_checked_session(cluster.external_address) + client = LocalClient(cluster, session) + session.client = client + return client + + init_local = kwargs.pop("init_local", False) + if init_local: + if "n_cpu" not in kwargs: + # limit to 2 cpu each worker + kwargs["n_cpu"] = 2 * kwargs.get("n_worker", 1) + if "config" not in kwargs: + # enable check for task and subtask processor + kwargs["config"] = CONFIG_FILE + + sig = inspect.signature(LocalCluster) + new_cluster_params = {} + for k in sig.parameters: + if k in kwargs: + new_cluster_params[k] = kwargs.pop(k) + return ( + await _new_test_cluster_in_isolation( + address=address, backend=backend, **new_cluster_params + ) + ).session + return await _get_checked_session(address) + + +def new_test_session( + address: str = None, + session_id: str = None, + backend: str = None, + default: bool = False, + new: bool = True, + **kwargs, +): + isolation = ensure_isolation_created(kwargs) + if address is None: + address = "127.0.0.1" + if "init_local" not in kwargs: + kwargs["init_local"] = True + if "web" not in kwargs: + kwargs["web"] = False + backend = backend or os.environ.get("MARS_CI_BACKEND", "mars") + coro = _new_test_session( + address, + session_id=session_id, + backend=backend, + 
default=default, + new=new, + **kwargs, + ) + return _ensure_sync( + asyncio.run_coroutine_threadsafe(coro, isolation.loop).result(120) + ) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_checked_session.py b/python/xorbits/_mars/deploy/oscar/tests/test_checked_session.py new file mode 100644 index 000000000..0e141d36f --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_checked_session.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +import numpy as np +import pytest + +from .... import tensor as mt +from ....config import option_context +from ....core import OperandType, TileableType +from ....services.subtask.worker.tests import CheckedSubtaskProcessor +from ....services.task.supervisor.tests import CheckedTaskPreprocessor +from ..local import _load_config +from ..tests.session import CONFIG_FILE, new_test_session + + +class FakeCheckedTaskPreprocessor(CheckedTaskPreprocessor): + def _check_nsplits(self, tiled: TileableType): + raise RuntimeError("Premeditated") + + +class FakeCheckedSubtaskProcessor(CheckedSubtaskProcessor): + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if self._check_options.get("check_all", True): + raise RuntimeError("Premeditated") + else: + return super()._execute_operand(ctx, op) + + +@pytest.fixture(scope="module") +def setup(): + with option_context({"show_progress": False}): + yield + + +def test_checked_session(setup): + sess = new_test_session(default=True) + + a = mt.ones((10, 10)) + b = a + 1 + b.execute() + + np.testing.assert_array_equal(sess.fetch(b), np.ones((10, 10)) + 1) + + sess.stop_server() + + +def test_check_task_preprocessor(setup): + config = _load_config(CONFIG_FILE) + config["task"][ + "task_preprocessor_cls" + ] = "mars.deploy.oscar.tests.test_checked_session.FakeCheckedTaskPreprocessor" + + sess = new_test_session(default=True, config=config) + + a = mt.ones((10, 10)) + b = a + 1 + + with pytest.raises(RuntimeError, match="Premeditated"): + b.execute() + + # test test config + b.execute(extra_config={"check_nsplits": False}) + + sess.stop_server() + + +def test_check_subtask_processor(setup): + config = _load_config(CONFIG_FILE) + config["subtask"][ + "subtask_processor_cls" + ] = "mars.deploy.oscar.tests.test_checked_session.FakeCheckedSubtaskProcessor" + + sess = new_test_session(default=True, config=config) + + a = mt.ones((10, 10)) + b = a + 1 + + with pytest.raises(RuntimeError, match="Premeditated"): + b.execute() + + # test test config + b.execute(extra_config={"check_all": False}) + + sess.stop_server() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_clean_up_and_restore_func.py b/python/xorbits/_mars/deploy/oscar/tests/test_clean_up_and_restore_func.py new file mode 100644 index 000000000..8a47d2ce5 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_clean_up_and_restore_func.py @@ -0,0 +1,182 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +import pandas as pd +import pytest + +from .... import dataframe as md +from ....config import option_context +from ....core import OperandType, TileableGraph +from ....dataframe.base.apply import ApplyOperand +from ....services.subtask.worker.processor import SubtaskProcessor +from ....services.subtask.worker.tests import CheckedSubtaskProcessor +from ....services.task.supervisor.preprocessor import TaskPreprocessor +from ....services.task.supervisor.tests import CheckedTaskPreprocessor +from ....utils import lazy_import +from ..local import _load_config as _load_mars_config +from ..tests.session import CONFIG_FILE, new_test_session + +ray = lazy_import("ray") + + +class MarsBackendFuncCheckedTaskPreprocessor(CheckedTaskPreprocessor): + def tile(self, tileable_graph: TileableGraph): + ops = [t.op for t in tileable_graph if isinstance(t.op, ApplyOperand)] + for op in ops: + assert hasattr(op, "func_key") + assert op.func_key is None + assert op.func is not None + assert callable(op.func) + assert op.need_clean_up_func is False + result = super().tile(tileable_graph) + for op in ops: + assert hasattr(op, "func_key") + assert op.func_key is None + if op.need_clean_up_func: + assert isinstance(op.func, bytes) + else: + assert callable(op.func) + return result + + +class MarsBackendFuncCheckedSubtaskProcessor(CheckedSubtaskProcessor): + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if isinstance(op, ApplyOperand): + assert hasattr(op, "func_key") + assert op.func_key is None + if op.need_clean_up_func: + assert isinstance(op.func, bytes) + else: + assert callable(op.func) + result = super()._execute_operand(ctx, op) + assert op.func is not None + assert callable(op.func) + return result + else: + return super()._execute_operand(ctx, op) + + +class RayBackendFuncTaskPreprocessor(TaskPreprocessor): + def tile(self, tileable_graph: TileableGraph): + ops = [t.op for t in tileable_graph if isinstance(t.op, ApplyOperand)] + for op in ops: + assert hasattr(op, "func_key") + assert op.func_key is None + assert op.func is not None + assert callable(op.func) + assert op.need_clean_up_func is False + result = super().tile(tileable_graph) + for op in ops: + assert hasattr(op, "func_key") + if op.need_clean_up_func: + assert op.func is None + assert isinstance(op.func_key, ray.ObjectRef) + else: + assert callable(op.func) + assert op.func_key is None + return result + + +class RayBackendFuncSubtaskProcessor(SubtaskProcessor): + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if isinstance(op, ApplyOperand): + assert hasattr(op, "func_key") + if op.need_clean_up_func: + assert op.func is None + assert isinstance(op.func_key, ray.ObjectRef) + else: + assert callable(op.func) + assert op.func_key is None + result = super()._execute_operand(ctx, op) + assert op.func is not None + assert callable(op.func) + return result + else: + return super()._execute_operand(ctx, op) + + 
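+# The checker classes above assert the clean-up/restore behaviour for
+# DataFrame.apply closures: with the Mars backend, an operand whose
+# need_clean_up_func flag is set has its func serialized to bytes after
+# tiling and restored to a callable inside the subtask; with the Ray
+# backend, the func is instead offloaded to the Ray object store and
+# referenced through func_key.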
+@pytest.fixture(scope="module") +def setup(): + with option_context({"show_progress": False}): + yield + + +def test_mars_backend_clean_up_and_restore_func(setup): + config = _load_mars_config(CONFIG_FILE) + config["task"][ + "task_preprocessor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedTaskPreprocessor" + config["subtask"][ + "subtask_processor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedSubtaskProcessor" + + sess = new_test_session(default=True, config=config) + + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(df_raw, chunk_size=5) + + x_small = pd.Series([i for i in range(10)]) + y_small = pd.Series([i for i in range(10)]) + x_large = pd.Series([i for i in range(10**4)]) + y_large = pd.Series([i for i in range(10**4)]) + + def closure_small(z): + return pd.concat([x_small, y_small], ignore_index=True) + + def closure_large(z): + return pd.concat([x_large, y_large], ignore_index=True) + + r_small = df.apply(closure_small, axis=1) + r_small.execute() + r_large = df.apply(closure_large, axis=1) + r_large.execute() + + sess.stop_server() + + +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +def test_clean_up_and_restore_callable(setup, multiplier): + config = _load_mars_config(CONFIG_FILE) + config["task"][ + "task_preprocessor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedTaskPreprocessor" + config["subtask"][ + "subtask_processor_cls" + ] = "mars.deploy.oscar.tests.test_clean_up_and_restore_func.MarsBackendFuncCheckedSubtaskProcessor" + + sess = new_test_session(default=True, config=config) + + cols = [chr(ord("A") + i) for i in range(10)] + df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(df_raw, chunk_size=5) + + class callable_df: + __slots__ = "x", "__dict__" + + def __init__(self, multiplier: int = 1): + self.x = pd.Series([i for i in range(10**multiplier)]) + self.y = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, pdf): + return pd.concat([self.x, self.y], ignore_index=True) + + cdf = callable_df(multiplier=multiplier) + + r_callable = df.apply(cdf, axis=1) + r_callable.execute() + + sess.stop_server() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py b/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py new file mode 100644 index 000000000..4aba6b9f2 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py @@ -0,0 +1,403 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import asyncio +import glob +import json +import logging +import os +import subprocess +import sys +import tempfile +import time +from concurrent import futures +from typing import List + +import numpy as np +import psutil +import pytest + +from .... 
import tensor as mt +from ....lib.aio import get_isolation, new_isolation, stop_isolation +from ....services import NodeRole +from ....services.cluster import ClusterAPI +from ....session import new_session +from ....tests import flaky +from ....utils import clean_mars_tmp_dir, get_next_port +from ..cmdline import OscarCommandRunner +from ..supervisor import SupervisorCommandRunner +from ..worker import WorkerCommandRunner + +logger = logging.getLogger(__name__) + + +class _ProcessExitedException(Exception): + pass + + +def _wait_supervisor_ready(supervisor_proc: subprocess.Popen, timeout=120): + start_time = time.time() + supervisor_pid = supervisor_proc.pid + while True: + if supervisor_proc.poll() is not None: + raise _ProcessExitedException + + try: + ep_file_name = OscarCommandRunner._build_endpoint_file_path( + pid=supervisor_pid + ) + with open(ep_file_name, "r") as ep_file: + return ep_file.read().strip() + except: # noqa: E722 # pylint: disable=bare-except + if time.time() - start_time > timeout: + raise + pass + finally: + time.sleep(0.1) + + +def _wait_worker_ready( + supervisor_addr, worker_procs: List[subprocess.Popen], n_supervisors=1, timeout=30 +): + async def wait_for_workers(): + start_time = time.time() + while True: + if any(proc.poll() is not None for proc in worker_procs): + raise _ProcessExitedException + + try: + cluster_api = await ClusterAPI.create(supervisor_addr) + sv_info = await cluster_api.get_nodes_info( + role=NodeRole.SUPERVISOR, resource=True + ) + worker_info = await cluster_api.get_nodes_info( + role=NodeRole.WORKER, resource=True + ) + if len(sv_info) >= n_supervisors and len(worker_info) >= len( + worker_procs + ): + break + + logger.info( + "Cluster not satisfied. sv_num=%s worker_num=%s", + len(sv_info), + len(worker_info), + ) + except: # noqa: E722 # pylint: disable=bare-except + logger.exception("Error when waiting for workers to start") + if time.time() - start_time > timeout: + raise + pass + finally: + await asyncio.sleep(0.5) + + isolation = get_isolation() + asyncio.run_coroutine_threadsafe(wait_for_workers(), isolation.loop).result(timeout) + + +_test_port_cache = dict() + + +def _get_labelled_port(label=None, create=True): + test_name = os.environ["PYTEST_CURRENT_TEST"] + if (test_name, label) not in _test_port_cache: + if create: + _test_port_cache[(test_name, label)] = get_next_port(occupy=True) + else: + return None + return _test_port_cache[(test_name, label)] + + +def _stop_processes(procs: List[subprocess.Popen]): + sub_ps_procs = [] + for proc in procs: + if not proc: + continue + + sub_ps_procs.extend(psutil.Process(proc.pid).children(recursive=True)) + proc.terminate() + + for proc in procs: + try: + proc.wait(10) + except subprocess.TimeoutExpired: + pass + + for ps_proc in sub_ps_procs + procs: + try: + ps_proc.kill() + except psutil.NoSuchProcess: + pass + + +supervisor_cmd_start = [sys.executable, "-m", "mars.deploy.oscar.supervisor"] +worker_cmd_start = [sys.executable, "-m", "mars.deploy.oscar.worker"] + + +def _reload_args(args): + return [arg if not callable(arg) else arg() for arg in args] + + +_rerun_errors = ( + _ProcessExitedException, + asyncio.TimeoutError, + futures.TimeoutError, + OSError, + TimeoutError, +) + + +@flaky(max_runs=10, rerun_filter=lambda err, *_: issubclass(err[0], _rerun_errors)) +@pytest.mark.parametrize( + "supervisor_args,worker_args,use_web_addr", + [ + pytest.param( + supervisor_cmd_start, + worker_cmd_start + + [ + "--config-file", + os.path.join(os.path.dirname(__file__), 
"local_test_config.yml"), + ], + False, + id="bare_start", + ), + pytest.param( + supervisor_cmd_start + + [ + "-e", + lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}', + "-w", + lambda: str(_get_labelled_port("web")), + "--n-process=2", + "--log-level=DEBUG", + ], + worker_cmd_start + + [ + "-e", + lambda: f"127.0.0.1:{get_next_port(occupy=True)}", + "-s", + lambda: f'127.0.0.1:{_get_labelled_port("supervisor")}', + "--config-file", + os.path.join(os.path.dirname(__file__), "local_test_config.yml"), + "--log-level=DEBUG", + "--log-format=%(asctime)s %(message)s", + "--use-uvloop=no", + ], + True, + id="with_supervisors", + ), + ], +) +def test_cmdline_run(supervisor_args, worker_args, use_web_addr): + new_isolation() + sv_proc = w_procs = None + restart_trial = 5 + try: + env = os.environ.copy() + env["MARS_CPU_TOTAL"] = "2" + + for trial in range(restart_trial): + logger.warning("Cluster start attempt %d / %d", trial + 1, restart_trial) + _test_port_cache.clear() + + sv_args = _reload_args(supervisor_args) + sv_proc = subprocess.Popen(sv_args, env=env) + + oscar_port = _get_labelled_port("supervisor", create=False) + if not oscar_port: + oscar_ep = _wait_supervisor_ready(sv_proc) + else: + oscar_ep = f"127.0.0.1:{oscar_port}" + + if use_web_addr: + host = oscar_ep.rsplit(":", 1)[0] + api_ep = f'http://{host}:{_get_labelled_port("web", create=False)}' + else: + api_ep = oscar_ep + + w_procs = [] + for idx in range(2): + proc = subprocess.Popen(_reload_args(worker_args), env=env) + w_procs.append(proc) + # make sure worker ports does not collide + time.sleep(2) + + try: + _wait_worker_ready(oscar_ep, w_procs) + break + except (asyncio.TimeoutError, futures.TimeoutError, TimeoutError): + if trial == restart_trial - 1: + raise + else: + _stop_processes(w_procs + [sv_proc]) + + new_session(api_ep) + data = np.random.rand(10, 10) + res = mt.tensor(data, chunk_size=5).sum().execute().fetch() + np.testing.assert_almost_equal(res, data.sum()) + finally: + stop_isolation() + + ep_file_name = OscarCommandRunner._build_endpoint_file_path(pid=sv_proc.pid) + try: + os.unlink(ep_file_name) + except OSError: + pass + + _stop_processes((w_procs or []) + [sv_proc]) + + port_prefix = os.path.join( + tempfile.gettempdir(), OscarCommandRunner._port_file_prefix + ) + for fn in glob.glob(port_prefix + "*"): + os.unlink(fn) + + +def test_parse_args(): + parser = argparse.ArgumentParser(description="TestService") + app = WorkerCommandRunner() + app.config_args(parser) + + task_detail = """ + { + "cluster": { + "supervisor": ["sv1", "sv2"], + "worker": ["worker1", "worker2"] + }, + "task": { + "type": "worker", + "index": 0 + } + } + """ + + env = { + "MARS_LOAD_MODULES": "extra.module", + "MARS_TASK_DETAIL": task_detail, + "MARS_CACHE_MEM_SIZE": "20M", + "MARS_PLASMA_DIRS": "/dev/shm", + "MARS_SPILL_DIRS": "/tmp", + } + args = app.parse_args(parser, ["-p", "10324"], env) + assert args.host == "worker1" + assert args.endpoint == "worker1:10324" + assert args.supervisors == "sv1,sv2" + assert "extra.module" in args.load_modules + assert app.config["storage"]["plasma"] == { + "store_memory": "20M", + "plasma_directory": "/dev/shm", + } + assert app.config["storage"]["disk"] == { + "root_dirs": "/tmp", + } + + +@pytest.fixture +def init_app(): + parser = argparse.ArgumentParser(description="TestService") + app = WorkerCommandRunner() + app.config_args(parser) + yield app, parser + + # clean + clean_mars_tmp_dir() + + +def test_parse_no_log_dir(init_app): + app, parser = init_app + + assert not app.config + 
assert len(app.config) == 0 + + with pytest.raises(KeyError): + try: + app._set_log_dir() + except ValueError: + pytest.fail() + + _ = app.parse_args(parser, ["--supervisors", "127.0.0.1"]) + assert app.config["cluster"] + assert not app.config["cluster"]["log_dir"] + app._set_log_dir() + assert app.logging_conf["from_cmd"] is True + assert not app.logging_conf["log_dir"] + + +def test_parse_log_dir(init_app): + app, parser = init_app + log_dir = tempfile.mkdtemp() + _ = app.parse_args(parser, ["--supervisors", "127.0.0.1"]) + app.config["cluster"]["log_dir"] = log_dir + assert os.path.exists(app.config["cluster"]["log_dir"]) + app._set_log_dir() + assert app.logging_conf["log_dir"] == log_dir + + +def test_config_logging(init_app): + app, parser = init_app + app.args = app.parse_args(parser, ["--supervisors", "127.0.0.1"]) + app.config_logging() + expected_path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "file-logging.conf" + ) + assert app.logging_conf["file"] == expected_path + + +def test_parse_third_party_modules(): + config = { + "third_party_modules": { + "supervisor": ["supervisor.module"], + "worker": ["worker.module"], + } + } + env = {"MARS_LOAD_MODULES": "extra.module"} + + parser = argparse.ArgumentParser(description="TestService") + app = WorkerCommandRunner() + app.config_args(parser) + args = app.parse_args( + parser, + [ + "-c", + json.dumps(config), + "-p", + "10324", + "-s", + "sv1,sv2", + "--load-modules", + "load.module", + ], + env, + ) + assert args.load_modules == ("load.module", "worker.module", "extra.module") + + parser = argparse.ArgumentParser(description="TestService") + app = SupervisorCommandRunner() + app.config_args(parser) + args = app.parse_args( + parser, + [ + "-c", + json.dumps(config), + "-p", + "10324", + "-s", + "sv1,sv2", + "--load-modules", + "load.module", + ], + env, + ) + assert args.load_modules == ("load.module", "supervisor.module", "extra.module") diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_fault_injection.py b/python/xorbits/_mars/deploy/oscar/tests/test_fault_injection.py new file mode 100644 index 000000000..a6dab9cdf --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_fault_injection.py @@ -0,0 +1,334 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import traceback + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....oscar.errors import ServerClosed +from ....remote import spawn +from ....services.tests.fault_injection_manager import ( + AbstractFaultInjectionManager, + ExtraConfigKey, + FaultInjectionError, + FaultInjectionUnhandledError, + FaultPosition, + FaultType, +) +from ....tensor.base.psrs import PSRSConcatPivot +from ..local import new_cluster +from ..session import get_default_async_session + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "fault_injection_config.yml") +RERUN_SUBTASK_CONFIG_FILE = os.path.join( + os.path.dirname(__file__), "fault_injection_config_with_rerun.yml" +) + + +@pytest.fixture +async def fault_cluster(request): + param = getattr(request, "param", {}) + start_method = os.environ.get("POOL_START_METHOD", None) + client = await new_cluster( + subprocess_start_method=start_method, + config=param.get("config", CONFIG_FILE), + n_worker=2, + n_cpu=2, + ) + async with client: + yield client + + +async def create_fault_injection_manager( + session_id, address, fault_count, fault_type, fault_op_types=None +): + class FaultInjectionManager(AbstractFaultInjectionManager): + def __init__(self): + self._fault_count = fault_count + + def set_fault_count(self, count): + self._fault_count = count + + def get_fault_count(self): + return self._fault_count + + def get_fault(self, pos: FaultPosition, ctx=None) -> FaultType: + # Check op types if fault_op_types provided. + if fault_op_types and type(ctx.get("operand")) not in fault_op_types: + return FaultType.NoFault + if self._fault_count.get(pos, 0) > 0: + self._fault_count[pos] -= 1 + return fault_type + return FaultType.NoFault + + await FaultInjectionManager.create(session_id, address) + return FaultInjectionManager.name + + +@pytest.mark.parametrize( + "fault_and_exception", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + True, + ], + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises( + FaultInjectionUnhandledError, match="Fault Injection Unhandled" + ), + True, + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + False, # The ServerClosed raised from current process directly. + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + False, + ], + ], +) +@pytest.mark.asyncio +async def test_fault_inject_subtask_processor(fault_cluster, fault_and_exception): + fault_type, fault_count, first_run_raises, check_error_prefix = fault_and_exception + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + with first_run_raises as ex: + b.execute(extra_config=extra_config) + + if check_error_prefix: + assert str(ex.value).count("address") == 1 + assert str(ex.value).count("pid") == 1 + + # execute again may raise an ConnectionRefusedError if the + # ProcessExit occurred. 
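+    # For that reason the tensor is intentionally not executed a second time here.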
+ + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask(fault_cluster, fault_config): + fault_type, fault_count, expect_raises = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + session = get_default_async_session() + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + info = await session.execute(b, extra_config=extra_config) + await info + assert info.result() is None + assert info.exception() is None + + r = await session.fetch(b) + np.testing.assert_array_equal(r, raw + 1) + + fault_injection_manager = await session.get_remote_object( + fault_cluster.session.session_id, name + ) + await fault_injection_manager.set_fault_count({FaultPosition.ON_EXECUTE_OPERAND: 1}) + + # the extra config overwrites the default config. + extra_config["subtask_max_retries"] = 0 + extra_config["subtask_max_reschedules"] = 0 + info = await session.execute(b, extra_config=extra_config) + with expect_raises: + await info + + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [FaultType.Exception, {FaultPosition.ON_EXECUTE_OPERAND: 1}, [PSRSConcatPivot]], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + [PSRSConcatPivot], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_describe(fault_cluster, fault_config): + fault_type, fault_count, fault_op_types = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + fault_op_types=fault_op_types, + ) + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + session = get_default_async_session() + + s = np.random.RandomState(0) + raw = pd.DataFrame(s.rand(100, 4), columns=list("abcd")) + df = md.DataFrame(raw, chunk_size=30) + + r = df.describe() + info = await session.execute(r, extra_config=extra_config) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + res = await session.fetch(r) + pd.testing.assert_frame_equal(res, raw.describe()) + + fault_injection_manager = await session.get_remote_object( + fault_cluster.session.session_id, name + ) + remain_fault_count = await fault_injection_manager.get_fault_count() + for key in fault_count: + assert remain_fault_count[key] == 0 + + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionUnhandledError), + ["_UnhandledException", "handle_fault"], + ], + [ + FaultType.Exception, + 
{FaultPosition.ON_EXECUTE_OPERAND: 100}, + pytest.raises(FaultInjectionError), + ["_ExceedMaxRerun", "handle_fault"], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_fail(fault_cluster, fault_config): + fault_type, fault_count, expect_raises, exception_match = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + exception_typename, stack_string = exception_match + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + with expect_raises as e: + b.execute(extra_config=extra_config) + + tb_str = "".join(traceback.format_tb(e.tb)) + assert e.value.__wrapname__ == exception_typename, tb_str + assert e.traceback[-1].name == stack_string, tb_str + + +@pytest.mark.parametrize( + "fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="RemoteFunction"), + ["_UnretryableException", "handle_fault"], + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ["_UnretryableException", "*"], + ], + ], +) +@pytest.mark.asyncio +async def test_retryable(fault_cluster, fault_config): + fault_type, fault_count, expect_raises, exception_match = fault_config + name = await create_fault_injection_manager( + session_id=fault_cluster.session.session_id, + address=fault_cluster.session.address, + fault_count=fault_count, + fault_type=fault_type, + ) + exception_typename, stack_string = exception_match + extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name} + + def f(x): + return x + 1 + + r = spawn(f, args=(1,), retry_when_fail=False) + with expect_raises as e: + r.execute(extra_config=extra_config) + + tb_str = "".join(traceback.format_tb(e.tb)) + assert e.value.__wrapname__ == exception_typename, tb_str + assert stack_string == "*" or e.traceback[-1].name == stack_string, tb_str diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_local.py b/python/xorbits/_mars/deploy/oscar/tests/test_local.py new file mode 100644 index 000000000..593e652b3 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_local.py @@ -0,0 +1,1326 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import gc +import os +import subprocess +import sys +import tempfile +import textwrap +import threading +import time +import uuid +import weakref + +import numpy as np +import pandas as pd +import psutil +import pytest + +try: + import vineyard +except ImportError: + vineyard = None + +from .... import dataframe as md +from .... import remote as mr +from .... 
import tensor as mt +from ....config import option_context +from ....core.context import get_context +from ....lib.aio import new_isolation +from ....oscar.backends.router import Router +from ....services.storage import StorageAPI +from ....services.task.supervisor.task import TaskProcessor +from ....storage import StorageLevel +from ....tensor.arithmetic.add import TensorAdd +from ....tests.core import DICT_NOT_EMPTY, check_dict_structure_same, mock, require_cupy +from ....utils import lazy_import +from ..local import _load_config, new_cluster +from ..session import ( + AsyncSession, + ExecutionInfo, + Profiling, + Progress, + _execute_with_progress, + _IsolatedWebSession, + clear_default_session, + execute, + fetch, + fetch_infos, + get_default_async_session, + get_default_session, + new_session, + stop_server, +) +from ..tests.session import new_test_session +from .modules.utils import ( # noqa: F401; pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +CONFIG_TEST_FILE = os.path.join(os.path.dirname(__file__), "local_test_config.yml") + +CONFIG_VINEYARD_TEST_FILE = os.path.join( + os.path.dirname(__file__), "local_test_with_vineyard_config.yml" +) + + +CONFIG_THIRD_PARTY_MODULES_TEST_FILE = os.path.join( + os.path.dirname(__file__), "local_test_with_third_parity_modules_config.yml" +) + +EXPECT_PROFILING_STRUCTURE = { + "supervisor": { + "general": { + "optimize": 0.0005879402160644531, + "incref_fetch_tileables": 0.0010840892791748047, + "stage_*": { + "tile(*)": 0.008243083953857422, + "gen_subtask_graph(*)": 0.012202978134155273, + "run": 0.27870702743530273, + "total": 0.30318617820739746, + }, + "total": 0.30951380729675293, + }, + "serialization": {}, + "most_calls": DICT_NOT_EMPTY, + "slow_calls": DICT_NOT_EMPTY, + "band_subtasks": DICT_NOT_EMPTY, + "slow_subtasks": DICT_NOT_EMPTY, + } +} +EXPECT_PROFILING_STRUCTURE_NO_SLOW = copy.deepcopy(EXPECT_PROFILING_STRUCTURE) +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_calls"] = {} +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_subtasks"] = {} + +params = ["default"] +if vineyard is not None: + params.append("vineyard") + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture(params=params) +async def create_cluster(request): + if request.param == "default": + config = CONFIG_TEST_FILE + elif request.param == "vineyard": + config = CONFIG_VINEYARD_TEST_FILE + else: + config = None + start_method = os.environ.get("POOL_START_METHOD", None) + client = await new_cluster( + subprocess_start_method=start_method, + config=config, + n_worker=2, + n_cpu=4, + use_uvloop=False, + ) + async with client: + if request.param == "default": + assert client.session.client is not None + yield client, request.param + + +def _assert_storage_cleaned(session_id: str, addr: str, level: StorageLevel): + async def _assert(session_id: str, addr: str, level: StorageLevel): + storage_api = await StorageAPI.create(session_id, addr) + assert len(await storage_api.list(level)) == 0 + info = await storage_api.get_storage_level_info(level) + assert info.used_size == 0 + + isolation = new_isolation() + asyncio.run_coroutine_threadsafe( + _assert(session_id, addr, level), isolation.loop + ).result() + + +@pytest.mark.parametrize("backend", ["mars"]) +@pytest.mark.parametrize("_new_session", [new_session, new_test_session]) +def test_new_session_backend(_new_session, backend): + from ....services.task.execution.api import _name_to_config_cls + + config_cls = _name_to_config_cls[backend] + 
original_config_init = config_cls.__init__ + original_deploy_band_resources = config_cls.get_deploy_band_resources + with mock.patch.object( + config_cls, "__init__", autospec=True + ) as config_init, mock.patch.object( + config_cls, "get_deploy_band_resources", autospec=True + ) as deploy_band_resources: + return_deploy_band_resources = [] + + def _wrap_original_deploy_band_resources(*args, **kwargs): + nonlocal return_deploy_band_resources + return_deploy_band_resources = original_deploy_band_resources( + *args, **kwargs + ) + return return_deploy_band_resources + + config_init.side_effect = original_config_init + deploy_band_resources.side_effect = _wrap_original_deploy_band_resources + sess = _new_session( + backend=backend, n_cpu=2, web=False, use_uvloop=False, default=True + ) + try: + assert config_init.call_count > 0 + assert deploy_band_resources.call_count > 0 + worker_pools = sess.default.client._cluster._worker_pools + assert len(worker_pools) == len(return_deploy_band_resources) + a = mt.ones((10, 10)) + b = a + 1 + res = b.to_numpy() + np.testing.assert_array_equal(res, np.ones((10, 10)) + 1) + finally: + sess.stop_server() + + assert get_default_async_session() is None + + +@pytest.mark.asyncio +async def test_vineyard_operators(create_cluster): + param = create_cluster[1] + if param != "vineyard": + pytest.skip("Vineyard is not enabled") + + session = get_default_async_session() + + # tensor + raw = np.random.RandomState(0).rand(55, 55) + a = mt.tensor(raw, chunk_size=15) + info = await session.execute(a) # n.b.: pre-execute + await info + + b = mt.to_vineyard(a) + info = await session.execute(b) + await info + object_id = (await session.fetch(b))[0] + + c = mt.from_vineyard(object_id) + info = await session.execute(c) + await info + tensor = await session.fetch(c) + np.testing.assert_allclose(tensor, raw) + + # dataframe + raw = pd.DataFrame({"a": np.arange(0, 55), "b": np.arange(55, 110)}) + a = md.DataFrame(raw, chunk_size=15) + b = a.to_vineyard() # n.b.: no pre-execute + info = await session.execute(b) + await info + object_id = (await session.fetch(b))[0][0] + + c = md.from_vineyard(object_id) + info = await session.execute(c) + await info + df = await session.fetch(c) + pd.testing.assert_frame_equal(df, raw) + + +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_execute(create_cluster, config): + session = get_default_async_session() + assert session.address is not None + assert session.session_id is not None + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + extra_config, expect_profiling_structure = config + + info = await session.execute(b, extra_config=extra_config) + await info + if extra_config: + check_dict_structure_same(info.profiling_result(), expect_profiling_structure) + else: + assert not info.profiling_result() + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + np.testing.assert_equal(raw + 1, await session.fetch(b)) + + with pytest.raises(ValueError): + await session.fetch(b + 1) + + with pytest.raises(ValueError): + await session.fetch(b[b < 0.6]) + + del a, b + + if ( + not 
isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_iterative_tiling(create_cluster): + session = get_default_async_session() + + raw = np.random.RandomState(0).rand(30, 5) + raw_df = pd.DataFrame(raw, index=np.arange(1, 31)) + + df = md.DataFrame(raw_df, chunk_size=10) + df = df[df[0] < 0.7] + df2 = df.shift(2) + + info = await session.execute(df2) + await info + assert info.result() is None + result = await session.fetch(df2) + + expected = raw_df[raw_df[0] < 0.7].shift(2) + pd.testing.assert_frame_equal(result, expected) + + # test meta + assert df2.index_value.min_val >= 1 + assert df2.index_value.max_val <= 30 + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_execute_describe(create_cluster): + s = np.random.RandomState(0) + raw = pd.DataFrame(s.rand(100, 4), columns=list("abcd")) + df = md.DataFrame(raw, chunk_size=30) + + session = get_default_async_session() + r = df.describe() + info = await session.execute(r) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + res = await session.fetch(r) + pd.testing.assert_frame_equal(res, raw.describe()) + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_execute_apply_closure(create_cluster): + # DataFrame + cols = [chr(ord("A") + i) for i in range(10)] + raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(raw, chunk_size=5) + + x1 = pd.Series([i for i in range(10**4)]) + y1 = pd.Series([i for i in range(10**4)]) + + def dataframe_closure(z1): + return pd.concat([x1, y1], ignore_index=True) + + session = get_default_async_session() + df_r = df.apply(dataframe_closure, axis=1) + df_info = await session.execute(df_r) + await df_info + assert df_info.result() is None + assert df_info.exception() is None + assert df_info.progress() == 1 + + df_result = await session.fetch(df_r) + df_expected = raw.apply(dataframe_closure, axis=1) + pd.testing.assert_frame_equal(df_result, df_expected) + + # Series + idxes = [chr(ord("A") + i) for i in range(20)] + s_raw = pd.Series([i**2 for i in range(20)], index=idxes) + + series = md.Series(s_raw, chunk_size=5) + + x2, y2 = 1, 2 + + def series_closure(z2): + return [z2 + x2, z2 + y2] + + series_r = series.apply(series_closure, convert_dtype=False) + series_info = await session.execute(series_r) + await series_info + assert series_info.result() is None + assert series_info.exception() is None + assert series_info.progress() == 1 + + series_result = await 
session.fetch(series_r) + series_expected = s_raw.apply(series_closure, convert_dtype=False) + pd.testing.assert_series_equal(series_result, series_expected) + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +async def test_execute_callable_closure(create_cluster, multiplier): + # DataFrame + cols = [chr(ord("A") + i) for i in range(10)] + raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols)) + df = md.DataFrame(raw, chunk_size=5) + + class callable_df: + __slots__ = "x", "__dict__" + + def __init__(self, multiplier: int = 1): + self.x = pd.Series([i for i in range(10**multiplier)]) + self.y = pd.Series([i for i in range(10**multiplier)]) + + def __call__(self, pdf): + return pd.concat([self.x, self.y], ignore_index=True) + + session = get_default_async_session() + cdf = callable_df(multiplier=multiplier) + df_r = df.apply(cdf, axis=1) + df_info = await session.execute(df_r) + await df_info + assert df_info.result() is None + assert df_info.exception() is None + assert df_info.progress() == 1 + + df_result = await session.fetch(df_r) + df_expected = raw.apply(cdf, axis=1) + pd.testing.assert_frame_equal(df_result, df_expected) + + if ( + not isinstance(session._isolated_session, _IsolatedWebSession) + and session.client + ): + worker_pools = session.client._cluster._worker_pools + await session.destroy() + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, + worker_pool.external_address, + StorageLevel.MEMORY, + ) + + +@pytest.mark.asyncio +async def test_sync_execute_in_async(create_cluster): + a = mt.ones((10, 10)) + b = a + 1 + res = b.to_numpy() + np.testing.assert_array_equal(res, np.ones((10, 10)) + 1) + + +@pytest.mark.asyncio +async def test_fetch_infos(create_cluster): + raw = np.random.RandomState(0).rand(30, 5) + raw_df = pd.DataFrame(raw, index=np.arange(1, 31)) + + df = md.DataFrame(raw_df, chunk_size=10) + df.execute() + fetched_infos = df.fetch_infos() + + assert "object_id" in fetched_infos + assert "level" in fetched_infos + assert "memory_size" in fetched_infos + assert "store_size" in fetched_infos + assert "bands" in fetched_infos + + fetched_infos = df.fetch_infos(fields=["object_id", "bands"]) + assert "object_id" in fetched_infos + assert "bands" in fetched_infos + assert len(fetched_infos) == 2 + + fetch_infos((df, df), fields=None) + results_infos = mr.ExecutableTuple([df, df]).execute()._fetch_infos() + assert len(results_infos) == 2 + assert "object_id" in results_infos[0] + assert "level" in results_infos[0] + assert "memory_size" in results_infos[0] + assert "store_size" in results_infos[0] + assert "bands" in results_infos[0] + + +async def _run_web_session_test(web_address): + session_id = str(uuid.uuid4()) + session = await AsyncSession.init(web_address, session_id) + session.as_default() + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + info = await session.execute(b) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + np.testing.assert_equal(raw + 1, await 
session.fetch(b)) + del a, b + + # Test spawn a local function by the web session. + def _my_func(): + print("output from function") + + r = mr.spawn(_my_func) + info = await session.execute(r) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + assert "output from function" in str(r.fetch_log(session=session)) + assert "output from function" in str( + r.fetch_log(session=session, offsets="0k", sizes=[1000]) + ) + assert "output from function" in str( + r.fetch_log(session=session, offsets={r.op.key: "0k"}, sizes=[1000]) + ) + + df = md.DataFrame([1, 2, 3]) + # Test apply a lambda by the web session. + r = df.apply(lambda x: x) + info = await session.execute(r) + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + pd.testing.assert_frame_equal(await session.fetch(r), pd.DataFrame([1, 2, 3])) + + AsyncSession.reset_default() + await session.destroy() + + +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_web_session(create_cluster, config): + client = create_cluster[0] + session_id = str(uuid.uuid4()) + web_address = client.web_address + session = await AsyncSession.init( + web_address, session_id, request_rewriter=lambda x: x + ) + assert await session.get_web_endpoint() == web_address + session.as_default() + assert isinstance(session._isolated_session, _IsolatedWebSession) + await test_execute(client, config) + await test_iterative_tiling(client) + AsyncSession.reset_default() + await session.destroy() + await _run_web_session_test(web_address) + + worker_pools = client._cluster._worker_pools + for worker_pool in worker_pools: + if hasattr(worker_pool, "external_address"): + _assert_storage_cleaned( + session.session_id, worker_pool.external_address, StorageLevel.MEMORY + ) + + +@pytest.mark.parametrize("config", [{"backend": "mars"}]) +def test_sync_execute(config): + session = new_session( + backend=config["backend"], n_cpu=2, web=False, use_uvloop=False + ) + + # web not started + assert session._session.client.web_address is None + assert session.get_web_endpoint() is None + + with session: + raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + result = a.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1)) + + c = b + 1 + c.execute(show_progress=False) + result = c.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1) + 1) + + c = mt.tensor(raw, chunk_size=5).sum() + d = session.execute(c) + assert d is c + assert abs(session.fetch(d) - raw.sum()) < 0.001 + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + pdf = pd.DataFrame( + np.random.RandomState(0).rand(100, 10), + columns=[f"col{i}" for i in range(10)], + ) + pdf.to_csv(file_path, index=False) + + df = md.read_csv( + file_path, + chunk_bytes=os.stat(file_path).st_size / 5, + incremental_index=True, + ) + result = df.sum(axis=1).execute().fetch() + expected = pd.read_csv(file_path).sum(axis=1) + pd.testing.assert_series_equal(result, expected) + + df = md.read_csv( + file_path, + 
chunk_bytes=os.stat(file_path).st_size / 5, + incremental_index=True, + ) + result = df.head(10).execute().fetch() + expected = pd.read_csv(file_path).head(10) + pd.testing.assert_frame_equal(result, expected) + + for worker_pool in session._session.client._cluster._worker_pools: + _assert_storage_cleaned( + session.session_id, worker_pool.external_address, StorageLevel.MEMORY + ) + + session.stop_server() + assert get_default_async_session() is None + + +def test_no_default_session(): + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + with pytest.warns(Warning): + execute(b, show_progress=False) + + np.testing.assert_array_equal(fetch(b), raw + 1) + fetch_infos(b, fields=None) + assert get_default_async_session() is not None + stop_server() + assert get_default_async_session() is None + + +@pytest.mark.asyncio +async def test_session_set_progress(create_cluster): + session = get_default_async_session() + assert session.address is not None + assert session.session_id is not None + + def f1(interval: float, count: int): + for idx in range(count): + time.sleep(interval) + get_context().set_progress((1 + idx) * 1.0 / count) + + r = mr.spawn(f1, args=(0.5, 10)) + + info = await session.execute(r) + + for _ in range(20): + if 0 < info.progress() < 1: + break + await asyncio.sleep(0.1) + else: + raise Exception(f"progress test failed, actual value {info.progress()}.") + + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + + +@pytest.mark.asyncio +async def test_session_get_progress(create_cluster): + session = get_default_async_session() + assert session.address is not None + assert session.session_id is not None + + raw = np.random.rand(100, 4) + t = mt.tensor(raw, chunk_size=50) + + def f1(c): + time.sleep(0.5) + return c + + t1 = t.sum() + t2 = t1.map_chunk(f1) + r = t2.map_chunk(f1) + info = await session.execute(r) + + for _ in range(100): + if 0 < info.progress() < 1: + break + await asyncio.sleep(0.1) + else: + raise Exception(f"progress test failed, actual value {info.progress()}.") + + await info + assert info.result() is None + assert info.exception() is None + assert info.progress() == 1 + + +@pytest.fixture +def setup_session(request): + param = getattr(request, "param", {}) + config = param.get("config", {}) + session = new_session( + backend=config.get("backend", "mars"), n_cpu=2, use_uvloop=False, config=config + ) + assert session.get_web_endpoint() is not None + + try: + with session, option_context({"show_progress": False}): + yield session + finally: + session.stop_server() + + +WeakTaskProcessorRefs = weakref.WeakSet() + + +class CheckRefTaskProcessor(TaskProcessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + WeakTaskProcessorRefs.add(self) + + async def run(self): + # Trigger tileable gc before execute. 
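+        # Collecting here lets tileables dropped by the test be decref'd before
+        # the task graph is processed, keeping the ref-count checks deterministic.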
+ gc.collect() + return await super().run() + + @staticmethod + def check_ref_count(count): + for _ in range(10): + if len(WeakTaskProcessorRefs) == count: + break + time.sleep(1) + else: + raise Exception( + f"Check TaskProcessor weakref failed, expect {count} instances, " + f"but got {WeakTaskProcessorRefs}" + ) + + +@pytest.mark.parametrize( + "setup_session", + [ + { + "config": { + "task.default_config.reserved_finish_tasks": 2, + "task.task_processor_cls": CheckRefTaskProcessor, + } + } + ], + indirect=True, +) +def test_decref(setup_session): + session = setup_session + + a = mt.ones((10, 10)) + b = mt.ones((10, 10)) + c = b + 1 + d = mt.ones((5, 5)) + + a.execute() + b.execute() + c.execute() + d.execute() + + CheckRefTaskProcessor.check_ref_count(4) + + del a + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 3 + del b + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 3 + del c + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 1 + del d + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + CheckRefTaskProcessor.check_ref_count(2) + + rs = np.random.RandomState(0) + pdf = pd.DataFrame({"a": rs.randint(10, size=10), "b": rs.rand(10)}) + df = md.DataFrame(pdf, chunk_size=5) + df2 = df.groupby("a").agg("mean", method="shuffle") + result = df2.execute().fetch() + expected = pdf.groupby("a").agg("mean") + pd.testing.assert_frame_equal(result, expected) + + CheckRefTaskProcessor.check_ref_count(3) + + del df, df2 + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + CheckRefTaskProcessor.check_ref_count(2) + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + pdf = pd.DataFrame( + np.random.RandomState(0).rand(100, 10), + columns=[f"col{i}" for i in range(10)], + ) + pdf.to_csv(file_path, index=False) + + df = md.read_csv(file_path, chunk_bytes=os.stat(file_path).st_size / 5) + df2 = df.head(10) + + result = df2.execute().fetch() + expected = pdf.head(10) + pd.testing.assert_frame_equal(result, expected) + + del df, df2 + + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + for a in ((1, 1, 1, 2, 2, 3), [1, 1, 1, 2, 2, 3]): + splits = mt.split(a, (3, 5)) + assert len(splits) == 3 + splits0 = splits[0].execute().fetch() + np.testing.assert_array_equal(splits0, (1, 1, 1)) + splits1 = splits[1].execute().fetch() + np.testing.assert_array_equal(splits1, (2, 2)) + splits2 = splits[2].execute().fetch() + np.testing.assert_array_equal(splits2, (3,)) + + del splits, splits0, splits1, splits2 + + gc.collect() + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + worker_addr = session._session.client._cluster._worker_pools[0].external_address + _assert_storage_cleaned(session.session_id, worker_addr, StorageLevel.MEMORY) + + +def _assert_worker_pool_storage_cleaned(session): + worker_addr = session._session.client._cluster._worker_pools[0].external_address + _assert_storage_cleaned(session.session_id, worker_addr, StorageLevel.MEMORY) + + +def _cancel_when_execute(session, cancelled): + def run(): + time.sleep(200) + + rs = [mr.spawn(run) for _ in range(10)] + execute(*rs, cancelled=cancelled) + + assert all(not r._executed_sessions for r in rs) + + del rs + time.sleep(0.5) + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + +def _cancel_assert_when_execute(session, cancelled): + _assert_worker_pool_storage_cleaned(session) + _cancel_when_execute(session, cancelled) + + +class 
SlowTileAdd(TensorAdd): + @classmethod + def tile(cls, op): + time.sleep(2) + return (yield from TensorAdd.tile(op)) + + +def _cancel_when_tile(session, cancelled): + a = mt.tensor([1, 2, 3]) + for i in range(20): + a = SlowTileAdd(dtype=np.dtype(np.int64))(a, 1) + execute(a, cancelled=cancelled) + + assert not a._executed_sessions + + del a + time.sleep(0.5) + ref_counts = session._get_ref_counts() + assert len(ref_counts) == 0 + + +@pytest.mark.parametrize("test_func", [_cancel_assert_when_execute, _cancel_when_tile]) +def test_cancel(create_cluster, test_func): + session = get_default_session() + + async def _new_cancel_event(): + return asyncio.Event() + + isolation = new_isolation() + cancelled = asyncio.run_coroutine_threadsafe( + _new_cancel_event(), isolation.loop + ).result() + + def cancel(): + time.sleep(0.5) + cancelled.set() + + t = threading.Thread(target=cancel) + t.daemon = True + t.start() + + start = time.time() + test_func(session, cancelled) + assert time.time() - start < 20 + + # submit another task + raw = np.random.rand(10, 10) + t = mt.tensor(raw, chunk_size=(10, 5)) + np.testing.assert_array_equal(t.execute().fetch(), raw) + + +def test_load_third_party_modules(cleanup_third_party_modules_output): # noqa: F811 + config = _load_config() + + config["third_party_modules"] = set() + with pytest.raises(TypeError, match="set"): + new_session(n_cpu=2, web=False, config=config) + + config["third_party_modules"] = {"supervisor": ["not_exists_for_supervisor"]} + with pytest.raises(ModuleNotFoundError, match="not_exists_for_supervisor"): + new_session(n_cpu=2, web=False, config=config) + + config["third_party_modules"] = {"worker": ["not_exists_for_worker"]} + with pytest.raises(ModuleNotFoundError, match="not_exists_for_worker"): + new_session(n_cpu=2, web=False, config=config) + + config["third_party_modules"] = ["mars.deploy.oscar.tests.modules.replace_op"] + session = new_session(n_cpu=2, web=False, config=config) + # web not started + assert session._session.client.web_address is None + + with session: + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + b.execute(show_progress=False) + result = b.fetch() + + np.testing.assert_equal(raw - 1, result) + + session.stop_server() + assert get_default_session() is None + + session = new_session( + n_cpu=2, web=False, config=CONFIG_THIRD_PARTY_MODULES_TEST_FILE + ) + # web not started + assert session._session.client.web_address is None + + with session: + # 1 main pool, 3 sub pools(2 worker + 1 io). 
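+        # one output file is produced per pool, so four files are expected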
+ assert len(get_output_filenames()) == 4 + + session.stop_server() + assert get_default_session() is None + + +@mock.patch("asyncio.base_events.logger") +def test_show_progress_raise_exception(m_log): + loop = asyncio.get_event_loop() + event = asyncio.Event() + + class ProgressBar: + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + pass + + def __exit__(self, *_): + pass + + def update(self, progress: float): + pass + + async def _exec(): + progress = Progress() + profiling = Profiling() + execution_info = ExecutionInfo( + asyncio.create_task(event.wait()), progress, profiling, loop, list() + ) + progress_bar = ProgressBar(True) + cancel_event = asyncio.Event() + loop.call_later(2, cancel_event.set) + await _execute_with_progress(execution_info, progress_bar, 0.01, cancel_event) + execution_info.get_future().set_exception(Exception("Expect Exception!!!")) + + loop.run_until_complete(_exec()) + assert len(m_log.mock_calls) < 3 + + +min_task_runtime = 2 + + +@pytest.fixture +async def speculative_cluster(): + config = _load_config() + config["scheduling"]["speculation"]["enabled"] = True + config["scheduling"]["speculation"]["dry"] = False + config["scheduling"]["speculation"]["interval"] = 0.5 + config["scheduling"]["speculation"]["threshold"] = 0.2 + config["scheduling"]["speculation"]["min_task_runtime"] = min_task_runtime + config["scheduling"]["speculation"]["multiplier"] = 2 + config["scheduling"]["speculation"]["max_concurrent_run"] = 10 + config["scheduling"]["subtask_cancel_timeout"] = 0.1 + config["scheduling"]["enable_kill_slot"] = True + config["storage"]["backends"] = ["plasma"] + config["storage"]["plasma"]["store_memory"] = 10 * 1024 * 1024 + client = await new_cluster( + config=config, + n_worker=5, + n_cpu=10, + use_uvloop=False, + ) + async with client: + yield client + + +@pytest.mark.timeout(timeout=500) +@pytest.mark.asyncio +async def test_task_speculation_execution(speculative_cluster): + series_size = 10 + + def time_consuming(start, x): + print(f"subtask index {x}") + if ( + x >= series_size - 1 + ): # leave some workers not excluded from speculative submit. 
+ if time.time() - start < min_task_runtime: + print(f"subtask with index {x} starts to hang.") + time.sleep(1000000) + return x * x + + from functools import partial + + assert ( + md.Series(list(range(series_size)), chunk_size=1) + .apply(partial(time_consuming, time.time())) + .sum() + .execute() + .fetch() + == pd.Series(list(range(series_size))).apply(lambda x: x * x).sum() + ) + + +def test_naive_code_file(): + code_file = """ + import mars + import mars.tensor as mt + import os + + mars.new_session() + try: + result_path = os.environ["RESULTPATH"] + with open(result_path, "w") as outf: + outf.write(str(mt.ones((10, 10)).sum().execute())) + finally: + mars.stop_server() + """ + + with tempfile.TemporaryDirectory() as temp_dir: + try: + script_path = os.path.join(temp_dir, "test_file.py") + result_path = os.path.join(temp_dir, "result.txt") + + with open(script_path, "w") as file_obj: + file_obj.write(textwrap.dedent(code_file)) + + env = os.environ.copy() + env["PYTHONPATH"] = os.path.pathsep.join(sys.path) + env["RESULTPATH"] = result_path + proc = subprocess.Popen([sys.executable, script_path], env=env) + pid = proc.pid + proc.wait(120) + + with open(result_path, "r") as inp_file: + assert 100 == int(float(inp_file.read())) + except subprocess.TimeoutExpired: + try: + procs = [psutil.Process(pid)] + procs.extend(procs[0].children(True)) + for proc in reversed(procs): + try: + proc.kill() + except psutil.NoSuchProcess: + pass + except psutil.NoSuchProcess: + pass + raise + + +ucp = lazy_import("ucp") +_OSCAR_CONF_TEMPLATE = """ +"@inherits": '@default' +oscar: + numa: + external_addr_scheme: {scheme} + enable_internal_addr: {enable_inaddr} +""" + + +schemes = [None] +if ucp is not None: + schemes.append("ucx") + + +@pytest.mark.parametrize("scheme", schemes) +@pytest.mark.parametrize("enable_inaddr", [False, True]) +@pytest.mark.parametrize("manner", ["numa", "all", "config_file"]) +def test_oscar_configs(scheme, enable_inaddr, manner): + def test(sess): + def verify(): + router = Router.get_instance() + prefix = "" if not scheme else f"{scheme}://" + assert router._mapping + assert all(addr.startswith(prefix) for addr in router._mapping) + if enable_inaddr: + assert all(inaddr is not None for inaddr in router._mapping.values()) + else: + assert all(inaddr is None for inaddr in router._mapping.values()) + + with sess: + sess.execute(*[mr.spawn(verify) for _ in range(4)]) + + sess.stop_server() + assert get_default_async_session() is None + + if manner == "numa": + session = new_session( + n_cpu=2, + web=False, + cuda_devices=None, + numa_external_addr_scheme=scheme, + numa_enable_internal_addr=enable_inaddr, + oscar_extra_conf={"ucx": {"tcp": True}}, + ) + test(session) + elif manner == "all": + session = new_session( + n_cpu=2, + web=False, + cuda_devices=None, + external_addr_scheme=scheme, + enable_internal_addr=enable_inaddr, + ) + test(session) + else: + scheme_str = "" if not scheme else scheme + enable_inaddr_str = "yes" if enable_inaddr else "no" + config_content = _OSCAR_CONF_TEMPLATE.format( + scheme=scheme_str, enable_inaddr=enable_inaddr_str + ) + with tempfile.NamedTemporaryFile(mode="w+", suffix=".yml") as f: + f.write(config_content) + f.flush() + session = new_session(config=f.name, n_cpu=2, web=False, cuda_devices=None) + + test(session) + + +@require_cupy +@pytest.mark.parametrize("scheme", schemes) +@pytest.mark.parametrize("enable_inaddr", [False, True]) +@pytest.mark.parametrize("manner", ["gpu", "all"]) +def test_gpu_oscar_configs(scheme, enable_inaddr, 
manner): + def test(sess): + def verify(): + router = Router.get_instance() + prefix = "" if not scheme else f"{scheme}://" + # only verify GPU process + assert {addr for addr in router._mapping if addr == router.external_address} + assert all( + addr.startswith(prefix) + for addr in router._mapping + if addr == router.external_address + ) + if enable_inaddr: + assert all( + inaddr is not None + for addr, inaddr in router._mapping.items() + if addr == router.external_address + ) + else: + assert all( + inaddr is None + for addr, inaddr in router._mapping.items() + if addr == router.external_address + ) + + with sess: + sess.execute(*[mr.spawn(verify, gpu=True) for _ in range(2)]) + + sess.stop_server() + assert get_default_async_session() is None + + if manner == "gpu": + session = new_session( + n_cpu=2, + web=False, + cuda_devices=[0], + gpu_external_addr_scheme=scheme, + gpu_enable_internal_addr=enable_inaddr, + oscar_extra_conf={"ucx": {"create-cuda-contex": True}}, + ) + test(session) + else: + session = new_session( + n_cpu=2, + web=False, + cuda_devices=[0], + external_addr_scheme=scheme, + enable_internal_addr=enable_inaddr, + ) + test(session) + + +def test_default_oscar_config(): + session = new_session(n_cpu=2, web=False, cuda_devices=None) + + def verify(): + router = Router.get_instance() + assert router._mapping + # enabled inner address by default + assert all(inaddr is not None for inaddr in router._mapping.values()) + + with session: + session.execute(*[mr.spawn(verify) for _ in range(4)]) + + session.stop_server() + assert get_default_async_session() is None + + +@pytest.mark.parametrize("config", [{"backend": "mars"}]) +def test_fetch_concat(config): + session = new_session( + backend=config["backend"], n_cpu=2, web=False, use_uvloop=False + ) + assert session is not None + + with session: + data = {"A": [i for i in range(10)]} + df0 = md.DataFrame(data) + df1 = df0[["A"]] + df2 = df0[["A"]] + df1 = df1.execute() + df2 = df2.execute() + df3 = md.concat([df1, df2], axis=1) + ret = df3.execute() + df4 = ret.fetch() + + pdf0 = pd.DataFrame(data) + pdf1 = pdf0[["A"]] + pdf2 = pdf0[["A"]] + pdf3 = pd.concat([pdf1, pdf2], axis=1) + + assert pdf3.equals(df4) + + for worker_pool in session._session.client._cluster._worker_pools: + _assert_storage_cleaned( + session.session_id, worker_pool.external_address, StorageLevel.MEMORY + ) + + session.stop_server() + assert get_default_async_session() is None + + +def test_clear_default_session(setup): + assert get_default_session() is not None + clear_default_session() + assert get_default_session() is None diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_pool.py b/python/xorbits/_mars/deploy/oscar/tests/test_pool.py new file mode 100644 index 000000000..57e843914 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_pool.py @@ -0,0 +1,115 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import tempfile + +import pytest + +from ....constants import MARS_LOG_PATH_KEY, MARS_TMP_DIR_PREFIX +from ....utils import clean_mars_tmp_dir +from ..pool import ( + _parse_file_logging_config, + _config_logging, + _get_root_logger_level_and_format, +) + + +@pytest.fixture +def init(): + root_level, _ = _get_root_logger_level_and_format() + file_logging_config = os.path.join( + os.path.dirname(__file__), "..", "file-logging.conf" + ) + logger_sections = [ + "logger_main", + "logger_deploy", + "logger_oscar", + "logger_services", + "logger_dataframe", + "logger_learn", + "logger_tensor", + "handler_file_handler", + ] + yield file_logging_config, logger_sections, root_level + + # clean + clean_mars_tmp_dir() + + +def test_parse_file_logging_config(init): + fp, sections, root_level = init + log_path = "mock_path" + config = _parse_file_logging_config(fp, log_path, "FATAL") + assert config["handler_stream_handler"]["level"] == root_level + assert config["handler_stream_handler"].get("formatter") is not None + assert config["handler_stream_handler"]["formatter"] == "console" + for sec in sections: + if sec != "handler_file_handler": + assert config[sec]["level"] == "FATAL" + else: + assert config[sec]["level"] == root_level + + formatter = "foo" + config = _parse_file_logging_config(fp, log_path, "FATAL", formatter=formatter) + assert config["formatter_formatter"]["format"] == formatter + + config = _parse_file_logging_config(fp, log_path, level="", formatter=formatter) + assert config["logger_dataframe"]["level"] == "DEBUG" + + config = _parse_file_logging_config( + fp, log_path, level="", formatter=formatter, from_cmd=True + ) + assert config["logger_tensor"]["level"] == "DEBUG" + + assert config["handler_stream_handler"]["level"] == "DEBUG" + assert config["formatter_formatter"]["format"] == formatter + + +def test_config_logging(init, caplog): + _, _, root_level = init + kwargs = {"logging_conf": {}} + with caplog.at_level(logging.DEBUG): + _config_logging(**kwargs) + log_path = os.environ.get(MARS_LOG_PATH_KEY) + assert log_path is not None + assert os.path.basename(os.path.dirname(log_path)).startswith(MARS_TMP_DIR_PREFIX) + + clean_mars_tmp_dir() + + with tempfile.TemporaryDirectory() as folder: + kwargs = {"logging_conf": {"log_dir": folder, "from_cmd": True}} + _config_logging(**kwargs) + log_path = os.environ.get(MARS_LOG_PATH_KEY) + assert log_path is not None + assert os.path.dirname(os.path.dirname(log_path)) == folder + + cnt = 0 + file_handler = None + for handler in logging.getLogger().handlers: + if isinstance(handler, logging.FileHandler): + cnt += 1 + file_handler = handler + assert cnt == 1 + assert file_handler is not None + assert file_handler.level == logging.getLevelName("DEBUG") + assert file_handler.baseFilename == os.environ.get(MARS_LOG_PATH_KEY) + + +def test_pool_with_no_web_config(init): + kwargs = {"web": False} + _config_logging(**kwargs) + log_path = os.environ.get(MARS_LOG_PATH_KEY) + assert log_path is None diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray.py new file mode 100644 index 000000000..67a629c15 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray.py @@ -0,0 +1,333 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import operator +import os +from functools import reduce + +import numpy as np +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....oscar.errors import ReconstructWorkerError +from ....tests.core import DICT_NOT_EMPTY, mock, require_ray +from ....utils import lazy_import +from ..ray import ClusterStateActor, _load_config, new_cluster +from ..session import get_default_session, new_session +from ..tests import test_local +from .modules.utils import ( # noqa: F401 # pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +ray = lazy_import("ray") + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "local_test_with_ray_config.yml") + +EXPECT_PROFILING_STRUCTURE = { + "supervisor": { + "general": { + "optimize": 0.0005879402160644531, + "incref_fetch_tileables": 0.0010840892791748047, + "stage_*": { + "tile(*)": 0.008243083953857422, + "gen_subtask_graph(*)": 0.012202978134155273, + "run": 0.27870702743530273, + "total": 0.30318617820739746, + }, + "total": 0.30951380729675293, + }, + "serialization": { + "serialize": 0.014928340911865234, + "deserialize": 0.0011813640594482422, + "total": 0.016109704971313477, + }, + "most_calls": DICT_NOT_EMPTY, + "slow_calls": DICT_NOT_EMPTY, + "band_subtasks": DICT_NOT_EMPTY, + "slow_subtasks": DICT_NOT_EMPTY, + } +} +EXPECT_PROFILING_STRUCTURE_NO_SLOW = copy.deepcopy(EXPECT_PROFILING_STRUCTURE) +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_calls"] = {} +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_subtasks"] = {} + + +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(CONFIG_FILE) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=ray_config, + ) + async with client: + yield client, param + + +@require_ray +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_execute(ray_start_regular_shared, create_cluster, config): + await test_local.test_execute(create_cluster, config) + + +@require_ray +@pytest.mark.asyncio +async def test_iterative_tiling(ray_start_regular_shared, create_cluster): + await test_local.test_iterative_tiling(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_describe(ray_start_regular_shared, create_cluster): + await test_local.test_execute_describe(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_apply_closure(ray_start_regular_shared, create_cluster): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.parametrize("multiplier", [1, 3, 4]) 
+@pytest.mark.asyncio +async def test_execute_callable_closure( + ray_start_regular_shared, create_cluster, multiplier +): + await test_local.test_execute_callable_closure(create_cluster, multiplier) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "task.task_preprocessor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncTaskPreprocessor", + "subtask.subtask_processor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncSubtaskProcessor", + } + } + ], + indirect=True, +) +@pytest.mark.asyncio +async def test_ray_oscar_clean_up_and_restore_func( + ray_start_regular_shared, create_cluster +): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_fetch_infos(ray_start_regular_shared, create_cluster): + await test_local.test_fetch_infos(create_cluster) + df = md.DataFrame(mt.random.RandomState(0).rand(5000, 1, chunk_size=1000)) + df.execute() + fetched_infos = df.fetch_infos(fields=["object_refs"]) + object_refs = reduce(operator.concat, fetched_infos["object_refs"]) + assert len(fetched_infos) == 1 + assert len(object_refs) == 5 + + +@require_ray +@pytest.mark.asyncio +def test_sync_execute(ray_start_regular_shared, create_cluster): + client = create_cluster[0] + assert client.session + session = new_session(address=client.address) + with session: + raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + result = a.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1)) + + c = mt.tensor(raw, chunk_size=5).sum() + d = session.execute(c) + assert d is c + assert abs(session.fetch(d) - raw.sum()) < 0.001 + + assert get_default_session() is None + + +def _run_web_session(web_address): + import asyncio + + asyncio.new_event_loop().run_until_complete( + test_local._run_web_session_test(web_address) + ) + return True + + +def _sync_web_session_test(web_address): + new_session(web_address) + raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + return True + + +@require_ray +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_web_session(ray_start_regular_shared, create_cluster, config): + client = create_cluster[0] + await test_local.test_web_session(create_cluster, config) + web_address = client.web_address + assert await ray.remote(_run_web_session).remote(web_address) + assert await ray.remote(_sync_web_session_test).remote(web_address) + + +@require_ray +def test_load_config(): + default_config = _load_config() + assert default_config["scheduling"]["autoscale"]["enabled"] is False + default_config = _load_config({"scheduling": {"autoscale": {"enabled": True}}}) + assert default_config["scheduling"]["autoscale"]["enabled"] is True + default_config = _load_config( + { + "scheduling.autoscale.enabled": True, + "scheduling.autoscale.scheduler_backlog_timeout": 1, + } + ) + assert default_config["scheduling"]["autoscale"]["enabled"] is True + assert 
default_config["scheduling"]["autoscale"]["scheduler_backlog_timeout"] == 1 + with pytest.raises(ValueError): + _load_config({"scheduling.autoscale.enabled": True, "scheduling.autoscale": {}}) + assert _load_config(CONFIG_FILE)["session"]["custom_log_dir"] == "auto" + + +@require_ray +@pytest.mark.asyncio +@mock.patch("mars.deploy.oscar.ray.stop_worker") +async def test_reconstruct_worker_during_releasing_worker(fake_stop_worker): + stop_worker = asyncio.Event() + lock = asyncio.Event() + + async def _stop_worker(*args): + stop_worker.set() + await lock.wait() + + fake_stop_worker.side_effect = _stop_worker + cluster_state = ClusterStateActor() + release_task = asyncio.create_task(cluster_state.release_worker("abc")) + await stop_worker.wait() + with pytest.raises(ReconstructWorkerError, match="releasing"): + await cluster_state.reconstruct_worker("abc") + release_task.cancel() + + +@require_ray +@pytest.mark.asyncio +@mock.patch("mars.deploy.oscar.ray.stop_worker") +@mock.patch("ray.get_actor") +async def test_release_worker_during_reconstructing_worker( + fake_get_actor, fake_stop_worker +): + get_actor = asyncio.Event() + lock = asyncio.Event() + + class FakeActorMethod: + async def remote(self): + get_actor.set() + await lock.wait() + + class FakeActor: + state = FakeActorMethod() + + def _get_actor(*args, **kwargs): + return FakeActor + + async def _stop_worker(*args): + await lock.wait() + + fake_get_actor.side_effect = _get_actor + fake_stop_worker.side_effect = _stop_worker + cluster_state = ClusterStateActor() + reconstruct_task = asyncio.create_task(cluster_state.reconstruct_worker("abc")) + await get_actor.wait() + release_task = asyncio.create_task(cluster_state.release_worker("abc")) + with pytest.raises(asyncio.CancelledError): + await reconstruct_task + release_task.cancel() + + +@require_ray +@pytest.mark.asyncio +def test_init_metrics_on_ray(ray_start_regular_shared, create_cluster): + client = create_cluster[0] + assert client.session + from ....metrics import api + + assert client._cluster._config.get("metrics", {}).get("backend") == "ray" + assert api._metric_backend == "ray" + + client.session.stop_server() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_client.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_client.py new file mode 100644 index 000000000..f39bbc28d --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_client.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
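+
+# Tests that Mars sessions work through the Ray client: a Ray client server
+# is started in a subprocess and new_ray_session_test runs over that
+# connection for both the "mars" and "ray" backends.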
+ +import subprocess +import sys +import tempfile +import threading + +import pytest + +from ....tests.core import require_ray +from ....utils import lazy_import +from .test_ray_cluster_standalone import new_ray_session_test + +ray = lazy_import("ray") + + +@require_ray +@pytest.mark.parametrize( + "backend", + [ + "mars", + "ray", + ], +) +def test_ray_client(backend): + server_code = """import time +import ray.util.client.server.server as ray_client_server + +server = ray_client_server.init_and_serve("{address}", num_cpus=20) +print("OK", flush=True) +while True: + time.sleep(1) +""" + + address = "127.0.0.1:50051" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".py") as f: + f.write(server_code.format(address=address)) + f.flush() + + proc = subprocess.Popen([sys.executable, "-u", f.name], stdout=subprocess.PIPE) + + try: + + def _check_ready(expect_exit=False): + while True: + line = proc.stdout.readline() + if proc.returncode is not None: + if expect_exit: + break + raise Exception( + f"Failed to start ray server at {address}, " + f"the return code is {proc.returncode}." + ) + if b"OK" in line: + break + + # Avoid ray.init timeout. + _check_ready() + + # Avoid blocking the subprocess when the stdout pipe is full. + t = threading.Thread(target=_check_ready, args=(True,), daemon=True) + t.start() + try: + import ray + + ray.client(address).connect() # Ray 1.4 + except Exception: + try: + from ray.util.client import ray + + ray.connect(address) # Ray 1.2 + except Exception: + import ray + + ray.init(f"ray://{address}") # Ray latest + ray._inside_client_test = True + try: + new_ray_session_test(backend=backend) + finally: + ray._inside_client_test = False + ray.shutdown() + finally: + proc.kill() + proc.wait() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py new file mode 100644 index 000000000..cf9d6639c --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py @@ -0,0 +1,160 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mars +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....tests.core import mock, require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster, new_cluster_in_ray, new_ray_session + +ray = lazy_import("ray") + + +@require_ray +def test_new_cluster_in_ray(stop_ray): + cluster = new_cluster_in_ray(worker_num=2) + mt.random.RandomState(0).rand(100, 5).sum().execute() + cluster.session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + session = new_ray_session(address=cluster.address, session_id="abcd", default=True) + session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + cluster.stop() + + +@require_ray +@pytest.mark.parametrize( + "backend", + [ + "mars", + "ray", + ], +) +def test_new_ray_session(stop_ray, backend): + new_ray_session_test(backend) + + +def new_ray_session_test(backend): + session = new_ray_session( + session_id="abc", worker_num=2, worker_mem=512 * 1024**2, backend=backend + ) + mt.random.RandomState(0).rand(100, 5).sum().execute() + session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + session = new_ray_session( + session_id="abcd", + worker_num=2, + default=True, + worker_mem=512 * 1024**2, + backend=backend, + ) + session.execute(mt.random.RandomState(0).rand(100, 5).sum()) + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + df = md.DataFrame(mt.random.rand(100, 4), columns=list("abcd")) + # Convert mars dataframe to ray dataset + ds = md.to_ray_dataset(df) + print(ds.schema(), ds.count()) + ds.filter(lambda row: row["a"] > 0.5).show(5) + # Convert ray dataset to mars dataframe + df2 = md.read_ray_dataset(ds) + print(df2.head(5).execute()) + # Test ray cluster exists after session got gc. 
+ del session + import gc + + gc.collect() + mars.execute(mt.random.RandomState(0).rand(100, 5).sum()) + + +@require_ray +@pytest.mark.parametrize( + "test_option", + [ + [True, 0, ["ray://test_cluster/1/0", "ray://test_cluster/2/0"]], + [False, 0, ["ray://test_cluster/0/1", "ray://test_cluster/1/0"]], + [True, 2, ["ray://test_cluster/1/0", "ray://test_cluster/2/0"]], + [False, 5, ["ray://test_cluster/0/6", "ray://test_cluster/1/0"]], + ], +) +@pytest.mark.asyncio +async def test_optional_supervisor_node(ray_start_regular, test_option): + import logging + + logging.basicConfig(level=logging.INFO) + supervisor_standalone, supervisor_sub_pool_num, worker_addresses = test_option + config = _load_config() + config["cluster"]["ray"]["supervisor"]["standalone"] = supervisor_standalone + config["cluster"]["ray"]["supervisor"]["sub_pool_num"] = supervisor_sub_pool_num + client = await new_cluster( + "test_cluster", + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=config, + ) + async with client: + assert client.address == "ray://test_cluster/0/0" + assert client._cluster._worker_addresses == worker_addresses + + +@require_ray +@pytest.mark.asyncio +async def test_new_ray_session_config(stop_ray): + original_placement_group = ray.util.placement_group + with mock.patch.object( + ray.util, "placement_group", autospec=True + ) as mock_placement_group: + + def _wrap_original_placement_group(*args, **kwargs): + assert {"CPU": 3} in kwargs["bundles"] + return original_placement_group(*args, **kwargs) + + mock_placement_group.side_effect = _wrap_original_placement_group + mars.new_ray_session( + supervisor_cpu=3, + worker_cpu=5, + backend="ray", + default=True, + config={ + "third_party_modules": [ + "mars.deploy.oscar.tests.modules.check_ray_remote_function_options" + ] + }, + ) + mt.random.RandomState(0).rand(100, 5).sum().execute() + + # It seems crashes CI. + # mars.stop_server() + # + # actors = ray.state.actors() + # assert len(actors) == 1 + # assert list(actors.values())[0]["State"] == "DEAD" + + mars.new_ray_session( + supervisor_cpu=3, + worker_cpu=4, + backend="ray", + default=True, + config={ + "third_party_modules": [ + "mars.deploy.oscar.tests.modules.check_ray_remote_function_options" + ] + }, + ) + with pytest.raises(AssertionError): + mt.random.RandomState(0).rand(100, 5).sum().execute() diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag.py new file mode 100644 index 000000000..acc2b36a3 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag.py @@ -0,0 +1,227 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import time + +import pytest + +from .... import get_context +from .... 
import tensor as mt +from ....tests import test_session +from ....tests.core import DICT_NOT_EMPTY, require_ray +from ....utils import lazy_import +from ..local import new_cluster +from ..session import get_default_async_session, new_session +from ..tests import test_local +from ..tests.session import new_test_session +from ..tests.test_local import _cancel_when_execute, _cancel_when_tile +from .modules.utils import ( # noqa: F401; pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +ray = lazy_import("ray") + +EXPECT_PROFILING_STRUCTURE = { + "supervisor": { + "general": { + "optimize": 0.0005879402160644531, + "stage_*": { + "tile(*)": 0.008243083953857422, + "gen_subtask_graph(*)": 0.012202978134155273, + "run": 0.27870702743530273, + "total": 0.30318617820739746, + }, + "total": 0.30951380729675293, + }, + "serialization": {}, + "most_calls": DICT_NOT_EMPTY, + "slow_calls": DICT_NOT_EMPTY, + "band_subtasks": {}, + "slow_subtasks": {}, + } +} +EXPECT_PROFILING_STRUCTURE_NO_SLOW = copy.deepcopy(EXPECT_PROFILING_STRUCTURE) +EXPECT_PROFILING_STRUCTURE_NO_SLOW["supervisor"]["slow_calls"] = {} + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + start_method = os.environ.get("POOL_START_METHOD", None) + client = await new_cluster( + subprocess_start_method=start_method, + backend="ray", + n_worker=2, + n_cpu=2, + use_uvloop=False, + config=param.get("config", None), + ) + async with client: + assert client.session.client is not None + yield client, {} + + +@require_ray +@pytest.mark.parametrize("backend", ["ray"]) +@pytest.mark.parametrize("_new_session", [new_session, new_test_session]) +def test_new_session_backend(ray_start_regular_shared2, _new_session, backend): + test_local.test_new_session_backend(_new_session, backend) + + +@require_ray +@pytest.mark.parametrize( + "config", + [ + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 0, + "slow_subtasks_duration_threshold": 0, + } + }, + EXPECT_PROFILING_STRUCTURE, + ], + [ + { + "enable_profiling": { + "slow_calls_duration_threshold": 1000, + "slow_subtasks_duration_threshold": 1000, + } + }, + EXPECT_PROFILING_STRUCTURE_NO_SLOW, + ], + [{}, {}], + ], +) +@pytest.mark.asyncio +async def test_execute(ray_start_regular_shared2, create_cluster, config): + await test_local.test_execute(create_cluster, config) + + +@require_ray +@pytest.mark.asyncio +async def test_iterative_tiling(ray_start_regular_shared2, create_cluster): + await test_local.test_iterative_tiling(create_cluster) + + +@require_ray +@pytest.mark.parametrize("config", [{"backend": "ray"}]) +def test_sync_execute(ray_start_regular_shared2, config): + test_local.test_sync_execute(config) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [{"config": {"task.execution_config.ray.monitor_interval_seconds": 0}}], + indirect=True, +) +@pytest.mark.asyncio +async def test_session_get_progress(ray_start_regular_shared2, create_cluster): + await test_local.test_session_get_progress(create_cluster) + + +@require_ray +@pytest.mark.parametrize("test_func", [_cancel_when_execute, _cancel_when_tile]) +def test_cancel(ray_start_regular_shared2, create_cluster, test_func): + test_local.test_cancel(create_cluster, test_func) + + +@require_ray +@pytest.mark.parametrize("config", [{"backend": "ray"}]) +def test_executor_context_gc(ray_start_regular_shared2, config): + session = new_session( + backend=config["backend"], + n_cpu=2, + 
web=False, + use_uvloop=False, + config={"task.execution_config.ray.monitor_interval_seconds": 0}, + ) + + assert session._session.client.web_address is None + assert session.get_web_endpoint() is None + + def f1(c): + time.sleep(0.5) + return c + + with session: + t1 = mt.random.randint(10, size=(100, 10), chunk_size=100) + t2 = mt.random.randint(10, size=(100, 10), chunk_size=50) + t3 = t2 + t1 + t4 = t3.sum(0) + t5 = t4.map_chunk(f1) + r = t5.execute() + result = r.fetch() + assert result is not None + assert len(result) == 10 + context = get_context() + assert len(context._task_context) < 5 + + session.stop_server() + assert get_default_async_session() is None + + +@require_ray +@pytest.mark.asyncio +async def test_execute_describe(ray_start_regular_shared2, create_cluster): + # `describe` contains multiple shuffle. + await test_local.test_execute_describe(create_cluster) + + +@require_ray +@pytest.mark.parametrize("method", ["shuffle", "broadcast", None]) +@pytest.mark.parametrize("auto_merge", ["after", "before"]) +def test_merge_groupby(ray_start_regular_shared2, setup, method, auto_merge): + # add ray_dag decorator to the test_merge_groupby makes the raylet crash. + test_session.test_merge_groupby(setup, method, auto_merge) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_apply_closure(ray_start_regular_shared2, create_cluster): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +@pytest.mark.asyncio +async def test_execute_callable_closure( + ray_start_regular_shared2, create_cluster, multiplier +): + await test_local.test_execute_callable_closure(create_cluster, multiplier) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "task.task_preprocessor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncTaskPreprocessor", + "subtask.subtask_processor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncSubtaskProcessor", + } + } + ], + indirect=True, +) +@pytest.mark.asyncio +async def test_ray_dag_clean_up_and_restore_func( + ray_start_regular_shared2, create_cluster +): + await test_local.test_execute_apply_closure(create_cluster) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py new file mode 100644 index 000000000..988bc8690 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import operator +from functools import reduce + +import mars +import pandas as pd +import pytest + +from .... import dataframe as md +from .... 
import tensor as mt +from ....tests.core import require_ray +from ....utils import lazy_import + +ray = lazy_import("ray") +try: + from ray.exceptions import ObjectReconstructionFailedMaxAttemptsExceededError +except ImportError: # pragma: no cover + ObjectReconstructionFailedMaxAttemptsExceededError = None + + +@require_ray +@pytest.mark.parametrize( + "ray_large_cluster", + [{"num_nodes": 0}], + indirect=True, +) +@pytest.mark.parametrize("reconstruction_enabled", [True, False]) +@pytest.mark.skipif( + ObjectReconstructionFailedMaxAttemptsExceededError is None, + reason="Not support ObjectReconstructionFailedMaxAttemptsExceededError", +) +def test_basic_object_reconstruction( + ray_large_cluster, reconstruction_enabled, stop_mars +): + config = { + "num_heartbeats_timeout": 10, + "raylet_heartbeat_period_milliseconds": 200, + "object_timeout_milliseconds": 200, + } + # Workaround to reset the config to the default value. + if not reconstruction_enabled: + config["lineage_pinning_enabled"] = False + subtask_max_retries = 0 + else: + subtask_max_retries = 1 + + cluster = ray_large_cluster + # Head node with no resources. + cluster.add_node( + num_cpus=0, + _system_config=config, + enable_object_reconstruction=reconstruction_enabled, + ) + ray.init(address=cluster.address) + # Node to place the initial object. + node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8) + mars.new_session( + backend="ray", + config={"scheduling.subtask_max_retries": subtask_max_retries}, + default=True, + ) + cluster.wait_for_nodes() + + df = md.DataFrame(mt.random.RandomState(0).rand(2_000_000, 1, chunk_size=1_000_000)) + df.execute() + # this will submit new ray tasks + df2 = df.map_chunk(lambda pdf: pdf * 2).execute() + executed_infos = df2.fetch_infos(fields=["object_refs"]) + object_refs = reduce(operator.concat, executed_infos["object_refs"]) + head5 = df2.head(5).to_pandas() + + cluster.remove_node(node_to_kill, allow_graceful=False) + node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8) + + # use a dependent_task to avoid fetch lost objects to local + @ray.remote + def dependent_task(x): + return x + + if reconstruction_enabled: + ray.get([dependent_task.remote(ref) for ref in object_refs]) + new_head5 = df2.head(5).to_pandas() + pd.testing.assert_frame_equal(head5, new_head5) + else: + with pytest.raises(ray.exceptions.RayTaskError): + df2.head(5).to_pandas() + with pytest.raises(ray.exceptions.ObjectLostError): + ray.get(object_refs) + + # Losing the object a second time will cause reconstruction to fail because + # we have reached the max task retries. + cluster.remove_node(node_to_kill, allow_graceful=False) + cluster.add_node(num_cpus=1, object_store_memory=10**8) + + if reconstruction_enabled: + with pytest.raises(ObjectReconstructionFailedMaxAttemptsExceededError): + ray.get(object_refs) + else: + with pytest.raises(ray.exceptions.ObjectLostError): + ray.get(object_refs) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py new file mode 100644 index 000000000..c085e840c --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..tests import test_local + +ray = lazy_import("ray") +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "local_test_with_ray_config.yml") + + +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(CONFIG_FILE) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + backend="ray", + config=ray_config, + ) + async with client: + yield client, param + + +@require_ray +@pytest.mark.asyncio +async def test_iterative_tiling(ray_start_regular_shared2, create_cluster): + await test_local.test_iterative_tiling(create_cluster) + + +@pytest.mark.asyncio +@require_ray +async def test_execute_describe(ray_start_regular_shared2, create_cluster): + await test_local.test_execute_describe(create_cluster) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_apply_closure(ray_start_regular_shared2, create_cluster): + await test_local.test_execute_apply_closure(create_cluster) + + +@require_ray +@pytest.mark.parametrize("multiplier", [1, 3, 4]) +@pytest.mark.asyncio +async def test_execute_callable_closure( + ray_start_regular_shared2, create_cluster, multiplier +): + await test_local.test_execute_callable_closure(create_cluster, multiplier) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "task.task_preprocessor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncTaskPreprocessor", + "subtask.subtask_processor_cls": "mars.deploy.oscar.tests.test_clean_up_and_restore_func.RayBackendFuncSubtaskProcessor", + } + } + ], + indirect=True, +) +@pytest.mark.asyncio +async def test_ray_dag_oscar_clean_up_and_restore_func( + ray_start_regular_shared2, create_cluster +): + await test_local.test_execute_apply_closure(create_cluster) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py new file mode 100644 index 000000000..16486c9de --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py @@ -0,0 +1,207 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
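+
+# Ray variants of the fault-injection tests: injected exceptions, unhandled
+# errors and process exits during operand/subtask execution, plus subtask
+# rerun and retry behavior, delegated to ..tests.test_fault_injection.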
+ +import os + +import pytest + +from ....oscar.errors import ServerClosed +from ....services.tests.fault_injection_manager import ( + FaultInjectionError, + FaultInjectionUnhandledError, + FaultPosition, + FaultType, +) +from ....tensor.base.psrs import PSRSConcatPivot +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..tests import test_fault_injection + +ray = lazy_import("ray") + +RAY_CONFIG_FILE = os.path.join( + os.path.dirname(__file__), "local_test_with_ray_config.yml" +) +FAULT_INJECTION_CONFIG = { + "third_party_modules": ["mars.services.tests.fault_injection_patch"], +} +SUBTASK_RERUN_CONFIG = { + "scheduling": { + "subtask_max_retries": 2, + "subtask_max_reschedules": 2, + } +} + + +@pytest.fixture +async def fault_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(RAY_CONFIG_FILE) + ray_config.update(FAULT_INJECTION_CONFIG) + ray_config.update(param.get("config", {})) + client = await new_cluster( + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=ray_config, + ) + async with client: + yield client + + +@require_ray +@pytest.mark.parametrize( + "fault_and_exception", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + True, + ], + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises( + FaultInjectionUnhandledError, match="Fault Injection Unhandled" + ), + True, + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + False, # The ServerClosed raised from current process directly. + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + True, + ], + ], +) +@pytest.mark.asyncio +async def test_fault_inject_subtask_processor( + ray_start_regular_shared, fault_cluster, fault_and_exception +): + await test_fault_injection.test_fault_inject_subtask_processor( + fault_cluster, fault_and_exception + ) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ], + [ + FaultType.Exception, + {FaultPosition.ON_RUN_SUBTASK: 1}, + pytest.raises(FaultInjectionError, match="Fault Injection"), + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask(ray_start_regular_shared, fault_cluster, fault_config): + await test_fault_injection.test_rerun_subtask(fault_cluster, fault_config) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [FaultType.Exception, {FaultPosition.ON_EXECUTE_OPERAND: 1}, [PSRSConcatPivot]], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + [PSRSConcatPivot], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_describe( + ray_start_regular_shared, fault_cluster, fault_config +): + await test_fault_injection.test_rerun_subtask_describe(fault_cluster, fault_config) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + 
"fault_config", + [ + [ + FaultType.UnhandledException, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionUnhandledError), + ["_UnhandledException", "handle_fault"], + ], + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 100}, + pytest.raises(FaultInjectionError), + ["_ExceedMaxRerun", "handle_fault"], + ], + ], +) +@pytest.mark.asyncio +async def test_rerun_subtask_fail( + ray_start_regular_shared, fault_cluster, fault_config +): + await test_fault_injection.test_rerun_subtask_fail(fault_cluster, fault_config) + + +@require_ray +@pytest.mark.parametrize( + "fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True +) +@pytest.mark.parametrize( + "fault_config", + [ + [ + FaultType.Exception, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(FaultInjectionError, match="RemoteFunction"), + ["_UnretryableException", "handle_fault"], + ], + [ + FaultType.ProcessExit, + {FaultPosition.ON_EXECUTE_OPERAND: 1}, + pytest.raises(ServerClosed), + ["_UnretryableException", "*"], + ], + ], +) +@pytest.mark.asyncio +async def test_retryable(ray_start_regular_shared, fault_cluster, fault_config): + await test_fault_injection.test_retryable(fault_cluster, fault_config) diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py new file mode 100644 index 000000000..e29e2550a --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest + +from .... 
import tensor as mt +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..session import get_default_session, new_session +from .modules.utils import ( # noqa: F401 # pylint: disable=unused-variable + cleanup_third_party_modules_output, + get_output_filenames, +) + +ray = lazy_import("ray") + +CONFIG_FILE = os.path.join(os.path.dirname(__file__), "local_test_with_ray_config.yml") +CONFIG_THIRD_PARTY_MODULES_TEST_FILE = os.path.join( + os.path.dirname(__file__), "ray_test_with_third_parity_modules_config.yml" +) + + +@pytest.fixture +async def create_cluster(request): + param = getattr(request, "param", {}) + ray_config = _load_config(CONFIG_FILE) + ray_config.update(param.get("config", {})) + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=2, + worker_cpu=2, + worker_mem=1 * 1024**3, + config=ray_config, + ) + async with client: + yield client, param + + +@require_ray +@pytest.mark.parametrize( + "config_exception", + [ + [set(), pytest.raises(TypeError, match="set")], + [ + {"supervisor": ["not_exists_for_supervisor"]}, + pytest.raises(ModuleNotFoundError, match="not_exists_for_supervisor"), + ], + [ + {"worker": ["not_exists_for_worker"]}, + pytest.raises(ModuleNotFoundError, match="not_exists_for_worker"), + ], + ], +) +@pytest.mark.asyncio +async def test_load_third_party_modules(ray_start_regular, config_exception): + third_party_modules_config, expected_exception = config_exception + config = _load_config() + + config["third_party_modules"] = third_party_modules_config + with expected_exception: + await new_cluster( + worker_num=1, + worker_cpu=1, + worker_mem=1 * 1024**3, + config=config, + ) + + +@require_ray +@pytest.mark.parametrize( + "create_cluster", + [ + { + "config": { + "third_party_modules": { + "worker": ["mars.deploy.oscar.tests.modules.replace_op"] + }, + }, + } + ], + indirect=True, +) +@pytest.mark.asyncio +def test_load_third_party_modules2(ray_start_regular, create_cluster): + client = create_cluster[0] + assert client.session + session = new_session(address=client.address) + with session: + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + b.execute(show_progress=False) + result = b.fetch() + + np.testing.assert_equal(raw - 1, result) + + assert get_default_session() is None + + +@require_ray +@pytest.mark.asyncio +async def test_load_third_party_modules_from_config( + ray_start_regular, cleanup_third_party_modules_output # noqa: F811 +): + client = await new_cluster( + supervisor_mem=1 * 1024**3, + worker_num=1, + worker_cpu=1, + worker_mem=1 * 1024**3, + config=CONFIG_THIRD_PARTY_MODULES_TEST_FILE, + ) + async with client: + # 1 supervisor, 1 worker main pools, 1 worker sub pools. + assert len(get_output_filenames()) == 3 diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py b/python/xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py new file mode 100644 index 000000000..f125c95bc --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py @@ -0,0 +1,323 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import logging +import os +import time + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import oscar as mo +from .... import tensor as mt +from ....oscar.backends.ray.utils import ( + kill_and_wait, + process_address_to_placement, + process_placement_to_address, +) +from ....services.cluster import ClusterAPI +from ....services.scheduling.supervisor.autoscale import AutoscalerActor +from ....tests.core import require_ray +from ....utils import lazy_import +from ..ray import _load_config, new_cluster +from ..tests import test_local + +ray = lazy_import("ray") + +logger = logging.getLogger(__name__) + + +@pytest.fixture +async def speculative_cluster(): + client = await new_cluster( + "test_cluster", + worker_num=5, + worker_cpu=2, + worker_mem=512 * 1024**2, + supervisor_mem=100 * 1024**2, + config={ + "scheduling": { + "speculation": { + "enabled": True, + "dry": False, + "interval": 0.5, + "threshold": 0.2, + "min_task_runtime": 2, + "multiplier": 1.5, + }, + # used to kill hanged subtask to release slot. + "subtask_cancel_timeout": 0.1, + }, + }, + ) + async with client: + yield client + + +@pytest.mark.parametrize("ray_large_cluster", [{"num_nodes": 2}], indirect=True) +@pytest.mark.timeout(timeout=500) +@require_ray +@pytest.mark.asyncio +async def test_task_speculation_execution(ray_large_cluster, speculative_cluster): + await test_local.test_task_speculation_execution(speculative_cluster) + + +@pytest.mark.parametrize( + "ray_large_cluster", [{"num_nodes": 1, "num_cpus": 3}], indirect=True +) +@require_ray +@pytest.mark.asyncio +async def test_request_worker(ray_large_cluster): + worker_cpu, worker_mem = 1, 100 * 1024**2 + client = await new_cluster( + worker_num=0, worker_cpu=worker_cpu, worker_mem=worker_mem + ) + async with client: + cluster_state_ref = client._cluster._cluster_backend.get_cluster_state_ref() + # Note that supervisor took one node + workers = await asyncio.gather( + *[cluster_state_ref.request_worker(timeout=5) for _ in range(2)] + ) + assert all(worker is not None for worker in workers) + assert not await cluster_state_ref.request_worker(timeout=5) + release_workers = [ + cluster_state_ref.release_worker(worker) for worker in workers + ] + # Duplicate release workers requests should be handled. 
+        release_workers.extend(
+            [cluster_state_ref.release_worker(worker) for worker in workers]
+        )
+        await asyncio.gather(*release_workers)
+        assert await cluster_state_ref.request_worker(timeout=5)
+        cluster_state_ref.reconstruct_worker()
+
+
+@pytest.mark.parametrize(
+    "ray_large_cluster", [{"num_nodes": 1, "num_cpus": 3}], indirect=True
+)
+@require_ray
+@pytest.mark.asyncio
+async def test_reconstruct_worker(ray_large_cluster):
+    worker_cpu, worker_mem = 1, 100 * 1024**2
+    client = await new_cluster(
+        worker_num=0, worker_cpu=worker_cpu, worker_mem=worker_mem
+    )
+    async with client:
+        cluster_api = await ClusterAPI.create(client._cluster.supervisor_address)
+        worker = await cluster_api.request_worker(timeout=5)
+        pg_name, bundle_index, process_index = process_address_to_placement(worker)
+        worker_sub_pool = process_placement_to_address(
+            pg_name, bundle_index, process_index + 1
+        )
+
+        worker_actor = ray.get_actor(worker)
+        worker_pid = await worker_actor.getpid.remote()
+        # the worker pool actor should be destroyed even if we get the actor.
+        worker_sub_pool_actor = ray.get_actor(worker_sub_pool)
+        worker_sub_pool_pid = await worker_sub_pool_actor.getpid.remote()
+
+        # kill the worker main pool
+        await kill_and_wait(ray.get_actor(worker))
+
+        # duplicated reconstruct worker requests should be handled.
+        await asyncio.gather(
+            cluster_api.reconstruct_worker(worker),
+            cluster_api.reconstruct_worker(worker),
+        )
+        worker_actor = ray.get_actor(worker)
+        new_worker_pid = await worker_actor.getpid.remote()
+        worker_sub_pool_actor = ray.get_actor(worker_sub_pool)
+        new_worker_sub_pool_pid = await worker_sub_pool_actor.getpid.remote()
+        assert new_worker_pid != worker_pid
+        assert new_worker_sub_pool_pid != worker_sub_pool_pid
+
+        # the computation should be OK after the worker is reconstructed.
+ raw = np.random.RandomState(0).rand(10, 5) + a = mt.tensor(raw, chunk_size=5).sum(axis=1) + b = a.execute(show_progress=False) + assert b is a + result = a.fetch() + np.testing.assert_array_equal(result, raw.sum(axis=1)) + + +@pytest.mark.parametrize( + "ray_large_cluster", [{"num_nodes": 2, "num_cpus": 4}], indirect=True +) +@pytest.mark.parametrize("init_workers", [0, 1]) +@require_ray +@pytest.mark.asyncio +async def test_auto_scale_out(ray_large_cluster, init_workers: int): + client = await new_cluster( + worker_num=init_workers, + worker_cpu=2, + worker_mem=200 * 1024**2, + supervisor_mem=1 * 1024**3, + config={ + "scheduling.autoscale.enabled": True, + "scheduling.autoscale.scheduler_backlog_timeout": 1, + "scheduling.autoscale.worker_idle_timeout": 10000000, + "scheduling.autoscale.max_workers": 10, + }, + ) + async with client: + + def time_consuming(x): + time.sleep(1) + return x * x + + series_size = 100 + assert ( + md.Series(list(range(series_size)), chunk_size=1) + .apply(time_consuming) + .sum() + .execute() + .fetch() + == pd.Series(list(range(series_size))).apply(lambda x: x * x).sum() + ) + autoscaler_ref = mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), + address=client._cluster.supervisor_address, + ) + assert await autoscaler_ref.get_dynamic_worker_nums() > 0 + + +@pytest.mark.timeout(timeout=600) +@pytest.mark.parametrize( + "ray_large_cluster", [{"num_nodes": 2, "num_cpus": 4}], indirect=True +) +@require_ray +@pytest.mark.asyncio +async def test_auto_scale_in(ray_large_cluster): + config = _load_config() + config["scheduling"]["autoscale"]["enabled"] = True + config["scheduling"]["autoscale"]["worker_idle_timeout"] = 1 + config["scheduling"]["autoscale"]["max_workers"] = 4 + config["scheduling"]["autoscale"]["min_workers"] = 2 + client = await new_cluster( + worker_num=0, + worker_cpu=2, + worker_mem=200 * 1024**2, + supervisor_mem=1 * 1024**3, + config=config, + ) + async with client: + autoscaler_ref = mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), + address=client._cluster.supervisor_address, + ) + new_worker_nums = 3 + await asyncio.gather( + *[autoscaler_ref.request_worker() for _ in range(new_worker_nums)] + ) + series_size = 100 + assert ( + md.Series(list(range(series_size)), chunk_size=20).sum().execute().fetch() + == pd.Series(list(range(series_size))).sum() + ) + while await autoscaler_ref.get_dynamic_worker_nums() > 2: + dynamic_workers = await autoscaler_ref.get_dynamic_workers() + logger.info(f"Waiting %s workers to be released.", dynamic_workers) + await asyncio.sleep(1) + await asyncio.sleep(1) + assert await autoscaler_ref.get_dynamic_worker_nums() == 2 + + +@pytest.mark.timeout(timeout=500) +@pytest.mark.parametrize("ray_large_cluster", [{"num_nodes": 4}], indirect=True) +@require_ray +@pytest.mark.asyncio +async def test_ownership_when_scale_in(ray_large_cluster): + client = await new_cluster( + worker_num=0, + worker_cpu=2, + worker_mem=1 * 1024**3, + supervisor_mem=200 * 1024**2, + config={ + "scheduling.autoscale.enabled": True, + "scheduling.autoscale.scheduler_check_interval": 0.1, + "scheduling.autoscale.scheduler_backlog_timeout": 0.5, + "scheduling.autoscale.worker_idle_timeout": 1, + "scheduling.autoscale.min_workers": 1, + "scheduling.autoscale.max_workers": 4, + }, + ) + async with client: + autoscaler_ref = mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), + address=client._cluster.supervisor_address, + ) + num_chunks, chunk_size = 10, 4 + df = md.DataFrame( + mt.random.rand(num_chunks * chunk_size, 4, 
chunk_size=chunk_size), + columns=list("abcd"), + ) + latch_actor = ray.remote(CountDownLatch).remote(1) + pid = os.getpid() + + def f(pdf, latch): + if os.getpid() != pid: + # type inference will call this function too + ray.get(latch.wait.remote()) + return pdf + + df = df.map_chunk( + f, + args=(latch_actor,), + ) + info = df.execute(wait=False) + while await autoscaler_ref.get_dynamic_worker_nums() <= 1: + logger.info("Waiting workers to be created.") + await asyncio.sleep(1) + await latch_actor.count_down.remote() + await info + assert info.exception() is None + assert info.progress() == 1 + logger.info("df execute succeed.") + + while await autoscaler_ref.get_dynamic_worker_nums() > 1: + dynamic_workers = await autoscaler_ref.get_dynamic_workers() + logger.info("Waiting workers %s to be released.", dynamic_workers) + await asyncio.sleep(1) + # Test data on node of released worker can still be fetched + pd_df = df.fetch() + groupby_sum_df = ( + df.rechunk(chunk_size * 2).groupby("a").apply(lambda pdf: pdf.sum()) + ) + logger.info(groupby_sum_df.execute()) + while await autoscaler_ref.get_dynamic_worker_nums() > 1: + dynamic_workers = await autoscaler_ref.get_dynamic_workers() + logger.info(f"Waiting workers %s to be released.", dynamic_workers) + await asyncio.sleep(1) + assert df.to_pandas().to_dict() == pd_df.to_dict() + assert ( + groupby_sum_df.to_pandas().to_dict() + == pd_df.groupby("a").apply(lambda pdf: pdf.sum()).to_dict() + ) + + +class CountDownLatch: + def __init__(self, cnt): + self.cnt = cnt + + def count_down(self): + self.cnt -= 1 + + def get_count(self): + return self.cnt + + async def wait(self): + while self.cnt != 0: + await asyncio.sleep(0.01) diff --git a/python/xorbits/_mars/deploy/oscar/worker.py b/python/xorbits/_mars/deploy/oscar/worker.py new file mode 100644 index 000000000..7eeb05833 --- /dev/null +++ b/python/xorbits/_mars/deploy/oscar/worker.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
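+
+# Command-line entry point for a Mars worker: WorkerCommandRunner parses
+# CPU/memory/CUDA and storage options, builds the band-to-resource mapping,
+# creates the worker actor pool, and starts/stops the worker services.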
+ +import os + +from ...resource import Resource, cpu_count, cuda_count, mem_total +from ...services import NodeRole +from ...utils import get_next_port +from .cmdline import OscarCommandRunner +from .local import start_worker, stop_worker +from .pool import create_worker_actor_pool + + +class WorkerCommandRunner(OscarCommandRunner): + command_description = "Mars Worker" + node_role = NodeRole.WORKER + + def __init__(self): + super().__init__() + self.band_to_resource = dict() + self.cuda_devices = [] + self.n_io_process = 1 + + def config_args(self, parser): + super().config_args(parser) + parser.add_argument("--n-cpu", help="num of CPU to use", default="auto") + parser.add_argument( + "--mem-bytes", help="bytes of memory to use", default="auto" + ) + parser.add_argument("--n-io-process", help="num of IO processes", default="1") + parser.add_argument( + "--cuda-devices", + help="CUDA device to use, if not specified, will use " + "all available devices", + default="auto", + ) + + def parse_args(self, parser, argv, environ=None): + environ = environ or os.environ + args = super().parse_args(parser, argv, environ=environ) + + if ( + self.config.get("cluster", {}).get("backend", "fixed") == "fixed" + and not args.supervisors + ): # pragma: no cover + raise ValueError("--supervisors is needed to start Mars Worker") + + if args.endpoint is None: + args.endpoint = f"{args.host}:{get_next_port()}" + self.n_io_process = int(args.n_io_process) + + n_cpu = cpu_count() if args.n_cpu == "auto" else int(args.n_cpu) + mem_bytes = mem_total() if args.mem_bytes == "auto" else int(args.mem_bytes) + + if "CUDA_VISIBLE_DEVICES" in os.environ: # pragma: no cover + args.cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].strip() + + if args.cuda_devices == "auto": + self.cuda_devices = list(range(cuda_count())) + elif args.cuda_devices.strip() == "": # pragma: no cover + # allow using CPU only + self.cuda_devices = [] + else: # pragma: no cover + self.cuda_devices = [int(i) for i in args.cuda_devices.split(",")] + + self.band_to_resource = band_to_resource = dict() + band_to_resource["numa-0"] = Resource(num_cpus=n_cpu, mem_bytes=mem_bytes) + for i in self.cuda_devices: # pragma: no cover + band_to_resource[f"gpu-{i}"] = Resource(num_gpus=1) + + storage_config = self.config["storage"] = self.config.get("storage", {}) + backends = storage_config["backends"] = storage_config.get("backends", []) + plasma_config = storage_config["plasma"] = storage_config.get("plasma", {}) + disk_config = storage_config["disk"] = storage_config.get("disk", {}) + if "MARS_CACHE_MEM_SIZE" in environ: + plasma_config["store_memory"] = environ["MARS_CACHE_MEM_SIZE"] + if "MARS_PLASMA_DIRS" in environ: + plasma_config["plasma_directory"] = environ["MARS_PLASMA_DIRS"] + if "MARS_SPILL_DIRS" in environ: + backends.append("disk") + disk_config["root_dirs"] = environ["MARS_SPILL_DIRS"] + + return args + + async def create_actor_pool(self): + return await create_worker_actor_pool( + self.args.endpoint, + self.band_to_resource, + ports=self.ports, + n_io_process=self.n_io_process, + modules=list(self.args.load_modules), + logging_conf=self.logging_conf, + cuda_devices=self.cuda_devices, + subprocess_start_method="forkserver" if os.name != "nt" else "spawn", + metrics=self.config.get("metrics", {}), + oscar_config=self.config.get("oscar"), + ) + + async def start_services(self): + return await start_worker( + self.pool.external_address, + self.args.supervisors, + self.band_to_resource, + list(self.args.load_modules), + self.config, + ) + + 
async def stop_services(self): + return await stop_worker(self.pool.external_address, self.config) + + +main = WorkerCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/tests/__init__.py b/python/xorbits/_mars/deploy/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/tests/base_test_cfg.yml b/python/xorbits/_mars/deploy/tests/base_test_cfg.yml new file mode 100644 index 000000000..e7a53db89 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/base_test_cfg.yml @@ -0,0 +1,13 @@ +"@inherits": '@default' +test_list: + - item1 + - item2 +test_list2: + - item1 + - item2 +test_dict: + key1: val1 + key2: + key2_key1: + val2 +"@overriding_fields": ["test_list2"] diff --git a/python/xorbits/_mars/deploy/tests/inherit_test_cfg1.yml b/python/xorbits/_mars/deploy/tests/inherit_test_cfg1.yml new file mode 100644 index 000000000..4ee50c47a --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/inherit_test_cfg1.yml @@ -0,0 +1,5 @@ +"@inherits": '@mars/deploy/tests/base_test_cfg.yml' +test_list: + - item3 +test_list2: # overriding + - item3 diff --git a/python/xorbits/_mars/deploy/tests/inherit_test_cfg2.yml b/python/xorbits/_mars/deploy/tests/inherit_test_cfg2.yml new file mode 100644 index 000000000..ffdaf7181 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/inherit_test_cfg2.yml @@ -0,0 +1,6 @@ +"@inherits": inherit_test_cfg1.yml +test_dict: + key2: + key2_key1: + val2_modified + key3: val3 diff --git a/python/xorbits/_mars/deploy/tests/test_utils.py b/python/xorbits/_mars/deploy/tests/test_utils.py new file mode 100644 index 000000000..b6d4af9e5 --- /dev/null +++ b/python/xorbits/_mars/deploy/tests/test_utils.py @@ -0,0 +1,105 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
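+
+# Tests for deploy utilities: service config loading with @inherits and
+# @overriding_fields, per-role third-party module resolution (including the
+# MARS_LOAD_MODULES environment variable), and next_in_thread.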
+ +import os + +import pytest + +from ...services import NodeRole +from ..utils import ( + get_third_party_modules_from_config, + load_service_config_file, + next_in_thread, +) + +_cwd = os.path.abspath(os.getcwd()) + + +@pytest.mark.parametrize("cwd", [_cwd, os.path.dirname(_cwd)]) +def test_load_service_config(cwd): + old_cwd = os.getcwd() + try: + os.chdir(cwd) + cfg = load_service_config_file( + os.path.join(os.path.dirname(__file__), "inherit_test_cfg2.yml") + ) + + assert "services" in cfg + assert cfg["test_list"] == ["item1", "item2", "item3"] + assert cfg["test_list2"] == ["item3"] + assert set(cfg["test_dict"].keys()) == {"key1", "key2", "key3"} + assert set(cfg["test_dict"]["key2"].values()) == {"val2_modified"} + assert all(not k.startswith("@") for k in cfg.keys()) + finally: + os.chdir(old_cwd) + + +def test_get_third_party_modules_from_config(): + r = get_third_party_modules_from_config({}, NodeRole.SUPERVISOR) + assert r == [] + + r = get_third_party_modules_from_config({}, NodeRole.WORKER) + assert r == [] + + config = {"third_party_modules": {"supervisor": ["a.module"]}} + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == ["a.module"] + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == [] + + config = {"third_party_modules": {"worker": ["b.module"]}} + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == ["b.module"] + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == [] + + config = {"third_party_modules": ["ab.module"]} + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == ["ab.module"] + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == ["ab.module"] + + os.environ["MARS_LOAD_MODULES"] = "c.module,d.module" + try: + r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + assert r == ["ab.module", "c.module", "d.module"] + r = get_third_party_modules_from_config(config, NodeRole.WORKER) + assert r == ["ab.module", "c.module", "d.module"] + r = get_third_party_modules_from_config({}, NodeRole.SUPERVISOR) + assert r == ["c.module", "d.module"] + r = get_third_party_modules_from_config({}, NodeRole.WORKER) + assert r == ["c.module", "d.module"] + finally: + os.environ.pop("MARS_LOAD_MODULES", None) + + config = {"third_party_modules": "ab.module"} + with pytest.raises(TypeError, match="str"): + get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + config = {"third_party_modules": {"supervisor": "a.module"}} + with pytest.raises(TypeError, match="str"): + get_third_party_modules_from_config(config, NodeRole.SUPERVISOR) + + +@pytest.mark.asyncio +async def test_next_in_thread(): + def gen_fun(): + yield 1 + yield 2 + + gen = gen_fun() + + assert await next_in_thread(gen) == 1 + assert await next_in_thread(gen) == 2 + with pytest.raises(StopAsyncIteration): + await next_in_thread(gen) diff --git a/python/xorbits/_mars/deploy/utils.py b/python/xorbits/_mars/deploy/utils.py new file mode 100644 index 000000000..b378a4589 --- /dev/null +++ b/python/xorbits/_mars/deploy/utils.py @@ -0,0 +1,223 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
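# --- Illustrative sketch, not part of the patch ---------------------------------
# Why next_in_thread (tested above, defined in deploy/utils.py below) exists: it
# pulls items from a blocking, synchronous generator without stalling the event
# loop. Standalone copy for demonstration; needs Python 3.9+ for
# asyncio.to_thread. The slow generator is a made-up stand-in for blocking I/O.
import asyncio
import time


async def next_in_thread(gen):
    res = await asyncio.to_thread(next, gen, StopIteration)
    if res is StopIteration:
        raise StopAsyncIteration
    return res


def slow_numbers():
    for i in range(3):
        time.sleep(0.1)  # pretend this blocks on I/O
        yield i


async def main():
    gen = slow_numbers()
    while True:
        try:
            print(await next_in_thread(gen))  # event loop stays responsive
        except StopAsyncIteration:
            break


asyncio.run(main())
# ---------------------------------------------------------------------------------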
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import time +import warnings +from typing import Callable, Dict, List, TextIO, Union + +import yaml + +from ..services import NodeRole +from ..utils import flatten_dict_to_nested_dict, merge_dict + +DEFAULT_CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "oscar/config.yml" +) + + +def wait_services_ready( + selectors: List, min_counts: List[int], count_fun: Callable, timeout=None +): + readies = [0] * len(selectors) + start_time = time.time() + while True: + all_satisfy = True + for idx, selector in enumerate(selectors): + if readies[idx] < min_counts[idx]: + all_satisfy = False + readies[idx] = count_fun(selector) + break + if all_satisfy: + break + if timeout and timeout + start_time < time.time(): + raise TimeoutError("Wait cluster start timeout") + time.sleep(1) + + +def load_service_config_file(path: Union[str, TextIO]) -> Dict: + mars_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + cfg_stack = [] # type: List[Dict] + cfg_file_set = set() + if isinstance(path, str): + path = os.path.abspath(path) + + while path is not None: + if path in cfg_file_set: # pragma: no cover + raise ValueError("Recursive config inherit detected") + + if not hasattr(path, "read"): + with open(path) as file: + cfg = yaml.safe_load(file) + else: + cfg = yaml.safe_load(path) + cfg_stack.append(cfg) + cfg_file_set.add(path) + + inherit_path = cfg.pop("@inherits", None) + if not inherit_path: + path = None + elif os.path.isfile(inherit_path): + path = inherit_path + elif inherit_path == "@default": + path = DEFAULT_CONFIG_FILE + elif inherit_path.startswith("@mars"): + path = inherit_path.replace("@mars", mars_path) + else: + path = os.path.join(os.path.dirname(path), inherit_path) + + def _override_cfg(src: Union[Dict, List], override: Union[Dict, List]): + if isinstance(override, dict): + overriding_fields = set(src.get("@overriding_fields") or set()) + for key, val in override.items(): + if ( + key not in src + or not isinstance(val, (list, dict)) + or key in overriding_fields + ): + src[key] = val + else: + _override_cfg(src[key], override[key]) + else: + src.extend(override) + + def _clear_meta_cfg(src: Dict): + meta_keys = [] + for k, v in src.items(): + if k.startswith("@"): + meta_keys.append(k) + elif isinstance(v, dict): + _clear_meta_cfg(v) + + for k in meta_keys: + src.pop(k) + + cfg = cfg_stack[-1] + for new_cfg in cfg_stack[-2::-1]: + _override_cfg(cfg, new_cfg) + + _clear_meta_cfg(cfg) + return cfg + + +def _merge_config(full_config: Dict, config: Dict) -> Dict: + """ + Merge the config to full_config, the config support flatten key, e.g. + + config={ + 'scheduling.autoscale.enabled': True, + 'scheduling.autoscale.scheduler_check_interval': 1, + 'scheduling.autoscale.scheduler_backlog_timeout': 1, + 'scheduling.autoscale.worker_idle_timeout': 10, + 'scheduling.autoscale.min_workers': 1, + 'scheduling.autoscale.max_workers': 4 + } + """ + if not config: + return full_config + if not isinstance(config, Dict): # pragma: no cover + raise ValueError( + f"The config should be a dict, but the type is {type(config)}." 
+ ) + flatten_keys = set(k for k in config.keys() if isinstance(k, str) and "." in k) + nested_flatten_config = flatten_dict_to_nested_dict( + {k: config[k] for k in flatten_keys} + ) + nested_config = {k: config[k] for k in config.keys() if k not in flatten_keys} + config = merge_dict(nested_config, nested_flatten_config, overwrite=False) + merge_dict(full_config, config) + return full_config + + +def load_config(config: Union[str, Dict], default_config_file: str): + """ + Load config based on the default_config. + """ + # use default config + if isinstance(config, str): + filename = config + config = load_service_config_file(filename) + else: + full_config = load_service_config_file(default_config_file) + config = _merge_config(full_config, config) + if config["scheduling"]["speculation"]["enabled"] is True: + # if `initial_same_color_num` > 1, coloring based fusion will make subtask too heterogeneous such that + # the speculative scheduler can't get enough homogeneous subtasks to calculate statistics + warnings.warn( + "speculative execution is enabled, set initial_same_color_num to 1 to " + "ensure enough homogeneous subtasks to calculate statistics." + ) + config["task"]["default_config"]["initial_same_color_num"] = 1 + ray_execution_config = config["task"]["execution_config"].setdefault("ray", {}) + subtask_max_retries = config["scheduling"].get("subtask_max_retries") + if subtask_max_retries is not None: + ray_execution_config.setdefault("subtask_max_retries", subtask_max_retries) + return config + + +async def wait_all_supervisors_ready(endpoint): + """ + Wait till all containers are ready + """ + from ..services.cluster import ClusterAPI + + cluster_api = None + + while True: + try: + cluster_api = await ClusterAPI.create(endpoint) + break + except: # noqa: E722 # pylint: disable=bare-except # pragma: no cover + await asyncio.sleep(0.1) + continue + + assert cluster_api is not None + await cluster_api.wait_all_supervisors_ready() + + +def get_third_party_modules_from_config(config: Dict, role: NodeRole, environ=None): + environ = environ or os.environ + third_party_modules = config.get("third_party_modules", []) + if isinstance(third_party_modules, list): + modules = third_party_modules + elif isinstance(third_party_modules, dict): + key = { + NodeRole.SUPERVISOR: "supervisor", + NodeRole.WORKER: "worker", + } + modules = third_party_modules.get(key[role], []) + if not isinstance(modules, list): + raise TypeError( + f"The value type of third_party_modules.{key[role]} " + f"should be a list, but got a {type(modules)} instead." + ) + else: + raise TypeError( + f"The value type of third_party_modules should be a list " + f"or dict, but got a {type(third_party_modules)} instead." + ) + + all_modules = [] + for mods in tuple(modules or ()) + (environ.get("MARS_LOAD_MODULES"),): + all_modules.extend(mods.split(",") if mods else []) + return all_modules + + +async def next_in_thread(gen): + res = await asyncio.to_thread(next, gen, StopIteration) + if res is StopIteration: + raise StopAsyncIteration + return res diff --git a/python/xorbits/_mars/deploy/yarn/__init__.py b/python/xorbits/_mars/deploy/yarn/__init__.py new file mode 100644 index 000000000..82f91f84c --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
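# --- Illustrative sketch, not part of the patch ---------------------------------
# What _merge_config above does with flattened keys. flatten_dict_to_nested_dict
# and merge_dict live in ..utils in the real code; tiny stand-ins are inlined
# here so the snippet runs on its own, and the behaviour shown is a simplification.
def flatten_to_nested(flat):
    nested = {}
    for key, value in flat.items():
        node = nested
        *parents, leaf = key.split(".")
        for part in parents:
            node = node.setdefault(part, {})
        node[leaf] = value
    return nested


def merge(dst, src):
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            merge(dst[key], value)
        else:
            dst[key] = value
    return dst


full_config = {"scheduling": {"autoscale": {"enabled": False, "min_workers": 1}}}
user_config = {
    "scheduling.autoscale.enabled": True,
    "scheduling.autoscale.max_workers": 4,
}
merge(full_config, flatten_to_nested(user_config))
print(full_config)
# {'scheduling': {'autoscale': {'enabled': True, 'min_workers': 1, 'max_workers': 4}}}
# ---------------------------------------------------------------------------------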
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .client import YarnClusterClient, new_cluster diff --git a/python/xorbits/_mars/deploy/yarn/client.py b/python/xorbits/_mars/deploy/yarn/client.py new file mode 100644 index 000000000..109dc67e2 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/client.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import random +import time +import uuid + +from ...session import new_session +from ...utils import calc_size_by_str, to_str +from ..utils import wait_services_ready +from .config import MarsApplicationConfig, MarsSupervisorConfig, MarsWorkerConfig + +logger = logging.getLogger(__name__) + + +class YarnClusterClient: + def __init__(self, skein_client, application_id, endpoint, is_client_managed=False): + self._skein_client = skein_client + self._is_client_managed = is_client_managed + self._application_id = application_id + self._endpoint = endpoint + self._session = new_session(endpoint) + + @property + def session(self): + return self._session + + @property + def endpoint(self): + return self._endpoint + + @property + def application_id(self): + return self._application_id + + def stop(self, status="SUCCEEDED"): + import skein + + try: + skein_client = skein.Client() + app_client = skein_client.connect(self._application_id) + app_client.shutdown(status=status) + if self._is_client_managed: + self._skein_client.close() + except skein.ApplicationNotRunningError: + pass + + +def _get_ready_container_count(app_client, svc): + container_ids = set( + c.yarn_container_id for c in app_client.get_containers([svc], ["RUNNING"]) + ) + prefixes = app_client.kv.get_prefix(svc) + registered_ids = set(to_str(v).rsplit("@", 1)[-1] for v in prefixes.values()) + return len(container_ids.intersection(registered_ids)) + + +def new_cluster( + environment=None, + supervisor_num=1, + supervisor_cpu=None, + supervisor_mem=None, + worker_num=1, + worker_cpu=None, + worker_mem=None, + worker_spill_paths=None, + worker_cache_mem=None, + min_worker_num=None, + timeout=None, + log_config=None, + skein_client=None, + app_name=None, + app_queue=None, + **kwargs, +): + import skein + + from .supervisor import YarnSupervisorCommandRunner + + def _override_envs(src, updates): + ret = src.copy() + ret.update(updates) + return ret + + if worker_cpu is None or worker_mem is None: # pragma: no cover + raise TypeError("`worker_cpu` and `worker_mem` must be specified") + + app_name = app_name or f"mars-app-{uuid.uuid4()}" + supervisor_mem = calc_size_by_str(supervisor_mem, None) + worker_mem = calc_size_by_str(worker_mem, None) + + 
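# --- Illustrative sketch, not part of the patch ---------------------------------
# The key-value convention that _get_ready_container_count above relies on: each
# container registers itself under its service prefix as
# "<endpoint>@<yarn_container_id>" (see YarnServiceMixin.register_endpoint later
# in this patch), and readiness is the overlap between RUNNING containers and
# registered ids. Container ids and endpoints below are made up.
def count_ready(running_container_ids, registered_values):
    registered_ids = {value.rsplit("@", 1)[-1] for value in registered_values}
    return len(set(running_container_ids) & registered_ids)


running = ["container_0001_01_000002", "container_0001_01_000003"]
registered = [
    "10.0.0.5:24567@container_0001_01_000002",  # running and registered -> ready
    "10.0.0.6:24567@container_0001_01_000009",  # registered but no longer running
]
print(count_ready(running, registered))  # -> 1
# ---------------------------------------------------------------------------------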
log_when_fail = kwargs.pop("log_when_fail", False) + + supervisor_extra_modules = kwargs.pop("supervisor_extra_modules", None) + worker_extra_modules = kwargs.pop("worker_extra_modules", None) + + cmd_tmpl = kwargs.pop("cmd_tmpl", None) + + extra_envs = kwargs.pop("extra_env", dict()) + supervisor_extra_env = _override_envs( + extra_envs, kwargs.pop("supervisor_extra_env", dict()) + ) + worker_extra_env = _override_envs( + extra_envs, kwargs.pop("worker_extra_env", dict()) + ) + + extra_args = kwargs.pop("extra_args", "") + supervisor_extra_args = ( + extra_args + " " + kwargs.pop("supervisor_extra_args", "") + ).strip() + worker_extra_args = (extra_args + " " + kwargs.pop("worker_extra_args", "")).strip() + + supervisor_log_config = kwargs.pop("supervisor_log_config", log_config) + worker_log_config = kwargs.pop("worker_log_config", log_config) + + supervisor_config = MarsSupervisorConfig( + instances=supervisor_num, + environment=environment, + cpu=supervisor_cpu, + memory=supervisor_mem, + modules=supervisor_extra_modules, + env=supervisor_extra_env, + log_config=supervisor_log_config, + extra_args=supervisor_extra_args, + cmd_tmpl=cmd_tmpl, + ) + worker_config = MarsWorkerConfig( + instances=worker_num, + environment=environment, + cpu=worker_cpu, + memory=worker_mem, + spill_dirs=worker_spill_paths, + worker_cache_mem=worker_cache_mem, + modules=worker_extra_modules, + env=worker_extra_env, + log_config=worker_log_config, + extra_args=worker_extra_args, + cmd_tmpl=cmd_tmpl, + ) + app_config = MarsApplicationConfig( + app_name, + app_queue, + supervisor_config=supervisor_config, + worker_config=worker_config, + ) + + skein_client = skein_client or skein.Client() + app_id = None + try: + is_client_managed = skein_client is not None + app_id = skein_client.submit(app_config.build()) + + check_start_time = time.time() + while True: + try: + app_client = skein_client.connect(app_id) + break + except skein.ApplicationNotRunningError: # pragma: no cover + time.sleep(0.5) + if timeout and time.time() - check_start_time > timeout: + raise + + logger.debug( + "Application client for %s at %s retrieved", app_id, app_client.address + ) + + # wait until supervisors and expected num of workers are ready + min_worker_num = int(min_worker_num or worker_num) + limits = [supervisor_num, min_worker_num] + services = [MarsSupervisorConfig.service_name, MarsWorkerConfig.service_name] + + wait_services_ready( + services, + limits, + lambda svc: _get_ready_container_count(app_client, svc), + timeout=None if not timeout else timeout - (time.time() - check_start_time), + ) + web_endpoint_kv = app_client.kv.get_prefix( + YarnSupervisorCommandRunner.web_service_name + ) + web_endpoint = random.choice( + [to_str(v).split("@", 1)[0] for v in web_endpoint_kv.values()] + ) + return YarnClusterClient( + skein_client, + app_client.id, + web_endpoint, + is_client_managed=is_client_managed, + ) + except: # noqa: E722 + skein_client = skein.Client() + try: + if log_when_fail: + if app_id is not None: + try: + app_client = skein_client.connect(app_id) + app_client.shutdown(status="FAILED") + except skein.ApplicationNotRunningError: + pass + + try: + logs = skein_client.application_logs(app_id) + logger.error("Error when creating cluster:\n%s", logs.dumps()) + except ValueError: + logger.error( + "Error when creating cluster and failed to get logs" + ) + else: + logger.error("Error when creating cluster and no logs from cluster") + finally: + if app_id is not None: + skein_client.kill_application(app_id) + raise diff 
--git a/python/xorbits/_mars/deploy/yarn/config.py b/python/xorbits/_mars/deploy/yarn/config.py new file mode 100644 index 000000000..b5aaa6df0 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/config.py @@ -0,0 +1,308 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import textwrap +from urllib.parse import urlparse + +from ...utils import parse_readable_size + + +def _remove_nones(cfg): + return dict((k, v) for k, v in cfg.items() if v is not None) + + +def _get_local_app_module(mod_name): + return __name__.rsplit(".", 1)[0] + "." + mod_name.rsplit(".", 1)[-1] + + +class SecurityConfig: + def __init__(self, cert_file=None, key_file=None): + self._cert_file = cert_file + self._key_file = key_file + + def build(self): + return dict(cert_file=self._cert_file, key_file=self._key_file) + + +class AppFileConfig: + def __init__( + self, source, file_type=None, visibility=None, size=None, timestamp=None + ): + self._source = source + self._file_type = file_type + self._visibility = visibility + self._size = size + self._timestamp = timestamp + + def build(self): + if all( + v is None + for v in (self._file_type, self._visibility, self._size, self._timestamp) + ): + return self._source + else: + return _remove_nones( + dict( + source=self._source, + type=self._file_type, + visibility=self._visibility, + size=self._size, + timestamp=self._timestamp, + ) + ) + + +class AppContainerConfig: + def __init__(self, cpu=None, memory=None, env=None, files=None, script=None): + self._cpu = cpu + + if memory is not None: + real_mem, is_percent = parse_readable_size(memory) + assert not is_percent + self._memory = real_mem + else: + self._memory = None + + self._env = env + self._script = script + self._files = files + + self.add_default_envs() + + def build_script(self): + return self._script + + def add_default_envs(self): + pass + + def add_env(self, k, v): + if self._env is None: + self._env = dict() + self._env[k] = v + + def build(self): + return _remove_nones( + dict( + resources=dict( + vcores=self._cpu, + memory=f"{self._memory // 1024 ** 2} MiB" if self._memory else None, + ), + env=self._env, + script=self.build_script(), + files=dict((k, v.build()) for k, v in self._files.items()) + if self._files + else None, + ) + ) + + +class AppMasterConfig(AppContainerConfig): + def __init__(self, security=None, **kwargs): + super().__init__(**kwargs) + self._security = security + + def build(self): + d = super().build() + if self._security is not None: + d["security"] = self._security.build() + return d + + +class AppServiceConfig(AppContainerConfig): + def __init__( + self, instances=1, depends=None, allow_failures=False, max_restarts=0, **kwargs + ): + super().__init__(**kwargs) + if isinstance(depends, str): + depends = [depends] + + self._allow_failures = allow_failures + self._depends = depends or [] + self._max_restarts = max_restarts + self._instances = instances + + def build(self): + d = super().build() + d.update( + dict( + 
instances=self._instances, + depends=self._depends, + allow_failures=self._allow_failures, + max_restarts=self._max_restarts, + ) + ) + return d + + +class MarsServiceConfig(AppServiceConfig): + service_name = None + + def __init__( + self, + environment, + modules=None, + cmd_tmpl=None, + cpu=None, + memory=None, + log_config=None, + extra_args=None, + **kwargs, + ): + files = kwargs.pop("files", dict()) + kwargs["files"] = files + + parsed = urlparse(environment) + self._env_scheme = parsed.scheme + + if parsed.scheme: + import mars + + self._source_path = os.path.dirname( + os.path.dirname(os.path.abspath(mars.__file__)) + ) + + self._env_path = environment[len(parsed.scheme) + 3 :] + self._path_environ = os.environ["PATH"] + else: + self._source_path = None + self._env_path = environment + self._path_environ = None + + self._cmd_tmpl = cmd_tmpl or '"{executable}"' + if not self._env_scheme: + files["mars_env"] = AppFileConfig(environment) + + self._log_config = log_config + if log_config: + files["logging.conf"] = AppFileConfig(log_config) + + self._modules = modules.split(",") if isinstance(modules, str) else modules + + self._extra_args = extra_args or "" + + cpu = cpu or 1 + memory = memory or "1 GiB" + super().__init__(cpu=cpu, memory=memory, **kwargs) + + def add_default_envs(self): + if self._cpu: + self.add_env("MKL_NUM_THREADS", str(self._cpu)) + self.add_env("MARS_CPU_TOTAL", str(self._cpu)) + self.add_env("MARS_USE_PROCESS_STAT", "1") + + if self._memory: + self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory))) + + if self._modules: + self.add_env("MARS_LOAD_MODULES", ",".join(self._modules)) + + if self._path_environ: + self.add_env("MARS_YARN_PATH", self._path_environ) + + if self._source_path: + self.add_env("MARS_SOURCE_PATH", self._source_path) + + def build_script(self): + bash_lines = [ + textwrap.dedent( + """ + #!/bin/bash + if [[ "$YARN_CONTAINER_RUNTIME_TYPE" == "docker" ]]; then + export MARS_USE_CGROUP_STAT=1 + else + export MARS_USE_PROCESS_STAT=1 + fi + if [[ -n $MARS_SOURCE_PATH ]]; then export PYTHONPATH=$PYTHONPATH:$MARS_SOURCE_PATH; fi + if [[ -n $MARS_YARN_PATH ]]; then export PATH=$MARS_YARN_PATH:$PATH; fi + """ + ).strip() + ] + + if not self._env_scheme: + bash_lines.append("source mars_env/bin/activate") + python_executable = "mars_env/bin/python" + elif self._env_scheme == "conda": + bash_lines.append(f'conda activate "{self._env_path}"') + python_executable = "python" + elif self._env_scheme == "venv": + bash_lines.append(f'source "{self._env_path}/bin/activate"') + python_executable = self._env_path + "/bin/python" + else: # pragma: no cover + python_executable = self._env_path + + cmd = self._cmd_tmpl.format(executable=python_executable) + bash_lines.append( + f"{cmd} -m {_get_local_app_module(self.service_name)} {self._extra_args} > /tmp/{self.service_name}.stdout.log 2> /tmp/{self.service_name}.stderr.log" + ) + return "\n".join(bash_lines) + "\n" + + +class MarsSupervisorConfig(MarsServiceConfig): + service_name = "mars.supervisor" + web_service_name = "mars.web" + + +class MarsWorkerConfig(MarsServiceConfig): + service_name = "mars.worker" + + def __init__(self, environment, worker_cache_mem=None, spill_dirs=None, **kwargs): + kwargs["depends"] = MarsSupervisorConfig.service_name + super().__init__(environment, **kwargs) + + if worker_cache_mem: + self.add_env("MARS_CACHE_MEM_SIZE", worker_cache_mem) + + if spill_dirs: + self.add_env( + "MARS_SPILL_DIRS", + spill_dirs if isinstance(spill_dirs, str) else ":".join(spill_dirs), + ) + + +class 
MarsApplicationConfig: + def __init__( + self, + name=None, + queue=None, + file_systems=None, + master=None, + supervisor_config=None, + worker_config=None, + ): + self._name = name + self._queue = queue or "default" + self._file_systems = file_systems or [] + self._master = master or AppMasterConfig(cpu=1, memory="512 MiB") + self._supervisor_config = supervisor_config + self._worker_config = worker_config + + def build(self): + services = _remove_nones( + { + MarsSupervisorConfig.service_name: self._supervisor_config.build() + if self._supervisor_config + else None, + MarsWorkerConfig.service_name: self._worker_config.build() + if self._worker_config + else None, + } + ) + return dict( + name=self._name, + queue=self._queue, + file_systems=self._file_systems, + master=self._master.build() if self._master else None, + services=services, + ) diff --git a/python/xorbits/_mars/deploy/yarn/config.yml b/python/xorbits/_mars/deploy/yarn/config.yml new file mode 100644 index 000000000..b3d0be729 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/config.yml @@ -0,0 +1,3 @@ +"@inherits": ../oscar/config.yml +cluster: + backend: yarn diff --git a/python/xorbits/_mars/deploy/yarn/core.py b/python/xorbits/_mars/deploy/yarn/core.py new file mode 100644 index 000000000..4277ec86e --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/core.py @@ -0,0 +1,200 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +import signal +import uuid +from collections import defaultdict +from typing import AsyncGenerator, Dict, List, Optional, TypeVar + +from ... 
import oscar as mo +from ...services import NodeRole +from ...services.cluster.backends import ( + AbstractClusterBackend, + register_cluster_backend, +) +from ...utils import to_binary, to_str +from ..utils import wait_all_supervisors_ready +from .config import MarsSupervisorConfig, MarsWorkerConfig + +try: + from skein import ApplicationClient + from skein import Client as SkeinClient + from skein import ConnectionError as SkeinConnectionError + from skein import SkeinError + from skein import properties as skein_props +except ImportError: # pragma: no cover + ApplicationClient, SkeinClient, skein_props = None, None, None + SkeinConnectionError, SkeinError = None, None + +RetType = TypeVar("RetType") +logger = logging.getLogger(__name__) + +_role_to_config = { + NodeRole.SUPERVISOR: MarsSupervisorConfig, + NodeRole.WORKER: MarsWorkerConfig, +} + + +class YarnNodeWatchActor(mo.Actor): + def __init__(self): + assert ApplicationClient is not None + self._app_client = ApplicationClient.from_current() + + self._nodes = defaultdict(set) + self._supervisor_watch_task = None + self._role_to_events = defaultdict(list) + + async def __post_create__(self): + self._supervisor_watch_task = asyncio.create_task( + self._watch_nodes(NodeRole.SUPERVISOR) + ) + + async def __pre_destroy__(self): + if self._supervisor_watch_task is not None: # pragma: no branch + self._watch_task.cancel() + + async def get_container_mappings(self, role: NodeRole) -> Dict[str, str]: + key_prefix = _role_to_config[role].service_name + + container_specs = await asyncio.to_thread( + self._app_client.get_containers, [key_prefix] + ) + cid_to_endpoint = {c.yarn_container_id: None for c in container_specs} + + prefixes = await asyncio.to_thread(self._app_client.kv.get_prefix, key_prefix) + for val in prefixes.values(): + ep, cid = to_str(val).split("@", 1) + cid_to_endpoint[cid] = ep + return cid_to_endpoint + + async def _watch_nodes(self, role: NodeRole): + while True: + try: + mappings = await self.get_container_mappings(role) + eps = set(v for v in mappings.values() if v is not None) + + if eps != self._nodes[role]: + logger.info("New endpoints retrieved: %r", eps) + events = self._role_to_events.pop(role, []) + for ev in events: + ev.set() + self._nodes[role] = eps + await asyncio.sleep(1) + except SkeinConnectionError: # pragma: no cover + logger.warning("Skein application down, process will terminate") + os.kill(os.getpid(), signal.SIGTERM) + except (SkeinError, asyncio.CancelledError): # pragma: no cover + logger.exception("Error when watching nodes") + break + + async def get_nodes(self, role: NodeRole) -> List[str]: + if not self._nodes[role]: + mappings = await self.get_container_mappings(role) + eps = set(v for v in mappings.values() if v is not None) + self._nodes[role] = eps + return list(self._nodes[role]) + + async def wait_nodes(self, role: NodeRole): + event = asyncio.Event() + self._role_to_events[role].append(event) + + async def waiter(): + await event.wait() + return list(self._supervisors) + + return waiter() + + +@register_cluster_backend +class YarnClusterBackend(AbstractClusterBackend): + name = "yarn" + + def __init__(self, pool_address: str, watch_ref: mo.ActorRef = None): + self._pool_address = pool_address + self._watch_ref = watch_ref + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str + ) -> "AbstractClusterBackend": + try: + ref = await mo.create_actor( + YarnNodeWatchActor, + uid=YarnNodeWatchActor.default_uid(), + 
address=pool_address, + ) + except mo.ActorAlreadyExist: # pragma: no cover + ref = await mo.actor_ref( + YarnNodeWatchActor.default_uid(), address=pool_address + ) + return YarnClusterBackend(pool_address, ref) + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + if filter_ready: + return await self._watch_ref.get_nodes(NodeRole.SUPERVISOR) + else: + mapping = await self._watch_ref.get_container_mappings(NodeRole.SUPERVISOR) + return [v if v is not None else k for k, v in mapping.items()] + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + while True: + yield await self._watch_ref.wait_nodes(NodeRole.SUPERVISOR) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + raise NotImplementedError + + async def release_worker(self, address: str): + raise NotImplementedError + + async def reconstruct_worker(self, address: str): + raise NotImplementedError + + +class YarnServiceMixin(object): + service_name = None + + @property + def app_client(self): + if not hasattr(self, "_app_client"): + self._app_client = ApplicationClient.from_current() + return self._app_client + + def get_container_ip(self): + svc_containers = self.app_client.get_containers([self.service_name]) + container = next( + c + for c in svc_containers + if c.yarn_container_id == skein_props["yarn_container_id"] + ) + return container.yarn_node_http_address.split(":")[0] + + def register_endpoint(self, prefix: str = None, endpoint: str = None): + prefix = prefix or self.service_name + endpoint = endpoint or self.args.endpoint + + container_key = prefix + "-" + str(uuid.uuid1()) + self.app_client.kv[container_key] = to_binary( + f'{endpoint}@{skein_props["yarn_container_id"]}' + ) + + async def wait_all_supervisors_ready(self): + """ + Wait till all containers are ready, both in yarn and in Cluster Service + """ + await wait_all_supervisors_ready(self.args.endpoint) diff --git a/python/xorbits/_mars/deploy/yarn/supervisor.py b/python/xorbits/_mars/deploy/yarn/supervisor.py new file mode 100644 index 000000000..2e21a0f58 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/supervisor.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
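# --- Illustrative sketch, not part of the patch ---------------------------------
# The fallback in YarnClusterBackend.get_supervisors above: with
# filter_ready=False the backend reports the registered endpoint when one exists
# and the raw YARN container id otherwise, so callers can also see containers
# that have not finished starting. Ids and endpoints are placeholders.
def supervisors_from_mapping(cid_to_endpoint, filter_ready):
    if filter_ready:
        return [ep for ep in cid_to_endpoint.values() if ep is not None]
    return [ep if ep is not None else cid for cid, ep in cid_to_endpoint.items()]


mapping = {
    "container_0001_01_000002": "10.0.0.5:7103",  # registered in the skein kv store
    "container_0001_01_000003": None,             # launched, not yet registered
}
print(supervisors_from_mapping(mapping, filter_ready=True))
# ['10.0.0.5:7103']
print(supervisors_from_mapping(mapping, filter_ready=False))
# ['10.0.0.5:7103', 'container_0001_01_000003']
# ---------------------------------------------------------------------------------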
+ +import os + +from ..oscar.supervisor import SupervisorCommandRunner +from .config import MarsSupervisorConfig +from .core import YarnServiceMixin + + +class YarnSupervisorCommandRunner(YarnServiceMixin, SupervisorCommandRunner): + service_name = MarsSupervisorConfig.service_name + web_service_name = MarsSupervisorConfig.web_service_name + + def __call__(self, *args, **kwargs): + os.environ["MARS_CONTAINER_IP"] = self.get_container_ip() + return super().__call__(*args, **kwargs) + + async def start_services(self): + self.register_endpoint() + + await super().start_services() + + from ...services.web import OscarWebAPI + + web_api = await OscarWebAPI.create(self.args.endpoint) + web_endpoint = await web_api.get_web_address() + self.register_endpoint(self.web_service_name, web_endpoint) + + +main = YarnSupervisorCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/deploy/yarn/tests/__init__.py b/python/xorbits/_mars/deploy/yarn/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/deploy/yarn/tests/test_config.py b/python/xorbits/_mars/deploy/yarn/tests/test_config.py new file mode 100644 index 000000000..a78b2013c --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/test_config.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .... 
import __file__ as mars_file +from ..config import ( + AppFileConfig, + AppMasterConfig, + MarsApplicationConfig, + MarsSupervisorConfig, + MarsWorkerConfig, + SecurityConfig, +) + + +def test_simple_object(): + config = SecurityConfig("/path/to/cert.pem", "/path/to/key.pem").build() + assert config["cert_file"] == "/path/to/cert.pem" + assert config["key_file"] == "/path/to/key.pem" + + config = AppFileConfig(source="/path/to/file").build() + assert config == "/path/to/file" + config = AppFileConfig(source="/path/to/file", file_type="archive").build() + assert config["source"] == "/path/to/file" + assert config["type"] == "archive" + + config = AppMasterConfig( + security=SecurityConfig("/path/to/cert.pem", "/path/to/key.pem"), + cpu=1, + memory="512 MiB", + ).build() + assert config["security"]["cert_file"] == "/path/to/cert.pem" + assert config["security"]["key_file"] == "/path/to/key.pem" + assert config["resources"]["vcores"] == 1 + + +def test_supervisor_config(): + config = MarsSupervisorConfig( + "/path/to/packed.tar.gz", + "mars.test_mod", + cpu=2, + memory="10 GiB", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ).build() + assert config["files"]["mars_env"] == "/path/to/packed.tar.gz" + assert "mars.deploy.yarn.supervisor" in config["script"] + + config_envs = config["env"] + assert config_envs["TEST_ENV"] == "test_val" + assert config_envs["MKL_NUM_THREADS"] == "2" + assert config_envs["MARS_CPU_TOTAL"] == "2" + assert int(config_envs["MARS_MEMORY_TOTAL"]) == 10 * 1024**3 + assert config_envs["MARS_LOAD_MODULES"] == "mars.test_mod" + + config = MarsSupervisorConfig( + "conda://path/to_env", + "mars.test_mod", + cpu=2, + memory="10 GiB", + log_config="logging.conf", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ).build() + config_envs = config["env"] + assert config_envs["MARS_SOURCE_PATH"] == os.path.dirname( + os.path.dirname(mars_file) + ) + + config = MarsSupervisorConfig( + "venv://path/to_env", + "mars.test_mod", + cpu=2, + log_config="logging.conf", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ).build() + config_envs = config["env"] + assert config_envs["MARS_SOURCE_PATH"] == os.path.dirname( + os.path.dirname(mars_file) + ) + + +def test_worker_config(): + config = MarsWorkerConfig("/path/to/packed.tar.gz").build() + assert "mars.deploy.yarn.worker" in config["script"] + assert config["depends"] == [MarsSupervisorConfig.service_name] + + config = MarsWorkerConfig( + "/path/to/packed.tar.gz", + worker_cache_mem="10g", + spill_dirs=["/spill/dir1", "/spill/dir2"], + ).build() + config_envs = config["env"] + assert config_envs["MARS_CACHE_MEM_SIZE"] == "10g" + assert config_envs["MARS_SPILL_DIRS"].split(":") == ["/spill/dir1", "/spill/dir2"] + + +def test_app_config(): + supervisor_config = MarsSupervisorConfig( + "/path/to/packed.tar.gz", + "mars.test_mod", + cpu=2, + memory="10 GiB", + env={"TEST_ENV": "test_val"}, + extra_args="-Dsupervisor.default_cpu_usage=0", + ) + worker_config = MarsWorkerConfig( + "/path/to/packed.tar.gz", + worker_cache_mem="10g", + spill_dirs=["/spill/dir1", "/spill/dir2"], + ) + + config = MarsApplicationConfig( + name="config-name", + queue="default", + supervisor_config=supervisor_config, + worker_config=worker_config, + ).build() + assert config["name"] == "config-name" + assert config["queue"] == "default" diff --git a/python/xorbits/_mars/deploy/yarn/tests/test_yarn.py b/python/xorbits/_mars/deploy/yarn/tests/test_yarn.py new file 
mode 100644 index 000000000..dd2b1738e --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/test_yarn.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import logging +import os +import shutil +import sqlite3 +import subprocess +import tempfile +import time +from distutils.spawn import find_executable + +import numpy as np +import pytest + +from .... import tensor as mt +from ....tests.core import flaky, require_hadoop +from ...yarn import new_cluster + +logger = logging.getLogger(__name__) +MARS_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(mt.__file__))) + + +def _collect_coverage(): + time.sleep(5) + dist_coverage_path = os.path.join(MARS_ROOT, ".dist-coverage") + if os.path.exists(dist_coverage_path): + # change ownership of coverage files + if find_executable("sudo"): + proc = subprocess.Popen( + [ + "sudo", + "-n", + "chown", + "-R", + f"{os.geteuid()}:{os.getegid()}", + dist_coverage_path, + ], + shell=False, + ) + proc.wait() + + # rewrite paths in coverage result files + for fn in glob.glob(os.path.join(dist_coverage_path, ".coverage.*")): + cov_db = sqlite3.connect(fn) + c = cov_db.cursor() + c.execute( + f"UPDATE file SET path=REPLACE(path, '{MARS_ROOT + os.path.sep}', '')" + ) + cov_db.commit() + cov_db.close() + + if "COVERAGE_FILE" in os.environ: + new_cov_file = os.environ["COVERAGE_FILE"] + os.path.basename( + fn + ).replace(".coverage", "") + else: + new_cov_file = fn.replace(".dist-coverage" + os.sep, "") + shutil.copyfile(fn, new_cov_file) + shutil.rmtree(dist_coverage_path) + + +def _run_yarn_test_with_env(env_path, timeout): + cluster = None + + coverage_result = os.path.join(MARS_ROOT, ".dist-coverage", ".coverage") + cov_dir = os.path.join(MARS_ROOT, ".dist-coverage") + os.makedirs(cov_dir, exist_ok=True) + os.chmod(cov_dir, 0o777) + try: + log_config_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "yarn-logging.conf" + ) + + cmd_tmpl = ( + '"{executable}" -m coverage run --source=%s/mars --rcfile=%s/setup.cfg' + % (MARS_ROOT, MARS_ROOT) + ) + extra_env = { + "COVERAGE_FILE": coverage_result, + "COVERAGE_PROCESS_START": f"{MARS_ROOT}/setup.cfg", + } + cluster = new_cluster( + env_path, + timeout=timeout, + worker_cpu=1, + worker_mem="1G", + extra_env=extra_env, + log_config=log_config_file, + extra_args=f"--config-file {MARS_ROOT}/mars/deploy/yarn/tests/test_yarn_config.yml", + log_when_fail=True, + cmd_tmpl=cmd_tmpl, + ) + assert cluster.endpoint is not None + + check_time = time.time() + while cluster.session.get_total_n_cpu() == 0: + time.sleep(1) + if time.time() - check_time > 5: + raise SystemError("Worker not ready") + + a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 + b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 + r = (a * b * 2 + 1).sum().execute().fetch() + + expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1 + np.testing.assert_array_equal(r, expected.sum()) + finally: + if cluster is not None: + cluster.stop() + _collect_coverage() + + 
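# --- Illustrative sketch, not part of the patch ---------------------------------
# The user-facing flow that _run_yarn_test_with_env above drives, assuming the
# vendored package is importable as xorbits._mars. The environment path,
# resource figures and timeout are placeholders.
from xorbits._mars import tensor as mt
from xorbits._mars.deploy.yarn import new_cluster

cluster = new_cluster(
    "conda:///opt/conda/envs/mars-env",  # or a packed tarball / venv:// path
    worker_num=2,
    worker_cpu=4,       # worker_cpu and worker_mem are required
    worker_mem="8G",
    timeout=600,
)
try:
    # the client creates a session against the cluster's web endpoint,
    # so tensor graphs execute on the YARN containers
    result = (mt.ones((100, 100), chunk_size=30) * 2 + 1).sum().execute().fetch()
    print(result)
finally:
    cluster.stop()
# ---------------------------------------------------------------------------------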
+@require_hadoop +@flaky(max_runs=3) +def test_run_with_conda_env(): + _run_yarn_test_with_env("conda://" + os.environ["CONDA_PREFIX"], 600) + + +@require_hadoop +@flaky(max_runs=3) +def test_run_with_packed_env(): + import conda_pack + + temp_dir = os.environ.get("MARS_YARN_TEST_DIR") + clean_after_test = False + if temp_dir is None: + clean_after_test = True + temp_dir = tempfile.mkdtemp(prefix="test-mars-yarn-") + else: + os.makedirs(temp_dir, exist_ok=True) + + packed_env_file = os.path.join(temp_dir, "mars-test-env.tar.gz") + if not os.path.exists(packed_env_file): + try: + conda_pack.pack(output=packed_env_file, ignore_editable_packages=True) + except conda_pack.CondaPackException: + logger.exception("Failed to pack environment, this test will be skipped") + return + + try: + _run_yarn_test_with_env(packed_env_file, 1200) + finally: + if clean_after_test: + shutil.rmtree(temp_dir) + + +@require_hadoop +@flaky(max_runs=3) +def test_create_timeout(): + cluster = None + try: + env_path = "conda://" + os.environ["CONDA_PREFIX"] + log_config_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "yarn-logging.conf" + ) + + with pytest.raises(TimeoutError): + cluster = new_cluster( + env_path, + log_config=log_config_file, + worker_cpu=1, + worker_mem="1G", + worker_cache_mem="64m", + log_when_fail=True, + timeout=1, + ) + finally: + if cluster is not None: + cluster.stop() + _collect_coverage() diff --git a/python/xorbits/_mars/deploy/yarn/tests/test_yarn_config.yml b/python/xorbits/_mars/deploy/yarn/tests/test_yarn_config.yml new file mode 100644 index 000000000..489d11d29 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/test_yarn_config.yml @@ -0,0 +1,4 @@ +"@inherits": '@mars/deploy/yarn/config.yml' +storage: + plasma: + store_memory: 20M diff --git a/python/xorbits/_mars/deploy/yarn/tests/yarn-logging.conf b/python/xorbits/_mars/deploy/yarn/tests/yarn-logging.conf new file mode 100644 index 000000000..320ca6cb3 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/tests/yarn-logging.conf @@ -0,0 +1,50 @@ +[loggers] +keys=root,main,deploy,services,oscar,tornado + +[handlers] +keys=stream_handler + +[formatters] +keys=formatter + +[logger_root] +level=WARN +handlers=stream_handler + +[logger_main] +level=DEBUG +handlers=stream_handler +qualname=__main__ +propagate=0 + +[logger_deploy] +level=DEBUG +handlers=stream_handler +qualname=mars.deploy +propagate=0 + +[logger_oscar] +level=DEBUG +handlers=stream_handler +qualname=mars.oscar +propagate=0 + +[logger_services] +level=DEBUG +handlers=stream_handler +qualname=mars.services +propagate=0 + +[logger_tornado] +level=WARN +handlers=stream_handler +qualname=tornado +propagate=0 + +[handler_stream_handler] +class=StreamHandler +formatter=formatter +args=(sys.stderr,) + +[formatter_formatter] +format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s diff --git a/python/xorbits/_mars/deploy/yarn/worker.py b/python/xorbits/_mars/deploy/yarn/worker.py new file mode 100644 index 000000000..4d29b69e7 --- /dev/null +++ b/python/xorbits/_mars/deploy/yarn/worker.py @@ -0,0 +1,52 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from ..oscar.worker import WorkerCommandRunner +from .config import MarsWorkerConfig +from .core import YarnServiceMixin + + +class YarnWorkerCommandRunner(YarnServiceMixin, WorkerCommandRunner): + service_name = MarsWorkerConfig.service_name + + def __call__(self, *args, **kwargs): + os.environ["MARS_CONTAINER_IP"] = self.get_container_ip() + return super().__call__(*args, **kwargs) + + async def start_services(self): + from ...services.cluster import ClusterAPI + from ..oscar.worker import start_worker + + self.register_endpoint() + + await start_worker( + self.pool.external_address, + self.args.supervisors, + self.band_to_resource, + list(self.args.load_modules), + self.config, + mark_ready=False, + ) + await self.wait_all_supervisors_ready() + + cluster_api = await ClusterAPI.create(self.args.endpoint) + await cluster_api.mark_node_ready() + + +main = YarnWorkerCommandRunner() + +if __name__ == "__main__": # pragma: no branch + main() diff --git a/python/xorbits/_mars/learn/__init__.py b/python/xorbits/_mars/learn/__init__.py new file mode 100644 index 000000000..e6012d6f5 --- /dev/null +++ b/python/xorbits/_mars/learn/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import cluster, ensemble, neighbors, preprocessing, proxima, utils + +# register operands +# import torch first, or some issue emerges, +# see https://github.com/pytorch/pytorch/issues/2575 +from .contrib import lightgbm, pytorch, statsmodels, tensorflow, xgboost +from .metrics import pairwise + +for _mod in [xgboost, tensorflow, pytorch, lightgbm, proxima, neighbors, statsmodels]: + _mod.register_op() + +del _mod, pairwise, preprocessing, utils diff --git a/python/xorbits/_mars/learn/base.py b/python/xorbits/_mars/learn/base.py new file mode 100644 index 000000000..5eb03aeef --- /dev/null +++ b/python/xorbits/_mars/learn/base.py @@ -0,0 +1,179 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
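# --- Illustrative sketch, not part of the patch ---------------------------------
# How the ClassifierMixin defined just below is meant to be combined with an
# estimator: score() simply compares predict(X) against y. A toy, NumPy-only
# mixin and a made-up majority-class "estimator" are used here so the snippet
# runs without the rest of the learn package (the real mixin works on tensors
# and supports sample_weight).
import numpy as np


class ToyClassifierMixin:
    def score(self, X, y):
        return float(np.mean(self.predict(X) == np.asarray(y)))


class MajorityClassifier(ToyClassifierMixin):
    def fit(self, X, y):
        values, counts = np.unique(y, return_counts=True)
        self._majority = values[np.argmax(counts)]
        return self

    def predict(self, X):
        return np.full(len(X), self._majority)


X = np.zeros((4, 2))
y = np.array([0, 1, 1, 1])
print(MajorityClassifier().fit(X, y).score(X, y))  # -> 0.75
# ---------------------------------------------------------------------------------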
+ +from sklearn.base import BaseEstimator as SklearnBaseEstimator + +from .utils.validation import check_array, check_X_y + + +class ClassifierMixin: + """Mixin class for all classifiers in scikit-learn.""" + + _estimator_type = "classifier" + + def score(self, X, y, sample_weight=None, session=None, run_kwargs=None): + """ + Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : Tensor + Mean accuracy of self.predict(X) wrt. y. + """ + from .metrics import accuracy_score + + result = accuracy_score( + y, + self.predict(X), + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + return result + + +class RegressorMixin: + """Mixin class for all regression estimators in scikit-learn.""" + + _estimator_type = "regressor" + + def score(self, X, y, sample_weight=None): + """Return the coefficient of determination :math:`R^2` of the + prediction. + + The coefficient :math:`R^2` is defined as :math:`(1 - \\frac{u}{v})`, + where :math:`u` is the residual sum of squares ``((y_true - y_pred) + ** 2).sum()`` and :math:`v` is the total sum of squares ``((y_true - + y_true.mean()) ** 2).sum()``. The best possible score is 1.0 and it + can be negative (because the model can be arbitrarily worse). A + constant model that always predicts the expected value of `y`, + disregarding the input features, would get a :math:`R^2` score of + 0.0. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. For some estimators this may be a precomputed + kernel matrix or a list of generic objects instead with shape + ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted`` + is the number of samples used in the fitting for the estimator. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True values for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : Tensor + :math:`R^2` of ``self.predict(X)`` wrt. `y`. + + Notes + ----- + The :math:`R^2` score used when calling ``score`` on a regressor uses + ``multioutput='uniform_average'`` from version 0.23 to keep consistent + with default value of :func:`~sklearn.metrics.r2_score`. + This influences the ``score`` method of all the multioutput + regressors (except for + :class:`~sklearn.multioutput.MultiOutputRegressor`). + """ + + from .metrics import r2_score + + y_pred = self.predict(X) + return r2_score(y, y_pred, sample_weight=sample_weight) + + def _more_tags(self): # noqa: R0201 # pylint: disable=no-self-use + return {"requires_y": True} + + +class BaseEstimator(SklearnBaseEstimator): + def _validate_data( + self, X, y=None, reset=True, validate_separately=False, **check_params + ): + """Validate input data and set or check the `n_features_in_` attribute. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + The input samples. + y : array-like of shape (n_samples,), default=None + The targets. If None, `check_array` is called on `X` and + `check_X_y` is called otherwise. 
+ reset : bool, default=True + Whether to reset the `n_features_in_` attribute. + If False, the input will be checked for consistency with data + provided when reset was last True. + validate_separately : False or tuple of dicts, default=False + Only used if y is not None. + If False, call validate_X_y(). Else, it must be a tuple of kwargs + to be used for calling check_array() on X and y respectively. + **check_params : kwargs + Parameters passed to :func:`sklearn.utils.check_array` or + :func:`sklearn.utils.check_X_y`. Ignored if validate_separately + is not False. + + Returns + ------- + out : tensor or tuple of these + The validated input. A tuple is returned if `y` is not None. + """ + + if y is None: + if hasattr(self, "_get_tags") and self._get_tags().get( + "requires_y", False + ): # pragma: no cover + raise ValueError( + f"This {type(self).__name__} estimator requires y to be passed, " + "but the target y is None." + ) + X = check_array(X, **check_params) + out = X + elif isinstance(y, str) and y == "no_validation": + X = check_array(X, **check_params) + out = X + else: # pragma: no cover + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling check_array() + # on X and y isn't equivalent to just calling check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = check_array(X, **check_X_params) + y = check_array(y, **check_y_params) + else: + X, y = check_X_y(X, y, **check_params) + out = X, y + + if check_params.get("ensure_2d", True) and hasattr(self, "_check_n_features"): + self._check_n_features(X, reset=reset) + + return out diff --git a/python/xorbits/_mars/learn/cluster/__init__.py b/python/xorbits/_mars/learn/cluster/__init__.py new file mode 100644 index 000000000..6baa53cd6 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from ._kmeans import KMeans, k_means + + def _install(): + from ._k_means_common import KMeansInertia, KMeansRelocateEmptyClusters + from ._k_means_elkan_iter import ( + KMeansElkanInitBounds, + KMeansElkanPostprocess, + KMeansElkanUpdate, + ) + from ._k_means_init import KMeansPlusPlusInit + from ._k_means_lloyd_iter import KMeansLloydPostprocess, KMeansLloydUpdate + + del ( + KMeansInertia, + KMeansRelocateEmptyClusters, + KMeansElkanInitBounds, + KMeansElkanUpdate, + KMeansElkanPostprocess, + KMeansPlusPlusInit, + KMeansLloydUpdate, + KMeansLloydPostprocess, + ) + + _install() + del _install +except ImportError: + KMeans = None + k_means = None diff --git a/python/xorbits/_mars/learn/cluster/_k_means_common.py b/python/xorbits/_mars/learn/cluster/_k_means_common.py new file mode 100644 index 000000000..3799728a9 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_common.py @@ -0,0 +1,402 @@ +# Copyright 2022-2023 XProbe Inc. 
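# --- Illustrative sketch, not part of the patch ---------------------------------
# The quantity the KMeansInertia operand below computes per chunk - the weighted
# sum of squared distances from each sample to its assigned center - before the
# per-chunk results are summed. NumPy-only restatement of what the Cython
# _inertia_dense helper does; the numbers are made up.
import numpy as np


def inertia(X, sample_weight, centers, labels):
    diffs = X - centers[labels]  # offset of every sample from its own center
    return float(np.sum(sample_weight * np.einsum("ij,ij->i", diffs, diffs)))


X = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 0.0]])
w = np.ones(3)
centers = np.array([[0.5, 0.0], [10.0, 0.0]])
labels = np.array([0, 0, 1])
print(inertia(X, w, centers, labels))  # -> 0.5
# ---------------------------------------------------------------------------------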
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ... import tensor as mt +from ...core import OutputType, recursive_tile +from ...serialization.serializables import KeyField +from ...tensor.array_utils import as_same_device, device, sparse +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin +from ._k_means_fast import _inertia_dense, _inertia_sparse, merge_update_chunks + + +class KMeansInertia(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_INERTIA + + _x = KeyField("x") + _sample_weight = KeyField("sample_weight") + _centers = KeyField("centers") + _labels = KeyField("labels") + + def __init__( + self, + x=None, + sample_weight=None, + centers=None, + labels=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _sample_weight=sample_weight, + _centers=centers, + _labels=labels, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] + + @property + def x(self): + return self._x + + @property + def sample_weight(self): + return self._sample_weight + + @property + def centers(self): + return self._centers + + @property + def labels(self): + return self._labels + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in ("_x", "_sample_weight", "_centers", "_labels"): + if getattr(self, field, None) is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + params = {"shape": (), "dtype": np.dtype(float), "order": TensorOrder.C_ORDER} + return self.new_tileable( + [self._x, self._sample_weight, self._centers, self._labels], kws=[params] + ) + + @classmethod + def tile(cls, op: "KMeansInertia"): + if has_unknown_shape(*op.inputs): + yield + x = op.x + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: x.nsplits[0]}) + ) + labels = yield from recursive_tile(op.labels.rechunk({0: x.nsplits[0]})) + centers = op.centers + centers = yield from recursive_tile(centers.rechunk(centers.shape)) + + out_chunks = [] + for x_chunk, sample_weight_chunk, labels_chunk in zip( + x.chunks, sample_weight.chunks, labels.chunks + ): + chunk_op = op.copy().reset_key() + chunk_params = { + "shape": (1,), + "dtype": np.dtype(float), + "order": TensorOrder.C_ORDER, + "index": x_chunk.index, + } + out_chunk = chunk_op.new_chunk( + [x_chunk, sample_weight_chunk, centers.chunks[0], labels_chunk], + kws=[chunk_params], + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = op.outputs[0].params + params["shape"] = (x.chunk_shape[0],) + params["chunks"] = out_chunks + params["nsplits"] = ((1,) * x.chunk_shape[0],) + out = new_op.new_tileable(op.inputs, kws=[params]).sum() + out = yield from recursive_tile(out) + return [out] + + @classmethod + def execute(cls, ctx, op): + (x, sample_weight, centers, labels), device_id, xp = 
as_same_device( + [ctx[inp.key] for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if xp is np: + method = _inertia_dense + elif xp is sparse: + method = _inertia_sparse + else: # pragma: no cover + raise NotImplementedError("Cannot run inertial on GPU") + + result = method(x, sample_weight, centers, labels) + ctx[op.outputs[0].key] = np.array([result]) + + +def _inertia(X, sample_weight, centers, labels): + op = KMeansInertia(x=X, sample_weight=sample_weight, centers=centers, labels=labels) + return op() + + +def _execute_merge_update(ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + length = len(inputs) // 2 + assert len(inputs) % 2 == 0 + centers_new_chunks = inputs[:length] + weight_in_cluster_chunks = inputs[length:] + + with device(device_id): + weight_in_clusters = np.zeros( + op.n_clusters, dtype=weight_in_cluster_chunks[0].dtype + ) + centers_new = np.zeros_like(centers_new_chunks[0]) + n_clusters = op.n_clusters + n_features = centers_new_chunks[0].shape[1] + + for weight_in_clusters_chunk, centers_new_chunk in zip( + weight_in_cluster_chunks, centers_new_chunks + ): + merge_update_chunks( + n_clusters, + n_features, + weight_in_clusters, + weight_in_clusters_chunk, + centers_new, + centers_new_chunk, + ) + + # centers new + ctx[op.outputs[0].key] = centers_new + # weight_in_clusters + ctx[op.outputs[1].key] = weight_in_clusters + + +class KMeansRelocateEmptyClusters(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_RELOCASTE_EMPTY_CLUSTERS + + _empty_clusters = KeyField("empty_clusters") + _far_x = KeyField("far_x") + _far_labels = KeyField("far_labels") + _far_sample_weights = KeyField("far_sample_weight") + _centers_new = KeyField("centers_new") + _weight_in_clusters = KeyField("weight_in_clusters") + + def __init__( + self, + empty_clusters=None, + far_x=None, + far_labels=None, + far_sample_weights=None, + centers_new=None, + weight_in_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _empty_clusters=empty_clusters, + _far_x=far_x, + _far_labels=far_labels, + _far_sample_weights=far_sample_weights, + _centers_new=centers_new, + _weight_in_clusters=weight_in_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def empty_clusters(self): + return self._empty_clusters + + @property + def far_x(self): + return self._far_x + + @property + def far_labels(self): + return self._far_labels + + @property + def far_sample_weights(self): + return self._far_sample_weights + + @property + def centers_new(self): + return self._centers_new + + @property + def weight_in_clusters(self): + return self._weight_in_clusters + + @property + def output_limit(self): + return 2 + + @property + def _input_fields(self): + return ( + "_empty_clusters", + "_far_x", + "_far_labels", + "_far_sample_weights", + "_centers_new", + "_weight_in_clusters", + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in self._input_fields: + ob = getattr(self, field) + if ob is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # centers_new + self._centers_new.params, + # weight_in_clusters + self._weight_in_clusters.params, + ] + return self.new_tileables( + [getattr(self, field) for field in 
self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansRelocateEmptyClusters"): + empty_clusters = yield from recursive_tile( + op.empty_clusters.rechunk(op.empty_clusters.shape) + ) + far_x = yield from recursive_tile(op.far_x.rechunk(op.far_x.shape)) + far_labels = yield from recursive_tile( + op.far_labels.rechunk(op.far_labels.shape) + ) + far_sample_weight = yield from recursive_tile( + op.far_sample_weights.rechunk(op.far_sample_weights.shape) + ) + centers_new = yield from recursive_tile( + op.centers_new.rechunk(op.centers_new.shape) + ) + weight_in_clusters = yield from recursive_tile( + op.weight_in_clusters.rechunk(op.weight_in_clusters.shape) + ) + + chunk_op = op.copy().reset_key() + out_centers_new_chunk, out_weight_in_clusters_chunk = chunk_op.new_chunks( + [ + empty_clusters.chunks[0], + far_x.chunks[0], + far_labels.chunks[0], + far_sample_weight.chunks[0], + centers_new.chunks[0], + weight_in_clusters.chunks[0], + ], + kws=[centers_new.chunks[0].params, weight_in_clusters.chunks[0].params], + ) + + out_centers_new_params = centers_new.params + out_centers_new_params["nsplits"] = centers_new.nsplits + out_centers_new_params["chunks"] = [out_centers_new_chunk] + out_weight_in_clusters_params = weight_in_clusters.params + out_weight_in_clusters_params["nsplits"] = weight_in_clusters.nsplits + out_weight_in_clusters_params["chunks"] = [out_weight_in_clusters_chunk] + new_op = op.copy() + return new_op.new_tileables( + op.inputs, kws=[out_centers_new_params, out_weight_in_clusters_params] + ) + + @classmethod + def execute(cls, ctx, op): + ( + ( + empty_clusters, + far_x, + far_labels, + far_sample_weight, + center_new, + weight_in_clusters, + ), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], op.device, ret_extra=True + ) + + out_centers_new = center_new.copy() + out_weight_in_clusters = weight_in_clusters.copy() + del center_new, weight_in_clusters + + n_empty = empty_clusters.shape[0] + n_features = far_x.shape[1] + + for idx in range(n_empty): + new_cluster_id = empty_clusters[idx] + weight = far_sample_weight[idx] + old_cluster_id = far_labels[idx] + + for k in range(n_features): + out_centers_new[old_cluster_id, k] -= far_x[idx, k] * weight + out_centers_new[new_cluster_id, k] = far_x[idx, k] * weight + + out_weight_in_clusters[new_cluster_id] = weight + out_weight_in_clusters[old_cluster_id] -= weight + + ctx[op.outputs[0].key] = out_centers_new + ctx[op.outputs[1].key] = out_weight_in_clusters + + +def _relocate_empty_clusters( + X, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + to_run=None, + session=None, + run_kwargs=None, +): + to_run = to_run or list() + empty_clusters = mt.where(mt.equal(weight_in_clusters, 0))[0].astype(mt.int32) + to_run.append(empty_clusters) + + mt.ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + + n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return centers_new, weight_in_clusters + + distances = ((mt.asarray(X) - mt.asarray(centers_old)[labels]) ** 2).sum(axis=1) + far_from_centers = mt.argpartition(distances, -n_empty)[: -n_empty - 1 : -1].astype( + np.int32 + ) + + far_x = X[far_from_centers] + far_labels = labels[far_from_centers] + far_sample_weight = sample_weight[far_from_centers] + + op = KMeansRelocateEmptyClusters( + empty_clusters=empty_clusters, + far_x=far_x, + far_labels=far_labels, + far_sample_weights=far_sample_weight, + centers_new=centers_new, + weight_in_clusters=weight_in_clusters, + ) + return 
op() diff --git a/python/xorbits/_mars/learn/cluster/_k_means_elkan.pyx b/python/xorbits/_mars/learn/cluster/_k_means_elkan.pyx new file mode 100644 index 000000000..db3a41708 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_elkan.pyx @@ -0,0 +1,375 @@ +# cython: profile=False, boundscheck=False, wraparound=False, cdivision=True +# +# Author: Andreas Mueller +# +# Licence: BSD 3 clause + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + +import numpy as np +cimport numpy as np +cimport cython +from cython cimport floating +from sklearn.utils.extmath import row_norms + +from ._k_means_fast cimport _euclidean_dense_dense +from ._k_means_fast cimport _euclidean_sparse_dense + + +np.import_array() + + +def init_bounds_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[:, ::1] centers, # IN + floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds): # OUT + """Initialize upper and lower bounds for each sample for dense input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The input data. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray, of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. 
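
    For illustration: with two centers c0 and c1, if sample i lies at
    distance 3.0 from c0 and center_half_distances[0, 1] is 4.0 (the centers
    are 8.0 apart), then 3.0 is not greater than 4.0, so the triangle
    inequality already guarantees c1 cannot be closer than c0. The distance
    to c1 is therefore never computed: labels[i] stays 0, upper_bounds[i]
    becomes 3.0, lower_bounds[i, 0] becomes 3.0 and lower_bounds[i, 1] keeps
    its initial value of 0.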
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating min_dist, dist + int best_cluster, i, j + + for i in range(n_samples): + best_cluster = 0 + min_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[0, 0], + n_features, False) + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def init_bounds_sparse( + X, # IN + floating[:, ::1] centers, # IN + floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds): # OUT + """Initialize upper and lower bounds for each sample for sparse input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The input data. Must be in CSR format. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. 
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating min_dist, dist + int best_cluster, i, j + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in range(n_samples): + best_cluster = 0 + min_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[0], centers_squared_norms[0], False) + + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def update_chunk_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + return _update_chunk_dense(&X[0, 0], sample_weight, centers_old, + center_half_distances, + distance_next_center, labels, + upper_bounds, lower_bounds, + ¢ers_new[0, 0], &weight_in_clusters[0], + update_centers) + + +cdef void _update_chunk_dense( + floating *X, # IN + # expecting C aligned 2D array. XXX: Can be + # replaced by const memoryview when cython min + # version is >= 0.3 + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if (j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j])): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_dense_dense( + X + i * n_features, ¢ers_old[label, 0], n_features, False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. 
If this is less + # than the previous distance, reassign label. + if (upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j])): + + distance = _euclidean_dense_dense( + X + i * n_features, ¢ers_old[j, 0], n_features, False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i] + + +def update_chunk_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + return _update_chunk_sparse( + X_data, X_indices, X_indptr, sample_weight, centers_old, + centers_squared_norms, center_half_distances, + distance_next_center, labels, upper_bounds, lower_bounds, + ¢ers_new[0, 0], &weight_in_clusters[0], update_centers + ) + + +cdef void _update_chunk_sparse( + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + int s = X_indptr[0] + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if (j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j])): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[label], centers_squared_norms[label], False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. 
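                    # (The check is repeated deliberately: recomputing the
                    # distance to the currently assigned center above may
                    # have tightened upper_bound enough that center j can now
                    # be ruled out without paying for another full distance
                    # computation.)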
+ if (upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j])): + distance = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[j], centers_squared_norms[j], False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/python/xorbits/_mars/learn/cluster/_k_means_elkan_iter.py b/python/xorbits/_mars/learn/cluster/_k_means_elkan_iter.py new file mode 100644 index 000000000..54f14d33c --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_elkan_iter.py @@ -0,0 +1,866 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int32Field, KeyField +from ...tensor.array_utils import as_same_device, cp, device, sparse +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin +from ._k_means_common import _execute_merge_update, _relocate_empty_clusters +from ._k_means_elkan import ( + init_bounds_dense, + init_bounds_sparse, + update_chunk_dense, + update_chunk_sparse, +) +from ._k_means_fast import update_center, update_upper_lower_bounds + + +class KMeansElkanInitBounds(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_ELKAN_INIT_BOUNDS + + _x = KeyField("x") + _centers = KeyField("centers") + _center_half_distances = KeyField("center_half_distances") + _n_clusters = Int32Field("n_clusters") + + def __init__( + self, + x=None, + centers=None, + center_half_distances=None, + n_clusters=None, + sparse=None, + gpu=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _centers=centers, + _center_half_distances=center_half_distances, + _n_clusters=n_clusters, + sparse=sparse, + gpu=gpu, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def x(self): + return self._x + + @property + def centers(self): + return self._centers + + @property + def center_half_distances(self): + return self._center_half_distances + + @property + def n_clusters(self): + return self._n_clusters + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._centers = self._inputs[1] + self._center_half_distances = self._inputs[2] + + @property + def output_limit(self): + return 3 + + def __call__(self): + params = [] + # labels + params.append( + { + "shape": (self._x.shape[0],), + "dtype": np.dtype(np.int32), + "order": 
TensorOrder.C_ORDER, + } + ) + # upper bounds + params.append( + { + "shape": (self._x.shape[0],), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # lower bounds + params.append( + { + "shape": (self._x.shape[0], self._n_clusters), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + ) + return self.new_tileables( + [self._x, self._centers, self._center_half_distances], kws=params + ) + + @classmethod + def tile(cls, op: "KMeansElkanInitBounds"): + # unify chunks on axis 0 + if has_unknown_shape(op.centers, op.center_half_distances): + yield + x = op.x + centers = yield from recursive_tile(op.centers.rechunk(op.centers.shape)) + center_half_distances = yield from recursive_tile( + op.center_half_distances.rechunk(op.center_half_distances.shape) + ) + + out_chunks = [list() for _ in range(op.output_limit)] + for c in x.chunks: + chunk_op = op.copy().reset_key() + chunk_params = [] + # labels chunk + chunk_params.append( + { + "shape": (c.shape[0],), + "index": (c.index[0],), + "dtype": np.dtype(np.int32), + "order": TensorOrder.C_ORDER, + } + ) + # upper bounds + chunk_params.append( + { + "shape": (c.shape[0],), + "index": (c.index[0],), + "dtype": c.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # lower bounds + chunk_params.append( + { + "shape": (c.shape[0], op.n_clusters), + "index": (c.index[0], 0), + "dtype": c.dtype, + "order": TensorOrder.C_ORDER, + } + ) + chunks = chunk_op.new_chunks( + [c, centers.chunks[0], center_half_distances.chunks[0]], + kws=chunk_params, + ) + for i, out_chunk in enumerate(chunks): + out_chunks[i].append(out_chunk) + + out_nsplits = [ + (x.nsplits[0],), + (x.nsplits[0],), + (x.nsplits[0], (op.n_clusters,)), + ] + out_params = [out.params for out in op.outputs] + for i, chunks in enumerate(out_chunks): + out_params[i]["chunks"] = chunks + for i, nsplits in enumerate(out_nsplits): + out_params[i]["nsplits"] = nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=out_params) + + @classmethod + def execute(cls, ctx, op): + (x, centers, center_half_distances), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if xp is cp: # pragma: no cover + raise NotImplementedError("cannot support init_bounds for kmeans elkan") + + n_samples = x.shape[0] + n_clusters = op.n_clusters + + labels = np.full(n_samples, -1, dtype=np.int32) + upper_bounds = np.zeros(n_samples, dtype=x.dtype) + lower_bounds = np.zeros((n_samples, n_clusters), dtype=x.dtype) + + if xp is np: + init_bounds = init_bounds_dense + else: + assert xp is sparse + init_bounds = init_bounds_sparse + + init_bounds( + x, centers, center_half_distances, labels, upper_bounds, lower_bounds + ) + + ctx[op.outputs[0].key] = labels + ctx[op.outputs[1].key] = upper_bounds + ctx[op.outputs[2].key] = lower_bounds + + +def init_bounds(X, centers, center_half_distances, n_clusters): + op = KMeansElkanInitBounds( + x=X, + centers=centers, + center_half_distances=center_half_distances, + n_clusters=n_clusters, + sparse=False, + gpu=X.op.gpu, + ) + return op() + + +class KMeansElkanUpdate(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_ELKAN_UPDATE + + _x = KeyField("x") + _sample_weight = KeyField("sample_weight") + _centers_old = KeyField("centers_old") + _center_half_distances = KeyField("center_half_distances") + _distance_next_center = KeyField("distance_next_center") + _labels = KeyField("labels") + _upper_bounds = 
KeyField("upper_bounds") + _lower_bounds = KeyField("lower_bounds") + _update_centers = BoolField("update_centers") + _n_clusters = Int32Field("n_clusters") + + def __init__( + self, + x=None, + sample_weight=None, + centers_old=None, + center_half_distances=None, + distance_next_center=None, + labels=None, + upper_bounds=None, + lower_bounds=None, + update_centers=None, + n_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _sample_weight=sample_weight, + _centers_old=centers_old, + _center_half_distances=center_half_distances, + _distance_next_center=distance_next_center, + _labels=labels, + _upper_bounds=upper_bounds, + _lower_bounds=lower_bounds, + _update_centers=update_centers, + _n_clusters=n_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def x(self): + return self._x + + @property + def sample_weight(self): + return self._sample_weight + + @property + def centers_old(self): + return self._centers_old + + @property + def center_half_distances(self): + return self._center_half_distances + + @property + def distance_next_center(self): + return self._distance_next_center + + @property + def labels(self): + return self._labels + + @property + def upper_bounds(self): + return self._upper_bounds + + @property + def lower_bounds(self): + return self._lower_bounds + + @property + def update_centers(self): + return self._update_centers + + @property + def n_clusters(self): + return self._n_clusters + + @property + def output_limit(self): + return 5 if self.stage != OperandStage.reduce else 2 + + @property + def _input_fields(self): + return ( + "_x", + "_sample_weight", + "_centers_old", + "_center_half_distances", + "_distance_next_center", + "_labels", + "_upper_bounds", + "_lower_bounds", + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.reduce: + input_fields = self._input_fields + assert len(input_fields) == len(self._inputs) + inputs_iter = iter(inputs) + for field in input_fields: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = list( + ( + # labels + self._labels.params, + # upper_bounds + self._upper_bounds.params, + # lower_bounds + self._lower_bounds.params, + ) + ) + # centers_new + kws.append( + { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # weight_in_clusters + kws.append( + { + "shape": (self._n_clusters,), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + return self.new_tileables( + [getattr(self, f) for f in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansElkanUpdate"): + if has_unknown_shape(*op.inputs): + yield + x = op.x + if x.chunk_shape[1] != 1: # pragma: no cover + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: x.nsplits[0]}) + ) + labels = yield from recursive_tile(op.labels.rechunk({0: x.nsplits[0]})) + upper_bounds = yield from recursive_tile( + op.upper_bounds.rechunk({0: x.nsplits[0]}) + ) + lower_bounds = yield from recursive_tile( + op.lower_bounds.rechunk({0: x.nsplits[0], 1: op.lower_bounds.shape[1]}) + ) + centers_old = yield from recursive_tile( + op.centers_old.rechunk(op.centers_old.shape) + ) + center_half_distances = yield from recursive_tile( + op.center_half_distances.rechunk(op.center_half_distances.shape) + ) 
+ distance_next_center = yield from recursive_tile( + op.distance_next_center.rechunk(op.distance_next_center.shape) + ) + + out_chunks = [list() for _ in range(op.output_limit)] + for i in range(x.chunk_shape[0]): + x_chunk = x.cix[i, 0] + sample_weight_chunk = sample_weight.cix[i,] + labels_chunk = labels.cix[i,] + upper_bounds_chunk = upper_bounds.cix[i,] + lower_bounds_chunk = lower_bounds.cix[i, 0] + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_kws = list( + ( + # labels + labels_chunk.params, + # upper_bounds + upper_bounds_chunk.params, + # lower_boudns + lower_bounds_chunk.params, + ) + ) + # centers_new + chunk_kws.append( + { + "index": (0, 0), + "shape": (op.n_clusters, x_chunk.shape[1]), + "dtype": centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + # weight_in_clusters + chunk_kws.append( + { + "index": (0,), + "shape": (op.n_clusters,), + "dtype": centers_old.dtype, + "order": TensorOrder.C_ORDER, + } + ) + chunks = chunk_op.new_chunks( + [ + x_chunk, + sample_weight_chunk, + centers_old.chunks[0], + center_half_distances.chunks[0], + distance_next_center.chunks[0], + labels_chunk, + upper_bounds_chunk, + lower_bounds_chunk, + ], + kws=chunk_kws, + ) + assert len(chunks) == len(out_chunks) + for oc, c in zip(out_chunks, chunks): + oc.append(c) + + label_chunks, upper_bounds_chunks, lower_bounds_chunks = out_chunks[:3] + centers_new_chunks, weight_in_cluster_chunks = out_chunks[3:] + + if op.update_centers: + # merge centers_new and weight_in_clusters + merge_op = KMeansElkanUpdate( + stage=OperandStage.reduce, n_clusters=op.n_clusters + ) + merge_chunk_kw = [ + centers_new_chunks[0].params, + weight_in_cluster_chunks[0].params, + ] + centers_new_chunk, weight_in_cluster_chunk = merge_op.new_chunks( + centers_new_chunks + weight_in_cluster_chunks, kws=merge_chunk_kw + ) + else: + # the data is meaningless, just pick one + centers_new_chunk = centers_new_chunks[0] + weight_in_cluster_chunk = weight_in_cluster_chunks[0] + + out_params = [out.params for out in op.outputs] + # labels + out_params[0]["nsplits"] = labels.nsplits + out_params[0]["chunks"] = label_chunks + # upper_bounds + out_params[1]["nsplits"] = upper_bounds.nsplits + out_params[1]["chunks"] = upper_bounds_chunks + # lower_bounds + out_params[2]["nsplits"] = lower_bounds.nsplits + out_params[2]["chunks"] = lower_bounds_chunks + # centers_new + out_params[3]["nsplits"] = tuple((s,) for s in op.outputs[3].shape) + out_params[3]["chunks"] = [centers_new_chunk] + # weight_in_clusters + out_params[4]["nsplits"] = tuple((s,) for s in op.outputs[4].shape) + out_params[4]["chunks"] = [weight_in_cluster_chunk] + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=out_params) + + @classmethod + def _execute_reduce(cls, ctx, op): + return _execute_merge_update(ctx, op) + + @classmethod + def execute(cls, ctx, op: "KMeansElkanUpdate"): + if op.stage == OperandStage.reduce: + return cls._execute_reduce(ctx, op) + else: + ( + ( + x, + sample_weight, + centers_old, + center_half_distances, + distance_next_center, + labels, + upper_bounds, + lower_bounds, + ), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if not op.update_centers: + centers_new = centers_old.copy() + else: + centers_new = np.zeros_like(centers_old) + weight_in_clusters = np.zeros(op.n_clusters, dtype=x.dtype) + + if xp is np: + method = update_chunk_dense + elif xp is sparse: + 
method = update_chunk_sparse + else: # pragma: no cover + raise NotImplementedError("Does not support run on GPU") + + out_labels, out_upper_bounds, out_lower_bounds = ( + labels.copy(), + upper_bounds.copy(), + lower_bounds.copy(), + ) + method( + x, + sample_weight, + centers_old, + center_half_distances, + distance_next_center, + out_labels, + out_upper_bounds, + out_lower_bounds, + centers_new, + weight_in_clusters, + op.update_centers, + ) + + # labels + ctx[op.outputs[0].key] = out_labels + # upper_bounds + ctx[op.outputs[1].key] = out_upper_bounds + # lower_bounds + ctx[op.outputs[2].key] = out_lower_bounds + # centers_new + ctx[op.outputs[3].key] = centers_new + # weight_in_cluster + ctx[op.outputs[4].key] = weight_in_clusters + + +class KMeansElkanPostprocess(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_ELKAN_POSTPROCESS + + _centers_old = KeyField("centers_old") + _centers_new = KeyField("centers_new") + _center_shift = KeyField("center_shift") + _lower_bounds = KeyField("lower_bounds") + _upper_bounds = KeyField("upper_bounds") + _labels = KeyField("labels") + _weight_in_clusters = KeyField("weight_in_clusters") + + def __init__( + self, + centers_old=None, + centers_new=None, + center_shift=None, + lower_bounds=None, + upper_bounds=None, + labels=None, + weight_in_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _centers_old=centers_old, + _centers_new=centers_new, + _center_shift=center_shift, + _lower_bounds=lower_bounds, + _upper_bounds=upper_bounds, + _labels=labels, + _weight_in_clusters=weight_in_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def centers_old(self): + return self._centers_old + + @property + def centers_new(self): + return self._centers_new + + @property + def center_shift(self): + return self._center_shift + + @property + def lower_bounds(self): + return self._lower_bounds + + @property + def upper_bounds(self): + return self._upper_bounds + + @property + def labels(self): + return self._labels + + @property + def weight_in_clusters(self): + return self._weight_in_clusters + + @property + def output_limit(self): + if self.stage is None: + # for tileable + return 4 + elif self.stage == OperandStage.combine: + return 2 + else: + assert self.stage == OperandStage.reduce + return 2 + + @property + def _input_fields(self): + return ( + "_centers_old", + "_centers_new", + "_center_shift", + "_lower_bounds", + "_upper_bounds", + "_labels", + "_weight_in_clusters", + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in self._input_fields: + ob = getattr(self, field) + if ob is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # centers_new + self._centers_new.params, + # center_shift + self._center_shift.params, + # upper_bounds + self._upper_bounds.params, + # lower_bounds + self._lower_bounds.params, + ] + return self.new_tileables( + [getattr(self, f) for f in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansElkanPostprocess"): + assert len(op.centers_old.chunks) == 1 + assert len(op.centers_new.chunks) == 1 + assert len(op.center_shift.chunks) == 1 + assert len(op._weight_in_clusters.chunks) == 1 + assert op.lower_bounds.chunk_shape[1] == 1 + + centers_old_chunk = op.centers_old.chunks[0] + centers_new_chunk = op.centers_new.chunks[0] + center_shift_chunk = 
op.center_shift.chunks[0] + weight_in_clusters_chunk = op.weight_in_clusters.chunks[0] + + # calculate center shift first + centers_new_chunk, center_shift_chunk = KMeansElkanPostprocess( + centers_old=centers_old_chunk, + centers_new=centers_new_chunk, + center_shift=center_shift_chunk, + weight_in_clusters=weight_in_clusters_chunk, + stage=OperandStage.combine, + ).new_chunks( + [ + centers_old_chunk, + centers_new_chunk, + center_shift_chunk, + weight_in_clusters_chunk, + ], + kws=[centers_new_chunk.params, center_shift_chunk.params], + ) + + upper_bounds_chunks, lower_bounds_chunks = [], [] + for upper_bound_chunk, lower_bound_chunk, labels_chunk in zip( + op.upper_bounds.chunks, op.lower_bounds.chunks, op.labels.chunks + ): + chunk_kws = [upper_bound_chunk.params, lower_bound_chunk.params] + upper_bound_chk, lower_bound_chk = KMeansElkanPostprocess( + center_shift=center_shift_chunk, + lower_bounds=lower_bound_chunk, + upper_bounds=upper_bound_chunk, + labels=labels_chunk, + stage=OperandStage.reduce, + ).new_chunks( + [ + center_shift_chunk, + lower_bound_chunk, + upper_bound_chunk, + labels_chunk, + ], + kws=chunk_kws, + ) + upper_bounds_chunks.append(upper_bound_chk) + lower_bounds_chunks.append(lower_bound_chk) + + centers_new_kw = op.centers_new.params + centers_new_kw["chunks"] = [centers_new_chunk] + centers_new_kw["nsplits"] = op.centers_new.nsplits + center_shift_kw = op.center_shift.params + center_shift_kw["chunks"] = [center_shift_chunk] + center_shift_kw["nsplits"] = op.center_shift.nsplits + upper_bounds_kw = op.upper_bounds.params + upper_bounds_kw["chunks"] = upper_bounds_chunks + upper_bounds_kw["nsplits"] = op.upper_bounds.nsplits + lower_bounds_kw = op.lower_bounds.params + lower_bounds_kw["chunks"] = lower_bounds_chunks + lower_bounds_kw["nsplits"] = op.lower_bounds.nsplits + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + kws=[centers_new_kw, center_shift_kw, upper_bounds_kw, lower_bounds_kw], + ) + + @classmethod + def _execute_combine(cls, ctx, op): + ( + (centers_old, centers_new, center_shift, weight_in_clusters), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + out_centers_new = centers_new.copy() + out_center_shift = center_shift.copy() + update_center( + centers_old, out_centers_new, out_center_shift, weight_in_clusters + ) + + ctx[op.outputs[0].key] = out_centers_new + ctx[op.outputs[1].key] = out_center_shift + + @classmethod + def _execute_reduce(cls, ctx, op): + ( + (center_shift, lower_bounds, upper_bounds, labels), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + out_upper_bounds = upper_bounds.copy() + out_lower_bounds = lower_bounds.copy() + update_upper_lower_bounds( + out_upper_bounds, out_lower_bounds, labels, center_shift + ) + ctx[op.outputs[0].key] = out_upper_bounds + ctx[op.outputs[1].key] = out_lower_bounds + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.combine: + return cls._execute_combine(ctx, op) + else: + assert op.stage == OperandStage.reduce + return cls._execute_reduce(ctx, op) + + +def elkan_iter( + X, + sample_weight, + centers_old, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + update_centers=True, + session=None, + run_kwargs=None, +): + update_op = KMeansElkanUpdate( + x=X, 
+ sample_weight=sample_weight, + centers_old=centers_old, + center_half_distances=center_half_distances, + distance_next_center=distance_next_center, + labels=labels, + upper_bounds=upper_bounds, + lower_bounds=lower_bounds, + update_centers=update_centers, + n_clusters=centers_old.shape[0], + ) + to_run = [] + ret = update_op() + to_run.extend(ret) + labels, upper_bounds, lower_bounds, centers_new, weight_in_clusters = ret + + if update_centers: + centers_new, weight_in_clusters = _relocate_empty_clusters( + X, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + to_run=to_run, + session=session, + run_kwargs=run_kwargs, + ) + postprocess = KMeansElkanPostprocess( + centers_old=centers_old, + centers_new=centers_new, + center_shift=center_shift, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + labels=labels, + weight_in_clusters=weight_in_clusters, + ) + centers_new, center_shift, upper_bounds, lower_bounds = postprocess() + + return ( + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ) diff --git a/python/xorbits/_mars/learn/cluster/_k_means_fast.pxd b/python/xorbits/_mars/learn/cluster/_k_means_fast.pxd new file mode 100644 index 000000000..29d704ef7 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_fast.pxd @@ -0,0 +1,12 @@ +from cython cimport floating +cimport numpy as np + + +cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil + +cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1], + floating, bint) nogil + +cdef void _average_centers(floating[:, ::1], floating[::1]) + +cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1]) diff --git a/python/xorbits/_mars/learn/cluster/_k_means_fast.pyx b/python/xorbits/_mars/learn/cluster/_k_means_fast.pyx new file mode 100644 index 000000000..ebad20ccf --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_fast.pyx @@ -0,0 +1,213 @@ +# cython: profile=False, boundscheck=False, wraparound=False, cdivision=True +# Profiling is enabled by default as the overhead does not seem to be +# measurable on this specific use case. + +# Author: Peter Prettenhofer +# Olivier Grisel +# Lars Buitinck +# +# License: BSD 3 clause + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + +import numpy as np +cimport numpy as np +cimport cython +from cython cimport floating +from libc.math cimport sqrt + +try: + from sklearn.utils.extmath import row_norms +except ImportError: # pragma: no cover + row_norms = None + + +np.import_array() + + +ctypedef np.float64_t DOUBLE +ctypedef np.int32_t INT + + +cdef floating _euclidean_dense_dense( + floating* a, # IN + floating* b, # IN + int n_features, + bint squared) nogil: + """Euclidean distance between a dense and b dense""" + cdef: + int i + int n = n_features // 4 + int rem = n_features % 4 + floating result = 0 + + # We manually unroll the loop for better cache optimization. 
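    # Each unrolled iteration below consumes 4 features at once; the `rem`
    # loop afterwards picks up the leftover features (e.g. n_features == 10
    # gives n == 2 unrolled iterations plus rem == 2 trailing features).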
+ for i in range(n): + result += ((a[0] - b[0]) * (a[0] - b[0]) + +(a[1] - b[1]) * (a[1] - b[1]) + +(a[2] - b[2]) * (a[2] - b[2]) + +(a[3] - b[3]) * (a[3] - b[3])) + a += 4; b += 4 + + for i in range(rem): + result += (a[i] - b[i]) * (a[i] - b[i]) + + return result if squared else sqrt(result) + + +cdef floating _euclidean_sparse_dense( + floating[::1] a_data, # IN + int[::1] a_indices, # IN + floating[::1] b, # IN + floating b_squared_norm, + bint squared) nogil: + """Euclidean distance between a sparse and b dense""" + cdef: + int nnz = a_indices.shape[0] + int i + floating tmp, bi + floating result = 0.0 + + for i in range(nnz): + bi = b[a_indices[i]] + tmp = a_data[i] - bi + result += tmp * tmp - bi * bi + + result += b_squared_norm + + if result < 0: result = 0.0 + + return result if squared else sqrt(result) + + +cpdef floating _inertia_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers, # IN + int[::1] labels): # IN + """Compute inertia for dense input data + + Sum of squared distance between each sample and its assigned center. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + for i in range(n_samples): + j = labels[i] + sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cpdef floating _inertia_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers, # IN + int[::1] labels): # IN + """Compute inertia for sparse input data + + Sum of squared distance between each sample and its assigned center. + """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in range(n_samples): + j = labels[i] + sq_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cdef void _average_centers( + floating[:, ::1] centers, # INOUT + floating[::1] weight_in_clusters): # IN + """Average new centers wrt weights.""" + cdef: + int n_clusters = centers.shape[0] + int n_features = centers.shape[1] + int j, k + floating alpha + + for j in range(n_clusters): + if weight_in_clusters[j] > 0: + alpha = 1.0 / weight_in_clusters[j] + for k in range(n_features): + centers[j, k] *= alpha + + +cdef void _center_shift( + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # IN + floating[::1] center_shift): # OUT + """Compute shift between old and new centers.""" + cdef: + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + int j + + for j in range(n_clusters): + center_shift[j] = _euclidean_dense_dense( + ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) + + +def update_center( + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] center_shift, # OUT + floating[::1] weight_in_clusters): # IN + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +def merge_update_chunks(int n_clusters, + int n_features, + floating[::1] weight_in_clusters, + floating[::1] weight_in_clusters_chunk, + floating[:, 
::1] centers_new, + floating[:, ::1] centers_new_chunk): + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j, k] + + +def update_upper_lower_bounds( + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # IN + floating[::1] center_shift): # IN + cdef: + int n_samples = upper_bounds.shape[0] + int n_clusters = lower_bounds.shape[1] + + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 diff --git a/python/xorbits/_mars/learn/cluster/_k_means_init.py b/python/xorbits/_mars/learn/cluster/_k_means_init.py new file mode 100644 index 000000000..81808a411 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_init.py @@ -0,0 +1,503 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ... import tensor as mt +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import Int32Field, KeyField +from ...tensor.array_utils import as_same_device, device +from ...tensor.core import TensorOrder +from ...tensor.random import RandomStateField +from ...utils import has_unknown_shape +from ..metrics import euclidean_distances +from ..operands import LearnOperand, LearnOperandMixin + + +def _kmeans_plus_plus_init( + X, x_squared_norms, random_state, n_clusters: int, n_local_trials: int = None +): + n_samples, n_features = X.shape + + centers = mt.empty((n_clusters, n_features), dtype=X.dtype) + + assert x_squared_norms is not None, "x_squared_norms None in _k_init" + + # Set the number of local seeding trials if none is given + if n_local_trials is None: + # This is what Arthur/Vassilvitskii tried, but did not report + # specific results for other than mentioning in the conclusion + # that it helped. 
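        # (np.log is the natural logarithm, so e.g. n_clusters == 8 gives
        # 2 + int(2.079...) == 4 local trials, and n_clusters == 100 gives
        # 2 + int(4.605...) == 6.)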
+ n_local_trials = 2 + int(np.log(n_clusters)) + + # Pick first center randomly + center_id = random_state.randint(n_samples) + if X.issparse(): # pragma: no cover + centers[0] = X[center_id].todense() + else: + centers[0] = X[center_id] + + # Initialize list of closest distances and calculate current potential + closest_dist_sq = euclidean_distances( + centers[0, mt.newaxis], X, Y_norm_squared=x_squared_norms, squared=True + ) + current_pot = closest_dist_sq.sum() + + # Pick the remaining n_clusters-1 points + for c in range(1, n_clusters): + # Choose center candidates by sampling with probability proportional + # to the squared distance to the closest existing center + rand_vals = random_state.random_sample(n_local_trials) * current_pot + candidate_ids = mt.searchsorted(closest_dist_sq.cumsum(), rand_vals) + # XXX: numerical imprecision can result in a candidate_id out of range + candidate_ids = mt.clip(candidate_ids, None, closest_dist_sq.size - 1) + + # Compute distances to center candidates + distance_to_candidates = euclidean_distances( + X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True + ) + + # update closest distances squared and potential for each candidate + distance_to_candidates = mt.minimum(closest_dist_sq, distance_to_candidates) + + candidates_pot = distance_to_candidates.sum(axis=1) + + # Decide which candidate is the best + best_candidate = mt.argmin(candidates_pot) + current_pot = candidates_pot[best_candidate] + closest_dist_sq = distance_to_candidates[best_candidate] + best_candidate = candidate_ids[best_candidate] + + # Permanently add best center candidate found in local tries + if X.issparse(): # pragma: no cover + c_center = X[best_candidate].todense() + else: + c_center = X[best_candidate] + + centers[c] = c_center + + return centers + + +class KMeansPlusPlusInit(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_PLUS_PLUS_INIT + + _x = KeyField("x") + _n_clusters = Int32Field("n_clusters") + _x_squared_norms = KeyField("x_squared_norms") + _state = RandomStateField("state") + _n_local_trials = Int32Field("n_local_trials") + + def __init__( + self, + x=None, + n_clusters=None, + x_squared_norms=None, + state=None, + n_local_trials=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _n_clusters=n_clusters, + _x_squared_norms=x_squared_norms, + _state=state, + _n_local_trials=n_local_trials, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] + + @property + def x(self): + return self._x + + @property + def n_clusters(self): + return self._n_clusters + + @property + def x_squared_norms(self): + return self._x_squared_norms + + @property + def state(self): + return self._state + + @property + def n_local_trials(self): + return self._n_local_trials + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._x_squared_norms = self._inputs[-1] + + def __call__(self): + inputs = [self._x, self._x_squared_norms] + kw = { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + return self.new_tileable(inputs, kws=[kw]) + + @classmethod + def _tile_one_chunk(cls, op: "KMeansPlusPlusInit"): + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + chunk_kw = out.params.copy() + chunk_kw["index"] = (0, 0) + chunk_inputs = [op.x.chunks[0], op.x_squared_norms.chunks[0]] + chunk = chunk_op.new_chunk(chunk_inputs, kws=[chunk_kw]) + + kw = out.params + kw["chunks"] = 
[chunk] + kw["nsplits"] = tuple((s,) for s in out.shape) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def tile(cls, op: "KMeansPlusPlusInit"): + if len(op.x.chunks) == 1: + assert len(op.x_squared_norms.chunks) == 1 + return cls._tile_one_chunk(op) + else: + return (yield from cls._tile_k_init(op)) + + @classmethod + def _tile_k_init(cls, op: "KMeansPlusPlusInit"): + X = op.x + n_clusters = op.n_clusters + x_squared_norms = op.x_squared_norms + random_state = op.state + n_local_trials = op.n_local_trials + + centers = _kmeans_plus_plus_init( + X, x_squared_norms, random_state, n_clusters, n_local_trials + ) + return (yield from recursive_tile(centers)) + + @classmethod + def execute(cls, ctx, op: "KMeansPlusPlusInit"): + try: + from sklearn.cluster._kmeans import _kmeans_plusplus + except ImportError: # pragma: no cover + try: + from sklearn.cluster._kmeans import _k_init + except ImportError: + from sklearn.cluster.k_means_ import _k_init + + def _kmeans_plusplus(*args, **kwargs): + return _k_init(*args, **kwargs), None + + (x, x_squared_norms), device_id, _ = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = _kmeans_plusplus( + x, + op.n_clusters, + x_squared_norms=x_squared_norms, + random_state=op.state, + n_local_trials=op.n_local_trials, + )[0] + + +############################################################################### +# Initialization heuristic + + +def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): + """Init n_clusters seeds according to k-means++ + + Parameters + ---------- + X : array or sparse matrix, shape (n_samples, n_features) + The data to pick seeds for. To avoid memory copy, the input data + should be double precision (dtype=np.float64). + + n_clusters : integer + The number of seeds to choose + + x_squared_norms : array, shape (n_samples,) + Squared Euclidean norm of each data point. + + random_state : int, RandomState instance + The generator used to initialize the centers. Use an int to make the + randomness deterministic. + See :term:`Glossary `. + + n_local_trials : integer, optional + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)); this is the default. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, + which is the implementation used in the aforementioned paper. 
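
    A minimal usage sketch (illustrative only; assumes a local build of the
    package and lets execution fall back to a default session)::

        import numpy as np

        from xorbits._mars import tensor as mt
        from xorbits._mars.learn.cluster._k_means_init import _k_init

        X = mt.tensor(np.random.RandomState(0).rand(100, 2), chunk_size=50)
        x_squared_norms = (X ** 2).sum(axis=1)
        centers = _k_init(X, n_clusters=3, x_squared_norms=x_squared_norms,
                          random_state=np.random.RandomState(0))
        print(centers.execute())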
+ """ + op = KMeansPlusPlusInit( + x=X, + n_clusters=n_clusters, + x_squared_norms=x_squared_norms, + state=random_state, + n_local_trials=n_local_trials, + ) + return op() + + +class KMeansScalablePlusPlusInit(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_SCALABLE_PLUS_PLUS_INIT + + _x = KeyField("x") + _n_clusters = Int32Field("n_clusters") + _x_squared_norms = KeyField("x_squared_norms") + _state = RandomStateField("state") + _init_iter = Int32Field("init_iter") + _oversampling_factor = Int32Field("oversampling_factor") + + def __init__( + self, + x=None, + n_clusters=None, + x_squared_norms=None, + state=None, + init_iter=None, + oversampling_factor=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _n_clusters=n_clusters, + _x_squared_norms=x_squared_norms, + _state=state, + _init_iter=init_iter, + _oversampling_factor=oversampling_factor, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] + + @property + def x(self): + return self._x + + @property + def n_clusters(self): + return self._n_clusters + + @property + def x_squared_norms(self): + return self._x_squared_norms + + @property + def state(self): + return self._state + + @property + def init_iter(self): + return self._init_iter + + @property + def oversampling_factor(self): + return self._oversampling_factor + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._x is not None: + self._x = self._inputs[0] + if self._x_squared_norms is not None: + self._x_squared_norms = self._inputs[-1] + + def __call__(self): + inputs = [self._x, self._x_squared_norms] + kw = { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._x.dtype, + "order": TensorOrder.C_ORDER, + } + return self.new_tileable(inputs, kws=[kw]) + + @classmethod + def tile(cls, op: "KMeansScalablePlusPlusInit"): + if has_unknown_shape(*op.inputs): + yield + + x = mt.tensor(op.x) + x_squared_norms = mt.atleast_2d(op.x_squared_norms) + out = op.outputs[0] + + random_state = op.state + rs = mt.random.RandomState.from_numpy(random_state) + + n_samples, n_features = x.shape + n_clusters = op.n_clusters + + # step 1, sample a centroid + centers = x[random_state.randint(n_samples, size=1)] + + for _ in range(op.init_iter): + distances = euclidean_distances( + x, centers, X_norm_squared=x_squared_norms, squared=True + ) + + # calculate the cost of data with respect to current centers + cost = mt.sum(mt.min(distances, axis=1)) + + # calculate the distribution to sample new centers + distribution = mt.full(len(distances), 1 / len(distances)) + mt.true_divide( + mt.min(distances, axis=1), cost, where=cost != 0, out=distribution + ) + + # pick new centers + new_centers_size = op.oversampling_factor * n_clusters + new_centers = x[rs.choice(n_samples, new_centers_size, p=distribution)] + + centers = mt.concatenate([centers, new_centers]) + + # rechunk centers into one chunk + centers = (yield from recursive_tile(centers)).rechunk(centers.shape) + + distances = yield from recursive_tile( + euclidean_distances( + x, centers, X_norm_squared=x_squared_norms, squared=True + ) + ) + + map_index_to_chunks = {} + # calculate weight for each chunk + for c in distances.chunks: + map_chunk_op = KMeansScalablePlusPlusInit(stage=OperandStage.map) + map_chunk_kw = { + "shape": (len(centers),), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": c.index, + } + map_chunk = map_chunk_op.new_chunk([c], kws=[map_chunk_kw]) + 
map_index_to_chunks[c.index] = map_chunk + + combine_chunks = [] + for i in range(distances.chunk_shape[0]): + map_chunks = [ + map_index_to_chunks[i, j] for j in range(distances.chunk_shape[1]) + ] + combine_chunk_op = KMeansScalablePlusPlusInit(stage=OperandStage.combine) + combine_chunk_kw = { + "shape": (len(centers),), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": (i,), + } + combine_chunk = combine_chunk_op.new_chunk( + map_chunks, kws=[combine_chunk_kw] + ) + combine_chunks.append(combine_chunk) + + reduce_chunk_op = KMeansScalablePlusPlusInit( + n_clusters=op.n_clusters, state=random_state, stage=OperandStage.reduce + ) + reduce_chunk_kw = out.params + reduce_chunk_kw["index"] = (0, 0) + reduce_chunk = reduce_chunk_op.new_chunk( + [centers.chunks[0]] + combine_chunks, kws=[reduce_chunk_kw] + ) + + new_op = op.copy() + kw = out.params + kw["chunks"] = [reduce_chunk] + kw["nsplits"] = tuple((s,) for s in out.shape) + return new_op.new_tileables(op.inputs, kws=[kw]) + + @classmethod + def _execute_map(cls, ctx, op: "KMeansScalablePlusPlusInit"): + distances = ctx[op.inputs[0].key] + min_distance_ids = np.argmin(distances, axis=1) + min_distances = distances[range(len(distances)), min_distance_ids] + ctx[op.outputs[0].key] = (min_distances, min_distance_ids) + + @classmethod + def _execute_combine(cls, ctx, op: "KMeansScalablePlusPlusInit"): + out = op.outputs[0] + all_distances, all_min_distance_ids = tuple( + zip(*(ctx[inp.key] for inp in op.inputs)) + ) + distances = np.stack(all_distances).T + min_distance_ids = np.stack(all_min_distance_ids).T + + combined_min_distance_id = np.argmin(distances, axis=1) + min_distance_ids = min_distance_ids[ + range(len(distances)), combined_min_distance_id + ] + count = np.bincount(min_distance_ids) + result = np.zeros(out.shape[0], dtype=np.int64) + result[: len(count)] = count + ctx[out.key] = result + + @classmethod + def _execute_reduce(cls, ctx, op: "KMeansScalablePlusPlusInit"): + from sklearn.cluster import KMeans + + inputs = [ctx[inp.key] for inp in op.inputs] + + count = np.zeros(inputs[1].shape[0], dtype=np.int64) + for inp in inputs[1:]: + count += inp + weight = count / count.sum() + + centers = inputs[0] + + kmeans = KMeans(n_clusters=op.n_clusters, n_init=1, random_state=op.state) + kmeans.fit(centers, sample_weight=weight) + ctx[op.outputs[0].key] = kmeans.cluster_centers_ + + @classmethod + def execute(cls, ctx, op: "KMeansScalablePlusPlusInit"): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + return cls._execute_combine(ctx, op) + else: + return cls._execute_reduce(ctx, op) + + +def _scalable_k_init( + X, n_clusters, x_squared_norms, random_state, oversampling_factor=2, init_iter=5 +): + op = KMeansScalablePlusPlusInit( + x=X, + n_clusters=n_clusters, + x_squared_norms=x_squared_norms, + state=random_state, + init_iter=init_iter, + oversampling_factor=oversampling_factor, + ) + return op() diff --git a/python/xorbits/_mars/learn/cluster/_k_means_lloyd.pyx b/python/xorbits/_mars/learn/cluster/_k_means_lloyd.pyx new file mode 100644 index 000000000..d0190ecf5 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_lloyd.pyx @@ -0,0 +1,184 @@ +# cython: profile=False, boundscheck=False, wraparound=False, cdivision=True +# +# Licence: BSD 3 clause + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). 
This is fixed in cython > 0.3. + +import numpy as np +cimport numpy as np +from cython cimport floating +from libc.stdlib cimport malloc, free +from libc.float cimport DBL_MAX, FLT_MAX + +from ..utils._cython_blas cimport _gemm +from ..utils._cython_blas cimport RowMajor, Trans, NoTrans + + +np.import_array() + + +def update_chunk_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + floating *pairwise_distances + + pairwise_distances = malloc(n_samples * n_clusters * sizeof(floating)) + result = _update_chunk_dense(&X[0, 0], sample_weight, x_squared_norms, + centers_old, centers_squared_norms, + labels, ¢ers_new[0, 0], + &weight_in_clusters[0], pairwise_distances, + update_centers) + free(pairwise_distances) + return result + + +cdef void _update_chunk_dense( + floating *X, # IN + # expecting C aligned 2D array. XXX: Can be + # replaced by const memoryview when cython min + # version is >= 0.3 + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + floating *pairwise_distances, # OUT + bint update_centers) nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + + # Instead of computing the full pairwise squared distances matrix, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store + # the - 2 X.C^T + ||C||² term since the argmin for a given sample only + # depends on the centers. 
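A quick NumPy sanity check of the argmin-preserving decomposition used in the comment above (a sketch with hypothetical `X` and `C`; the actual kernel computes the `-2 X.C^T` term with a BLAS GEMM):

```python
# ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2; the ||x||^2 term is constant per
# sample, so dropping it cannot change the argmin over centers.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(6, 4)   # hypothetical samples
C = rng.rand(3, 4)   # hypothetical centers

full = ((X[:, None, :] - C[None, :, :]) ** 2).sum(axis=2)    # ||x - c||^2
partial = -2.0 * X @ C.T + (C ** 2).sum(axis=1)              # -2 x.c + ||c||^2

assert (full.argmin(axis=1) == partial.argmin(axis=1)).all()
```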
+ # pairwise_distances = ||C||² + for i in range(n_samples): + for j in range(n_clusters): + pairwise_distances[i * n_clusters + j] = centers_squared_norms[j] + + # pairwise_distances += -2 * X.dot(C.T) + _gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features, + -2.0, X, n_features, ¢ers_old[0, 0], n_features, + 1.0, pairwise_distances, n_clusters) + + for i in range(n_samples): + min_sq_dist = pairwise_distances[i * n_clusters] + label = 0 + for j in range(1, n_clusters): + sq_dist = pairwise_distances[i * n_clusters + j] + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i] + + +def update_chunk_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + bint update_centers=True): + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + X_data = X.data + X_indices = X.indices + X_indptr = X.indptr + + return _update_chunk_sparse( + X_data, X_indices, X_indptr, sample_weight, + x_squared_norms, centers_old, centers_squared_norms, + labels, ¢ers_new[0, 0], &weight_in_clusters[0], + update_centers + ) + + +cdef void _update_chunk_sparse( + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + floating max_floating = FLT_MAX if floating is float else DBL_MAX + int s = X_indptr[0] + + # XXX Precompute the pairwise distances matrix is not worth for sparse + # currently. Should be tested when BLAS (sparse x dense) matrix + # multiplication is available. + for i in range(n_samples): + min_sq_dist = max_floating + label = 0 + + for j in range(n_clusters): + sq_dist = 0.0 + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + sq_dist += centers_old[j, X_indices[k]] * X_data[k] + + # Instead of computing the full squared distance with each cluster, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute + # the - 2 X.C^T + ||C||² term since the argmin for a given sample + # only depends on the centers C. 
+ sq_dist = centers_squared_norms[j] -2 * sq_dist + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/python/xorbits/_mars/learn/cluster/_k_means_lloyd_iter.py b/python/xorbits/_mars/learn/cluster/_k_means_lloyd_iter.py new file mode 100644 index 000000000..ca9121fee --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_k_means_lloyd_iter.py @@ -0,0 +1,449 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sklearn.utils.extmath import row_norms as sklearn_row_norms + +from ... import opcodes +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import BoolField, Int32Field, KeyField +from ...tensor.array_utils import as_same_device, device, sparse +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin +from ._k_means_common import _execute_merge_update, _relocate_empty_clusters +from ._k_means_fast import update_center +from ._k_means_lloyd import update_chunk_dense, update_chunk_sparse + + +class KMeansLloydUpdate(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_LLOYD_UPDATE + + _x = KeyField("x") + _sample_weight = KeyField("sample_weight") + _x_squared_norms = KeyField("x_squared_norms") + _centers_old = KeyField("centers_old") + _labels = KeyField("labels") + _update_centers = BoolField("update_centers") + _n_clusters = Int32Field("n_clusters") + + def __init__( + self, + x=None, + sample_weight=None, + x_squared_norms=None, + centers_old=None, + labels=None, + update_centers=None, + n_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _x=x, + _sample_weight=sample_weight, + _x_squared_norms=x_squared_norms, + _centers_old=centers_old, + _labels=labels, + _update_centers=update_centers, + _n_clusters=n_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def x(self): + return self._x + + @property + def sample_weight(self): + return self._sample_weight + + @property + def x_squared_norms(self): + return self._x_squared_norms + + @property + def centers_old(self): + return self._centers_old + + @property + def labels(self): + return self._labels + + @property + def update_centers(self): + return self._update_centers + + @property + def n_clusters(self): + return self._n_clusters + + @property + def output_limit(self): + return 3 if self.stage != OperandStage.reduce else 2 + + @property + def _input_fields(self): + return "_x", "_sample_weight", "_x_squared_norms", "_centers_old", "_labels" + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = 
iter(self._inputs) + for field in self._input_fields: + if getattr(self, field, None) is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # labels + self._labels.params, + # centers_new + { + "shape": (self._n_clusters, self._x.shape[1]), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + # weight_in_clusters + { + "shape": (self._n_clusters,), + "dtype": self._centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + ] + return self.new_tileables( + [getattr(self, field) for field in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansLloydUpdate"): + if has_unknown_shape(*op.inputs): + yield + x = op.x + if x.chunk_shape[1] != 1: # pragma: no cover + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: x.nsplits[0]}) + ) + x_squared_norms = yield from recursive_tile( + op.x_squared_norms.rechunk({0: x.nsplits[0]}) + ) + labels = yield from recursive_tile(op.labels.rechunk({0: x.nsplits[0]})) + assert len(op.centers_old.chunks) == 1 + + labels_chunks, centers_new_chunks, weight_in_clusters_chunks = [], [], [] + for i in range(x.chunk_shape[0]): + x_chunk = x.cix[i, 0] + sample_weight_chunk = sample_weight.cix[i,] + x_squared_norms_chunk = x_squared_norms.cix[i,] + labels_chunk = labels.cix[i,] + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_kws = [ + labels_chunk.params, + { + "index": (0, 0), + "shape": (op.n_clusters, x_chunk.shape[1]), + "dtype": op.centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + { + "index": (0,), + "shape": (op.n_clusters,), + "dtype": op.centers_old.dtype, + "order": TensorOrder.C_ORDER, + }, + ] + ( + labels_chunk, + centers_new_chunk, + weight_in_clusters_chunk, + ) = chunk_op.new_chunks( + [ + x_chunk, + sample_weight_chunk, + x_squared_norms_chunk, + op.centers_old.chunks[0], + labels_chunk, + ], + kws=chunk_kws, + ) + labels_chunks.append(labels_chunk) + centers_new_chunks.append(centers_new_chunk) + weight_in_clusters_chunks.append(weight_in_clusters_chunk) + + if op.update_centers: + # merge centers_new and weight_in_clusters + merge_op = KMeansLloydUpdate( + stage=OperandStage.reduce, n_clusters=op.n_clusters + ) + merge_chunk_kw = [ + centers_new_chunks[0].params, + weight_in_clusters_chunks[0].params, + ] + centers_new_chunk, weight_in_cluster_chunk = merge_op.new_chunks( + centers_new_chunks + weight_in_clusters_chunks, kws=merge_chunk_kw + ) + else: + # the data is meaningless, just pick one + centers_new_chunk = centers_new_chunks[0] + weight_in_cluster_chunk = weight_in_clusters_chunks[0] + + out_params = [out.params for out in op.outputs] + # labels + out_params[0]["nsplits"] = labels.nsplits + out_params[0]["chunks"] = labels_chunks + # centers_new + out_params[1]["nsplits"] = tuple((s,) for s in op.outputs[1].shape) + out_params[1]["chunks"] = [centers_new_chunk] + # weight_in_clusters + out_params[2]["nsplits"] = tuple((s,) for s in op.outputs[2].shape) + out_params[2]["chunks"] = [weight_in_cluster_chunk] + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=out_params) + + @classmethod + def _execute_reduce(cls, ctx, op): + return _execute_merge_update(ctx, op) + + @classmethod + def execute(cls, ctx, op: "KMeansLloydUpdate"): + if op.stage == OperandStage.reduce: + return cls._execute_reduce(ctx, op) + else: + ( + (x, sample_weight, x_squared_norms, centers_old, labels), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] 
for inp in op.inputs], + device=op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + if not op.update_centers: + centers_new = centers_old.copy() + else: + centers_new = np.zeros_like(centers_old) + weight_in_clusters = np.zeros(op.n_clusters, dtype=x.dtype) + centers_squared_norms = sklearn_row_norms(centers_old, squared=True) + + if xp is np: + method = update_chunk_dense + elif xp is sparse: + method = update_chunk_sparse + else: # pragma: no cover + raise NotImplementedError("Does not support run on GPU") + out_labels = labels.copy() + method( + x, + sample_weight, + x_squared_norms, + centers_old, + centers_squared_norms, + out_labels, + centers_new, + weight_in_clusters, + op.update_centers, + ) + + # labels + ctx[op.outputs[0].key] = out_labels + # centers_new + ctx[op.outputs[1].key] = centers_new + # weight_in_cluster + ctx[op.outputs[2].key] = weight_in_clusters + + +class KMeansLloydPostprocess(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.KMEANS_LLOYD_POSTPROCESS + + _centers_old = KeyField("centers_old") + _centers_new = KeyField("centers_new") + _center_shift = KeyField("center_shift") + _weight_in_clusters = KeyField("weight_in_clusters") + + def __init__( + self, + centers_old=None, + centers_new=None, + center_shift=None, + weight_in_clusters=None, + output_types=None, + **kw + ): + super().__init__( + _centers_old=centers_old, + _centers_new=centers_new, + _center_shift=center_shift, + _weight_in_clusters=weight_in_clusters, + _output_types=output_types, + **kw + ) + if self._output_types is None: + self._output_types = [OutputType.tensor] * self.output_limit + + @property + def centers_old(self): + return self._centers_old + + @property + def centers_new(self): + return self._centers_new + + @property + def center_shift(self): + return self._center_shift + + @property + def weight_in_clusters(self): + return self._weight_in_clusters + + @property + def output_limit(self): + return 2 + + @property + def _input_fields(self): + return "_centers_old", "_centers_new", "_center_shift", "_weight_in_clusters" + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for field in self._input_fields: + ob = getattr(self, field) + if ob is not None: + setattr(self, field, next(inputs_iter)) + + def __call__(self): + kws = [ + # centers_new + self._centers_new.params, + # center_shift + self._center_shift.params, + ] + return self.new_tileables( + [getattr(self, f) for f in self._input_fields], kws=kws + ) + + @classmethod + def tile(cls, op: "KMeansLloydPostprocess"): + assert len(op.centers_old.chunks) == 1 + assert len(op.centers_new.chunks) == 1 + assert len(op.center_shift.chunks) == 1 + assert len(op.weight_in_clusters.chunks) == 1 + + centers_old_chunk = op.centers_old.chunks[0] + centers_new_chunk = op.centers_new.chunks[0] + center_shift_chunk = op.center_shift.chunks[0] + weight_in_clusters_chunk = op.weight_in_clusters.chunks[0] + centers_new_chunk, center_shift_chunk = KMeansLloydPostprocess( + centers_old=centers_old_chunk, + centers_new=centers_new_chunk, + center_shift=center_shift_chunk, + weight_in_clusters=weight_in_clusters_chunk, + ).new_chunks( + [ + centers_old_chunk, + centers_new_chunk, + center_shift_chunk, + weight_in_clusters_chunk, + ], + kws=[centers_new_chunk.params, center_shift_chunk.params], + ) + + centers_new_kw = op.centers_new.params + centers_new_kw["chunks"] = [centers_new_chunk] + centers_new_kw["nsplits"] = op.centers_new.nsplits + center_shift_kw = 
op.center_shift.params + center_shift_kw["chunks"] = [center_shift_chunk] + center_shift_kw["nsplits"] = op.center_shift.nsplits + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[centers_new_kw, center_shift_kw]) + + @classmethod + def execute(cls, ctx, op: "KMeansLloydPostprocess"): + ( + (centers_old, centers_new, center_shift, weight_in_clusters), + device_id, + xp, + ) = as_same_device( + [ctx[inp.key] for inp in op.inputs], + op.device, + ret_extra=True, + copy_if_not_writeable=True, + ) + + with device(device_id): + out_center_shift = center_shift.copy() + out_centers_new = centers_new.copy() + update_center( + centers_old, out_centers_new, out_center_shift, weight_in_clusters + ) + + ctx[op.outputs[0].key] = out_centers_new + ctx[op.outputs[1].key] = out_center_shift + + +def lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers_old, + labels, + center_shift, + update_centers=True, + session=None, + run_kwargs=None, +): + update_op = KMeansLloydUpdate( + x=X, + sample_weight=sample_weight, + x_squared_norms=x_squared_norms, + centers_old=centers_old, + labels=labels, + update_centers=update_centers, + n_clusters=centers_old.shape[0], + ) + to_run = [] + ret = update_op() + to_run.extend(ret) + labels, centers_new, weight_in_clusters = ret + + if update_centers: + centers_new, weight_in_clusters = _relocate_empty_clusters( + X, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + to_run=to_run, + session=session, + run_kwargs=run_kwargs, + ) + postprocess = KMeansLloydPostprocess( + centers_old=centers_old, + centers_new=centers_new, + center_shift=center_shift, + weight_in_clusters=weight_in_clusters, + ) + centers_new, center_shift = postprocess() + + return centers_new, weight_in_clusters, labels, center_shift diff --git a/python/xorbits/_mars/learn/cluster/_kmeans.py b/python/xorbits/_mars/learn/cluster/_kmeans.py new file mode 100644 index 000000000..112b3b245 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/_kmeans.py @@ -0,0 +1,1122 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np +from sklearn.base import ClusterMixin, TransformerMixin +from sklearn.exceptions import ConvergenceWarning + +from ... 
import tensor as mt +from ...tensor.utils import check_random_state +from ..base import BaseEstimator +from ..metrics.pairwise import euclidean_distances +from ..utils.extmath import row_norms +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_array, + check_is_fitted, +) +from ._k_means_common import _inertia +from ._k_means_elkan_iter import elkan_iter, init_bounds +from ._k_means_init import _k_init, _scalable_k_init +from ._k_means_lloyd_iter import lloyd_iter + +############################################################################### +# K-means batch estimation by EM (expectation maximization) + + +def _validate_center_shape(X, n_centers, centers): + """Check if centers is compatible with X and n_centers""" + if len(centers) != n_centers: + raise ValueError( + "The shape of the initial centers (%s) " + "does not match the number of clusters %i" % (centers.shape, n_centers) + ) + if centers.shape[1] != X.shape[1]: + raise ValueError( + "The number of features of the initial centers %s " + "does not match the number of features of the data %s." + % (centers.shape[1], X.shape[1]) + ) + + +def _tolerance(X, tol): + """Return a tolerance which is independent of the dataset""" + variances = mt.var(X, axis=0) + return mt.mean(variances) * tol + + +def _check_normalize_sample_weight(sample_weight, X): + """Set sample_weight if None, and check for correct dtype""" + + sample_weight_was_none = sample_weight is None + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + if not sample_weight_was_none: + # normalize the weights to sum up to n_samples + # an array of 1 (i.e. samples_weight is None) is already normalized + n_samples = len(sample_weight) + scale = n_samples / sample_weight.sum() + sample_weight *= scale + return sample_weight + + +def k_means( + X, + n_clusters, + sample_weight=None, + init="k-means||", + n_init=10, + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + algorithm="auto", + oversampling_factor=2, + init_iter=5, + return_n_iter=False, +): + """K-means clustering algorithm. + + Parameters + ---------- + X : Tensor, shape (n_samples, n_features) + The observations to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + + n_clusters : int + The number of clusters to form as well as the number of + centroids to generate. + + sample_weight : array-like, shape (n_samples,), optional + The weights for each observation in X. If None, all observations + are assigned equal weight (default: None) + + init : {'k-means++', 'k-means||', 'random', or tensor, or a callable}, optional + Method for initialization, default to 'k-means||': + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'k-means||': scalable k-means++. + + 'random': choose k observations (rows) at random from data for + the initial centroids. + + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + If a callable is passed, it should take arguments X, k and + and a random state and return an initialization. + + n_init : int, optional, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. 
+ + max_iter : int, optional, default 300 + Maximum number of iterations of the k-means algorithm to run. + + verbose : boolean, optional + Verbosity mode. + + tol : float, optional + The relative increment in the results before declaring convergence. + + random_state : int, RandomState instance or None (default) + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, optional + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified, ensuring X is C-contiguous. If False, the original data + is modified, and put back before the function returns, but small + numerical differences may be introduced by subtracting and then adding + the data mean, in this case it will also not ensure that data is + C-contiguous which may cause a significant slowdown. + + algorithm : "auto", "full" or "elkan", default="auto" + K-means algorithm to use. The classical EM-style algorithm is "full". + The "elkan" variation is more efficient by using the triangle + inequality, but currently doesn't support sparse data. "auto" chooses + "elkan" for dense data and "full" for sparse data. + + oversampling_factor: int, default=2 + Only work for kmeans||, used in each iteration in kmeans||. + + init_iter: int, default=5 + Only work for kmeans||, indicates how may iterations required. + + return_n_iter : bool, optional + Whether or not to return the number of iterations. + + Returns + ------- + centroid : float ndarray with shape (k, n_features) + Centroids found at the last iteration of k-means. + + label : integer ndarray with shape (n_samples,) + label[i] is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + best_n_iter : int + Number of iterations corresponding to the best results. + Returned only if `return_n_iter` is set to True. 
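A minimal usage sketch of this function, assuming a default session is available and that `k_means` is exported from the cluster package (as the tests below import it); the printed values are not guaranteed here:

```python
import mars.tensor as mt
from mars.learn.cluster import k_means

X = mt.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])
centroids, labels, inertia, n_iter = k_means(
    X, n_clusters=2, n_init=1, random_state=0, init="k-means++", return_n_iter=True
)
print(centroids.fetch(), labels.fetch(), inertia, n_iter)
```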
+ """ + + est = KMeans( + n_clusters=n_clusters, + init=init, + n_init=n_init, + max_iter=max_iter, + verbose=verbose, + tol=tol, + random_state=random_state, + copy_x=copy_x, + algorithm=algorithm, + oversampling_factor=oversampling_factor, + init_iter=init_iter, + ).fit(X, sample_weight=sample_weight) + if return_n_iter: + return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_ + else: + return est.cluster_centers_, est.labels_, est.inertia_ + + +def _kmeans_single_elkan( + X, + sample_weight, + centers_init, + n_clusters, + max_iter=300, + verbose=False, + x_squared_norms=None, + tol=1e-4, + X_mean=None, + session=None, + run_kwargs=None, +): + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + centers = centers_init + # execute X, centers and tol first + tol = mt.asarray(tol) + to_run = [X, sample_weight, centers, x_squared_norms, tol] + if X_mean is not None: + to_run.append(X_mean) + mt.ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + tol = tol.fetch(session=session) + + if verbose: + print("Initialization complete") + + center_half_distances = euclidean_distances(centers) / 2 + distance_next_center = mt.partition( + mt.asarray(center_half_distances), kth=1, axis=0 + )[1] + center_shift = mt.zeros(n_clusters, dtype=X.dtype) + + labels, upper_bounds, lower_bounds = init_bounds( + X, centers, center_half_distances, n_clusters + ) + + for i in range(max_iter): + to_runs = [] + + ( + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ) = elkan_iter( + X, + sample_weight, + centers, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + session=session, + run_kwargs=run_kwargs, + ) + to_runs.extend( + [ + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ] + ) + + # compute new pairwise distances between centers and closest other + # center of each center for next iterations + center_half_distances = euclidean_distances(centers_new) / 2 + distance_next_center = mt.partition( + mt.asarray(center_half_distances), kth=1, axis=0 + )[1] + to_runs.extend([center_half_distances, distance_next_center]) + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels) + to_runs.append(inertia) + + center_shift_tot = (center_shift**2).sum() + to_runs.append(center_shift_tot) + + mt.ExecutableTuple(to_runs).execute(session=session, **(run_kwargs or dict())) + + if verbose: + inertia_data = inertia.fetch(session=session) + print(f"Iteration {i}, inertia {inertia_data}") + + center_shift_tot = center_shift_tot.fetch(session=session) + if center_shift_tot <= tol: + if verbose: # pragma: no cover + print( + f"Converged at iteration {i}: center shift {center_shift_tot} " + f"within tolerance {tol}" + ) + break + + centers, centers_new = centers_new, centers + + if center_shift_tot > 0: + # rerun E-step so that predicted labels match cluster centers + ( + centers_new, + weight_in_clusters, + upper_bounds, + lower_bounds, + labels, + center_shift, + ) = elkan_iter( + X, + sample_weight, + centers, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + update_centers=False, + session=session, + run_kwargs=run_kwargs, + ) + + inertia = _inertia(X, sample_weight, centers, labels) + + mt.ExecutableTuple([labels, inertia, centers]).execute( + session=session, **(run_kwargs or dict()) + ) + return labels, inertia, centers, i + 1 + + +def 
_kmeans_single_lloyd( + X, + sample_weight, + centers_init, + n_clusters, + max_iter=300, + verbose=False, + x_squared_norms=None, + tol=1e-4, + X_mean=None, + session=None, + run_kwargs=None, +): + sample_weight = _check_normalize_sample_weight(sample_weight, X) + + centers = centers_init + # execute X, centers and tol first + tol = mt.asarray(tol) + to_run = [X, centers, x_squared_norms, tol] + if X_mean is not None: + to_run.append(X_mean) + mt.ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + tol = tol.fetch(session=session) + + if verbose: # pragma: no cover + print("Initialization complete") + + labels = mt.full(X.shape[0], -1, dtype=mt.int32) + center_shift = mt.zeros(n_clusters, dtype=X.dtype) + + for i in range(max_iter): + to_runs = [] + + centers_new, weight_in_clusters, labels, center_shift = lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers, + labels, + center_shift, + update_centers=True, + session=session, + run_kwargs=run_kwargs, + ) + to_runs.extend([centers_new, weight_in_clusters, labels, center_shift]) + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels) + to_runs.append(inertia) + + center_shift_tot = (center_shift**2).sum() + to_runs.append(center_shift_tot) + + mt.ExecutableTuple(to_runs).execute(session=session, **(run_kwargs or dict())) + + if verbose: # pragma: no cover + inertia_data = inertia.fetch(session=session) + print(f"Iteration {i}, inertia {inertia_data}") + + center_shift_tot = center_shift_tot.fetch(session=session) + if center_shift_tot <= tol: + if verbose: # pragma: no cover + print( + f"Converged at iteration {i}: center shift {center_shift_tot} " + f"within tolerance {tol}" + ) + break + + centers, centers_new = centers_new, centers + + if center_shift_tot > 0: + # rerun E-step so that predicted labels match cluster centers + centers_new, weight_in_clusters, labels, center_shift = lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers, + labels, + center_shift, + update_centers=False, + session=session, + run_kwargs=run_kwargs, + ) + + inertia = _inertia(X, sample_weight, centers, labels) + + mt.ExecutableTuple([labels, inertia, centers]).execute( + session=session, **(run_kwargs or dict()) + ) + return labels, inertia, centers, i + 1 + + +def _labels_inertia( + X, sample_weight, x_squared_norms, centers, session=None, run_kwargs=None +): + """E step of the K-means EM algorithm. + + Compute the labels and the inertia of the given samples and centers. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples to assign to the labels. If sparse matrix, must be in + CSR format. + + sample_weight : array-like of shape (n_samples,) + The weights for each observation in X. + + x_squared_norms : Tensor of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : Tensor, shape (n_clusters, n_features) + The cluster centers. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The resulting assignment + + inertia : float + Sum of squared distances of samples to their closest cluster center. 
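Conceptually, this E-step is an argmin over per-center squared distances plus a weighted sum of the minima; a plain-NumPy sketch of the same computation (illustrative names and small hypothetical arrays; the real helper reuses `lloyd_iter` with `update_centers=False` on chunked tensors):

```python
import numpy as np

def labels_inertia_sketch(X, sample_weight, centers):
    # Squared distance from every sample to every center.
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = d2.argmin(axis=1)
    inertia = (sample_weight * d2[np.arange(len(X)), labels]).sum()
    return labels, inertia

X = np.random.RandomState(0).rand(8, 3)
labels, inertia = labels_inertia_sketch(X, np.ones(8), X[:2])
```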
+ """ + n_samples = X.shape[0] + n_clusters = centers.shape[0] + + sample_weight = _check_normalize_sample_weight(sample_weight, X) + labels = mt.full(n_samples, -1, dtype=np.int32) + weight_in_clusters = mt.zeros(n_clusters, dtype=centers.dtype) + center_shift = mt.zeros_like(weight_in_clusters) + + centers, weight_in_clusters, labels, center_shift = lloyd_iter( + X, + sample_weight, + x_squared_norms, + centers, + labels, + center_shift, + update_centers=False, + session=session, + run_kwargs=run_kwargs, + ) + + inertia = _inertia(X, sample_weight, centers, labels) + + return labels, inertia + + +def _init_centroids( + X, + n_clusters=8, + init="k-means++", + random_state=None, + x_squared_norms=None, + init_size=None, + oversampling_factor=2, + init_iter=5, +): + """Compute the initial centroids + + Parameters + ---------- + + X : Tensor of shape (n_samples, n_features) + The input samples. + + n_clusters : int, default=8 + number of centroids. + + init : {'k-means++', 'k-means||', 'random', tensor, callable}, default="k-means++" + Method for initialization. + + random_state : int, RandomState instance, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + x_squared_norms : tensor of shape (n_samples,), default=None + Squared euclidean norm of each data point. Pass it if you have it at + hands already to avoid it being recomputed here. Default: None + + init_size : int, default=None + Number of samples to randomly sample for speeding up the + initialization (sometimes at the expense of accuracy): the + only algorithm is initialized by running a batch KMeans on a + random subset of the data. This needs to be larger than k. + + Returns + ------- + centers : tensor of shape(k, n_features) + """ + random_state = check_random_state(random_state).to_numpy() + n_samples = X.shape[0] + + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + + if init_size is not None and init_size < n_samples: # pragma: no cover + if init_size < n_clusters: + warnings.warn( + f"init_size={init_size} should be larger than k={n_clusters}. 
" + "Setting it to 3*k", + RuntimeWarning, + stacklevel=2, + ) + init_size = 3 * n_clusters + init_indices = random_state.randint(0, n_samples, init_size) + X = X[init_indices] + x_squared_norms = x_squared_norms[init_indices] + n_samples = X.shape[0] + elif n_samples < n_clusters: + raise ValueError( + f"n_samples={n_samples} should be larger than n_clusters={n_clusters}" + ) + + if isinstance(init, str) and init == "k-means++": + centers = _k_init( + X, n_clusters, random_state=random_state, x_squared_norms=x_squared_norms + ) + elif isinstance(init, str) and init == "k-means||": + centers = _scalable_k_init( + X, + n_clusters, + random_state=random_state, + x_squared_norms=x_squared_norms, + oversampling_factor=oversampling_factor, + init_iter=init_iter, + ) + elif isinstance(init, str) and init == "random": + seeds = random_state.choice(n_samples, size=n_clusters, replace=False) + centers = X[seeds].rechunk((n_clusters, X.shape[1])) + elif hasattr(init, "__array__"): + # ensure that the centers have the same dtype as X + # this is a requirement of fused types of cython + centers = mt.array(init, dtype=X.dtype) + elif callable(init): + centers = init(X, n_clusters, random_state=random_state) + centers = mt.asarray(centers, dtype=X.dtype) + else: # pragma: no cover + raise ValueError( + "the init parameter for the k-means should " + "be 'k-means++' or 'random' or a tensor, " + f"'{init}' (type '{type(init)}') was passed." + ) + + if centers.issparse(): + centers = centers.todense() + + _validate_center_shape(X, n_clusters, centers) + return centers + + +class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): + """K-Means clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. + + init : {'k-means++', 'k-means||', 'random'} or tensor of shape \ + (n_clusters, n_features), default='k-means||' + Method for initialization, defaults to 'k-means||': + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'k-means||': scalable k-means++. + + 'random': choose k observations (rows) at random from data for + the initial centroids. + + If a tensor is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + n_init : int, default=1 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm for a + single run. + + tol : float, default=1e-4 + Relative tolerance with regards to inertia to declare convergence. + + verbose : int, default=0 + Verbosity mode. + + random_state : int, RandomState instance, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified, ensuring X is C-contiguous. 
If False, the original data + is modified, and put back before the function returns, but small + numerical differences may be introduced by subtracting and then adding + the data mean, in this case it will also not ensure that data is + C-contiguous which may cause a significant slowdown. + + algorithm : {"auto", "full", "elkan"}, default="auto" + K-means algorithm to use. The classical EM-style algorithm is "full". + The "elkan" variation is more efficient by using the triangle + inequality, but currently doesn't support sparse data. "auto" chooses + "elkan" for dense data and "full" for sparse data. + + oversampling_factor: int, default=2 + Only work for kmeans||, used in each iteration in kmeans||. + + init_iter: int, default=5 + Only work for kmeans||, indicates how may iterations required. + + Attributes + ---------- + cluster_centers_ : tensor of shape (n_clusters, n_features) + Coordinates of cluster centers. If the algorithm stops before fully + converging (see ``tol`` and ``max_iter``), these will not be + consistent with ``labels_``. + + labels_ : tensor of shape (n_samples,) + Labels of each point + + inertia_ : float + Sum of squared distances of samples to their closest cluster center. + + n_iter_ : int + Number of iterations run. + + See Also + -------- + + MiniBatchKMeans + Alternative online implementation that does incremental updates + of the centers positions using mini-batches. + For large scale learning (say n_samples > 10k) MiniBatchKMeans is + probably much faster than the default batch implementation. + + Notes + ----- + The k-means problem is solved using either Lloyd's or Elkan's algorithm. + + The average complexity is given by O(k n T), were n is the number of + samples and T is the number of iteration. + + The worst case complexity is given by O(n^(k+2/p)) with + n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii, + 'How slow is the k-means method?' SoCG2006) + + In practice, the k-means algorithm is very fast (one of the fastest + clustering algorithms available), but it falls in local minima. That's why + it can be useful to restart it several times. + + If the algorithm stops before fully converging (because of ``tol`` or + ``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent, + i.e. the ``cluster_centers_`` will not be the means of the points in each + cluster. Also, the estimator will reassign ``labels_`` after the last + iteration to make ``labels_`` consistent with ``predict`` on the training + set. + + Examples + -------- + + >>> from mars.learn.cluster import KMeans + >>> import mars.tensor as mt + >>> X = mt.array([[1, 2], [1, 4], [1, 0], + ... 
[10, 2], [10, 4], [10, 0]]) + >>> kmeans = KMeans(n_clusters=2, random_state=0, init='k-means++').fit(X) + >>> kmeans.labels_ + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> kmeans.predict([[0, 0], [12, 3]]) + array([1, 0], dtype=int32) + >>> kmeans.cluster_centers_ + array([[10., 2.], + [ 1., 2.]]) + """ + + def __init__( + self, + n_clusters=8, + init="k-means||", + n_init=1, + max_iter=300, + tol=1e-4, + verbose=0, + random_state=None, + copy_x=True, + algorithm="auto", + oversampling_factor=2, + init_iter=5, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + self.copy_x = copy_x + self.algorithm = algorithm + self.oversampling_factor = oversampling_factor + self.init_iter = init_iter + + def _check_params(self, X): + # n_init + if self.n_init <= 0: + raise ValueError(f"n_init should be > 0, got {self.n_init} instead.") + self._n_init = self.n_init + + # max_iter + if self.max_iter <= 0: + raise ValueError(f"max_iter should be > 0, got {self.max_iter} instead.") + + # n_clusters + if X.shape[0] < self.n_clusters: + raise ValueError( + f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." + ) + + # tol + self._tol = _tolerance(X, self.tol) + + # algorithm + if self.algorithm not in ("auto", "full", "elkan"): + raise ValueError( + f"Algorithm must be 'auto', 'full' or 'elkan', " + f"got {self.algorithm} instead." + ) + + self._algorithm = self.algorithm + if self._algorithm == "auto": + # note(xuye.qin): + # Different from scikit-learn, + # for now, full seems more efficient when data is large, + # elkan needs to be tuned more + # old: algorithm = "full" if self.n_clusters == 1 else "elkan" + self._algorithm = "full" + if self._algorithm == "elkan" and self.n_clusters == 1: + warnings.warn( + "algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'full' instead.", + RuntimeWarning, + ) + self._algorithm = "full" + + # init + if not ( + hasattr(self.init, "__array__") + or callable(self.init) + or ( + isinstance(self.init, str) + and self.init in ["k-means++", "k-means||", "random"] + ) + ): + raise ValueError( + f"init should be either 'k-means++', 'k-mean||', 'random', " + f"a tensor, a ndarray or a " + f"callable, got '{self.init}' instead." + ) + + if hasattr(self.init, "__array__") and self._n_init != 1: + warnings.warn( + f"Explicit initial center position passed: performing only" + f" one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}.", + RuntimeWarning, + stacklevel=2, + ) + self._n_init = 1 + + def _check_test_data(self, X): + X = check_array( + X, + accept_sparse=True, + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + ) + n_samples, n_features = X.shape + expected_n_features = self.cluster_centers_.shape[1] + if not n_features == expected_n_features: # pragma: no cover + raise ValueError( + f"Incorrect number of features. Got {n_features} features, " + f"expected {expected_n_features}" + ) + + return X + + def fit(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """Compute k-means clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory + copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. 
+ + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + self + Fitted estimator. + """ + expect_chunk_size_on_columns = mt.tensor(X).shape[1] + if not np.isnan(expect_chunk_size_on_columns): + X = mt.tensor(X, chunk_size={1: expect_chunk_size_on_columns}) + + X = self._validate_data( + X, + accept_sparse=True, + dtype=[np.float64, np.float32], + order="C", + copy=self.copy_x, + accept_large_sparse=False, + ) + # verify that the number of samples given is larger than k + if np.isnan(_num_samples(X)): # pragma: no cover + X.execute(session=session, **(run_kwargs or dict())) + + self._check_params(X) + random_state = check_random_state(self.random_state).to_numpy() + + tol = _tolerance(X, self.tol) + + # Validate init array + init = self.init + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, copy=True, order="C") + _validate_center_shape(X, self.n_clusters, init) + + # subtract of mean of x for more accurate distance computations + X_mean = None + if not X.issparse(): + X_mean = X.mean(axis=0) + # The copy was already done above + X -= X_mean + + if hasattr(init, "__array__"): + init -= X_mean + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + best_labels, best_inertia, best_centers = None, None, None + + if self._algorithm == "full": + kmeans_single = _kmeans_single_lloyd + else: + kmeans_single = _kmeans_single_elkan + + for i in range(self._n_init): # pylint: disable=unused-variable + # Initialize centers + centers_init = _init_centroids( + X, + self.n_clusters, + init, + random_state=random_state, + x_squared_norms=x_squared_norms, + oversampling_factor=self.oversampling_factor, + init_iter=self.init_iter, + ) + + # run a k-means once + labels, inertia, centers, n_iter_ = kmeans_single( + X, + sample_weight, + centers_init, + self.n_clusters, + max_iter=self.max_iter, + verbose=self.verbose, + tol=tol, + x_squared_norms=x_squared_norms, + X_mean=X_mean, + session=session, + run_kwargs=run_kwargs, + ) + inertia = inertia.fetch(session=session) + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels + best_centers = centers + best_inertia = inertia + best_n_iter = n_iter_ + + if not X.issparse(): + if not self.copy_x: # pragma: no cover + X += X_mean + best_centers += X_mean + best_centers.execute(session=session, **(run_kwargs or dict())) + + distinct_clusters = len(set(best_labels.fetch(session=session))) + if distinct_clusters < self.n_clusters: # pragma: no cover + warnings.warn( + f"Number of distinct clusters ({distinct_clusters}) found smaller than " + f"n_clusters ({self.n_clusters}). Possibly due to duplicate points in X.", + ConvergenceWarning, + stacklevel=2, + ) + + self.cluster_centers_ = best_centers + self.labels_ = best_labels + self.inertia_ = best_inertia + self.n_iter_ = best_n_iter + return self + + def fit_predict(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """Compute cluster centers and predict cluster index for each sample. + + Convenience method; equivalent to calling fit(X) followed by + predict(X). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. 
+ + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + return self.fit( + X, sample_weight=sample_weight, session=session, run_kwargs=run_kwargs + ).labels_ + + def fit_transform( + self, X, y=None, sample_weight=None, session=None, run_kwargs=None + ): + """Compute clustering and transform X to cluster-distance space. + + Equivalent to fit(X).transform(X), but more efficiently implemented. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + X_new : array of shape (n_samples, n_clusters) + X transformed in the new space. + """ + # Currently, this just skips a copy of the data if it is not in + # np.array or CSR format already. + # XXX This skips _check_test_data, which may change the dtype; + # we should refactor the input validation. + self.fit(X, sample_weight=sample_weight, session=session, run_kwargs=run_kwargs) + return self._transform(X, session=session, run_kwargs=run_kwargs) + + def transform(self, X, session=None, run_kwargs=None): + """Transform X to a cluster-distance space. + + In the new space, each dimension is the distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + Returns + ------- + X_new : tensor of shape (n_samples, n_clusters) + X transformed in the new space. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + return self._transform(X, session=session, run_kwargs=run_kwargs) + + def _transform(self, X, session=None, run_kwargs=None): + """guts of transform method; no input validation""" + return euclidean_distances(X, self.cluster_centers_).execute( + session=session, **(run_kwargs or dict()) + ) + + def predict(self, X, sample_weight=None, session=None, run_kwargs=None): + """Predict the closest cluster each sample in X belongs to. + + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + labels : tensor of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + x_squared_norms = row_norms(X, squared=True) + + result = _labels_inertia( + X, + sample_weight, + x_squared_norms, + self.cluster_centers_, + session=session, + run_kwargs=run_kwargs, + )[0] + result.execute(session=session, *(run_kwargs or dict())) + return result + + def score(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """Opposite of the value of X on the K-means objective. 
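Put differently, `score` returns the negative inertia of `X` under the fitted centers (larger, i.e. closer to zero, is better), as the body below computes via `-_labels_inertia(...)[1]`. A small sketch reusing the estimator from the class docstring example, assuming a default session:

```python
import mars.tensor as mt
from mars.learn.cluster import KMeans

X = mt.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
km = KMeans(n_clusters=2, n_init=1, random_state=0, init="k-means++").fit(X)
# On the training data the score matches -inertia_ up to floating-point noise.
print(km.score(X).fetch(), -km.inertia_)
```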
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + score : float + Opposite of the value of X on the K-means objective. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + x_squared_norms = row_norms(X, squared=True) + + result = -_labels_inertia( + X, + sample_weight, + x_squared_norms, + self.cluster_centers_, + session=session, + run_kwargs=run_kwargs, + )[1] + result.execute(session=session, **(run_kwargs or dict())) + return result diff --git a/python/xorbits/_mars/learn/cluster/tests/__init__.py b/python/xorbits/_mars/learn/cluster/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/cluster/tests/test_k_means.py b/python/xorbits/_mars/learn/cluster/tests/test_k_means.py new file mode 100644 index 000000000..d75b9bb14 --- /dev/null +++ b/python/xorbits/_mars/learn/cluster/tests/test_k_means.py @@ -0,0 +1,513 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from io import StringIO + +import numpy as np +import pytest +import scipy.sparse as sp + +try: + from sklearn.datasets import make_blobs + from sklearn.metrics.cluster import v_measure_score + from sklearn.utils._testing import assert_raise_message +except ImportError: + pass + +from .... import tensor as mt +from ....config import options +from ....core import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from .. 
import KMeans, k_means +from .._kmeans import _init_centroids + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_k_means_results(setup, representation, dtype, algo): + array_constr = {"dense": np.array, "sparse": sp.csr_matrix}[representation] + + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] # will be rescaled to [1.5, 0.5, 0.5, 1.5] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.1875 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + np.testing.assert_array_equal(kmeans.labels_, expected_labels) + np.testing.assert_almost_equal(kmeans.inertia_, expected_inertia) + np.testing.assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) +def test_relocated_clusters(setup, representation, algo): + # check that empty clusters are relocated as expected + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + + array_constr = {"dense": np.array, "sparse": sp.csr_matrix}[representation] + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + np.testing.assert_array_equal(kmeans.labels_, expected_labels) + np.testing.assert_almost_equal(kmeans.inertia_, expected_inertia) + np.testing.assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("tol", [1e-2, 1e-4, 1e-8]) +def test_elkan_results(setup, distribution, tol): + # check that results are identical between lloyd and elkan algorithms + + rnd = np.random.RandomState(0) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + + km_full = KMeans( + algorithm="full", + n_clusters=5, + random_state=0, + n_init=1, + tol=tol, + init="k-means++", + ) + km_elkan = KMeans( + algorithm="elkan", + n_clusters=5, + random_state=0, + n_init=1, + tol=tol, + init="k-means++", + ) + + km_full.fit(X) + km_elkan.fit(X) + np.testing.assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + np.testing.assert_array_equal(km_elkan.labels_, km_full.labels_) + + assert km_elkan.n_iter_ == km_full.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_convergence(setup): + for algorithm in ["full", "elkan"]: + # Check that KMeans stops when convergence is reached when tol=0. 
(#16075) + rnd = np.random.RandomState(0) + X = rnd.normal(size=(5000, 10)) + + km = KMeans( + algorithm=algorithm, + n_clusters=5, + random_state=0, + n_init=1, + tol=0, + max_iter=300, + init="k-means++", + ).fit(X) + + assert km.n_iter_ < 300 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_elkan_results_sparse(setup): + for distribution in ["normal", "blobs"]: + # check that results are identical between lloyd and elkan algorithms + # with sparse input + rnd = np.random.RandomState(0) + if distribution == "normal": + X = sp.random(100, 100, density=0.1, format="csr", random_state=rnd) + X.data = rnd.randn(len(X.data)) + else: + X, _ = make_blobs(n_samples=100, n_features=100, random_state=rnd) + X = sp.csr_matrix(X) + + km_full = KMeans( + algorithm="full", n_clusters=5, random_state=0, n_init=1, init="k-means++" + ) + km_elkan = KMeans( + algorithm="elkan", n_clusters=5, random_state=0, n_init=1, init="k-means++" + ) + + km_full.fit(X) + km_elkan.fit(X) + np.testing.assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + np.testing.assert_allclose(km_elkan.labels_, km_full.labels_) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_new_centers(setup): + # Explore the part of the code where a new center is reassigned + X = np.array( + [ + [0, 0, 1, 1], + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 1, 0, 0], + ] + ) + labels = [0, 1, 2, 1, 1, 2] + bad_centers = np.array([[+0, 1, 0, 0], [0.2, 0, 0.2, 0.2], [+0, 0, 0, 0]]) + + km = KMeans( + n_clusters=3, + init=bad_centers, + n_init=1, + max_iter=10, + random_state=1, + algorithm="elkan", + ) + for this_X in (X, sp.coo_matrix(X)): + km.fit(this_X) + this_labels = km.labels_.fetch() + # Reorder the labels so that the first instance is in cluster 0, + # the second in cluster 1, ... 
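+        # np.unique(..., return_index=True)[1] gives the position of the first
+        # occurrence of each label value; indexing it with the labels replaces every
+        # label by the position where its cluster first appears, so the comparison
+        # below does not depend on the arbitrary cluster numbering chosen by k-means.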
+ this_labels = np.unique(this_labels, return_index=True)[1][this_labels] + np.testing.assert_array_equal(this_labels, labels) + + +def _check_fitted_model(km, n_clusters, n_features, true_labels): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_.fetch() + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert v_measure_score(true_labels, labels) == 1.0 + assert km.inertia_ > 0.0 + + # check error on dataset being too small + assert_raise_message( + ValueError, + "n_samples=1 should be >= n_clusters=%d" % km.n_clusters, + km.fit, + [[0.0, 1.0]], + ) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_init(setup): + # non centered, sparse centers to check the + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + ) + X_csr = sp.csr_matrix(X) + for data in [X, X_csr]: + for init in ["random", "k-means++", "k-means||", centers.copy()]: + data = mt.tensor(data, chunk_size=50) + km = KMeans( + init=init, + n_clusters=n_clusters, + random_state=42, + n_init=1, + algorithm="elkan", + ) + km.fit(data) + _check_fitted_model(km, n_clusters, n_features, true_labels) + + X = mt.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + kmeans = KMeans(n_clusters=2, random_state=0, n_init=1, init="k-means||").fit(X) + assert sorted(kmeans.cluster_centers_.fetch().tolist()) == sorted( + [[10.0, 2.0], [1.0, 2.0]] + ) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_n_init(setup): + rnd = np.random.RandomState(0) + X = rnd.normal(size=(40, 2)) + + # two regression tests on bad n_init argument + # previous bug: n_init <= 0 threw non-informative TypeError (#3858) + with pytest.raises(ValueError, match="n_init"): + KMeans(n_init=0, init="k-means++").fit(X) + with pytest.raises(ValueError, match="n_init"): + KMeans(n_init=-1, init="k-means++").fit(X) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_explicit_init_shape(setup): + # test for sensible errors when giving explicit init + # with wrong number of features or clusters + rnd = np.random.RandomState(0) + X = rnd.normal(size=(40, 3)) + + # mismatch of number of features + km = KMeans(n_init=1, init=X[:, :2], n_clusters=len(X), algorithm="elkan") + msg = "does not match the number of features of the data" + with pytest.raises(ValueError, match=msg): + km.fit(X) + # for callable init + km = KMeans( + n_init=1, + init=lambda X_, k, random_state: X_[:, :2], + n_clusters=len(X), + algorithm="elkan", + ) + with pytest.raises(ValueError, match=msg): + km.fit(X) + # mismatch of number of clusters + msg = "does not match the number of clusters" + km = KMeans(n_init=1, init=X[:2, :], n_clusters=3, algorithm="elkan") + with pytest.raises(ValueError, match=msg): + km.fit(X) + # for callable init + km = KMeans( + n_init=1, + init=lambda X_, k, random_state: X_[:2, :], + n_clusters=3, + algorithm="elkan", + ) + with pytest.raises(ValueError, match=msg): + km.fit(X) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def 
test_k_means_fortran_aligned_data(setup): + # Check the KMeans will work well, even if X is a fortran-aligned data. + X = np.asfortranarray([[0, 0], [0, 1], [0, 1]]) + centers = np.array([[0, 0], [0, 1]]) + labels = np.array([0, 1, 1]) + km = KMeans( + n_init=1, init=centers, random_state=42, n_clusters=2, algorithm="elkan" + ) + km.fit(X) + np.testing.assert_array_almost_equal(km.cluster_centers_, centers) + np.testing.assert_array_equal(km.labels_, labels) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +@pytest.mark.parametrize("algo", ["full", "elkan"]) +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_k_means_fit_predict(setup, algo, seed, max_iter, tol): + # check that fit.predict gives same result as fit_predict + rng = np.random.RandomState(seed) + + X = make_blobs(n_samples=1000, n_features=10, centers=10, random_state=rng)[0] + + kmeans = KMeans( + algorithm=algo, + n_clusters=10, + random_state=seed, + tol=tol, + max_iter=max_iter, + init="k-means++", + ) + + labels_1 = kmeans.fit(X).predict(X) + labels_2 = kmeans.fit_predict(X) + + # Due to randomness in the order in which chunks of data are processed when + # using more than one thread, the absolute values of the labels can be + # different between the 2 strategies but they should correspond to the same + # clustering. + assert pytest.approx(v_measure_score(labels_1, labels_2)) == 1 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_transform(setup): + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + )[0] + + km = KMeans(n_clusters=n_clusters, init="k-means++", algorithm="elkan") + km.fit(X) + X_new = km.transform(km.cluster_centers_).fetch() + + for c in range(n_clusters): + assert X_new[c, c] == 0 + for c2 in range(n_clusters): + if c != c2: + assert X_new[c, c2] > 0 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_fit_transform(setup): + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + X = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + )[0] + X1 = ( + KMeans(n_clusters=3, random_state=51, init="k-means++", algorithm="elkan") + .fit(X) + .transform(X) + ) + X2 = KMeans( + n_clusters=3, random_state=51, init="k-means++", algorithm="elkan" + ).fit_transform(X) + np.testing.assert_array_almost_equal(X1, X2) + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_score(setup): + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + )[0] + + for algo in ["full", "elkan"]: + # Check that fitting k-means with multiple inits gives better score + km1 = KMeans( + n_clusters=n_clusters, + max_iter=1, + random_state=42, + n_init=1, + algorithm=algo, + init="k-means++", + ) + s1 = km1.fit(X).score(X).fetch() + km2 = KMeans( + n_clusters=n_clusters, + max_iter=10, + 
random_state=42, + n_init=1, + algorithm=algo, + init="k-means++", + ) + s2 = km2.fit(X).score(X).fetch() + assert s2 > s1 + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_function(setup): + # test calling the k_means function directly + + # non centered, sparse centers to check the + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + n_samples = 100 + n_clusters, n_features = centers.shape + X, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + ) + + # catch output + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + cluster_centers, labels, inertia = k_means( + X, n_clusters=n_clusters, sample_weight=None, verbose=True, init="k-means++" + ) + finally: + sys.stdout = old_stdout + centers = cluster_centers + assert centers.shape == (n_clusters, n_features) + + labels = labels.fetch() + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert v_measure_score(true_labels, labels) == 1.0 + assert inertia > 0.0 + + # check warning when centers are passed + with pytest.warns(RuntimeWarning): + k_means( + X, + n_clusters=n_clusters, + sample_weight=None, + init=centers, + ) + + # to many clusters desired + with pytest.raises(ValueError): + k_means(X, n_clusters=X.shape[0] + 1, sample_weight=None, init="k-means++") + + +@pytest.mark.skipif(KMeans is None, reason="scikit-learn not installed") +def test_k_means_init_large_n_clusters(): + chunk_bytes_limit = options.chunk_store_limit * 2 + n_cluster = 2000 + x = mt.random.rand(1000_000, 64, chunk_size=250_000) + + centers = _init_centroids(x, n_cluster, init="k-means||") + t_graph = next(TileableGraphBuilder(TileableGraph([centers])).build()) + graph = next(ChunkGraphBuilder(t_graph).build()) + for c in graph: + nbytes = c.nbytes + if not np.isnan(nbytes): + assert nbytes <= chunk_bytes_limit diff --git a/python/xorbits/_mars/learn/contrib/__init__.py b/python/xorbits/_mars/learn/contrib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/joblib/__init__.py b/python/xorbits/_mars/learn/contrib/joblib/__init__.py new file mode 100644 index 000000000..c5df56dec --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .backend import register_mars_backend diff --git a/python/xorbits/_mars/learn/contrib/joblib/backend.py b/python/xorbits/_mars/learn/contrib/joblib/backend.py new file mode 100644 index 000000000..ebdbaccb3 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/backend.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import concurrent.futures + +from .... import remote +from ....deploy.oscar.session import get_default_session, new_session + +try: + from joblib.parallel import ( + AutoBatchingMixin, + ParallelBackendBase, + register_parallel_backend, + ) +except ImportError: + ParallelBackendBase = object + AutoBatchingMixin = object + register_parallel_backend = None + + +class MarsDistributedBackend(AutoBatchingMixin, ParallelBackendBase): + MIN_IDEAL_BATCH_DURATION = 0.2 + MAX_IDEAL_BATCH_DURATION = 1.0 + supports_timeout = True + + def __init__(self, service=None, session=None, backend=None, n_parallel=None): + super().__init__() + + if session is None: + if service is not None: + self.session = new_session(service, backend=backend, default=False) + else: + self.session = get_default_session() + else: + self.session = session + + self.n_parallel = n_parallel or 1 + self.executor = None + + def get_nested_backend(self): + return MarsDistributedBackend(session=self.session), -1 + + def configure(self, n_jobs=1, parallel=None, **backend_args): + self.parallel = parallel + n_parallel = self.effective_n_jobs(n_jobs) + self.executor = concurrent.futures.ThreadPoolExecutor(n_parallel) + return n_parallel + + def effective_n_jobs(self, n_jobs): + eff_n_jobs = super(MarsDistributedBackend, self).effective_n_jobs(n_jobs) + if n_jobs == -1: + eff_n_jobs = self.n_parallel + return eff_n_jobs + + def apply_async(self, func, callback=None): + # todo allow execute f() in remote end to reduce data copy latency + def f(): + spawned = [] + for func_obj, args, kwargs in func.items: + spawned.append(remote.spawn(func_obj, args=args, kwargs=kwargs)) + + ret = ( + remote.ExecutableTuple(spawned) + .execute(session=self.session) + .fetch(self.session) + ) + callback(ret) + return ret + + future = self.executor.submit(f) + future.get = future.result + return future + + +def register_mars_backend(): + register_parallel_backend("mars", MarsDistributedBackend) diff --git a/python/xorbits/_mars/learn/contrib/joblib/tests/__init__.py b/python/xorbits/_mars/learn/contrib/joblib/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/tests/__init__.py @@ -0,0 +1,13 @@ +# 
Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/joblib/tests/test_backend.py b/python/xorbits/_mars/learn/contrib/joblib/tests/test_backend.py new file mode 100644 index 000000000..d3c763919 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/joblib/tests/test_backend.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import joblib +import numpy as np +from sklearn.datasets import load_digits +from sklearn.model_selection import RandomizedSearchCV +from sklearn.svm import SVC + +from .. import register_mars_backend + +register_mars_backend() + + +def test_sk_learn_svc_train(setup): + digits = load_digits() + param_space = { + "C": np.logspace(-6, 6, 30), + "gamma": np.logspace(-8, 8, 30), + "tol": np.logspace(-4, -1, 30), + "class_weight": [None, "balanced"], + } + model = SVC(kernel="rbf") + search = RandomizedSearchCV(model, param_space, cv=5, n_iter=5, verbose=10) + + with joblib.parallel_backend("mars", n_parallel=16): + search.fit(digits.data, digits.target) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/__init__.py b/python/xorbits/_mars/learn/contrib/lightgbm/__init__.py new file mode 100644 index 000000000..55a078a6f --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
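+# LGBMClassifier, LGBMRegressor and LGBMRanker are exposed via config_mod_getattr
+# below, so they are only resolved on first attribute access; when lightgbm is not
+# installed, the concrete classes degrade to the import-error stubs defined in the
+# individual modules.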
+ +from ._predict import predict, predict_proba + + +def register_op(): + from ._align import align_data_set + from ._train import train + + del train, align_data_set + + +from ..utils import config_mod_getattr as _config_mod_getattr + +_config_mod_getattr( + { + "LGBMClassifier": ".classifier.LGBMClassifier", + "LGBMRegressor": ".regressor.LGBMRegressor", + "LGBMRanker": ".ranker.LGBMRanker", + }, + globals(), +) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/_align.py b/python/xorbits/_mars/learn/contrib/lightgbm/_align.py new file mode 100644 index 000000000..1784a3bbf --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/_align.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import opcodes +from ....core import ExecutableTuple, get_output_types, recursive_tile +from ....serialization.serializables import AnyField +from ....utils import has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin + + +class LGBMAlign(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.LGBM_ALIGN + + _data = AnyField("data") + _label = AnyField("label") + _sample_weight = AnyField("sample_weight") + _init_score = AnyField("init_score") + + def __init__( + self, + data=None, + label=None, + sample_weight=None, + init_score=None, + output_types=None, + **kw + ): + super().__init__( + _data=data, + _label=label, + _sample_weight=sample_weight, + _init_score=init_score, + _output_types=output_types, + **kw + ) + + @property + def data(self): + return self._data + + @property + def label(self): + return self._label + + @property + def sample_weight(self): + return self._sample_weight + + @property + def init_score(self): + return self._init_score + + @property + def output_limit(self): + return 2 if self._sample_weight is None else 3 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + self._data = next(it) + for attr in ("_label", "_sample_weight", "_init_score"): + if getattr(self, attr) is not None: + setattr(self, attr, next(it)) + + def __call__(self): + kws, inputs = [], [] + for arg in [self.data, self.label, self.sample_weight, self.init_score]: + if hasattr(arg, "params"): + kws.append(arg.params) + inputs.append(arg) + tileables = self.new_tileables(inputs, kws=kws) + return ExecutableTuple(tileables) + + @classmethod + def tile(cls, op: "LGBMAlign"): + inputs = [ + d + for d in [op.data, op.label, op.sample_weight, op.init_score] + if d is not None + ] + data = op.data + + # check inputs to make sure no unknown chunk shape exists + if has_unknown_shape(*inputs): + yield + + if len(data.nsplits[1]) != 1: + data = yield from recursive_tile(data.rechunk({1: data.shape[1]})) + outputs = [data] + for inp in inputs[1:]: + if inp is not None: + outputs.append( + (yield from recursive_tile(inp.rechunk((data.nsplits[0],)))) + ) + + kws = [] + for o in outputs: + kw = o.params.copy() + kw.update(dict(chunks=o.chunks, nsplits=o.nsplits)) + kws.append(kw) + + new_op = 
op.copy().reset_key() + tileables = new_op.new_tileables(inputs, kws=kws) + + return tileables + + +def align_data_set(dataset): + out_types = get_output_types( + dataset.data, dataset.label, dataset.sample_weight, dataset.init_score + ) + op = LGBMAlign( + data=dataset.data, + label=dataset.label, + sample_weight=dataset.sample_weight, + init_score=dataset.init_score, + output_types=out_types, + ) + return op() diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/_predict.py b/python/xorbits/_mars/learn/contrib/lightgbm/_predict.py new file mode 100644 index 000000000..dd18b5766 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/_predict.py @@ -0,0 +1,240 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle + +import numpy as np +import pandas as pd + +from .... import opcodes +from ....core import recursive_tile +from ....dataframe.utils import parse_index +from ....serialization.serializables import BoolField, BytesField, DictField, KeyField +from ....tensor.core import TENSOR_TYPE, TensorOrder +from ...operands import LearnOperand, LearnOperandMixin, OutputType + + +class LGBMPredict(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.LGBM_PREDICT + + _data = KeyField("data") + _model = BytesField("model", on_serialize=pickle.dumps, on_deserialize=pickle.loads) + _proba = BoolField("proba") + _kwds = DictField("kwds") + + def __init__( + self, data=None, model=None, proba=None, kwds=None, output_types=None, **kw + ): + super().__init__( + _data=data, + _model=model, + _proba=proba, + _kwds=kwds, + _output_types=output_types, + **kw, + ) + + @property + def data(self): + return self._data + + @property + def model(self): + return self._model + + @property + def proba(self) -> bool: + return self._proba + + @property + def kwds(self) -> dict: + return self._kwds + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + self._data = next(it) + + def __call__(self): + num_class = int(getattr(self.model, "n_classes_", 2)) + if self.proba: + shape = (self.data.shape[0], num_class) + else: + shape = (self.data.shape[0],) + + if self._proba: + dtype = np.dtype(np.float_) + elif hasattr(self.model, "classes_"): + dtype = np.array(self.model.classes_).dtype + else: + dtype = getattr(self.model, "out_dtype_", np.dtype("float")) + + if self.output_types[0] == OutputType.tensor: + # tensor + return self.new_tileable( + [self.data], shape=shape, dtype=dtype, order=TensorOrder.C_ORDER + ) + elif self.output_types[0] == OutputType.dataframe: + # dataframe + dtypes = pd.Series([dtype] * num_class) + columns_value = parse_index(pd.Index(self.model.classes_), store_data=True) + return self.new_tileable( + [self.data], + shape=shape, + dtypes=dtypes, + columns_value=columns_value, + index_value=self.data.index_value, + ) + else: + return self.new_tileable( + [self.data], + shape=shape, + index_value=self.data.index_value, + name="predictions", + dtype=dtype, + ) + + @classmethod + def tile(cls, op: 
"LGBMPredict"): + out = op.outputs[0] + out_chunks = [] + data = op.data + if data.chunk_shape[1] > 1: + data = yield from recursive_tile(data.rechunk({1: op.data.shape[1]})) + + for in_chunk in data.chunks: + chunk_op = op.copy().reset_key() + chunk_index = (in_chunk.index[0],) + + if len(out.shape) > 1: + chunk_shape = (in_chunk.shape[0], out.shape[1]) + chunk_index += (0,) + else: + chunk_shape = (in_chunk.shape[0],) + + if op.output_types[0] == OutputType.tensor: + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + elif op.output_types[0] == OutputType.dataframe: + # dataframe chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtypes=out.dtypes, + columns_value=out.columns_value, + index_value=in_chunk.index_value, + index=chunk_index, + ) + else: + # series chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + index_value=in_chunk.index_value, + name=out.name, + index=chunk_index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + nsplits = (data.nsplits[0],) + if out.ndim > 1: + nsplits += ((out.shape[1],),) + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "LGBMPredict"): + in_data = ctx[op.data.key] + in_data = in_data.spmatrix if hasattr(in_data, "spmatrix") else in_data + out = op.outputs[0] + + if op.data.shape[0] == 0: + result = np.array([]) + elif op.proba: + result = op.model.predict_proba(in_data, **op.kwds) + else: + result = op.model.predict(in_data, **op.kwds) + + if op.output_types[0] == OutputType.dataframe: + result = pd.DataFrame( + result, index=in_data.index, columns=out.columns_value.to_pandas() + ) + elif op.output_types[0] == OutputType.series: + result = pd.Series(result, index=in_data.index, name="predictions") + + ctx[out.key] = result + + +def predict_base(model, data, session=None, run_kwargs=None, run=True, **kwargs): + from lightgbm import LGBMModel + + if not isinstance(model, LGBMModel): + raise TypeError( + f"model has to be a lightgbm.LGBMModel, got {type(model)} instead" + ) + model = model.to_local() if hasattr(model, "to_local") else model + + proba = kwargs.pop("proba", hasattr(model, "classes_")) + + if isinstance(data, TENSOR_TYPE): + output_types = [OutputType.tensor] + elif proba: + output_types = [OutputType.dataframe] + else: + output_types = [OutputType.series] + + op = LGBMPredict( + data=data, + model=model, + gpu=data.op.gpu, + output_types=output_types, + proba=proba, + kwds=kwargs, + ) + result = op() + if run: + result.execute(session=session, **(run_kwargs or dict())) + return result + + +def predict(model, data, session=None, run_kwargs=None, run=True, **kw): + if hasattr(model, "classes_"): + return predict_base( + model, + data, + session=session, + run_kwargs=run_kwargs, + proba=False, + run=run, + **kw, + ) + else: + return predict_base( + model, data, session=session, run_kwargs=run_kwargs, run=run, **kw + ) + + +def predict_proba(model, data, session=None, run_kwargs=None, run=True, **kw): + return predict_base( + model, data, session=session, run_kwargs=run_kwargs, run=run, proba=True, **kw + ) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/_train.py b/python/xorbits/_mars/learn/contrib/lightgbm/_train.py new file mode 100644 index 000000000..b0b7420ea --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/_train.py @@ 
-0,0 +1,458 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import operator +import pickle +from collections import defaultdict +from functools import reduce + +import numpy as np + +from .... import opcodes +from ....core import ExecutableTuple, OutputType, recursive_tile +from ....core.context import get_context +from ....core.operand import MergeDictOperand +from ....serialization.serializables import ( + DictField, + FieldTypes, + Int32Field, + KeyField, + ListField, + StringField, +) +from ...utils import collect_ports, concat_chunks +from ._align import align_data_set +from .core import LGBMModelType, get_model_cls_from_type + +logger = logging.getLogger(__name__) + + +class LGBMTrain(MergeDictOperand): + _op_type_ = opcodes.LGBM_TRAIN + + _model_type = Int32Field( + "model_type", on_serialize=lambda x: x.value, on_deserialize=LGBMModelType + ) + _params = DictField("params", key_type=FieldTypes.string) + _data = KeyField("data") + _label = KeyField("label") + _sample_weight = KeyField("sample_weight") + _init_score = KeyField("init_score") + _kwds = DictField("kwds", key_type=FieldTypes.string) + + _eval_datas = ListField("eval_datas", FieldTypes.key) + _eval_labels = ListField("eval_labels", FieldTypes.key) + _eval_sample_weights = ListField("eval_sample_weights", FieldTypes.key) + _eval_init_scores = ListField("eval_init_scores", FieldTypes.key) + + _workers = ListField("workers", FieldTypes.string) + _worker_id = Int32Field("worker_id") + _worker_ports = KeyField("worker_ports") + + _tree_learner = StringField("tree_learner") + _timeout = Int32Field("timeout") + + def __init__( + self, + model_type=None, + data=None, + label=None, + sample_weight=None, + init_score=None, + eval_datas=None, + eval_labels=None, + eval_sample_weights=None, + eval_init_scores=None, + params=None, + kwds=None, + workers=None, + worker_id=None, + worker_ports=None, + tree_learner=None, + timeout=None, + **kw, + ): + super().__init__( + _model_type=model_type, + _params=params, + _data=data, + _label=label, + _sample_weight=sample_weight, + _init_score=init_score, + _eval_datas=eval_datas, + _eval_labels=eval_labels, + _eval_sample_weights=eval_sample_weights, + _eval_init_scores=eval_init_scores, + _kwds=kwds, + _workers=workers, + _worker_id=worker_id, + _worker_ports=worker_ports, + _tree_learner=tree_learner, + _timeout=timeout, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def model_type(self) -> LGBMModelType: + return self._model_type + + @property + def data(self): + return self._data + + @property + def label(self): + return self._label + + @property + def sample_weight(self): + return self._sample_weight + + @property + def init_score(self): + return self._init_score + + @property + def eval_datas(self) -> list: + return self._eval_datas or [] + + @property + def eval_labels(self) -> list: + return self._eval_labels or [] + + @property + def eval_sample_weights(self) 
-> list: + return self._eval_sample_weights or [] + + @property + def eval_init_scores(self) -> list: + return self._eval_init_scores or [] + + @property + def params(self) -> dict: + return self._params or dict() + + @property + def kwds(self) -> dict: + return self._kwds or dict() + + @property + def workers(self) -> list: + return self._workers + + @property + def worker_id(self) -> int: + return self._worker_id + + @property + def worker_ports(self): + return self._worker_ports + + @property + def timeout(self) -> int: + return self._timeout + + @property + def tree_learner(self) -> str: + return self._tree_learner + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + it = iter(inputs) + for attr in ["_data", "_label", "_sample_weight", "_init_score"]: + if getattr(self, attr) is not None: + setattr(self, attr, next(it)) + for attr in [ + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + new_list = [] + for c in getattr(self, attr, None) or []: + if c is not None: + new_list.append(next(it)) + setattr(self, attr, new_list or None) + + if self._worker_ports is not None: + self._worker_ports = next(it) + + def __call__(self): + inputs = [] + for attr in ["_data", "_label", "_sample_weight", "_init_score"]: + if getattr(self, attr) is not None: + inputs.append(getattr(self, attr)) + for attr in [ + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + for c in getattr(self, attr, None) or []: + if c is not None: + inputs.append(c) + return self.new_tileable(inputs) + + @staticmethod + def _get_data_chunks_workers(ctx, data): + # data_chunk.inputs is concat, and concat's input is the co-allocated chunks + metas = ctx.get_chunks_meta([c.key for c in data.chunks], fields=["bands"]) + return [m["bands"][0][0] for m in metas] + + @staticmethod + def _concat_chunks_by_worker(chunks, chunk_workers): + worker_to_chunks = defaultdict(list) + for chunk, worker in zip(chunks, chunk_workers): + worker_to_chunks[worker].append(chunk) + worker_to_concat = dict() + for worker, chunks in worker_to_chunks.items(): + worker_to_concat[worker] = concat_chunks(chunks) + return worker_to_concat + + @classmethod + def tile(cls, op: "LGBMTrain"): + ctx = get_context() + data = op.data + worker_to_args = defaultdict(dict) + + workers = cls._get_data_chunks_workers(ctx, data) + + for arg in ["_data", "_label", "_sample_weight", "_init_score"]: + if getattr(op, arg) is not None: + for worker, chunk in cls._concat_chunks_by_worker( + getattr(op, arg).chunks, workers + ).items(): + worker_to_args[worker][arg] = chunk + + if op.eval_datas: + eval_workers_list = [ + cls._get_data_chunks_workers(ctx, d) for d in op.eval_datas + ] + extra_workers = reduce( + operator.or_, (set(w) for w in eval_workers_list) + ) - set(workers) + worker_remap = dict(zip(extra_workers, itertools.cycle(workers))) + if worker_remap: + eval_workers_list = [ + [worker_remap.get(w, w) for w in wl] for wl in eval_workers_list + ] + + for arg in [ + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + if getattr(op, arg): + for tileable, eval_workers in zip( + getattr(op, arg), eval_workers_list + ): + for worker, chunk in cls._concat_chunks_by_worker( + tileable.chunks, eval_workers + ).items(): + if arg not in worker_to_args[worker]: + worker_to_args[worker][arg] = [] + worker_to_args[worker][arg].append(chunk) + + out_chunks = [] + workers = list(set(workers)) + for worker_id, worker in enumerate(workers): + chunk_op = 
op.copy().reset_key() + chunk_op.expect_worker = worker + + input_chunks = [] + concat_args = worker_to_args.get(worker, {}) + for arg in [ + "_data", + "_label", + "_sample_weight", + "_init_score", + "_eval_datas", + "_eval_labels", + "_eval_sample_weights", + "_eval_init_scores", + ]: + arg_val = getattr(op, arg) + if arg_val: + arg_chunk = concat_args.get(arg) + setattr(chunk_op, arg, arg_chunk) + if isinstance(arg_chunk, list): + input_chunks.extend(arg_chunk) + else: + input_chunks.append(arg_chunk) + + worker_ports_chunk = ( + yield from recursive_tile(collect_ports(workers, op.data)) + ).chunks[0] + input_chunks.append(worker_ports_chunk) + + chunk_op._workers = workers + chunk_op._worker_ports = worker_ports_chunk + chunk_op._worker_id = worker_id + + data_chunk = concat_args["_data"] + out_chunk = chunk_op.new_chunk( + input_chunks, shape=(np.nan,), index=data_chunk.index[:1] + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, chunks=out_chunks, nsplits=((np.nan for _ in out_chunks),) + ) + + @classmethod + def execute(cls, ctx, op: "LGBMTrain"): + if op.merge: + return super().execute(ctx, op) + + from lightgbm.basic import _LIB, _safe_call + + data_val = ctx[op.data.key] + data_val = data_val.spmatrix if hasattr(data_val, "spmatrix") else data_val + + label_val = ctx[op.label.key] + sample_weight_val = ( + ctx[op.sample_weight.key] if op.sample_weight is not None else None + ) + init_score_val = ctx[op.init_score.key] if op.init_score is not None else None + + if op.eval_datas is None: + eval_set, eval_sample_weight, eval_init_score = None, None, None + else: + eval_set, eval_sample_weight, eval_init_score = [], [], [] + for data, label in zip(op.eval_datas, op.eval_labels): + data_eval = ctx[data.key] + data_eval = ( + data_eval.spmatrix if hasattr(data_eval, "spmatrix") else data_eval + ) + eval_set.append((data_eval, ctx[label.key])) + for weight in op.eval_sample_weights: + eval_sample_weight.append( + ctx[weight.key] if weight is not None else None + ) + for score in op.eval_init_scores: + eval_init_score.append(ctx[score.key] if score is not None else None) + + eval_set = eval_set or None + eval_sample_weight = eval_sample_weight or None + eval_init_score = eval_init_score or None + + params = op.params.copy() + # if model is trained, remove unsupported parameters + params.pop("out_dtype_", None) + worker_ports = ctx[op.worker_ports.key] + worker_ips = [worker.split(":", 1)[0] for worker in op.workers] + worker_endpoints = [ + f"{worker}:{port}" for worker, port in zip(worker_ips, worker_ports) + ] + + params["machines"] = ",".join(worker_endpoints) + params["time_out"] = op.timeout + params["num_machines"] = len(worker_endpoints) + params["local_listen_port"] = worker_ports[op.worker_id] + + if (op.tree_learner or "").lower() not in {"data", "feature", "voting"}: + logger.warning( + "Parameter tree_learner not set or set to incorrect value " + f'{op.tree_learner}, using "data" as default' + ) + params["tree_learner"] = "data" + else: + params["tree_learner"] = op.tree_learner + + try: + model_cls = get_model_cls_from_type(op.model_type) + model = model_cls(**params) + model.fit( + data_val, + label_val, + sample_weight=sample_weight_val, + init_score=init_score_val, + eval_set=eval_set, + eval_sample_weight=eval_sample_weight, + eval_init_score=eval_init_score, + **op.kwds, + ) + + if ( + op.model_type == LGBMModelType.RANKER + or op.model_type == LGBMModelType.REGRESSOR + ): + 
model.set_params(out_dtype_=np.dtype("float")) + elif hasattr(label_val, "dtype"): + model.set_params(out_dtype_=label_val.dtype) + else: + model.set_params(out_dtype_=label_val.dtypes[0]) + + ctx[op.outputs[0].key] = pickle.dumps(model) + finally: + _safe_call(_LIB.LGBM_NetworkFree()) + + +def train(params, train_set, eval_sets=None, **kwargs): + eval_sets = eval_sets or [] + model_type = kwargs.pop("model_type", LGBMModelType.CLASSIFIER) + + evals_result = kwargs.pop("evals_result", dict()) + session = kwargs.pop("session", None) + run_kwargs = kwargs.pop("run_kwargs", None) + if run_kwargs is None: + run_kwargs = dict() + timeout = kwargs.pop("timeout", 120) + base_port = kwargs.pop("base_port", None) + + aligns = align_data_set(train_set) + for eval_set in eval_sets: + aligns += align_data_set(eval_set) + + aligned_iter = iter(ExecutableTuple(aligns).execute(session)) + datas, labels, sample_weights, init_scores = [], [], [], [] + for dataset in [train_set] + eval_sets: + train_kw = dict() + for arg in ["data", "label", "sample_weight", "init_score"]: + if getattr(dataset, arg) is not None: + train_kw[arg] = next(aligned_iter) + else: + train_kw[arg] = None + + datas.append(train_kw["data"]) + labels.append(train_kw["label"]) + sample_weights.append(train_kw["sample_weight"]) + init_scores.append(train_kw["init_score"]) + + op = LGBMTrain( + params=params, + data=datas[0], + label=labels[0], + sample_weight=sample_weights[0], + init_score=init_scores[0], + eval_datas=datas[1:], + eval_labels=labels[1:], + eval_weights=sample_weights[1:], + eval_init_score=init_scores[1:], + model_type=model_type, + timeout=timeout, + lgbm_port=base_port, + kwds=kwargs, + ) + ret = op().execute(session=session, **run_kwargs).fetch(session=session) + + bst = pickle.loads(ret) + evals_result.update(bst.evals_result_ or {}) + return bst diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/classifier.py b/python/xorbits/_mars/learn/contrib/lightgbm/classifier.py new file mode 100644 index 000000000..06c7610e4 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/classifier.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
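+# Distributed counterpart of lightgbm.LGBMClassifier: fit() delegates to the
+# LGBMTrain operand with model_type CLASSIFIER and copies the fitted attributes
+# back onto this wrapper, predict()/predict_proba() go through predict_base, and
+# to_local() rebuilds a plain lightgbm estimator from the same parameters and
+# fitted attributes.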
+ +from ...utils import check_consistent_length +from ..utils import make_import_error_func +from ._predict import predict_base +from ._train import train +from .core import LGBMModelType, LGBMScikitLearnBase + +try: + import lightgbm +except ImportError: + lightgbm = None + + +LGBMClassifier = make_import_error_func("lightgbm") +if lightgbm: + + class LGBMClassifier(LGBMScikitLearnBase, lightgbm.LGBMClassifier): + def fit( + self, + X, + y, + sample_weight=None, + init_score=None, + eval_set=None, + eval_sample_weight=None, + eval_init_score=None, + session=None, + run_kwargs=None, + **kwargs + ): + check_consistent_length(X, y, session=session, run_kwargs=run_kwargs) + params = self.get_params(True) + model = train( + params, + self._wrap_train_tuple(X, y, sample_weight, init_score), + eval_sets=self._wrap_eval_tuples( + eval_set, eval_sample_weight, eval_init_score + ), + model_type=LGBMModelType.CLASSIFIER, + session=session, + run_kwargs=run_kwargs, + **kwargs + ) + + self.set_params(**model.get_params()) + self._copy_extra_params(model, self) + return self + + def predict(self, X, **kwargs): + return predict_base(self, X, proba=False, **kwargs) + + def predict_proba(self, X, **kwargs): + return predict_base(self, X, proba=True, **kwargs) + + def to_local(self): + model = lightgbm.LGBMClassifier(**self.get_params()) + self._copy_extra_params(self, model) + return model diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/core.py b/python/xorbits/_mars/learn/contrib/lightgbm/core.py new file mode 100644 index 000000000..4adc198cc --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/core.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
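+# Shared scaffolding for the lightgbm wrappers: LGBMScikitLearnBase converts
+# numpy/pandas inputs into Mars tensors/dataframes, packs (data, label,
+# sample_weight, init_score) into TrainTuple objects consumed by train(), and can
+# also be constructed from an already fitted lightgbm model by copying its
+# parameters and extra fitted attributes.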
+ +import enum +import itertools +from collections import namedtuple + +import numpy as np +import pandas as pd + +from ....dataframe import DataFrame as MarsDataFrame +from ....dataframe import Series as MarsSeries +from ....tensor import tensor as mars_tensor + + +class LGBMModelType(enum.Enum): + CLASSIFIER = 0 + REGRESSOR = 1 + RANKER = 2 + + +_model_type_to_model = dict() + + +def get_model_cls_from_type(model_type: LGBMModelType): + import lightgbm + + if not _model_type_to_model: + _model_type_to_model.update( + { + LGBMModelType.CLASSIFIER: lightgbm.LGBMClassifier, + LGBMModelType.REGRESSOR: lightgbm.LGBMRegressor, + LGBMModelType.RANKER: lightgbm.LGBMRanker, + } + ) + return _model_type_to_model[model_type] + + +TrainTuple = namedtuple("TrainTuple", "data label sample_weight init_score") + + +class LGBMScikitLearnBase: + def __init__(self, *args, **kwargs): + if args and isinstance(args[0], self._get_lgbm_class()): + model = args[0] + super().__init__(**model.get_params()) + self._copy_extra_params(model, self) + else: + super().__init__(*args, **kwargs) + + @classmethod + def _get_lgbm_class(cls): + try: + return getattr(cls, "_lgbm_class") + except AttributeError: + lgbm_class = next( + base for base in cls.__bases__ if base.__module__.startswith("lightgbm") + ) + cls._lgbm_class = lgbm_class + return lgbm_class + + @classmethod + def _get_param_names(cls): + return cls._get_lgbm_class()._get_param_names() + + @staticmethod + def _copy_extra_params(source, dest): + params = source.get_params() + attributes = source.__dict__ + extra_param_names = set(attributes.keys()).difference(params.keys()) + for name in extra_param_names: + setattr(dest, name, attributes[name]) + + @staticmethod + def _convert_tileable(obj): + if isinstance(obj, np.ndarray): + return mars_tensor(obj) + elif isinstance(obj, pd.DataFrame): + return MarsDataFrame(obj) + elif isinstance(obj, pd.Series): + return MarsSeries(obj) + return obj + + @classmethod + def _wrap_train_tuple(cls, data, label, sample_weight=None, init_score=None): + data = cls._convert_tileable(data) + label = cls._convert_tileable(label) + sample_weight = cls._convert_tileable(sample_weight) + init_score = cls._convert_tileable(init_score) + return TrainTuple(data, label, sample_weight, init_score) + + @staticmethod + def _wrap_eval_tuples(eval_set=None, eval_sample_weight=None, eval_init_score=None): + if not eval_set: + return None + + tps = [] + for (data, label), weight, score in zip( + eval_set, + eval_sample_weight or itertools.repeat(None), + eval_init_score or itertools.repeat(None), + ): + tps.append(TrainTuple(data, label, weight, score)) + return tps + + def fit(self, X, y, sample_weight=None, **kwargs): + raise NotImplementedError + + def predict(self, X, **kwargs): + raise NotImplementedError + + def predict_proba(self, X, **kwargs): + raise NotImplementedError + + def load_model(self, model): + self.set_params(**self.get_params()) + self._copy_extra_params(model, self) + return self diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/ranker.py b/python/xorbits/_mars/learn/contrib/lightgbm/ranker.py new file mode 100644 index 000000000..3963d5123 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/ranker.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import check_consistent_length +from ..utils import make_import_error_func +from ._predict import predict_base +from ._train import train +from .core import LGBMModelType, LGBMScikitLearnBase + +try: + import lightgbm +except ImportError: + lightgbm = None + + +LGBMRanker = make_import_error_func("lightgbm") +if lightgbm: + + class LGBMRanker(LGBMScikitLearnBase, lightgbm.LGBMRanker): + def fit( + self, + X, + y, + sample_weight=None, + init_score=None, + group=None, + eval_set=None, + eval_sample_weight=None, + eval_init_score=None, + session=None, + run_kwargs=None, + **kwargs + ): + check_consistent_length(X, y, session=session, run_kwargs=run_kwargs) + params = self.get_params(True) + model = train( + params, + self._wrap_train_tuple(X, y, sample_weight, init_score), + eval_sets=self._wrap_eval_tuples( + eval_set, eval_sample_weight, eval_init_score + ), + group=group, + model_type=LGBMModelType.RANKER, + session=session, + run_kwargs=run_kwargs, + **kwargs + ) + + self.set_params(**model.get_params()) + self._copy_extra_params(model, self) + return self + + def predict(self, X, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", None) + return predict_base(self, X, session=session, run_kwargs=run_kwargs, **kw) + + def to_local(self): + model = lightgbm.LGBMRanker(**self.get_params()) + self._copy_extra_params(self, model) + return model diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/regressor.py b/python/xorbits/_mars/learn/contrib/lightgbm/regressor.py new file mode 100644 index 000000000..31ed81e4a --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/regressor.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
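+# Distributed counterpart of lightgbm.LGBMRegressor: fit() delegates to train()
+# with model_type REGRESSOR, and predict() converts X into a Mars tileable before
+# calling predict_base.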
+ +from ...utils import check_consistent_length +from ..utils import make_import_error_func +from ._predict import predict_base +from ._train import train +from .core import LGBMModelType, LGBMScikitLearnBase + +try: + import lightgbm +except ImportError: + lightgbm = None + + +LGBMRegressor = make_import_error_func("lightgbm") +if lightgbm: + + class LGBMRegressor(LGBMScikitLearnBase, lightgbm.LGBMRegressor): + def fit( + self, + X, + y, + sample_weight=None, + init_score=None, + eval_set=None, + eval_sample_weight=None, + eval_init_score=None, + session=None, + run_kwargs=None, + **kwargs + ): + check_consistent_length(X, y, session=session, run_kwargs=run_kwargs) + params = self.get_params(True) + model = train( + params, + self._wrap_train_tuple(X, y, sample_weight, init_score), + eval_sets=self._wrap_eval_tuples( + eval_set, eval_sample_weight, eval_init_score + ), + model_type=LGBMModelType.REGRESSOR, + session=session, + run_kwargs=run_kwargs, + **kwargs + ) + + self.set_params(**model.get_params()) + self._copy_extra_params(model, self) + return self + + def predict(self, X, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", None) + X = self._convert_tileable(X) + return predict_base(self, X, session=session, run_kwargs=run_kwargs, **kw) + + def to_local(self): + model = lightgbm.LGBMRegressor(**self.get_params()) + self._copy_extra_params(self, model) + return model diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/__init__.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py new file mode 100644 index 000000000..ce5db3332 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py @@ -0,0 +1,173 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt + +try: + import lightgbm + + from .. 
import LGBMClassifier +except ImportError: + lightgbm = LGBMClassifier = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X_raw = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y_raw = rs.rand(n_rows, chunk_size=chunk_size) +filter = rs.rand(n_rows, chunk_size=chunk_size) < 0.8 +X = X_raw[filter] +y = y_raw[filter] + +X_df = md.DataFrame(X) +x_sparse = np.random.rand(n_rows, n_columns) +x_sparse[np.arange(n_rows), np.random.randint(n_columns, size=n_rows)] = np.nan +X_sparse = mt.tensor(x_sparse, chunk_size=chunk_size).tosparse(missing=np.nan)[filter] + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_classifier(setup): + y_data = (y * 10).astype(mt.int32) + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X, y_data, eval_set=[(X, y_data)], verbose=True) + prediction = classifier.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + assert isinstance(prediction, mt.Tensor) + + # test sparse tensor + X_sparse_data = X_sparse + classifier = LGBMClassifier(n_estimators=2) + classifier.fit( + X_sparse_data, y_data, eval_set=[(X_sparse_data, y_data)], verbose=True + ) + prediction = classifier.predict(X_sparse_data) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + assert isinstance(prediction, mt.Tensor) + + prob = classifier.predict_proba(X) + assert prob.shape == X.shape + + prediction_empty = classifier.predict(mt.array([]).reshape((0, X.shape[1]))) + assert prediction_empty.shape == (0,) + + # test dataframe + X_df_data = X_df + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X_df_data, y_data, verbose=True) + prediction = classifier.predict(X_df_data) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + prob = classifier.predict_proba(X_df) + + assert prob.ndim == 2 + assert prob.shape == (len(X), 10) + + # test weight + weights = [mt.random.rand(X.shape[0]), md.Series(mt.random.rand(X.shape[0]))] + y_df = md.DataFrame(y_data) + for weight in weights: + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X, y_df, sample_weight=weight, verbose=True) + prediction = classifier.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + # should raise error if weight.ndim > 1 + with pytest.raises(ValueError): + LGBMClassifier(n_estimators=2).fit( + X, y_df, sample_weight=mt.random.rand(1, 1), verbose=True + ) + + # test binary classifier + new_y = (y_data > 0.5).astype(mt.int32) + classifier = LGBMClassifier(n_estimators=2) + classifier.fit(X, new_y, verbose=True) + + prediction = classifier.predict(X) + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + prediction = classifier.predict_proba(X) + assert prediction.ndim == 2 + assert prediction.shape[0] == len(X) + + # test with existing model + X_np = X.execute().fetch() + new_y_np = new_y.execute().fetch() + raw_classifier = lightgbm.LGBMClassifier(n_estimators=2) + raw_classifier.fit(X_np, new_y_np, verbose=True) + + classifier = LGBMClassifier(raw_classifier) + label_result = classifier.predict(X_df) + assert label_result.ndim == 1 + assert label_result.shape[0] == len(X) + + proba_result = classifier.predict_proba(X_df) + assert proba_result.ndim == 2 + assert proba_result.shape[0] == len(X) + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_classifier_from_to_parquet(setup): + n_rows = 1000 + n_columns = 10 + rs = np.random.RandomState(0) + X = 
rs.rand(n_rows, n_columns) + y = (rs.rand(n_rows) > 0.5).astype(np.int32) + df = pd.DataFrame(X, columns=[f"c{i}" for i in range(n_columns)]) + + # test with existing model + classifier = lightgbm.LGBMClassifier(n_estimators=2) + classifier.fit(X, y, verbose=True) + + with tempfile.TemporaryDirectory() as d: + result_dir = os.path.join(d, "result") + os.mkdir(result_dir) + data_dir = os.path.join(d, "data") + os.mkdir(data_dir) + + df.iloc[:500].to_parquet(os.path.join(d, "data", "data1.parquet")) + df.iloc[500:].to_parquet(os.path.join(d, "data", "data2.parquet")) + + df = md.read_parquet(data_dir) + model = LGBMClassifier() + model.load_model(classifier) + result = model.predict(df, run=False) + r = md.DataFrame(result).to_parquet(result_dir) + + r.execute() + + ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy() + expected = classifier.predict(X) + expected = np.stack([1 - expected, expected]).argmax(axis=0) + np.testing.assert_array_equal(ret, expected) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_ranker.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_ranker.py new file mode 100644 index 000000000..f060be69b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_ranker.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import tensor as mt + +try: + import lightgbm + + from .. 
import LGBMRanker +except ImportError: + lightgbm = LGBMRanker = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X_raw = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y_raw = rs.rand(n_rows, chunk_size=chunk_size) + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_ranker(setup): + y = (y_raw * 10).astype(mt.int32) + ranker = LGBMRanker(n_estimators=2) + ranker.fit(X_raw, y, group=[X_raw.shape[0]], verbose=True) + prediction = ranker.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + assert isinstance(prediction, mt.Tensor) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test weight + weight = mt.random.rand(X_raw.shape[0]) + ranker = LGBMRanker(verbosity=1, n_estimators=2) + ranker.fit(X_raw, y, group=[X_raw.shape[0]], sample_weight=weight) + prediction = ranker.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test local model + X_np = X_raw.execute().fetch() + y_np = y.execute().fetch() + raw_ranker = lightgbm.LGBMRanker(verbosity=1, n_estimators=2) + raw_ranker.fit(X_np, y_np, group=[X_raw.shape[0]]) + prediction = LGBMRanker(raw_ranker).predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_regressor.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_regressor.py new file mode 100644 index 000000000..22f00e5cb --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_regressor.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +from ..... import tensor as mt + +try: + import lightgbm + + from .. 
import LGBMRegressor +except ImportError: + lightgbm = LGBMRegressor = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.randint(0, 10, n_rows, chunk_size=chunk_size) + + +@pytest.mark.skipif(lightgbm is None, reason="LightGBM not installed") +def test_local_regressor(setup): + regressor = LGBMRegressor(n_estimators=2) + regressor.fit(X, y, verbose=True) + prediction = regressor.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + assert isinstance(prediction, mt.Tensor) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test weight + weight = mt.random.rand(X.shape[0]) + regressor = LGBMRegressor(verbosity=1, n_estimators=2) + regressor.fit(X, y, sample_weight=weight) + prediction = regressor.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + result = prediction.fetch() + assert prediction.dtype == result.dtype + + # test numpy tensor + try: + from sklearn.datasets import make_classification + + X_array, y_array = make_classification() + regressor = LGBMRegressor(n_estimators=2) + regressor.fit(X_array, y_array, verbose=True) + prediction = regressor.predict(X_array) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_array) + + X_df = pd.DataFrame(X_array) + y_df = pd.Series(y_array) + regressor = LGBMRegressor(n_estimators=2) + regressor.fit(X_df, y_df, verbose=True) + prediction = regressor.predict(X_df) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_df) + except ImportError: + pass + + # test existing model + X_np = X.execute().fetch() + y_np = y.execute().fetch() + raw_regressor = lightgbm.LGBMRegressor(verbosity=1, n_estimators=2) + raw_regressor.fit(X_np, y_np) + prediction = LGBMRegressor(raw_regressor).predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/__init__.py b/python/xorbits/_mars/learn/contrib/pytorch/__init__.py new file mode 100644 index 000000000..77f4deef9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dataset import MarsDataset # noqa: F401 # pylint: disable=unused-import +from .run_script import run_pytorch_script +from .sampler import ( # noqa: F401 # pylint: disable=unused-import + DistributedSampler, + RandomSampler, + SequentialSampler, + SubsetRandomSampler, +) + + +def register_op(): + from .run_script import RunPyTorch + + del RunPyTorch diff --git a/python/xorbits/_mars/learn/contrib/pytorch/dataset.py b/python/xorbits/_mars/learn/contrib/pytorch/dataset.py new file mode 100644 index 000000000..283b600b1 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/dataset.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import List + +import numpy as np +import pandas as pd + +try: + import torch + from torch.utils.data import Dataset +except ImportError: # pragma: no cover + torch = None + Dataset = object + +from .... import execute +from ....core.context import get_context +from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ....tensor.core import TENSOR_TYPE +from ....utils import require_not_none + +ACCEPT_TYPE = ( + TENSOR_TYPE, + DATAFRAME_TYPE, + SERIES_TYPE, + np.ndarray, + pd.DataFrame, + pd.Series, + List, +) + + +@require_not_none(torch) +class MarsDataset(Dataset): + r"""MarsDataset that inherit from torch.utils.data.Dataset. + It converts from Mars basic datatype such as Tensor, + DataFrame, Series. Additionally, it's constructor can receive + np.ndarray, pd.DataFrame, pd.Series type. + """ + + def __init__(self, *tileables, fetch_kwargs=None): + self._context = get_context() + self._tileables = tileables + self._fetch_kwargs = fetch_kwargs or dict() + self._executed = False + self._check_type() + + def _check_type(self): + for t in self._tileables: + if not isinstance(t, ACCEPT_TYPE): + raise TypeError(f"Unexpected dataset type: {type(t)}") + + def _execute(self): + execute_data = [t for t in self._tileables if isinstance(t, ACCEPT_TYPE[:3])] + if len(execute_data): + execute(execute_data) + + def __len__(self): + return self._tileables[0].shape[0] + + def __getitem__(self, index): + if not self._executed: + self._execute() + self._executed = True + return tuple(self.get_data(t, index) for t in self._tileables) + + def get_data(self, t, index): + fetch_kwargs = dict() + if self._fetch_kwargs: + fetch_kwargs = copy.deepcopy(self._fetch_kwargs) + + if isinstance(t, TENSOR_TYPE): + return t[index].fetch(**fetch_kwargs) + elif isinstance(t, np.ndarray): + return t[index] + elif isinstance(t, DATAFRAME_TYPE): + return t.iloc[index].fetch(**fetch_kwargs).values + elif isinstance(t, SERIES_TYPE): + return t.iloc[index].fetch(**fetch_kwargs) + elif isinstance(t, pd.DataFrame): + return t.iloc[index].values + elif isinstance(t, pd.Series): + return t.iloc[index] + else: + return t[index] diff --git a/python/xorbits/_mars/learn/contrib/pytorch/run_script.py b/python/xorbits/_mars/learn/contrib/pytorch/run_script.py new file mode 100644 index 000000000..7e15a60a8 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/run_script.py @@ -0,0 +1,162 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union + +import numpy as np + +from .... import opcodes as OperandDef +from ....core.context import get_context +from ....remote.run_script import RunScript, _extract_inputs +from ....serialization.serializables import Int32Field, StringField +from ....typing import SessionType, TileableType +from ....utils import to_binary +from ..utils import pick_workers + + +class RunPyTorch(RunScript): + _op_type_ = OperandDef.RUN_PYTORCH + + # used for chunk op + _master_port = Int32Field("master_port") + _master_addr = StringField("master_addr") + _rank = Int32Field("rank") + _init_method = StringField("init_method") + + def __init__( + self, master_port=None, master_addr=None, init_method=None, gpu=None, **kw + ): + super().__init__( + _master_port=master_port, + _master_addr=master_addr, + _init_method=init_method, + gpu=gpu, + **kw + ) + + @property + def master_port(self): + return self._master_port + + @property + def master_addr(self): + return self._master_addr + + @property + def init_method(self): + return self._init_method + + @classmethod + def tile(cls, op): + ctx = get_context() + + workers = pick_workers(ctx.get_worker_addresses(), op.world_size) + data, input_chunks = cls._get_chunk_data(op) + + out_chunks = [] + for i in range(op.world_size): + chunk_op = op.copy().reset_key() + chunk_op._data = data + chunk_op.expect_worker = workers[i] + if op.init_method is None: + chunk_op._master_port = op.master_port + chunk_op._master_addr = workers[0].split(":", 1)[0] + chunk_op._rank = i + chunk_op._init_method = op.init_method + out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,))) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + chunks=out_chunks, + nsplits=(tuple(np.nan for _ in range(len(out_chunks))),), + ) + + @classmethod + def _build_envs(cls, ctx, op): + envs = super()._build_envs(ctx, op) + if op.master_port is not None: + envs["MASTER_PORT"] = str(op.master_port) + if op.master_addr is not None: + envs["MASTER_ADDR"] = str(op.master_addr) + return envs + + @classmethod + def execute(cls, ctx, op): + assert ctx.local_address.split(":")[0] == op.expect_worker.split(":")[0] + + super().execute(ctx, op) + + +def run_pytorch_script( + script: Union[bytes, str, BinaryIO, TextIO], + n_workers: int, + data: Dict[str, TileableType] = None, + gpu: Optional[bool] = None, + command_argv: List[str] = None, + retry_when_fail: bool = False, + session: SessionType = None, + run_kwargs: Dict[str, Any] = None, + port: int = None, +): + """ + Run PyTorch script in Mars cluster. + + Parameters + ---------- + script: str or file-like object + Script to run + n_workers : int + Number of PyTorch workers + data : dict + Variable name to data. + gpu : bool + Run PyTorch script on GPU + command_argv : list + Extra command args for script + retry_when_fail : bool + If True, retry when function failed. + session + Mars session, if not provided, will use default one. + run_kwargs : dict + Extra kwargs for `session.run`. 
+ port : int + Port of PyTorch worker or ps, will automatically increase for the same worker + + Returns + ------- + status + return {'status': 'ok'} if succeeded, or error raised + """ + if int(n_workers) <= 0: + raise ValueError("n_workers should be at least 1") + if hasattr(script, "read"): + code = script.read() + else: + with open(os.path.abspath(script), "rb") as f: + code = f.read() + + inputs = _extract_inputs(data) + port = 29500 if port is None else port + op = RunPyTorch( + data=data, + code=to_binary(code), + world_size=int(n_workers), + retry_when_fail=retry_when_fail, + gpu=gpu, + master_port=port, + command_args=command_argv, + ) + return op(inputs).execute(session=session, **(run_kwargs or {})) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/sampler.py b/python/xorbits/_mars/learn/contrib/pytorch/sampler.py new file mode 100644 index 000000000..f783d7609 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/sampler.py @@ -0,0 +1,287 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Iterator, Optional, Sequence, Sized + +try: + import torch + from torch.utils.data import Sampler +except ImportError: # pragma: no cover + torch = None + Sampler = object + +from ....utils import require_not_none + + +@require_not_none(torch) +class SequentialSampler(Sampler): + r""" + Samples elements sequentially, always in the same order. + + Args: + data_source (Dataset): dataset to sample from + """ + data_source: Sized + + def __init__(self, data_source): + self.data_source = data_source + + def __iter__(self) -> Iterator[int]: + return iter(range(len(self.data_source))) + + def __len__(self) -> int: + return len(self.data_source) + + +@require_not_none(torch) +class RandomSampler(Sampler): + r""" + Samples elements randomly. If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify :attr:`num_samples` to draw. + + Args: + data_source (Dataset): dataset to sample from + replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False`` + num_samples (int): number of samples to draw, default=`len(dataset)`. This argument + is supposed to be specified only when `replacement` is ``True``. + generator (Generator): Generator used in sampling. + """ + data_source: Sized + replacement: bool + + def __init__( + self, data_source, replacement=False, num_samples=None, generator=None + ): + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.generator = generator + + if not isinstance(self.replacement, bool): + raise ValueError( + "replacement should be a boolean value, but got " + f"replacement={self.replacement}" + ) + + if self._num_samples is not None and not replacement: + raise ValueError( + "With replacement=False, num_samples should not be specified, " + "since a random permute will be performed." 
+ ) + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError( + "num_samples should be a positive integer " + f"value, but got num_samples={self.num_samples}" + ) + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + if self.generator is None: + generator = torch.Generator() + generator.manual_seed( + int(torch.empty((), dtype=torch.int64).random_().item()) + ) + else: + generator = self.generator + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint( + high=n, size=(32,), dtype=torch.int64, generator=generator + ).tolist() + yield from torch.randint( + high=n, + size=(self.num_samples % 32,), + dtype=torch.int64, + generator=generator, + ).tolist() + else: + yield from torch.randperm(n, generator=generator).tolist() + + def __len__(self) -> int: + return self.num_samples + + +@require_not_none(torch) +class SubsetRandomSampler(Sampler): + """ + Samples elements randomly from a given list of indices, without replacement. + + Args: + indices (sequence): a sequence of indices + generator (Generator): Generator used in sampling. + """ + + indices: Sequence[int] + + def __init__(self, indices: Sequence[int], generator=None) -> None: + self.indices = indices + self.generator = generator + + def __iter__(self) -> Iterator[int]: + return ( + self.indices[i] + for i in torch.randperm(len(self.indices), generator=self.generator) + ) + + def __len__(self) -> int: + return len(self.indices) + + +@require_not_none(torch) +class DistributedSampler(Sampler): + r""" + Sampler that restricts data loading to a subset of the dataset. + + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such a case, each + process can pass a :class:`~torch.utils.data.DistributedSampler` instance as a + :class:`~torch.utils.data.DataLoader` sampler, and load a subset of the + original dataset that is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Args: + dataset: Dataset used for sampling. + num_replicas (int, optional): Number of processes participating in + distributed training. By default, :attr:`world_size` is retrieved from the + current distributed group. + rank (int, optional): Rank of the current process within :attr:`num_replicas`. + By default, :attr:`rank` is retrieved from the current distributed + group. + shuffle (bool, optional): If ``True`` (default), sampler will shuffle the + indices. + seed (int, optional): random seed used to shuffle the sampler if + :attr:`shuffle=True`. This number should be identical across all + processes in the distributed group. Default: ``0``. + drop_last (bool, optional): if ``True``, then the sampler will drop the + tail of the data to make it evenly divisible across the number of + replicas. If ``False``, the sampler will add extra indices to make + the data evenly divisible across the replicas. Default: ``False``. + + .. warning:: + In distributed mode, calling the :meth:`set_epoch` method at + the beginning of each epoch **before** creating the :class:`DataLoader` iterator + is necessary to make shuffling work properly across multiple epochs. Otherwise, + the same ordering will be always used. + + Example:: + + >>> sampler = DistributedSampler(dataset) if is_distributed else None + >>> loader = DataLoader(dataset, shuffle=(sampler is None), + ... 
sampler=sampler) + >>> for epoch in range(start_epoch, n_epochs): + ... if is_distributed: + ... sampler.set_epoch(epoch) + ... train(loader) + """ + + def __init__( + self, + dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + drop_last: bool = False, + ) -> None: + import torch.distributed as dist + + if num_replicas is None: # pragma: no cover + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: # pragma: no cover + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + if rank >= num_replicas or rank < 0: + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1) + ) + + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.drop_last = drop_last + # If the dataset length is evenly divisible by # of replicas, then there + # is no need to drop any data, since the dataset will be split equally. + if self.drop_last and len(self.dataset) % self.num_replicas != 0: + # Split to nearest available length that is evenly divisible. + # This is to ensure each rank receives the same amount of data when + # using this Sampler. + self.num_samples = math.ceil( + (len(self.dataset) - self.num_replicas) / self.num_replicas + ) + else: + self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + self.seed = seed + + def generate_indices(self): + if self.shuffle: + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + if not self.drop_last: + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size <= len(indices): + indices += indices[:padding_size] + else: + indices += (indices * math.ceil(padding_size / len(indices)))[ + :padding_size + ] + else: + # remove tail of data to make it evenly divisible. + indices = indices[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return indices + + def __iter__(self): + return iter(self.generate_indices()) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch: int): + r"""Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas + use a different random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/__init__.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_dataset.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_dataset.py new file mode 100644 index 000000000..737472ab2 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_dataset.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + + +def get_model(): + import torch.nn as nn + + return nn.Sequential( + nn.Linear(32, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 10), + nn.Softmax(), + ) + + +def main(feature_data, labels): + import torch.distributed as dist + import torch.nn as nn + import torch.optim as optim + import torch.utils.data + from mars.learn.contrib.pytorch import DistributedSampler, MarsDataset + + dist.init_process_group(backend="gloo") + torch.manual_seed(42) + + data = feature_data + labels = labels + + train_dataset = MarsDataset(data, labels) + assert len(train_dataset) == 1000 + + train_sampler = DistributedSampler(train_dataset, shuffle=True) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, + batch_size=32, + shuffle=(train_sampler is None), + sampler=train_sampler, + ) + + model = nn.parallel.DistributedDataParallel(get_model()) + optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = nn.BCELoss() + + for i in range(2): + # 2 epochs + train_sampler.set_epoch(i) + running_loss = 0.0 + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + running_loss += loss.item() + print(f"running_loss is {loss.item()}") + + print("Done!") + + +if __name__ == "__main__": + assert len(sys.argv) == 2 + assert sys.argv[1] == "multiple" + feature_data = globals()["feature_data"] + labels = globals()["labels"] + main(feature_data, labels) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_sample.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_sample.py new file mode 100644 index 000000000..b6f298744 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/pytorch_sample.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + + +def get_model(): + import torch.nn as nn + + return nn.Sequential( + nn.Linear(32, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 10), + nn.Softmax(), + ) + + +def main(): + import torch.distributed as dist + import torch.nn as nn + import torch.optim as optim + import torch.utils.data + + dist.init_process_group(backend="gloo") + torch.manual_seed(42) + + data = torch.rand((1000, 32), dtype=torch.float32) + labels = torch.randint(1, (1000, 10), dtype=torch.float32) + + train_dataset = torch.utils.data.TensorDataset(data, labels) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, shuffle=False, sampler=train_sampler + ) + + model = nn.parallel.DistributedDataParallel(get_model()) + optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = nn.BCELoss() + + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +if __name__ == "__main__": + assert len(sys.argv) == 2 + assert sys.argv[1] == "multiple" + main() diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/test_dataset.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_dataset.py new file mode 100644 index 000000000..d70d5463c --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_dataset.py @@ -0,0 +1,325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt +from .....utils import lazy_import +from .. 
import ( + DistributedSampler, + MarsDataset, + RandomSampler, + SequentialSampler, + SubsetRandomSampler, + run_pytorch_script, +) + +torch_installed = lazy_import("torch") is not None + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_mars_dataset(setup): + import numpy as np + import pandas as pd + from torch.utils.data import Dataset + + # Mars tensor + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + data_verify = data[1].execute().fetch() + labels_verify = labels[1].execute().fetch() + + train_dataset = MarsDataset(data, labels) + + assert isinstance(train_dataset, Dataset) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # np ndarray + data = np.random.rand(1000, 32) + labels = np.random.randint(0, 2, (1000, 10)) + + data_verify = data[1] + labels.dtype = "float32" + labels_verify = labels[1] + + train_dataset = MarsDataset(data, labels) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # Mars dataframe + data = md.DataFrame(data) + labels = md.DataFrame(labels) + + data_verify = data.iloc[1].execute().fetch().values + labels_verify = labels.iloc[1].execute().fetch().values + + train_dataset = MarsDataset( + data, labels, fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # Mars Series + label = labels[1] + + label_verify = label[1].execute().fetch() + + train_dataset = MarsDataset( + data, label, fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + assert train_dataset[1][1] == label_verify + assert len(train_dataset) == 1000 + + # pandas dataframe + data = pd.DataFrame(np.random.rand(1000, 32)) + labels = pd.DataFrame(np.random.randint(0, 2, (1000, 10)), dtype="float32") + + data_verify = data.iloc[1].values + labels_verify = labels.iloc[1].values + + train_dataset = MarsDataset(data, labels) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # pands series + label = labels[1] + label_verify = label[1] + + train_dataset = MarsDataset(data, label) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + assert train_dataset[1][1] == label_verify + assert len(train_dataset) == 1000 + + # test TypeError + label = tuple(range(1000)) + + with pytest.raises(TypeError) as e: + train_dataset = MarsDataset(data, label) + exec_msg = e.value.args[0] + assert exec_msg == "Unexpected dataset type: " + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_sequential_sampler(setup_cluster): + import torch + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + train_dataset = MarsDataset(data, labels) + assert len(train_dataset) == 1000 + + train_sampler = SequentialSampler(train_dataset) + assert len(train_sampler) == 1000 + + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + + model = torch.nn.Sequential( + 
torch.nn.Linear(32, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 10), + torch.nn.Softmax(dim=1), + ) + + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = torch.nn.BCELoss() + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_random_sampler(setup_cluster): + import torch + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + train_dataset = MarsDataset(data, labels) + + # test __init__() + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, replacement=1) + exec_msg = e.value.args[0] + assert exec_msg == "replacement should be a boolean value, but got replacement=1" + + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, num_samples=900) + exec_msg = e.value.args[0] + assert ( + exec_msg + == "With replacement=False, num_samples should not " + + "be specified, since a random permute will be performed." + ) + + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=-1) + exec_msg = e.value.args[0] + assert ( + exec_msg + == "num_samples should be a positive integer value, but got num_samples=-1" + ) + + train_sampler = RandomSampler(train_dataset) + + # test __len__ num_samples() + assert len(train_sampler) == 1000 + assert train_sampler.num_samples == 1000 + + # test __iter__ + g_cpu = torch.Generator() + g_cpu.manual_seed(2147483647) + + train_sampler = RandomSampler(train_dataset, generator=g_cpu) + assert len(train_sampler) == 1000 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=900) + assert len(train_sampler) == 900 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + # torch train + model = torch.nn.Sequential( + torch.nn.Linear(32, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 10), + torch.nn.Softmax(dim=1), + ) + + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = torch.nn.BCELoss() + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_subset_random_sampler(setup_cluster): + import numpy as np + import torch + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + data.execute() + labels.execute() + + train_dataset = MarsDataset(data, labels) + train_sampler = SubsetRandomSampler( + np.random.choice(range(len(train_dataset)), len(train_dataset)) + ) + + assert len(train_sampler) == 1000 + 
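+    # Note: SubsetRandomSampler only permutes the indices it is given; since
+    # np.random.choice above draws with replacement, individual rows may be
+    # visited more than once (or skipped) within a single pass over the data.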
train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_distributed_sampler(setup_cluster): + import torch + + data = mt.random.rand(1001, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1001, 10), dtype="f4") + + train_dataset = MarsDataset(data, labels) + + with pytest.raises(ValueError) as e: + train_sampler = DistributedSampler(train_dataset, num_replicas=2, rank=-1) + exec_msg = e.value.args[0] + assert exec_msg == "Invalid rank -1, rank should be in the interval [0, 1]" + + train_sampler = DistributedSampler( + train_dataset, num_replicas=2, rank=0, drop_last=True, shuffle=True + ) + assert len(train_sampler) == 500 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + train_sampler = DistributedSampler( + train_dataset, num_replicas=2, rank=0, drop_last=False, shuffle=False + ) + train_sampler.set_epoch(10) + assert len(train_sampler) == 501 + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=32, sampler=train_sampler + ) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_mars_dataset_script(setup_cluster): + sess = setup_cluster + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "pytorch_dataset.py" + ) + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + assert ( + run_pytorch_script( + path, + n_workers=2, + data={"feature_data": data, "labels": labels}, + command_argv=["multiple"], + port=9945, + session=sess, + ).fetch()["status"] + == "ok" + ) diff --git a/python/xorbits/_mars/learn/contrib/pytorch/tests/test_run_script.py b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_run_script.py new file mode 100644 index 000000000..811c13ab7 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/pytorch/tests/test_run_script.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from .....utils import lazy_import +from .. 
import run_pytorch_script + +torch_installed = lazy_import("torch") is not None + + +@pytest.mark.skipif(not torch_installed, reason="pytorch not installed") +def test_distributed_run_py_torch_script(setup_cluster): + sess = setup_cluster + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "pytorch_sample.py") + assert ( + run_pytorch_script( + path, n_workers=2, command_argv=["multiple"], port=9945, session=sess + ).fetch()["status"] + == "ok" + ) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/__init__.py b/python/xorbits/_mars/learn/contrib/statsmodels/__init__.py new file mode 100644 index 000000000..13eeef52a --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MarsDistributedModel, MarsResults + + +def register_op(): + from . import predict, train + + del train, predict diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/api.py b/python/xorbits/_mars/learn/contrib/statsmodels/api.py new file mode 100644 index 000000000..515e7b0c1 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/api.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
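+
+# MarsDistributedModel is a thin wrapper around statsmodels'
+# DistributedModel: endog/exog are split into partitions, each partition is
+# fitted as a separate Mars task (see train.py), and the partial estimates
+# are joined back into a single statsmodels results object, which MarsResults
+# exposes through the usual attribute access and predict() API.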
+ +import pickle # nosec # pylint: disable=import_pickle + +from .predict import StatsModelsPredict +from .train import StatsModelsTrain + +try: + from statsmodels.base.distributed_estimation import DistributedModel +except ImportError: + DistributedModel = None + + +class MarsDistributedModel: + def __init__( + self, + factor=None, + num_partitions=None, + model_class=None, + init_kwds=None, + estimation_method=None, + estimation_kwds=None, + join_method=None, + join_kwds=None, + results_class=None, + results_kwds=None, + ): + self._factor = factor + self._sm_model = DistributedModel( + num_partitions or 10, + model_class=model_class, + init_kwds=init_kwds, + estimation_method=estimation_method, + estimation_kwds=estimation_kwds, + join_method=join_method, + join_kwds=join_kwds, + results_class=results_class, + results_kwds=results_kwds, + ) + + def fit(self, endog, exog, session=None, **kwargs): + num_partitions = None if self._factor is not None else self._sm_model.partitions + run_kwargs = kwargs.pop("run_kwargs", dict()) + op = StatsModelsTrain( + endog=endog, + exog=exog, + num_partitions=num_partitions, + factor=self._factor, + model_class=self._sm_model.model_class, + init_kwds=self._sm_model.init_kwds, + fit_kwds=kwargs, + estimation_method=self._sm_model.estimation_method, + estimation_kwds=self._sm_model.estimation_kwds, + join_method=self._sm_model.join_method, + join_kwds=self._sm_model.join_kwds, + results_class=self._sm_model.results_class, + results_kwds=self._sm_model.results_kwds, + ) + result = ( + op(exog, endog) + .execute(session=session, **run_kwargs) + .fetch(session=session) + ) + return MarsResults(pickle.loads(result)) # nosec + + +class MarsResults: + def __init__(self, model): + self._model = model + + @property + def model(self): + return self._model + + def __getattr__(self, item): + if item == "_model": + raise AttributeError + return getattr(self._model, item) + + def __mars_tokenize__(self): + return pickle.dumps(self.model) + + def predict(self, exog, *args, **kwargs): + session = kwargs.pop("session", None) + run_kwargs = kwargs.pop("run_kwargs", dict()) + op = StatsModelsPredict( + model_results=self, predict_args=args, predict_kwargs=kwargs + ) + return op(exog).execute(session=session, **run_kwargs) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/predict.py b/python/xorbits/_mars/learn/contrib/statsmodels/predict.py new file mode 100644 index 000000000..1dbd84a97 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/predict.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle # nosec # pylint: disable=import_pickle + +import numpy as np + +from .... 
import opcodes +from ....core import OutputType, recursive_tile +from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ....serialization.serializables import BytesField, DictField, TupleField +from ...operands import LearnOperand, LearnOperandMixin + + +class StatsModelsPredict(LearnOperand, LearnOperandMixin): + _op_code_ = opcodes.STATSMODELS_PREDICT + + _model_results = BytesField( + "model_results", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + _predict_args = TupleField("predict_args") + _predict_kwargs = DictField("predict_kwargs") + + def __init__( + self, model_results=None, predict_args=None, predict_kwargs=None, **kw + ): + super().__init__( + _model_results=model_results, + _predict_args=predict_args, + _predict_kwargs=predict_kwargs, + **kw + ) + + @property + def model_results(self): + return self._model_results + + @property + def predict_args(self) -> tuple: + return self._predict_args + + @property + def predict_kwargs(self) -> dict: + return self._predict_kwargs + + def __call__(self, exog): + if isinstance(exog, (DATAFRAME_TYPE, SERIES_TYPE)): + self._output_types = [OutputType.series] + kwargs = dict( + shape=exog.shape[:1], + index_value=exog.index_value, + dtype=np.dtype("float"), + ) + else: + self._output_types = [OutputType.tensor] + kwargs = dict( + shape=exog.shape[:1], + dtype=np.dtype("float"), + ) + return self.new_tileable([exog], **kwargs) + + @classmethod + def tile(cls, op: "StatsModelsPredict"): + exog = op.inputs[0] + out = op.outputs[0] + + if exog.ndim > 1 and exog.chunk_shape[1] > 1: + exog = yield from recursive_tile(exog.rechunk({1: exog.shape[1]})) + + chunks = [] + for in_chunk in exog.chunks: + if isinstance(exog, (DATAFRAME_TYPE, SERIES_TYPE)): + kwargs = dict( + index=in_chunk.index[:1], + shape=in_chunk.shape[:1], + index_value=in_chunk.index_value, + dtype=out.dtype, + ) + else: + kwargs = dict( + index=in_chunk.index[:1], + shape=in_chunk.shape[:1], + dtype=out.dtype, + ) + + new_op = op.copy().reset_key() + chunks.append(new_op.new_chunk([in_chunk], **kwargs)) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + [exog], chunks=chunks, nsplits=(exog.nsplits[0],), **out.params + ) + + @classmethod + def execute(cls, ctx, op: "StatsModelsPredict"): + in_data = ctx[op.inputs[0].key] + ctx[op.outputs[0].key] = op.model_results.model.predict( + in_data, *op.predict_args, **op.predict_kwargs + ) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/tests/__init__.py b/python/xorbits/_mars/learn/contrib/statsmodels/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
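For reference, a minimal usage sketch of the distributed statsmodels API added above, mirroring the test that follows. It assumes a running Mars session with statsmodels installed and uses the standalone `mars.learn.contrib.statsmodels` import path (inside this repository the same package lives under `xorbits._mars.learn.contrib.statsmodels`); the data shapes and the `alpha` value are illustrative only.

import mars.tensor as mt
from mars.learn.contrib.statsmodels import MarsDistributedModel

rs = mt.random.RandomState(0)
X = rs.rand(1000, 10, chunk_size=200)                       # exogenous design matrix
y = (rs.rand(1000, chunk_size=200) * 10).astype(mt.int32)   # endogenous response

# `factor` scales the number of partitions with the cluster's CPU count;
# pass `num_partitions` instead to fix the partition count explicitly.
model = MarsDistributedModel(factor=1.2)
results = model.fit(y, X, alpha=0.2)   # endog first, then exog; extra kwargs become fit kwds
prediction = results.predict(X)        # predict() executes eagerly and returns a Mars object
print(prediction.fetch()[:5])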
diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/tests/test_statsmodels.py b/python/xorbits/_mars/learn/contrib/statsmodels/tests/test_statsmodels.py new file mode 100644 index 000000000..acbf12432 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/tests/test_statsmodels.py @@ -0,0 +1,48 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import tensor as mt + +try: + import statsmodels + + from .. import MarsDistributedModel, MarsResults +except ImportError: # pragma: no cover + statsmodels = MarsDistributedModel = MarsResults = None + + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) +filter = rs.rand(n_rows, chunk_size=chunk_size) < 0.8 +X = X[filter] +y = y[filter] + + +@pytest.mark.skipif(statsmodels is None, reason="statsmodels not installed") +def test_distributed_stats_models(setup): + y_data = (y * 10).astype(mt.int32) + model = MarsDistributedModel(factor=1.2) + result = model.fit(y_data, X, alpha=0.2) + prediction = result.predict(X) + + X.execute() + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) diff --git a/python/xorbits/_mars/learn/contrib/statsmodels/train.py b/python/xorbits/_mars/learn/contrib/statsmodels/train.py new file mode 100644 index 000000000..9aa8fd53b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/statsmodels/train.py @@ -0,0 +1,244 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle # nosec # pylint: disable=import_pickle + +import cloudpickle + +from .... 
import opcodes +from ....core import OutputType, recursive_tile +from ....core.context import get_context +from ....core.operand import MergeDictOperand, OperandStage +from ....serialization.serializables import ( + BytesField, + DictField, + Float32Field, + FunctionField, + Int32Field, + KeyField, +) +from ....utils import has_unknown_shape + + +class StatsModelsTrain(MergeDictOperand): + _op_type_ = opcodes.STATSMODELS_TRAIN + + _exog = KeyField("exog") # exogenous + _endog = KeyField("endog") # endogenous + + _num_partitions = Int32Field("num_partitions") + _partition_id = Int32Field("partition_id") + _factor = Float32Field("factor") + _model_class = BytesField( + "model_class", on_serialize=cloudpickle.dumps, on_deserialize=cloudpickle.loads + ) + _init_kwds = DictField("init_kwds") + _fit_kwds = DictField("fit_kwds") + _estimation_method = FunctionField("estimation_method") + _estimation_kwds = DictField("estimation_kwds") + _join_method = FunctionField("join_method") + _join_kwds = DictField("join_kwds") + _results_class = BytesField( + "results_class", + on_serialize=cloudpickle.dumps, + on_deserialize=cloudpickle.loads, + ) + _results_kwds = DictField("results_kwds") + + def __init__( + self, + exog=None, + endog=None, + num_partitions=None, + partition_id=None, + factor=None, + model_class=None, + init_kwds=None, + fit_kwds=None, + estimation_method=None, + estimation_kwds=None, + join_method=None, + join_kwds=None, + results_class=None, + results_kwds=None, + **kw + ): + super().__init__( + _exog=exog, + _endog=endog, + _num_partitions=num_partitions, + _partition_id=partition_id, + _factor=factor, + _model_class=model_class, + _init_kwds=init_kwds, + _fit_kwds=fit_kwds, + _estimation_method=estimation_method, + _estimation_kwds=estimation_kwds, + _join_method=join_method, + _join_kwds=join_kwds, + _results_class=results_class, + _results_kwds=results_kwds, + **kw + ) + + @property + def exog(self): + return self._exog + + @property + def endog(self): + return self._endog + + @property + def num_partitions(self): + return self._num_partitions + + @property + def partition_id(self): + return self._partition_id + + @property + def factor(self): + return self._factor + + @property + def model_class(self): + return self._model_class + + @property + def init_kwds(self) -> dict: + return self._init_kwds + + @property + def fit_kwds(self) -> dict: + return self._fit_kwds + + @property + def estimation_method(self): + return self._estimation_method + + @property + def estimation_kwds(self) -> dict: + return self._estimation_kwds + + @property + def join_method(self): + return self._join_method + + @property + def join_kwds(self) -> dict: + return self._join_kwds + + @property + def results_class(self): + return self._results_class + + @property + def results_kwds(self) -> dict: + return self._results_kwds + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(inputs) + self._exog = next(inputs_iter) + self._endog = next(inputs_iter) + + def __call__(self, exog, endog): + self._output_types = [OutputType.object] + return self.new_tileable([exog, endog]) + + @classmethod + def tile(cls, op: "StatsModelsTrain"): + if op.factor is not None: + ctx = get_context() + cluster_cpu_count = ctx.get_total_n_cpu() + assert cluster_cpu_count > 0 + num_partitions = int(cluster_cpu_count * op.factor) + else: + num_partitions = op.num_partitions + + if has_unknown_shape(op.exog, op.endog): + yield + + exog = op.exog + if exog.ndim > 1 and exog.chunk_shape[1] > 1: + exog = 
exog.rechunk({1: exog.shape[1]}) + exog = yield from recursive_tile(exog.rebalance(num_partitions=num_partitions)) + endog = yield from recursive_tile( + op.endog.rebalance(num_partitions=num_partitions) + ) + + assert len(exog.chunks) == len(endog.chunks) + + # generate map stage + map_chunks = [] + for part_id, (exog_chunk, endog_chunk) in enumerate( + zip(exog.chunks, endog.chunks) + ): + new_op = op.copy().reset_key() + new_op._factor = None + new_op._partition_id = part_id + new_op._num_partitions = num_partitions + new_op.stage = OperandStage.map + + map_chunks.append( + new_op.new_chunk([exog_chunk, endog_chunk], index=exog_chunk.index) + ) + + # generate combine (join) stage + new_op = op.copy().reset_key() + new_op._factor = None + new_op._num_partitions = num_partitions + new_op.stage = OperandStage.combine + + combine_chunk = new_op.new_chunk(map_chunks, index=(0,)) + + # generate tileable + new_op = op.copy().reset_key() + return new_op.new_tileables(op.inputs, chunks=[combine_chunk]) + + @classmethod + def _execute_map(cls, ctx, op: "StatsModelsTrain"): + endog = ctx[op.endog.key] + exog = ctx[op.exog.key] + + # code from statsmodels.base.distributed_estimation::_helper_fit_partition + model = op.model_class(endog, exog, **op.init_kwds) + results = op.estimation_method( + model, + op.partition_id, + op.num_partitions, + fit_kwds=op.fit_kwds, + **op.estimation_kwds + ) + ctx[op.outputs[0].key] = pickle.dumps(results) + + @classmethod + def _execute_combine(cls, ctx, op: "StatsModelsTrain"): + # code from statsmodels.base.distributed_estimation::DistributedModel.fit + results_list = [pickle.loads(ctx[inp.key]) for inp in op.inputs] # nosec + params = op.join_method(results_list, **op.join_kwds) + res_mod = op.model_class([0], [0], **op.init_kwds) + result = op.results_class(res_mod, params, **op.results_kwds) + + ctx[op.outputs[0].key] = pickle.dumps(result) + + @classmethod + def execute(cls, ctx, op: "StatsModelsTrain"): + if op.merge: # pragma: no cover + super().execute(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + cls._execute_map(ctx, op) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/__init__.py b/python/xorbits/_mars/learn/contrib/tensorflow/__init__.py new file mode 100644 index 000000000..2c1651bc7 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dataset import ( # noqa: F401 # pylint: disable=unused-import + gen_tensorflow_dataset, +) +from .run_script import run_tensorflow_script + + +def register_op(): + from .run_script import RunTensorFlow + + del RunTensorFlow diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/dataset.py b/python/xorbits/_mars/learn/contrib/tensorflow/dataset.py new file mode 100644 index 000000000..e6ef3a92b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/dataset.py @@ -0,0 +1,195 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +from typing import List, Tuple + +import numpy as np +import pandas as pd + +from .... import execute +from ....core.context import get_context +from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ....tensor.core import TENSOR_TYPE +from ....utils import lazy_import, require_not_none + +tf = lazy_import("tensorflow") + + +ACCEPT_TYPE = ( + TENSOR_TYPE, + DATAFRAME_TYPE, + SERIES_TYPE, + np.ndarray, + pd.DataFrame, + pd.Series, + List, +) + + +@require_not_none(tf) +class MarsDataset: + def __init__( + self, + tensors, + output_shapes: Tuple[int, ...] = None, + output_types: Tuple[np.dtype, ...] = None, + fetch_kwargs=None, + ): + self._context = get_context() + self._tensors = tensors + self._output_shapes = output_shapes + self._output_types = output_types + self._fetch_kwargs = fetch_kwargs or dict() + self._executed = False + self._check_and_convert() + + def _check_and_convert(self): + if not isinstance(self._tensors, Tuple): + self._tensors = (self._tensors,) + for t in self._tensors: + if not isinstance(t, ACCEPT_TYPE): + raise TypeError(f"Unexpected dataset type: {type(t)}") + + if not self._executed: + self._execute() + self._executed = True + + if not self._output_shapes: + get_shape = ( + lambda t: tuple(()) + if isinstance(t, (List, SERIES_TYPE, pd.Series)) + else t.shape[1:] + ) + self._output_shapes = ( + get_shape(self._tensors[0]) + if len(self._tensors) == 1 + else tuple(get_shape(t) for t in self._tensors) + ) + + if not self._output_types: + get_type = ( + lambda t: type(t[0]) + if isinstance(t, List) + else t[0].dtype + if isinstance(t, (DATAFRAME_TYPE, pd.DataFrame)) + else t.dtype + ) + self._output_types = ( + get_type(self._tensors[0]) + if len(self._tensors) == 1 + else tuple(tf.as_dtype(get_type(t)) for t in self._tensors) + ) + + def _execute(self): # pragma: no cover + execute_data = [t for t in self._tensors if isinstance(t, ACCEPT_TYPE[:3])] + + if len(execute_data) > 0: + execute(execute_data) + + def get_data(self, t, index): # pragma: no cover + # coverage not included as now there is no solution to cover tensorflow methods + # see https://github.com/tensorflow/tensorflow/issues/33759 for more details. + fetch_kwargs = dict() + if self._fetch_kwargs: + fetch_kwargs = copy.deepcopy(self._fetch_kwargs) + + if isinstance(t, TENSOR_TYPE): + return t[index].fetch(**fetch_kwargs) + elif isinstance(t, np.ndarray): + return t[index] + elif isinstance(t, DATAFRAME_TYPE): + return t.iloc[index].fetch(**fetch_kwargs).values + elif isinstance(t, SERIES_TYPE): + return t.iloc[index].fetch(**fetch_kwargs) + elif isinstance(t, pd.DataFrame): + return t.iloc[index].values + elif isinstance(t, pd.Series): + return t.iloc[index] + else: + return t[index] + + def to_tf(self) -> "tf.data.Dataset": + """Get TF Dataset. 
+ + Convert the Mars objects into a tensorflow.data.Dataset. + """ + + def make_generator(): # pragma: no cover + if not self._executed: + self._execute() + self._executed = True + + for i in range(len(self._tensors[0])): + if len(self._tensors) == 1: + yield self.get_data(self._tensors[0], i) + else: + yield tuple(self.get_data(t, i) for t in self._tensors) + + return tf.data.Dataset.from_generator( + make_generator, + output_types=self._output_types, + output_shapes=self._output_shapes, + ) + + +def gen_tensorflow_dataset( + tensors, + output_shapes: Tuple[int, ...] = None, + output_types: Tuple[np.dtype, ...] = None, + fetch_kwargs=None, +): + """ + Convert Mars data types to a tf.data.Dataset. Note this is based on TensorFlow 2.0. + + Examples + -------- + >>> # convert a tensor to tf.data.Dataset. + >>> data = mt.tensor([[1, 2], [3, 4]]) + >>> dataset = gen_tensorflow_dataset(data) + >>> list(dataset.as_numpy_iterator()) + [array([1, 2]), array([3, 4])] + >>> dataset.element_spec + TensorSpec(shape=(2,), dtype=tf.int64, name=None) + + >>> # convert a tuple of tensors to tf.data.Dataset. + >>> data1 = mt.tensor([1, 2]); data2 = mt.tensor([3, 4]); data3 = mt.tensor([5, 6]) + >>> dataset = gen_tensorflow_dataset((data1, data2, data3)) + >>> list(dataset.as_numpy_iterator()) + [(1, 3, 5), (2, 4, 6)] + + Parameters + ---------- + tensors: Mars data type or a tuple of Mars data types + the data to convert into a tf.data.Dataset + output_shapes: + A (nested) structure of `tf.TensorShape` objects corresponding to + each component of an element yielded from the Mars objects. + output_types: + A (nested) structure of `tf.DType` objects corresponding to each + component of an element yielded from the Mars objects. + fetch_kwargs: + extra parameters passed to the fetch() operation of the Mars objects. + + Returns + ------- + tf.data.Dataset + """ + mars_dataset = MarsDataset( + tensors, + output_shapes=output_shapes, + output_types=output_types, + fetch_kwargs=fetch_kwargs, + ) + + return mars_dataset.to_tf() diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/run_script.py b/python/xorbits/_mars/learn/contrib/tensorflow/run_script.py new file mode 100644 index 000000000..221ade0f5 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/run_script.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union + +import numpy as np + +from ....
import opcodes as OperandDef +from ....core import recursive_tile +from ....core.context import get_context +from ....remote.run_script import RunScript, _extract_inputs +from ....serialization.serializables import ( + BytesField, + DictField, + Int32Field, + StringField, +) +from ....typing import SessionType, TileableType +from ....utils import to_binary +from ...utils import collect_ports +from ..utils import pick_workers + + +class RunTensorFlow(RunScript): + _op_type_ = OperandDef.RUN_TENSORFLOW + + _code = BytesField("code") + _n_workers = Int32Field("n_workers") + _n_ps = Int32Field("n_ps") + _tf_config = DictField("tf_config") + _port = Int32Field("port") + # used for chunk op + _tf_task_type = StringField("tf_task_type") + _tf_task_index = Int32Field("tf_task_index") + + def __init__( + self, + n_workers=None, + n_ps=None, + tf_config=None, + port=None, + tf_task_type=None, + tf_task_index=None, + gpu=None, + **kw, + ): + super().__init__( + _n_workers=n_workers, + _n_ps=n_ps, + _tf_config=tf_config, + _port=port, + _tf_task_type=tf_task_type, + _tf_task_index=tf_task_index, + gpu=gpu, + **kw, + ) + + @property + def n_workers(self): + return self._n_workers + + @property + def n_ps(self): + return self._n_ps or 0 + + @property + def n_roles(self): + return self._n_workers + self._n_ps + + @property + def tf_config(self): + return self._tf_config + + @property + def port(self): + return self._port + + @property + def tf_task_type(self): + return self._tf_task_type + + @property + def tf_task_index(self): + return self._tf_task_index + + @classmethod + def tile(cls, op): + ctx = get_context() + + cluster_conf = {"worker": []} + if op.n_ps > 0: + cluster_conf["ps"] = [] + n_workers = op.n_workers + + out_chunks = [] + worker_addresses = ctx.get_worker_addresses() + picked_workers = pick_workers(worker_addresses, op.n_roles) + data, input_chunks = cls._get_chunk_data(op) + + ports = yield from recursive_tile(collect_ports(worker_addresses)) + yield ports.chunks + ports = ctx.get_chunks_result([ports.chunks[0].key])[0] + + i = 0 + for worker, port in zip(picked_workers, ports): + worker_addr = worker.rsplit(":", 1)[0] + chunk_op = op.copy().reset_key() + chunk_op._data = data + addr = f"{worker_addr}:{port}" + # tell graph actor that the chunk should be executed on the exact worker + chunk_op.expect_worker = worker + tp = "worker" if i < n_workers else "ps" + chunk_op._tf_task_type = tp + idx = i if i < n_workers else i - n_workers + chunk_op._tf_task_index = idx + cluster_conf[tp].append(addr) + chunk_op._tf_config = { + "cluster": cluster_conf, + "task": {"type": tp, "index": idx}, + } + out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,))) + i += 1 + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, + chunks=out_chunks, + nsplits=(tuple(np.nan for _ in range(len(out_chunks))),), + ) + + @classmethod + def _build_envs(cls, ctx, op): + envs = super()._build_envs(ctx, op) + envs.update({"TF_CONFIG": json.dumps(op.tf_config)}) + return envs + + @classmethod + def execute(cls, ctx, op): + if op.merge: + return super().execute(ctx, op) + + assert ctx.local_address.split(":")[0] == op.expect_worker.split(":")[0] + + super().execute(ctx, op) + + if op.tf_task_type == "worker" and op.tf_task_index == 0: + ctx[op.outputs[0].key] = {"status": "ok"} + else: + ctx[op.outputs[0].key] = {} + + +def run_tensorflow_script( + script: Union[bytes, str, BinaryIO, TextIO], + n_workers: int, + n_ps: int = 0, + data: Dict[str, TileableType] = None, + gpu: Optional[bool] = None, + 
command_argv: List[str] = None, + retry_when_fail: bool = False, + session: SessionType = None, + run_kwargs: Dict[str, Any] = None, +): + """ + Run a TensorFlow script in a Mars cluster. + + Parameters + ---------- + script: str or file-like object + Script to run. + n_workers : int + Number of TensorFlow workers. + n_ps : int + Number of TensorFlow PS workers. + data : dict + Variable name to data. + gpu : bool + Run TensorFlow script on GPU. + command_argv : list + Extra command args for the script. + retry_when_fail : bool + If True, retry when the function fails. + session + Mars session; if not provided, the default one will be used. + run_kwargs : dict + Extra kwargs for `session.run`. + + Returns + ------- + status + Returns {'status': 'ok'} if the run succeeded, otherwise the error is raised. + """ + if int(n_workers) <= 0: + raise ValueError("n_workers should be at least 1") + if int(n_ps) < 0: + raise ValueError("n_ps should be at least 0") + if hasattr(script, "read"): + code = script.read() + else: + with open(os.path.abspath(script), "rb") as f: + code = f.read() + + inputs = _extract_inputs(data) + op = RunTensorFlow( + data=data, + code=to_binary(code), + n_workers=int(n_workers), + n_ps=int(n_ps), + retry_when_fail=retry_when_fail, + gpu=gpu, + command_args=command_argv, + ) + return op(inputs).execute(session=session, **(run_kwargs or {})) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/__init__.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_dataset.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_dataset.py new file mode 100644 index 000000000..9ef4aa135 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_dataset.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt +from .....utils import lazy_import +from ..
import gen_tensorflow_dataset, run_tensorflow_script + +tf_installed = lazy_import("tensorflow") is not None + + +@pytest.mark.skipif(not tf_installed, reason="tensorflow not installed") +def test_mars_dataset(setup_cluster): + import numpy as np + import pandas as pd + + tf_dataset_ops = lazy_import("tensorflow.python.data.ops.dataset_ops") + + # Mars tensor + data = mt.random.rand(1000, 32, dtype="f4") + data_verify = data[:10].execute().fetch() + + dataset = gen_tensorflow_dataset(data) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, data_1batch in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + + # Mars tensors + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + data_verify = data[:10].execute().fetch() + labels_verify = labels[:10].execute().fetch() + + dataset = gen_tensorflow_dataset((data, labels)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # np ndarray + data = np.random.rand(1000, 32) + labels = np.random.randint(0, 2, (1000, 10)) + + data_verify = data[:10] + labels_verify = labels[:10] + + dataset = gen_tensorflow_dataset((data, labels)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # Mars dataframe + data = md.DataFrame(data) + labels = md.DataFrame(labels) + + data_verify = data.iloc[:10].execute().fetch().values + labels_verify = labels.iloc[:10].execute().fetch().values + + dataset = gen_tensorflow_dataset( + (data, labels), fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # Mars series + label = labels[1] + + label_verify = label[:10].execute().fetch() + + dataset = gen_tensorflow_dataset( + (data, label), fetch_kwargs={"extra_config": {"check_series_name": False}} + ) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, label_verify) + + # pandas dataframe + data = pd.DataFrame(np.random.rand(1000, 32)) + labels = pd.DataFrame(np.random.randint(0, 2, (1000, 10)), dtype="float32") + + data_verify = data.iloc[:10].values + labels_verify = labels.iloc[:10].values + dataset = gen_tensorflow_dataset((data, labels)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, labels_verify) + + # pandas series + label = labels[1] + + label_verify = label[:10] + + dataset = gen_tensorflow_dataset((data, label)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + 
np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, label_verify) + + # list + label = label.tolist() + + label_verify = label[:10] + + dataset = gen_tensorflow_dataset((data, label)) + assert isinstance(dataset, tf_dataset_ops.DatasetV2) + for _, (data_1batch, label_1batch) in enumerate(dataset.repeat().batch(10).take(1)): + np.testing.assert_array_equal(data_1batch, data_verify) + np.testing.assert_array_equal(label_1batch, label_verify) + + # test TypeError + label = tuple(range(1000)) + + with pytest.raises(TypeError) as e: + dataset = gen_tensorflow_dataset((data, label)) + exec_msg = e.value.args[0] + assert exec_msg == "Unexpected dataset type: " + + +@pytest.mark.skipif(not tf_installed, reason="tensorflow not installed") +def test_mars_dataset_script(setup_cluster): + sess = setup_cluster + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tf_dataset.py") + + data = mt.random.rand(1000, 32, dtype="f4") + labels = mt.random.randint(0, 2, (1000, 10), dtype="f4") + + assert ( + run_tensorflow_script( + path, + n_workers=2, + data={"feature_data": data, "labels": labels}, + command_argv=["multiple"], + session=sess, + ).fetch()["status"] + == "ok" + ) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_run_script.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_run_script.py new file mode 100644 index 000000000..e3d86aa1c --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/test_run_script.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +try: + import tensorflow +except ImportError: + tensorflow = None + +from .. import run_tensorflow_script + + +@pytest.mark.skipif(tensorflow is None, reason="tensorflow not installed") +def test_local_run_tensor_flow_script(setup_cluster): + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "tf_distributed_sample.py" + ) + assert ( + run_tensorflow_script(path, n_workers=2, command_argv=["multiple"]).fetch()[ + "status" + ] + == "ok" + ) + + with pytest.raises(ValueError): + run_tensorflow_script(path, n_workers=0) + + with pytest.raises(ValueError): + run_tensorflow_script(path, 2, n_ps=-1) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_dataset.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_dataset.py new file mode 100644 index 000000000..67afc4d64 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_dataset.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sys + +import tensorflow as tf +from mars.learn.contrib.tensorflow import gen_tensorflow_dataset +from tensorflow.keras import layers +from tensorflow.python.data.ops.dataset_ops import DatasetV2 + + +def get_model(): + model = tf.keras.Sequential() + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(10, activation="softmax")) + model.compile( + optimizer=tf.keras.optimizers.Adam(0.01), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + return model + + +def train(feature_data, labels): + data = feature_data + labels = labels + + db_train = gen_tensorflow_dataset((data, labels)) + assert isinstance(db_train, DatasetV2) + db_train = db_train.batch(32) + + model = get_model() + model.fit(db_train, epochs=2) + + +if __name__ == "__main__": + assert json.loads(os.environ["TF_CONFIG"])["task"]["index"] in {0, 1} + assert len(sys.argv) == 2 + assert sys.argv[1] == "multiple" + + feature_data = globals()["feature_data"] + labels = globals()["labels"] + multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + with multiworker_strategy.scope(): + train(feature_data, labels) diff --git a/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_distributed_sample.py b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_distributed_sample.py new file mode 100644 index 000000000..4041a5e7e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tensorflow/tests/tf_distributed_sample.py @@ -0,0 +1,48 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
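A hedged driver-side sketch of how the pieces above fit together, mirroring test_mars_dataset_script: the public import path mars.learn.contrib.tensorflow (this repository vendors it as xorbits._mars.learn.contrib.tensorflow), the script name my_tf_script.py, and an already-initialized cluster session are assumptions, not part of this patch.

import mars.tensor as mt
from mars.learn.contrib.tensorflow import run_tensorflow_script

# Mars objects passed via `data` are injected as module-level globals of the
# script on each TensorFlow worker, alongside a TF_CONFIG environment variable.
feature_data = mt.random.rand(1000, 32, dtype="f4")
labels = mt.random.randint(0, 2, (1000, 10), dtype="f4")

result = run_tensorflow_script(
    "my_tf_script.py",  # hypothetical script, e.g. a copy of tf_dataset.py above
    n_workers=2,
    data={"feature_data": feature_data, "labels": labels},
    command_argv=["multiple"],
)
print(result.fetch()["status"])  # "ok" once worker 0 finishes training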
+ +import json +import os +import sys + +import numpy as np +import tensorflow as tf +from tensorflow.keras import layers + + +def get_model(): + model = tf.keras.Sequential() + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(64, activation="relu")) + model.add(layers.Dense(10, activation="softmax")) + model.compile( + optimizer=tf.keras.optimizers.Adam(0.01), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + return model + + +assert json.loads(os.environ["TF_CONFIG"])["task"]["index"] in {0, 1} +assert len(sys.argv) == 2 +assert sys.argv[1] == "multiple" + +multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + +with multiworker_strategy.scope(): + data = np.random.random((1000, 32)) + labels = np.random.random((1000, 10)) + + model = get_model() + model.fit(data, labels, epochs=2, batch_size=32) diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/__init__.py b/python/xorbits/_mars/learn/contrib/tsfresh/__init__.py new file mode 100644 index 000000000..31f64ca7e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import MarsDistributor diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/core.py b/python/xorbits/_mars/learn/contrib/tsfresh/core.py new file mode 100644 index 000000000..917160b4a --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/core.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from .... 
import remote as mr +from ....deploy.oscar.session import get_default_session +from ....utils import ceildiv + +try: + try: + # fix for tsfresh 0.17.0, from this version on, + # we need to inherit from IterableDistributorBaseClass + from tsfresh.utilities.distribution import ( + IterableDistributorBaseClass as DistributorBaseClass, + ) + except ImportError: # pragma: no cover + from tsfresh.utilities.distribution import DistributorBaseClass +except ImportError: # pragma: no cover + DistributorBaseClass = object + + +class MarsDistributor(DistributorBaseClass): + def __init__(self, session=None): + self._session = session or get_default_session() + + def calculate_best_chunk_size(self, data_length): + n_cores = self._session.get_total_n_cpu() + return ceildiv(data_length, n_cores) + + def distribute(self, func, partitioned_chunks, kwargs): + def _wrapped_func(*args, **kw): + # Series.value_counts() may not be able to handle + if not getattr(pd.Series.value_counts, "_wrapped", False): + old_value_counts = pd.Series.value_counts + + def _wrapped_value_counts(obj, *args, **kw): + try: + return old_value_counts(obj, *args, **kw) + except ValueError: # pragma: no cover + return old_value_counts(obj.copy(), *args, **kw) + + pd.Series.value_counts = _wrapped_value_counts + pd.Series.value_counts._wrapped = True + + return func(*args, **kw) + + tasks = [] + for partitioned_chunk in partitioned_chunks: + tasks.append( + mr.spawn(_wrapped_func, args=(partitioned_chunk,), kwargs=kwargs) + ) + executed = mr.ExecutableTuple(tasks).execute(session=self._session) + fetched = executed.fetch(session=self._session) + return [item for results in fetched for item in results] diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/tests/__init__.py b/python/xorbits/_mars/learn/contrib/tsfresh/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/tsfresh/tests/test_tsfresh.py b/python/xorbits/_mars/learn/contrib/tsfresh/tests/test_tsfresh.py new file mode 100644 index 000000000..8643dc0e7 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/tsfresh/tests/test_tsfresh.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
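MarsDistributor.distribute above fans each partitioned chunk out with mr.spawn and gathers the per-task lists through ExecutableTuple before flattening them. A minimal sketch of that pattern follows; the public import path mars.remote (vendored here as xorbits._mars.remote), the availability of ExecutableTuple on that module, and an existing default session are assumptions.

import mars.remote as mr

def extract(chunk):
    # Stand-in for the per-chunk function tsfresh hands to the distributor.
    return [x * x for x in chunk]

partitioned_chunks = ([1, 2], [3, 4], [5, 6])
tasks = [mr.spawn(extract, args=(chunk,)) for chunk in partitioned_chunks]

executed = mr.ExecutableTuple(tasks).execute()
fetched = executed.fetch()

# Flatten the per-task result lists, exactly as distribute() does.
print([item for results in fetched for item in results])  # [1, 4, 9, 16, 25, 36]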
+ +import pytest + +try: + import tsfresh + from tsfresh import extract_features + from tsfresh.examples import robot_execution_failures + from tsfresh.feature_extraction import ComprehensiveFCParameters + from tsfresh.utilities.dataframe_functions import impute +except ImportError: + tsfresh = None + +from .....deploy.oscar.session import get_default_session, new_session +from .. import MarsDistributor + + +@pytest.mark.skipif(tsfresh is None, reason="tsfresh not installed") +def test_distributed_ts_fresh(setup): + robot_execution_failures.download_robot_execution_failures() + df, y = robot_execution_failures.load_robot_execution_failures() + default_session = get_default_session() + sync_session = new_session(default_session.address, default=False) + dist = MarsDistributor(session=sync_session) + + df = df.iloc[:200].copy() + + extraction_settings = ComprehensiveFCParameters() + extract_features( + df, + column_id="id", + column_sort="time", + default_fc_parameters=extraction_settings, + # we impute = remove all NaN features automatically + impute_function=impute, + distributor=dist, + ) diff --git a/python/xorbits/_mars/learn/contrib/utils.py b/python/xorbits/_mars/learn/contrib/utils.py new file mode 100644 index 000000000..9e408b150 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/utils.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import numpy as np + + +def make_import_error_func(package_name): + def _func(*_, **__): # pragma: no cover + raise ImportError( + f"Cannot import {package_name}, please reinstall that package." + ) + + return _func + + +def pick_workers(workers, size): + """ + Pick workers from a list. + + This method will try to pick workers as balanced as it can. + + 1. If size <= len(workers), randomly pick workers from the list. + 2. If size > len(workers), just select all workers in a random order, + then see the rest size, if it's still more than the workers size, + return all workers in a random order, if not, + randomly select workers from the list. 
+ + :param workers: workers list + :param size: number to pick from workers list + :return: ndarray of selected workers whose length is `size` + """ + result = np.empty(size, dtype=object) + rest = size + while rest > 0: + start = size - rest + to_pick_size = min(size - start, len(workers)) + result[start : start + to_pick_size] = np.random.permutation(workers)[ + :to_pick_size + ] + rest = rest - to_pick_size + return result + + +def config_mod_getattr(mod_dict, globals_): + def __getattr__(name): + import importlib + + if name in mod_dict: + mod_name, cls_name = mod_dict[name].rsplit(".", 1) + mod = importlib.import_module(mod_name, globals_["__name__"]) + cls = globals_[name] = getattr(mod, cls_name) + return cls + else: # pragma: no cover + raise AttributeError(name) + + if sys.version_info[:2] < (3, 7): + for _mod in mod_dict.keys(): + __getattr__(_mod) + + def __dir__(): + return sorted([n for n in globals_ if not n.startswith("_")] + list(mod_dict)) + + globals_.update( + { + "__getattr__": __getattr__, + "__dir__": __dir__, + "__all__": list(__dir__()), + "__warningregistry__": dict(), + } + ) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/__init__.py b/python/xorbits/_mars/learn/contrib/xgboost/__init__.py new file mode 100644 index 000000000..21678e4e8 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dmatrix import MarsDMatrix +from .predict import predict +from .train import train + + +def register_op(): + from .start_tracker import StartTracker + + del StartTracker + + +from ..utils import config_mod_getattr as _config_mod_getattr + +_config_mod_getattr( + { + "XGBClassifier": ".classifier.XGBClassifier", + "XGBRegressor": ".regressor.XGBRegressor", + }, + globals(), +) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/classifier.py b/python/xorbits/_mars/learn/contrib/xgboost/classifier.py new file mode 100644 index 000000000..3fe7e9edc --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/classifier.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import make_import_error_func +from .core import XGBScikitLearnBase, xgboost + +XGBClassifier = make_import_error_func("xgboost") +if xgboost: + from xgboost.sklearn import XGBClassifierBase + + from .... 
import tensor as mt + from .core import wrap_evaluation_matrices + from .predict import predict + from .train import train + + class XGBClassifier(XGBScikitLearnBase, XGBClassifierBase): + """ + Implementation of the scikit-learn API for XGBoost classification. + """ + + def fit( + self, + X, + y, + sample_weight=None, + base_margin=None, + eval_set=None, + sample_weight_eval_set=None, + base_margin_eval_set=None, + **kw, + ): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", dict()) + if kw: + raise TypeError( + f"fit got an unexpected keyword argument '{next(iter(kw))}'" + ) + + dtrain, evals = wrap_evaluation_matrices( + None, + X, + y, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + ) + params = self.get_xgb_params() + + self.classes_ = mt.unique(y, aggregate_size=1).to_numpy( + session=session, **run_kwargs + ) + self.n_classes_ = len(self.classes_) + + if self.n_classes_ > 2: + params["objective"] = "multi:softprob" + params["num_class"] = self.n_classes_ + else: + params["objective"] = "binary:logistic" + + self.evals_result_ = dict() + result = train( + params, + dtrain, + num_boost_round=self.get_num_boosting_rounds(), + evals=evals, + evals_result=self.evals_result_, + session=session, + run_kwargs=run_kwargs, + ) + self._Booster = result + return self + + def predict(self, data, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", dict()) + run = kw.pop("run", True) + prob = predict(self.get_booster(), data, run=False, **kw) + if prob.ndim > 1: + prediction = mt.argmax(prob, axis=1) + else: + prediction = (prob > 0.5).astype(mt.int64) + if run: + prediction.execute(session=session, **run_kwargs) + return prediction + + def predict_proba(self, data, ntree_limit=None, **kw): + if ntree_limit is not None: + raise NotImplementedError("ntree_limit is not currently supported") + return predict(self.get_booster(), data, **kw) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/core.py b/python/xorbits/_mars/learn/contrib/xgboost/core.py new file mode 100644 index 000000000..d1349a382 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/core.py @@ -0,0 +1,152 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, List, Optional, Tuple + +try: + import xgboost +except ImportError: + xgboost = None + +from .dmatrix import MarsDMatrix + +XGBScikitLearnBase = None +if xgboost: + + class XGBScikitLearnBase(xgboost.XGBModel): + """ + Base class for implementing scikit-learn interface + """ + + def fit( + self, + X, + y, + sample_weights=None, + eval_set=None, + sample_weight_eval_set=None, + **kw, + ): + """ + Fit the regressor. + Parameters + ---------- + X : array_like + Feature matrix + y : array_like + Labels + sample_weight : array_like + instance weights + eval_set : list, optional + A list of (X, y) tuple pairs to use as validation sets, for which + metrics will be computed. 
+ Validation metrics will help us track the performance of the model. + sample_weight_eval_set : list, optional + A list of the form [L_1, L_2, ..., L_n], where each L_i is a list + of group weights on the i-th validation set. + """ + raise NotImplementedError + + def predict(self, data, **kw): + """ + Predict with `data`. + + Parameters + ---------- + data: data that can be used to perform prediction + Returns + ------- + prediction : mars.tensor.Tensor + """ + raise NotImplementedError + + def wrap_evaluation_matrices( + missing: float, + X: Any, + y: Any, + sample_weight: Optional[Any], + base_margin: Optional[Any], + eval_set: Optional[List[Tuple[Any, Any]]], + sample_weight_eval_set: Optional[List[Any]], + base_margin_eval_set: Optional[List[Any]], + label_transform: Callable = lambda x: x, + ) -> Tuple[Any, Optional[List[Tuple[Any, str]]]]: + """Convert array_like evaluation matrices into DMatrix. Perform validation on the way.""" + train_dmatrix = MarsDMatrix( + data=X, + label=label_transform(y), + weight=sample_weight, + base_margin=base_margin, + missing=missing, + ) + + n_validation = 0 if eval_set is None else len(eval_set) + + def validate_or_none(meta: Optional[List], name: str) -> List: + if meta is None: + return [None] * n_validation + if len(meta) != n_validation: + raise ValueError( + f"{name}'s length does not equal `eval_set`'s length, " + + f"expecting {n_validation}, got {len(meta)}" + ) + return meta + + if eval_set is not None: + sample_weight_eval_set = validate_or_none( + sample_weight_eval_set, "sample_weight_eval_set" + ) + base_margin_eval_set = validate_or_none( + base_margin_eval_set, "base_margin_eval_set" + ) + + evals = [] + for i, (valid_X, valid_y) in enumerate(eval_set): + # Skip the duplicated entry. + if all( + ( + valid_X is X, + valid_y is y, + sample_weight_eval_set[i] is sample_weight, + base_margin_eval_set[i] is base_margin, + ) + ): + evals.append(train_dmatrix) + else: + m = MarsDMatrix( + data=valid_X, + label=label_transform(valid_y), + weight=sample_weight_eval_set[i], + base_margin=base_margin_eval_set[i], + missing=missing, + ) + evals.append(m) + nevals = len(evals) + eval_names = [f"validation_{i}" for i in range(nevals)] + evals = list(zip(evals, eval_names)) + else: + if any( + meta is not None + for meta in [ + sample_weight_eval_set, + base_margin_eval_set, + ] + ): + raise ValueError( + "`eval_set` is not set but one of the other evaluation meta info is " + "not None." + ) + evals = [] + + return train_dmatrix, evals diff --git a/python/xorbits/_mars/learn/contrib/xgboost/dmatrix.py b/python/xorbits/_mars/learn/contrib/xgboost/dmatrix.py new file mode 100644 index 000000000..9809d2386 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/dmatrix.py @@ -0,0 +1,359 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import List, Union + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....core import get_output_types, recursive_tile +from ....core.context import Context, get_context +from ....dataframe.core import DATAFRAME_TYPE +from ....serialization.serializables import BoolField, Float64Field, KeyField, ListField +from ....tensor import tensor as astensor +from ....tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ....typing import ChunkType, TileableType +from ....utils import build_fetch, ensure_own_data, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin +from ...utils import concat_chunks, convert_to_tensor_or_dataframe + + +class ToDMatrix(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.TO_DMATRIX + + data = KeyField("data") + label = KeyField("label") + missing = Float64Field("missing") + weight = KeyField("weight") + base_margin = KeyField("base_margin") + feature_names = ListField("feature_names") + feature_types = ListField("feature_types") + # if to collocate the data, label and weight + _collocate = BoolField("collocate", default=False) + + @property + def output_limit(self): + if self._collocate: + return ( + 1 + + (self.label is not None) + + (self.weight is not None) + + (self.base_margin is not None) + ) + return 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.data is not None: + self.data = self._inputs[0] + has_label = self.label is not None + if has_label: + self.label = self._inputs[1] + if self.weight is not None: + i = 1 if not has_label else 2 + self.weight = self._inputs[i] + if self.base_margin is not None: + self.base_margin = self._inputs[-1] + + @staticmethod + def _get_kw(obj): + if isinstance(obj, TENSOR_TYPE + TENSOR_CHUNK_TYPE): + return {"shape": obj.shape, "dtype": obj.dtype, "order": obj.order} + else: + return { + "shape": obj.shape, + "dtypes": obj.dtypes, + "index_value": obj.index_value, + "columns_value": obj.columns_value, + } + + def __call__(self): + inputs = [self.data] + kw = self._get_kw(self.data) + if self.label is not None: + inputs.append(self.label) + if self.weight is not None: + inputs.append(self.weight) + if self.base_margin is not None: + inputs.append(self.base_margin) + + return self.new_tileable(inputs, **kw) + + @classmethod + def _get_collocated( + cls, + op: "ToDMatrix", + data: TileableType, + label: TileableType, + weight: TileableType, + base_margin: TileableType, + ) -> List[TileableType]: + types = ["data", "label", "weight", "base_margin"] + nsplit = data.nsplits[0] + out_chunkss = [[] for _ in op.inputs] + for i in range(len(nsplit)): + data_chunk = data.cix[i, 0] + inps = [data_chunk] + kws = [] + chunk_op = op.copy().reset_key() + chunk_op._collocate = True + chunk_op.data = data_chunk + output_types = [get_output_types(data)[0]] + data_kw = cls._get_kw(data_chunk) + data_kw["index"] = data_chunk.index + kws.append(data_kw) + for type_name, inp in zip(types[1:], [label, weight, base_margin]): + if inp is None: + continue + inp_chunk = inp.cix[i,] + setattr(chunk_op, type_name, inp_chunk) + inps.append(inp_chunk) + kw = cls._get_kw(inp_chunk) + kw["index"] = inp_chunk.index + kw["type"] = type_name + kws.append(kw) + output_types.append(get_output_types(inp)[0]) + chunk_op.output_types = output_types + out_chunks = chunk_op.new_chunks(inps, kws=kws) + for i, out_chunk in enumerate(out_chunks): + out_chunkss[i].append(out_chunk) + + new_op = op.copy() + new_op._collocate = True + outs = [data, label, weight, base_margin] + params = [out.params.copy() for out in outs if out is not None] + 
output_types = [] + j = 0 + for i, out in enumerate(outs): + if out is None: + continue + params[j]["nsplits"] = out.nsplits + params[j]["chunks"] = out_chunkss[j] + params[j]["type"] = types[i] + output_types.append(get_output_types(out)[0]) + j += 1 + new_op.output_types = output_types + return new_op.new_tileables(op.inputs, kws=params) + + @staticmethod + def _order_chunk_index(chunks: List[ChunkType]): + ndim = chunks[0].ndim + for i, c in enumerate(chunks): + if ndim == 2: + c._index = (i, 0) + else: + c._index = (i,) + return chunks + + @classmethod + def tile(cls, op: "MarsDMatrix"): + data, label, weight, base_margin = op.data, op.label, op.weight, op.base_margin + + if has_unknown_shape(data): + yield + if data.chunk_shape[1] > 1: + # make sure data's second dimension has only 1 chunk + data = yield from recursive_tile(data.rechunk({1: data.shape[1]})) + nsplit = data.nsplits[0] + # rechunk label + if label is not None: + label = yield from recursive_tile(label.rechunk({0: nsplit})) + # rechunk weight + if weight is not None: + weight = yield from recursive_tile(weight.rechunk({0: nsplit})) + # rechunk base_margin + if base_margin is not None: + base_margin = yield from recursive_tile(base_margin.rechunk({0: nsplit})) + + collocated = cls._get_collocated(op, data, label, weight, base_margin) + collocated_chunks = list( + itertools.chain.from_iterable(c.chunks for c in collocated) + ) + yield collocated_chunks + collocated + + data = build_fetch(collocated[0]) + has_label = False + if label is not None: + has_label = True + label = build_fetch(collocated[1]) + i_weight = -1 + if weight is not None: + i_weight = 1 if not has_label else 2 + weight = build_fetch(collocated[i_weight]) + if base_margin is not None: + base_margin = build_fetch(collocated[-1]) + + ctx = get_context() + + # for distributed, we should concat the chunks + # which allocated on the same worker into one + data_chunk_metas = ctx.get_chunks_meta( + [c.key for c in data.chunks], fields=["bands"] + ) + data_chunk_workers = [m["bands"][0][0] for m in data_chunk_metas] + worker_to_chunks = dict() + for i, worker in enumerate(data_chunk_workers): + size = 1 + sum(it is not None for it in [label, weight, base_margin]) + if worker not in worker_to_chunks: + worker_to_chunks[worker] = [[] for _ in range(size)] + worker_to_chunks[worker][0].append(data.chunks[i]) + if label is not None: + worker_to_chunks[worker][1].append(label.chunks[i]) + if weight is not None: + worker_to_chunks[worker][i_weight].append(weight.chunks[i]) + if base_margin is not None: + worker_to_chunks[worker][-1].append(base_margin.chunks[i]) + ind = itertools.count(0) + out_chunks = [] + for worker, chunks in worker_to_chunks.items(): + data_chunk = concat_chunks(cls._order_chunk_index(chunks[0])) + inps = [data_chunk] + label_chunk = None + if label is not None: + label_chunk = concat_chunks(cls._order_chunk_index(chunks[1])) + inps.append(label_chunk) + weight_chunk = None + if weight is not None: + weight_chunk = concat_chunks(cls._order_chunk_index(chunks[i_weight])) + inps.append(weight_chunk) + base_margin_chunk = None + if base_margin is not None: + base_margin_chunk = concat_chunks(cls._order_chunk_index(chunks[-1])) + inps.append(base_margin_chunk) + chunk_op = ToDMatrix( + data=data_chunk, + label=label_chunk, + missing=op.missing, + weight=weight_chunk, + base_margin=base_margin_chunk, + feature_names=op.feature_names, + feature_types=op.feature_types, + _output_types=op.output_types, + ) + kws = data_chunk.params + kws["index"] = 
(next(ind), 0) + out_chunks.append(chunk_op.new_chunk(inps, **kws)) + nsplits = (tuple(c.shape[0] for c in out_chunks), (out_chunks[0].shape[1],)) + + new_op = op.copy() + kw = op.outputs[0].params + kw["chunks"] = out_chunks + kw["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[kw]) + + @staticmethod + def get_xgb_dmatrix(tup, nthread: int = -1): + from xgboost import DMatrix + + data, label, weight, base_margin, missing, feature_names, feature_types = tup + data = data.spmatrix if hasattr(data, "spmatrix") else data + return DMatrix( + ensure_own_data(data), + label=ensure_own_data(label), + missing=missing, + weight=ensure_own_data(weight), + base_margin=base_margin, + feature_names=feature_names, + feature_types=feature_types, + nthread=nthread, + ) + + @staticmethod + def _from_ctx_if_not_none(ctx, chunk): + if chunk is None: + return chunk + return ctx[chunk.key] + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "ToDMatrix"): + if op._collocate: + outs = op.outputs + ctx[outs[0].key] = ctx[op.inputs[0].key] + has_label = False + if op.label is not None: + has_label = True + ctx[outs[1].key] = ctx[op.inputs[1].key] + if op.weight is not None: + i_weight = 1 if not has_label else 2 + ctx[outs[i_weight].key] = ctx[op.inputs[i_weight].key] + if op.base_margin is not None: + ctx[outs[-1].key] = ctx[op.inputs[-1].key] + else: + out = op.outputs[0] + data = cls._from_ctx_if_not_none(ctx, op.data) + if data is None: + data = np.empty((0, out.shape[1])) + ctx[out.key] = ( + data, + cls._from_ctx_if_not_none(ctx, op.label), + cls._from_ctx_if_not_none(ctx, op.weight), + cls._from_ctx_if_not_none(ctx, op.base_margin), + op.missing, + op.feature_names, + op.feature_types, + ) + + +def check_data(data): + data = convert_to_tensor_or_dataframe(data) + if data.ndim != 2: + raise ValueError(f"Expecting 2-d data, got: {data.ndim}-d") + + return data + + +def check_array_like(y: TileableType, name: str) -> TileableType: + if y is None: + return + y = convert_to_tensor_or_dataframe(y) + if isinstance(y, DATAFRAME_TYPE): + y = y.iloc[:, 0] + y = astensor(y) + if y.ndim != 1: + raise ValueError(f"Expecting 1-d {name}, got: {y.ndim}-d") + return y + + +def to_dmatrix( + data, + label=None, + missing=None, + weight=None, + base_margin=None, + feature_names=None, + feature_types=None, +): + data = check_data(data) + label = check_array_like(label, "label") + weight = check_array_like(weight, "weight") + base_margin = check_array_like(base_margin, "base_margin") + + # If not multiple outputs, try to collect the chunks on same worker into one + # to feed the data into XGBoost for training. + op = ToDMatrix( + data=data, + label=label, + missing=missing, + weight=weight, + base_margin=base_margin, + feature_names=feature_names, + feature_types=feature_types, + gpu=data.op.gpu, + _output_types=get_output_types(data), + ) + return op() + + +MarsDMatrix = to_dmatrix diff --git a/python/xorbits/_mars/learn/contrib/xgboost/predict.py b/python/xorbits/_mars/learn/contrib/xgboost/predict.py new file mode 100644 index 000000000..2952d627c --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/predict.py @@ -0,0 +1,204 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle + +import numpy as np +import pandas as pd + +from .... import opcodes as OperandDef +from ....core import recursive_tile +from ....dataframe.core import DATAFRAME_CHUNK_TYPE, SERIES_CHUNK_TYPE +from ....dataframe.utils import parse_index +from ....serialization.serializables import BytesField, DictField, FieldTypes, KeyField +from ....tensor.core import TENSOR_TYPE, TensorOrder +from ....utils import ensure_own_data, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin, OutputType +from .dmatrix import ToDMatrix, check_data + + +class XGBPredict(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.XGBOOST_PREDICT + + data = KeyField("data", default=None) + model = BytesField( + "model", on_serialize=pickle.dumps, on_deserialize=pickle.loads, default=None + ) + kwargs = DictField("kwargs", key_type=FieldTypes.string, default_factory=dict) + + def __init__(self, output_types=None, gpu=None, **kw): + super().__init__(_output_types=output_types, gpu=gpu, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.data = self._inputs[0] + + def __call__(self): + num_class = self.model.attr("num_class") + if num_class is not None: + num_class = int(num_class) + if num_class is not None: + shape = (self.data.shape[0], num_class) + else: + shape = (self.data.shape[0],) + inputs = [self.data] + if self.output_types[0] == OutputType.tensor: + # tensor + return self.new_tileable( + inputs, + shape=shape, + dtype=np.dtype(np.float32), + order=TensorOrder.C_ORDER, + ) + elif self.output_types[0] == OutputType.dataframe: + # dataframe + dtypes = pd.DataFrame(np.random.rand(0, num_class), dtype=np.float32).dtypes + return self.new_tileable( + inputs, + shape=shape, + dtypes=dtypes, + columns_value=parse_index(dtypes.index), + index_value=self.data.index_value, + ) + else: + # series + return self.new_tileable( + inputs, + shape=shape, + index_value=self.data.index_value, + name="predictions", + dtype=np.dtype(np.float32), + ) + + @classmethod + def tile(cls, op: "XGBPredict"): + out = op.outputs[0] + out_chunks = [] + data = op.data + if data.chunk_shape[1] > 1: + if has_unknown_shape(op.data): + yield + data = yield from recursive_tile(data.rechunk({1: op.data.shape[1]})) + for in_chunk in data.chunks: + chunk_op = op.copy().reset_key() + chunk_index = (in_chunk.index[0],) + if op.model.attr("num_class"): + chunk_shape = (in_chunk.shape[0], int(op.model.attr("num_class"))) + chunk_index += (0,) + else: + chunk_shape = (in_chunk.shape[0],) + if op.output_types[0] == OutputType.tensor: + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + elif op.output_types[0] == OutputType.dataframe: + # dataframe chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtypes=data.dtypes, + columns_value=data.columns_value, + index_value=in_chunk.index_value, + index=chunk_index, + ) + else: + # series chunk + out_chunk = chunk_op.new_chunk( + [in_chunk], + shape=chunk_shape, + dtype=out.dtype, + index_value=in_chunk.index_value, + name=out.name, + 
index=chunk_index, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + nsplits = (data.nsplits[0],) + if out.ndim > 1: + nsplits += ((out.shape[1],),) + params["nsplits"] = nsplits + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op: "XGBPredict"): + from xgboost import DMatrix + + raw_data = data = ctx[op.data.key] + if isinstance(data, tuple): + data = ToDMatrix.get_xgb_dmatrix(ensure_own_data(data)) + else: + data = data.spmatrix if hasattr(data, "spmatrix") else data + data = DMatrix(data) + + # do not pass arguments that are None + kwargs = dict((k, v) for k, v in op.kwargs.items() if v is not None) + result = op.model.predict(data, **kwargs) + + if isinstance(op.outputs[0], DATAFRAME_CHUNK_TYPE): + result = pd.DataFrame(result, index=raw_data.index) + elif isinstance(op.outputs[0], SERIES_CHUNK_TYPE): + result = pd.Series(result, index=raw_data.index, name="predictions") + + ctx[op.outputs[0].key] = result + + +def predict( + model, + data, + output_margin=False, + ntree_limit=None, + validate_features=True, + base_margin=None, + session=None, + run_kwargs=None, + run=True, +): + import xgboost + + data = check_data(data) + if not isinstance(model, xgboost.Booster): + raise TypeError(f"model has to be a xgboost.Booster, got {type(model)} instead") + + num_class = model.attr("num_class") + if isinstance(data, TENSOR_TYPE): + output_types = [OutputType.tensor] + elif num_class is not None: + output_types = [OutputType.dataframe] + else: + output_types = [OutputType.series] + + kwargs = { + "output_margin": output_margin, + "ntree_limit": ntree_limit, + "validate_features": validate_features, + "base_margin": base_margin, + } + op = XGBPredict( + data=data, + model=model, + kwargs=kwargs, + gpu=data.op.gpu, + output_types=output_types, + ) + result = op() + if run: + result.execute(session=session, **(run_kwargs or dict())) + return result diff --git a/python/xorbits/_mars/learn/contrib/xgboost/regressor.py b/python/xorbits/_mars/learn/contrib/xgboost/regressor.py new file mode 100644 index 000000000..caddc2a3e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/regressor.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ..utils import make_import_error_func +from .core import XGBScikitLearnBase, xgboost + +XGBRegressor = make_import_error_func("xgboost") +if xgboost: + from .core import wrap_evaluation_matrices + from .predict import predict + from .train import train + + class XGBRegressor(XGBScikitLearnBase): + """ + Implementation of the scikit-learn API for XGBoost regressor. 
+ """ + + def fit( + self, + X, + y, + sample_weight=None, + base_margin=None, + eval_set=None, + sample_weight_eval_set=None, + base_margin_eval_set=None, + **kw, + ): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", dict()) + if kw: + raise TypeError( + f"fit got an unexpected keyword argument '{next(iter(kw))}'" + ) + + dtrain, evals = wrap_evaluation_matrices( + None, + X, + y, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + ) + params = self.get_xgb_params() + self.evals_result_ = dict() + result = train( + params, + dtrain, + num_boost_round=self.get_num_boosting_rounds(), + evals=evals, + evals_result=self.evals_result_, + session=session, + run_kwargs=run_kwargs, + ) + self._Booster = result + return self + + def predict(self, data, **kw): + session = kw.pop("session", None) + run_kwargs = kw.pop("run_kwargs", None) + return predict( + self.get_booster(), data, session=session, run_kwargs=run_kwargs, **kw + ) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/start_tracker.py b/python/xorbits/_mars/learn/contrib/xgboost/start_tracker.py new file mode 100644 index 000000000..a90cf632b --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/start_tracker.py @@ -0,0 +1,60 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from threading import Thread + +from .... import opcodes as OperandDef +from ....core import NotSupportTile +from ....serialization.serializables import Int32Field +from ....utils import to_binary +from ...operands import LearnOperand, LearnOperandMixin, OutputType + + +class StartTracker(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.START_TRACKER + _op_module_ = "learn.contrib.xgboost" + + n_workers = Int32Field("n_workers", default=None) + + def __init__(self, output_types=None, pure_depends=None, **kw): + super().__init__( + _output_types=output_types, + _pure_depends=pure_depends, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @classmethod + def tile(cls, op): + raise NotSupportTile("StartTracker is a chunk op") + + @classmethod + def execute(cls, ctx, op): + """Start Rabit tracker""" + from .tracker import RabitTracker + + env = {"DMLC_NUM_WORKER": op.n_workers} + rabit_context = RabitTracker( + host_ip=ctx.get_local_host_ip(), n_workers=op.n_workers + ) + env.update(rabit_context.worker_envs()) + + rabit_context.start(op.n_workers) + thread = Thread(target=rabit_context.join) + thread.daemon = True + thread.start() + + rabit_args = [to_binary(f"{k}={v}") for k, v in env.items()] + ctx[op.outputs[0].key] = rabit_args diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/__init__.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py new file mode 100644 index 000000000..2090ce736 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +try: + import xgboost +except ImportError: + xgboost = None + +from ..... import dataframe as md +from ..... import tensor as mt +from ..classifier import XGBClassifier + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X_raw = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y_raw = rs.rand(n_rows, chunk_size=chunk_size) +X_df_raw = md.DataFrame(X_raw) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_classifier(setup): + y = (y_raw * 10).astype(mt.int32) + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_raw, y, eval_set=[(X_raw, y)]) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + history = classifier.evals_result() + + assert isinstance(prediction, mt.Tensor) + assert isinstance(history, dict) + + assert list(history)[0] == "validation_0" + # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183 + eval_metric = list(history["validation_0"])[0] + assert eval_metric in ("merror", "mlogloss") + assert len(history["validation_0"]) == 1 + assert len(history["validation_0"][eval_metric]) == 2 + + prob = classifier.predict_proba(X_raw) + assert prob.shape == X_raw.shape + + # test dataframe + X_df = X_df_raw + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_df, y) + prediction = classifier.predict(X_df) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + # test weight + weights = [ + mt.random.rand(X_raw.shape[0]), + md.Series(mt.random.rand(X_raw.shape[0])), + md.DataFrame(mt.random.rand(X_raw.shape[0])), + ] + y_df = md.DataFrame(y) + for weight in weights: + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_raw, y_df, sample_weight=weight) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + # should raise error if weight.ndim > 1 + with 
pytest.raises(ValueError): + XGBClassifier(verbosity=1, n_estimators=2).fit( + X_raw, y_df, sample_weight=mt.random.rand(1, 1) + ) + + # test binary classifier + new_y = (y > 0.5).astype(mt.int32) + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X_raw, new_y) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + # test predict data with unknown shape + X2 = X_raw[X_raw[:, 0] > 0.1].astype(mt.int32) + prediction = classifier.predict(X2) + + assert prediction.ndim == 1 + + # test train with unknown shape + cond = X_raw[:, 0] > 0 + X3 = X_raw[cond] + y3 = y[cond] + classifier = XGBClassifier(verbosity=1, n_estimators=2) + classifier.fit(X3, y3) + prediction = classifier.predict(X_raw) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X_raw) + + classifier = XGBClassifier(verbosity=1, n_estimators=2) + with pytest.raises(TypeError): + classifier.fit(X_raw, y, wrong_param=1) + classifier.fit(X_raw, y) + with pytest.raises(TypeError): + classifier.predict(X_raw, wrong_param=1) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_classifier_from_to_parquet(setup): + n_rows = 1000 + n_columns = 10 + rs = np.random.RandomState(0) + X = rs.rand(n_rows, n_columns) + y = rs.rand(n_rows) + df = pd.DataFrame(X, columns=[f"c{i}" for i in range(n_columns)]) + df["id"] = [f"i{i}" for i in range(n_rows)] + + booster = xgboost.train({}, xgboost.DMatrix(X, y), num_boost_round=2) + + with tempfile.TemporaryDirectory() as d: + m_name = os.path.join(d, "c.model") + result_dir = os.path.join(d, "result") + os.mkdir(result_dir) + data_dir = os.path.join(d, "data") + os.mkdir(data_dir) + + booster.save_model(m_name) + + df.iloc[:500].to_parquet(os.path.join(d, "data", "data1.parquet")) + df.iloc[500:].to_parquet(os.path.join(d, "data", "data2.parquet")) + + df = md.read_parquet(data_dir).set_index("id") + model = XGBClassifier() + model.load_model(m_name) + result = model.predict(df, run=False) + r = md.DataFrame(result).to_parquet(result_dir) + + # tiles to ensure no iterative tiling exists + r.execute() + + ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy() + model2 = xgboost.XGBClassifier() + model2.load_model(m_name) + expected = model2.predict(X) + expected = np.stack([1 - expected, expected]).argmax(axis=0) + np.testing.assert_array_equal(ret, expected) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_core.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_core.py new file mode 100644 index 000000000..15b558923 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_core.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +try: + import xgboost +except ImportError: + xgboost = None + + +from ..... 
import tensor as mt + +if xgboost: + from ..core import wrap_evaluation_matrices + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_wrap_evaluation_matrices(): + X = mt.random.rand(100, 3) + y = mt.random.randint(3, size=(100,)) + + eval_set = [(mt.random.rand(10, 3), mt.random.randint(3, size=10))] + with pytest.raises(ValueError): + # sample_weight_eval_set size wrong + wrap_evaluation_matrices(0.0, X, y, None, None, eval_set, [], None) + + with pytest.raises(ValueError): + wrap_evaluation_matrices(0.0, X, y, None, None, None, eval_set, None) + + evals = wrap_evaluation_matrices(0.0, X, y, None, None, eval_set, None, None)[1] + assert len(evals) > 0 diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_predict.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_predict.py new file mode 100644 index 000000000..bfa14a29f --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_predict.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +try: + import xgboost + from xgboost import Booster +except ImportError: + xgboost = None + +from ..... import dataframe as md +from ..... import tensor as mt +from .. import MarsDMatrix, predict, train + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) +X_df = md.DataFrame(X) +y_series = md.Series(y) +x_sparse = np.random.rand(n_rows, n_columns) +x_sparse[np.arange(n_rows), np.random.randint(n_columns, size=n_rows)] = np.nan +X_sparse = mt.tensor(x_sparse, chunk_size=chunk_size).tosparse(missing=np.nan) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_predict_tensor(setup): + dtrain = MarsDMatrix(X, y) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + prediction = predict(booster, X) + assert isinstance(prediction.to_numpy(), np.ndarray) + + prediction = predict(booster, X_sparse) + assert isinstance(prediction.to_numpy(), np.ndarray) + + prediction = predict(booster, dtrain) + assert isinstance(prediction.fetch(), np.ndarray) + + with pytest.raises(TypeError): + predict(None, X) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_predict_dataframe(setup): + dtrain = MarsDMatrix(X_df, y_series) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + prediction = predict(booster, X_df) + assert isinstance(prediction.to_pandas(), pd.Series) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_regressor.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_regressor.py new file mode 100644 index 000000000..73ed0dee9 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_regressor.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +try: + import xgboost +except ImportError: + xgboost = None + +from ..... import tensor as mt +from ..regressor import XGBRegressor + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_regressor(setup): + regressor = XGBRegressor(verbosity=1, n_estimators=2) + regressor.set_params(tree_method="hist") + regressor.fit(X, y, eval_set=[(X, y)]) + prediction = regressor.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + history = regressor.evals_result() + + assert isinstance(prediction, mt.Tensor) + assert isinstance(history, dict) + + assert list(history["validation_0"])[0] == "rmse" + assert len(history["validation_0"]["rmse"]) == 2 + + # test weight + weight = mt.random.rand(X.shape[0]) + classifier = XGBRegressor(verbosity=1, n_estimators=2) + regressor.set_params(tree_method="hist") + classifier.fit(X, y, sample_weight=weight) + prediction = classifier.predict(X) + + assert prediction.ndim == 1 + assert prediction.shape[0] == len(X) + + # test wrong params + regressor = XGBRegressor(verbosity=1, n_estimators=2) + with pytest.raises(TypeError): + regressor.fit(X, y, wrong_param=1) + regressor.fit(X, y) + with pytest.raises(TypeError): + regressor.predict(X, wrong_param=1) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_train.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_train.py new file mode 100644 index 000000000..eeeda48db --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_train.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +try: + import xgboost + from xgboost import Booster +except ImportError: + xgboost = None +from ..... import dataframe as md +from ..... import tensor as mt +from .....tests.core import require_ray +from .. 
import MarsDMatrix, train + +n_rows = 1000 +n_columns = 10 +chunk_size = 200 +rs = mt.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +y = rs.rand(n_rows, chunk_size=chunk_size) +X_df = md.DataFrame(X) +y_series = md.Series(y) +x_sparse = np.random.rand(n_rows, n_columns) +x_sparse[np.arange(n_rows), np.random.randint(n_columns, size=n_rows)] = np.nan +X_sparse = mt.tensor(x_sparse, chunk_size=chunk_size).tosparse(missing=np.nan) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_train_tensor(setup): + dtrain = MarsDMatrix(X, y) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_train_sparse_tensor(setup): + dtrain = MarsDMatrix(X_sparse, y) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +def test_local_train_dataframe(setup): + dtrain = MarsDMatrix(X_df, y_series) + booster = train({}, dtrain, num_boost_round=2) + assert isinstance(booster, Booster) + + +@pytest.mark.skipif(xgboost is None, reason="XGBoost not installed") +@pytest.mark.parametrize("chunk_size", [n_rows // 5, n_rows]) +def test_train_evals(setup_cluster, chunk_size): + rs = mt.random.RandomState(0) + # keep 1 chunk for X and y + X = rs.rand(n_rows, n_columns, chunk_size=(n_rows, n_columns // 2)) + y = rs.rand(n_rows, chunk_size=n_rows) + base_margin = rs.rand(n_rows, chunk_size=n_rows) + dtrain = MarsDMatrix(X, y, base_margin=base_margin) + eval_x = MarsDMatrix( + rs.rand(n_rows, n_columns, chunk_size=chunk_size), + rs.rand(n_rows, chunk_size=chunk_size), + ) + evals = [(eval_x, "eval_x")] + eval_result = dict() + booster = train( + {}, dtrain, num_boost_round=2, evals=evals, evals_result=eval_result + ) + assert isinstance(booster, Booster) + assert len(eval_result) > 0 + + with pytest.raises(TypeError): + train( + {}, + dtrain, + num_boost_round=2, + evals=[("eval_x", eval_x)], + evals_result=eval_result, + ) + + +@require_ray +def test_train_on_ray_cluster(ray_start_regular, ray_create_mars_cluster): + rs = mt.random.RandomState(0) + # keep 1 chunk for X and y + X = rs.rand(n_rows, n_columns, chunk_size=(n_rows, n_columns // 2)) + y = rs.rand(n_rows, chunk_size=n_rows) + base_margin = rs.rand(n_rows, chunk_size=n_rows) + dtrain = MarsDMatrix(X, y, base_margin=base_margin) + eval_x = MarsDMatrix( + rs.rand(n_rows, n_columns, chunk_size=n_rows // 5), + rs.rand(n_rows, chunk_size=n_rows // 5), + ) + evals = [(eval_x, "eval_x")] + eval_result = dict() + booster = train( + {}, dtrain, num_boost_round=2, evals=evals, evals_result=eval_result + ) + assert isinstance(booster, Booster) + assert len(eval_result) > 0 + + with pytest.raises(TypeError): + train( + {}, + dtrain, + num_boost_round=2, + evals=[("eval_x", eval_x)], + evals_result=eval_result, + ) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tracker.py b/python/xorbits/_mars/learn/contrib/xgboost/tracker.py new file mode 100644 index 000000000..76167eb1e --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/tracker.py @@ -0,0 +1,503 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script is a variant of dmlc-core/dmlc_tracker/tracker.py, +which is a specialized version for xgboost tasks. +""" + +import argparse +import logging + +# pylint: disable=invalid-name, missing-docstring, too-many-arguments, too-many-locals +# pylint: disable=too-many-branches, too-many-statements, too-many-instance-attributes +import socket +import struct +import sys +from threading import Thread +from typing import Dict, List, Optional, Set, Tuple, Union + +_RingMap = Dict[int, Tuple[int, int]] +_TreeMap = Dict[int, List[int]] + +logger = logging.getLogger(__name__) + + +class ExSocket: + """ + Extension of socket to handle recv and send of special data + """ + + def __init__(self, sock: socket.socket) -> None: + self.sock = sock + + def recvall(self, nbytes: int) -> bytes: + """Receive number of bytes.""" + res = [] + nread = 0 + while nread < nbytes: + chunk = self.sock.recv(min(nbytes - nread, 1024)) + nread += len(chunk) + res.append(chunk) + return b"".join(res) + + def recvint(self) -> int: + """Receive an integer of 32 bytes""" + return struct.unpack("@i", self.recvall(4))[0] + + def sendint(self, value: int) -> None: + """Send an integer of 32 bytes""" + self.sock.sendall(struct.pack("@i", value)) + + def sendstr(self, value: str) -> None: + """Send a Python string""" + self.sendint(len(value)) + self.sock.sendall(value.encode()) + + def recvstr(self) -> str: + """Receive a Python string""" + slen = self.recvint() + return self.recvall(slen).decode() + + +# magic number used to verify existence of data +MAGIC_NUM = 0xFF99 + + +def get_some_ip(host: str) -> str: + """Get ip from host""" + return socket.getaddrinfo(host, None)[0][4][0] + + +def get_family(addr: str) -> int: + """Get network family from address.""" + return socket.getaddrinfo(addr, None)[0][0] + + +class WorkerEntry: + """Handler to each worker.""" + + def __init__(self, sock: socket.socket, s_addr: Tuple[str, int]): + worker = ExSocket(sock) + self.sock = worker + self.host = get_some_ip(s_addr[0]) + magic = worker.recvint() + assert magic == MAGIC_NUM, f"invalid magic number={magic} from {self.host}" + worker.sendint(MAGIC_NUM) + self.rank = worker.recvint() + self.world_size = worker.recvint() + self.jobid = worker.recvstr() + self.cmd = worker.recvstr() + self.wait_accept = 0 + self.port: Optional[int] = None + + def print(self, use_logger: bool) -> None: + """Execute the print command from worker.""" + msg = self.sock.recvstr() + # On dask we use print to avoid setting global verbosity. 
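+        # In this Mars variant, the tracker launched by StartTracker keeps the
+        # default use_logger=False, so worker messages are relayed via print;
+        # only the standalone tracker (start_rabit_tracker) opts into the logger.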
+ if use_logger: + logger.info(msg.strip()) + else: + print(msg.strip(), flush=True) + + def decide_rank(self, job_map: Dict[str, int]) -> int: + """Get the rank of current entry.""" + if self.rank >= 0: + return self.rank + if self.jobid != "NULL" and self.jobid in job_map: + return job_map[self.jobid] + return -1 + + def assign_rank( + self, + rank: int, + wait_conn: Dict[int, "WorkerEntry"], + tree_map: _TreeMap, + parent_map: Dict[int, int], + ring_map: _RingMap, + ) -> List[int]: + """Assign the rank for current entry.""" + self.rank = rank + nnset = set(tree_map[rank]) + rprev, rnext = ring_map[rank] + self.sock.sendint(rank) + # send parent rank + self.sock.sendint(parent_map[rank]) + # send world size + self.sock.sendint(len(tree_map)) + self.sock.sendint(len(nnset)) + # send the rprev and next link + for r in nnset: + self.sock.sendint(r) + # send prev link + if rprev not in (-1, rank): + nnset.add(rprev) + self.sock.sendint(rprev) + else: + self.sock.sendint(-1) + # send next link + if rnext not in (-1, rank): + nnset.add(rnext) + self.sock.sendint(rnext) + else: + self.sock.sendint(-1) + + return self._get_remote(wait_conn, nnset) + + def _get_remote( + self, wait_conn: Dict[int, "WorkerEntry"], nnset: Set[int] + ) -> List[int]: + while True: + ngood = self.sock.recvint() + goodset = set([]) + for _ in range(ngood): + goodset.add(self.sock.recvint()) + assert goodset.issubset(nnset) + badset = nnset - goodset + conset = [] + for r in badset: + if r in wait_conn: + conset.append(r) + self.sock.sendint(len(conset)) + self.sock.sendint(len(badset) - len(conset)) + for r in conset: + self.sock.sendstr(wait_conn[r].host) + port = wait_conn[r].port + assert port is not None + self.sock.sendint(port) + self.sock.sendint(r) + nerr = self.sock.recvint() + if nerr != 0: + continue + self.port = self.sock.recvint() + rmset = [] + # all connection was successuly setup + for r in conset: + wait_conn[r].wait_accept -= 1 + if wait_conn[r].wait_accept == 0: + rmset.append(r) + for r in rmset: + wait_conn.pop(r, None) + self.wait_accept = len(badset) - len(conset) + return rmset + + +class RabitTracker: + """ + tracker for rabit + """ + + def __init__( + self, host_ip: str, n_workers: int, port: int = 0, use_logger: bool = False + ) -> None: + """A Python implementation of RABIT tracker. + Parameters + .......... + use_logger: + Use logging.info for tracker print command. When set to False, Python print + function is used instead. 
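+        host_ip:
+            IP address for the tracker socket to bind and listen on.
+        n_workers:
+            Number of workers expected to connect to the tracker.
+        port:
+            Port to bind to; the default 0 lets the operating system pick
+            a free port, which is then exposed as ``self.port``.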
+ """ + sock = socket.socket(get_family(host_ip), socket.SOCK_STREAM) + sock.bind((host_ip, port)) + self.port = sock.getsockname()[1] + sock.listen(256) + self.sock = sock + self.host_ip = host_ip + self.thread: Optional[Thread] = None + self.n_workers = n_workers + self._use_logger = use_logger + logger.info("start listen on %s:%d", host_ip, self.port) + + def __del__(self) -> None: + if hasattr(self, "sock"): + self.sock.close() + + @staticmethod + def _get_neighbor(rank: int, n_workers: int) -> List[int]: + rank = rank + 1 + ret = [] + if rank > 1: + ret.append(rank // 2 - 1) + if rank * 2 - 1 < n_workers: + ret.append(rank * 2 - 1) + if rank * 2 < n_workers: + ret.append(rank * 2) + return ret + + def worker_envs(self) -> Dict[str, Union[str, int]]: + """ + get environment variables for workers + can be passed in as args or envs + """ + return {"DMLC_TRACKER_URI": self.host_ip, "DMLC_TRACKER_PORT": self.port} + + def _get_tree(self, n_workers: int) -> Tuple[_TreeMap, Dict[int, int]]: + tree_map: _TreeMap = {} + parent_map: Dict[int, int] = {} + for r in range(n_workers): + tree_map[r] = self._get_neighbor(r, n_workers) + parent_map[r] = (r + 1) // 2 - 1 + return tree_map, parent_map + + def find_share_ring( + self, tree_map: _TreeMap, parent_map: Dict[int, int], rank: int + ) -> List[int]: + """ + get a ring structure that tends to share nodes with the tree + return a list starting from rank + """ + nset = set(tree_map[rank]) + cset = nset - set([parent_map[rank]]) + if not cset: + return [rank] + rlst = [rank] + cnt = 0 + for v in cset: + vlst = self.find_share_ring(tree_map, parent_map, v) + cnt += 1 + if cnt == len(cset): + vlst.reverse() + rlst += vlst + return rlst + + def get_ring(self, tree_map: _TreeMap, parent_map: Dict[int, int]) -> _RingMap: + """ + get a ring connection used to recover local data + """ + assert parent_map[0] == -1 + rlst = self.find_share_ring(tree_map, parent_map, 0) + assert len(rlst) == len(tree_map) + ring_map: _RingMap = {} + n_workers = len(tree_map) + for r in range(n_workers): + rprev = (r + n_workers - 1) % n_workers + rnext = (r + 1) % n_workers + ring_map[rlst[r]] = (rlst[rprev], rlst[rnext]) + return ring_map + + def get_link_map(self, n_workers: int) -> Tuple[_TreeMap, Dict[int, int], _RingMap]: + """ + get the link map, this is a bit hacky, call for better algorithm + to place similar nodes together + """ + tree_map, parent_map = self._get_tree(n_workers) + ring_map = self.get_ring(tree_map, parent_map) + rmap = {0: 0} + k = 0 + for i in range(n_workers - 1): + k = ring_map[k][1] + rmap[k] = i + 1 + + ring_map_: _RingMap = {} + tree_map_: _TreeMap = {} + parent_map_: Dict[int, int] = {} + for k, v in ring_map.items(): + ring_map_[rmap[k]] = (rmap[v[0]], rmap[v[1]]) + for k, tree_nodes in tree_map.items(): + tree_map_[rmap[k]] = [rmap[x] for x in tree_nodes] + for k, parent in parent_map.items(): + if k != 0: + parent_map_[rmap[k]] = rmap[parent] + else: + parent_map_[rmap[k]] = -1 + return tree_map_, parent_map_, ring_map_ + + def accept_workers(self, n_workers: int) -> None: + """Wait for all workers to connect to the tracker.""" + # set of nodes that finishes the job + shutdown: Dict[int, WorkerEntry] = {} + # set of nodes that is waiting for connections + wait_conn: Dict[int, WorkerEntry] = {} + # maps job id to rank + job_map: Dict[str, int] = {} + # list of workers that is pending to be assigned rank + pending: List[WorkerEntry] = [] + # lazy initialize tree_map + tree_map = None + + while len(shutdown) != n_workers: + fd, s_addr = 
self.sock.accept() + s = WorkerEntry(fd, s_addr) + if s.cmd == "print": + s.print(self._use_logger) + continue + if s.cmd == "shutdown": + assert s.rank >= 0 and s.rank not in shutdown + assert s.rank not in wait_conn + shutdown[s.rank] = s + logger.debug("Received %s signal from %d", s.cmd, s.rank) + continue + assert s.cmd in ("start", "recover") + # lazily initialize the workers + if tree_map is None: + assert s.cmd == "start" + if s.world_size > 0: + n_workers = s.world_size + tree_map, parent_map, ring_map = self.get_link_map(n_workers) + # set of nodes that is pending for getting up + todo_nodes = list(range(n_workers)) + else: + assert s.world_size in (-1, n_workers) + if s.cmd == "recover": + assert s.rank >= 0 + + rank = s.decide_rank(job_map) + # batch assignment of ranks + if rank == -1: + assert todo_nodes + pending.append(s) + if len(pending) == len(todo_nodes): + pending.sort(key=lambda x: x.host) + for s in pending: + rank = todo_nodes.pop(0) + if s.jobid != "NULL": + job_map[s.jobid] = rank + s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map) + if s.wait_accept > 0: + wait_conn[rank] = s + logger.debug( + "Received %s signal from %s; assign rank %d", + s.cmd, + s.host, + s.rank, + ) + if not todo_nodes: + logger.info("@tracker All of %d nodes getting started", n_workers) + else: + s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map) + logger.debug("Received %s signal from %d", s.cmd, s.rank) + if s.wait_accept > 0: + wait_conn[rank] = s + logger.info("@tracker All nodes finishes job") + + def start(self, n_workers: int) -> None: + """Start the tracker, it will wait for `n_workers` to connect.""" + + def run() -> None: + self.accept_workers(n_workers) + + self.thread = Thread(target=run, args=(), daemon=True) + self.thread.start() + + def join(self) -> None: + """Wait for the tracker to finish.""" + while self.thread is not None and self.thread.is_alive(): + self.thread.join(100) + + def alive(self) -> bool: + """Whether the tracker thread is alive""" + return self.thread is not None and self.thread.is_alive() + + +def get_host_ip(host_ip: Optional[str] = None) -> str: + """Get the IP address of current host. If `host_ip` is not none then it will be + returned as it's + """ + if host_ip is None or host_ip == "auto": + host_ip = "ip" + + if host_ip == "dns": + host_ip = socket.getfqdn() + elif host_ip == "ip": + from socket import gaierror + + try: + host_ip = socket.gethostbyname(socket.getfqdn()) + except gaierror: + logger.debug( + "gethostbyname(socket.getfqdn()) failed... trying on hostname()" + ) + host_ip = socket.gethostbyname(socket.gethostname()) + if host_ip.startswith("127."): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + # doesn't have to be reachable + s.connect(("10.255.255.255", 1)) + host_ip = s.getsockname()[0] + + assert host_ip is not None + return host_ip + + +def start_rabit_tracker(args: argparse.Namespace) -> None: + """Standalone function to start rabit tracker. + Parameters + ---------- + args: arguments to start the rabit tracker. 
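+
+    The tracker's connection information is written to stdout between the
+    ``DMLC_TRACKER_ENV_START`` and ``DMLC_TRACKER_ENV_END`` markers, one
+    ``KEY=value`` pair per line (the address and port below are illustrative)::
+
+        DMLC_TRACKER_ENV_START
+        DMLC_NUM_WORKER=4
+        DMLC_NUM_SERVER=0
+        DMLC_TRACKER_URI=192.0.2.10
+        DMLC_TRACKER_PORT=9091
+        DMLC_TRACKER_ENV_END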
+ """ + envs = {"DMLC_NUM_WORKER": args.num_workers, "DMLC_NUM_SERVER": args.num_servers} + rabit = RabitTracker( + host_ip=get_host_ip(args.host_ip), n_workers=args.num_workers, use_logger=True + ) + envs.update(rabit.worker_envs()) + rabit.start(args.num_workers) + sys.stdout.write("DMLC_TRACKER_ENV_START\n") + # simply write configuration to stdout + for k, v in envs.items(): + sys.stdout.write(f"{k}={v}\n") + sys.stdout.write("DMLC_TRACKER_ENV_END\n") + sys.stdout.flush() + rabit.join() + + +def main() -> None: + """Main function if tracker is executed in standalone mode.""" + parser = argparse.ArgumentParser(description="Rabit Tracker start.") + parser.add_argument( + "--num-workers", + required=True, + type=int, + help="Number of worker process to be launched.", + ) + parser.add_argument( + "--num-servers", + default=0, + type=int, + help="Number of server process to be launched. Only used in PS jobs.", + ) + parser.add_argument( + "--host-ip", + default=None, + type=str, + help=( + "Host IP addressed, this is only needed " + + "if the host IP cannot be automatically guessed." + ), + ) + parser.add_argument( + "--log-level", + default="INFO", + type=str, + choices=["INFO", "DEBUG"], + help="Logging level of the logger.", + ) + args = parser.parse_args() + + fmt = "%(asctime)s %(levelname)s %(message)s" + if args.log_level == "INFO": + level = logging.INFO + elif args.log_level == "DEBUG": + level = logging.DEBUG + else: + raise RuntimeError(f"Unknown logging level {args.log_level}") + + logging.basicConfig(format=fmt, level=level) + + if args.num_servers == 0: + start_rabit_tracker(args) + else: + raise RuntimeError("Do not yet support start ps tracker in standalone mode.") + + +if __name__ == "__main__": + main() diff --git a/python/xorbits/_mars/learn/contrib/xgboost/train.py b/python/xorbits/_mars/learn/contrib/xgboost/train.py new file mode 100644 index 000000000..11df52972 --- /dev/null +++ b/python/xorbits/_mars/learn/contrib/xgboost/train.py @@ -0,0 +1,280 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import pickle +from collections import OrderedDict, defaultdict + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....core import OutputType +from ....core.context import get_context +from ....core.operand import MergeDictOperand +from ....serialization.serializables import DictField, FieldTypes, KeyField, ListField +from ....utils import ensure_own_data +from .dmatrix import ToDMatrix, to_dmatrix +from .start_tracker import StartTracker + +logger = logging.getLogger(__name__) + + +def _on_serialize_evals(evals_val): + if evals_val is None: + return None + return [list(x) for x in evals_val] + + +class XGBTrain(MergeDictOperand): + _op_type_ = OperandDef.XGBOOST_TRAIN + + params = DictField("params", key_type=FieldTypes.string, default=None) + dtrain = KeyField("dtrain", default=None) + evals = ListField("evals", on_serialize=_on_serialize_evals, default=None) + kwargs = DictField("kwargs", key_type=FieldTypes.string, default=None) + tracker = KeyField("tracker", default=None) + + def __init__(self, gpu=None, **kw): + super().__init__(gpu=gpu, **kw) + if self.output_types is None: + self.output_types = [OutputType.object] + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.dtrain = self._inputs[0] + rest = self._inputs[1:] + if self.tracker is not None: + self.tracker = self._inputs[-1] + rest = rest[:-1] + if self.evals is not None: + evals_dict = OrderedDict(self.evals) + new_evals_dict = OrderedDict() + for new_key, val in zip(rest, evals_dict.values()): + new_evals_dict[new_key] = val + self.evals = list(new_evals_dict.items()) + + def __call__(self): + inputs = [self.dtrain] + if self.evals is not None: + inputs.extend(e[0] for e in self.evals) + return self.new_tileable(inputs) + + @staticmethod + def _get_dmatrix_chunks_workers(ctx, dmatrix): + # dmatrix_chunk.inputs is concat, and concat's input is the coallocated chunks + metas = ctx.get_chunks_meta( + [c.inputs[0].inputs[0].key for c in dmatrix.chunks], fields=["bands"] + ) + return [m["bands"][0][0] for m in metas] + + @classmethod + def tile(cls, op: "XGBTrain"): + ctx = get_context() + + inp = op.inputs[0] + in_chunks = inp.chunks + workers = cls._get_dmatrix_chunks_workers(ctx, inp) + worker_to_in_chunks = dict(zip(workers, in_chunks)) + n_chunk = len(in_chunks) + out_chunks = [] + worker_to_evals = defaultdict(dict) + if op.evals is not None: + for dm, ev in op.evals: + ev_workers = cls._get_dmatrix_chunks_workers(ctx, dm) + for ev_worker, ev_chunk in zip(ev_workers, dm.chunks): + worker_to_evals[ev_worker][ev] = ev_chunk + + all_workers = set(workers) + all_workers.update(worker_to_evals) + + i = itertools.count(n_chunk) + tracker_chunk = StartTracker( + n_workers=len(all_workers), pure_depends=[True] * n_chunk + ).new_chunk(in_chunks, shape=()) + for worker in all_workers: + chunk_op = op.copy().reset_key() + chunk_op.expect_worker = worker + chunk_op.tracker = tracker_chunk + if worker in worker_to_in_chunks: + in_chunk = worker_to_in_chunks[worker] + else: + in_chunk_op = ToDMatrix( + data=None, + label=None, + weight=None, + base_margin=None, + missing=inp.op.missing, + feature_names=inp.op.feature_names, + feature_types=inp.op.feature_types, + _output_types=inp.op.output_types, + ) + params = inp.params.copy() + params["index"] = (next(i), 0) + params["shape"] = (0, inp.shape[1]) + in_chunk = in_chunk_op.new_chunk(None, kws=[params]) + chunk_evals = [] + for dm, ev in op.evals: + try: + chunk_evals.append((worker_to_evals[worker][ev], ev)) + except KeyError: + # create a new eval chunk + eval_chunk_op = ToDMatrix( + data=None, + label=None, + weight=None, + base_margin=None, + 
missing=dm.op.missing, + feature_names=dm.op.feature_names, + feature_types=dm.op.feature_types, + _output_types=dm.op.output_types, + ) + params = dm.params.copy() + params["index"] = (0, 0) + params["shape"] = (0, dm.shape[1]) + eval_chunk = eval_chunk_op.new_chunk(None, kws=[params]) + chunk_evals.append((eval_chunk, ev)) + chunk_op.evals = chunk_evals + input_chunks = ( + [in_chunk] + [pair[0] for pair in chunk_evals] + [tracker_chunk] + ) + out_chunk = chunk_op.new_chunk( + input_chunks, shape=(np.nan,), index=in_chunk.index[:1] + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tileables( + op.inputs, chunks=out_chunks, nsplits=((np.nan for _ in out_chunks),) + ) + + @classmethod + def execute(cls, ctx, op: "XGBTrain"): + if op.merge: + return super().execute(ctx, op) + + from xgboost import rabit, train + + params = op.params.copy() + + n_threads = 0 + if op.tracker is None: + # non distributed + ctx_n_threads = -1 + else: + # distributed + ctx_n_threads = ctx.get_slots() + + # fix parallelism on nodes + for p in ["nthread", "n_jobs"]: + if ( + params.get(p, None) is not None + and params.get(p, ctx_n_threads) != ctx_n_threads + ): # pragma: no cover + logger.info("Overriding `nthreads` defined in Mars worker.") + n_threads = params[p] + break + if n_threads == 0 or n_threads is None: # pragma: no branch + n_threads = ctx_n_threads + params.update({"nthread": n_threads, "n_jobs": n_threads}) + + dtrain = ToDMatrix.get_xgb_dmatrix( + ensure_own_data(ctx[op.dtrain.key]), nthread=n_threads + ) + evals = tuple() + if op.evals is not None: + eval_dmatrices = [ + ToDMatrix.get_xgb_dmatrix( + ensure_own_data(ctx[t[0].key]), nthread=n_threads + ) + for t in op.evals + ] + evals = tuple((m, ev[1]) for m, ev in zip(eval_dmatrices, op.evals)) + + if op.tracker is None: + # non distributed + local_history = dict() + kwargs = dict() if op.kwargs is None else op.kwargs + bst = train( + params, dtrain, evals=evals, evals_result=local_history, **kwargs + ) + ctx[op.outputs[0].key] = { + "booster": pickle.dumps(bst), + "history": local_history, + } + else: + # distributed + logger.debug("Distributed train params: %r", params) + + rabit_args = ctx[op.tracker.key] + rabit.init( + [ + arg.tobytes() if isinstance(arg, memoryview) else arg + for arg in rabit_args + ] + ) + try: + logger.debug( + "Start to train data, train size: %s, evals sizes: %s", + dtrain.num_row(), + [ev[0].num_row() for ev in evals], + ) + local_history = dict() + bst = train( + params, dtrain, evals=evals, evals_result=local_history, **op.kwargs + ) + ret = {"booster": pickle.dumps(bst), "history": local_history} + if rabit.get_rank() != 0: + ret = {} + ctx[op.outputs[0].key] = ret + finally: + rabit.finalize() + + +def train(params, dtrain, evals=(), **kwargs): + """ + Train XGBoost model in Mars manner. + + Parameters + ---------- + Parameters are the same as `xgboost.train`. 
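+    In addition, a few Mars-specific keyword arguments are consumed here
+    rather than passed through: ``evals_result`` (a dict that is filled in
+    with the evaluation history), plus ``session`` and ``run_kwargs``
+    (forwarded to ``execute`` when the training tileable is run).
+
+    A minimal sketch, patterned after the accompanying tests (``X`` and ``y``
+    are Mars tensors)::
+
+        dtrain = MarsDMatrix(X, y)
+        booster = train({}, dtrain, num_boost_round=2)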
+ + Returns + ------- + results: Booster + """ + + evals_result = kwargs.pop("evals_result", dict()) + session = kwargs.pop("session", None) + run_kwargs = kwargs.pop("run_kwargs", dict()) + + processed_evals = [] + if evals: + for eval_dmatrix, name in evals: + if not isinstance(name, str): + raise TypeError("evals must a list of pairs (DMatrix, string)") + if hasattr(eval_dmatrix, "op") and isinstance(eval_dmatrix.op, ToDMatrix): + processed_evals.append((eval_dmatrix, name)) + else: + processed_evals.append((to_dmatrix(eval_dmatrix), name)) + + op = XGBTrain(params=params, dtrain=dtrain, evals=processed_evals, kwargs=kwargs) + t = op() + ret = t.execute(session=session, **run_kwargs).fetch(session=session) + evals_result.update(ret["history"]) + bst = pickle.loads(ret["booster"]) + num_class = params.get("num_class") + if num_class: + bst.set_attr(num_class=str(num_class)) + return bst diff --git a/python/xorbits/_mars/learn/datasets/__init__.py b/python/xorbits/_mars/learn/datasets/__init__.py new file mode 100644 index 000000000..62a6a122b --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .samples_generator import ( + make_blobs, + make_classification, + make_low_rank_matrix, + make_regression, +) diff --git a/python/xorbits/_mars/learn/datasets/samples_generator.py b/python/xorbits/_mars/learn/datasets/samples_generator.py new file mode 100644 index 000000000..b1b14b40c --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/samples_generator.py @@ -0,0 +1,633 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +from collections.abc import Iterable + +import numpy as np + +from ... import tensor as mt +from ...tensor import linalg +from ...tensor.utils import check_random_state +from ..utils import check_array +from ..utils import shuffle as util_shuffle + +# ------------------------------------------------------------------- +# Original implementation is in `sklearn.datasets.samples_generator`. +# ------------------------------------------------------------------- + + +def make_classification( + n_samples=100, + n_features=20, + n_informative=2, + n_redundant=2, + n_repeated=0, + n_classes=2, + n_clusters_per_class=2, + weights=None, + flip_y=0.01, + class_sep=1.0, + hypercube=True, + shift=0.0, + scale=1.0, + shuffle=True, + random_state=None, +): + """Generate a random n-class classification problem. 
+ + This initially creates clusters of points normally distributed (std=1) + about vertices of an ``n_informative``-dimensional hypercube with sides of + length ``2*class_sep`` and assigns an equal number of clusters to each + class. It introduces interdependence between these features and adds + various types of further noise to the data. + + Without shuffling, ``X`` horizontally stacks features in the following + order: the primary ``n_informative`` features, followed by ``n_redundant`` + linear combinations of the informative features, followed by ``n_repeated`` + duplicates, drawn randomly with replacement from the informative and + redundant features. The remaining features are filled with random noise. + Thus, without shuffling, all useful features are contained in the columns + ``X[:, :n_informative + n_redundant + n_repeated]``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, optional (default=100) + The number of samples. + + n_features : int, optional (default=20) + The total number of features. These comprise ``n_informative`` + informative features, ``n_redundant`` redundant features, + ``n_repeated`` duplicated features and + ``n_features-n_informative-n_redundant-n_repeated`` useless features + drawn at random. + + n_informative : int, optional (default=2) + The number of informative features. Each class is composed of a number + of gaussian clusters each located around the vertices of a hypercube + in a subspace of dimension ``n_informative``. For each cluster, + informative features are drawn independently from N(0, 1) and then + randomly linearly combined within each cluster in order to add + covariance. The clusters are then placed on the vertices of the + hypercube. + + n_redundant : int, optional (default=2) + The number of redundant features. These features are generated as + random linear combinations of the informative features. + + n_repeated : int, optional (default=0) + The number of duplicated features, drawn randomly from the informative + and the redundant features. + + n_classes : int, optional (default=2) + The number of classes (or labels) of the classification problem. + + n_clusters_per_class : int, optional (default=2) + The number of clusters per class. + + weights : list of floats or None (default=None) + The proportions of samples assigned to each class. If None, then + classes are balanced. Note that if ``len(weights) == n_classes - 1``, + then the last class weight is automatically inferred. + More than ``n_samples`` samples may be returned if the sum of + ``weights`` exceeds 1. + + flip_y : float, optional (default=0.01) + The fraction of samples whose class are randomly exchanged. Larger + values introduce noise in the labels and make the classification + task harder. + + class_sep : float, optional (default=1.0) + The factor multiplying the hypercube size. Larger values spread + out the clusters/classes and make the classification task easier. + + hypercube : boolean, optional (default=True) + If True, the clusters are put on the vertices of a hypercube. If + False, the clusters are put on the vertices of a random polytope. + + shift : float, array of shape [n_features] or None, optional (default=0.0) + Shift features by the specified value. If None, then features + are shifted by a random value drawn in [-class_sep, class_sep]. + + scale : float, array of shape [n_features] or None, optional (default=1.0) + Multiply features by the specified value. 
If None, then features + are scaled by a random value drawn in [1, 100]. Note that scaling + happens after shifting. + + shuffle : boolean, optional (default=True) + Shuffle the samples and the features. + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : tensor of shape [n_samples, n_features] + The generated samples. + + y : tensor of shape [n_samples] + The integer labels for class membership of each sample. + + Notes + ----- + The algorithm is adapted from Guyon [1] and was designed to generate + the "Madelon" dataset. + + References + ---------- + .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable + selection benchmark", 2003. + + See also + -------- + make_blobs: simplified variant + make_multilabel_classification: unrelated generator for multilabel tasks + """ + from sklearn.datasets._samples_generator import _generate_hypercube + + generator = check_random_state(random_state) + np_generator = generator.to_numpy() + + # Count features, clusters and samples + if n_informative + n_redundant + n_repeated > n_features: + raise ValueError( + "Number of informative, redundant and repeated " + "features must sum to less than the number of total" + " features" + ) + # Use log2 to avoid overflow errors + if n_informative < np.log2(n_classes * n_clusters_per_class): + raise ValueError( + "n_classes * n_clusters_per_class must" + " be smaller or equal 2 ** n_informative" + ) + if weights and len(weights) not in [n_classes, n_classes - 1]: + raise ValueError("Weights specified but incompatible with number of classes.") + + n_useless = n_features - n_informative - n_redundant - n_repeated + n_clusters = n_classes * n_clusters_per_class + + if weights and len(weights) == (n_classes - 1): + weights = weights + [1.0 - sum(weights)] + + if weights is None: + weights = [1.0 / n_classes] * n_classes + weights[-1] = 1.0 - sum(weights[:-1]) + + # Distribute samples among clusters by weight + n_samples_per_cluster = [ + int(n_samples * weights[k % n_classes] / n_clusters_per_class) + for k in range(n_clusters) + ] + + for i in range(n_samples - sum(n_samples_per_cluster)): + n_samples_per_cluster[i % n_clusters] += 1 + + # Initialize X and y + X = mt.zeros((n_samples, n_features)) + y = mt.zeros(n_samples, dtype=mt.int) + + # Build the polytope whose vertices become cluster centroids + centroids = _generate_hypercube(n_clusters, n_informative, np_generator).astype( + float, copy=False + ) + centroids *= 2 * class_sep + centroids -= class_sep + if not hypercube: + centroids *= np_generator.rand(n_clusters, 1) + centroids *= np_generator.rand(1, n_informative) + + # Initially draw informative features from the standard normal + X[:, :n_informative] = generator.randn(n_samples, n_informative) + + # Create each cluster; a variant of make_blobs + stop = 0 + for k, centroid in enumerate(centroids): + start, stop = stop, stop + n_samples_per_cluster[k] + y[start:stop] = k % n_classes # assign labels + X_k = X[start:stop, :n_informative] # slice a view of the cluster + + A = 2 * generator.rand(n_informative, n_informative) - 1 + X_k[...] 
= mt.dot(X_k, A) # introduce random covariance + + X_k += centroid # shift the cluster to a vertex + + # Create redundant features + if n_redundant > 0: + B = 2 * generator.rand(n_informative, n_redundant) - 1 + X[:, n_informative : n_informative + n_redundant] = mt.dot( + X[:, :n_informative], B + ) + + # Repeat some features + if n_repeated > 0: + n = n_informative + n_redundant + indices = ((n - 1) * generator.rand(n_repeated) + 0.5).astype(mt.intp) + X[:, n : n + n_repeated] = X[:, indices] + + # Fill useless features + if n_useless > 0: + X[:, -n_useless:] = generator.randn(n_samples, n_useless) + + # Randomly replace labels + if flip_y >= 0.0: + flip_mask = generator.rand(n_samples) < flip_y + y = mt.where(flip_mask, generator.randint(n_classes, size=len(y)), y) + + # Randomly shift and scale + if shift is None: + shift = (2 * generator.rand(n_features) - 1) * class_sep + X += shift + + if scale is None: + scale = 1 + 100 * generator.rand(n_features) + X *= scale + + if shuffle: + # Randomly permute samples + X, y = util_shuffle(X, y, random_state=generator, axes=(0, 1)) + + return X, y + + +def make_regression( + n_samples=100, + n_features=100, + *, + n_informative=10, + n_targets=1, + bias=0.0, + effective_rank=None, + tail_strength=0.5, + noise=0.0, + shuffle=True, + coef=False, + random_state=None, +): + """Generate a random regression problem. + + The input set can either be well conditioned (by default) or have a low + rank-fat tail singular profile. See :func:`make_low_rank_matrix` for + more details. + + The output is generated by applying a (potentially biased) random linear + regression model with `n_informative` nonzero regressors to the previously + generated input and some gaussian centered noise with some adjustable + scale. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=100 + The number of features. + + n_informative : int, default=10 + The number of informative features, i.e., the number of features used + to build the linear model used to generate the output. + + n_targets : int, default=1 + The number of regression targets, i.e., the dimension of the y output + vector associated with a sample. By default, the output is a scalar. + + bias : float, default=0.0 + The bias term in the underlying linear model. + + effective_rank : int, default=None + if not None: + The approximate number of singular vectors required to explain most + of the input data by linear combinations. Using this kind of + singular spectrum in the input allows the generator to reproduce + the correlations often observed in practice. + if None: + The input set is well conditioned, centered and gaussian with + unit variance. + + tail_strength : float, default=0.5 + The relative importance of the fat noisy tail of the singular values + profile if `effective_rank` is not None. When a float, it should be + between 0 and 1. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + shuffle : bool, default=True + Shuffle the samples and the features. + + coef : bool, default=False + If True, the coefficients of the underlying linear model are returned. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ + Returns + ------- + X : tensor of shape (n_samples, n_features) + The input samples. + + y : tensor of shape (n_samples,) or (n_samples, n_targets) + The output values. + + coef : tensor of shape (n_features,) or (n_features, n_targets) + The coefficient of the underlying linear model. It is returned only if + coef is True. + """ + n_informative = min(n_features, n_informative) + generator = check_random_state(random_state) + + if effective_rank is None: + # Randomly generate a well conditioned input set + X = generator.randn(n_samples, n_features) + + else: + # Randomly generate a low rank, fat tail input set + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=effective_rank, + tail_strength=tail_strength, + random_state=generator, + ) + + # Generate a ground truth model with only n_informative features being non + # zeros (the other features are not correlated to y and should be ignored + # by a sparsifying regularizers such as L1 or elastic net) + ground_truth = mt.zeros((n_features, n_targets)) + ground_truth[:n_informative, :] = 100 * generator.rand(n_informative, n_targets) + + y = mt.dot(X, ground_truth) + bias + + # Add noise + if noise > 0.0: + y += generator.normal(scale=noise, size=y.shape) + + # Randomly permute samples and features + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + indices = mt.arange(n_features) + generator.shuffle(indices) + X[:, :] = X[:, indices] + ground_truth = ground_truth[indices] + + y = mt.squeeze(y) + + if coef: + return X, y, mt.squeeze(ground_truth) + + else: + return X, y + + +def make_blobs( + n_samples=100, + n_features=2, + centers=None, + cluster_std=1.0, + center_box=(-10.0, 10.0), + shuffle=True, + random_state=None, +): + """Generate isotropic Gaussian blobs for clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or array-like, optional (default=100) + If int, it is the total number of points equally divided among + clusters. + If array-like, each element of the sequence indicates + the number of samples per cluster. + + n_features : int, optional (default=2) + The number of features for each sample. + + centers : int or array of shape [n_centers, n_features], optional + (default=None) + The number of centers to generate, or the fixed center locations. + If n_samples is an int and centers is None, 3 centers are generated. + If n_samples is array-like, centers must be + either None or an array of length equal to the length of n_samples. + + cluster_std : float or sequence of floats, optional (default=1.0) + The standard deviation of the clusters. + + center_box : pair of floats (min, max), optional (default=(-10.0, 10.0)) + The bounding box for each cluster center when centers are + generated at random. + + shuffle : boolean, optional (default=True) + Shuffle the samples. + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : tensor of shape [n_samples, n_features] + The generated samples. + + y : tensor of shape [n_samples] + The integer labels for cluster membership of each sample. + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2, + ... 
random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0]) + >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2, + ... random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0]) + + See also + -------- + make_classification: a more intricate variant + """ + from ..utils.checks import AssertAllFinite + + generator = check_random_state(random_state) + + if isinstance(n_samples, numbers.Integral): + # Set n_centers by looking at centers arg + if centers is None: + centers = 3 + + if isinstance(centers, numbers.Integral): + n_centers = centers + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + + else: + centers = check_array(centers) + n_features = centers.shape[1] + n_centers = centers.shape[0] + + else: + # Set n_centers by looking at [n_samples] arg + n_centers = len(n_samples) + if centers is None: + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + try: + assert len(centers) == n_centers + except TypeError: + raise ValueError( + f"Parameter `centers` must be array-like. Got {centers!r} instead" + ) + except AssertionError: + raise ValueError( + "Length of `n_samples` not consistent" + f" with number of centers. Got n_samples = {n_samples} " + f"and centers = {centers}" + ) + else: + centers = check_array(centers) + n_features = centers.shape[1] + + # stds: if cluster_std is given as list, it must be consistent + # with the n_centers + if hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers: + if isinstance(centers.op, AssertAllFinite): + centers = centers.op.inputs[0] + raise ValueError( + "Length of `clusters_std` not consistent with " + f"number of centers. Got centers = {centers} " + f"and cluster_std = {cluster_std}" + ) + + if isinstance(cluster_std, numbers.Real): + cluster_std = mt.full(len(centers), cluster_std) + + X = [] + y = [] + + if isinstance(n_samples, Iterable): + n_samples_per_center = n_samples + else: + n_samples_per_center = [int(n_samples // n_centers)] * n_centers + + for i in range(n_samples % n_centers): + n_samples_per_center[i] += 1 + + for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)): + if n == 0: + continue + X.append(generator.normal(loc=centers[i], scale=std, size=(n, n_features))) + y += [i] * n + + X = mt.concatenate(X) + y = mt.array(y) + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + return X, y + + +def make_low_rank_matrix( + n_samples=100, + n_features=100, + effective_rank=10, + tail_strength=0.5, + random_state=None, + chunk_size=None, +): + """Generate a mostly low rank matrix with bell-shaped singular values + + Most of the variance can be explained by a bell-shaped curve of width + effective_rank: the low rank part of the singular values profile is:: + + (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2) + + The remaining singular values' tail is fat, decreasing as:: + + tail_strength * exp(-0.1 * i / effective_rank). + + The low rank part of the profile can be considered the structured + signal part of the data while the tail can be considered the noisy + part of the data that cannot be summarized by a low number of linear + components (singular vectors). + + This kind of singular profiles is often seen in practice, for instance: + - gray level pictures of faces + - TF-IDF vectors of text documents crawled from the web + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + n_samples : int, optional (default=100) + The number of samples. + + n_features : int, optional (default=100) + The number of features. + + effective_rank : int, optional (default=10) + The approximate number of singular vectors required to explain most of + the data by linear combinations. + + tail_strength : float between 0.0 and 1.0, optional (default=0.5) + The relative importance of the fat noisy tail of the singular values + profile. + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + X : array of shape [n_samples, n_features] + The matrix. + """ + generator = check_random_state(random_state) + n = min(n_samples, n_features) + + # Random (ortho normal) vectors + u, _ = linalg.qr(generator.randn(n_samples, n, chunk_size=chunk_size)) + v, _ = linalg.qr(generator.randn(n_features, n, chunk_size=chunk_size)) + + # Index of the singular values + singular_ind = mt.arange(n, dtype=mt.float64, chunk_size=chunk_size) + + # Build the singular profile by assembling signal and noise components + low_rank = (1 - tail_strength) * mt.exp(-1.0 * (singular_ind / effective_rank) ** 2) + tail = tail_strength * mt.exp(-0.1 * singular_ind / effective_rank) + s = mt.identity(n) * (low_rank + tail) + + return mt.dot(mt.dot(u, s), v.T) diff --git a/python/xorbits/_mars/learn/datasets/tests/__init__.py b/python/xorbits/_mars/learn/datasets/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/datasets/tests/test_samples_generator.py b/python/xorbits/_mars/learn/datasets/tests/test_samples_generator.py new file mode 100644 index 000000000..8c026a556 --- /dev/null +++ b/python/xorbits/_mars/learn/datasets/tests/test_samples_generator.py @@ -0,0 +1,336 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from functools import partial + +import numpy as np +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_raise_message, + assert_raises, +) + +from .... 
import tensor as mt +from ....tensor.linalg import svd +from ..samples_generator import ( + make_blobs, + make_classification, + make_low_rank_matrix, + make_regression, +) + + +def test_make_classification(setup): + weights = [0.1, 0.25] + X, y = make_classification( + n_samples=100, + n_features=20, + n_informative=5, + n_redundant=1, + n_repeated=1, + n_classes=3, + n_clusters_per_class=1, + hypercube=False, + shift=None, + scale=None, + weights=weights, + random_state=0, + flip_y=-1, + ) + + assert weights == [0.1, 0.25] + assert X.shape == (100, 20) + assert y.shape == (100,) + assert mt.unique(y).to_numpy().shape == (3,) + assert (y == 0).sum().to_numpy() == 10 + assert (y == 1).sum().to_numpy() == 25 + assert (y == 2).sum().to_numpy() == 65 + + # Test for n_features > 30 + X, y = make_classification( + n_samples=2000, + n_features=31, + n_informative=31, + n_redundant=0, + n_repeated=0, + hypercube=True, + scale=0.5, + random_state=0, + ) + + X = X.to_numpy() + assert X.shape == (2000, 31) + assert y.shape == (2000,) + assert ( + np.unique(X.view([("", X.dtype)] * X.shape[1])) + .view(X.dtype) + .reshape(-1, X.shape[1]) + .shape[0] + == 2000 + ) + + +def test_make_classification_informative_features(setup): + """Test the construction of informative features in make_classification + + Also tests `n_clusters_per_class`, `n_classes`, `hypercube` and + fully-specified `weights`. + """ + # Create very separate clusters; check that vertices are unique and + # correspond to classes + class_sep = 1e6 + make = partial( + make_classification, + class_sep=class_sep, + n_redundant=0, + n_repeated=0, + flip_y=0, + shift=0, + scale=1, + shuffle=False, + ) + + for n_informative, weights, n_clusters_per_class in [ + (2, [1], 1), + (2, [1 / 3] * 3, 1), + (2, [1 / 4] * 4, 1), + (2, [1 / 2] * 2, 2), + (2, [3 / 4, 1 / 4], 2), + (10, [1 / 3] * 3, 10), + (np.int_(64), [1], 1), + ]: + n_classes = len(weights) + n_clusters = n_classes * n_clusters_per_class + n_samples = n_clusters * 50 + + for hypercube in (False, True): + generated = make( + n_samples=n_samples, + n_classes=n_classes, + weights=weights, + n_features=n_informative, + n_informative=n_informative, + n_clusters_per_class=n_clusters_per_class, + hypercube=hypercube, + random_state=0, + ) + + X, y = mt.ExecutableTuple(generated).execute().fetch() + assert X.shape == (n_samples, n_informative) + assert y.shape == (n_samples,) + + # Cluster by sign, viewed as strings to allow uniquing + signs = np.sign(X) + signs = signs.view(dtype=f"|S{signs.strides[0]}") + unique_signs, cluster_index = np.unique(signs, return_inverse=True) + + assert len(unique_signs) == n_clusters + + clusters_by_class = defaultdict(set) + for cluster, cls in zip(cluster_index, y): + clusters_by_class[cls].add(cluster) + for clusters in clusters_by_class.values(): + assert len(clusters) == n_clusters_per_class + assert len(clusters_by_class) == n_classes + + assert_array_almost_equal( + np.bincount(y) / len(y) // weights, + [1] * n_classes, + err_msg="Wrong number of samples per class", + ) + + # Ensure on vertices of hypercube + for cluster in range(len(unique_signs)): + centroid = X[cluster_index == cluster].mean(axis=0) + if hypercube: + assert_array_almost_equal( + np.abs(centroid) / class_sep, + np.ones(n_informative), + decimal=5, + err_msg="Clusters are not centered on hypercube vertices", + ) + else: + assert_raises( + AssertionError, + assert_array_almost_equal, + np.abs(centroid) / class_sep, + np.ones(n_informative), + decimal=5, + err_msg="Clusters should not be 
centered " + "on hypercube vertices", + ) + + assert_raises( + ValueError, + make, + n_features=2, + n_informative=2, + n_classes=5, + n_clusters_per_class=1, + ) + assert_raises( + ValueError, + make, + n_features=2, + n_informative=2, + n_classes=3, + n_clusters_per_class=2, + ) + + +def test_make_regression(setup): + X, y, c = make_regression( + n_samples=100, + n_features=10, + n_informative=3, + effective_rank=5, + coef=True, + bias=0.0, + noise=1.0, + random_state=0, + ) + X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch() + + assert X.shape == (100, 10), "X shape mismatch" + assert y.shape == (100,), "y shape mismatch" + assert c.shape == (10,), "coef shape mismatch" + assert sum(c != 0.0) == 3, "Unexpected number of informative features" + + # Test that y ~= np.dot(X, c) + bias + N(0, 1.0). + assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1) + + # Test with small number of features. + X, y = make_regression(n_samples=100, n_features=1) # n_informative=3 + assert X.shape == (100, 1) + + +def test_make_regression_multitarget(): + X, y, c = make_regression( + n_samples=100, + n_features=10, + n_informative=3, + n_targets=3, + coef=True, + noise=1.0, + random_state=0, + ) + X, y, c = mt.ExecutableTuple((X, y, c)).execute().fetch() + + assert X.shape == (100, 10), "X shape mismatch" + assert y.shape == (100, 3), "y shape mismatch" + assert c.shape == (10, 3), "coef shape mismatch" + np.testing.assert_array_equal( + sum(c != 0.0), 3, "Unexpected number of informative features" + ) + + # Test that y ~= np.dot(X, c) + bias + N(0, 1.0) + assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1) + + +def test_make_blobs(setup): + cluster_stds = np.array([0.05, 0.2, 0.4]) + cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) + X, y = make_blobs( + random_state=0, + n_samples=50, + n_features=2, + centers=cluster_centers, + cluster_std=cluster_stds, + ) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + assert X.shape == (50, 2) + assert y.shape == (50,) + assert np.unique(y).shape == (3,) + for i, (ctr, std) in enumerate(zip(cluster_centers, cluster_stds)): + assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std") + + +def test_make_blobs_n_samples_list(setup): + n_samples = [50, 30, 20] + X, y = make_blobs(n_samples=n_samples, n_features=2, random_state=0) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + + assert X.shape == (sum(n_samples), 2) + assert all(np.bincount(y, minlength=len(n_samples)) == n_samples) is True + + +def test_make_blobs_n_samples_list_with_centers(setup): + n_samples = [20, 20, 20] + centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) + cluster_stds = np.array([0.05, 0.2, 0.4]) + X, y = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=cluster_stds, random_state=0 + ) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + + assert X.shape == (sum(n_samples), 2) + assert all(np.bincount(y, minlength=len(n_samples)) == n_samples) is True + for i, (ctr, std) in enumerate(zip(centers, cluster_stds)): + assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std") + + +def test_make_blobs_n_samples_centers_none(setup): + for n_samples in [[5, 3, 0], np.array([5, 3, 0]), tuple([5, 3, 0])]: + centers = None + X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=0) + X, y = mt.ExecutableTuple((X, y)).execute().fetch() + + assert X.shape == (sum(n_samples), 2) + assert all(np.bincount(y, minlength=len(n_samples)) == n_samples) is True + + +def 
test_make_blobs_error(setup): + n_samples = [20, 20, 20] + centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) + cluster_stds = np.array([0.05, 0.2, 0.4]) + wrong_centers_msg = ( + "Length of `n_samples` not consistent " + f"with number of centers. Got n_samples = {n_samples} " + f"and centers = {centers[:-1]}" + ) + assert_raise_message( + ValueError, wrong_centers_msg, make_blobs, n_samples, centers=centers[:-1] + ) + wrong_std_msg = ( + "Length of `clusters_std` not consistent with " + f"number of centers. Got centers = {mt.tensor(centers)} " + f"and cluster_std = {cluster_stds[:-1]}" + ) + assert_raise_message( + ValueError, + wrong_std_msg, + make_blobs, + n_samples, + centers=centers, + cluster_std=cluster_stds[:-1], + ) + wrong_type_msg = f"Parameter `centers` must be array-like. Got {3!r} instead" + assert_raise_message(ValueError, wrong_type_msg, make_blobs, n_samples, centers=3) + + +def test_make_low_rank_matrix(setup): + X = make_low_rank_matrix( + n_samples=50, + n_features=25, + effective_rank=5, + tail_strength=0.01, + random_state=0, + ) + + assert X.shape == (50, 25) + + _, s, _ = svd(X) + assert (s.sum() - 5).to_numpy() < 0.1 diff --git a/python/xorbits/_mars/learn/decomposition/__init__.py b/python/xorbits/_mars/learn/decomposition/__init__.py new file mode 100644 index 000000000..0fdc6656e --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._pca import PCA +from ._truncated_svd import TruncatedSVD diff --git a/python/xorbits/_mars/learn/decomposition/_base.py b/python/xorbits/_mars/learn/decomposition/_base.py new file mode 100644 index 000000000..956140950 --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/_base.py @@ -0,0 +1,185 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from abc import ABCMeta, abstractmethod + +from sklearn.base import BaseEstimator, TransformerMixin + +from ... import tensor as mt +from ...tensor import linalg +from ..utils import check_array +from ..utils.validation import check_is_fitted + +# ----------------------------------------------------------- +# Original implementation is in `sklearn.decomposition.base`. +# ----------------------------------------------------------- + + +class _BasePCA(BaseEstimator, TransformerMixin, metaclass=ABCMeta): + """Base class for PCA methods. + + Warning: This class should not be used directly. + Use derived classes instead. 
+ """ + + def get_covariance(self, session=None): + """Compute data covariance with the generative model. + + ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` + where S**2 contains the explained variances, and sigma2 contains the + noise variances. + + Returns + ------- + cov : Tensor, shape=(n_features, n_features) + Estimated covariance of data. + """ + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * mt.sqrt(exp_var[:, mt.newaxis]) + exp_var_diff = mt.maximum(exp_var - self.noise_variance_, 0.0) + cov = mt.dot(components_.T * exp_var_diff, components_) + cov.flat[:: len(cov) + 1] += self.noise_variance_ # modify diag inplace + cov.execute(session=session) + return cov + + def get_precision(self, session=None): + """Compute data precision matrix with the generative model. + + Equals the inverse of the covariance but computed with + the matrix inversion lemma for efficiency. + + Returns + ------- + precision : Tensor, shape=(n_features, n_features) + Estimated precision of data. + """ + n_features = self.components_.shape[1] + + # handle corner cases first + if self.n_components_ == 0: + precision = mt.eye(n_features) / self.noise_variance_ + precision.execute(session=session) + return precision + if self.n_components_ == n_features: + precision = linalg.inv(self.get_covariance()) + precision.execute(session=session) + return precision + + # Get precision using matrix inversion lemma + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * mt.sqrt(exp_var[:, mt.newaxis]) + exp_var_diff = mt.maximum(exp_var - self.noise_variance_, 0.0) + precision = mt.dot(components_, components_.T) / self.noise_variance_ + precision.flat[:: len(precision) + 1] += 1.0 / exp_var_diff + precision = mt.dot(components_.T, mt.dot(linalg.inv(precision), components_)) + precision /= -(self.noise_variance_**2) + precision.flat[:: len(precision) + 1] += 1.0 / self.noise_variance_ + precision.execute(session=session) + return precision + + @abstractmethod + def fit(X, y=None, session=None, run_kwargs=None): + """Placeholder for fit. Subclasses should implement this method! + + Fit the model with X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + self : object + Returns the instance itself. + """ + + def transform(self, X, session=None): + """Apply dimensionality reduction to X. + + X is projected on the first principal components previously extracted + from a training set. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + New data, where n_samples is the number of samples + and n_features is the number of features. 
+ session : session to run + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.decomposition import IncrementalPCA + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> ipca = IncrementalPCA(n_components=2, batch_size=3) + >>> ipca.fit(X) + IncrementalPCA(batch_size=3, copy=True, n_components=2, whiten=False) + >>> ipca.transform(X) # doctest: +SKIP + """ + check_is_fitted(self, ["mean_", "components_"], all_or_any=all) + + X = check_array(X) + if self.mean_ is not None: + X = X - self.mean_ + X_transformed = mt.dot(X, self.components_.T) + if self.whiten: + X_transformed /= mt.sqrt(self.explained_variance_) + X_transformed.execute(session=session) + return X_transformed + + def inverse_transform(self, X, session=None): + """Transform data back to its original space. + + In other words, return an input X_original whose transform would be X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_components) + New data, where n_samples is the number of samples + and n_components is the number of components. + session : session to run + + Returns + ------- + X_original array-like, shape (n_samples, n_features) + + Notes + ----- + If whitening is enabled, inverse_transform will compute the + exact inverse operation, which includes reversing whitening. + """ + if self.whiten: + ret = ( + mt.dot( + X, + mt.sqrt(self.explained_variance_[:, mt.newaxis]) * self.components_, + ) + + self.mean_ + ) + else: + ret = mt.dot(X, self.components_) + self.mean_ + ret.execute(session=session) + return ret diff --git a/python/xorbits/_mars/learn/decomposition/_pca.py b/python/xorbits/_mars/learn/decomposition/_pca.py new file mode 100644 index 000000000..6056c42de --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/_pca.py @@ -0,0 +1,644 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numbers +from math import log, sqrt + +import numpy as np +from scipy.special import gammaln +from sklearn.utils.extmath import fast_logdet +from sklearn.utils.validation import check_is_fitted + +from ... import remote as mr +from ... import tensor as mt +from ...core import ExecutableTuple +from ...lib.sparse import issparse +from ...tensor.array_utils import get_array_module +from ...tensor.core import TENSOR_TYPE +from ...tensor.linalg import randomized_svd +from ...tensor.linalg.randomized_svd import svd_flip +from ...tensor.utils import check_random_state +from ..utils import check_array +from ._base import _BasePCA + + +def _assess_dimension(spectrum, rank, n_samples): + """Compute the log-likelihood of a rank ``rank`` dataset. + + The dataset is assumed to be embedded in gaussian noise of shape(n, + dimf) having spectrum ``spectrum``. + + Parameters + ---------- + spectrum : array of shape (n_features) + Data spectrum. + rank : int + Tested rank value. 
It should be strictly lower than n_features, + otherwise the method isn't specified (division by zero in equation + (31) from the paper). + n_samples : int + Number of samples. + + Returns + ------- + ll : float, + The log-likelihood + + Notes + ----- + This implements the method of `Thomas P. Minka: + Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604` + """ + + xp = get_array_module(spectrum, nosparse=True) + + n_features = spectrum.shape[0] + if not 1 <= rank < n_features: # pragma: no cover + raise ValueError("the tested rank should be in [1, n_features - 1]") + + eps = 1e-15 + + if spectrum[rank - 1] < eps: # pragma: no cover + # When the tested rank is associated with a small eigenvalue, there's + # no point in computing the log-likelihood: it's going to be very + # small and won't be the max anyway. Also, it can lead to numerical + # issues below when computing pa, in particular in log((spectrum[i] - + # spectrum[j]) because this will take the log of something very small. + return -np.inf + + pu = -rank * log(2.0) + for i in range(1, rank + 1): + pu += ( + gammaln((n_features - i + 1) / 2.0) + - log(np.pi) * (n_features - i + 1) / 2.0 + ) + + pl = xp.sum(xp.log(spectrum[:rank])) + pl = -pl * n_samples / 2.0 + + v = max(eps, xp.sum(spectrum[rank:]) / (n_features - rank)) + pv = -xp.log(v) * n_samples * (n_features - rank) / 2.0 + + m = n_features * rank - rank * (rank + 1.0) / 2.0 + pp = log(2.0 * np.pi) * (m + rank) / 2.0 + + pa = 0.0 + spectrum_ = spectrum.copy() + spectrum_[rank:n_features] = v + for i in range(rank): + for j in range(i + 1, len(spectrum)): + pa += log( + (spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i]) + ) + log(n_samples) + + ll = pu + pl + pv + pp - pa / 2.0 - rank * log(n_samples) / 2.0 + + return ll + + +def _infer_dimension(spectrum, n_samples): + """Infers the dimension of a dataset with a given spectrum. + + The returned value will be in [1, n_features - 1]. + """ + xp = get_array_module(spectrum, nosparse=True) + + ll = xp.empty_like(spectrum) + ll[0] = -np.inf # we don't want to return n_components = 0 + for rank in range(1, spectrum.shape[0]): + ll[rank] = _assess_dimension(spectrum, rank, n_samples) + return ll.argmax() + + +class PCA(_BasePCA): + """Principal component analysis (PCA) + + Linear dimensionality reduction using Singular Value Decomposition of the + data to project it to a lower dimensional space. The input data is centered + but not scaled for each feature before applying the SVD. + + It uses the LAPACK implementation of the full SVD or a randomized truncated + SVD by the method of Halko et al. 2009, depending on the shape of the input + data and the number of components to extract. + + It can also use the scipy.sparse.linalg ARPACK implementation of the + truncated SVD. + + Notice that this class does not support sparse input. See + :class:`TruncatedSVD` for an alternative with sparse data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, float, None or string + Number of components to keep. + if n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's + MLE is used to guess the dimension. Use of ``n_components == 'mle'`` + will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``. 
+ + If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the + number of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components. + + If ``svd_solver == 'arpack'``, the number of components must be + strictly less than the minimum of n_features and n_samples. + + Hence, the None case results in:: + + n_components == min(n_samples, n_features) - 1 + + copy : bool (default True) + If False, data passed to fit are overwritten and running + fit(X).transform(X) will not yield the expected results, + use fit_transform(X) instead. + + whiten : bool, optional (default False) + When True (False by default) the `components_` vectors are multiplied + by the square root of n_samples and then divided by the singular values + to ensure uncorrelated outputs with unit component-wise variances. + + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometime + improve the predictive accuracy of the downstream estimators by + making their data respect some hard-wired assumptions. + + svd_solver : string {'auto', 'full', 'arpack', 'randomized'} + auto : + the solver is selected by a default policy based on `X.shape` and + `n_components`: if the input data is larger than 500x500 and the + number of components to extract is lower than 80% of the smallest + dimension of the data, then the more efficient 'randomized' + method is enabled. Otherwise the exact full SVD is computed and + optionally truncated afterwards. + full : + run exact full SVD calling the standard LAPACK solver via + `scipy.linalg.svd` and select the components by postprocessing + arpack : + run SVD truncated to n_components calling ARPACK solver via + `scipy.sparse.linalg.svds`. It requires strictly + 0 < n_components < min(X.shape) + randomized : + run randomized SVD by the method of Halko et al. + + tol : float >= 0, optional (default .0) + Tolerance for singular values computed by svd_solver == 'arpack'. + + iterated_power : int >= 0, or 'auto', (default 'auto') + Number of iterations for the power method computed by + svd_solver == 'randomized'. + + random_state : int, RandomState instance or None, optional (default None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'. + + Attributes + ---------- + components_ : tensor, shape (n_components, n_features) + Principal axes in feature space, representing the directions of + maximum variance in the data. The components are sorted by + ``explained_variance_``. + + explained_variance_ : tensor, shape (n_components,) + The amount of variance explained by each of the selected components. + + Equal to n_components largest eigenvalues + of the covariance matrix of X. + + explained_variance_ratio_ : tensor, shape (n_components,) + Percentage of variance explained by each of the selected components. + + If ``n_components`` is not set then all components are stored and the + sum of the ratios is equal to 1.0. + + singular_values_ : tensor, shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. 
+ + mean_ : tensor, shape (n_features,) + Per-feature empirical mean, estimated from the training set. + + Equal to `X.mean(axis=0)`. + + n_components_ : int + The estimated number of components. When n_components is set + to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this + number is estimated from input data. Otherwise it equals the parameter + n_components, or the lesser value of n_features and n_samples + if n_components is None. + + noise_variance_ : float + The estimated noise covariance following the Probabilistic PCA model + from Tipping and Bishop 1999. See "Pattern Recognition and + Machine Learning" by C. Bishop, 12.2.1 p. 574 or + http://www.miketipping.com/papers/met-mppca.pdf. It is required to + compute the estimated data covariance and score samples. + + Equal to the average of (min(n_features, n_samples) - n_components) + smallest eigenvalues of the covariance matrix of X. + + References + ---------- + For n_components == 'mle', this class uses the method of *Minka, T. P. + "Automatic choice of dimensionality for PCA". In NIPS, pp. 598-604* + + Implements the probabilistic PCA model from: + Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal + component analysis". Journal of the Royal Statistical Society: + Series B (Statistical Methodology), 61(3), 611-622. + via the score and score_samples methods. + See http://www.miketipping.com/papers/met-mppca.pdf + + For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`. + + For svd_solver == 'randomized', see: + *Halko, N., Martinsson, P. G., and Tropp, J. A. (2011). + "Finding structure with randomness: Probabilistic algorithms for + constructing approximate matrix decompositions". + SIAM review, 53(2), 217-288.* and also + *Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011). + "A randomized algorithm for the decomposition of matrices". + Applied and Computational Harmonic Analysis, 30(1), 47-68.* + + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.decomposition import PCA + >>> X = mt.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> pca = PCA(n_components=2) + >>> pca.fit(X) # doctest: +NORMALIZE_WHITESPACE + PCA(copy=True, iterated_power='auto', n_components=2, random_state=None, + svd_solver='auto', tol=0.0, whiten=False) + >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS + [0.9924... 0.0075...] + >>> print(pca.singular_values_) # doctest: +ELLIPSIS + [6.30061... 0.54980...] + + >>> pca = PCA(n_components=2, svd_solver='full') + >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + PCA(copy=True, iterated_power='auto', n_components=2, random_state=None, + svd_solver='full', tol=0.0, whiten=False) + >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS + [0.9924... 0.00755...] + >>> print(pca.singular_values_) # doctest: +ELLIPSIS + [6.30061... 0.54980...] + + See also + -------- + KernelPCA + SparsePCA + TruncatedSVD + IncrementalPCA + """ + + def __init__( + self, + n_components=None, + copy=True, + whiten=False, + svd_solver="auto", + tol=0.0, + iterated_power="auto", + random_state=None, + ): + self.n_components = n_components + self.copy = copy + self.whiten = whiten + self.svd_solver = svd_solver + self.tol = tol + self.iterated_power = iterated_power + self.random_state = random_state + + def fit(self, X, y=None, session=None, run_kwargs=None): + """Fit the model with X. 
+ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : Ignored + + Returns + ------- + self : object + Returns the instance itself. + """ + self._fit(X, session=session, run=True, run_kwargs=run_kwargs) + return self + + def fit_transform(self, X, y=None, session=None): + """Fit the model with X and apply the dimensionality reduction on X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : Ignored + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + + """ + U, S, _ = self._fit(X, session=session, run=False) + U = U[:, : self.n_components_] + + if self.whiten: + # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples) + U *= sqrt(X.shape[0] - 1) + else: + # X_new = X * V = U * S * V^T * V = U * S + U *= S[: self.n_components_] + + self._run([U], session=session) + return U + + def _run(self, result, session=None, run_kwargs=None): + to_run_tensors = list(result) + if isinstance(self.noise_variance_, TENSOR_TYPE): + to_run_tensors.append(self.noise_variance_) + to_run_tensors.append(self.components_) + to_run_tensors.append(self.explained_variance_) + to_run_tensors.append(self.explained_variance_ratio_) + to_run_tensors.append(self.singular_values_) + + ExecutableTuple(to_run_tensors).execute(session=session, **(run_kwargs or {})) + + def _fit(self, X, session=None, run=True, run_kwargs=None): + """Dispatch to the right submethod depending on the chosen solver.""" + + # Raise an error for sparse input. + # This is more informative than the generic one raised by check_array. + if (hasattr(X, "issparse") and X.issparse()) or issparse(X): + raise TypeError( + "PCA does not support sparse input. See " + "TruncatedSVD for a possible alternative." 
+ ) + + X = check_array( + X, dtype=[mt.float64, mt.float32], ensure_2d=True, copy=self.copy + ) + + # Handle n_components==None + if self.n_components is None: + if self.svd_solver != "arpack": + n_components = min(X.shape) + else: + n_components = min(X.shape) - 1 + else: + n_components = self.n_components + + # Handle svd_solver + self._fit_svd_solver = self.svd_solver + if self._fit_svd_solver == "auto": + # Small problem or n_components == 'mle', just call full PCA + if max(X.shape) <= 500 or n_components == "mle": + self._fit_svd_solver = "full" + elif n_components >= 1 and n_components < 0.8 * min(X.shape): + self._fit_svd_solver = "randomized" + # This is also the case of n_components in (0,1) + else: + self._fit_svd_solver = "full" + + # Call different fits for either full or truncated SVD + if self._fit_svd_solver == "full": + ret = self._fit_full(X, n_components, session=session) + elif self._fit_svd_solver in ["arpack", "randomized"]: + ret = self._fit_truncated(X, n_components, self._fit_svd_solver) + else: + raise ValueError(f"Unrecognized svd_solver='{self._fit_svd_solver}'") + + if run: + self._run(ret, session=session, run_kwargs=run_kwargs) + return ret + + def _fit_full(self, X, n_components, session=None, run_kwargs=None): + """Fit the model by computing full SVD on X""" + n_samples, n_features = X.shape + + if n_components == "mle": + if n_samples < n_features: + raise ValueError( + "n_components='mle' is only supported if n_samples >= n_features" + ) + elif not 0 <= n_components <= min(n_samples, n_features): + raise ValueError( + "n_components=%r must be between 0 and " + "min(n_samples, n_features)=%r with " + "svd_solver='full'" % (n_components, min(n_samples, n_features)) + ) + elif n_components >= 1: + if not isinstance(n_components, (numbers.Integral, np.integer)): + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, " + "was of type=%r" % (n_components, type(n_components)) + ) + + # Center data + self.mean_ = mt.mean(X, axis=0) + X -= self.mean_ + + U, S, V = mt.linalg.svd(X) + # flip eigenvectors' sign to enforce deterministic output + U, V = svd_flip(U, V) + + components_ = V + + # Get variance explained by singular values + explained_variance_ = (S**2) / (n_samples - 1) + total_var = explained_variance_.sum() + explained_variance_ratio_ = explained_variance_ / total_var + singular_values_ = S.copy() # Store the singular values. + + # Postprocess the number of components required + if n_components == "mle": + n_components = mr.spawn( + _infer_dimension, + args=(explained_variance_, n_samples), + resolve_tileable_input=True, + ) + ExecutableTuple([n_components, U, V]).execute( + session=session, **(run_kwargs or dict()) + ) + n_components = n_components.fetch(session=session) + elif 0 < n_components < 1.0: + # number of components for which the cumulated explained + # variance percentage is superior to the desired threshold + # ratio_cumsum = stable_cumsum(explained_variance_ratio_) + ratio_cumsum = explained_variance_ratio_.cumsum() + n_components = (mt.searchsorted(ratio_cumsum, n_components) + 1).to_numpy( + session=session, **(run_kwargs or dict()) + ) + + # Compute noise covariance using Probabilistic PCA model + # The sigma2 maximum likelihood (cf. eq. 
12.46) + if n_components < min(n_features, n_samples): + self.noise_variance_ = explained_variance_[n_components:].mean() + else: + self.noise_variance_ = 0.0 + + self.n_samples_, self.n_features_ = n_samples, n_features + self.components_ = components_[:n_components] + self.n_components_ = n_components + self.explained_variance_ = explained_variance_[:n_components] + self.explained_variance_ratio_ = explained_variance_ratio_[:n_components] + self.singular_values_ = singular_values_[:n_components] + + return U, S, V + + def _fit_truncated(self, X, n_components, svd_solver): + """Fit the model by computing truncated SVD (by ARPACK or randomized) + on X + """ + n_samples, n_features = X.shape + + if isinstance(n_components, str): + raise ValueError( + "n_components=%r cannot be a string " + "with svd_solver='%s'" % (n_components, svd_solver) + ) + elif not 1 <= n_components <= min(n_samples, n_features): + raise ValueError( + "n_components=%r must be between 1 and " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + elif not isinstance(n_components, (numbers.Integral, np.integer)): + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, was of type=%r" + % (n_components, type(n_components)) + ) + elif svd_solver == "arpack" and n_components == min(n_samples, n_features): + raise ValueError( + "n_components=%r must be strictly less than " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + + random_state = check_random_state(self.random_state) + + # Center data + self.mean_ = mt.mean(X, axis=0) + X -= self.mean_ + + if svd_solver == "arpack": + # # random init solution, as ARPACK does it internally + # v0 = random_state.uniform(-1, 1, size=min(X.shape)) + # U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0) + # # svds doesn't abide by scipy.linalg.svd/randomized_svd + # # conventions, so reverse its outputs. + # S = S[::-1] + # # flip eigenvectors' sign to enforce deterministic output + # U, V = svd_flip(U[:, ::-1], V[::-1]) + raise NotImplementedError("Does not support arpack svd_resolver") + + elif svd_solver == "randomized": + # sign flipping is done inside + U, S, V = randomized_svd( + X, + n_components=n_components, + n_iter=self.iterated_power, + flip_sign=True, + random_state=random_state, + ) + + self.n_samples_, self.n_features_ = n_samples, n_features + self.components_ = V + self.n_components_ = n_components + + # Get variance explained by singular values + self.explained_variance_ = (S**2) / (n_samples - 1) + total_var = mt.var(X, ddof=1, axis=0) + self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum() + self.singular_values_ = S.copy() # Store the singular values. + + if self.n_components_ < min(n_features, n_samples): + self.noise_variance_ = total_var.sum() - self.explained_variance_.sum() + self.noise_variance_ /= min(n_features, n_samples) - n_components + else: + self.noise_variance_ = 0.0 + + return U, S, V + + def _score_samples(self, X, session=None): + check_is_fitted(self, "mean_") + + X = check_array(X) + Xr = X - self.mean_ + n_features = X.shape[1] + precision = self.get_precision().fetch(session=session) + log_like = -0.5 * (Xr * (mt.dot(Xr, precision))).sum(axis=1) + log_like -= 0.5 * (n_features * log(2.0 * mt.pi) - fast_logdet(precision)) + return log_like + + def score_samples(self, X, session=None): + """Return the log-likelihood of each sample. 
+ + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : tensor, shape(n_samples, n_features) + The data. + + Returns + ------- + ll : tensor, shape (n_samples,) + Log-likelihood of each sample under the current model + """ + log_like = self._score_samples(X, session=session) + log_like.execute(session=session) + return log_like + + def score(self, X, y=None, session=None): + """Return the average log-likelihood of all samples. + + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : tensor, shape(n_samples, n_features) + The data. + + y : Ignored + + Returns + ------- + ll : float + Average log-likelihood of the samples under the current model + """ + ret = mt.mean(self._score_samples(X)) + ret.execute(session=session) + return ret diff --git a/python/xorbits/_mars/learn/decomposition/_truncated_svd.py b/python/xorbits/_mars/learn/decomposition/_truncated_svd.py new file mode 100644 index 000000000..1aad878bd --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/_truncated_svd.py @@ -0,0 +1,258 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin + +from ... import tensor as mt +from ...core import ExecutableTuple +from ...tensor.linalg import randomized_svd +from ...tensor.utils import check_random_state +from ..utils import check_array + +__all__ = ["TruncatedSVD"] + + +class TruncatedSVD(BaseEstimator, TransformerMixin): + """Dimensionality reduction using truncated SVD (aka LSA). + + This transformer performs linear dimensionality reduction by means of + truncated singular value decomposition (SVD). Contrary to PCA, this + estimator does not center the data before computing the singular value + decomposition. This means it can work with scipy.sparse matrices + efficiently. + + In particular, truncated SVD works on term count/tf-idf matrices as + returned by the vectorizers in sklearn.feature_extraction.text. In that + context, it is known as latent semantic analysis (LSA). + + This estimator supports two algorithms: a fast randomized SVD solver, and + a "naive" algorithm that uses ARPACK as an eigensolver on (X * X.T) or + (X.T * X), whichever is more efficient. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default = 2 + Desired dimensionality of output data. + Must be strictly less than the number of features. + The default value is useful for visualisation. For LSA, a value of + 100 is recommended. + + algorithm : string, default = "randomized" + SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy + (scipy.sparse.linalg.svds), or "randomized" for the randomized + algorithm due to Halko (2009). + + n_iter : int, optional (default 5) + Number of iterations for randomized SVD solver. 
Not used by ARPACK. + The default is larger than the default in `randomized_svd` to handle + sparse matrices that may have large slowly decaying spectrum. + + random_state : int, RandomState instance or None, optional, default = None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + tol : float, optional + Tolerance for ARPACK. 0 means machine precision. Ignored by randomized + SVD solver. + + Attributes + ---------- + components_ : array, shape (n_components, n_features) + + explained_variance_ : array, shape (n_components,) + The variance of the training samples transformed by a projection to + each component. + + explained_variance_ratio_ : array, shape (n_components,) + Percentage of variance explained by each of the selected components. + + singular_values_ : array, shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + Examples + -------- + >>> from mars.learn.decomposition import TruncatedSVD + >>> import mars.tensor as mt + >>> from sklearn.random_projection import sparse_random_matrix + >>> X = mt.tensor(sparse_random_matrix(100, 100, density=0.01, random_state=42)) + >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) + >>> svd.fit(X) # doctest: +NORMALIZE_WHITESPACE + TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7, + random_state=42, tol=0.0) + >>> print(svd.explained_variance_ratio_) # doctest: +ELLIPSIS + [0.0606... 0.0584... 0.0497... 0.0434... 0.0372...] + >>> print(svd.explained_variance_ratio_.sum()) # doctest: +ELLIPSIS + 0.249... + >>> print(svd.singular_values_) # doctest: +ELLIPSIS + [2.5841... 2.5245... 2.3201... 2.1753... 2.0443...] + + See also + -------- + PCA + + References + ---------- + Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf + + Notes + ----- + SVD suffers from a problem called "sign indeterminacy", which means the + sign of the ``components_`` and the output from transform depend on the + algorithm and random state. To work around this, fit instances of this + class to data once, then keep the instance around to do transformations. + + """ + + def __init__( + self, + n_components=2, + algorithm="randomized", + n_iter=5, + random_state=None, + tol=0.0, + ): + self.algorithm = algorithm + self.n_components = n_components + self.n_iter = n_iter + self.random_state = random_state + self.tol = tol + + def fit(self, X, y=None, session=None): + """Fit LSI model on training data X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + session : session to run + + y : Ignored + + Returns + ------- + self : object + Returns the transformer object. + """ + self.fit_transform(X, session=session) + return self + + def fit_transform(self, X, y=None, session=None): + """Fit LSI model to X and perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + session : session to run + + y : Ignored + + Returns + ------- + X_new : array, shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. 
+ """ + X = check_array(X, accept_sparse=["csr", "csc"], ensure_min_features=2) + random_state = check_random_state(self.random_state) + + if self.algorithm == "arpack": + # U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol) + # # svds doesn't abide by scipy.linalg.svd/randomized_svd + # # conventions, so reverse its outputs. + # Sigma = Sigma[::-1] + # U, VT = svd_flip(U[:, ::-1], VT[::-1]) + raise NotImplementedError("Does not support arpack for truncated_svd") + + elif self.algorithm == "randomized": + k = self.n_components + n_features = X.shape[1] + if k >= n_features: + raise ValueError( + f"n_components must be < n_features; got {k} >= {n_features}" + ) + U, Sigma, VT = randomized_svd( + X, self.n_components, n_iter=self.n_iter, random_state=random_state + ) + else: + raise ValueError(f"unknown algorithm {self.algorithm!r}") + + self.components_ = VT + + # Calculate explained variance & explained variance ratio + X_transformed = U * Sigma + self.explained_variance_ = exp_var = np.var(X_transformed, axis=0) + full_var = mt.var(X, axis=0).sum() + self.explained_variance_ratio_ = exp_var / full_var + self.singular_values_ = Sigma # Store the singular values. + + to_run_tensors = [ + X_transformed, + self.components_, + self.explained_variance_, + self.explained_variance_ratio_, + self.singular_values_, + ] + + ExecutableTuple(to_run_tensors).execute(session=session) + return X_transformed + + def transform(self, X, session=None): + """Perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + New data. + session : session to run + + Returns + ------- + X_new : array, shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. + """ + X = check_array(X, accept_sparse="csr") + ret = mt.dot(X, self.components_.T) + ret.execute(session=session) + return ret + + def inverse_transform(self, X, session=None): + """Transform X back to its original space. + + Returns an array X_original whose transform would be X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_components) + New data. + session : session to run + + Returns + ------- + X_original : array, shape (n_samples, n_features) + Note that this is always a dense array. + """ + X = check_array(X) + ret = mt.dot(X, self.components_) + ret.execute(session=session) + return ret diff --git a/python/xorbits/_mars/learn/decomposition/tests/__init__.py b/python/xorbits/_mars/learn/decomposition/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/learn/decomposition/tests/test_pca.py b/python/xorbits/_mars/learn/decomposition/tests/test_pca.py new file mode 100644 index 000000000..b42ff5368 --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/tests/test_pca.py @@ -0,0 +1,730 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +import scipy as sp +from sklearn import datasets +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_raise_message, + assert_raises, + assert_raises_regex, +) + +from .... import tensor as mt +from .._pca import PCA, _assess_dimension, _infer_dimension + +iris = mt.tensor(datasets.load_iris().data) +# solver_list not includes arpack +solver_list = ["full", "randomized", "auto"] + + +def test_pca(setup): + X = iris + + for n_comp in np.arange(X.shape[1]): + pca = PCA(n_components=n_comp, svd_solver="full") + + X_r = pca.fit(X).transform(X).fetch() + np.testing.assert_equal(X_r.shape[1], n_comp) + + X_r2 = pca.fit_transform(X).fetch() + assert_array_almost_equal(X_r, X_r2) + + X_r = pca.transform(X).fetch() + X_r2 = pca.fit_transform(X).fetch() + assert_array_almost_equal(X_r, X_r2) + + # Test get_covariance and get_precision + cov = pca.get_covariance() + precision = pca.get_precision() + assert_array_almost_equal( + mt.dot(cov, precision).to_numpy(), np.eye(X.shape[1]), 12 + ) + + # test explained_variance_ratio_ == 1 with all components + pca = PCA(svd_solver="full") + pca.fit(X) + np.testing.assert_allclose(pca.explained_variance_ratio_.sum().to_numpy(), 1.0, 3) + + +def test_pca_randomized_solver(setup): + # PCA on dense arrays + X = iris + + # Loop excluding the 0, invalid for randomized + for n_comp in np.arange(1, X.shape[1]): + pca = PCA(n_components=n_comp, svd_solver="randomized", random_state=0) + + X_r = pca.fit(X).transform(X) + np.testing.assert_equal(X_r.shape[1], n_comp) + + X_r2 = pca.fit_transform(X) + assert_array_almost_equal(X_r.fetch(), X_r2.fetch()) + + X_r = pca.transform(X) + assert_array_almost_equal(X_r.fetch(), X_r2.fetch()) + + # Test get_covariance and get_precision + cov = pca.get_covariance() + precision = pca.get_precision() + assert_array_almost_equal( + mt.dot(cov, precision).to_numpy(), mt.eye(X.shape[1]).to_numpy(), 12 + ) + + pca = PCA(n_components=0, svd_solver="randomized", random_state=0) + with pytest.raises(ValueError): + pca.fit(X) + + pca = PCA(n_components=0, svd_solver="randomized", random_state=0) + with pytest.raises(ValueError): + pca.fit(X) + # Check internal state + assert ( + pca.n_components + == PCA(n_components=0, svd_solver="randomized", random_state=0).n_components + ) + assert ( + pca.svd_solver + == PCA(n_components=0, svd_solver="randomized", random_state=0).svd_solver + ) + + +def test_whitening(setup): + # Check that PCA output has unit-variance + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + n_components = 30 + rank = 50 + + # some low 
rank data with correlated features + X = mt.dot( + rng.randn(n_samples, rank), + mt.dot(mt.diag(mt.linspace(10.0, 1.0, rank)), rng.randn(rank, n_features)), + ) + # the component-wise variance of the first 50 features is 3 times the + # mean component-wise variance of the remaining 30 features + X[:, :50] *= 3 + + assert X.shape == (n_samples, n_features) + + # the component-wise variance is thus highly varying: + assert X.std(axis=0).std().to_numpy() > 43.8 + + for solver, copy in product(solver_list, (True, False)): + # whiten the data while projecting to the lower dim subspace + X_ = X.copy() # make sure we keep an original across iterations. + pca = PCA( + n_components=n_components, + whiten=True, + copy=copy, + svd_solver=solver, + random_state=0, + iterated_power=7, + ) + # test fit_transform + X_whitened = pca.fit_transform(X_.copy()) + assert X_whitened.shape == (n_samples, n_components) + X_whitened2 = pca.transform(X_) + assert_array_almost_equal(X_whitened.fetch(), X_whitened2.fetch()) + + assert_almost_equal( + X_whitened.std(ddof=1, axis=0).to_numpy(), np.ones(n_components), decimal=6 + ) + assert_almost_equal(X_whitened.mean(axis=0).to_numpy(), np.zeros(n_components)) + + X_ = X.copy() + pca = PCA( + n_components=n_components, whiten=False, copy=copy, svd_solver=solver + ).fit(X_) + X_unwhitened = pca.transform(X_) + assert X_unwhitened.shape == (n_samples, n_components) + + # in that case the output components still have varying variances + assert_almost_equal(X_unwhitened.std(axis=0).std().to_numpy(), 74.1, 1) + # we always center, so no test for non-centering. + + +def test_explained_variance(setup): + # Check that PCA output has unit-variance + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + + X = mt.tensor(rng.randn(n_samples, n_features)) + + pca = PCA(n_components=2, svd_solver="full").fit(X) + rpca = PCA(n_components=2, svd_solver="randomized", random_state=42).fit(X) + assert_array_almost_equal( + pca.explained_variance_.to_numpy(), rpca.explained_variance_.to_numpy(), 1 + ) + assert_array_almost_equal( + pca.explained_variance_ratio_.to_numpy(), + rpca.explained_variance_ratio_.to_numpy(), + 1, + ) + + # compare to empirical variances + expected_result = np.linalg.eig(np.cov(X.to_numpy(), rowvar=False))[0] + expected_result = sorted(expected_result, reverse=True)[:2] + + X_pca = pca.transform(X) + assert_array_almost_equal( + pca.explained_variance_.to_numpy(), mt.var(X_pca, ddof=1, axis=0).to_numpy() + ) + assert_array_almost_equal(pca.explained_variance_.to_numpy(), expected_result) + + X_rpca = rpca.transform(X) + assert_array_almost_equal( + rpca.explained_variance_.to_numpy(), + mt.var(X_rpca, ddof=1, axis=0).to_numpy(), + decimal=1, + ) + assert_array_almost_equal( + rpca.explained_variance_.to_numpy(), expected_result, decimal=1 + ) + + # Same with correlated data + X = datasets.make_classification( + n_samples, n_features, n_informative=n_features - 2, random_state=rng + )[0] + X = mt.tensor(X) + + pca = PCA(n_components=2).fit(X) + rpca = PCA(n_components=2, svd_solver="randomized", random_state=rng).fit(X) + assert_array_almost_equal( + pca.explained_variance_ratio_.to_numpy(), + rpca.explained_variance_ratio_.to_numpy(), + 5, + ) + + +def test_singular_values(setup): + # Check that the PCA output has the correct singular values + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + + X = mt.tensor(rng.randn(n_samples, n_features)) + + pca = PCA(n_components=2, svd_solver="full", random_state=rng).fit(X) + rpca = 
PCA(n_components=2, svd_solver="randomized", random_state=rng).fit(X) + assert_array_almost_equal( + pca.singular_values_.fetch(), rpca.singular_values_.fetch(), 1 + ) + + # Compare to the Frobenius norm + X_pca = pca.transform(X) + X_rpca = rpca.transform(X) + assert_array_almost_equal( + mt.sum(pca.singular_values_**2.0).to_numpy(), + (mt.linalg.norm(X_pca, "fro") ** 2.0).to_numpy(), + 12, + ) + assert_array_almost_equal( + mt.sum(rpca.singular_values_**2.0).to_numpy(), + (mt.linalg.norm(X_rpca, "fro") ** 2.0).to_numpy(), + 0, + ) + + # Compare to the 2-norms of the score vectors + assert_array_almost_equal( + pca.singular_values_.fetch(), + mt.sqrt(mt.sum(X_pca**2.0, axis=0)).to_numpy(), + 12, + ) + assert_array_almost_equal( + rpca.singular_values_.fetch(), + mt.sqrt(mt.sum(X_rpca**2.0, axis=0)).to_numpy(), + 2, + ) + + # Set the singular values and see what we get back + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 110 + + X = mt.tensor(rng.randn(n_samples, n_features)) + + pca = PCA(n_components=3, svd_solver="full", random_state=rng) + rpca = PCA(n_components=3, svd_solver="randomized", random_state=rng) + X_pca = pca.fit_transform(X) + + X_pca /= mt.sqrt(mt.sum(X_pca**2.0, axis=0)) + X_pca[:, 0] *= 3.142 + X_pca[:, 1] *= 2.718 + + X_hat = mt.dot(X_pca, pca.components_) + pca.fit(X_hat) + rpca.fit(X_hat) + assert_array_almost_equal(pca.singular_values_.fetch(), [3.142, 2.718, 1.0], 14) + assert_array_almost_equal(rpca.singular_values_.fetch(), [3.142, 2.718, 1.0], 14) + + +def test_pca_check_projection(setup): + # Test that the projection of data is correct + rng = np.random.RandomState(0) + n, p = 100, 3 + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5]) + Xt = 0.1 * mt.tensor(rng.randn(1, p)) + mt.array([3, 4, 5]) + + for solver in solver_list: + Yt = PCA(n_components=2, svd_solver=solver).fit(X).transform(Xt) + Yt /= mt.sqrt((Yt**2).sum()) + + assert_almost_equal(mt.abs(Yt[0][0]).to_numpy(), 1.0, 1) + + +def test_pca_inverse(setup): + # Test that the projection of data can be inverted + rng = np.random.RandomState(0) + n, p = 50, 3 + X = mt.tensor(rng.randn(n, p)) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + pca = PCA(n_components=2, svd_solver="full").fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=3) + + # same as above with whitening (approximate reconstruction) + for solver in solver_list: + pca = PCA(n_components=2, whiten=True, svd_solver=solver) + pca.fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=3) + + +def test_pca_validation(setup): + for solver in solver_list: + # Ensures that solver-specific extreme inputs for the n_components + # parameter raise errors + X = mt.array([[0, 1, 0], [1, 0, 0]]) + smallest_d = 2 # The smallest dimension + lower_limit = {"randomized": 1, "full": 0, "auto": 0} + + # We conduct the same test on X.T so that it is invariant to axis. + for data in [X, X.T]: + for n_components in [-1, 3]: + if solver == "auto": + solver_reported = "full" + else: + solver_reported = solver + + assert_raises_regex( + ValueError, + f"n_components={n_components}L? must be between " + rf"{lower_limit[solver]}L? 
and min\(n_samples, n_features\)=" + f"{smallest_d}L? with svd_solver='{solver_reported}'", + PCA(n_components, svd_solver=solver).fit, + data, + ) + + n_components = 1.0 + type_ncom = type(n_components) + assert_raise_message( + ValueError, + f"n_components={n_components} must be of type int " + f"when greater than or equal to 1, was of type={type_ncom}", + PCA(n_components, svd_solver=solver).fit, + data, + ) + + +def test_n_components_none(setup): + for solver in solver_list: + # Ensures that n_components == None is handled correctly + X = iris + # We conduct the same test on X.T so that it is invariant to axis. + for data in [X, X.T]: + pca = PCA(svd_solver=solver) + pca.fit(data) + assert pca.n_components_ == min(data.shape) + + +def test_randomized_pca_check_projection(setup): + # Test that the projection by randomized PCA on dense data is correct + rng = np.random.RandomState(0) + n, p = 100, 3 + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5]) + Xt = 0.1 * mt.tensor(rng.randn(1, p)) + mt.array([3, 4, 5]) + + Yt = ( + PCA(n_components=2, svd_solver="randomized", random_state=0) + .fit(X) + .transform(Xt) + ) + Yt /= np.sqrt((Yt**2).sum()) + + assert_almost_equal(mt.abs(Yt[0][0]).to_numpy(), 1.0, 1) + + +def test_randomized_pca_check_list(setup): + # Test that the projection by randomized PCA on list data is correct + X = mt.tensor([[1.0, 0.0], [0.0, 1.0]]) + X_transformed = ( + PCA(n_components=1, svd_solver="randomized", random_state=0).fit(X).transform(X) + ) + assert X_transformed.shape == (2, 1) + assert_almost_equal(X_transformed.mean().to_numpy(), 0.00, 2) + assert_almost_equal(X_transformed.std().to_numpy(), 0.71, 2) + + +def test_randomized_pca_inverse(setup): + # Test that randomized PCA is inversible on dense data + rng = np.random.RandomState(0) + n, p = 50, 3 + X = mt.tensor(rng.randn(n, p)) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed signal + # (since the data is almost of rank n_components) + pca = PCA(n_components=2, svd_solver="randomized", random_state=0).fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=2) + + # same as above with whitening (approximate reconstruction) + pca = PCA(n_components=2, whiten=True, svd_solver="randomized", random_state=0).fit( + X + ) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + relative_max_delta = (mt.abs(X - Y_inverse) / mt.abs(X).mean()).max() + assert relative_max_delta.to_numpy() < 1e-5 + + +def test_n_components_mle(setup): + # Ensure that n_components == 'mle' doesn't raise error for auto/full + # svd_solver and raises error for arpack/randomized svd_solver + rng = np.random.RandomState(0) + n_samples = 600 + n_features = 10 + X = mt.tensor(rng.randn(n_samples, n_features)) + n_components_dict = {} + for solver in solver_list: + pca = PCA(n_components="mle", svd_solver=solver) + if solver in ["auto", "full"]: + pca.fit(X) + n_components_dict[solver] = pca.n_components_ + else: # arpack/randomized solver + error_message = ( + f"n_components='mle' cannot be a string with svd_solver='{solver}'" + ) + assert_raise_message(ValueError, error_message, pca.fit, X) + assert n_components_dict["auto"] == n_components_dict["full"] + + +def test_pca_dim(setup): + # Check automated dimensionality setting + rng = np.random.RandomState(0) + n, p = 100, 5 + X = 
mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5, 1, 2]) + pca = PCA(n_components="mle", svd_solver="full").fit(X) + assert pca.n_components == "mle" + assert pca.n_components_ == 1 + + +def test_infer_dim_1(setup): + # TODO: explain what this is testing + # Or at least use explicit variable names... + n, p = 1000, 5 + rng = np.random.RandomState(0) + X = ( + mt.tensor(rng.randn(n, p)) * 0.1 + + mt.tensor(rng.randn(n, 1)) * mt.array([3, 4, 5, 1, 2]) + + mt.array([1, 0, 7, 4, 6]) + ) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_.to_numpy() + ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)]) + assert ll[1] > ll.max() - 0.01 * n + + +def test_infer_dim_2(setup): + # TODO: explain what this is testing + # Or at least use explicit variable names... + n, p = 1000, 5 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5, 1, 2]) + X[10:20] += mt.array([6, 0, 7, 2, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_.fetch() + assert _infer_dimension(spect, n) > 1 + + +def test_infer_dim_3(setup): + n, p = 100, 5 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + X[:10] += mt.array([3, 4, 5, 1, 2]) + X[10:20] += mt.array([6, 0, 7, 2, -1]) + X[30:40] += 2 * mt.array([-1, 1, -1, 1, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_.fetch() + assert _infer_dimension(spect, n) > 2 + + +def test_infer_dim_by_explained_variance(setup): + X = iris + pca = PCA(n_components=0.95, svd_solver="full") + pca.fit(X) + assert pca.n_components == 0.95 + assert pca.n_components_ == 2 + + pca = PCA(n_components=0.01, svd_solver="full") + pca.fit(X) + assert pca.n_components == 0.01 + assert pca.n_components_ == 1 + + rng = np.random.RandomState(0) + # more features than samples + X = mt.tensor(rng.rand(5, 20)) + pca = PCA(n_components=0.5, svd_solver="full").fit(X) + assert pca.n_components == 0.5 + assert pca.n_components_ == 2 + + +def test_pca_score(setup): + # Test that probabilistic PCA scoring yields a reasonable score + n, p = 1000, 3 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + mt.array([3, 4, 5]) + for solver in solver_list: + pca = PCA(n_components=2, svd_solver=solver) + pca.fit(X) + ll1 = pca.score(X) + h = -0.5 * mt.log(2 * mt.pi * mt.exp(1) * 0.1**2) * p + np.testing.assert_almost_equal((ll1 / h).to_numpy(), 1, 0) + + +def test_pca_score2(setup): + # Test that probabilistic PCA correctly separated different datasets + n, p = 100, 3 + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + mt.array([3, 4, 5]) + for solver in solver_list: + pca = PCA(n_components=2, svd_solver=solver) + pca.fit(X) + ll1 = pca.score(X) + ll2 = pca.score(mt.tensor(rng.randn(n, p) * 0.2) + mt.array([3, 4, 5])) + assert ll1.fetch() > ll2.fetch() + + # Test that it gives different scores if whiten=True + pca = PCA(n_components=2, whiten=True, svd_solver=solver) + pca.fit(X) + ll2 = pca.score(X) + assert ll1.fetch() > ll2.fetch() + + +def test_pca_score3(setup): + # Check that probabilistic PCA selects the right model + n, p = 200, 3 + rng = np.random.RandomState(0) + Xl = mt.tensor( + rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + ) + Xt = mt.tensor( + rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + ) + ll = mt.zeros(p) + for k in range(p): + pca = PCA(n_components=k, 
svd_solver="full") + pca.fit(Xl) + ll[k] = pca.score(Xt) + + assert ll.argmax().to_numpy() == 1 + + +def test_pca_score_with_different_solvers(setup): + digits = datasets.load_digits() + X_digits = mt.tensor(digits.data) + + pca_dict = { + svd_solver: PCA(n_components=30, svd_solver=svd_solver, random_state=0) + for svd_solver in solver_list + } + + for pca in pca_dict.values(): + pca.fit(X_digits) + # Sanity check for the noise_variance_. For more details see + # https://github.com/scikit-learn/scikit-learn/issues/7568 + # https://github.com/scikit-learn/scikit-learn/issues/8541 + # https://github.com/scikit-learn/scikit-learn/issues/8544 + assert mt.all((pca.explained_variance_ - pca.noise_variance_) >= 0).to_numpy() + + # Compare scores with different svd_solvers + score_dict = { + svd_solver: pca.score(X_digits).to_numpy() + for svd_solver, pca in pca_dict.items() + } + assert_almost_equal(score_dict["full"], score_dict["randomized"], decimal=3) + + +def test_pca_zero_noise_variance_edge_cases(setup): + # ensure that noise_variance_ is 0 in edge cases + # when n_components == min(n_samples, n_features) + n, p = 100, 3 + + rng = np.random.RandomState(0) + X = mt.tensor(rng.randn(n, p) * 0.1) + mt.array([3, 4, 5]) + # arpack raises ValueError for n_components == min(n_samples, + # n_features) + svd_solvers = ["full", "randomized"] + + for svd_solver in svd_solvers: + pca = PCA(svd_solver=svd_solver, n_components=p) + pca.fit(X) + assert pca.noise_variance_ == 0 + + pca.fit(X.T) + assert pca.noise_variance_ == 0 + + +def test_svd_solver_auto(setup): + rng = np.random.RandomState(0) + X = mt.tensor(rng.uniform(size=(1000, 50))) + + # case: n_components in (0,1) => 'full' + pca = PCA(n_components=0.5) + pca.fit(X) + pca_test = PCA(n_components=0.5, svd_solver="full") + pca_test.fit(X) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + # case: max(X.shape) <= 500 => 'full' + pca = PCA(n_components=5, random_state=0) + Y = X[:10, :] + pca.fit(Y) + pca_test = PCA(n_components=5, svd_solver="full", random_state=0) + pca_test.fit(Y) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + # case: n_components >= .8 * min(X.shape) => 'full' + pca = PCA(n_components=50) + pca.fit(X) + pca_test = PCA(n_components=50, svd_solver="full") + pca_test.fit(X) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + # n_components >= 1 and n_components < .8 * min(X.shape) => 'randomized' + pca = PCA(n_components=10, random_state=0) + pca.fit(X) + pca_test = PCA(n_components=10, svd_solver="randomized", random_state=0) + pca_test.fit(X) + assert_array_almost_equal( + pca.components_.to_numpy(), pca_test.components_.to_numpy() + ) + + +def test_pca_sparse_input(setup): + for svd_solver in solver_list: + X = np.random.RandomState(0).rand(5, 4) + X = mt.tensor(sp.sparse.csr_matrix(X)) + assert X.issparse() is True + + pca = PCA(n_components=3, svd_solver=svd_solver) + + assert_raises(TypeError, pca.fit, X) + + +def test_pca_bad_solver(setup): + X = mt.tensor(np.random.RandomState(0).rand(5, 4)) + pca = PCA(n_components=3, svd_solver="bad_argument") + with pytest.raises(ValueError): + pca.fit(X) + + +def test_pca_dtype_preservation(setup): + for svd_solver in solver_list: + _check_pca_float_dtype_preservation(svd_solver) + _check_pca_int_dtype_upcast_to_double(svd_solver) + + +def _check_pca_float_dtype_preservation(svd_solver): + # Ensure that PCA does not upscale the dtype when 
input is float32 + X_64 = mt.tensor( + np.random.RandomState(0).rand(1000, 4).astype(np.float64, copy=False) + ) + X_32 = X_64.astype(np.float32) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_64) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_32) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float32 + assert pca_64.transform(X_64).dtype == np.float64 + assert pca_32.transform(X_32).dtype == np.float32 + + # decimal=5 fails on mac with scipy = 1.1.0 + assert_array_almost_equal( + pca_64.components_.to_numpy(), pca_32.components_.to_numpy(), decimal=4 + ) + + +def _check_pca_int_dtype_upcast_to_double(svd_solver): + # Ensure that all int types will be upcast to float64 + X_i64 = mt.tensor(np.random.RandomState(0).randint(0, 1000, (1000, 4))) + X_i64 = X_i64.astype(np.int64, copy=False) + X_i32 = X_i64.astype(np.int32, copy=False) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i64) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i32) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float64 + assert pca_64.transform(X_i64).dtype == np.float64 + assert pca_32.transform(X_i32).dtype == np.float64 + + assert_array_almost_equal( + pca_64.components_.to_numpy(), pca_32.components_.to_numpy(), decimal=5 + ) + + +def test_pca_deterministic_output(setup): + rng = np.random.RandomState(0) + X = mt.tensor(rng.rand(10, 10)) + + for solver in solver_list: + transformed_X = np.zeros((20, 2)) + for i in range(20): + pca = PCA(n_components=2, svd_solver=solver, random_state=rng) + transformed_X[i, :] = pca.fit_transform(X)[0].fetch() + np.testing.assert_allclose( + transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2) + ) diff --git a/python/xorbits/_mars/learn/decomposition/tests/test_truncated_svd.py b/python/xorbits/_mars/learn/decomposition/tests/test_truncated_svd.py new file mode 100644 index 000000000..6a53e7e0b --- /dev/null +++ b/python/xorbits/_mars/learn/decomposition/tests/test_truncated_svd.py @@ -0,0 +1,166 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sp +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_almost_equal, assert_array_less + +from .... import tensor as mt +from .. import TruncatedSVD + +# Make an X that looks somewhat like a small tf-idf matrix. +# XXX newer versions of SciPy >0.16 have scipy.sparse.rand for this. 
+shape = 60, 55 +n_samples, n_features = shape +rng = check_random_state(42) +X = rng.randint(-100, 20, np.product(shape)).reshape(shape) +X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64) +X.data[:] = 1 + np.log(X.data) +Xdense = X.A +n_samples = n_samples +n_features = n_features + + +def test_attributes(setup): + for n_components in (10, 25, 41): + tsvd = TruncatedSVD(n_components).fit(X) + assert tsvd.n_components == n_components + assert tsvd.components_.shape == (n_components, n_features) + + +def test_too_many_components(setup): + for n_components in (n_features, n_features + 1): + tsvd = TruncatedSVD(n_components=n_components, algorithm="randomized") + with pytest.raises(ValueError): + tsvd.fit(X) + + +def test_sparse_formats(setup): + tsvd = TruncatedSVD(n_components=11) + Xtrans = tsvd.fit_transform(Xdense) + assert Xtrans.shape == (n_samples, 11) + Xtrans = tsvd.transform(Xdense) + assert Xtrans.shape == (n_samples, 11) + + +def test_inverse_transform(setup): + # We need a lot of components for the reconstruction to be "almost + # equal" in all positions. XXX Test means or sums instead? + tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm="randomized") + Xt = tsvd.fit_transform(X) + Xinv = tsvd.inverse_transform(Xt) + assert_array_almost_equal(Xinv.fetch(), Xdense, decimal=1) + + +def test_integers(setup): + Xint = X.astype(np.int64) + tsvd = TruncatedSVD(n_components=6) + Xtrans = tsvd.fit_transform(Xint) + assert Xtrans.shape == (n_samples, tsvd.n_components) + + +def test_explained_variance(setup): + # Test sparse data + svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42) + svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42) + X_trans_r_10_sp = svd_r_10_sp.fit_transform(X) + X_trans_r_20_sp = svd_r_20_sp.fit_transform(X) + + # Test dense data + svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42) + svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42) + X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray()) + X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray()) + + # helper arrays for tests below + svds = (svd_r_10_sp, svd_r_20_sp, svd_r_10_de, svd_r_20_de) + svds_trans = ( + (svd_r_10_sp, X_trans_r_10_sp), + (svd_r_20_sp, X_trans_r_20_sp), + (svd_r_10_de, X_trans_r_10_de), + (svd_r_20_de, X_trans_r_20_de), + ) + svds_10_v_20 = ( + (svd_r_10_sp, svd_r_20_sp), + (svd_r_10_de, svd_r_20_de), + ) + svds_sparse_v_dense = ( + (svd_r_10_sp, svd_r_10_de), + (svd_r_20_sp, svd_r_20_de), + ) + + # Assert the 1st component is equal + for svd_10, svd_20 in svds_10_v_20: + assert_array_almost_equal( + svd_10.explained_variance_ratio_.to_numpy(), + svd_20.explained_variance_ratio_[:10].to_numpy(), + decimal=4, + ) + + # Assert that 20 components has higher explained variance than 10 + for svd_10, svd_20 in svds_10_v_20: + assert ( + svd_20.explained_variance_ratio_.sum().to_numpy() + > svd_10.explained_variance_ratio_.sum().to_numpy() + ) + + # Assert that all the values are greater than 0 + for svd in svds: + assert_array_less(0.0, svd.explained_variance_ratio_.to_numpy()) + + # Assert that total explained variance is less than 1 + for svd in svds: + assert_array_less(svd.explained_variance_ratio_.sum().to_numpy(), 1.0) + + # Compare sparse vs. 
dense + for svd_sparse, svd_dense in svds_sparse_v_dense: + assert_array_almost_equal( + svd_sparse.explained_variance_ratio_.to_numpy(), + svd_dense.explained_variance_ratio_.to_numpy(), + ) + + # Test that explained_variance is correct + for svd, transformed in svds_trans: + total_variance = mt.var(X.toarray(), axis=0).sum().to_numpy() + variances = mt.var(transformed, axis=0) + true_explained_variance_ratio = variances / total_variance + + assert_array_almost_equal( + svd.explained_variance_ratio_.to_numpy(), + true_explained_variance_ratio.to_numpy(), + ) + + +def test_singular_values(setup): + # Check that the TruncatedSVD output has the correct singular values + + # Set the singular values and see what we get back + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 110 + + X = rng.randn(n_samples, n_features) + + rpca = TruncatedSVD(n_components=3, algorithm="randomized", random_state=rng) + X_rpca = rpca.fit_transform(X) + + X_rpca /= mt.sqrt(mt.sum(X_rpca**2.0, axis=0)) + X_rpca[:, 0] *= 3.142 + X_rpca[:, 1] *= 2.718 + + X_hat_rpca = mt.dot(X_rpca, rpca.components_) + rpca.fit(X_hat_rpca) + assert_array_almost_equal(rpca.singular_values_.to_numpy(), [3.142, 2.718, 1.0], 14) diff --git a/python/xorbits/_mars/learn/ensemble/__init__.py b/python/xorbits/_mars/learn/ensemble/__init__.py new file mode 100644 index 000000000..565282bff --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._bagging import BaggingClassifier, BaggingRegressor +from ._blockwise import BlockwiseVotingClassifier, BlockwiseVotingRegressor +from ._iforest import IsolationForest diff --git a/python/xorbits/_mars/learn/ensemble/_bagging.py b/python/xorbits/_mars/learn/ensemble/_bagging.py new file mode 100644 index 000000000..ed6bf0032 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/_bagging.py @@ -0,0 +1,1711 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +import itertools +import warnings +from collections import defaultdict +from typing import Callable, Iterable, List, Optional, Tuple, Union + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin +from sklearn.base import clone as clone_estimator +from sklearn.utils import check_random_state as sklearn_check_random_state + +from ... import opcodes +from ... 
import tensor as mt +from ...core import OutputType, get_output_types, recursive_tile +from ...core.context import Context +from ...core.operand import OperandStage +from ...dataframe.core import DATAFRAME_TYPE +from ...dataframe.utils import parse_index +from ...deploy.oscar.session import execute +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Float32Field, + FunctionField, + Int8Field, + Int64Field, + ReferenceField, + TupleField, +) +from ...tensor.core import TENSOR_CHUNK_TYPE +from ...tensor.random import RandomStateField +from ...tensor.utils import gen_random_seeds +from ...typing import TileableType +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin, LearnShuffleProxy +from ..utils import column_or_1d, convert_to_tensor_or_dataframe +from ..utils.multiclass import check_classification_targets +from ..utils.shuffle import LearnShuffle +from ..utils.validation import check_is_fitted + + +def _extract_bagging_io(io_list: Iterable, op: LearnOperand, output: bool = False): + if not isinstance(io_list, Iterable): + io_list = [io_list] + input_iter = iter(io_list) + out = [ + next(input_iter), + next(input_iter) if op.with_labels else None, + next(input_iter) if op.with_weights else None, + next(input_iter) if output and op.with_feature_indices else None, + ] + return out + + +def _get_by_iloc(x, idx, axis=0): + if hasattr(x, "iloc"): + item_getter = x.iloc + else: + item_getter = x + if axis == 0: + return item_getter[idx] + else: + return item_getter[:, idx] + + +def _concat_on_axis(data_list, axis=0, out_chunk=None): + if isinstance(out_chunk, TENSOR_CHUNK_TYPE): + return np.concatenate(data_list, axis=axis) + else: + return pd.concat(data_list, axis=axis) + + +def _concat_by_row(row, out_chunk=None): + arr = np.empty((1,), dtype=object) + arr[0] = _concat_on_axis(row.tolist(), axis=0, out_chunk=out_chunk) + return arr + + +def _set_random_states(estimator, random_state=None): + random_state = sklearn_check_random_state(random_state) + to_set = {} + for key in sorted(estimator.get_params(deep=True)): + if key == "random_state" or key.endswith("__random_state"): + to_set[key] = random_state.randint(np.iinfo(np.int32).max) + + if to_set: + estimator.set_params(**to_set) + + +def _make_estimator(estimator, random_state=None): + """Make and configure a copy of the `base_estimator_` attribute. + + Warning: This method should be used to properly instantiate new + sub-estimators. 
+ """ + estimator = clone_estimator(estimator) + if random_state is not None: + _set_random_states(estimator, random_state) + return estimator + + +class BaggingSample(LearnShuffle, LearnOperandMixin): + _op_type_ = opcodes.BAGGING_SHUFFLE_SAMPLE + + n_estimators: int = Int64Field("n_estimators") + max_samples = AnyField("max_samples") + max_features = AnyField("max_features") + bootstrap: bool = BoolField("bootstrap") + bootstrap_features: bool = BoolField("bootstrap_features") + + random_state = RandomStateField("random_state") + sample_random_state = RandomStateField("sample_random_state") + feature_random_state = RandomStateField("feature_random_state") + + reducer_ratio: float = Float32Field("reducer_ratio") + column_offset: int = Int64Field("column_offset", default=None) + + chunk_shape: Tuple[int] = TupleField("chunk_shape", FieldTypes.int64) + with_labels: bool = BoolField("with_labels") + with_weights: bool = BoolField("with_weights") + with_feature_indices: bool = BoolField("with_feature_indices") + + def __init__( + self, + max_samples: Union[int, float] = 1.0, + max_features: Union[int, float] = 1.0, + bootstrap: bool = True, + bootstrap_features: bool = False, + random_state: np.random.RandomState = None, + reducer_ratio: float = 1.0, + **kw, + ): + super().__init__( + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + max_samples=max_samples, + max_features=max_features, + reducer_ratio=reducer_ratio, + random_state=random_state, + **kw, + ) + if self.random_state is None: + self.random_state = np.random.RandomState() + + @property + def output_limit(self) -> int: + if self.stage != OperandStage.map: + return 1 + self.with_labels + self.with_weights + self.with_feature_indices + return 1 + + def __call__( + self, + in_sample: TileableType, + in_labels: Optional[TileableType] = None, + in_weights: Optional[TileableType] = None, + ): + self._output_types = get_output_types(in_sample, in_labels, in_weights) + + self.with_labels = in_labels is not None + self.with_weights = in_weights is not None + axis_keep_shape = [ + isinstance(self.max_samples, float) and self.max_samples == 1.0, + isinstance(self.max_features, float) and self.max_features == 1.0, + ] + self.with_feature_indices = not axis_keep_shape[1] or self.bootstrap_features + if self.with_feature_indices: + self._output_types += (OutputType.tensor,) + + new_shape = tuple( + s if keep_shape else np.nan + for s, keep_shape in zip(in_sample.shape, axis_keep_shape) + ) + + kws = [] + + data_params = in_sample.params + data_params["shape"] = new_shape + kws.append(data_params) + + if in_labels is not None: + labels_params = in_labels.params + labels_params["shape"] = (new_shape[0],) + kws.append(labels_params) + + if in_weights is not None: + weights_params = in_weights.params + weights_params["shape"] = (new_shape[0],) + kws.append(weights_params) + + if self.with_feature_indices: + feature_params = { + "shape": (self.n_estimators, new_shape[1]), + "dtype": np.dtype(int), + } + kws.append(feature_params) + + inputs = [in_sample] + if in_labels is not None: + inputs.append(in_labels) + if in_weights is not None: + inputs.append(in_weights) + + return self.new_tileables(inputs, kws=kws) + + @classmethod + def _scatter_samples( + cls, + max_samples: Union[int, float], + nsplits: Tuple[int], + random_state: np.random.RandomState, + n_estimators: int, + ) -> np.ndarray: + nsp_array = np.array(nsplits) + dim_size = nsp_array.sum() + if isinstance(max_samples, int): + expect_sample_count = max_samples + else: + 
expect_sample_count = int(max_samples * nsp_array.sum()) + + if expect_sample_count == dim_size: + return np.array([list(nsplits)] * n_estimators) + + split_probs = nsp_array / dim_size + return random_state.multinomial( + expect_sample_count, split_probs, size=n_estimators + ) + + @classmethod + def tile(cls, op: "BaggingSample"): + in_sample, in_labels, in_weights, _ = _extract_bagging_io( + op.inputs, op, output=False + ) + out_data, out_labels, out_weights, out_feature_indices = _extract_bagging_io( + op.outputs, op, output=True + ) + + # make sure all shapes are computed + if ( + has_unknown_shape(in_sample) + or (in_labels is not None and has_unknown_shape(in_labels)) + or (in_weights is not None and has_unknown_shape(in_weights)) + ): + yield + + to_tile = [] + if in_labels is not None: + in_labels = in_labels.rechunk({0: in_sample.nsplits[0]}) + to_tile.append(in_labels) + if in_weights is not None: + in_weights = in_weights.rechunk({0: in_sample.nsplits[0]}) + to_tile.append(in_weights) + + # tile rechunks + if to_tile: + tiled = yield from recursive_tile(to_tile) + tiled_iter = iter(tiled) + if in_labels is not None: + in_labels = next(tiled_iter) + if in_weights is not None: + in_weights = next(tiled_iter) + + random_seeds = [ + gen_random_seeds(n, op.random_state) for n in in_sample.chunk_shape + ] + + axis_keep_shape = [ + isinstance(op.max_samples, float) + and op.max_samples == 1.0 + and not op.bootstrap, + isinstance(op.max_features, float) + and op.max_features == 1.0 + and not op.bootstrap_features, + ] + + n_reducers = ( + op.n_reducers + if getattr(op, "n_reducers", None) + else max(1, int(in_sample.chunk_shape[0] * op.reducer_ratio)) + ) + + # todo implement sampling without replacements + map_chunks = [] + max_samples_splits = cls._scatter_samples( + op.max_samples, in_sample.nsplits[0], op.random_state, op.n_estimators + ) + max_features_splits = cls._scatter_samples( + op.max_features, in_sample.nsplits[1], op.random_state, op.n_estimators + ) + + column_cum_offset = np.concatenate([[0], np.cumsum(in_sample.nsplits[1])]) + for chunk in in_sample.chunks: + new_op = op.copy().reset_key() + new_op.random_state = None + new_op.sample_random_state = np.random.RandomState( + random_seeds[0][chunk.index[0]] + ) + new_op.feature_random_state = np.random.RandomState( + random_seeds[1][chunk.index[1]] + ) + new_op.stage = OperandStage.map + new_op.max_samples = max_samples_splits[:, chunk.index[0]] + new_op.max_features = max_features_splits[:, chunk.index[1]] + new_op.n_reducers = n_reducers + new_op.column_offset = int(column_cum_offset[chunk.index[1]]) + + if chunk.index[0] != 0: + new_op.with_feature_indices = False + + if chunk.index[1] != in_sample.chunk_shape[1] - 1: + new_op.with_weights = False + new_op.with_labels = False + + params = chunk.params + params["shape"] = tuple( + s if keep_shape else np.nan + for s, keep_shape in zip(chunk.shape, axis_keep_shape) + ) + + input_chunks = [chunk] + if new_op.with_labels: + input_chunks.append(in_labels.cix[chunk.index[0]]) + if new_op.with_weights: + input_chunks.append(in_weights.cix[chunk.index[0]]) + map_chunks.append(new_op.new_chunk(input_chunks, **params)) + + shuffle_op = LearnShuffleProxy(output_types=[OutputType.tensor]).new_chunk( + map_chunks, dtype=np.dtype(int), shape=() + ) + + remain_reducers = op.n_estimators % n_reducers + reduce_data_chunks = [] + reduce_labels_chunks = [] + reduce_weights_chunks = [] + reduce_feature_chunks = [] + for idx in range(n_reducers): + new_op = op.copy().reset_key() + 
new_op.random_state = None + new_op.stage = OperandStage.reduce + new_op.reducer_ordinal = idx + new_op.n_reducers = n_reducers + new_op.chunk_shape = in_sample.chunk_shape + new_op.n_estimators = op.n_estimators // n_reducers + if remain_reducers: + remain_reducers -= 1 + new_op.n_estimators += 1 + + if new_op.n_estimators == 0: + continue + + kws = [] + + data_params = out_data.params + data_params["index"] = (idx, 0) + data_params["shape"] = (np.nan, out_data.shape[1]) + kws.append(data_params) + + if op.with_labels: + labels_params = out_labels.params + labels_params["index"] = (idx,) + labels_params["shape"] = (np.nan,) + kws.append(labels_params) + + if op.with_weights: + weights_params = out_weights.params + weights_params["index"] = (idx,) + weights_params["shape"] = (np.nan,) + kws.append(weights_params) + + if op.with_feature_indices: + feature_params = { + "index": (idx, 0), + "shape": (new_op.n_estimators, out_feature_indices.shape[1]), + "dtype": np.dtype(int), + } + kws.append(feature_params) + + chunks = new_op.new_chunks([shuffle_op], kws=kws) + ( + data_chunk, + labels_chunk, + weights_chunk, + feature_chunk, + ) = _extract_bagging_io(chunks, op, output=True) + + reduce_data_chunks.append(data_chunk) + if labels_chunk is not None: + reduce_labels_chunks.append(labels_chunk) + if weights_chunk is not None: + reduce_weights_chunks.append(weights_chunk) + if feature_chunk is not None: + reduce_feature_chunks.append(feature_chunk) + + new_op = op.copy().reset_key() + + kws = [ + { + "chunks": reduce_data_chunks, + "nsplits": ((np.nan,) * len(reduce_data_chunks), (out_data.shape[1],)), + **out_data.params, + } + ] + if op.with_labels: + kws.append( + { + "chunks": reduce_labels_chunks, + "nsplits": ((np.nan,) * len(reduce_data_chunks),), + **out_labels.params, + } + ) + if op.with_weights: + kws.append( + { + "chunks": reduce_weights_chunks, + "nsplits": ((np.nan,) * len(reduce_data_chunks),), + **out_weights.params, + } + ) + if op.with_feature_indices: + estimator_nsplit = tuple(c.op.n_estimators for c in reduce_data_chunks) + kws.append( + { + "chunks": reduce_feature_chunks, + "nsplits": (estimator_nsplit, (out_feature_indices.shape[1],)), + **out_feature_indices.params, + } + ) + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def _gen_sample_indices( + cls, + max_range: int, + size: int, + random_state: np.random.RandomState, + with_replacement: bool = False, + ): + if not with_replacement: + result = random_state.choice(np.arange(max_range), size, False) + else: + result = random_state.randint(0, max_range - 1, size) + result.sort() + return result + + @classmethod + def _execute_map(cls, ctx, op: "BaggingSample"): + in_sample, in_labels, in_weights, _ = _extract_bagging_io( + op.inputs, op, output=False + ) + in_sample_data = ctx[in_sample.key] + in_labels_data = ctx[in_labels.key] if op.with_labels else None + in_weights_data = ctx[in_weights.key] if op.with_weights else None + out_samples = op.outputs[0] + + remains = op.n_estimators % op.n_reducers + reducer_iters = [ + itertools.repeat(idx, 1 + op.n_estimators // op.n_reducers) + for idx in range(remains) + ] + reducer_iters += [ + itertools.repeat(idx, op.n_estimators // op.n_reducers) + for idx in range(remains, op.n_reducers) + ] + reducer_iter = itertools.chain(*reducer_iters) + + result_store = defaultdict(lambda: ([], [], [], [])) + for est_id in range(op.n_estimators): + sampled_data = in_sample_data + sampled_labels = in_labels_data + sampled_weights = in_weights_data + + if 
op.max_samples[est_id] != in_sample_data.shape[0]: + sample_indices = cls._gen_sample_indices( + in_sample_data.shape[0], + op.max_samples[est_id], + op.sample_random_state, + op.bootstrap, + ) + + sampled_data = _get_by_iloc(sampled_data, sample_indices) + if sampled_labels is not None: + sampled_labels = _get_by_iloc(sampled_labels, sample_indices) + if sampled_weights is not None: + sampled_weights = _get_by_iloc(sampled_weights, sample_indices) + + if op.max_features[est_id] != in_sample_data.shape[1]: + feature_indices = cls._gen_sample_indices( + in_sample_data.shape[1], + op.max_features[est_id], + op.feature_random_state, + op.bootstrap_features, + ) + + sampled_data = _get_by_iloc(sampled_data, feature_indices, axis=1) + if not op.with_feature_indices: + feature_indices = None + else: + feature_indices = None + + samples, labels, weights, feature_idx_array = result_store[ + next(reducer_iter) + ] + samples.append(sampled_data) + if sampled_labels is not None: + labels.append(sampled_labels) + if sampled_weights is not None: + weights.append(sampled_weights) + if feature_indices is not None: + feature_idx_array.append(feature_indices + op.column_offset) + + for ( + reducer_id, + ( + samples, + labels, + weights, + feature_idx_array, + ), + ) in result_store.items(): + ctx[out_samples.key, (reducer_id, 0)] = ( + ctx.get_current_chunk().index, + tuple(samples + labels + weights + feature_idx_array), + ) + + @classmethod + def _execute_reduce(cls, ctx, op: "BaggingSample"): + out_data, out_labels, out_weights, out_feature_indices = _extract_bagging_io( + op.outputs, op, output=True + ) + sample_holder = [ + np.empty(op.chunk_shape, dtype=object) for _ in range(op.n_estimators) + ] + + labels_holder = ( + [np.empty(op.chunk_shape[0], dtype=object) for _ in range(op.n_estimators)] + if op.with_labels + else None + ) + + weights_holder = ( + [np.empty(op.chunk_shape[0], dtype=object) for _ in range(op.n_estimators)] + if op.with_weights + else None + ) + + feature_indices_holder = ( + [np.empty(op.chunk_shape[1], dtype=object) for _ in range(op.n_estimators)] + if op.with_feature_indices + else None + ) + + input_indexes = [idx for idx, _ in op.iter_mapper_data(ctx)] + for input_key, input_idx in zip(op.iter_mapper_keys(), input_indexes): + add_feature_index = input_idx[0] == 0 + add_label_weight = input_idx[1] == op.chunk_shape[1] - 1 + chunk_data = ctx[input_key, out_data.index][-1] + + num_groups = 1 + if add_feature_index and op.with_feature_indices: + # contains feature indices + num_groups += 1 + if add_label_weight: # contains label or weight + num_groups += int(op.with_weights) + int(op.with_labels) + + sample_count = len(chunk_data) // num_groups + assert len(chunk_data) % num_groups == 0 + + group_iter = ( + chunk_data[i * sample_count : (i + 1) * sample_count] + for i in range(num_groups) + ) + + for data_idx, sample in enumerate(next(group_iter)): + sample_holder[data_idx][input_idx] = sample + + if add_label_weight: + if op.with_labels: + for data_idx, label in enumerate(next(group_iter)): + labels_holder[data_idx][input_idx[0]] = label + if op.with_weights: + for data_idx, weight in enumerate(next(group_iter)): + weights_holder[data_idx][input_idx[0]] = weight + + if add_feature_index and op.with_feature_indices: + for data_idx, feature_index in enumerate(next(group_iter)): + feature_indices_holder[data_idx][input_idx[1]] = feature_index + + data_results: List[Optional[np.ndarray]] = [None] * len(sample_holder) + for est_idx, sample_mat in enumerate(sample_holder): + 
row_chunks = np.apply_along_axis( + _concat_by_row, axis=0, arr=sample_mat, out_chunk=out_data + ) + data_results[est_idx] = _concat_on_axis( + row_chunks[0].tolist(), axis=1, out_chunk=out_data + ) + ctx[out_data.key] = tuple(data_results) + + for out, holder in zip( + (out_labels, out_weights, out_feature_indices), + (labels_holder, weights_holder, feature_indices_holder), + ): + if out is None: + continue + results: List[Optional[np.ndarray]] = [None] * len(holder) + for est_idx, labels_vct in enumerate(holder): + results[est_idx] = _concat_on_axis(labels_vct.tolist(), out_chunk=out) + if holder is feature_indices_holder: + ctx[out.key] = np.stack(results) + else: + ctx[out.key] = tuple(results) + + @classmethod + def execute(cls, ctx, op: "BaggingSample"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + +class BaggingSampleReindex(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BAGGING_SHUFFLE_REINDEX + + n_estimators: int = Int64Field("n_estimators") + feature_indices: TileableType = ReferenceField("feature_indices", default=None) + + start_col_index: int = Int64Field("start_col_index", 0) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.feature_indices is not None: + self.feature_indices = inputs[-1] + + def __call__(self, data: TileableType, feature_indices: TileableType = None): + self._output_types = get_output_types(data) + inputs = [data] + self.feature_indices = feature_indices + params = data.params + if feature_indices is not None: + inputs.append(feature_indices) + params["shape"] = (data.shape[0], np.nan) + if isinstance(data, DATAFRAME_TYPE): + params["index_value"] = parse_index(pd.Index([], dtype=np.int64), data.key) + return self.new_tileable(inputs, **params) + + @classmethod + def tile(cls, op: "BaggingSampleReindex"): + t_data = op.inputs[0] + t_out = op.outputs[0] + t_feature_idxes = op.feature_indices + cum_nsplits = np.cumsum(np.concatenate([[0], t_data.nsplits[1]])) + + if t_feature_idxes is None: + out = t_data + if out.chunk_shape[1] > 1: + out = yield from recursive_tile(out.rechunk({1: (out.shape[1],)})) + return out + + # generate map chunks + map_holder = np.empty( + t_data.chunk_shape + (t_feature_idxes.chunk_shape[0],), + dtype=np.dtype(object), + ) + for chunk in t_data.chunks: + for feature_idx_chunk in t_feature_idxes.chunks: + new_op = op.copy().reset_key() + new_op.stage = OperandStage.map + new_op.start_col_index = int(cum_nsplits[chunk.index[1]]) + params = chunk.params + new_index = params["index"] = chunk.index + ( + feature_idx_chunk.index[0], + ) + if t_feature_idxes.chunk_shape[0] == 1: + new_index = new_index[:-1] + map_holder[new_index] = new_op.new_chunk( + [chunk, feature_idx_chunk], **params + ) + if op.feature_indices.chunk_shape[0] == 1: + chunks = map_holder.reshape((t_data.chunk_shape[0],)).tolist() + else: + + def _gen_combine_chunk(chunks): + new_op = op.copy().reset_key() + new_op.feature_indices = None + new_op.stage = OperandStage.combine + params = chunks[0].params + params["shape"] = (chunks[0].shape[0], op.feature_indices.shape[1]) + params["index"] = (chunks[0].index[0], chunks[0].index[2]) + if isinstance(t_data, DATAFRAME_TYPE): + params["index_value"] = parse_index( + pd.Index([], dtype=np.int64), chunks[0].key + ) + inputs = chunks.tolist() + return new_op.new_chunk(inputs, **params) + + chunks_array = np.apply_along_axis(_gen_combine_chunk, 1, map_holder) + chunks = chunks_array.reshape((chunks_array.size,)).tolist() + + new_op = 
op.copy().reset_key() + new_nsplits = ( + t_data.nsplits[0], + (op.feature_indices.shape[1],) * t_feature_idxes.chunk_shape[0], + ) + return new_op.new_tileables( + op.inputs, chunks=chunks, nsplits=new_nsplits, **t_out.params + ) + + @classmethod + def _execute_map(cls, ctx, op: "BaggingSampleReindex"): + data = ctx[op.inputs[0].key] + feature_idx = ctx[op.feature_indices.key] - op.start_col_index + filtered = [] + for row in feature_idx: + row = row[(row >= 0) & (row < data.shape[1])] + filtered.append(_get_by_iloc(data, row, axis=1)) + ctx[op.outputs[0].key] = tuple(filtered) + + @classmethod + def _execute_combine(cls, ctx, op: "BaggingSampleReindex"): + data_inputs = [ctx[c.key] for c in op.inputs] + concatenated = [] + for data_input in zip(*data_inputs): + concatenated.append(_concat_on_axis(data_input, 1, op.inputs[0])) + ctx[op.outputs[0].key] = tuple(concatenated) + + @classmethod + def execute(cls, ctx, op: "BaggingSampleReindex"): + if op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + else: + cls._execute_map(ctx, op) + + +class BaggingFitOperand(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BAGGING_FIT + + base_estimator: BaseEstimator = AnyField("base_estimator") + estimator_params: dict = DictField("estimator_params", default=None) + n_estimators: int = Int64Field("n_estimators") + max_samples = AnyField("max_samples", default=1.0) + max_features = AnyField("max_features", default=1.0) + bootstrap: bool = BoolField("bootstrap", default=False) + bootstrap_features: bool = BoolField("bootstrap_features", default=True) + random_state = RandomStateField("random_state", default=None) + + reducer_ratio: float = Float32Field("reducer_ratio") + n_reducers: int = Int64Field("n_reducers") + + labels: TileableType = ReferenceField("labels", default=None) + weights: TileableType = ReferenceField("weights", default=None) + feature_indices: TileableType = ReferenceField("feature_indices", default=None) + with_feature_indices: bool = BoolField("with_feature_indices", default=None) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.random_state is None: + self.random_state = np.random.RandomState() + if self.with_feature_indices is None: + full_features = ( + isinstance(self.max_features, float) and self.max_features == 1.0 + ) + self.with_feature_indices = not full_features or self.bootstrap_features + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + input_iter = iter(inputs) + next(input_iter) + if self.labels is not None: + self.labels = next(input_iter) + if self.weights is not None: + self.weights = next(input_iter) + if self.feature_indices is not None: + self.feature_indices = next(input_iter) + + def _get_bagging_sample_tileables(self, samples=None): + samples = samples or self.inputs[0] + sample_op = BaggingSample( + n_estimators=self.n_estimators, + max_samples=self.max_samples, + max_features=self.max_features, + bootstrap=self.bootstrap, + bootstrap_features=self.bootstrap_features, + random_state=self.random_state, + reducer_ratio=self.reducer_ratio, + n_reducers=self.n_reducers, + with_weights=self.weights is not None, + with_labels=self.labels is not None, + with_feature_indices=self.with_feature_indices, + ) + return _extract_bagging_io( + sample_op(samples, self.labels, self.weights), sample_op, output=True + ) + + @property + def output_limit(self) -> int: + if self.with_feature_indices: + return 2 + return 1 + + def __call__( + self, + in_data: TileableType, + in_labels: Optional[TileableType] = None, + 
in_weights: Optional[TileableType] = None, + feature_indices: TileableType = None, + ): + self._output_types = [OutputType.tensor] + inputs = [in_data] + + if in_labels is not None: + self.labels = in_labels + inputs.append(in_labels) + if in_weights is not None: + self.weights = in_weights + inputs.append(in_weights) + + if feature_indices is not None: + self.feature_indices = feature_indices + inputs.append(feature_indices) + + kws = [dict(shape=(self.n_estimators,), dtype=np.dtype(object))] + if self.with_feature_indices: + self._output_types.append(OutputType.tensor) + sample_tileables = self._get_bagging_sample_tileables(in_data) + kws.append(sample_tileables[-1].params) + + return self.new_tileables(inputs, kws=kws) + + @classmethod + def tile(cls, op: "BaggingFitOperand"): + out = op.outputs[0] + sample_tileables = op._get_bagging_sample_tileables() + tiled_sample_iter = iter( + ( + yield from recursive_tile( + tuple(t for t in sample_tileables if t is not None) + ) + ) + ) + sampled, labels, weights, feature_indices = ( + t if t is None else next(tiled_sample_iter) for t in sample_tileables + ) + + estimator_nsplits = (tuple(c.op.n_estimators for c in sampled.chunks),) + + label_chunks = itertools.repeat(None) if labels is None else labels.chunks + weight_chunks = itertools.repeat(None) if weights is None else weights.chunks + + out_chunks = [] + seeds = gen_random_seeds(len(sampled.chunks), op.random_state) + for sample_chunk, label_chunk, weight_chunk, n_estimators in zip( + sampled.chunks, label_chunks, weight_chunks, estimator_nsplits[0] + ): + chunk_op = BaggingFitOperand( + base_estimator=op.base_estimator, + estimator_params=op.estimator_params, + labels=label_chunk, + weights=weight_chunk, + n_estimators=n_estimators, + with_feature_indices=False, + random_state=sklearn_check_random_state(seeds[sample_chunk.index[0]]), + ) + chunk_op._output_types = op._output_types + inputs = [ + c for c in [sample_chunk, label_chunk, weight_chunk] if c is not None + ] + out_chunks.append( + chunk_op.new_chunk( + inputs, + index=(sample_chunk.index[0],), + shape=(n_estimators,), + dtype=out.dtype, + ) + ) + + out_op = op.copy().reset_key() + kws = [ + dict(chunks=out_chunks, nsplits=estimator_nsplits, **out.params), + ] + if feature_indices is not None: + kws.append( + dict( + chunks=feature_indices.chunks, + nsplits=feature_indices.nsplits, + **feature_indices.params, + ) + ) + return out_op.new_tileables(op.inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BaggingFitOperand"): + sampled_data = ctx[op.inputs[0].key] + labels_data = ( + ctx[op.labels.key] if op.labels is not None else itertools.repeat(None) + ) + weights_data = ( + ctx[op.weights.key] if op.weights is not None else itertools.repeat(None) + ) + + for k, v in (op.estimator_params or dict()).items(): + setattr(op.base_estimator, k, v) + + new_estimators = [] + seeds = gen_random_seeds(len(sampled_data), op.random_state) + for idx, (sampled, label, weights) in enumerate( + zip(sampled_data, labels_data, weights_data) + ): + estimator = _make_estimator(op.base_estimator, seeds[idx]) + estimator.fit(sampled, y=label, sample_weight=weights) + new_estimators.append(estimator) + ctx[op.outputs[0].key] = np.array(new_estimators, dtype=np.dtype(object)) + + +class PredictionType(enum.Enum): + REGRESSION = 0 + PROBABILITY = 1 + LOG_PROBABILITY = 2 + DECISION_FUNCTION = 3 + + +class BaggingPredictionOperand(LearnOperand, LearnOperandMixin): + _op_type_ = 
opcodes.BAGGING_PREDICTION + + estimators: TileableType = ReferenceField("estimators") + feature_indices: TileableType = ReferenceField("feature_indices", default=None) + n_classes: Optional[int] = Int64Field("n_classes", default=None) + prediction_type: PredictionType = Int8Field( + "prediction_type", + on_serialize=lambda x: x.value, + on_deserialize=PredictionType, + default=PredictionType.PROBABILITY, + ) + decision_function: Callable = FunctionField("decision_function", default=None) + calc_means: bool = BoolField("calc_means", default=True) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(inputs[1:]) + self.estimators = next(input_iter) + if self.feature_indices is not None: + self.feature_indices = next(input_iter) + + def __call__( + self, + instances: TileableType, + estimators: TileableType, + feature_indices: TileableType = None, + ) -> TileableType: + self._output_types = [OutputType.tensor] + self.estimators = estimators + self.feature_indices = feature_indices + + if self.n_classes is not None: + shape = (instances.shape[0], estimators.shape[0], self.n_classes) + else: + shape = (instances.shape[0], estimators.shape[0]) + if self.calc_means: + shape = (shape[0],) + shape[2:] + + params = {"dtype": np.dtype(float), "shape": shape} + inputs = [instances, estimators] + if feature_indices is not None: + inputs.append(feature_indices) + return self.new_tileable(inputs, **params) + + def _get_class_shape(self): + if self.n_classes and self.n_classes > 2: + return self.n_classes + elif self.prediction_type == PredictionType.DECISION_FUNCTION: + return None + else: + return self.n_classes + + @classmethod + def _build_chunks_without_feature_indices( + cls, op: "BaggingPredictionOperand", t_instances: TileableType + ): + class_shape = op._get_class_shape() + chunks = [] + for c_instance in t_instances.chunks: + for c_estimator in op.estimators.chunks: + if class_shape is not None: + params = { + "dtype": np.dtype(float), + "shape": ( + c_instance.shape[0], + class_shape, + c_estimator.shape[0], + ), + "index": (c_instance.index[0], 0, c_estimator.index[0]), + } + else: + params = { + "dtype": np.dtype(float), + "shape": (c_instance.shape[0], c_estimator.shape[0]), + "index": (c_instance.index[0], c_estimator.index[0]), + } + new_op = op.copy().reset_key() + new_op.feature_indices = None + chunks.append(new_op.new_chunk([c_instance, c_estimator], **params)) + return chunks + + @classmethod + def _build_chunks_with_feature_indices( + cls, op: "BaggingPredictionOperand", t_instances: TileableType + ): + class_shape = op._get_class_shape() + chunks = [] + for c in t_instances.chunks: + estimator_chunk = op.estimators.chunks[c.index[1]] + + if class_shape is not None: + params = { + "dtype": np.dtype(float), + "shape": (c.shape[0], class_shape, estimator_chunk.shape[0]), + "index": (c.index[0], 0, c.index[1]), + } + else: + params = { + "dtype": np.dtype(float), + "shape": (c.shape[0], estimator_chunk.shape[0]), + "index": c.index, + } + + new_op = op.copy().reset_key() + new_op.feature_indices = None + chunks.append(new_op.new_chunk([c, estimator_chunk], **params)) + return chunks + + @classmethod + def tile(cls, op: "BaggingPredictionOperand"): + n_estimators = op.estimators.shape[0] + reindex_op = BaggingSampleReindex(n_estimators=n_estimators) + t_instances = yield from recursive_tile( + reindex_op(op.inputs[0], op.feature_indices) + ) + + # for classifiers, form instance-class-estimator array + # for regressors, form instance-estimator array + # and 
then sum over estimator axis + + if op.feature_indices is None: + chunks = cls._build_chunks_without_feature_indices(op, t_instances) + else: + chunks = cls._build_chunks_with_feature_indices(op, t_instances) + + new_op = op.copy().reset_key() + class_shape = op._get_class_shape() + if class_shape is not None: + params = { + "dtype": np.dtype(float), + "shape": (t_instances.shape[0], class_shape, n_estimators), + } + nsplits = (t_instances.nsplits[0], (class_shape,), op.estimators.nsplits[0]) + else: + params = { + "dtype": np.dtype(float), + "shape": (t_instances.shape[0], n_estimators), + } + nsplits = (t_instances.nsplits[0], op.estimators.nsplits[0]) + estimator_probas = new_op.new_tileable( + op.inputs, chunks=chunks, nsplits=nsplits, **params + ) + + if not op.calc_means: + return estimator_probas + elif op.prediction_type != PredictionType.LOG_PROBABILITY: + return [ + ( + yield from recursive_tile( + mt.sum(estimator_probas, axis=-1) / n_estimators + ) + ) + ] + else: + return [ + ( + yield from recursive_tile( + mt.log(mt.exp(estimator_probas).sum(axis=-1)) + - np.log(n_estimators) + ) + ) + ] + + @classmethod + def _predict_proba(cls, instance, estimator, n_classes): + n_samples = instance.shape[0] + proba = np.zeros((n_samples, n_classes)) + + if hasattr(estimator, "predict_proba"): + proba_estimator = estimator.predict_proba(instance) + if n_classes == len(estimator.classes_): + proba += proba_estimator + + else: + proba[:, estimator.classes_] += proba_estimator[ + :, range(len(estimator.classes_)) + ] + else: + # Resort to voting + predictions = estimator.predict(instance) + for i in range(n_samples): + proba[i, predictions[i]] += 1 + return proba + + @classmethod + def _predict_log_proba(cls, instance, estimator, n_classes): + """Private function used to compute log probabilities within a job.""" + if not hasattr(estimator, "predict_log_proba"): + return np.log(cls._predict_proba(instance, estimator, n_classes)) + + n_samples = instance.shape[0] + log_proba = np.empty((n_samples, n_classes)) + log_proba.fill(-np.inf) + all_classes = np.arange(n_classes, dtype=int) + + log_proba_estimator = estimator.predict_log_proba(instance) + + if n_classes == len(estimator.classes_): + log_proba = np.logaddexp(log_proba, log_proba_estimator) + else: # pragma: no cover + log_proba[:, estimator.classes_] = np.logaddexp( + log_proba[:, estimator.classes_], + log_proba_estimator[:, range(len(estimator.classes_))], + ) + missing = np.setdiff1d(all_classes, estimator.classes_) + log_proba[:, missing] = np.logaddexp(log_proba[:, missing], -np.inf) + return log_proba + + @classmethod + def _decision_function(cls, instance, estimator, func=None): + if func is not None: + return func(instance, estimator) + else: + return estimator.decision_function(instance) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BaggingPredictionOperand"): + instances = ctx[op.inputs[0].key] + estimators = ctx[op.estimators.key] + if not isinstance(instances, tuple): + instances = [instances] * len(estimators) + + estimate_results = [] + for instance, estimator in zip(instances, estimators): + # classifier + if op.prediction_type == PredictionType.PROBABILITY: + estimate_results.append( + cls._predict_proba(instance, estimator, op.n_classes) + ) + elif op.prediction_type == PredictionType.LOG_PROBABILITY: + estimate_results.append( + cls._predict_log_proba(instance, estimator, op.n_classes) + ) + elif op.prediction_type == PredictionType.DECISION_FUNCTION: + estimate_results.append( + 
cls._decision_function(instance, estimator, op.decision_function) + ) + else: + estimate_results.append(estimator.predict(instance)) + + out = op.outputs[0] + ctx[out.key] = np.stack(estimate_results, axis=out.ndim - 1) + + +class BaseBagging: + def __init__( + self, + base_estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + reducers=1.0, + ): + self.base_estimator = base_estimator + self.n_estimators = n_estimators + + self.max_samples = max_samples + self.max_features = max_features + self.bootstrap = bootstrap + self.bootstrap_features = bootstrap_features + self.oob_score = oob_score + self.warm_start = warm_start + self.n_jobs = n_jobs + self.random_state = ( + np.random.RandomState(random_state) + if isinstance(random_state, int) + else random_state + ) + self.verbose = verbose + self.reducers = reducers + + self.estimators_ = None + self.estimator_features_ = None + + def _validate_y(self, y, session=None, run_kwargs=None): + if len(y.shape) == 1 or y.shape[1] == 1: + return column_or_1d(y, warn=True) + else: + return y + + def _fit( + self, + X, + y=None, + sample_weight=None, + max_samples=None, + estimator_params=None, + session=None, + run_kwargs=None, + ): + estimator_features, feature_indices = None, None + n_more_estimators = self.n_estimators + + X = convert_to_tensor_or_dataframe(X) + y = convert_to_tensor_or_dataframe(y) if y is not None else None + sample_weight = ( + convert_to_tensor_or_dataframe(sample_weight) + if sample_weight is not None + else None + ) + + y = self._validate_y(y) + + if self.warm_start: + feature_indices = self.estimator_features_ + if self.estimators_ is not None: + exist_estimators = self.estimators_.shape[0] + # move random states to skip duplicated results + self.random_state.rand(exist_estimators) + n_more_estimators = self.n_estimators - exist_estimators + + if n_more_estimators < 0: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, self.estimators_.shape[0]) + ) + elif n_more_estimators == 0: + warnings.warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." + ) + return self + + fit_op = BaggingFitOperand( + base_estimator=self.base_estimator, + estimator_params=estimator_params, + n_estimators=n_more_estimators, + max_samples=max_samples or self.max_samples, + max_features=self.max_features, + bootstrap=self.bootstrap, + bootstrap_features=self.bootstrap_features, + random_state=self.random_state, + reducer_ratio=self.reducers if isinstance(self.reducers, float) else None, + n_reducers=self.reducers if isinstance(self.reducers, int) else None, + ) + tileables = fit_op(X, y, sample_weight, feature_indices) + ret = execute(*tileables, session=session, **(run_kwargs or dict())) + + if len(ret) == 2: + estimators, estimator_features = ret + else: + estimators = ret + + if self.estimators_ is not None: + estimators = mt.concatenate([self.estimators_, estimators]) + if self.estimator_features_ is not None: + estimator_features = mt.concatenate( + [self.estimator_features_, estimator_features] + ) + + self.estimators_, self.estimator_features_ = estimators, estimator_features + return self + + def fit(self, X, y=None, sample_weight=None, session=None, run_kwargs=None): + """ + Build a Bagging ensemble of estimators from the training set (X, y). 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. + + Returns + ------- + self : object + Fitted estimator. + """ + return self._fit( + X, y, sample_weight=sample_weight, session=session, run_kwargs=run_kwargs + ) + + +class BaggingClassifier(ClassifierMixin, BaseBagging): + """ + A Bagging classifier. + + A Bagging classifier is an ensemble meta-estimator that fits base + classifiers each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + base_estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeClassifier`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. + + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. 
+ Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + + estimators_ : list of estimators + The collection of fitted base estimators. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int or list + The number of classes. + + See Also + -------- + BaggingRegressor : A Bagging regressor. + + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVC + >>> from mars.learn.ensemble import BaggingClassifier + >>> from mars.learn.datasets import make_classification + >>> X, y = make_classification(n_samples=100, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = BaggingClassifier(base_estimator=SVC(), + ... n_estimators=10, random_state=0).fit(X, y) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + """ + + def _validate_y(self, y, session=None, run_kwargs=None): + to_run = [check_classification_targets(y)] + y = column_or_1d(y, warn=True) + to_run.extend(mt.unique(y, return_inverse=True)) + _, self.classes_, y = execute( + *to_run, session=session, **(run_kwargs or dict()) + ) + self.n_classes_ = len(self.classes_) + + return y + + def _predict_proba(self, X): + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + n_classes=self.n_classes_, + prediction_type=PredictionType.PROBABILITY, + ) + return predict_op(X, self.estimators_, self.estimator_features_) + + def predict(self, X, session=None, run_kwargs=None): + """ + Predict class for X. + + The predicted class of an input sample is computed as the class with + the highest mean predicted probability. If base estimators do not + implement a ``predict_proba`` method, then it resorts to voting. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted classes. + """ + probas = self._predict_proba(X) + y = self.classes_.take(mt.argmax(probas, axis=1), axis=0) + return execute(y, session=session, **(run_kwargs or dict())) + + def predict_proba(self, X, session=None, run_kwargs=None): + """ + Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the mean predicted class probabilities of the base estimators in the + ensemble. If base estimators do not implement a ``predict_proba`` + method, then it resorts to voting and the predicted class probabilities + of an input sample represents the proportion of estimators predicting + each class. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. 
Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + probas = self._predict_proba(X) + return execute(probas, session=session, **(run_kwargs or dict())) + + def predict_log_proba(self, X, session=None, run_kwargs=None): + """ + Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the log of the mean predicted class probabilities of the base + estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + n_classes=self.n_classes_, + prediction_type=PredictionType.LOG_PROBABILITY, + ) + probas = predict_op(X, self.estimators_, self.estimator_features_) + return execute(probas, session=session, **(run_kwargs or dict())) + + def decision_function(self, X, session=None, run_kwargs=None): + """ + Average of the decision functions of the base classifiers. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + score : ndarray of shape (n_samples, k) + The decision function of the input samples. The columns correspond + to the classes in sorted order, as they appear in the attribute + ``classes_``. Regression and binary classification are special + cases with ``k == 1``, otherwise ``k==n_classes``. + """ + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + n_classes=self.n_classes_, + prediction_type=PredictionType.DECISION_FUNCTION, + ) + result = predict_op(X, self.estimators_, self.estimator_features_) + return execute(result, session=session, **(run_kwargs or dict())) + + +class BaggingRegressor(RegressorMixin, BaseBagging): + """ + A Bagging regressor. + + A Bagging regressor is an ensemble meta-estimator that fits base + regressors each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + base_estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeRegressor`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. + + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + + estimators_ : list of estimators + The collection of fitted sub-estimators. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + See Also + -------- + BaggingClassifier : A Bagging classifier. + + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVR + >>> from mars.learn.ensemble import BaggingRegressor + >>> from mars.learn.datasets import make_regression + >>> X, y = make_regression(n_samples=100, n_features=4, + ... n_informative=2, n_targets=1, + ... random_state=0, shuffle=False) + >>> regr = BaggingRegressor(base_estimator=SVR(), + ... n_estimators=10, random_state=0).fit(X, y) + >>> regr.predict([[0, 0, 0, 0]]) + array([-2.8720...]) + """ + + def predict(self, X, session=None, run_kwargs=None): + """ + Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. 
+ + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + prediction_type=PredictionType.REGRESSION, + ) + probas = predict_op(X, self.estimators_, self.estimator_features_) + return execute(probas, session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/ensemble/_blockwise.py b/python/xorbits/_mars/learn/ensemble/_blockwise.py new file mode 100644 index 000000000..b0fbe6db5 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/_blockwise.py @@ -0,0 +1,389 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +import numpy as np +from sklearn.base import BaseEstimator as SklearnBaseEstimator +from sklearn.base import clone +from sklearn.utils.validation import check_is_fitted + +from ... import execute, opcodes +from ... import tensor as mt +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...core.context import Context +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Int64Field, + KeyField, + ListField, + StringField, +) +from ...tensor.core import Tensor, TensorOrder +from ...tensor.utils import decide_unify_split +from ...typing import SessionType +from ..base import BaseEstimator, ClassifierMixin, RegressorMixin +from ..operands import LearnOperand, LearnOperandMixin +from ..utils import check_array + + +class BlockwiseEnsembleFit(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BLOCKWISE_ENSEMBLE_FIT + + x = KeyField("x") + y = KeyField("y") + estimator = AnyField("estimator") + kwargs = DictField("kwargs", default_factory=dict) + + def __call__(self): + self._output_types = [OutputType.object] + return self.new_tileable([self.x, self.y]) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.x = self._inputs[0] + self.y = self._inputs[1] + + @classmethod + def tile(cls, op: "BlockwiseEnsembleFit"): + X, y = op.x, op.y + x_split = X.nsplits[0] + y_split = y.nsplits[0] + out = op.outputs[0] + + if any(np.isnan(s) for s in x_split + y_split) or np.isnan( + X.shape[1] + ): # pragma: no cover + yield + + if x_split != y_split or X.chunk_shape[1] > 1: + x_split = y_split = decide_unify_split(x_split, y_split) + X = X.rechunk({0: x_split, 1: X.shape[1]}) + y = y.rechunk({0: y_split}) + X, y = yield from recursive_tile(X, y) + + out_chunks = [] + for i, _ in enumerate(x_split): + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [X.cix[i, 0], y.cix[(i,)]], + index=(i,), + ) + out_chunks.append(out_chunk) + + params = out.params.copy() + params["chunks"] = out_chunks + params["nsplits"] = ((np.nan,) * len(x_split),) + return op.copy().new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BlockwiseEnsembleFit"): + x, y = ctx[op.inputs[0].key], ctx[op.inputs[1].key] + estimator = clone(op.estimator) + 
ctx[op.outputs[0].key] = estimator.fit(x, y, **op.kwargs) + + +class BlockwiseEnsemblePredict(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.BLOCKWISE_ENSEMBLE_PREDICT + + x = KeyField("x") + estimators = ListField("estimators", FieldTypes.key) + voting = StringField("voting", default="hard") + proba = BoolField("proba", default=None) + is_classifier = BoolField("is_classifier") + n_classes = Int64Field("n_classes") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.x = self._inputs[0] + self.estimators = self._inputs[1:] + + def __call__(self): + self._output_types = [OutputType.tensor] + x_len = self.x.shape[0] + if self.is_classifier: + shape = (x_len, self.n_classes) if self.proba else (x_len,) + dtype = np.dtype(np.float64) if self.proba else np.dtype(np.int64) + else: + shape = (x_len,) + dtype = np.dtype(np.float64) + return self.new_tileable( + [self.x] + self.estimators, + shape=shape, + dtype=dtype, + order=TensorOrder.C_ORDER, + ) + + @classmethod + def tile(cls, op: "BlockwiseEnsemblePredict"): + out = op.outputs[0] + x = op.x + estimators = op.estimators[0] + estimators_chunks = estimators.chunks + + out_chunks = [] + for chunk in x.chunks: + chunk_op = op.copy().reset_key() + if out.ndim == 2: + chunk_shape = (chunk.shape[0], out.shape[1]) + chunk_index = (chunk.index[0], 0) + else: + chunk_shape = (chunk.shape[0],) + chunk_index = (chunk.index[0],) + out_chunk = chunk_op.new_chunk( + [chunk] + estimators_chunks, + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + out_chunks.append(out_chunk) + + if out.ndim == 2: + nsplits = (x.nsplits[0], (out.shape[1],)) + else: + nsplits = (x.nsplits[0],) + params = out.params.copy() + params["nsplits"] = nsplits + params["chunks"] = out_chunks + return op.copy().new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "BlockwiseEnsemblePredict"): + x = ctx[op.inputs[0].key] + estimators = [ctx[inp.key] for inp in op.inputs[1:]] + if op.proba or op.voting == "soft": + predictions = [estimator.predict_proba(x) for estimator in estimators] + else: + predictions = [estimator.predict(x) for estimator in estimators] + + if op.is_classifier: + if not op.proba: + result = cls._execute_classifier_predict(predictions, op) + else: + result = cls._execute_classifier_predict_proba(predictions, op) + else: + result = cls._execute_regressor_predict(predictions) + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_classifier_predict( + cls, predictions: List[np.ndarray], op: "BlockwiseEnsemblePredict" + ): + if op.voting == "soft": + prob = np.average(np.stack(predictions), axis=0) + return np.argmax(prob, axis=1) + else: + + def vote(x: np.ndarray): + return np.argmax(np.bincount(x)) + + # hard voting + prediction = np.vstack(predictions).T + return np.apply_along_axis(vote, 1, prediction) + + @classmethod + def _execute_classifier_predict_proba( + cls, predictions: List[np.ndarray], op: "BlockwiseEnsemblePredict" + ): + assert op.voting == "soft" + return np.average(np.stack(predictions), axis=0) + + @classmethod + def _execute_regressor_predict(cls, predictions: List[np.ndarray]): + return np.average(np.vstack(predictions), axis=0) + + +class BlockwiseBaseEstimator(BaseEstimator): + def __init__(self, estimator: SklearnBaseEstimator): + self.estimator = estimator + + def _fit(self, X, y, **kwargs): + X = check_array(X) + op = BlockwiseEnsembleFit(x=X, y=y, estimator=self.estimator, kwargs=kwargs) + self.estimators_ = op() + + 
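Before the classifier and regressor wrappers below, a minimal standalone sketch of the aggregation that BlockwiseEnsemblePredict.execute performs on each chunk may help: hard voting takes the most frequent predicted label across the fitted sub-estimators, while soft voting averages their probability outputs before the argmax. The arrays here are hypothetical stand-ins for illustration, not data or APIs from this changeset.

import numpy as np

def aggregate_hard(predictions):
    # predictions: one (n_samples,) integer label array per fitted sub-estimator
    stacked = np.vstack(predictions).T  # shape (n_samples, n_estimators)
    return np.apply_along_axis(lambda row: np.argmax(np.bincount(row)), 1, stacked)

def aggregate_soft(probabilities):
    # probabilities: one (n_samples, n_classes) probability array per sub-estimator
    return np.argmax(np.average(np.stack(probabilities), axis=0), axis=1)

preds = [np.array([0, 1, 1]), np.array([0, 1, 0]), np.array([0, 0, 1])]
print(aggregate_hard(preds))  # -> [0 1 1]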
+class BlockwiseVotingClassifier(ClassifierMixin, BlockwiseBaseEstimator): + """ + Blockwise training and ensemble voting classifier. + + This classifier trains on blocks / partitions of tensors or DataFrames. + A cloned version of `estimator` will be fit *independently* on each block + or partition of the data. This is useful when the sub estimator + only works on small in-memory data structures like a NumPy array or pandas + DataFrame. + + Prediction is done by the *ensemble* of learned models. + + .. warning:: + + Ensure that your data are sufficiently shuffled prior to training! + If the values of the various blocks / partitions of your dataset are not + distributed similarly, the classifier will give poor results. + + Parameters + ---------- + estimator : Estimator + voting : str, {'hard', 'soft'} (default='hard') + If 'hard', uses predicted class labels for majority rule voting. + Else if 'soft', predicts the class label based on the argmax of + the sums of the predicted probabilities, which is recommended for + an ensemble of well-calibrated classifiers. + classes : list-like, optional + The set of classes that `y` can take. This can also be provided as + a fit param if the underlying estimator requires `classes` at fit time. + + Attributes + ---------- + estimators_ : list of classifiers + The collection of fitted sub-estimators that are `estimator` fitted + on each partition / block of the inputs. + + classes_ : array-like, shape (n_predictions,) + The class labels. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.ensemble import BlockwiseVotingClassifier + >>> from sklearn.linear_model import RidgeClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100_000) + >>> X, y = mt.tensor(X, chunk_size=10_0000), mt.tensor(y, chunk_size=10_0000) + >>> subestimator = RidgeClassifier(random_state=0) + >>> clf = BlockwiseVotingClassifier(subestimator) + >>> clf.fit(X, y) + """ + + def __init__( + self, + estimator: SklearnBaseEstimator, + voting: str = "hard", + classes: Union[np.ndarray, list, Tensor] = None, + ): + super().__init__(estimator=estimator) + if voting not in ("hard", "soft"): # pragma: no cover + raise ValueError("`voting` could be hard or soft") + self.voting = voting + self.classes = None + if classes is not None: + self.classes = mt.tensor(classes) + + def fit( + self, + X, + y, + classes: Union[np.ndarray, list, Tensor] = None, + session: SessionType = None, + run_kwargs: dict = None, + **kwargs, + ): + if not isinstance(y, ENTITY_TYPE): + y = mt.tensor(y) + if classes is None: + classes = self.classes + to_execute = [] + if classes is None: + classes = mt.unique(y) + to_execute.append(classes) + super()._fit(X, y, **kwargs) + to_execute.append(self.estimators_) + execute(to_execute, session=session, **(run_kwargs or dict())) + self.n_classes_ = len(classes) + + def predict(self, X, session: SessionType = None, run_kwargs: dict = None): + check_is_fitted(self, attributes=["estimators_"]) + X = check_array(X) + op = BlockwiseEnsemblePredict( + x=X, + estimators=[self.estimators_], + voting=self.voting, + proba=False, + is_classifier=True, + n_classes=self.n_classes_, + ) + return op().execute(session=session, **(run_kwargs or dict())) + + def predict_proba(self, X, session: SessionType = None, run_kwargs: dict = None): + if self.voting == "hard": + raise AttributeError(f'predict_proba is not available when voting="hard"') + + check_is_fitted(self, attributes=["estimators_"]) + X = 
check_array(X) + op = BlockwiseEnsemblePredict( + x=X, + estimators=[self.estimators_], + voting=self.voting, + proba=True, + is_classifier=True, + n_classes=self.n_classes_, + ) + return op().execute(session=session, **(run_kwargs or dict())) + + +class BlockwiseVotingRegressor(RegressorMixin, BlockwiseBaseEstimator): + """ + Blockwise training and ensemble voting regressor. + + This regressor trains on blocks / partitions of tensors or DataFrames. + A cloned version of `estimator` will be fit *independently* on each block + or partition of the data. + + Prediction is done by the *ensemble* of learned models. + + .. warning:: + Ensure that your data are sufficiently shuffled prior to training! + If the values of the various blocks / partitions of your dataset are not + distributed similarly, the regressor will give poor results. + + Parameters + ---------- + estimator : Estimator + + Attributes + ---------- + estimators_ : list of regressors + The collection of fitted sub-estimators that are `estimator` fitted + on each partition / block of the inputs. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.ensemble import BlockwiseVotingRegressor + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100_000) + >>> X, y = mt.tensor(X, chunk_size=10_0000), mt.tensor(y, chunk_size=10_0000) + >>> subestimator = LinearRegression() + >>> clf = BlockwiseVotingRegressor(subestimator) + >>> clf.fit(X, y) + """ + + def fit(self, X, y, session: SessionType = None, run_kwargs: dict = None, **kwargs): + if not isinstance(y, ENTITY_TYPE): + y = mt.tensor(y) + super()._fit(X, y, **kwargs) + self.estimators_.execute(session=session, **(run_kwargs or dict())) + + def predict(self, X, session: SessionType = None, run_kwargs: dict = None): + check_is_fitted(self, attributes=["estimators_"]) + X = check_array(X) + op = BlockwiseEnsemblePredict( + x=X, estimators=[self.estimators_], is_classifier=False + ) + return op().execute(session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/ensemble/_iforest.py b/python/xorbits/_mars/learn/ensemble/_iforest.py new file mode 100644 index 000000000..46985e0f8 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/_iforest.py @@ -0,0 +1,472 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +import warnings +from typing import Union + +import numpy as np +from sklearn.base import OutlierMixin +from sklearn.tree import ExtraTreeRegressor +from sklearn.utils import check_array as sklearn_check_array +from sklearn.utils import gen_batches as sklearn_gen_batches + +from ... 
import tensor as mt +from ...deploy.oscar.session import execute +from ...lib.sparse import issparse +from ...tensor.utils import check_random_state +from ..utils import convert_to_tensor_or_dataframe, get_chunk_n_rows +from ..utils.validation import _num_samples, check_is_fitted +from ._bagging import BaggingPredictionOperand, BaseBagging, PredictionType + + +def _average_path_length(n_samples_leaf): + """ + The average path length in a n_samples iTree, which is equal to + the average path length of an unsuccessful BST search since the + latter has the same structure as an isolation tree. + Parameters + ---------- + n_samples_leaf : array-like of shape (n_samples,) + The number of training samples in each test sample leaf, for + each estimators. + + Returns + ------- + average_path_length : ndarray of shape (n_samples,) + """ + + n_samples_leaf = sklearn_check_array(n_samples_leaf, ensure_2d=False) + + n_samples_leaf_shape = n_samples_leaf.shape + n_samples_leaf = n_samples_leaf.reshape((1, -1)) + average_path_length = np.zeros(n_samples_leaf.shape) + + mask_1 = n_samples_leaf <= 1 + mask_2 = n_samples_leaf == 2 + not_mask = ~np.logical_or(mask_1, mask_2) + + average_path_length[mask_1] = 0.0 + average_path_length[mask_2] = 1.0 + average_path_length[not_mask] = ( + 2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma) + - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask] + ) + + return average_path_length.reshape(n_samples_leaf_shape) + + +def _tree_decision_function(instance, estimator): + n_samples = _num_samples(instance) + + # We get as many rows as possible within our working_memory budget + # (defined by sklearn.get_config()['working_memory']) to store + # self._max_features in each row during computation. + # + # Note: + # - this will get at least 1 row, even if 1 row of score will + # exceed working_memory. + # - this does only account for temporary memory usage while loading + # the data needed to compute the scores -- the returned scores + # themselves are 1D. + + chunk_n_rows = get_chunk_n_rows( + row_bytes=16 * instance.shape[1], max_n_rows=n_samples + ) + slices = sklearn_gen_batches(n_samples, chunk_n_rows) + + scores = np.zeros(n_samples, order="f") + + for sl in slices: + # compute score on the slices of test samples: + scores[sl] = _compute_score_samples(instance[sl], estimator) + + return scores + + +def _compute_score_samples(instance, estimator): + leaves_index = estimator.apply(instance) + node_indicator = estimator.decision_path(instance) + n_samples_leaf = estimator.tree_.n_node_samples[leaves_index] + + return ( + np.ravel(node_indicator.sum(axis=1)) + + _average_path_length(n_samples_leaf) + - 1.0 + ) + + +class IsolationForest(OutlierMixin, BaseBagging): + """ + Isolation Forest Algorithm. + + Return the anomaly score of each sample using the IsolationForest algorithm + + The IsolationForest 'isolates' observations by randomly selecting a feature + and then randomly selecting a split value between the maximum and minimum + values of the selected feature. + + Since recursive partitioning can be represented by a tree structure, the + number of splittings required to isolate a sample is equivalent to the path + length from the root node to the terminating node. + + This path length, averaged over a forest of such random trees, is a + measure of normality and our decision function. + + Random partitioning produces noticeably shorter paths for anomalies. 
+ Hence, when a forest of random trees collectively produce shorter path + lengths for particular samples, they are highly likely to be anomalies. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of base estimators in the ensemble. + + max_samples : "auto", int or float, default="auto" + The number of samples to draw from X to train each base estimator. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If "auto", then `max_samples=min(256, n_samples)`. + + If max_samples is larger than the number of samples provided, + all samples will be used for all trees (no sampling). + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. Used when fitting to define the threshold + on the scores of the samples. + + - If 'auto', the threshold is determined as in the + original paper. + - If float, the contamination should be in the range (0, 0.5]. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator. + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=False + If True, individual trees are fit on random subsets of the training + data sampled with replacement. If False, sampling without replacement + is performed. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo-randomness of the selection of the feature + and split values for each branching step and each tree in the forest. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary `. + + Attributes + ---------- + base_estimator_ : ExtraTreeRegressor instance + The child estimator template used to create the collection of + fitted sub-estimators. + + estimators_ : list of ExtraTreeRegressor instances + The collection of fitted sub-estimators. + + estimators_features_ : list of ndarray + The subset of drawn features for each base estimator. + + max_samples_ : int + The actual number of samples. + + offset_ : float + Offset used to define the decision function from the raw scores. We + have the relation: ``decision_function = score_samples - offset_``. + ``offset_`` is defined as follows. When the contamination parameter is + set to "auto", the offset is equal to -0.5 as the scores of inliers are + close to 0 and the scores of outliers are close to -1. When a + contamination parameter different than "auto" is provided, the offset + is defined in such a way we obtain the expected number of outliers + (samples with decision function < 0) in training. + + Notes + ----- + The implementation is based on an ensemble of ExtraTreeRegressor. The + maximum depth of each tree is set to ``ceil(log_2(n))`` where + :math:`n` is the number of samples used to build the tree + (see (Liu et al., 2008) for more details). + + References + ---------- + .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest." + Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on. + .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based + anomaly detection." 
ACM Transactions on Knowledge Discovery from + Data (TKDD) 6.1 (2012): 3. + + See Also + ---------- + sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a + Gaussian distributed dataset. + sklearn.svm.OneClassSVM : Unsupervised Outlier Detection. + Estimate the support of a high-dimensional distribution. + The implementation is based on libsvm. + sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection + using Local Outlier Factor (LOF). + + Examples + -------- + >>> from mars.learn.ensemble import IsolationForest + >>> X = [[-1.1], [0.3], [0.5], [100]] + >>> clf = IsolationForest(random_state=0).fit(X) + >>> clf.predict([[0.1], [0], [90]]) + array([ 1, 1, -1]) + """ + + contamination: Union[str, float] + + def __init__( + self, + *, + n_estimators=100, + max_samples="auto", + contamination="auto", + max_features=1.0, + bootstrap=False, + random_state=None, + warm_start=False, + ): + super().__init__( + base_estimator=ExtraTreeRegressor( + max_features=1, splitter="random", random_state=random_state + ), + # here above max_features has no links with self.max_features + bootstrap=bootstrap, + bootstrap_features=False, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + warm_start=warm_start, + random_state=random_state, + ) + self.contamination = contamination + + def fit( + self, X, y=None, sample_weight=None, session=None, run_kwargs=None + ) -> "IsolationForest": + """ + Fit estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csc_matrix`` for maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + self : IsolationForest + Fitted estimator. + """ + run_kwargs = run_kwargs or dict() + X = convert_to_tensor_or_dataframe(X) + if issparse(X): # pragma: no cover + raise NotImplementedError + + if np.isnan(X.shape[0]): + execute(X, session=session, **run_kwargs) + + rnd = check_random_state(self.random_state) + y = rnd.uniform(size=X.shape[0]) + + # ensure that max_sample is in [1, n_samples]: + n_samples = X.shape[0] + + if self.contamination != "auto": + if not (0.0 < self.contamination <= 0.5): + raise ValueError( + "contamination must be in (0, 0.5], got: %f" % self.contamination + ) + + if isinstance(self.max_samples, str): + if self.max_samples == "auto": + max_samples = min(256, n_samples) + else: + raise ValueError( + "max_samples (%s) is not supported." + 'Valid choices are: "auto", int or' + "float" % self.max_samples + ) + + elif isinstance(self.max_samples, numbers.Integral): + if self.max_samples > n_samples: + warnings.warn( + "max_samples (%s) is greater than the " + "total number of samples (%s). max_samples " + "will be set to n_samples for estimation." 
+ % (self.max_samples, n_samples) + ) + max_samples = n_samples + else: + max_samples = self.max_samples + else: # float + if not 0.0 < self.max_samples <= 1.0: + raise ValueError( + "max_samples must be in (0, 1], got %r" % self.max_samples + ) + max_samples = int(self.max_samples * X.shape[0]) + + self.max_samples_ = max_samples + max_depth = int(np.ceil(np.log2(max(max_samples, 2)))) + super()._fit( + X, + y, + sample_weight=sample_weight, + max_samples=max_samples, + estimator_params=dict(max_samples=max_samples, max_depth=max_depth), + ) + + if self.contamination == "auto": + # 0.5 plays a special role as described in the original paper. + # we take the opposite as we consider the opposite of their score. + self.offset_ = -0.5 + return self + + # else, define offset_ wrt contamination parameter + self.offset_ = execute( + mt.percentile(self._score_samples(X), 100.0 * self.contamination), + session=session, + **(run_kwargs or dict()), + ).fetch(session=session, **(run_kwargs or dict())) + + return self + + def predict(self, X, session=None, run_kwargs=None): + """ + Predict if a particular sample is an outlier or not. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + For each observation, tells whether or not (+1 or -1) it should + be considered as an inlier according to the fitted model. + """ + check_is_fitted(self) + is_inlier = mt.ones(X.shape[0], dtype=int) + is_inlier[self._decision_function(X) < 0] = -1 + return execute(is_inlier, session=session, **(run_kwargs or dict())) + + def _decision_function(self, X): + return self._score_samples(X) - self.offset_ + + def decision_function(self, X, session=None, run_kwargs=None): + """ + Average anomaly score of X of the base classifiers. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. + + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. Negative scores represent outliers, + positive scores represent inliers. 
+ """ + # We subtract self.offset_ to make 0 be the threshold value for being + # an outlier: + + decision_func = self._decision_function(X) + return execute(decision_func, session=session, **(run_kwargs or dict())) + + def _score_samples(self, X): + check_is_fitted(self) + X = convert_to_tensor_or_dataframe(X) + predict_op = BaggingPredictionOperand( + prediction_type=PredictionType.DECISION_FUNCTION, + decision_function=_tree_decision_function, + calc_means=False, + ) + depths = predict_op(X, self.estimators_, self.estimator_features_).sum(axis=1) + denominator = self.estimators_.shape[0] * _average_path_length( + [self.max_samples_] + ) + return -( + 2 + ** ( + # For a single training sample, denominator and depth are 0. + # Therefore, we set the score manually to 1. + -mt.divide( + depths, + denominator, + out=mt.ones_like(depths), + where=denominator != 0, + ) + ) + ) + + def score_samples(self, X, session=None, run_kwargs=None): + """ + Opposite of the anomaly score defined in the original paper. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. + + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. + """ + scores = self._score_samples(X) + return execute(scores, session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/ensemble/tests/__init__.py b/python/xorbits/_mars/learn/ensemble/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/ensemble/tests/test_bagging.py b/python/xorbits/_mars/learn/ensemble/tests/test_bagging.py new file mode 100644 index 000000000..cf24dcb07 --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/test_bagging.py @@ -0,0 +1,322 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import os + +import numpy as np +import pandas as pd +import pytest +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.svm import SVC + +from .... import dataframe as md +from .... import execute +from .... import tensor as mt +from ....conftest import MARS_CI_BACKEND +from ....core import enter_mode +from ....services.task.execution.api import Fetcher +from .._bagging import ( + BaggingClassifier, + BaggingRegressor, + BaggingSample, + BaggingSampleReindex, + _extract_bagging_io, +) + + +def _get_tileable_chunk_data(sync_session, tileable): + @enter_mode(build=True) + async def _async_fetch(): + tuples = [] + async_session = sync_session._session + meta_api = async_session._meta_api + + t, indexes = async_session._get_to_fetch_tileable(tileable) + fetcher = Fetcher.create( + MARS_CI_BACKEND, get_storage_api=async_session._get_storage_api + ) + + get_metas = [] + for chunk in t.chunks: + get_metas.append( + meta_api.get_chunk_meta.delay( + chunk.key, fields=fetcher.required_meta_keys + ) + ) + metas = await meta_api.get_chunk_meta.batch(*get_metas) + + for chunk, meta in zip(t.chunks, metas): + await fetcher.append(chunk.key, meta) + all_data = await fetcher.get() + + for chunk, data in zip(t.chunks, all_data): + tuples.append((t, chunk, data)) + return tuples + + future = asyncio.run_coroutine_threadsafe( + _async_fetch(), sync_session._isolation.loop + ) + return future.result(120 if "CI" in os.environ else None) + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, with_labels, with_weights", + [ + (False, 10, 1.0, False, False), + (False, 10, 0.5, True, True), + (True, 10, 1.0, False, False), + (True, 10, 0.5, True, True), + ], +) +def test_bagging_sample_execution( + setup, use_dataframe, max_samples, max_features, with_labels, with_weights +): + rs = np.random.RandomState(0) + + raw_data = rs.randint(100, size=(100, 50)) + if not use_dataframe: + t = mt.tensor(raw_data, chunk_size=20) + else: + raw_data = pd.DataFrame(raw_data) + t = md.DataFrame(raw_data, chunk_size=20) + + raw_labels = rs.choice([0, 1, 2], size=100) + raw_weights = rs.random(100) + labels = mt.tensor(raw_labels, chunk_size=20) if with_labels else None + weights = mt.tensor(raw_weights, chunk_size=20) if with_weights else None + + sample_op = BaggingSample( + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + ) + result_tuple = execute(*sample_op(t, labels, weights)) + t_sampled, t_labels, t_weights, t_feature_indices = _extract_bagging_io( + result_tuple, sample_op, output=True + ) + + label_chunks, weights_chunks, feature_idx_chunks = dict(), dict(), dict() + + for t, chunks_dict in zip((t_labels, t_weights), (label_chunks, weights_chunks)): + if t is None: + continue + for tiled, chunk, chunk_data in _get_tileable_chunk_data(setup, t): + assert len(tiled.chunks) == 5 + chunks_dict[chunk.index] = chunk_data + for d in chunk_data: + assert d.shape == (10,) + + if t_feature_indices is not None: + for tiled, chunk, chunk_data in _get_tileable_chunk_data( + setup, t_feature_indices + ): + assert len(tiled.chunks) == 5 + feature_idx_chunks[chunk.index] = chunk_data + assert chunk_data.shape == (2, int(max_features * raw_data.shape[1])) + + for tiled, chunk, chunk_data in _get_tileable_chunk_data(setup, t_sampled): + assert len(tiled.chunks) == 5 + assert len(chunk_data) == 2 + for est_id, d in enumerate(chunk_data): + assert d.shape == 
(10, int(max_features * raw_data.shape[1])) + + if use_dataframe: + raw_sliced = raw_data.loc[d.index] + if label_chunks: + label_chunk = label_chunks[(chunk.index[0],)][est_id] + np.testing.assert_array_equal(raw_labels[d.index], label_chunk) + if weights_chunks: + weights_chunk = weights_chunks[(chunk.index[0],)][est_id] + np.testing.assert_array_equal(raw_weights[d.index], weights_chunk) + + if feature_idx_chunks: + feature_indices_chunk = feature_idx_chunks[chunk.index][est_id] + raw_sliced = raw_sliced.iloc[:, feature_indices_chunk] + pd.testing.assert_frame_equal(raw_sliced, d) + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, column_split", + [ + (False, 10, 1.0, 50), + (False, 10, 0.5, 50), + (True, 10, 1.0, 20), + (True, 10, 0.5, 20), + ], +) +def test_bagging_sample_reindex( + setup, use_dataframe, max_samples, max_features, column_split +): + rs = np.random.RandomState(0) + + raw_insts = rs.randint(100, size=(100, 50)) + raw_data = rs.randint(100, size=(200, 50)) + if not use_dataframe: + t_insts = mt.tensor(raw_insts, chunk_size=column_split) + t_data = mt.tensor(raw_data, chunk_size=column_split) + else: + raw_insts = pd.DataFrame(raw_insts) + raw_data = pd.DataFrame(raw_data) + t_insts = md.DataFrame(raw_insts, chunk_size=column_split) + t_data = md.DataFrame(raw_data, chunk_size=column_split) + + sample_op = BaggingSample( + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + ) + result_tuple = execute(*sample_op(t_insts)) + _t_sampled, _label, _weights, t_feature_indices = _extract_bagging_io( + result_tuple, sample_op, output=True + ) + + reindex_op = BaggingSampleReindex(n_estimators=10) + reindexed = execute( + reindex_op(t_data, t_feature_indices), extra_config={"check_dtypes": False} + ) + + for tiled, _chunk, chunk_data in _get_tileable_chunk_data(setup, reindexed): + if t_feature_indices is None: + assert len(tiled.chunks) == np.ceil(raw_data.shape[0] / column_split) + assert chunk_data.shape[1] == 50 + else: + row_chunks = np.ceil(raw_insts.shape[0] / column_split) + assert len(tiled.chunks) == row_chunks * np.ceil( + raw_data.shape[0] / column_split + ) + assert isinstance(chunk_data, tuple) + for chunk_data_piece in chunk_data: + assert chunk_data_piece.shape[1] == 25 + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, with_weights, base_estimator_cls", + [ + (False, 10, 0.5, False, LogisticRegression), + (True, 10, 1.0, True, SVC), + ], +) +def test_bagging_classifier( + setup, use_dataframe, max_samples, max_features, with_weights, base_estimator_cls +): + rs = np.random.RandomState(0) + + raw_x, raw_y = make_classification( + n_samples=100, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=rs, + shuffle=False, + ) + + if not use_dataframe: + t_x = mt.tensor(raw_x, chunk_size=20) + else: + raw_x = pd.DataFrame(raw_x) + t_x = md.DataFrame(raw_x, chunk_size=20) + + raw_weights = rs.random(100) + t_y = mt.tensor(raw_y, chunk_size=20) + t_weights = mt.tensor(raw_weights, chunk_size=20) if with_weights else None + + clf = BaggingClassifier( + base_estimator=base_estimator_cls(), + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + warm_start=True, + ) + clf.fit(t_x, t_y, sample_weight=t_weights) + + for _tiled, _chunk, chunk_data in _get_tileable_chunk_data(setup, clf.estimators_): + assert len(chunk_data) == 2 + assert all(isinstance(c, base_estimator_cls) for c in chunk_data) + + if max_features < 1.0: + assert 
clf.estimator_features_ is not None + + with pytest.warns(Warning): + clf.fit(t_x, t_y, sample_weight=t_weights) + with pytest.raises(ValueError): + clf.n_estimators = 5 + clf.fit(t_x, t_y, sample_weight=t_weights) + + clf.n_estimators = 20 + clf.fit(t_x, t_y, sample_weight=t_weights) + assert clf.estimators_.shape[0] == 20 + + proba = clf.predict_proba(t_x) + proba_array = proba.fetch() + assert np.all((proba_array >= 0) & (proba_array <= 1)) + assert np.allclose(np.sum(proba_array, axis=1), 1.0) + + log_proba = clf.predict_log_proba(t_x) + exp_log_proba_array = np.exp(log_proba.fetch()) + assert np.all((exp_log_proba_array >= 0) & (exp_log_proba_array <= 1)) + assert np.allclose(np.sum(exp_log_proba_array, axis=1), 1.0) + + y = clf.predict(t_x) + y_array = y.fetch() + assert np.all((y_array == 0) | (y_array == 1)) + + decision_fun = clf.decision_function(t_x) + decision_fun_array = decision_fun.fetch() + assert decision_fun_array.shape == (y_array.shape[0],) + + +@pytest.mark.parametrize( + "use_dataframe, max_samples, max_features, with_weights", + [ + (False, 10, 0.5, False), + (True, 10, 1.0, True), + ], +) +def test_bagging_regressor( + setup, use_dataframe, max_samples, max_features, with_weights +): + rs = np.random.RandomState(0) + + raw_x, raw_y = make_regression( + n_samples=100, n_features=4, n_informative=2, random_state=rs, shuffle=False + ) + + if not use_dataframe: + t_x = mt.tensor(raw_x, chunk_size=20) + else: + raw_x = pd.DataFrame(raw_x) + t_x = md.DataFrame(raw_x, chunk_size=20) + + raw_weights = rs.random(100) + t_y = mt.tensor(raw_y, chunk_size=20) + t_weights = mt.tensor(raw_weights, chunk_size=20) if with_weights else None + + clf = BaggingRegressor( + base_estimator=LinearRegression(), + n_estimators=10, + max_samples=max_samples, + max_features=max_features, + random_state=rs, + warm_start=True, + ) + clf.fit(t_x, t_y, sample_weight=t_weights) + + predict_y = clf.predict(t_x) + predict_y_array = predict_y.fetch() + assert predict_y_array.shape == raw_y.shape diff --git a/python/xorbits/_mars/learn/ensemble/tests/test_blockwise.py b/python/xorbits/_mars/learn/ensemble/tests/test_blockwise.py new file mode 100644 index 000000000..b8599b19d --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/test_blockwise.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression + +from .... import dataframe as md +from .... import tensor as mt +from .. 
import BlockwiseVotingClassifier, BlockwiseVotingRegressor + +fit_raw_X, fit_raw_y = make_classification() +fit_X, fit_y = mt.tensor(fit_raw_X, chunk_size=25), mt.tensor(fit_raw_y, chunk_size=25) +fit_df_X = md.DataFrame(fit_X) +predict_raw_X, predict_raw_y = make_classification() +predict_X, predict_y = ( + mt.tensor(predict_raw_X, chunk_size=20), + mt.tensor(predict_raw_y, chunk_size=20), +) +predict_df_X = md.DataFrame(predict_X) + + +@pytest.mark.parametrize( + "fit_X, fit_y, predict_X, predict_y", + [ + (fit_X, fit_y, predict_X, predict_y), + (fit_raw_X, fit_raw_y, predict_raw_X, predict_raw_y), + (fit_df_X, fit_raw_y, predict_df_X, predict_raw_y), + ], +) +def test_blockwise_voting_classifier_hard(setup, fit_X, fit_y, predict_X, predict_y): + clf = BlockwiseVotingClassifier(LogisticRegression(solver="lbfgs")) + clf.fit(fit_X, fit_y) + estimators = clf.estimators_.fetch() + if not isinstance(fit_X, np.ndarray): + assert len(estimators) == 4 + + clf.predict(predict_X) + score = clf.score(predict_X, predict_y) + assert isinstance(score.fetch(), float) + + with pytest.raises(AttributeError, match="hard"): + clf.predict_proba(predict_X) + + +@pytest.mark.parametrize( + "fit_X, fit_y, predict_X, predict_y", + [ + (fit_X, fit_y, predict_X, predict_y), + (fit_raw_X, fit_raw_y, predict_raw_X, predict_raw_y), + (fit_df_X, fit_raw_y, predict_df_X, predict_raw_y), + ], +) +def test_blockwise_voting_classifier_soft(setup, fit_X, fit_y, predict_X, predict_y): + clf = BlockwiseVotingClassifier( + LogisticRegression(solver="lbfgs"), + voting="soft", + classes=[0, 1], + ) + clf.fit(fit_X, fit_y) + estimators = clf.estimators_.fetch() + if not isinstance(fit_X, np.ndarray): + assert len(estimators) == 4 + + result = clf.predict(predict_X) + assert result.dtype == np.dtype("int64") + assert result.shape == (predict_X.shape[0],) + + result = clf.predict_proba(predict_X) + assert result.dtype == np.dtype("float64") + assert result.shape == (predict_X.shape[0], 2) + + score = clf.score(predict_X, predict_y) + assert isinstance(score.fetch(), float) + + +@pytest.mark.parametrize( + "fit_X, fit_y, predict_X, predict_y", + [ + (fit_X, fit_y, predict_X, predict_y), + (fit_raw_X, fit_raw_y, predict_raw_X, predict_raw_y), + (fit_df_X, fit_raw_y, predict_df_X, predict_raw_y), + ], +) +def test_blockwise_voting_regressor(setup, fit_X, fit_y, predict_X, predict_y): + est = BlockwiseVotingRegressor(LogisticRegression()) + est.fit(fit_X, fit_y) + estimators = est.estimators_.fetch() + if not isinstance(fit_X, np.ndarray): + assert len(estimators) == 4 + + result = est.predict(predict_X) + assert result.dtype == np.dtype("float64") + assert result.shape == (predict_X.shape[0],) + + score = est.score(predict_X, predict_y) + assert isinstance(score.fetch(), float) diff --git a/python/xorbits/_mars/learn/ensemble/tests/test_iforest.py b/python/xorbits/_mars/learn/ensemble/tests/test_iforest.py new file mode 100644 index 000000000..088a8707e --- /dev/null +++ b/python/xorbits/_mars/learn/ensemble/tests/test_iforest.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
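As a usage note for the bagging estimators exercised in the tests above, the sketch below shows the intended call pattern outside the test harness. It is illustrative only: the absolute import paths (xorbits._mars.tensor, xorbits._mars.learn.ensemble) and the presence of a default local session behind fetch() are assumptions, since the tests rely on the setup fixture for session handling.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from xorbits._mars import tensor as mt                      # assumed import path
from xorbits._mars.learn.ensemble import BaggingClassifier  # assumed import path

raw_X, raw_y = make_classification(n_samples=100, n_features=4, random_state=0)
X = mt.tensor(raw_X, chunk_size=20)   # five row chunks, as in the tests above
y = mt.tensor(raw_y, chunk_size=20)

clf = BaggingClassifier(
    base_estimator=LogisticRegression(),
    n_estimators=10,
    max_samples=10,       # rows drawn for each estimator
    max_features=0.5,     # fraction of columns drawn for each estimator
    random_state=np.random.RandomState(0),
)
clf.fit(X, y)
proba = clf.predict_proba(X).fetch()
print(proba.shape, np.allclose(proba.sum(axis=1), 1.0))  # rows sum to 1, as the tests assert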
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import load_iris + +from .... import tensor as mt +from .._iforest import IsolationForest + + +@pytest.mark.parametrize("max_samples", [0.5, 1.0, 10]) +def test_iforest(setup, max_samples): + rs = np.random.RandomState(0) + raw_train = rs.poisson(size=(100, 10)) + t_train = mt.tensor(raw_train, chunk_size=20) + raw_test = rs.poisson(size=(200, 10)) + t_test = mt.tensor(raw_test, chunk_size=20) + + clf = IsolationForest(random_state=rs, n_estimators=10, max_samples=max_samples) + clf.fit(t_train).predict(t_test) + clf.score_samples(t_test) + + +@pytest.mark.parametrize("contamination", [0.25, "auto"]) +def test_iforest_works(setup, contamination): + rs = np.random.RandomState(0) + # toy sample (the last two samples are outliers) + raw = np.array( + [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]] + ) + t = mt.tensor(raw, chunk_size=4) + + # Test IsolationForest + clf = IsolationForest(random_state=rs, contamination=contamination) + clf.fit(t) + decision_func = -clf.decision_function(t).execute().fetch() + pred = clf.predict(t).execute().fetch() + # assert detect outliers: + assert np.min(decision_func[-2:]) > np.max(decision_func[:-2]) + np.testing.assert_array_equal(pred, 6 * [1] + 2 * [-1]) + + +def test_iforest_error(): + """Test that it gives proper exception on deficient input.""" + iris = load_iris() + X = iris.data + + # Test max_samples + with pytest.raises(ValueError): + IsolationForest(max_samples=-1).fit(X) + with pytest.raises(ValueError): + IsolationForest(max_samples=0.0).fit(X) + with pytest.raises(ValueError): + IsolationForest(max_samples=2.0).fit(X) + + with pytest.raises(ValueError): + IsolationForest(max_samples="foobar").fit(X) + with pytest.raises(ValueError): + IsolationForest(max_samples=1.5).fit(X) + + # test X_test n_features match X_train one: + with pytest.raises(ValueError): + IsolationForest().fit(X).predict(X[:, 1:]) diff --git a/python/xorbits/_mars/learn/glm/__init__.py b/python/xorbits/_mars/learn/glm/__init__.py new file mode 100644 index 000000000..14a15022b --- /dev/null +++ b/python/xorbits/_mars/learn/glm/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._logistic import LogisticRegression + +__all__ = [ + "LogisticRegression", +] diff --git a/python/xorbits/_mars/learn/glm/_logistic.py b/python/xorbits/_mars/learn/glm/_logistic.py new file mode 100644 index 000000000..fdfb19f53 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/_logistic.py @@ -0,0 +1,307 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
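Similarly, a minimal sketch of the IsolationForest workflow covered by the tests above, reusing the toy sample whose last two rows are outliers; the same assumptions about import paths and an available local session apply.

import numpy as np

from xorbits._mars import tensor as mt                     # assumed import path
from xorbits._mars.learn.ensemble import IsolationForest   # assumed import path

raw = np.array(
    [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
)
t = mt.tensor(raw, chunk_size=4)

clf = IsolationForest(contamination=0.25, random_state=np.random.RandomState(0)).fit(t)
pred = clf.predict(t).execute().fetch()                 # +1 for inliers, -1 for outliers
scores = clf.decision_function(t).execute().fetch()     # higher means more normal
print(pred)                                             # expected: six 1s, then two -1s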
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers + +from sklearn.utils.validation import _deprecate_positional_args + +from ... import tensor as mt +from ..base import BaseEstimator +from ..linear_model._base import LinearClassifierMixin +from ..utils.extmath import softmax +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted +from ._optimizers import gradient_descent + + +def _check_solver(solver): + all_solvers = ["SGD"] + if solver not in all_solvers: + raise ValueError( + "Logistic Regression supports only solvers in %s, got" + " %s." % (all_solvers, solver) + ) + return solver + + +def _check_multi_class(multi_class, solver, n_classes): + if multi_class == "auto": + return "multinomial" + if multi_class == "ovr": + if n_classes == 2: + return "multinomial" + else: + raise ValueError( + "Solver %s does not support " + "an ovr backend with number of classes " + "larger than 2." % solver + ) + if multi_class == "multinomial": + return "multinomial" + + raise ValueError( + "multi_class should be 'multinomial', 'ovr' or 'auto'. Got %s." % multi_class + ) + + +class LogisticRegression(LinearClassifierMixin, BaseEstimator): + """ + Logistic Regression (aka logit, MaxEnt) classifier. + + In the multiclass case, the training algorithm uses the one-vs-rest (OvR) + scheme if the 'multi_class' option is set to 'ovr', and uses the + cross-entropy loss if the 'multi_class' option is set to 'multinomial'. + (Currently the 'multinomial' option is supported only by the 'lbfgs', + 'sag', 'saga' and 'newton-cg' solvers.) + + This class implements regularized logistic regression using the + 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note + that regularization is applied by default**. It can handle both dense + and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit + floats for optimal performance; any other input format will be converted + (and copied). + + The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization + with primal formulation, or no regularization. The 'liblinear' solver + supports both L1 and L2 regularization, with a dual formulation only for + the L2 penalty. The Elastic-Net regularization is only supported by the + 'saga' solver. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' + Used to specify the norm used in the penalization. The 'newton-cg', + 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is + only supported by the 'saga' solver. If 'none' (not supported by the + liblinear solver), no regularization is applied. + + .. versionadded:: 0.19 + l1 penalty with SAGA solver (allowing 'multinomial' + L1) + + C : float, default=1.0 + Inverse of regularization strength; must be a positive float. + Like in support vector machines, smaller values specify stronger + regularization. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the decision function. 
+ + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + solver : SGD (stochastic gradient descent) + + max_iter : int, default=100 + Maximum number of iterations taken for the solvers to converge. + + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + + verbose : int, default=0 + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. + + Attributes + ---------- + + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. + + `coef_` is of shape (1, n_features) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `coef_` corresponds + to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). + + intercept_ : ndarray of shape (1,) or (n_classes,) + Intercept (a.k.a. bias) added to the decision function. + + If `fit_intercept` is set to False, the intercept is set to zero. + `intercept_` is of shape (1,) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `intercept_` + corresponds to outcome 1 (True) and `-intercept_` corresponds to + outcome 0 (False). + + See Also + -------- + SGDClassifier : Incrementally trained logistic regression (when given + the parameter ``loss="log"``). + LogisticRegressionCV : Logistic regression with built-in cross validation. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(random_state=0).fit(X, y) + >>> clf.predict(X[:2, :]) + array([0, 0]) + """ + + @_deprecate_positional_args + def __init__( + self, + penalty="l2", + fit_intercept=False, + C=100, + batch_size=20, + learning_rate=0.1, + random_state=None, + solver="SGD", + max_iter=300, + multi_class="auto", + verbose=0, + ): + self.penalty = penalty + self.fit_intercept = fit_intercept + self.C = C + self.batch_size = batch_size + self.learning_rate = learning_rate + self.random_state = random_state + self.solver = solver + self.max_iter = max_iter + self.multi_class = multi_class + self.verbose = verbose + + def fit(self, X, y): + """ + Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + Returns + ------- + self + Fitted estimator. 
+ """ + # ========== Pre-check ============= + if self.penalty not in ["l2"]: + raise NotImplementedError("Only support L2 penalty.") + + solver = _check_solver(self.solver) + + if not isinstance(self.C, numbers.Number) or self.C < 0: + raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) + + if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: + raise ValueError( + "Maximum number of iteration must be positive;" + " got (max_iter=%r)" % self.max_iter + ) + + _dtype = [mt.float64, mt.float32] + + X, y = self._validate_data(X, y, accept_sparse="csr", dtype=_dtype, order="C") + + check_classification_targets(y) + + self.classes_ = mt.unique(y) + n_classes = self.classes_.execute().shape[0] + multi_class = _check_multi_class(self.multi_class, solver, n_classes) + + # ========== Fit solver ============ + # Only support stochastic gradient descent for now + if multi_class == "multinomial": + if solver == "SGD": + self.coef_ = gradient_descent( + X, + y, + learning_rate=self.learning_rate, + reg=(1 / self.C), + max_epochs=self.max_iter, + batch_size=self.batch_size, + fit_intercept=self.fit_intercept, + verbose=self.verbose, + ) + self.coef_ = self.coef_.T + + # ========== Post processing ======= + if self.fit_intercept: + self.intercept_ = self.coef_[:, -1] + self.coef_ = self.coef_[:, :-1] + else: + self.intercept_ = mt.zeros(n_classes) + + return self + + def predict_proba(self, X): + """ + Probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + For a multi_class problem, if multi_class is set to be "multinomial" + the softmax function is used to find the predicted probability of + each class. + Else use a one-vs-rest approach, i.e calculate the probability + of each class assuming it to be positive using the logistic function. + and normalize these values across all the classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in ``self.classes_``. + """ + check_is_fitted(self) + decision = self.decision_function(X) + + return softmax(decision, copy=False).execute() + + def predict_log_proba(self, X): + """ + Predict logarithm of probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in ``self.classes_``. + """ + return mt.log(self.predict_proba(X)).execute() diff --git a/python/xorbits/_mars/learn/glm/_optimizers.py b/python/xorbits/_mars/learn/glm/_optimizers.py new file mode 100644 index 000000000..929691c95 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/_optimizers.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np + +from ... import execute +from ... import tensor as mt +from ...tensor.datasource import tensor as astensor + + +def softmax_loss_and_grad(W, X, y, reg): + N, D = X.shape + K = W.shape[1] + + y_obs = mt.zeros(shape=(N, K)) + for i in range(N): + y_obs[i] = mt.eye(K)[y[i]] + + loss = -1 / N * mt.sum( + y_obs * mt.log(mt.exp(X @ W) / mt.sum(mt.exp(X @ W), axis=1).reshape(-1, 1)) + ) + 0.5 * reg * mt.sum(mt.square(W)) + + dW = mt.zeros(shape=(D, K)) + + # Matrix approach + dW = ( + -1 + / N + * X.T + @ (y_obs - (mt.exp(X @ W) / mt.sum(mt.exp(X @ W), axis=1).reshape(-1, 1))) + + reg * W + ) + + execute(loss, dW) + + return loss, dW + + +def gradient_descent( + X, + y, + learning_rate=1e-3, + reg=1e-5, + max_epochs=100, + batch_size=20, + fit_intercept=True, + verbose=0, +): + # assume y takes values 0...K-1 where K is number of classes + num_classes = (mt.max(y) + 1).to_numpy() + + num_train, dim = X.shape + num_iters_per_epoch = int(math.floor(1.0 * num_train / batch_size)) + + # need extra entries if fit_intercept + if fit_intercept: + X = mt.hstack((X, mt.ones((num_train, 1)))) + W = 0.001 * mt.random.randn(dim + 1, num_classes).execute() + else: + X = astensor(X) + W = 0.001 * mt.random.randn(dim, num_classes).execute() + + for _ in range(max_epochs): + # perform mini-batch SGD update + perm_idx = np.random.permutation(num_train) + for it in range(num_iters_per_epoch): + # print(it, num_iters_per_epoch) + idx = perm_idx[it * batch_size : (it + 1) * batch_size] + batch_x = X[idx] + batch_y = y[idx] + + # evaluate loss and gradient + _, grad = softmax_loss_and_grad(W, batch_x, batch_y, reg) + + # update parameters + W = W - learning_rate * grad + + return W diff --git a/python/xorbits/_mars/learn/glm/tests/__init__.py b/python/xorbits/_mars/learn/glm/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/glm/tests/test_logistic.py b/python/xorbits/_mars/learn/glm/tests/test_logistic.py new file mode 100644 index 000000000..0c6b340c8 --- /dev/null +++ b/python/xorbits/_mars/learn/glm/tests/test_logistic.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
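For reference, the quantities computed by softmax_loss_and_grad above are the standard L2-regularized multinomial cross-entropy and its gradient. With Y the one-hot encoding of y, P = softmax(X W) applied row-wise, N the mini-batch size, and reg the regularization strength (fit() passes reg = 1 / C):

    loss = -(1 / N) * sum(Y * log P) + 0.5 * reg * ||W||^2
    dW   = -(1 / N) * X^T (Y - P) + reg * W

gradient_descent then performs plain mini-batch SGD, W <- W - learning_rate * dW, over max_epochs shuffled passes through the data, appending a column of ones to X when fit_intercept is True so that the last row of W acts as the intercept.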
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import pytest +from sklearn.datasets import load_iris + +from .._logistic import LogisticRegression, _check_multi_class, _check_solver + +# general data load +X, y = load_iris(return_X_y=True) + + +def test_check_solver(setup): + all_solvers = ["SGD"] + for solver in all_solvers: + checked_solver = _check_solver(solver) + assert checked_solver == solver + + invalid_solver = "Newton" + error_msg = re.escape( + "Logistic Regression supports only solvers in %s, " + "got %s." % (all_solvers, invalid_solver) + ) + + with pytest.raises(ValueError, match=error_msg): + _check_solver(invalid_solver) + + +def test_check_multi_class(setup): + all_multi_class = ["auto", "multinomial", "ovr"] + solver = "SGD" + + for multi_class in all_multi_class: + checked_multi_class = _check_multi_class(multi_class, solver, 2) + assert checked_multi_class == "multinomial" + + error_msg = re.escape( + "Solver %s does not support " + "an ovr backend with number of classes " + "larger than 2." % solver + ) + with pytest.raises(ValueError, match=error_msg): + _check_multi_class("ovr", solver, 3) + + invalid_multi_class = "multiovr" + error_msg = re.escape( + "multi_class should be 'multinomial', " + "'ovr' or 'auto'. Got %s." % invalid_multi_class + ) + with pytest.raises(ValueError, match=error_msg): + _check_multi_class(invalid_multi_class, solver, 3) + + +def test_invalid_penalty(setup): + error_msg = re.escape("Only support L2 penalty.") + + with pytest.raises(NotImplementedError, match=error_msg): + model = LogisticRegression(penalty="l1") + model.fit(X, y) + + +def test_invalid_C(setup): + invalid_C = -1 + error_msg = re.escape("Penalty term must be positive; got (C=%r)" % invalid_C) + + with pytest.raises(ValueError, match=error_msg): + model = LogisticRegression(C=invalid_C) + model.fit(X, y) + + +def test_invalid_max_iter(setup): + invalid_max_iter = -1 + error_msg = re.escape( + "Maximum number of iteration must be positive;" + " got (max_iter=%r)" % invalid_max_iter + ) + + with pytest.raises(ValueError, match=error_msg): + model = LogisticRegression(max_iter=invalid_max_iter) + model.fit(X, y) + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_logistic_regression_no_converge(setup, fit_intercept): + # quite slow in local tests, so set max_iter=1 + # suggested max_iter >= 10 + model = LogisticRegression(fit_intercept=fit_intercept, max_iter=1) + model.fit(X, y) + model.predict(X) + model.score(X, y) + model.predict_proba(X) + model.predict_log_proba(X) + + error_msg = re.escape( + "X has %d features per sample; expecting %d" + % (X.shape[1], model.coef_.shape[1] - 1) + ) + model.coef_ = model.coef_[:, :-1] + with pytest.raises(ValueError, match=error_msg): + model.predict(X) diff --git a/python/xorbits/_mars/learn/linear_model/__init__.py b/python/xorbits/_mars/learn/linear_model/__init__.py new file mode 100644 index 000000000..c5c19b92e --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._base import LinearRegression + +__all__ = [ + "LinearRegression", +] diff --git a/python/xorbits/_mars/learn/linear_model/_base.py b/python/xorbits/_mars/learn/linear_model/_base.py new file mode 100644 index 000000000..6485732dd --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/_base.py @@ -0,0 +1,367 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +from abc import ABCMeta, abstractmethod + +from numpy.linalg import LinAlgError +from sklearn.base import MultiOutputMixin +from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted + +from ... import execute +from ... import tensor as mt +from ...tensor.datasource import tensor as astensor +from ..base import BaseEstimator, ClassifierMixin, RegressorMixin +from ..preprocessing import normalize as f_normalize +from ..utils.validation import FLOAT_DTYPES, _check_sample_weight, check_array + + +def _preprocess_data( + X, + y, + fit_intercept, + normalize=False, + copy=True, + sample_weight=None, + return_mean=False, + check_input=True, +): + """Center and scale data. + + Centers data to have mean zero along axis 0. If fit_intercept=False or if + the X is a sparse matrix, no centering is done, but normalization can still + be applied. The function returns the statistics necessary to reconstruct + the input data, which are X_offset, y_offset, X_scale, such that the output + + X = (X - X_offset) / X_scale + + X_scale is the L2 norm of X - X_offset. If sample_weight is not None, + then the weighted mean of X and y is zero, and not the mean itself. If + return_mean=True, the mean, eventually weighted, is returned, independently + of whether X was centered (option used for optimization with sparse data in + coordinate_descend). + + This is here because nearly all linear models will want their data to be + centered. 
This function also systematically makes y consistent with X.dtype + """ + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + sample_weight = astensor(sample_weight) + + X = astensor(X) + y = astensor(y, dtype=X.dtype) + + if check_input: + X = check_array(X, copy=copy, accept_sparse=["csr", "csc"], dtype=FLOAT_DTYPES) + elif copy: + if X.issparse(): + X = X.copy() + else: + X = X.copy(order="K") + + if fit_intercept: + if X.issparse(): + raise NotImplementedError("Does not support sparse input!") + else: + X_offset = mt.average(X, axis=0, weights=sample_weight) + X = X - X_offset + if normalize: + X, X_scale = f_normalize(X, axis=0, copy=False, return_norm=True) + else: + X_scale = mt.ones(X.shape[1], dtype=X.dtype) + y_offset = mt.average(y, axis=0, weights=sample_weight) + y = y - y_offset + else: + if X.issparse(): + raise NotImplementedError("Does not support sparse input!") + X_offset = mt.zeros(X.shape[1], dtype=X.dtype) + X_scale = mt.ones(X.shape[1], dtype=X.dtype) + if y.ndim == 1: + y_offset = X.dtype.type(0) + else: + y_offset = mt.zeros(y.shape[1], dtype=X.dtype) + + return X, y, X_offset, y_offset, X_scale + + +def _rescale_data(X, y, sample_weight): + """Rescale data sample-wise by square root of sample_weight. + + For many linear models, this enables easy support for sample_weight. + + Returns + ------- + X_rescaled : {array-like, sparse matrix} + + y_rescaled : {array-like, sparse matrix} + """ + n_samples = X.shape[0] + sample_weight = mt.asarray(sample_weight) + if sample_weight.ndim == 0: + sample_weight = mt.full(n_samples, sample_weight, dtype=sample_weight.dtype) + sample_weight = mt.sqrt(sample_weight) + sw_matrix = mt.diag(sample_weight, sparse=True) + X = mt.dot(sw_matrix, X) + y = mt.dot(sw_matrix, y) + return X, y + + +class LinearModel(BaseEstimator, metaclass=ABCMeta): + """Base class for Linear Models""" + + @abstractmethod + def fit(self, X, y, sample_weight=None): + """Fit model.""" + + def _decision_function(self, X): + check_is_fitted(self) + + X = self._validate_data( + X, y="no_validation", accept_sparse=["csr", "csc", "coo"], reset=False + ) + return mt.dot(X, self.coef_.T) + self.intercept_ + + def predict(self, X): + """ + Predict using the linear model. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values. + """ + return self._decision_function(X) + + _preprocess_data = staticmethod(_preprocess_data) + + def _set_intercept(self, X_offset, y_offset, X_scale): + """Set the intercept_""" + if self.fit_intercept: + self.coef_ = self.coef_ / X_scale + self.intercept_ = y_offset - mt.dot(X_offset, self.coef_.T) + execute(self.coef_, self.intercept_) + else: + self.intercept_ = mt.tensor(0.0) + self.intercept_.execute() + + def _more_tags(self): # noqa: R0201 # pylint: disable=no-self-use + return {"requires_y": True} + + +class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): + """ + Ordinary least squares Linear Regression. + + LinearRegression fits a linear model with coefficients w = (w1, ..., wp) + to minimize the residual sum of squares between the observed targets in + the dataset, and the targets predicted by the linear approximation. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). 
+ + normalize : bool, default=False + This parameter is ignored when ``fit_intercept`` is set to False. + If True, the regressors X will be normalized before regression by + subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use + :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` + on an estimator with ``normalize=False``. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. This + option is only supported for dense arrays. + + Attributes + ---------- + coef_ : array of shape (n_features, ) or (n_targets, n_features) + Estimated coefficients for the linear regression problem. + If multiple targets are passed during the fit (y 2D), this + is a 2D array of shape (n_targets, n_features), while if only + one target is passed, this is a 1D array of length n_features. + + rank_ : int + Rank of matrix `X`. Only available when `X` is dense. + + singular_ : array of shape (min(X, y),) + Singular values of `X`. Only available when `X` is dense. + + intercept_ : float or array of shape (n_targets,) + Independent term in the linear model. Set to 0.0 if + `fit_intercept = False`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + See Also + -------- + Ridge : Ridge regression addresses some of the + problems of Ordinary Least Squares by imposing a penalty on the + size of the coefficients with l2 regularization. + Lasso : The Lasso is a linear model that estimates + sparse coefficients with l1 regularization. + ElasticNet : Elastic-Net is a linear regression + model trained with both l1 and l2 -norm regularization of the + coefficients. + """ + + @_deprecate_positional_args + def __init__( + self, + *, + fit_intercept=True, + normalize=False, + copy_X=True, + positive=False, + ): + self.fit_intercept = fit_intercept + self.normalize = normalize + self.copy_X = copy_X + self.positive = positive + + def fit(self, X, y, sample_weight=None): + """ + Fit linear model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + Returns + ------- + self : object + Fitted Estimator. + """ + accept_sparse = False if self.positive else ["csr", "csc", "coo"] + + X, y = self._validate_data( + X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + X, y, X_offset, y_offset, X_scale = self._preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + normalize=self.normalize, + copy=self.copy_X, + sample_weight=sample_weight, + return_mean=True, + ) + + if sample_weight is not None: + # Sample weight can be implemented via a simple rescaling. 
+ X, y = _rescale_data(X, y, sample_weight) + + if self.positive: + # TODO: implement optimize.nnls first + raise NotImplementedError("Does not support positive coefficients!") + elif X.issparse(): + # TODO: implement sparse.linalg.lsqr first + raise NotImplementedError("Does not support sparse input!") + else: + try: + # In numpy: + # Mat mul does NOT always satisfy associative law + # Tyipical mistake: + # (mt.linalg.inv(X.T @ X) @ (X.T @ y)).T + self.coef_ = (mt.linalg.inv(X.T @ X) @ X.T @ y).T + self.coef_.execute() + except LinAlgError: + # TODO: implement linalg.lstsq first + raise NotImplementedError("Does not support sigular matrix!") + + if y.ndim == 1: + self.coef_ = mt.ravel(self.coef_) + self.coef_.execute() + self._set_intercept(X_offset, y_offset, X_scale) + return self + + +class LinearClassifierMixin(ClassifierMixin): + """Mixin for linear classifiers. + + Handles prediction for sparse and dense X. + """ + + def decision_function(self, X): + """ + Predict confidence scores for samples. + + The confidence score for a sample is proportional to the signed + distance of that sample to the hyperplane. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes) + Confidence scores per (sample, class) combination. In the binary + case, confidence score for self.classes_[1] where >0 means this + class would be predicted. + """ + check_is_fitted(self) + + X = check_array(X, accept_sparse="csr") + + n_features = self.coef_.shape[1] + if X.shape[1] != n_features: + raise ValueError( + "X has %d features per sample; expecting %d" % (X.shape[1], n_features) + ) + + scores = mt.dot(X, self.coef_.T) + self.intercept_ + return scores + + def predict(self, X): + """ + Predict class labels for samples in X. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape [n_samples] + Predicted class label per sample. + """ + scores = self.decision_function(X) + indices = scores.argmax(axis=1) + return self.classes_[indices].execute() diff --git a/python/xorbits/_mars/learn/linear_model/tests/__init__.py b/python/xorbits/_mars/learn/linear_model/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/linear_model/tests/test_base.py b/python/xorbits/_mars/learn/linear_model/tests/test_base.py new file mode 100644 index 000000000..2d5fdfbac --- /dev/null +++ b/python/xorbits/_mars/learn/linear_model/tests/test_base.py @@ -0,0 +1,698 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
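The dense branch of LinearRegression.fit above solves the normal equations directly, coef = (X^T X)^(-1) X^T y, which is also why a singular X^T X currently raises NotImplementedError rather than falling back to a least-squares solver. A NumPy-only sketch on hypothetical data, included just to make the algebra concrete:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.randn(50)

coef_normal_eq = np.linalg.inv(X.T @ X) @ X.T @ y     # what fit() computes, via mt.linalg
coef_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)    # reference least-squares solution
print(np.allclose(coef_normal_eq, coef_lstsq))        # True for well-conditioned X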
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from scipy import linalg, sparse +from sklearn.datasets import load_iris, make_regression, make_sparse_uncorrelated +from sklearn.linear_model import LinearRegression as sklearn_LR +from sklearn.linear_model._base import make_dataset +from sklearn.utils import check_random_state + +from .. import LinearRegression +from .._base import _preprocess_data, _rescale_data + +rng = np.random.RandomState(0) +rtol = 1e-6 + + +def test_linear_regression(setup): + # Regular model fitting, #samples > 2, #features >= 2 + X = [[1, 1.5], [1.8, 2], [4, 5]] + Y = [1, 2, 3] + + reg = LinearRegression() + reg.fit(X, Y) + + model = sklearn_LR() + model.fit(X, Y) + + assert_array_almost_equal(reg.coef_, model.coef_) + assert_array_almost_equal(reg.intercept_, model.intercept_) + assert_array_almost_equal(reg.predict(X), model.predict(X)) + + # Regular model fitting, #samples <= 2, # features < 2 + error_msg = re.escape("Does not support sigular matrix!") + + X = [[1], [2]] + Y = [1, 2] + + reg = LinearRegression() + reg.fit(X, Y) + + model = sklearn_LR() + model.fit(X, Y) + + assert_array_almost_equal(reg.coef_, model.coef_) + assert_array_almost_equal(reg.intercept_, model.intercept_) + assert_array_almost_equal(reg.predict(X), model.predict(X)) + + # Extra case #1: singular matrix, degenerate input + error_msg = re.escape("Does not support sigular matrix!") + + X = [[1]] + Y = [0] + + reg = LinearRegression() + with pytest.raises(NotImplementedError, match=error_msg): + reg.fit(X, Y) + + # # Extra case #2: algebrically singular matrix but algorithmically not + # # Works locally but not work in github checks + # # May be because the inverse is super large + # X = [[1, 1.5], [1.8, 2]] + # Y = [1, 2] + + # reg = LinearRegression() + # reg.fit(X, Y) + + # model = sklearn_LR() + # model.fit(X, Y) + + # with pytest.raises(AssertionError): + # assert_array_almost_equal(reg.coef_, model.coef_) + + +def test_linear_regression_sample_weights(setup): + # TODO: loop over sparse data as well + + rng = np.random.RandomState(0) + + # It would not work with under-determined systems + for n_samples, n_features in ((6, 5),): + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + sample_weight = 1.0 + rng.rand(n_samples) + + for intercept in (True, False): + # LinearRegression with explicit sample_weight + reg = LinearRegression(fit_intercept=intercept) + reg.fit(X, y, sample_weight=sample_weight) + coefs1 = reg.coef_ + inter1 = reg.intercept_ + + assert reg.coef_.shape == (X.shape[1],) # sanity checks + assert reg.score(X, y).to_numpy() > 0.5 + + # Closed form of the weighted least square + # theta = (X^T W X)^(-1) * X^T W y + W = np.diag(sample_weight) + if intercept is False: + X_aug = X + else: + dummy_column = np.ones(shape=(n_samples, 1)) + X_aug = np.concatenate((dummy_column, X), axis=1) + + coefs2 = linalg.solve(X_aug.T.dot(W).dot(X_aug), X_aug.T.dot(W).dot(y)) + + if intercept is False: + assert_array_almost_equal(coefs1, 
coefs2) + else: + assert_array_almost_equal(coefs1, coefs2[1:]) + assert_almost_equal(inter1.to_numpy(), coefs2[0]) + + +def test_raises_value_error_if_positive_and_sparse(setup): + error_msg = re.escape( + "A sparse tensor was passed, but dense " + "data is required. Use X.todense() to " + "convert to a dense tensor." + ) + # X must not be sparse if positive == True + X = sparse.eye(10) + y = np.ones(10) + + reg = LinearRegression(positive=True) + + with pytest.raises(TypeError, match=error_msg): + reg.fit(X, y) + + +def test_raises_value_error_if_sample_weights_greater_than_1d(setup): + error_msg = re.escape("Sample weights must be 1D array or scalar") + + X = rng.randn(10, 5) + y = rng.randn(10) + sample_weights_2D = rng.randn(10, 2) ** 2 + 1 + + reg = LinearRegression() + + with pytest.raises(ValueError, match=error_msg): + reg.fit(X, y, sample_weights_2D) + + +def test_fit_intercept(setup): + # Test assertions on betas shape. + X2 = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]]) + X3 = np.array( + [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]] + ) + y = np.array([1, 1]) + + lr2_without_intercept = LinearRegression(fit_intercept=False).fit(X2, y) + lr2_with_intercept = LinearRegression().fit(X2, y) + + lr3_without_intercept = LinearRegression(fit_intercept=False).fit(X3, y) + lr3_with_intercept = LinearRegression().fit(X3, y) + + assert lr2_with_intercept.coef_.shape == lr2_without_intercept.coef_.shape + assert lr3_with_intercept.coef_.shape == lr3_without_intercept.coef_.shape + assert lr2_without_intercept.coef_.ndim == lr3_without_intercept.coef_.ndim + + +def test_linear_regression_sparse(setup, random_state=0): + # Test that linear regression also works with sparse data + random_state = check_random_state(random_state) + for i in range(10): + n = 100 + X = sparse.eye(n, n) + beta = random_state.rand(n) + y = X * beta[:, np.newaxis] + ols = LinearRegression() + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + ols.fit(X, y.ravel()) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_linear_regression_sparse_equal_dense(setup, normalize, fit_intercept): + # Test that linear regression agrees between sparse and dense + rng = check_random_state(0) + n_samples = 200 + n_features = 2 + X = rng.randn(n_samples, n_features) + X[X < 0.1] = 0.0 + Xcsr = sparse.csr_matrix(X) + y = rng.rand(n_samples) + params = dict(normalize=normalize, fit_intercept=fit_intercept) + clf_dense = LinearRegression(**params) + clf_sparse = LinearRegression(**params) + clf_dense.fit(X, y) + + if fit_intercept is False: + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + clf_sparse.fit(Xcsr, y) + else: + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + clf_sparse.fit(Xcsr, y) + + +def test_linear_regression_multiple_outcome(setup, random_state=0): + # Test multiple-outcome linear regressions + X, y = make_regression(random_state=random_state) + + Y = np.vstack((y, y)).T + n_features = X.shape[1] + + reg = LinearRegression() + reg.fit((X), Y) + assert reg.coef_.shape == (2, n_features) + Y_pred = reg.predict(X) + reg.fit(X, y) + y_pred = reg.predict(X) + assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3) + + +def test_linear_regression_sparse_multiple_outcome(setup, 
random_state=0): + # Test multiple-outcome linear regressions with sparse data + random_state = check_random_state(random_state) + X, y = make_sparse_uncorrelated(random_state=random_state) + X = sparse.coo_matrix(X) + Y = np.vstack((y, y)).T + + ols = LinearRegression() + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + ols.fit(X, Y) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + ols.fit(X, y.ravel()) + + +# # When optimize.nnls is implemented, one can utilize this test case +# def test_linear_regression_positive(setup): +# # Test nonnegative LinearRegression on a simple dataset. +# X = [[1], [2]] +# y = [1, 2] + +# reg = LinearRegression(positive=True) +# reg.fit(X, y) + +# assert_array_almost_equal(reg.coef_, [1]) +# assert_array_almost_equal(reg.intercept_, [0]) +# assert_array_almost_equal(reg.predict(X), [1, 2]) + +# # test it also for degenerate input +# X = [[1]] +# y = [0] + +# reg = LinearRegression(positive=True) +# reg.fit(X, y) +# assert_allclose(reg.coef_, [0]) +# assert_allclose(reg.intercept_, [0]) +# assert_allclose(reg.predict(X), [0]) + + +# # When optimize.nnls is implemented, one can utilize this test case +# def test_linear_regression_positive_multiple_outcome(setup, random_state=0): +# # Test multiple-outcome nonnegative linear regressions +# random_state = check_random_state(random_state) +# X, y = make_sparse_uncorrelated(random_state=random_state) +# Y = np.vstack((y, y)).T +# n_features = X.shape[1] + +# ols = LinearRegression(positive=True) +# ols.fit(X, Y) +# assert ols.coef_.shape == (2, n_features) +# assert np.all(ols.coef_.to_numpy() >= 0.) +# Y_pred = ols.predict(X) +# ols.fit(X, y.ravel()) +# y_pred = ols.predict(X) +# assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred) + + +def test_linear_regression_positive_vs_nonpositive(setup): + # Test differences with LinearRegression when positive=False. + X, y = make_sparse_uncorrelated(random_state=0) + + # reg = LinearRegression(positive=True) + reg = sklearn_LR(positive=True) + reg.fit(X, y) + regn = LinearRegression(positive=False) + regn.fit(X, y) + + assert np.mean(((reg.coef_ - regn.coef_) ** 2).to_numpy()) > 1e-3 + + +def test_linear_regression_positive_vs_nonpositive_when_positive(setup): + # Test LinearRegression fitted coefficients + # when the problem is positive. + n_samples = 200 + n_features = 4 + X = rng.rand(n_samples, n_features) + y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3] + + # reg = LinearRegression(positive=True) + reg = sklearn_LR(positive=True) + reg.fit(X, y) + regn = LinearRegression(positive=False) + regn.fit(X, y) + + assert np.mean(((reg.coef_ - regn.coef_) ** 2).to_numpy()) < 1e-6 + + +# # Failed: DID NOT WARN. +# # No such warning "pandas.DataFrame with sparse columns found." 
+# def test_linear_regression_pd_sparse_dataframe_warning(): +# pd = pytest.importorskip('pandas') +# # restrict the pd versions < '0.24.0' +# # as they have a bug in is_sparse func +# if parse_version(pd.__version__) < parse_version('0.24.0'): +# pytest.skip("pandas 0.24+ required.") + +# # Warning is raised only when some of the columns is sparse +# df = pd.DataFrame({'0': np.random.randn(10)}) +# for col in range(1, 4): +# arr = np.random.randn(10) +# arr[:8] = 0 +# # all columns but the first column is sparse +# if col != 0: +# arr = pd.arrays.SparseArray(arr, fill_value=0) +# df[str(col)] = arr + +# msg = "pandas.DataFrame with sparse columns found." +# with pytest.warns(UserWarning, match=msg): +# reg = LinearRegression() +# reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) + +# # does not warn when the whole dataframe is sparse +# df['0'] = pd.arrays.SparseArray(df['0'], fill_value=0) +# assert hasattr(df, "sparse") + +# with pytest.warns(None) as record: +# reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) +# assert not record + + +def test_preprocess_data(setup): + n_samples = 200 + n_features = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + expected_X_mean = np.mean(X, axis=0) + expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0]) + expected_y_mean = np.mean(y, axis=0) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=False, normalize=False + ) + assert_array_almost_equal(X_mean, np.zeros(n_features)) + assert_array_almost_equal(y_mean, 0) + assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(Xt, X) + assert_array_almost_equal(yt, y) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=False + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(Xt, X - expected_X_mean) + assert_array_almost_equal(yt, y - expected_y_mean) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=True + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, expected_X_norm) + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(yt, y - expected_y_mean) + + +def test_preprocess_data_multioutput(setup): + n_samples = 200 + n_features = 3 + n_outputs = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples, n_outputs) + expected_y_mean = np.mean(y, axis=0) + + # case 1 + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=False, normalize=False) + assert_array_almost_equal(y_mean, np.zeros(n_outputs)) + assert_array_almost_equal(yt, y) + + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=False) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(yt, y - y_mean) + + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=True) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(yt, y - y_mean) + + # case 2 + X = sparse.csc_matrix(X) + error_msg = "Does not support sparse input!" 
+ with pytest.raises(NotImplementedError, match=error_msg): + _, yt, _, y_mean, _ = _preprocess_data( + X, y, fit_intercept=False, normalize=False + ) + + with pytest.raises(NotImplementedError, match=error_msg): + _, yt, _, y_mean, _ = _preprocess_data( + X, y, fit_intercept=True, normalize=False + ) + + with pytest.raises(NotImplementedError, match=error_msg): + _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True, normalize=True) + + +def test_preprocess_data_weighted(setup): + n_samples = 200 + n_features = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + sample_weight = rng.rand(n_samples) + expected_X_mean = np.average(X, axis=0, weights=sample_weight) + expected_y_mean = np.average(y, axis=0, weights=sample_weight) + + # XXX: if normalize=True, should we expect a weighted standard deviation? + # Currently not weighted, but calculated with respect to weighted mean + expected_X_norm = ( + np.sqrt(X.shape[0]) * np.mean((X - expected_X_mean) ** 2, axis=0) ** 0.5 + ) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=False, sample_weight=sample_weight + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(Xt, X - expected_X_mean) + assert_array_almost_equal(yt, y - expected_y_mean) + + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=True, normalize=True, sample_weight=sample_weight + ) + assert_array_almost_equal(X_mean, expected_X_mean) + assert_array_almost_equal(y_mean, expected_y_mean) + assert_array_almost_equal(X_norm, expected_X_norm) + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(yt, y - expected_y_mean) + + +def test_sparse_preprocess_data_with_return_mean(setup): + n_samples = 200 + n_features = 2 + # random_state not supported yet in sparse.rand + X = sparse.rand(n_samples, n_features, density=0.5) # , random_state=rng + X = sparse.csr_matrix(X) + y = rng.rand(n_samples) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, y, fit_intercept=False, normalize=False, return_mean=True + ) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, + y, + fit_intercept=True, + normalize=False, + return_mean=True, + check_input=False, + ) + + error_msg = re.escape("Does not support sparse input!") + with pytest.raises(NotImplementedError, match=error_msg): + Xt, yt, X_mean, y_mean, X_norm = _preprocess_data( + X, + y, + fit_intercept=True, + normalize=True, + return_mean=True, + check_input=False, + ) + + +# # AttributeError: 'TensorData' object has no attribute 'getformat' +# def test_csr_preprocess_data(): +# # Test output format of _preprocess_data, when input is csr +# X, y = make_regression() +# X[X < 2.5] = 0.0 +# csr = sparse.csr_matrix(X) +# csr_, y, _, _, _ = _preprocess_data(csr, y, True) +# assert csr_.getformat() == 'csr' + + +@pytest.mark.parametrize("is_sparse", (True, False)) +@pytest.mark.parametrize("to_copy", (True, False)) +def test_preprocess_copy_data_no_checks(setup, is_sparse, to_copy): + X, y = make_regression() + X[X < 2.5] = 0.0 + + if is_sparse: + X = sparse.csr_matrix(X) + error_msg = re.escape("Does not support sparse input!") + 
with pytest.raises(NotImplementedError, match=error_msg): + X_, y_, _, _, _ = _preprocess_data( + X, y, True, copy=to_copy, check_input=False + ) + else: + X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False) + + if to_copy and is_sparse: + assert not np.may_share_memory(X_.data, X.data) + elif to_copy: + assert not np.may_share_memory(X_.to_numpy(), X) + elif is_sparse: + assert np.may_share_memory(X_.data, X.data) + # else: # fake pass + # assert np.may_share_memory(X_.to_numpy(), X) + + +def test_dtype_preprocess_data(setup): + n_samples = 200 + n_features = 2 + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + + X_32 = np.asarray(X, dtype=np.float32) + y_32 = np.asarray(y, dtype=np.float32) + X_64 = np.asarray(X, dtype=np.float64) + y_64 = np.asarray(y, dtype=np.float64) + + for fit_intercept in [True, False]: + for normalize in [True, False]: + Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data( + X_32, + y_32, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data( + X_64, + y_64, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = _preprocess_data( + X_32, + y_64, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = _preprocess_data( + X_64, + y_32, + fit_intercept=fit_intercept, + normalize=normalize, + return_mean=True, + ) + + assert Xt_32.dtype == np.float32 + assert yt_32.dtype == np.float32 + assert X_mean_32.dtype == np.float32 + assert y_mean_32.dtype == np.float32 + assert X_norm_32.dtype == np.float32 + + assert Xt_64.dtype == np.float64 + assert yt_64.dtype == np.float64 + assert X_mean_64.dtype == np.float64 + assert y_mean_64.dtype == np.float64 + assert X_norm_64.dtype == np.float64 + + assert Xt_3264.dtype == np.float32 + assert yt_3264.dtype == np.float32 + assert X_mean_3264.dtype == np.float32 + assert y_mean_3264.dtype == np.float32 + assert X_norm_3264.dtype == np.float32 + + assert Xt_6432.dtype == np.float64 + assert yt_6432.dtype == np.float64 + assert X_mean_6432.dtype == np.float64 + assert y_mean_6432.dtype == np.float64 + assert X_norm_6432.dtype == np.float64 + + assert X_32.dtype == np.float32 + assert y_32.dtype == np.float32 + assert X_64.dtype == np.float64 + assert y_64.dtype == np.float64 + + assert_array_almost_equal(Xt_32, Xt_64) + assert_array_almost_equal(yt_32, yt_64) + assert_array_almost_equal(X_mean_32, X_mean_64) + assert_array_almost_equal(y_mean_32, y_mean_64) + assert_array_almost_equal(X_norm_32, X_norm_64) + + +@pytest.mark.parametrize("n_targets", [None, 2]) +def test_rescale_data_dense(setup, n_targets): + n_samples = 200 + n_features = 2 + + sample_weight = 1.0 + rng.rand(n_samples) + X = rng.rand(n_samples, n_features) + if n_targets is None: + y = rng.rand(n_samples) + else: + y = rng.rand(n_samples, n_targets) + rescaled_X, rescaled_y = _rescale_data(X, y, sample_weight) + rescaled_X2 = X * np.sqrt(sample_weight)[:, np.newaxis] + if n_targets is None: + rescaled_y2 = y * np.sqrt(sample_weight) + else: + rescaled_y2 = y * np.sqrt(sample_weight)[:, np.newaxis] + assert_array_almost_equal(rescaled_X, rescaled_X2) + assert_array_almost_equal(rescaled_y, rescaled_y2) + + +def test_fused_types_make_dataset(setup): + iris = load_iris() + + X_32 = iris.data.astype(np.float32) + y_32 = iris.target.astype(np.float32) + 
X_csr_32 = sparse.csr_matrix(X_32) + sample_weight_32 = np.arange(y_32.size, dtype=np.float32) + + X_64 = iris.data.astype(np.float64) + y_64 = iris.target.astype(np.float64) + X_csr_64 = sparse.csr_matrix(X_64) + sample_weight_64 = np.arange(y_64.size, dtype=np.float64) + + # array + dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32) + dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64) + xi_32, yi_32, _, _ = dataset_32._next_py() + xi_64, yi_64, _, _ = dataset_64._next_py() + xi_data_32, _, _ = xi_32 + xi_data_64, _, _ = xi_64 + + assert xi_data_32.dtype == np.float32 + assert xi_data_64.dtype == np.float64 + assert_allclose(yi_64, yi_32, rtol=rtol) + + # csr + datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32) + datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64) + xicsr_32, yicsr_32, _, _ = datasetcsr_32._next_py() + xicsr_64, yicsr_64, _, _ = datasetcsr_64._next_py() + xicsr_data_32, _, _ = xicsr_32 + xicsr_data_64, _, _ = xicsr_64 + + assert xicsr_data_32.dtype == np.float32 + assert xicsr_data_64.dtype == np.float64 + + assert_allclose(xicsr_data_64, xicsr_data_32, rtol=rtol) + assert_allclose(yicsr_64, yicsr_32, rtol=rtol) + + assert_array_equal(xi_data_32, xicsr_data_32) + assert_array_equal(xi_data_64, xicsr_data_64) + assert_array_equal(yi_32, yicsr_32) + assert_array_equal(yi_64, yicsr_64) + + +def test_raise_notimplemented_when_positive(setup): + error_msg = re.escape("Does not support positive coefficients!") + + X = [[1], [2]] + y = [1, 2] + + reg = LinearRegression(positive=True) + with pytest.raises(NotImplementedError, match=error_msg): + reg.fit(X, y) diff --git a/python/xorbits/_mars/learn/metrics/__init__.py b/python/xorbits/_mars/learn/metrics/__init__.py new file mode 100644 index 000000000..5c7c792b1 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._classification import ( + accuracy_score, + f1_score, + fbeta_score, + log_loss, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, +) +from ._ranking import auc, roc_auc_score, roc_curve +from ._regresssion import r2_score +from ._scorer import get_scorer +from .pairwise import euclidean_distances, pairwise_distances, pairwise_distances_topk diff --git a/python/xorbits/_mars/learn/metrics/_base.py b/python/xorbits/_mars/learn/metrics/_base.py new file mode 100644 index 000000000..337915669 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_base.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import combinations + +from ... import tensor as mt +from ..utils import check_array, check_consistent_length +from ..utils.multiclass import type_of_target + + +def _average_binary_score( + binary_metric, + y_true, + y_score, + average, + sample_weight=None, + session=None, + run_kwargs=None, +): + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options: # pragma: no cover + raise ValueError("average has to be one of {0}".format(average_options)) + + y_type = type_of_target(y_true).to_numpy(session=session, **(run_kwargs or dict())) + if y_type not in ("binary", "multilabel-indicator"): # pragma: no cover + raise ValueError("{0} format is not supported".format(y_type)) + + if y_type == "binary": + return binary_metric(y_true, y_score, sample_weight=sample_weight) + + check_consistent_length( + y_true, y_score, sample_weight, session=session, run_kwargs=run_kwargs + ) + y_true = check_array(y_true) + y_score = check_array(y_score) + + not_average_axis = 1 + score_weight = sample_weight + average_weight = None + + if average == "micro": + if score_weight is not None: # pragma: no cover + score_weight = mt.repeat(score_weight, y_true.shape[1]) + y_true = y_true.ravel() + y_score = y_score.ravel() + + elif average == "weighted": + if score_weight is not None: # pragma: no cover + average_weight = mt.sum( + mt.multiply(y_true, mt.reshape(score_weight, (-1, 1))), axis=0 + ) + else: + average_weight = mt.sum(y_true, axis=0) + if mt.isclose(average_weight.sum(), 0.0).to_numpy( + session=session, **(run_kwargs or dict()) + ): + return 0 + + elif average == "samples": + # swap average_weight <-> score_weight + average_weight = score_weight + score_weight = None + not_average_axis = 0 + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if y_score.ndim == 1: + y_score = y_score.reshape((-1, 1)) + + n_classes = y_score.shape[not_average_axis] + score = mt.zeros((n_classes,)) + for c in range(n_classes): + y_true_c = y_true.take([c], axis=not_average_axis).ravel() + y_score_c = y_score.take([c], axis=not_average_axis).ravel() + score[c] = binary_metric(y_true_c, y_score_c, sample_weight=score_weight) + + # Average the results + if average is not None: + if average_weight is not None: + # Scores with 0 weights are forced to be 0, preventing the average + # score from being affected by 0-weighted NaN elements. 
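+            # Coerce average_weight to a Mars tensor first so the zero-weight
+            # mask assignment below operates on tensors.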
+ average_weight = mt.asarray(average_weight) + score[average_weight == 0] = 0 + return mt.average(score, weights=average_weight) + else: + return score + + +def _average_multiclass_ovo_score( + binary_metric, y_true, y_score, average="macro", session=None, run_kwargs=None +): + check_consistent_length(y_true, y_score, session=session, run_kwargs=run_kwargs) + + y_true_unique = mt.unique(y_true).to_numpy() + n_classes = y_true_unique.shape[0] + n_pairs = n_classes * (n_classes - 1) // 2 + pair_scores = mt.empty(n_pairs) + + is_weighted = average == "weighted" + prevalence = mt.empty(n_pairs) if is_weighted else None + + # Compute scores treating a as positive class and b as negative class, + # then b as positive class and a as negative class + for ix, (a, b) in enumerate(combinations(y_true_unique, 2)): + a_mask = y_true == a + b_mask = y_true == b + ab_mask = mt.logical_or(a_mask, b_mask) + + if is_weighted: + prevalence[ix] = mt.average(ab_mask) + + a_true = a_mask[ab_mask] + b_true = b_mask[ab_mask] + + a_true_score = binary_metric(a_true, y_score[ab_mask, a]) + b_true_score = binary_metric(b_true, y_score[ab_mask, b]) + pair_scores[ix] = (a_true_score + b_true_score) / 2 + + return mt.average(pair_scores, weights=prevalence) diff --git a/python/xorbits/_mars/learn/metrics/_check_targets.py b/python/xorbits/_mars/learn/metrics/_check_targets.py new file mode 100644 index 000000000..e9d58bf53 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_check_targets.py @@ -0,0 +1,196 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.metrics._classification import _check_targets as sklearn_check_targets +except ImportError: # pragma: no cover + # sklearn < 0.22 + from sklearn.metrics.classification import _check_targets as sklearn_check_targets + +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import ENTITY_TYPE, ExecutableTuple, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ..operands import LearnOperand, LearnOperandMixin, OutputType +from ..utils import check_consistent_length, column_or_1d +from ..utils.multiclass import type_of_target + + +class CheckTargets(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.CHECK_TARGETS + + y_true = AnyField("y_true") + y_pred = AnyField("y_pred") + + @property + def output_limit(self): + return 3 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + if isinstance(self.y_true, ENTITY_TYPE): + self.y_true = next(inputs_iter) + if isinstance(self.y_pred, ENTITY_TYPE): + self.y_pred = next(inputs_iter) + + def __call__(self, y_true, y_pred): + # scalar(y_type), y_true, y_pred + self.output_types = [OutputType.tensor] * 3 + + inputs = [] + if isinstance(y_true, ENTITY_TYPE): + inputs.append(y_true) + if isinstance(y_pred, ENTITY_TYPE): + inputs.append(y_pred) + + kws = list() + kws.append( + {"shape": (), "dtype": np.dtype(object), "order": TensorOrder.C_ORDER} + ) + kws.extend([y.params for y in (mt.tensor(y_true), mt.tensor(y_pred))]) + kws[1]["shape"] = kws[2]["shape"] = (np.nan,) + return ExecutableTuple(self.new_tileables(inputs, kws=kws)) + + @classmethod + def tile(cls, op): + y_true, y_pred = op.y_true, op.y_pred + if isinstance(y_true, ENTITY_TYPE): + y_true = mt.tensor(y_true) + if isinstance(y_pred, ENTITY_TYPE): + y_pred = mt.tensor(y_pred) + + if len(op.inputs) == 0: + # no entity input + type_true, y_true, y_pred = sklearn_check_targets(y_true, y_pred) + new_op = op.copy() + outs = yield from recursive_tile( + mt.tensor(type_true), mt.tensor(y_true), mt.tensor(y_pred) + ) + params = [out.params.copy() for out in op.outputs] + for param, out in zip(params, outs): + param["nsplits"] = out.nsplits + param["chunks"] = out.chunks + param["shape"] = out.shape + return new_op.new_tileables(op.inputs, kws=params) + + check_consistent_length(y_true, y_pred) + + type_true, type_pred = type_of_target(y_true), type_of_target(y_pred) + y_true, y_pred = mt.tensor(y_true), mt.tensor(y_pred) + tileables = y_true, y_pred, type_true, type_pred = yield from recursive_tile( + y_true, y_pred, type_true, type_pred + ) + yield [c for t in tileables for c in t.chunks] + + ctx = get_context() + type_true, type_pred = [ + d.item() if hasattr(d, "item") else d + for d in ctx.get_chunks_result( + [type_true.chunks[0].key, type_pred.chunks[0].key] + ) + ] + + y_type = {type_true, type_pred} + if y_type == {"binary", "multiclass"}: + y_type = {"multiclass"} + + if len(y_type) > 1: + raise ValueError( + f"Classification metrics can't handle a mix of {type_true} " + f"and {type_pred} targets" + ) + + # We can't have more than one value on y_type => The set is no more needed + y_type = y_type.pop() + + # No metrics support "multiclass-multioutput" format + if y_type not in ["binary", "multiclass", "multilabel-indicator"]: + raise ValueError(f"{y_type} is not supported") + + if y_type in ["binary", "multiclass"]: + y_true = column_or_1d(y_true) + y_pred = column_or_1d(y_pred) + if y_type == "binary": + unique_values = mt.union1d(y_true, y_pred) + y_type = mt.where( + mt.count_nonzero(unique_values) > 2, "multiclass", y_type + ) + elif y_type.startswith("multilabel"): + y_true = mt.tensor(y_true).tosparse() + y_pred = mt.tensor(y_pred).tosparse() + y_type 
= "multilabel-indicator" + + if not isinstance(y_true, ENTITY_TYPE): + y_true = mt.tensor(y_true) + if not isinstance(y_pred, ENTITY_TYPE): + y_pred = mt.tensor(y_pred) + if not isinstance(y_type, TENSOR_TYPE): + y_type = mt.tensor(y_type, dtype=object) + + y_type, y_true, y_pred = yield from recursive_tile(y_type, y_true, y_pred) + + kws = [out.params for out in op.outputs] + kws[0].update(dict(nsplits=(), chunks=[y_type.chunks[0]])) + kws[1].update( + dict( + nsplits=y_true.nsplits, + chunks=y_true.chunks, + shape=tuple(sum(sp) for sp in y_true.nsplits), + ) + ) + kws[2].update( + dict( + nsplits=y_pred.nsplits, + chunks=y_pred.chunks, + shape=tuple(sum(sp) for sp in y_pred.nsplits), + ) + ) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws) + + +def _check_targets(y_true, y_pred): + """Check that y_true and y_pred belong to the same classification task + + This converts multiclass or binary types to a common shape, and raises a + ValueError for a mix of multilabel and multiclass targets, a mix of + multilabel formats, for the presence of continuous-valued or multioutput + targets, or for targets of different lengths. + + Column vectors are squeezed to 1d, while multilabel formats are returned + as CSR sparse label indicators. + + Parameters + ---------- + y_true : array-like + + y_pred : array-like + + Returns + ------- + type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} + The type of the true target data, as output by + ``utils.multiclass.type_of_target`` + + y_true : Tensor + + y_pred : Tensor + """ + op = CheckTargets(y_true=y_true, y_pred=y_pred) + return op(y_true, y_pred) diff --git a/python/xorbits/_mars/learn/metrics/_classification.py b/python/xorbits/_mars/learn/metrics/_classification.py new file mode 100644 index 000000000..b7c8a46eb --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_classification.py @@ -0,0 +1,1475 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np +from sklearn.exceptions import UndefinedMetricWarning + +from ... import execute, fetch +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import ENTITY_TYPE, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, BoolField, KeyField +from ...tensor.core import TensorOrder +from ..operands import LearnOperand, LearnOperandMixin, OutputType +from ..preprocessing import LabelBinarizer, LabelEncoder +from ..utils import check_array, check_consistent_length, column_or_1d +from ..utils.multiclass import unique_labels +from ..utils.sparsefuncs import count_nonzero +from ._check_targets import _check_targets + + +class AccuracyScore(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.ACCURACY_SCORE + + _y_true = AnyField("y_true") + _y_pred = AnyField("y_pred") + _normalize = BoolField("normalize") + _sample_weight = AnyField("sample_weight") + _type_true = KeyField("type_true") + + def __init__( + self, + y_true=None, + y_pred=None, + normalize=None, + sample_weight=None, + type_true=None, + **kw + ): + super().__init__( + _y_true=y_true, + _y_pred=y_pred, + _normalize=normalize, + _sample_weight=sample_weight, + _type_true=type_true, + **kw + ) + self.output_types = [OutputType.tensor] + + @property + def y_true(self): + return self._y_true + + @property + def y_pred(self): + return self._y_pred + + @property + def normalize(self): + return self._normalize + + @property + def sample_weight(self): + return self._sample_weight + + @property + def type_true(self): + return self._type_true + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + if self._y_true is not None: + self._y_true = next(inputs_iter) + if self._y_pred is not None: + self._y_pred = next(inputs_iter) + if self._type_true is not None: + self._type_true = next(inputs_iter) + if isinstance(self._sample_weight, ENTITY_TYPE): + self._sample_weight = next(inputs_iter) + + def __call__(self, y_true, y_pred): + type_true, y_true, y_pred = _check_targets(y_true, y_pred) + self._type_true = type_true + inputs = [y_true, y_pred, type_true] + if isinstance(self._sample_weight, ENTITY_TYPE): + inputs.append(self._sample_weight) + + dtype = np.dtype(float) if self._normalize else np.result_type(y_true, y_pred) + return self.new_tileable( + inputs, dtype=dtype, shape=(), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + # make sure type_true executed first + chunks = [op.type_true.chunks[0]] + yield chunks + + ctx = get_context() + type_true = ctx.get_chunks_result([chunks[0].key])[0] + + y_true, y_pred = op.y_true, op.y_pred + if type_true.item().startswith("multilabel"): + differing_labels = mt.count_nonzero(y_true - y_pred, axis=1) + score = mt.equal(differing_labels, 0) + else: + score = mt.equal(y_true, y_pred) + + result = _weighted_sum(score, op.sample_weight, op.normalize) + return [(yield from recursive_tile(result))] + + +def _weighted_sum(sample_score, sample_weight, normalize=False): + if normalize: + return mt.average(sample_score, weights=sample_weight) + elif sample_weight is not None: + return mt.dot(sample_score, sample_weight) + else: + return sample_score.sum() + + +def accuracy_score( + y_true, y_pred, normalize=True, sample_weight=None, session=None, run_kwargs=None +): + """Accuracy classification score. + + In multilabel classification, this function computes subset accuracy: + the set of labels predicted for a sample must *exactly* match the + corresponding set of labels in y_true. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + y_true : 1d array-like, or label indicator tensor / sparse tensor + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator tensor / sparse tensor + Predicted labels, as returned by a classifier. + + normalize : bool, optional (default=True) + If ``False``, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + If ``normalize == True``, return the fraction of correctly + classified samples (float), else returns the number of correctly + classified samples (int). + + The best performance is 1 with ``normalize == True`` and the number + of samples with ``normalize == False``. + + See also + -------- + jaccard_score, hamming_loss, zero_one_loss + + Notes + ----- + In binary and multiclass classification, this function is equal + to the ``jaccard_score`` function. + + Examples + -------- + >>> from mars.learn.metrics import accuracy_score + >>> y_pred = [0, 2, 1, 3] + >>> y_true = [0, 1, 2, 3] + >>> accuracy_score(y_true, y_pred).execute() + 0.5 + >>> accuracy_score(y_true, y_pred, normalize=False).execute() + 2 + + In the multilabel case with binary label indicators: + + >>> import mars.tensor as mt + >>> accuracy_score(mt.array([[0, 1], [1, 1]]), mt.ones((2, 2))).execute() + 0.5 + """ + + # Compute accuracy for each possible representation + op = AccuracyScore( + y_true=y_true, y_pred=y_pred, normalize=normalize, sample_weight=sample_weight + ) + score = op(y_true, y_pred) + return score.execute(session=session, **(run_kwargs or dict())) + + +def log_loss( + y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None +): + r"""Log loss, aka logistic loss or cross-entropy loss. + + This is the loss function used in (multinomial) logistic regression + and extensions of it such as neural networks, defined as the negative + log-likelihood of a logistic model that returns ``y_pred`` probabilities + for its training data ``y_true``. + The log loss is only defined for two or more labels. + For a single sample with true label :math:`y \in \{0,1\}` and + and a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, the log + loss is: + + .. math:: + L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p)) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`preprocessing.LabelBinarizer`. + + eps : float, default=1e-15 + Log loss is undefined for p=0 or p=1, so probabilities are + clipped to max(eps, min(1 - eps, p)). + + normalize : bool, default=True + If true, return the mean loss per sample. + Otherwise, return the sum of the per-sample losses. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. 
+ + Returns + ------- + loss : float + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + + Examples + -------- + >>> from mars.learn.metrics import log_loss + >>> log_loss(["spam", "ham", "ham", "spam"], + ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) + 0.21616... + + References + ---------- + C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, + p. 209. + """ + y_pred = check_array(y_pred, ensure_2d=False) + check_consistent_length(y_pred, y_true, sample_weight) + + lb = LabelBinarizer() + + if labels is not None: + lb.fit(labels) + else: + lb.fit(y_true) + + if len(lb.classes_) == 1: + if labels is None: + raise ValueError( + "y_true contains only one label ({0}). Please " + "provide the true labels explicitly through the " + "labels argument.".format(lb.classes_[0].fetch()) + ) + else: + raise ValueError( + "The labels array needs to contain at least two " + "labels for log_loss, " + "got {0}.".format(lb.classes_.fetch()) + ) + + transformed_labels = lb.transform(y_true) + + if transformed_labels.shape[1] == 1: + transformed_labels = mt.append( + 1 - transformed_labels, transformed_labels, axis=1 + ) + + # Clipping + y_pred = mt.clip(y_pred, eps, 1 - eps) + + # If y_pred is of single dimension, assume y_true to be binary + # and then check. + if y_pred.ndim == 1: # pragma: no cover + y_pred = y_pred[:, mt.newaxis] + if y_pred.shape[1] == 1: # pragma: no cover + y_pred = mt.append(1 - y_pred, y_pred, axis=1) + + # Check if dimensions are consistent. + transformed_labels = check_array(transformed_labels) + if len(lb.classes_) != y_pred.shape[1]: + if labels is None: + raise ValueError( + "y_true and y_pred contain different number of " + "classes {0}, {1}. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: {2}".format( + transformed_labels.shape[1], y_pred.shape[1], lb.classes_.fetch() + ) + ) + else: + raise ValueError( + "The number of classes in labels is different " + "from that in y_pred. Classes found in " + "labels: {0}".format(lb.classes_.fetch()) + ) + + # Renormalize + y_pred /= y_pred.sum(axis=1)[:, mt.newaxis] + loss = -(transformed_labels * mt.log(y_pred)).sum(axis=1) + + return _weighted_sum(loss, sample_weight, normalize).execute() + + +def multilabel_confusion_matrix( + y_true, + y_pred, + *, + sample_weight=None, + labels=None, + samplewise=False, + session=None, + run_kwargs=None +): + """ + Compute a confusion matrix for each class or sample. + + Compute class-wise (default) or sample-wise (samplewise=True) multilabel + confusion matrix to evaluate the accuracy of a classification, and output + confusion matrices for each class or sample. + + In multilabel confusion matrix :math:`MCM`, the count of true negatives + is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`, + true positives is :math:`MCM_{:,1,1}` and false positives is + :math:`MCM_{:,0,1}`. + + Multiclass data will be treated as if binarized under a one-vs-rest + transformation. Returned confusion matrices will be in the order of + sorted unique labels in the union of (y_true, y_pred). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Ground truth (correct) target values. + + y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Estimated targets as returned by a classifier. 
+ + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like of shape (n_classes,), default=None + A list of classes or column indices to select some (or to force + inclusion of classes absent from the data). + + samplewise : bool, default=False + In the multilabel case, this calculates a confusion matrix per sample. + + Returns + ------- + multi_confusion : ndarray of shape (n_outputs, 2, 2) + A 2x2 confusion matrix corresponding to each output in the input. + When calculating class-wise multi_confusion (default), then + n_outputs = n_labels; when calculating sample-wise multi_confusion + (samplewise=True), n_outputs = n_samples. If ``labels`` is defined, + the results will be returned in the order specified in ``labels``, + otherwise the results will be returned in sorted order by default. + + See Also + -------- + confusion_matrix : Compute confusion matrix to evaluate the accuracy of a + classifier. + + Notes + ----- + The `multilabel_confusion_matrix` calculates class-wise or sample-wise + multilabel confusion matrices, and in multiclass tasks, labels are + binarized under a one-vs-rest way; while + :func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix + for confusion between every two classes. + + Examples + -------- + Multiclass case: + + >>> import mars.tensor as mt + >>> from mars.learn.metrics import multilabel_confusion_matrix + >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] + >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] + >>> multilabel_confusion_matrix(y_true, y_pred, + ... labels=["ant", "bird", "cat"]) + array([[[3, 1], + [0, 2]], + + [[5, 0], + [1, 0]], + + [[2, 1], + [1, 2]]]) + + Multilabel-indicator case not implemented yet. + """ + exec_kw = dict(session=session, **(run_kwargs or dict())) + + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + execute(y_type, y_true, y_pred, **exec_kw) + y_type = y_type.fetch() + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + check_consistent_length(y_true, y_pred, sample_weight, **exec_kw) + + if y_type not in ("binary", "multiclass", "multilabel-indicator"): + raise ValueError("%s is not supported" % y_type) + + present_labels = unique_labels(y_true, y_pred) + if labels is None: + labels = present_labels + n_labels = None + else: + labels = mt.tensor(labels) + n_labels = labels.shape[0] + # todo simplify this when mt.setdiff1d is implemented. + labels = labels.rechunk(((np.nan,),)).map_chunk( + lambda l, pl: np.hstack([l, np.setdiff1d(pl, l, assume_unique=True)]), + args=(present_labels,), + dtype=labels.dtype, + shape=(np.nan,), + ) + + if y_true.ndim == 1: + if samplewise: + raise ValueError( + "Samplewise metrics are not available outside of " + "multilabel classification." 
+ ) + + le = LabelEncoder() + le.fit(labels, execute=False) + y_true = le.transform(y_true, execute=False) + y_pred = le.transform(y_pred, execute=False) + sorted_labels = le.classes_ + + # labels are now from 0 to len(labels) - 1 -> use bincount + tp = y_true == y_pred + tp_bins = y_true[tp] + execute(labels, y_true, y_pred, tp_bins, **exec_kw) + if sample_weight is not None: + tp_bins_weights = mt.asarray(sample_weight)[tp] + else: + tp_bins_weights = None + + if tp_bins.shape[0]: + tp_sum = mt.bincount( + tp_bins, weights=tp_bins_weights, minlength=labels.shape[0] + ) + else: + # Pathological case + true_sum = pred_sum = tp_sum = mt.zeros(labels.shape[0]) + if y_pred.shape[0]: + pred_sum = mt.bincount( + y_pred, weights=sample_weight, minlength=labels.shape[0] + ) + if y_true.shape[0]: + true_sum = mt.bincount( + y_true, weights=sample_weight, minlength=labels.shape[0] + ) + + # Retain only selected labels + indices = mt.searchsorted(sorted_labels, labels[:n_labels]) + tp_sum = tp_sum[indices] + true_sum = true_sum[indices] + pred_sum = pred_sum[indices] + + else: + sum_axis = 1 if samplewise else 0 + + def _check_labels(labels, present_labels): + # All labels are index integers for multilabel. + # Select labels: + if not np.array_equal(labels, present_labels): + if np.max(labels) > np.max(present_labels): + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d > %d" % (np.max(labels), np.max(present_labels)) + ) + if np.min(labels) < 0: + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d < 0" % np.min(labels) + ) + return labels + + labels = labels.map_chunk( + _check_labels, + args=(present_labels,), + dtype=labels.dtype, + shape=labels.shape, + ) + + if n_labels is not None: + y_true = y_true[:, labels[:n_labels]] + y_pred = y_pred[:, labels[:n_labels]] + + # calculate weighted counts + true_and_pred = mt.multiply(y_true, y_pred) + tp_sum = count_nonzero( + true_and_pred, axis=sum_axis, sample_weight=sample_weight + ) + pred_sum = count_nonzero(y_pred, axis=sum_axis, sample_weight=sample_weight) + true_sum = count_nonzero(y_true, axis=sum_axis, sample_weight=sample_weight) + + fp = pred_sum - tp_sum + fn = true_sum - tp_sum + tp = tp_sum + + # we need to obtain correct shape of y_true for further computation + executables = (fp, fn, tp, y_true) + execute(*executables, **exec_kw) + + if sample_weight is not None and samplewise: + sample_weight = mt.asarray(sample_weight) + tp = mt.asarray(tp) + fp = mt.asarray(fp) + fn = mt.asarray(fn) + tn = sample_weight * y_true.shape[1] - tp - fp - fn + elif sample_weight is not None: + tn = sum(sample_weight) - tp - fp - fn + elif samplewise: + tn = y_true.shape[1] - tp - fp - fn + else: + tn = y_true.shape[0] - tp - fp - fn + + ret = mt.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) + return ret.execute(**exec_kw) + + +def _check_zero_division(zero_division): # pragma: no cover + if isinstance(zero_division, str) and zero_division == "warn": + return + elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]: + return + raise ValueError( + "Got zero_division={0}." ' Must be one of ["warn", 0, 1]'.format(zero_division) + ) + + +def _warn_prf(average, modifier, msg_start, result_size): # pragma: no cover + axis0, axis1 = "sample", "label" + if average == "samples": + axis0, axis1 = axis1, axis0 + msg = ( + "{0} ill-defined and being set to 0.0 {{0}} " + "no {1} {2}s. 
Use `zero_division` parameter to control" + " this behavior.".format(msg_start, modifier, axis0) + ) + if result_size == 1: + msg = msg.format("due to") + else: + msg = msg.format("in {0}s with".format(axis1)) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + + +def _prf_divide( + numerator, denominator, metric, modifier, average, warn_for, zero_division="warn" +): # pragma: no cover + """Performs division and handles divide-by-zero. + + On zero-division, sets the corresponding result elements equal to + 0 or 1 (according to ``zero_division``). Plus, if + ``zero_division != "warn"`` raises a warning. + + The metric, modifier and average arguments are used only for determining + an appropriate warning. + """ + mask = denominator == 0.0 + denominator = denominator.copy() + denominator[mask] = 1 # avoid infs/nans + result = numerator / denominator + + # if ``zero_division=1``, set those with denominator == 0 equal to 1 + result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0 + + # the user will be removing warnings if zero_division is set to something + # different than its default value. If we are computing only f-score + # the warning will be raised only if precision and recall are ill-defined + if zero_division != "warn" or metric not in warn_for: + return result + + # build appropriate warning + # E.g. "Precision and F-score are ill-defined and being set to 0.0 in + # labels with no predicted samples. Use ``zero_division`` parameter to + # control this behavior." + + if metric in warn_for and "f-score" in warn_for: + msg_start = "{0} and F-score are".format(metric.title()) + elif metric in warn_for: + msg_start = "{0} is".format(metric.title()) + elif "f-score" in warn_for: + msg_start = "F-score is" + else: + return result + + _warn_prf(average, modifier, msg_start, len(result)) + + return result + + +def _check_set_wise_labels( + y_true, y_pred, average, labels, pos_label, session=None, run_kwargs=None +): # pragma: no cover + """Validation associated with set-wise metrics + + Returns identified labels + """ + exec_kwargs = dict(session=session, **(run_kwargs or dict())) + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options and average != "binary": + raise ValueError("average has to be one of " + str(average_options)) + + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + present_labels = unique_labels(y_true, y_pred) + execute(y_type, y_true, y_pred, **exec_kwargs) + y_type = y_type.fetch(**exec_kwargs) + + if average == "binary": + if y_type == "binary": + t_pos_in_labels = mt.any(mt.isin(present_labels, pos_label)) + execute(t_pos_in_labels, present_labels, **exec_kwargs) + pos_in_labels = t_pos_in_labels.fetch(**exec_kwargs) + if pos_in_labels: + if present_labels.shape[0] >= 2: + raise ValueError( + "pos_label=%r is not a valid label: " + "%r" % (pos_label, present_labels) + ) + labels = [pos_label] + else: + average_options = list(average_options) + if y_type == "multiclass": + average_options.remove("samples") + raise ValueError( + "Target is %s but average='binary'. Please " + "choose another average setting, one of %r." % (y_type, average_options) + ) + elif pos_label not in (None, 1): + warnings.warn( + "Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." 
+ % (pos_label, average), + UserWarning, + ) + return labels + + +def precision_recall_fscore_support( + y_true, + y_pred, + *, + beta=1.0, + labels=None, + pos_label=1, + average=None, + warn_for=("precision", "recall", "f-score"), + sample_weight=None, + zero_division="warn", + session=None, + run_kwargs=None +): + """Compute precision, recall, F-measure and support for each class + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The F-beta score can be interpreted as a weighted harmonic mean of + the precision and recall, where an F-beta score reaches its best + value at 1 and worst score at 0. + + The F-beta score weights recall more than precision by a factor of + ``beta``. ``beta == 1.0`` means recall and precision are equally important. + + The support is the number of occurrences of each class in ``y_true``. + + If ``pos_label is None`` and in binary classification, this function + returns the average precision, recall and F-measure if ``average`` + is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float, 1.0 by default + The strength of recall versus precision in the F-score. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ + 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. 
+ ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + Returns + ------- + precision : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + + recall : float (if average is not None) or array of float, , shape =\ + [n_unique_labels] + + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + + support : None (if average is not None) or array of int, shape =\ + [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + References + ---------- + .. [1] `Wikipedia entry for the Precision and recall + `_ + + .. [2] `Wikipedia entry for the F1-score + `_ + + .. [3] `Discriminative Methods for Multi-labeled Classification Advances + in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu + Godbole, Sunita Sarawagi + `_ + + Examples + -------- + >>> import numpy as np + >>> from mars.learn.metrics import precision_recall_fscore_support + >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) + >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + (0.22..., 0.33..., 0.26..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + (0.33..., 0.33..., 0.33..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + (0.22..., 0.33..., 0.26..., None) + + It is possible to compute per-label precisions, recalls, F1-scores and + supports instead of averaging: + + >>> precision_recall_fscore_support(y_true, y_pred, average=None, + ... labels=['pig', 'dog', 'cat']) + (array([0. , 0. , 0.66...]), + array([0., 0., 1.]), array([0. , 0. , 0.8]), + array([2, 2, 2])) + + Notes + ----- + When ``true positive + false positive == 0``, precision is undefined; + When ``true positive + false negative == 0``, recall is undefined. + In such cases, by default the metric will be set to 0, as will f-score, + and ``UndefinedMetricWarning`` will be raised. This behavior can be + modified with ``zero_division``. 
+ """ + exec_kw = dict(session=session, **(run_kwargs or dict())) + + _check_zero_division(zero_division) + if beta < 0: + raise ValueError("beta should be >=0 in the F-beta score") + labels = _check_set_wise_labels( + y_true, + y_pred, + average, + labels, + pos_label, + session=session, + run_kwargs=run_kwargs, + ) + + # Calculate tp_sum, pred_sum, true_sum ### + samplewise = average == "samples" + MCM = multilabel_confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + samplewise=samplewise, + session=session, + run_kwargs=run_kwargs, + ) + tp_sum = MCM[:, 1, 1] + pred_sum = tp_sum + MCM[:, 0, 1] + true_sum = tp_sum + MCM[:, 1, 0] + + if average == "micro": + tp_sum = mt.array([tp_sum.sum()]) + pred_sum = mt.array([pred_sum.sum()]) + true_sum = mt.array([true_sum.sum()]) + + execute(true_sum, **exec_kw) + + # Finally, we have all our sufficient statistics. Divide! # + beta2 = beta**2 + + # Divide, and on zero-division, set scores and/or warn according to + # zero_division: + precision = _prf_divide( + tp_sum, pred_sum, "precision", "predicted", average, warn_for, zero_division + ) + recall = _prf_divide( + tp_sum, true_sum, "recall", "true", average, warn_for, zero_division + ) + + # warn for f-score only if zero_division is warn, it is in warn_for + # and BOTH prec and rec are ill-defined + if zero_division == "warn" and ("f-score",) == warn_for: + any_pred_sum_zero = ( + (pred_sum[true_sum == 0] == 0).any().execute(**exec_kw).fetch(**exec_kw) + ) + if any_pred_sum_zero: + _warn_prf(average, "true nor predicted", "F-score is", len(true_sum)) + + # if tp == 0 F will be 1 only if all predictions are zero, all labels are + # zero, and zero_division=1. In all other case, 0 + if np.isposinf(beta): + f_score = recall + else: + denom = beta2 * precision + recall + + denom[denom == 0.0] = 1 # avoid division by 0 + f_score = (1 + beta2) * precision * recall / denom + + # Average the results + if average == "weighted": + weights = true_sum + sum_weights, sum_pred_sum = fetch( + execute(weights.sum(), pred_sum.sum(), **exec_kw), **exec_kw + ) + if sum_weights == 0: + zero_division_value = 0.0 if zero_division in ["warn", 0] else 1.0 + # precision is zero_division if there are no positive predictions + # recall is zero_division if there are no positive labels + # fscore is zero_division if all labels AND predictions are + # negative + return ( + mt.scalar(zero_division_value if sum_pred_sum == 0 else 0), + mt.scalar(zero_division_value), + mt.scalar(zero_division_value if sum_pred_sum == 0 else 0), + None, + ) + + elif average == "samples": + weights = sample_weight + else: + weights = None + + if average is not None: + assert average != "binary" or len(precision) == 1 + precision = mt.average(precision, weights=weights) + recall = mt.average(recall, weights=weights) + f_score = mt.average(f_score, weights=weights) + true_sum = None # return no support + + return precision, recall, f_score, true_sum + + +def precision_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the precision + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The best value is 1 and the worst value is 0. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division. If set to + "warn", this acts as 0, but warnings are also raised. + + Returns + ------- + precision : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Precision of the positive class in binary classification or weighted + average of the precision of each class for the multiclass task. + + See also + -------- + precision_recall_fscore_support, multilabel_confusion_matrix + + Examples + -------- + >>> from mars.learn.metrics import precision_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> precision_score(y_true, y_pred, average='macro') + 0.22... + >>> precision_score(y_true, y_pred, average='micro') + 0.33... + >>> precision_score(y_true, y_pred, average='weighted') + 0.22... + >>> precision_score(y_true, y_pred, average=None) + array([0.66..., 0. , 0. ]) + >>> y_pred = [0, 0, 0, 0, 0, 0] + >>> precision_score(y_true, y_pred, average=None) + array([0.33..., 0. , 0. ]) + >>> precision_score(y_true, y_pred, average=None, zero_division=1) + array([0.33..., 1. , 1. ]) + + Notes + ----- + When ``true positive + false positive == 0``, precision returns 0 and + raises ``UndefinedMetricWarning``. 
This behavior can be + modified with ``zero_division``. + + """ + p, _, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("precision",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return p + + +def recall_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the recall + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The best value is 1 and the worst value is 0. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division. If set to + "warn", this acts as 0, but warnings are also raised. + + Returns + ------- + recall : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Recall of the positive class in binary classification or weighted + average of the recall of each class for the multiclass task. 
+ + See also + -------- + precision_recall_fscore_support, balanced_accuracy_score, + multilabel_confusion_matrix + + Examples + -------- + >>> from mars.learn.metrics import recall_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> recall_score(y_true, y_pred, average='macro') + 0.33... + >>> recall_score(y_true, y_pred, average='micro') + 0.33... + >>> recall_score(y_true, y_pred, average='weighted') + 0.33... + >>> recall_score(y_true, y_pred, average=None) + array([1., 0., 0.]) + >>> y_true = [0, 0, 0, 0, 0, 0] + >>> recall_score(y_true, y_pred, average=None) + array([0.5, 0. , 0. ]) + >>> recall_score(y_true, y_pred, average=None, zero_division=1) + array([0.5, 1. , 1. ]) + + Notes + ----- + When ``true positive + false negative == 0``, recall returns 0 and raises + ``UndefinedMetricWarning``. This behavior can be modified with + ``zero_division``. + """ + _, r, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("recall",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return r + + +def f1_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the F1 score, also known as balanced F-score or F-measure + + The F1 score can be interpreted as a weighted average of the precision and + recall, where an F1 score reaches its best value at 1 and worst score at 0. + The relative contribution of precision and recall to the F1 score are + equal. The formula for the F1 score is:: + + F1 = 2 * (precision * recall) / (precision + recall) + + In the multi-class and multi-label case, this is the average of + the F1 score of each class with weighting depending on the ``average`` + parameter. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. 
This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. If set to "warn", this acts as 0, + but warnings are also raised. + + Returns + ------- + f1_score : float or array of float, shape = [n_unique_labels] + F1 score of the positive class in binary classification or weighted + average of the F1 scores of each class for the multiclass task. + + See also + -------- + fbeta_score, precision_recall_fscore_support, jaccard_score, + multilabel_confusion_matrix + + References + ---------- + .. [1] `Wikipedia entry for the F1-score + `_ + + Examples + -------- + >>> from mars.learn.metrics import f1_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> f1_score(y_true, y_pred, average='macro') + 0.26... + >>> f1_score(y_true, y_pred, average='micro') + 0.33... + >>> f1_score(y_true, y_pred, average='weighted') + 0.26... + >>> f1_score(y_true, y_pred, average=None) + array([0.8, 0. , 0. ]) + >>> y_true = [0, 0, 0, 0, 0, 0] + >>> y_pred = [0, 0, 0, 0, 0, 0] + >>> f1_score(y_true, y_pred, zero_division=1) + 1.0... + + Notes + ----- + When ``true positive + false positive == 0``, precision is undefined; + When ``true positive + false negative == 0``, recall is undefined. + In such cases, by default the metric will be set to 0, as will f-score, + and ``UndefinedMetricWarning`` will be raised. This behavior can be + modified with ``zero_division``. + """ + return fbeta_score( + y_true, + y_pred, + beta=1, + labels=labels, + pos_label=pos_label, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + ) + + +def fbeta_score( + y_true, + y_pred, + *, + beta, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn" +): + """Compute the F-beta score + + The F-beta score is the weighted harmonic mean of precision and recall, + reaching its optimal value at 1 and its worst value at 0. + + The `beta` parameter determines the weight of recall in the combined + score. ``beta < 1`` lends more weight to precision, while ``beta > 1`` + favors recall (``beta -> 0`` considers only precision, ``beta -> +inf`` + only recall). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float + Determines the weight of recall in the combined score. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. 
For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ + 'weighted'] + This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. If set to "warn", this acts as 0, + but warnings are also raised. + + Returns + ------- + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + F-beta score of the positive class in binary classification or weighted + average of the F-beta score of each class for the multiclass task. + + See also + -------- + precision_recall_fscore_support, multilabel_confusion_matrix + + References + ---------- + .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). + Modern Information Retrieval. Addison Wesley, pp. 327-328. + + .. [2] `Wikipedia entry for the F1-score + `_ + + Examples + -------- + >>> from mars.learn.metrics import fbeta_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + 0.23... + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + 0.33... + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + 0.23... + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + array([0.71..., 0. , 0. ]) + + Notes + ----- + When ``true positive + false positive == 0`` or + ``true positive + false negative == 0``, f-score returns 0 and raises + ``UndefinedMetricWarning``. This behavior can be + modified with ``zero_division``. 
+ """ + + _, _, f, _ = precision_recall_fscore_support( + y_true, + y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("f-score",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return f diff --git a/python/xorbits/_mars/learn/metrics/_ranking.py b/python/xorbits/_mars/learn/metrics/_ranking.py new file mode 100644 index 000000000..2a36233ed --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_ranking.py @@ -0,0 +1,791 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from functools import partial + +import numpy as np + +from ... import execute as _execute +from ... import fetch as _fetch +from ... import tensor as mt +from ...utils import cache_tileables +from ..preprocessing import label_binarize +from ..utils._encode import _encode, _unique +from ..utils.checks import assert_all_finite +from ..utils.core import sort_by +from ..utils.multiclass import type_of_target +from ..utils.validation import check_array, check_consistent_length, column_or_1d +from ._base import _average_binary_score, _average_multiclass_ovo_score + + +def auc(x, y, session=None, run_kwargs=None): + """Compute Area Under the Curve (AUC) using the trapezoidal rule + + This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. For an alternative + way to summarize a precision-recall curve, see + :func:`average_precision_score`. + + Parameters + ---------- + x : tensor, shape = [n] + x coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : tensor, shape = [n] + y coordinates. 
+ + Returns + ------- + auc : tensor, with float value + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn import metrics + >>> y = mt.array([1, 1, 2, 2]) + >>> pred = mt.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) + >>> metrics.auc(fpr, tpr) + 0.75 + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + average_precision_score : Compute average precision from prediction scores + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + """ + check_consistent_length(x, y) + x = column_or_1d(x) + y = column_or_1d(y) + + if x.shape[0] < 2: + raise ValueError( + "At least 2 points are needed to compute" + f" area under curve, but x.shape = {x.shape}" + ) + + direction = 1 + dx = mt.diff(x) + any_dx_lt_0 = mt.any(dx < 0) + all_dx_le_0 = mt.all(dx <= 0) + mt.ExecutableTuple([x, any_dx_lt_0, all_dx_le_0]).execute( + session=session, **(run_kwargs or dict()) + ) + if any_dx_lt_0.fetch(session=session): + if all_dx_le_0.fetch(session=session): + direction = -1 + else: + x_data = x.fetch(session=session) + raise ValueError(f"x is neither increasing nor decreasing : {x_data}.") + + area = direction * mt.trapz(y, x) + return area.execute(session=session, **(run_kwargs or dict())) + + +def _binary_clf_curve( + y_true, y_score, pos_label=None, sample_weight=None, session=None, run_kwargs=None +): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : tensor, shape = [n_samples] + True targets of binary classification + + y_score : tensor, shape = [n_samples] + Estimated probabilities or decision function + + pos_label : int or str, default=None + The label of the positive class + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + fps : tensor, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : tensor, shape = [n_thresholds <= len(mt.unique(y_score))] + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : tensor, shape = [n_thresholds] + Decreasing score values. + """ + y_type = type_of_target(y_true).to_numpy(session=session, **(run_kwargs or dict())) + if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)): + raise ValueError(f"{y_type} format is not supported") + + check_consistent_length( + y_true, y_score, sample_weight, session=session, **(run_kwargs or dict()) + ) + y_true = column_or_1d(y_true) + y_score = column_or_1d(y_score) + y_true = assert_all_finite(y_true, check_only=False) + y_score = assert_all_finite(y_score, check_only=False) + + cache_tileables(y_true, y_score) + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + + # ensure binary classification if pos_label is not specified + # classes.dtype.kind in ('O', 'U', 'S') is required to avoid + # triggering a FutureWarning by calling np.array_equal(a, b) + # when elements in the two arrays are not comparable. 
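    # Worked example of the outputs (illustrative values, not a doctest):
    # with y_true = [1, 1, 0, 1], y_score = [0.9, 0.8, 0.7, 0.6] and
    # pos_label=1, sorting by decreasing score gives
    #   thresholds = [0.9, 0.8, 0.7, 0.6]
    #   tps        = [1,   2,   2,   3  ]
    #   fps        = [0,   0,   1,   1  ]
    # i.e. at threshold 0.7 two positives and one negative score >= 0.7.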
+ classes = mt.unique(y_true, aggregate_size=1).to_numpy( + session=session, **(run_kwargs or dict()) + ) + if pos_label is None and ( + classes.dtype.kind in ("O", "U", "S") + or not ( + np.array_equal(classes, [0, 1]) + or np.array_equal(classes, [-1, 1]) + or np.array_equal(classes, [0]) + or np.array_equal(classes, [-1]) + or np.array_equal(classes, [1]) + ) + ): + classes_repr = ", ".join(repr(c) for c in classes) + raise ValueError( + f"y_true takes value in {{{classes_repr}}} and " + "pos_label is not specified: either make y_true " + "take value in {{0, 1}} or {{-1, 1}} or " + "pass pos_label explicitly." + ) + elif pos_label is None: + pos_label = 1.0 + + # make y_true a boolean vector + y_true = y_true == pos_label + + # sort scores and corresponding truth values + # original implementation adopted from sklearn: + # """ + # desc_score_indices = mt.argsort(y_score, kind="mergesort")[::-1] + # y_score = y_score[desc_score_indices] + # y_true = y_true[desc_score_indices] + # if sample_weight is not None: + # weight = sample_weight[desc_score_indices] + # else: + # weight = 1.0 + # """ + # since fancy indexing is a heavy operation, we try to use DataFrame to sort + to_sort = [y_score, y_true] + if sample_weight is not None: + to_sort.append(sample_weight) + to_sort = sort_by(to_sort, y_score, ascending=False) + y_score, y_true = to_sort[:2] + if sample_weight is not None: + weight = to_sort[-1] + else: + weight = 1.0 + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = mt.where(mt.diff(y_score))[0] + threshold_idxs = mt.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + # raw tps from sklearn implementation + # we try to perform only one fancy index + # tps = (y_true * weight).cumsum()[threshold_idxs] + temp_tps = (y_true * weight).cumsum() + if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presence of floating point errors + # fps = ((1 - y_true) * weight).cumsum()[threshold_idxs] + temp_fps = ((1 - y_true) * weight).cumsum() + tps, fps, thresholds = mt.stack([temp_tps, temp_fps, y_score])[ + :, threshold_idxs + ] + + else: + tps, thresholds = mt.stack([temp_tps, y_score])[:, threshold_idxs] + fps = 1 + threshold_idxs - tps + return _execute([fps, tps, thresholds], session=session, **(run_kwargs or dict())) + + +def _binary_roc_auc_score( + y_true, y_score, sample_weight=None, max_fpr=None, session=None, run_kwargs=None +): + """Binary roc auc score.""" + + from numpy import interp + + if len(mt.unique(y_true).execute()) != 2: + raise ValueError( + "Only one class present in y_true. ROC AUC score " + "is not defined in that case." 
+ ) + + fpr, tpr, _ = roc_curve( + y_true, + y_score, + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + fpr, tpr = mt.ExecutableTuple([fpr, tpr]).fetch(session=session) + + if max_fpr is None or max_fpr == 1: + return auc(fpr, tpr, session=session, run_kwargs=run_kwargs).fetch( + session=session + ) + if max_fpr <= 0 or max_fpr > 1: + raise ValueError(f"Expected max_fpr in range (0, 1], got: {max_fpr}") + + # Add a single point at max_fpr by linear interpolation + stop = ( + mt.searchsorted(fpr, max_fpr, "right") + .execute(session=session, **(run_kwargs or dict())) + .fetch(session=session) + ) + x_interp = [fpr[stop - 1], fpr[stop]] + y_interp = [tpr[stop - 1], tpr[stop]] + tpr = list(tpr[:stop]) + tpr.append(interp(max_fpr, x_interp, y_interp)) + fpr = list(fpr[:stop]) + fpr.append(max_fpr) + partial_auc = auc(fpr, tpr, session=session, run_kwargs=run_kwargs) + + # McClish correction: standardize result to be 0.5 if non-discriminant + # and 1 if maximal + min_area = 0.5 * max_fpr**2 + max_area = max_fpr + return 0.5 * ( + 1 + (partial_auc.fetch(session=session) - min_area) / (max_area - min_area) + ) + + +def roc_auc_score( + y_true, + y_score, + *, + average="macro", + sample_weight=None, + max_fpr=None, + multi_class="raise", + labels=None, + session=None, + run_kwargs=None, +): + """ + Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. + + Note: this implementation can be used with binary, multiclass and + multilabel classification, but some restrictions apply (see Parameters). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True labels or binary label indicators. The binary and multiclass cases + expect labels with shape (n_samples,) while the multilabel case expects + binary label indicators with shape (n_samples, n_classes). + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. + + * In the binary case, it corresponds to an array of shape + `(n_samples,)`. Both probability estimates and non-thresholded + decision values can be provided. The probability estimates correspond + to the **probability of the class with the greater label**, + i.e. `estimator.classes_[1]` and thus + `estimator.predict_proba(X, y)[:, 1]`. The decision values + corresponds to the output of `estimator.decision_function(X, y)`. + See more information in the :ref:`User guide `; + * In the multiclass case, it corresponds to an array of shape + `(n_samples, n_classes)` of probability estimates provided by the + `predict_proba` method. The probability estimates **must** + sum to 1 across the possible classes. In addition, the order of the + class scores must correspond to the order of ``labels``, + if provided, or else to the numerical or lexicographical order of + the labels in ``y_true``. See more information in the + :ref:`User guide `; + * In the multilabel case, it corresponds to an array of shape + `(n_samples, n_classes)`. Probability estimates are provided by the + `predict_proba` method and the non-thresholded decision values by + the `decision_function` method. The probability estimates correspond + to the **probability of the class with the greater label for each + output** of the classifier. See more information in the + :ref:`User guide `. + + average : {'micro', 'macro', 'samples', 'weighted'} or None, \ + default='macro' + If ``None``, the scores for each class are returned. 
Otherwise, + this determines the type of averaging performed on the data: + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + max_fpr : float > 0 and <= 1, default=None + If not ``None``, the standardized partial AUC [2]_ over the range + [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, + should be either equal to ``None`` or ``1.0`` as AUC ROC partial + computation currently is not supported for multiclass. + + multi_class : {'raise', 'ovr', 'ovo'}, default='raise' + Only used for multiclass targets. Determines the type of configuration + to use. The default value raises an error, so either + ``'ovr'`` or ``'ovo'`` must be passed explicitly. + + ``'ovr'``: + Stands for One-vs-rest. Computes the AUC of each class + against the rest [3]_ [4]_. This + treats the multiclass case in the same way as the multilabel case. + Sensitive to class imbalance even when ``average == 'macro'``, + because class imbalance affects the composition of each of the + 'rest' groupings. + ``'ovo'``: + Stands for One-vs-one. Computes the average AUC of all + possible pairwise combinations of classes [5]_. + Insensitive to class imbalance when + ``average == 'macro'``. + + labels : array-like of shape (n_classes,), default=None + Only used for multiclass targets. List of labels that index the + classes in ``y_score``. If ``None``, the numerical or lexicographical + order of the labels in ``y_true`` is used. + + Returns + ------- + auc : float + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] `Analyzing a portion of the ROC curve. McClish, 1989 + `_ + + .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper + #IS-00-04, Stern School of Business, New York University. + + .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern + Recognition Letters, 27(8), 861-874. + `_ + + .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area + Under the ROC Curve for Multiple Class Classification Problems. + Machine Learning, 45(2), 171-186. + `_ + + See Also + -------- + average_precision_score : Area under the precision-recall curve. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. 
+ + Examples + -------- + Binary case: + + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from mars.learn.metrics import roc_auc_score + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X)[:, 1]) + 0.99... + >>> roc_auc_score(y, clf.decision_function(X)) + 0.99... + + Multiclass case: + + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(solver="liblinear").fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr') + 0.99... + + Multilabel case: + + >>> import numpy as np + >>> from sklearn.datasets import make_multilabel_classification + >>> from sklearn.multioutput import MultiOutputClassifier + >>> X, y = make_multilabel_classification(random_state=0) + >>> clf = MultiOutputClassifier(clf).fit(X, y) + >>> # get a list of n_output containing probability arrays of shape + >>> # (n_samples, n_classes) + >>> y_pred = clf.predict_proba(X) + >>> # extract the positive columns for each output + >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred]) + >>> roc_auc_score(y, y_pred, average=None) + array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...]) + >>> from sklearn.linear_model import RidgeClassifierCV + >>> clf = RidgeClassifierCV().fit(X, y) + >>> roc_auc_score(y, clf.decision_function(X), average=None) + array([0.81..., 0.84... , 0.93..., 0.87..., 0.94...]) + """ + + cache_tileables(y_true, y_score) + + y_type = type_of_target(y_true) + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_score = check_array(y_score, ensure_2d=False) + _execute([y_type, y_true, y_score], session=session, **(run_kwargs or dict())) + y_type = y_type.fetch(session=session) + + def execute(*args): + result = [None] * len(args) + to_execute = dict() + for i, arg in enumerate(args): + if hasattr(arg, "op"): + to_execute[i] = arg + else: + result[i] = arg + if to_execute: + _execute(*to_execute.values(), session=session, **(run_kwargs or dict())) + for i, e in to_execute.items(): + if e.isscalar(): + e = e.fetch(session=session) + result[i] = e + return result[0] if len(result) == 1 else result + + if y_type == "multiclass" or ( + y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 + ): + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.0: + raise ValueError( + "Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr) + ) + if multi_class == "raise": + raise ValueError("multi_class must be in ('ovo', 'ovr')") + return execute( + _multiclass_roc_auc_score( + y_true, y_score, labels, multi_class, average, sample_weight + ) + ) + elif y_type == "binary": + labels = mt.unique(y_true).execute(session=session, **(run_kwargs or dict())) + y_true = label_binarize(y_true, classes=labels, execute=False)[:, 0] + cache_tileables(y_true) + return execute( + _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + ) + else: # multilabel-indicator + return execute( + _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + ) + + +def _multiclass_roc_auc_score( + y_true, + y_score, + labels, + multi_class, + average, + 
sample_weight, + session=None, + run_kwargs=None, +): + # validation of the input y_score + if not mt.allclose(1, y_score.sum(axis=1)).to_numpy( + session=session, **(run_kwargs or dict()) + ): # pragma: no cover + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. they should sum up to 1.0 over classes" + ) + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError( + "average must be one of {0} for multiclass problems".format(average_options) + ) + + multiclass_options = ("ovo", "ovr") + if multi_class not in multiclass_options: + raise ValueError( + "multi_class='{0}' is not supported " + "for multiclass ROC AUC, multi_class must be " + "in {1}".format(multi_class, multiclass_options) + ) + + if labels is not None: + labels = column_or_1d(labels).to_numpy( + session=session, **(run_kwargs or dict()) + ) + classes = _unique(labels).to_numpy(session=session, **(run_kwargs or dict())) + if len(classes) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered") + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format(len(classes), y_score.shape[1]) + ) + if len( + mt.setdiff1d(y_true, classes).execute( + session=session, **(run_kwargs or dict()) + ) + ): + raise ValueError("'y_true' contains labels not in parameter 'labels'") + else: + classes = _unique(y_true).execute(session=session, **(run_kwargs or dict())) + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of classes in y_true not equal to the number of " + "columns in 'y_score'" + ) + + if multi_class == "ovo": + if sample_weight is not None: + raise ValueError( + "sample_weight is not supported " + "for multiclass one-vs-one ROC AUC, " + "'sample_weight' must be None in this case." + ) + y_true_encoded = _encode(y_true, uniques=classes) + # Hand & Till (2001) implementation (ovo) + return _average_multiclass_ovo_score( + _binary_roc_auc_score, + y_true_encoded, + y_score, + average=average, + session=session, + run_kwargs=run_kwargs, + ) + else: + # ovr is same as multi-label + y_true_multilabel = label_binarize(y_true, classes=classes, execute=False) + return _average_binary_score( + _binary_roc_auc_score, + y_true_multilabel, + y_score, + average, + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + + +def roc_curve( + y_true, + y_score, + pos_label=None, + sample_weight=None, + drop_intermediate=True, + session=None, + run_kwargs=None, +): + """Compute Receiver operating characteristic (ROC) + + Note: this implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + y_true : tensor, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : tensor, shape = [n_samples] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + pos_label : int or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. 
+ + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : boolean, optional (default=True) + Whether to drop some suboptimal thresholds which would not appear + on a plotted ROC curve. This is useful in order to create lighter + ROC curves. + + .. versionadded:: 0.17 + parameter *drop_intermediate*. + + Returns + ------- + fpr : tensor, shape = [>2] + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= thresholds[i]. + + tpr : tensor, shape = [>2] + Increasing true positive rates such that element i is the true + positive rate of predictions with score >= thresholds[i]. + + thresholds : tensor, shape = [n_thresholds] + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + + Notes + ----- + Since the thresholds are sorted from low to high values, they + are reversed upon returning them to ensure they correspond to both ``fpr`` + and ``tpr``, which are sorted in reversed order during their calculation. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn import metrics + >>> y = mt.array([1, 1, 2, 2]) + >>> scores = mt.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) + >>> fpr + array([0. , 0. , 0.5, 0.5, 1. ]) + >>> tpr + array([0. , 0.5, 0.5, 1. , 1. ]) + >>> thresholds + array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ]) + + """ + from sklearn.exceptions import UndefinedMetricWarning + + cache_tileables(y_true, y_score) + + fps, tps, thresholds = _binary_clf_curve( + y_true, + y_score, + pos_label=pos_label, + sample_weight=sample_weight, + session=session, + run_kwargs=run_kwargs, + ) + + # Attempt to drop thresholds corresponding to points in between and + # collinear with other points. These are always suboptimal and do not + # appear on a plotted ROC curve (and thus do not affect the AUC). + # Here mt.diff(_, 2) is used as a "second derivative" to tell if there + # is a corner at the point. Both fps and tps must be tested to handle + # thresholds with multiple data points (which are combined in + # _binary_clf_curve). This keeps all cases where the point should be kept, + # but does not drop more complicated cases like fps = [1, 3, 7], + # tps = [1, 2, 4]; there is no harm in keeping too many thresholds. 
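    # Worked example of the corner test (illustrative values): for
    #   fps = [0, 1, 2, 3], tps = [1, 2, 3, 4]
    # mt.diff(fps, 2) and mt.diff(tps, 2) are both [0, 0], so the two
    # interior points are collinear and only indices [0, 3] are kept;
    # a point is retained whenever either second difference is non-zero.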
+ if drop_intermediate and len(fps) > 2: + optimal_idxs = mt.where( + mt.r_[True, mt.logical_or(mt.diff(fps, 2), mt.diff(tps, 2)), True] + )[0] + # original implementation of sklearn: + # """ + # fps = fps[optimal_idxs] + # tps = tps[optimal_idxs] + # thresholds = thresholds[optimal_idxs] + # """ + # however, it's really a heavy operation to perform fancy index, + # thus we put them together + stacked = mt.stack([fps, tps, thresholds]) + fps, tps, thresholds = stacked[:, optimal_idxs] + + # Add an extra threshold position + # to make sure that the curve starts at (0, 0) + tps = mt.r_[0, tps] + fps = mt.r_[0, fps] + thresholds = mt.r_[thresholds[0] + 1, thresholds] + + last_fps = fps[-1] + last_tps = tps[-1] + _execute( + [tps, fps, last_fps, last_tps, thresholds], + session=session, + **(run_kwargs or dict()), + ) + last_fps, last_tps = _fetch([last_fps, last_tps], session=session) + + if last_fps <= 0: + warnings.warn( + "No negative samples in y_true, " + "false positive value should be meaningless", + UndefinedMetricWarning, + ) + fpr = mt.repeat(mt.nan, fps.shape) + else: + fpr = fps / last_fps + + if last_tps <= 0: + warnings.warn( + "No positive samples in y_true, " + "true positive value should be meaningless", + UndefinedMetricWarning, + ) + tpr = mt.repeat(mt.nan, tps.shape) + else: + tpr = tps / last_tps + + ret = mt.ExecutableTuple([fpr, tpr, thresholds]).execute( + session=session, **(run_kwargs or dict()) + ) + return ret diff --git a/python/xorbits/_mars/learn/metrics/_regresssion.py b/python/xorbits/_mars/learn/metrics/_regresssion.py new file mode 100644 index 000000000..bdbd0f7c1 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_regresssion.py @@ -0,0 +1,248 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np +from sklearn.exceptions import UndefinedMetricWarning + +from ... import execute +from ... import tensor as mt +from ..utils.validation import ( + _num_samples, + check_array, + check_consistent_length, + column_or_1d, +) + + +def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): + """Check that y_true and y_pred belong to the same regression task. + + Parameters + ---------- + y_true : array-like + + y_pred : array-like + + multioutput : array-like or string in ['raw_values', uniform_average', + 'variance_weighted'] or None + None is accepted due to backward compatibility of r2_score(). + + Returns + ------- + type_true : one of {'continuous', continuous-multioutput'} + The type of the true target data, as output by + 'utils.multiclass.type_of_target'. + + y_true : array-like of shape (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples, n_outputs) + Estimated target values. 
+ + multioutput : array-like of shape (n_outputs) or string in ['raw_values', + uniform_average', 'variance_weighted'] or None + Custom output weights if ``multioutput`` is array-like or + just the corresponding argument if ``multioutput`` is a + correct keyword. + + dtype : str or list, default="numeric" + the dtype argument passed to check_array. + """ + check_consistent_length(y_true, y_pred) + y_true = check_array(y_true, ensure_2d=False, dtype=dtype) + y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype) + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if y_pred.ndim == 1: + y_pred = y_pred.reshape((-1, 1)) + + if y_true.shape[1] != y_pred.shape[1]: + raise ValueError( + "y_true and y_pred have different number of output " + "({0}!={1})".format(y_true.shape[1], y_pred.shape[1]) + ) + + n_outputs = y_true.shape[1] + allowed_multioutput_str = ("raw_values", "uniform_average", "variance_weighted") + if isinstance(multioutput, str): + if multioutput not in allowed_multioutput_str: + raise ValueError( + "Allowed 'multioutput' string values are {}. " + "You provided multioutput={!r}".format( + allowed_multioutput_str, multioutput + ) + ) + elif multioutput is not None: + multioutput = check_array(multioutput, ensure_2d=False) + if n_outputs == 1: + raise ValueError("Custom weights are useful only in multi-output cases.") + elif n_outputs != len(multioutput): + raise ValueError( + ("There must be equally many custom weights (%d) as outputs (%d).") + % (len(multioutput), n_outputs) + ) + y_type = "continuous" if n_outputs == 1 else "continuous-multioutput" + + return y_type, y_true, y_pred, multioutput + + +def r2_score( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", + session=None, + run_kwargs=None +): + """:math:`R^2` (coefficient of determination) regression score function. + + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a :math:`R^2` score of 0.0. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, \ + array-like of shape (n_outputs,) or None, default='uniform_average' + + Defines aggregating of multiple output scores. + Array-like value defines weights used to average scores. + Default is "uniform_average". + + 'raw_values' : + Returns a full set of scores in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + 'variance_weighted' : + Scores of all outputs are averaged, weighted by the variances + of each individual output. + + Returns + ------- + z : float or tensor of floats + The :math:`R^2` score or ndarray of scores if 'multioutput' is + 'raw_values'. + + Notes + ----- + This is not a symmetric function. + + Unlike most other scores, :math:`R^2` score may be negative (it need not + actually be the square of a quantity R). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. 
[1] `Wikipedia entry on the Coefficient of determination + `_ + + Examples + -------- + >>> from mars.learn.metrics import r2_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> r2_score(y_true, y_pred) + 0.948... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred, + ... multioutput='variance_weighted') + 0.938... + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> r2_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> r2_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [3, 2, 1] + >>> r2_score(y_true, y_pred) + -3.0 + """ + _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput) + check_consistent_length(y_true, y_pred, sample_weight) + + if _num_samples(y_pred) < 2: + msg = "R^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + weight = sample_weight[:, np.newaxis] + else: + weight = 1.0 + + numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64) + denominator = ( + weight * (y_true - mt.average(y_true, axis=0, weights=sample_weight)) ** 2 + ).sum(axis=0, dtype=np.float64) + nonzero_denominator = denominator != 0 + nonzero_numerator = numerator != 0 + valid_score = nonzero_denominator & nonzero_numerator + output_scores = mt.ones((y_true.shape[1],)) + output_scores[valid_score] = 1 - (numerator[valid_score] / denominator[valid_score]) + # arbitrary set to zero to avoid -inf scores, having a constant + # y_true is not interesting for scoring a regression anyway + output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0 + if isinstance(multioutput, str): + if multioutput == "raw_values": + # return scores individually + return output_scores + elif multioutput == "uniform_average": + # passing None as weights results is uniform mean + avg_weights = None + elif multioutput == "variance_weighted": + avg_weights = denominator + # avoid fail on constant y or one-element arrays + cond1 = mt.any(nonzero_denominator) + execute( + cond1, nonzero_denominator, session=session, **(run_kwargs or dict()) + ) + if not cond1.fetch(): + if not mt.any(nonzero_numerator).to_numpy( + session=session, **(run_kwargs or dict()) + ): + return 1.0 + else: + return 0.0 + else: + avg_weights = multioutput + + return mt.average(output_scores, weights=avg_weights).execute( + session=session, **(run_kwargs or dict()) + ) diff --git a/python/xorbits/_mars/learn/metrics/_scorer.py b/python/xorbits/_mars/learn/metrics/_scorer.py new file mode 100644 index 000000000..61e01dfa7 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/_scorer.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Union + +from sklearn.metrics import make_scorer + +from . 
import accuracy_score, log_loss, r2_score + +accuracy_score = make_scorer(accuracy_score) +r2_score = make_scorer(r2_score) +neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True) + + +SCORERS = dict( + r2=r2_score, + accuracy=accuracy_score, + neg_log_loss=neg_log_loss_scorer, +) + + +def get_scorer(score_func: Union[str, Callable], **kwargs) -> Callable: + """ + Get a scorer from string + + Parameters + ---------- + score_func : str | callable + scoring method as string. If callable it is returned as is. + + Returns + ------- + scorer : callable + The scorer. + """ + if isinstance(score_func, str): + try: + scorer = SCORERS[score_func] + except KeyError: + raise ValueError( + "{} is not a valid scoring value. " + "Valid options are {}".format(score_func, sorted(SCORERS)) + ) + return scorer + else: + return make_scorer(score_func, **kwargs) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/__init__.py b/python/xorbits/_mars/learn/metrics/pairwise/__init__.py new file mode 100644 index 000000000..b554de019 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cosine import cosine_distances, cosine_similarity +from .euclidean import euclidean_distances +from .haversine import haversine_distances +from .manhattan import manhattan_distances +from .pairwise import PAIRWISE_DISTANCE_FUNCTIONS, pairwise_distances +from .pairwise_distances_topk import pairwise_distances_topk +from .rbf_kernel import rbf_kernel diff --git a/python/xorbits/_mars/learn/metrics/pairwise/core.py b/python/xorbits/_mars/learn/metrics/pairwise/core.py new file mode 100644 index 000000000..37c0a81e6 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/core.py @@ -0,0 +1,184 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ....core import recursive_tile +from ....serialization.serializables import Int64Field +from ....tensor import tensor as astensor +from ....tensor.operands import TensorOperand, TensorOperandMixin +from ....utils import has_unknown_shape +from ...utils import check_array + + +class PairwiseDistances(TensorOperand, TensorOperandMixin): + _op_module_ = "learn" + + chunk_store_limit = Int64Field("chunk_store_limit") + + @staticmethod + def _return_float_dtype(X, Y): + """ + 1. If dtype of X and Y is float32, then dtype float32 is returned. + 2. 
Else dtype float is returned. + """ + + X = astensor(X) + + if Y is None: + Y_dtype = X.dtype + else: + Y = astensor(Y) + Y_dtype = Y.dtype + + if X.dtype == Y_dtype == np.float32: + dtype = np.float32 + else: + dtype = float + + return X, Y, dtype + + @staticmethod + def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): + X, Y, dtype_float = PairwiseDistances._return_float_dtype(X, Y) + + estimator = "check_pairwise_arrays" + if dtype is None: + dtype = dtype_float + + if Y is X or Y is None: + X = Y = check_array(X, accept_sparse=True, dtype=dtype, estimator=estimator) + else: + X = check_array(X, accept_sparse=True, dtype=dtype, estimator=estimator) + Y = check_array(Y, accept_sparse=True, dtype=dtype, estimator=estimator) + + if precomputed: + if X.shape[1] != Y.shape[0]: + raise ValueError( + "Precomputed metric requires shape " + f"(n_queries, n_indexed). Got ({X.shape[0]}, {X.shape[1]}) " + f"for {Y.shape[0]} indexed." + ) + elif X.shape[1] != Y.shape[1]: + raise ValueError( + "Incompatible dimension for X and Y matrices: " + f"X.shape[1] == {X.shape[1]} while Y.shape[1] == {Y.shape[1]}" + ) + + return X, Y + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + [op.x.chunks[0], op.y.chunks[0]], + shape=out.shape, + order=out.order, + index=(0, 0), + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=tuple((s,) for s in out.shape), + ) + + @classmethod + def _tile_chunks(cls, op, x, y): + out = op.outputs[0] + out_chunks = [] + for idx in itertools.product(range(x.chunk_shape[0]), range(y.chunk_shape[0])): + xi, yi = idx + + chunk_op = op.copy().reset_key() + chunk_inputs = [x.cix[xi, 0], y.cix[yi, 0]] + out_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=( + chunk_inputs[0].shape[0], + chunk_inputs[1].shape[0], + ), + order=out.order, + index=idx, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=(x.nsplits[0], y.nsplits[0]), + ) + + @classmethod + def _rechunk_cols_into_one(cls, x, y): + y_is_x = y is x + if x.chunk_shape[1] != 1 or y.chunk_shape[1] != 1: + if has_unknown_shape([x, y]): + yield + + x = yield from recursive_tile(x.rechunk({1: x.shape[1]})) + if y_is_x: + y = x + else: + y = yield from recursive_tile(y.rechunk({1: y.shape[1]})) + + return x, y + + @classmethod + def _adjust_chunk_sizes(cls, op, X, Y, out): + max_x_chunk_size = max(X.nsplits[0]) + max_y_chunk_size = max(Y.nsplits[0]) + itemsize = out.dtype.itemsize + max_chunk_bytes = max_x_chunk_size * max_y_chunk_size * itemsize + chunk_store_limit = op.chunk_store_limit * 2 # scale 2 times + if max_chunk_bytes > chunk_store_limit: + adjust_succeeded = False + # chunk is too huge, try to rechunk X and Y + if X.shape[0] > Y.shape[0]: + # y is smaller, rechunk y is more efficient + expected_y_chunk_size = max( + int(chunk_store_limit / itemsize / max_x_chunk_size), 1 + ) + if ( + max_x_chunk_size * expected_y_chunk_size * itemsize + <= chunk_store_limit + ): + adjust_succeeded = True + Y = yield from recursive_tile(Y.rechunk({0: expected_y_chunk_size})) + else: + # x is smaller, rechunk x is more efficient + expected_x_chunk_size = max( + int(chunk_store_limit / itemsize / max_y_chunk_size), 1 + ) + if ( + max_y_chunk_size * expected_x_chunk_size * itemsize + <= chunk_store_limit + ): + adjust_succeeded = True + X = yield from 
recursive_tile(X.rechunk({0: expected_x_chunk_size})) + + if not adjust_succeeded: + expected_chunk_size = max(int(np.sqrt(chunk_store_limit / itemsize)), 1) + X = yield from recursive_tile(X.rechunk({0: expected_chunk_size})) + Y = yield from recursive_tile(Y.rechunk({0: expected_chunk_size})) + + return X, Y diff --git a/python/xorbits/_mars/learn/metrics/pairwise/cosine.py b/python/xorbits/_mars/learn/metrics/pairwise/cosine.py new file mode 100644 index 000000000..8d47beee0 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/cosine.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .... import opcodes as OperandDef +from .... import tensor as mt +from ....core import recursive_tile +from ....serialization.serializables import KeyField +from ....tensor.core import TensorOrder +from ...preprocessing import normalize +from .core import PairwiseDistances + + +class CosineDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_COSINE_DISTANCES + + _x = KeyField("x") + _y = KeyField("y") + + def __init__(self, x=None, y=None, **kw): + super().__init__(_x=x, _y=y, **kw) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._y = self._inputs[1] + + def __call__(self, x, y=None): + x, y = self.check_pairwise_arrays(x, y) + return self.new_tensor( + [x, y], shape=(x.shape[0], y.shape[0]), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + x, y = op.x, op.y + if x is y: + S = cosine_similarity(x) + else: + S = cosine_similarity(x, y) + S = (S * -1) + 1 + S = mt.clip(S, 0, 2) + if x is y: + mt.fill_diagonal(S, 0.0) + return [(yield from recursive_tile(S))] + + +def cosine_similarity(X, Y=None, dense_output=True): + """Compute cosine similarity between samples in X and Y. + + Cosine similarity, or the cosine kernel, computes similarity as the + normalized dot product of X and Y: + + K(X, Y) = / (||X||*||Y||) + + On L2-normalized data, this function is equivalent to linear_kernel. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : Tensor or sparse tensor, shape: (n_samples_X, n_features) + Input data. + + Y : Tensor or sparse tensor, shape: (n_samples_Y, n_features) + Input data. If ``None``, the output will be the pairwise + similarities between all samples in ``X``. + + dense_output : boolean (optional), default True + Whether to return dense output even when the input is sparse. If + ``False``, the output is sparse if both input tensors are sparse. + + Returns + ------- + kernel matrix : Tensor + A tensor with shape (n_samples_X, n_samples_Y). 
+ """ + X, Y = PairwiseDistances.check_pairwise_arrays(X, Y) + + X_normalized = normalize(X, copy=True) + if X is Y: + Y_normalized = X_normalized + else: + Y_normalized = normalize(Y, copy=True) + + K = X_normalized.dot(Y_normalized.T) + if dense_output: + K = K.todense() + return K + + +def cosine_distances(X, Y=None): + """Compute cosine distance between samples in X and Y. + + Cosine distance is defined as 1.0 minus the cosine similarity. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array_like, sparse matrix + with shape (n_samples_X, n_features). + + Y : array_like, sparse matrix (optional) + with shape (n_samples_Y, n_features). + + Returns + ------- + distance matrix : Tensor + A tensor with shape (n_samples_X, n_samples_Y). + + See also + -------- + mars.learn.metrics.pairwise.cosine_similarity + mars.tensor.spatial.distance.cosine : dense matrices only + """ + op = CosineDistances(x=X, y=Y, dtype=np.dtype(np.float64)) + return op(X, y=Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/euclidean.py b/python/xorbits/_mars/learn/metrics/pairwise/euclidean.py new file mode 100644 index 000000000..2b265d201 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/euclidean.py @@ -0,0 +1,253 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .... import opcodes as OperandDef +from .... 
import tensor as mt +from ....config import options +from ....core import recursive_tile +from ....serialization.serializables import BoolField, KeyField +from ....tensor.core import TensorOrder +from ....utils import has_unknown_shape +from ...utils import check_array +from ...utils.extmath import row_norms +from .core import PairwiseDistances + + +class EuclideanDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_EUCLIDEAN_DISTANCES + + _x = KeyField("X") + _y = KeyField("Y") + _x_norm_squared = KeyField("X_norm_squared") + _y_norm_squared = KeyField("Y_norm_squared") + _squared = BoolField("squared") + + def __init__( + self, + x=None, + y=None, + x_norm_squared=None, + y_norm_squared=None, + squared=None, + **kw + ): + super().__init__( + _x=x, + _y=y, + _x_norm_squared=x_norm_squared, + _y_norm_squared=y_norm_squared, + _squared=squared, + **kw + ) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def x_norm_squared(self): + return self._x_norm_squared + + @property + def y_norm_squared(self): + return self._y_norm_squared + + @property + def squared(self): + return self._squared + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + input_iter = iter(self._inputs) + self._x = next(input_iter) + if self._y is not None: + self._y = next(input_iter) + if self._x_norm_squared is not None: + self._x_norm_squared = next(input_iter) + if self._y_norm_squared is not None: + self._y_norm_squared = next(input_iter) + + def __call__(self, X, Y=None, Y_norm_squared=None, X_norm_squared=None): + # If norms are passed as float32, they are unused. If arrays are passed as + # float32, norms needs to be recomputed on upcast chunks. + # TODO: use a float64 accumulator in row_norms to avoid the latter. 
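        # Shape handling below: X_norm_squared may arrive as (1, n_samples_X)
        # or (n_samples_X, 1); the former is transposed so that XX broadcasts
        # as a column against the (n_samples_X, n_samples_Y) distance matrix,
        # while float32 norms are discarded and recomputed from upcast chunks.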
+ if X_norm_squared is not None: + XX = check_array(X_norm_squared) + if XX.shape == (1, X.shape[0]): + XX = XX.T + elif XX.shape != (X.shape[0], 1): + raise ValueError("Incompatible dimensions for X and X_norm_squared") + if XX.dtype == np.float32: + XX = self._x_norm_squared = None + else: + XX = None + + if X is Y and XX is not None: + # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = mt.atleast_2d(Y_norm_squared) + + if YY.shape != (1, Y.shape[0]): + raise ValueError("Incompatible dimensions for Y and Y_norm_squared") + if YY.dtype == np.float32: + YY = self._y_norm_squared = None + else: + YY = None + + inputs = [X, Y] + if XX is not None: + inputs.append(XX) + if YY is not None: + inputs.append(YY) + return self.new_tensor( + inputs, shape=(X.shape[0], Y.shape[0]), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + X, Y = op.x, op.y + out = op.outputs[0] + + if X.dtype == np.float32: + if has_unknown_shape(X, Y): + yield + # rechunk + new_nsplit = max(max(X.nsplits[0]) // 2, 1) + X = yield from recursive_tile(X.rechunk({0: new_nsplit}).astype(np.float64)) + if Y is not X: + new_nsplit = max(max(Y.nsplits[0]) // 2, 1) + Y = yield from recursive_tile( + Y.rechunk({0: new_nsplit}).astype(np.float64) + ) + + XX = op.x_norm_squared + if XX is None: + XX = row_norms(X, squared=True)[:, np.newaxis] + YY = op.y_norm_squared + if YY is None: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + X, Y = yield from cls._adjust_chunk_sizes(op, X, Y, out) + + distances = -2 * X.dot(Y.T) + if distances.issparse(): + distances = distances.todense() + distances += XX + distances += YY + distances = mt.maximum(distances, 0) + + if X is Y or X.key == Y.key: + mt.fill_diagonal(distances, 0) + + distances = distances if op.squared else mt.sqrt(distances) + distances = distances.astype(out.dtype, copy=False) + return [(yield from recursive_tile(distances))] + + +def euclidean_distances( + X, Y=None, Y_norm_squared=None, squared=False, X_norm_squared=None +): + """ + Considering the rows of X (and Y=X) as vectors, compute the + distance matrix between each pair of vectors. + + For efficiency reasons, the euclidean distance between a pair of row + vector x and y is computed as:: + + dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) + + This formulation has two advantages over other ways of computing distances. + First, it is computationally efficient when dealing with sparse data. + Second, if one argument varies but the other remains unchanged, then + `dot(x, x)` and/or `dot(y, y)` can be pre-computed. + + However, this is not the most precise way of doing this computation, and + the distance matrix returned by this function may not be exactly + symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples_1, n_features) + + Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) + + Y_norm_squared : array-like, shape (n_samples_2, ), optional + Pre-computed dot-products of vectors in Y (e.g., + ``(Y**2).sum(axis=1)``) + May be ignored in some cases, see the note below. + + squared : boolean, optional + Return squared Euclidean distances. + + X_norm_squared : array-like, shape = [n_samples_1], optional + Pre-computed dot-products of vectors in X (e.g., + ``(X**2).sum(axis=1)``) + May be ignored in some cases, see the note below. 
+ + Notes + ----- + To achieve better accuracy, `X_norm_squared` and `Y_norm_squared` may be + unused if they are passed as ``float32``. + + Returns + ------- + distances : tensor, shape (n_samples_1, n_samples_2) + + Examples + -------- + >>> from mars.learn.metrics.pairwise import euclidean_distances + >>> X = [[0, 1], [1, 1]] + >>> # distance between rows of X + >>> euclidean_distances(X, X).execute() + array([[0., 1.], + [1., 0.]]) + >>> # get distance to origin + >>> euclidean_distances(X, [[0, 0]]).execute() + array([[1. ], + [1.41421356]]) + + See also + -------- + paired_distances : distances betweens pairs of elements of X and Y. + """ + if X.dtype == np.float32: + if Y is None: + dtype = X.dtype + elif Y.dtype == np.float32: + dtype = np.float32 + else: + dtype = np.float64 + else: + dtype = np.float64 + + X, Y = EuclideanDistances.check_pairwise_arrays(X, Y) + op = EuclideanDistances( + x=X, + y=Y, + x_norm_squared=X_norm_squared, + y_norm_squared=Y_norm_squared, + squared=squared, + dtype=np.dtype(dtype), + chunk_store_limit=options.chunk_store_limit, + ) + return op(X, Y=Y, Y_norm_squared=Y_norm_squared, X_norm_squared=X_norm_squared) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/haversine.py b/python/xorbits/_mars/learn/metrics/pairwise/haversine.py new file mode 100644 index 000000000..8d6d0d534 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/haversine.py @@ -0,0 +1,156 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.neighbors import DistanceMetric as SklearnDistanceMetric +except ImportError: # pragma: no cover + SklearnDistanceMetric = None + +from .... 
import opcodes as OperandDef +from ....core import recursive_tile +from ....serialization.serializables import BoolField, KeyField +from ....tensor.array_utils import as_same_device, device +from ....tensor.core import TensorOrder +from ....tensor.indexing import fill_diagonal +from .core import PairwiseDistances + + +class HaversineDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_HAVERSINE_DISTANCES + + _x = KeyField("x") + _y = KeyField("y") + # for test purpose + _use_sklearn = BoolField("use_sklearn") + + def __init__(self, x=None, y=None, use_sklearn=None, **kw): + super().__init__(_x=x, _y=y, _use_sklearn=use_sklearn, **kw) + if self._use_sklearn is None: + # if not set use_sklearn, will try to use sklearn by default + self._use_sklearn = True + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def use_sklearn(self): + return self._use_sklearn + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._y = self._inputs[1] + + def __call__(self, X, Y=None): + X, Y = self.check_pairwise_arrays(X, Y) + if self._y is None: + self._y = Y + + if X.shape[1] != 2 or Y.shape[1] != 2: + raise ValueError("Haversine distance only valid in 2 dimensions") + if X.issparse() or Y.issparse(): + raise TypeError("Haversine distance requires inputs dense") + + return self.new_tensor( + [X, Y], shape=(X.shape[0], Y.shape[0]), order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + x, y = op.x, op.y + y_is_x = y is x + + if len(x.chunks) == 1 and len(y.chunks) == 1: + return cls._tile_one_chunk(op) + + x, y = yield from cls._rechunk_cols_into_one(x, y) + (ret,) = cls._tile_chunks(op, x, y) + if y_is_x: + fill_diagonal(ret, 0) + return [(yield from recursive_tile(ret))] + + @classmethod + def execute(cls, ctx, op): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np and op.use_sklearn and SklearnDistanceMetric is not None: + # CPU and sklearn installed, delegate computation to sklearn + d = SklearnDistanceMetric.get_metric("haversine").pairwise(x, y) + else: + # try to leverage xp(np, cp) to perform computation + sin_0 = xp.sin(0.5 * (x[:, [0]] - y[:, 0])) + sin_1 = xp.sin(0.5 * (x[:, [1]] - y[:, 1])) + d = 2 * xp.arcsin( + xp.sqrt( + sin_0 * sin_0 + + xp.cos(x[:, [0]]) * xp.cos(y[:, 0]) * sin_1 * sin_1 + ) + ) + + ctx[op.outputs[0].key] = d + + +def haversine_distances(X, Y=None): + """Compute the Haversine distance between samples in X and Y + + The Haversine (or great circle) distance is the angular distance between + two points on the surface of a sphere. The first distance of each point is + assumed to be the latitude, the second is the longitude, given in radians. + The dimension of the data must be 2. + + .. math:: + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) + + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}] + + Parameters + ---------- + X : array_like, shape (n_samples_1, 2) + + Y : array_like, shape (n_samples_2, 2), optional + + Returns + ------- + distance : {Tensor}, shape (n_samples_1, n_samples_2) + + Notes + ----- + As the Earth is nearly spherical, the haversine formula provides a good + approximation of the distance between two points of the Earth surface, with + a less than 1% error on average. 
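A plain-NumPy sketch of the formula used in the `execute` branch above (illustrative only; rows are `[latitude, longitude]` pairs in radians):

```python
import numpy as np

def haversine_sketch(X, Y):
    # 2 * arcsin(sqrt(hav(dlat) + cos(lat1) * cos(lat2) * hav(dlon)))
    sin_0 = np.sin(0.5 * (X[:, [0]] - Y[:, 0]))   # half latitude differences, (n_x, n_y)
    sin_1 = np.sin(0.5 * (X[:, [1]] - Y[:, 1]))   # half longitude differences
    return 2 * np.arcsin(
        np.sqrt(sin_0 * sin_0 + np.cos(X[:, [0]]) * np.cos(Y[:, 0]) * sin_1 * sin_1)
    )

rng = np.random.default_rng(0)
pts = np.c_[rng.uniform(-np.pi / 2, np.pi / 2, 4), rng.uniform(-np.pi, np.pi, 4)]
D = haversine_sketch(pts, pts)
assert np.allclose(D, D.T) and np.allclose(np.diag(D), 0)  # symmetric, zero diagonal
```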
+ + Examples + -------- + We want to calculate the distance between the Ezeiza Airport + (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, France) + + >>> from mars.learn.metrics.pairwise import haversine_distances + >>> bsas = [-34.83333, -58.5166646] + >>> paris = [49.0083899664, 2.53844117956] + >>> result = haversine_distances([bsas, paris]) + >>> (result * 6371000/1000).execute() # multiply by Earth radius to get kilometers + array([[ 0. , 11279.45379464], + [11279.45379464, 0. ]]) + """ + op = HaversineDistances(x=X, y=Y, dtype=np.dtype(np.float64)) + return op(X, Y=Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py new file mode 100644 index 000000000..26c45facb --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py @@ -0,0 +1,180 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.metrics.pairwise import ( + manhattan_distances as sklearn_manhattan_distances, + ) +except ImportError: # pragma: no cover + sklearn_manhattan_distances = None + +from .... import opcodes as OperandDef +from ....core import recursive_tile +from ....serialization.serializables import BoolField, KeyField +from ....tensor.arithmetic import abs as mt_abs +from ....tensor.array_utils import as_same_device, device +from ....tensor.core import TensorOrder +from ....tensor.spatial.distance import cdist +from ....utils import ensure_own_data +from .core import PairwiseDistances + + +class ManhattanDistances(PairwiseDistances): + _op_type_ = OperandDef.PAIRWISE_MANHATTAN_DISTANCES + + _x = KeyField("x") + _y = KeyField("y") + _sum_over_features = BoolField("sum_over_features") + + def __init__(self, x=None, y=None, sum_over_features=None, use_sklearn=None, **kw): + super().__init__( + _x=x, + _y=y, + _sum_over_features=sum_over_features, + _use_sklearn=use_sklearn, + **kw, + ) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def sum_over_features(self): + return self._sum_over_features + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._x = self._inputs[0] + self._y = self._inputs[1] + + def __call__(self, X, Y=None): + X, Y = self.check_pairwise_arrays(X, Y) + if self._y is None: + self._y = Y + + if (X.issparse() or Y.issparse()) and not self._sum_over_features: + raise TypeError( + f"sum_over_features={self._sum_over_features} not supported" + " for sparse matrices" + ) + + if not self._sum_over_features: + shape = (X.shape[0] * Y.shape[0], X.shape[1]) + else: + shape = (X.shape[0], Y.shape[0]) + + return self.new_tensor([X, Y], shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + x, y = op.x, op.y + + if len(x.chunks) == 1 and len(y.chunks) == 1: + return cls._tile_one_chunk(op) + + if x.issparse() or y.issparse(): + assert op.sum_over_features + return cls._tile_chunks(op, x, y) + elif op.sum_over_features: + # 
if x, y are not sparse and `sum_over_features` is True + # just use cdist + return [(yield from recursive_tile(cdist(x, y, "cityblock")))] + else: + d = x[:, np.newaxis, :] - y[np.newaxis, :, :] + d = mt_abs(d) + d = d.reshape((-1, x.shape[1])) + return [(yield from recursive_tile(d))] + + @classmethod + def execute(cls, ctx, op): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + out = op.outputs[0] + + with device(device_id): + if sklearn_manhattan_distances is not None: + ctx[out.key] = sklearn_manhattan_distances( + ensure_own_data(x), + ensure_own_data(y), + sum_over_features=op.sum_over_features, + ) + else: # pragma: no cover + # we cannot support sparse + raise NotImplementedError( + "cannot support calculate manhattan distances on GPU" + ) + + +def manhattan_distances(X, Y=None, sum_over_features=True): + """ Compute the L1 distances between the vectors in X and Y. + + With sum_over_features equal to False it returns the componentwise + distances. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array_like + A tensor with shape (n_samples_X, n_features). + + Y : array_like, optional + A tensor with shape (n_samples_Y, n_features). + + sum_over_features : bool, default=True + If True the function returns the pairwise distance matrix + else it returns the componentwise L1 pairwise-distances. + Not supported for sparse matrix inputs. + + Returns + ------- + D : Tensor + If sum_over_features is False shape is + (n_samples_X * n_samples_Y, n_features) and D contains the + componentwise L1 pairwise-distances (ie. absolute difference), + else shape is (n_samples_X, n_samples_Y) and D contains + the pairwise L1 distances. + + Examples + -------- + >>> from mars.learn.metrics.pairwise import manhattan_distances + >>> manhattan_distances([[3]], [[3]]).execute() #doctest:+ELLIPSIS + array([[0.]]) + >>> manhattan_distances([[3]], [[2]]).execute() #doctest:+ELLIPSIS + array([[1.]]) + >>> manhattan_distances([[2]], [[3]]).execute() #doctest:+ELLIPSIS + array([[1.]]) + >>> manhattan_distances([[1, 2], [3, 4]],\ + [[1, 2], [0, 3]]).execute() #doctest:+ELLIPSIS + array([[0., 2.], + [4., 4.]]) + >>> import mars.tensor as mt + >>> X = mt.ones((1, 2)) + >>> y = mt.full((2, 2), 2.) + >>> manhattan_distances(X, y, sum_over_features=False).execute() #doctest:+ELLIPSIS + array([[1., 1.], + [1., 1.]]) + """ + op = ManhattanDistances( + x=X, y=Y, sum_over_features=sum_over_features, dtype=np.dtype(np.float64) + ) + return op(X, Y=Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py b/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py new file mode 100644 index 000000000..5db1b7848 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
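The `sum_over_features=False` branch of `ManhattanDistances.tile` above relies on a broadcast-and-reshape trick; a small plain-NumPy sketch of the same idea (illustrative only):

```python
import numpy as np

X, Y = np.random.rand(3, 5), np.random.rand(4, 5)

# componentwise L1 distances, shape (n_x * n_y, n_features)
D = np.abs(X[:, np.newaxis, :] - Y[np.newaxis, :, :]).reshape(-1, X.shape[1])

# summing the components back recovers the usual (n_x, n_y) distance matrix
assert np.allclose(D.reshape(3, 4, 5).sum(axis=2),
                   np.abs(X[:, None, :] - Y[None, :, :]).sum(axis=2))
```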
+ +import warnings +from functools import partial + +try: + from sklearn.exceptions import DataConversionWarning +except ImportError: # pragma: no cover + DataConversionWarning = None + +from ....tensor.spatial import distance +from ...utils.validation import check_non_negative +from .core import PairwiseDistances +from .cosine import cosine_distances +from .euclidean import euclidean_distances +from .haversine import haversine_distances +from .manhattan import manhattan_distances + +_VALID_METRICS = [ + "euclidean", + "l2", + "l1", + "manhattan", + "cityblock", + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "kulsinski", + "mahalanobis", + "matching", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalmichener", + "sokalsneath", + "sqeuclidean", + "yule", + "wminkowski", + "haversine", +] + +# Helper functions - distance +PAIRWISE_DISTANCE_FUNCTIONS = { + # If updating this dictionary, update the doc in both distance_metrics() + # and also in pairwise_distances()! + "cityblock": manhattan_distances, + "cosine": cosine_distances, + "euclidean": euclidean_distances, + "haversine": haversine_distances, + "l2": euclidean_distances, + "l1": manhattan_distances, + "manhattan": manhattan_distances, + "precomputed": None, # HACK: precomputed is always allowed, never called +} + +# These distances recquire boolean tensors, when using mars.tensor.spatial.distance +PAIRWISE_BOOLEAN_FUNCTIONS = [ + "dice", + "jaccard", + "kulsinski", + "matching", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", + "yule", +] + + +def pairwise_distances(X, Y=None, metric="euclidean", **kwds): + if ( + metric not in _VALID_METRICS + and not callable(metric) + and metric != "precomputed" + ): + raise ValueError( + f"Unknown metric {metric}. Valid metrics are {_VALID_METRICS}, " + "or 'precomputed', or a callable" + ) + + if metric == "precomputed": + X, _ = PairwiseDistances.check_pairwise_arrays(X, Y, precomputed=True) + + whom = ( + "`pairwise_distances`. Precomputed distance " + " need to have non-negative values." + ) + X = check_non_negative(X, whom=whom) + return X + elif metric in PAIRWISE_DISTANCE_FUNCTIONS: + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + else: + # including when metric is callable + dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None + + if ( + dtype == bool + and (X.dtype != bool or (Y is not None and Y.dtype != bool)) + and DataConversionWarning is not None + ): + msg = f"Data was converted to boolean for metric {metric}" + warnings.warn(msg, DataConversionWarning) + + X, Y = PairwiseDistances.check_pairwise_arrays(X, Y, dtype=dtype) + if X is Y: + return distance.squareform(distance.pdist(X, metric=metric, **kwds)) + func = partial(distance.cdist, metric=metric, **kwds) + + return func(X, Y, **kwds) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/pairwise_distances_topk.py b/python/xorbits/_mars/learn/metrics/pairwise/pairwise_distances_topk.py new file mode 100644 index 000000000..8e261f7cc --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/pairwise_distances_topk.py @@ -0,0 +1,506 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial + +import numpy as np + +from .... import opcodes, options +from ....core import recursive_tile +from ....core.operand import OperandStage +from ....serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int64Field, + KeyField, +) +from ....tensor.array_utils import as_same_device, device, get_array_module +from ....tensor.core import TensorOrder +from ....tensor.merge import TensorConcatenate +from ....utils import ensure_own_data, has_unknown_shape +from ...utils import gen_batches, get_chunk_n_rows +from ...utils.validation import _num_samples +from .core import PairwiseDistances + + +def _precompute_metric_params(X, Y, xp, metric=None, **kwds): # pragma: no cover + """Precompute data-derived metric parameters if not provided""" + if metric == "seuclidean" and "V" not in kwds: + if X is Y: + V = xp.var(X, axis=0, ddof=1) + else: + V = xp.var(xp.vstack([X, Y]), axis=0, ddof=1) + return {"V": V} + if metric == "mahalanobis" and "VI" not in kwds: + if X is Y: + VI = xp.linalg.inv(xp.cov(X.T)).T + else: + VI = xp.linalg.inv(xp.cov(xp.vstack([X, Y]).T)).T + return {"VI": VI} + return {} + + +def _check_chunk_size(reduced, chunk_size): # pragma: no cover + """Checks chunk is a sequence of expected size or a tuple of same""" + if reduced is None: + return + is_tuple = isinstance(reduced, tuple) + if not is_tuple: + reduced = (reduced,) + if any(isinstance(r, tuple) or not hasattr(r, "__iter__") for r in reduced): + raise TypeError( + "reduce_func returned %r. " + "Expected sequence(s) of length %d." + % (reduced if is_tuple else reduced[0], chunk_size) + ) + if any(_num_samples(r) != chunk_size for r in reduced): + actual_size = tuple(_num_samples(r) for r in reduced) + raise ValueError( + "reduce_func returned object of length %s. " + "Expected same length as input: %d." + % (actual_size if is_tuple else actual_size[0], chunk_size) + ) + + +def _pariwise_distance_chunked( + X, Y, reduce_func=None, metric="euclidean", working_memory=None, xp=None, **kwds +): + if xp is np: + from sklearn.metrics import pairwise_distances + else: # pragma: no cover + from cuml.metrics import pairwise_distances + + n_samples_X = _num_samples(X) + if metric == "precomputed": # pragma: no cover + slices = (slice(0, n_samples_X),) + else: + # We get as many rows as possible within our working_memory budget to + # store len(Y) distances in each row of output. + # + # Note: + # - this will get at least 1 row, even if 1 row of distances will + # exceed working_memory. + # - this does not account for any temporary memory usage while + # calculating distances (e.g. difference of vectors in manhattan + # distance. 
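+        #   As an illustration (assuming sklearn's convention that working_memory
+        #   is a budget in MiB): with 10_000 samples in Y each output row costs
+        #   8 * 10_000 = 80_000 bytes, so a 64 MiB budget allows roughly
+        #   64 * 2**20 // 80_000 = 838 rows per chunk.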
+ chunk_n_rows = get_chunk_n_rows( + row_bytes=8 * _num_samples(Y), + max_n_rows=n_samples_X, + working_memory=working_memory, + ) + slices = gen_batches(n_samples_X, chunk_n_rows) + + # precompute data-derived metric params + params = _precompute_metric_params(X, Y, xp, metric=metric, **kwds) + kwds.update(**params) + + for sl in slices: + if sl.start == 0 and sl.stop == n_samples_X: + X_chunk = X # enable optimised paths for X is Y + else: + X_chunk = X[sl] + # call pairwise op's execute method to get the result + D_chunk = pairwise_distances( + ensure_own_data(X_chunk), ensure_own_data(Y), metric=metric, **kwds + ) + if (X is Y or Y is None) and metric == "euclidean": + # zeroing diagonal, taking care of aliases of "euclidean", + # i.e. "l2" + D_chunk.flat[ + sl.start :: _num_samples(X) + 1 + ] = 0 # pylint: disable=invalid-slice-index + if reduce_func is not None: + chunk_size = D_chunk.shape[0] + D_chunk = reduce_func(D_chunk, sl.start) + _check_chunk_size(D_chunk, chunk_size) + yield D_chunk + + +class PairwiseDistancesTopk(PairwiseDistances): + _op_type_ = opcodes.PAIRWISE_DISTANCES_TOPK + + _x = KeyField("x") + _y = KeyField("y") + _k = Int64Field("k") + _metric = AnyField("metric") + _metric_kwargs = DictField("metric_kwargs") + _return_index = BoolField("return_index") + _working_memory = AnyField("working_memory") + # for chunks + _y_offset = Int64Field("y_offset") + + def __init__( + self, + x=None, + y=None, + k=None, + metric=None, + metric_kwargs=None, + return_index=None, + working_memory=None, + **kw + ): + super().__init__( + _x=x, + _y=y, + _k=k, + _metric=metric, + _metric_kwargs=metric_kwargs, + _return_index=return_index, + _working_memory=working_memory, + **kw + ) + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def k(self): + return self._k + + @property + def metric(self): + return self._metric + + @property + def metric_kwargs(self): + return self._metric_kwargs + + @property + def return_index(self): + return self._return_index + + @property + def working_memory(self): + return self._working_memory + + @property + def y_offset(self): + return self._y_offset + + @property + def output_limit(self): + return 1 if not self._return_index or self.stage == OperandStage.map else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.agg: + self._x = self._inputs[0] + self._y = self._inputs[1] + else: + self._x = self._y = None + + def __call__(self, X, Y): + from .pairwise import pairwise_distances + + # leverage pairwise_distances for checks + d = pairwise_distances(X, Y, metric=self._metric, **self._metric_kwargs) + + if self._k > Y.shape[0]: + self._k = Y.shape[0] + + X, Y = d.op.inputs + + shape_list = [X.shape[0]] + shape_list.append(min(Y.shape[0], self._k)) + shape = tuple(shape_list) + kws = [ + { + "shape": shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.float64), + "_type_": "distance", + }, + ] + if self._return_index: + kws.append( + { + "shape": shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "_type_": "index", + } + ) + return self.new_tensors([X, Y], kws=kws) + else: + return self.new_tensors([X, Y], kws=kws)[0] + + @classmethod + def _gen_out_chunks(cls, x_chunk, y_chunk, chunk_op): + k = chunk_op.k + i, j = x_chunk.index[0], y_chunk.index[0] + + distance_chunk_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.float64), + "index": (i, j), + "_type_": "distance", + } 
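+        # Alongside the distance chunk (float64), a second chunk tagged
+        # "_type_": "index" (int64) is produced only when return_index is set,
+        # matching the two outputs declared in __call__.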
+ if chunk_op.return_index: + index_chunk_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": (i, j), + "_type_": "index", + } + distance_chunk, index_chunk = chunk_op.new_chunks( + [x_chunk, y_chunk], kws=[distance_chunk_params, index_chunk_params] + ) + return distance_chunk, index_chunk + else: + return chunk_op.new_chunks([x_chunk, y_chunk], kws=[distance_chunk_params])[ + 0 + ] + + @classmethod + def tile(cls, op: "PairwiseDistancesTopk"): + X, Y = op.x, op.y + k = op.k + + if X.chunk_shape[1] > 1: + X = yield from recursive_tile(X.rechunk({1: X.shape[1]})) + + if has_unknown_shape(Y): + yield + if Y.chunk_shape[1] > 1: + Y = yield from recursive_tile(Y.rechunk({1: Y.shape[1]})) + + out_distance_chunks, out_index_chunks = [], [] + y_acc_chunk_shapes = [0] + np.cumsum(Y.nsplits[0]).tolist() + for i in range(len(range(X.chunk_shape[0]))): + x_chunk = X.cix[i, 0] + y_chunk_shape = Y.chunk_shape[0] + + if y_chunk_shape == 1: + chunk_op = op.copy().reset_key() + y_chunk = Y.chunks[0] + o = cls._gen_out_chunks(x_chunk, y_chunk, chunk_op) + if chunk_op.return_index: + out_distance_chunks.append(o[0]) + out_index_chunks.append(o[1]) + else: + out_distance_chunks.append(o) + else: + to_concat_chunks = [] + concat_size = 0 + for j in range(y_chunk_shape): + y_chunk = Y.cix[j, 0] + chunk_op = op.copy().reset_key() + chunk_op._y_offset = y_acc_chunk_shapes[j] + chunk_op.stage = OperandStage.map + size = min(k, y_chunk.shape[0]) + o = chunk_op.new_chunk( + [x_chunk, y_chunk], + shape=(x_chunk.shape[0], size), + order=TensorOrder.C_ORDER, + index=(i, j), + ) + to_concat_chunks.append(o) + concat_size += size + + concat_op = TensorConcatenate(axis=1, dtype=to_concat_chunks[0].dtype) + concat = concat_op.new_chunk( + to_concat_chunks, + shape=(x_chunk.shape[0], concat_size), + order=TensorOrder.C_ORDER, + index=(i, 0), + ) + + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.agg + distance_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.float64), + "index": (i, 0), + "_type_": "distance", + } + if op.return_index: + index_params = { + "shape": (x_chunk.shape[0], k), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": (i, 0), + "_type": "index", + } + distance_chunk, index_chunk = chunk_op.new_chunks( + [concat], kws=[distance_params, index_params] + ) + out_distance_chunks.append(distance_chunk) + out_index_chunks.append(index_chunk) + else: + out_distance_chunks.append( + chunk_op.new_chunk([concat], kws=[distance_params]) + ) + + new_op = op.copy() + nsplits = (tuple(c.shape[0] for c in out_distance_chunks), (k,)) + params = [o.params for o in op.outputs] + params[0]["chunks"] = out_distance_chunks + params[0]["nsplits"] = nsplits + if op.return_index: + params[1]["chunks"] = out_index_chunks + params[1]["nsplits"] = nsplits + return new_op.new_tensors(op.inputs, kws=params) + + @classmethod + def _topk_reduce_func(cls, dist, start, topk, xp, metric): + """Reduce a chunk of distances to topk + + Parameters + ---------- + dist : array of shape (n_samples_chunk, n_samples) + start : int + The index in X which the first row of dist corresponds to. 
+ topk : int + + Returns + ------- + dist : array of shape (n_samples_chunk, n_neighbors) + neigh : array of shape (n_samples_chunk, n_neighbors) + """ + sample_range = xp.arange(dist.shape[0])[:, None] + if topk - 1 >= dist.shape[1]: + neigh_ind = xp.repeat( + xp.arange(dist.shape[1]).reshape(1, -1), dist.shape[0], axis=0 + ) + else: + neigh_ind = xp.argpartition(dist, topk - 1, axis=1) + neigh_ind = neigh_ind[:, :topk] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[sample_range, xp.argsort(dist[sample_range, neigh_ind])] + return dist[sample_range, neigh_ind], neigh_ind + + @classmethod + def _calcuate_topk_distances(cls, x, y, op, xp): + metric = op.metric + reduce_func = partial(cls._topk_reduce_func, topk=op.k, xp=xp, metric=op.metric) + kwds = op.metric_kwargs or dict() + need_sqrt = False + if metric == "euclidean" and not kwds.get("squared", False): + need_sqrt = True + kwds["squared"] = True + chunked_results = _pariwise_distance_chunked( + x, + y, + reduce_func=reduce_func, + metric=op.metric, + working_memory=op.working_memory, + xp=xp, + **kwds + ) + neigh_dist, neigh_ind = zip(*chunked_results) + dist, ind = np.vstack(neigh_dist), np.vstack(neigh_ind) + if metric == "euclidean" and need_sqrt: + dist = xp.sqrt(dist) + if getattr(op, "y_offset", None) is not None: + ind += op.y_offset + return dist, ind + + @classmethod + def _execute_map(cls, ctx, op: "PairwiseDistancesTopk"): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + xp = get_array_module(x, nosparse=True) + ctx[op.outputs[0].key] = cls._calcuate_topk_distances(x, y, op, xp) + + @classmethod + def _execute_agg(cls, ctx, op: "PairwiseDistancesTopk"): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + inputs = inputs[0] + distances = inputs[0] + + with device(device_id): + dist, ind = cls._topk_reduce_func(distances, 0, op.k, xp, op.metric) + ctx[op.outputs[0].key] = dist + if op.return_index: + inds = inputs[1] + ind_result = xp.empty_like(ind) + for i in range( + len(ind_result) + ): # pylint: disable=consider-using-enumerate + ind_result[i] = inds[i][ind[i]] + ctx[op.outputs[1].key] = ind_result + + @classmethod + def _execute(cls, ctx, op: "PairwiseDistancesTopk"): + (x, y), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + xp = get_array_module(x, nosparse=True) + dist, ind = cls._calcuate_topk_distances(x, y, op, xp) + dist, ind_on_ind = cls._topk_reduce_func(dist, 0, op.k, xp, op.metric) + ctx[op.outputs[0].key] = dist + if op.return_index: + ind_result = xp.empty_like(ind_on_ind) + for i in range( + len(ind_on_ind) + ): # pylint: disable=consider-using-enumerate + ind_result[i] = ind[i][ind_on_ind[i]] + ctx[op.outputs[1].key] = ind_result + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.agg: + return cls._execute_agg(ctx, op) + else: + return cls._execute(ctx, op) + + +def pairwise_distances_topk( + X, + Y=None, + k=None, + metric="euclidean", + return_index=True, + axis=1, + working_memory=None, + **kwds +): + if k is None: # pragma: no cover + raise ValueError("`k` has to be specified") + + if Y is None: + Y = X + if axis == 0: + X, Y = Y, X + if working_memory is None: + working_memory = options.learn.working_memory + op 
= PairwiseDistancesTopk( + x=X, + y=Y, + k=k, + metric=metric, + metric_kwargs=kwds, + return_index=return_index, + working_memory=working_memory, + ) + return op(X, Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/rbf_kernel.py b/python/xorbits/_mars/learn/metrics/pairwise/rbf_kernel.py new file mode 100644 index 000000000..700bbf87d --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/rbf_kernel.py @@ -0,0 +1,51 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import tensor as mt +from .core import PairwiseDistances +from .euclidean import euclidean_distances + + +def rbf_kernel(X, Y=None, gamma=None): + """ + Compute the rbf (gaussian) kernel between X and Y:: + + K(x, y) = exp(-gamma ||x-y||^2) + + for each pair of rows x in X and y in Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : tensor of shape (n_samples_X, n_features) + + Y : tensor of shape (n_samples_Y, n_features) + + gamma : float, default None + If None, defaults to 1.0 / n_features + + Returns + ------- + kernel_matrix : tensor of shape (n_samples_X, n_samples_Y) + """ + + X, Y = PairwiseDistances.check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = euclidean_distances(X, Y, squared=True) + K *= -gamma + K = mt.exp(K) + return K diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/__init__.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_cosine_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_cosine_distances.py new file mode 100644 index 000000000..ad20c032e --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_cosine_distances.py @@ -0,0 +1,50 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.metrics.pairwise import cosine_distances as sk_cosine_distances + +from ..... import tensor as mt +from .. import cosine_distances + +raw_dense_x = np.random.rand(25, 10) +raw_dense_y = np.random.rand(17, 10) + +raw_sparse_x = sps.random(25, 10, density=0.5, format="csr", random_state=0) +raw_sparse_y = sps.random(17, 10, density=0.4, format="csr", random_state=1) + +raw_x_ys = [(raw_dense_x, raw_dense_y), (raw_sparse_x, raw_sparse_y)] + + +@pytest.mark.parametrize("raw_x, raw_y", raw_x_ys) +@pytest.mark.parametrize("chunk_size", [25, 6]) +def test_cosine_distances_execution(setup, raw_x, raw_y, chunk_size): + x = mt.tensor(raw_x, chunk_size=chunk_size) + y = mt.tensor(raw_y, chunk_size=chunk_size) + + d = cosine_distances(x, y) + + result = d.execute().fetch() + expected = sk_cosine_distances(raw_x, raw_y) + + np.testing.assert_almost_equal(np.asarray(result), expected) + + d = cosine_distances(x) + + result = d.execute().fetch() + expected = sk_cosine_distances(raw_x) + + np.testing.assert_almost_equal(np.asarray(result), expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_euclidean_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_euclidean_distances.py new file mode 100644 index 000000000..175c87e51 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_euclidean_distances.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.metrics import euclidean_distances as sk_euclidean_distances + +from ..... import tensor as mt +from .....config import option_context +from .....lib.sparse import SparseNDArray +from ....utils import check_array +from ... 
import euclidean_distances + + +def test_euclidean_distances_op(): + x = mt.random.rand(10, 3) + xx = mt.random.rand(1, 10) + y = mt.random.rand(11, 3) + + d = euclidean_distances(x, X_norm_squared=xx) + assert d.op.x_norm_squared.key == check_array(xx).T.key + + d = euclidean_distances( + x, + y, + X_norm_squared=mt.random.rand(10, 1, dtype=mt.float32), + Y_norm_squared=mt.random.rand(1, 11, dtype=mt.float32), + ) + assert d.op.x_norm_squared is None + assert d.op.y_norm_squared is None + + # XX shape incompatible + with pytest.raises(ValueError): + euclidean_distances(x, X_norm_squared=mt.random.rand(10)) + + # XX shape incompatible + with pytest.raises(ValueError): + euclidean_distances(x, X_norm_squared=mt.random.rand(11, 1)) + + # YY shape incompatible + with pytest.raises(ValueError): + euclidean_distances(x, y, Y_norm_squared=mt.random.rand(10)) + + +def test_euclidean_distances_execution(setup): + dense_raw_x = np.random.rand(30, 10) + dense_raw_y = np.random.rand(40, 10) + sparse_raw_x = SparseNDArray(sps.random(30, 10, density=0.5, format="csr")) + sparse_raw_y = SparseNDArray(sps.random(40, 10, density=0.5, format="csr")) + + for raw_x, raw_y in [(dense_raw_x, dense_raw_y), (sparse_raw_x, sparse_raw_y)]: + x = mt.tensor(raw_x, chunk_size=9) + y = mt.tensor(raw_y, chunk_size=7) + + distance = euclidean_distances(x, y) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw_x, Y=raw_y) + np.testing.assert_almost_equal(result, expected) + + x_norm = x.sum(axis=1)[..., np.newaxis] + y_norm = y.sum(axis=1)[np.newaxis, ...] + distance = euclidean_distances( + x, y, X_norm_squared=x_norm, Y_norm_squared=y_norm + ) + x_raw_norm = raw_x.sum(axis=1)[..., np.newaxis] + y_raw_norm = raw_y.sum(axis=1)[np.newaxis, ...] + + result = distance.execute().fetch() + expected = sk_euclidean_distances( + raw_x, raw_y, X_norm_squared=x_raw_norm, Y_norm_squared=y_raw_norm + ) + np.testing.assert_almost_equal(result, expected) + + x_sq = (x**2).astype(np.float32) + y_sq = (y**2).astype(np.float32) + + distance = euclidean_distances(x_sq, y_sq, squared=True) + + x_raw_sq = (raw_x**2).astype(np.float32) + y_raw_sq = (raw_y**2).astype(np.float32) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(x_raw_sq, y_raw_sq, squared=True) + np.testing.assert_almost_equal(result, expected, decimal=6) + + # test x is y + distance = euclidean_distances(x) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw_x) + + np.testing.assert_almost_equal(result, expected) + + # test size adjust + raw1 = np.random.rand(12, 4) + raw2 = np.random.rand(18, 4) + + t1 = mt.tensor(raw1, chunk_size=4) + t2 = mt.tensor(raw2, chunk_size=6) + with option_context({"chunk_store_limit": 80}): + distance = euclidean_distances(t1, t2) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw1, raw2) + np.testing.assert_almost_equal(result, expected) + + distance = euclidean_distances(t2, t1) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw2, raw1) + np.testing.assert_almost_equal(result, expected) + + with option_context({"chunk_store_limit": 20}): + distance = euclidean_distances(t1, t2) + + result = distance.execute().fetch() + expected = sk_euclidean_distances(raw1, raw2) + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_haversine_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_haversine_distances.py new file mode 100644 
index 000000000..7b78e42ad --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_haversine_distances.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.metrics.pairwise import haversine_distances as sk_haversine_distances + +from ..... import tensor as mt +from .. import haversine_distances + + +def test_haversine_distances_op(): + # shape[1] != 2 + with pytest.raises(ValueError): + haversine_distances(mt.random.rand(10, 3)) + + # shape[1] != 2 + with pytest.raises(ValueError): + haversine_distances(mt.random.rand(10, 2), mt.random.rand(11, 3)) + + # cannot support sparse + with pytest.raises(TypeError): + haversine_distances(mt.random.randint(10, size=(10, 2), density=0.5)) + + +raw_x = np.random.rand(30, 2) +raw_y = np.random.rand(21, 2) + +# one chunk +x1 = mt.tensor(raw_x, chunk_size=30) +y1 = mt.tensor(raw_y, chunk_size=30) + +# multiple chunks +x2 = mt.tensor(raw_x, chunk_size=(11, 1)) +y2 = mt.tensor(raw_y, chunk_size=(17, 1)) + + +@pytest.mark.parametrize("x, y", [(x1, y1), (x2, y2)]) +@pytest.mark.parametrize("use_sklearn", [True, False]) +def test_haversine_distances_execution(setup, x, y, use_sklearn): + distance = haversine_distances(x, y) + distance.op._use_sklearn = use_sklearn + + result = distance.execute().fetch() + expected = sk_haversine_distances(raw_x, raw_y) + np.testing.assert_almost_equal(result, expected) + + # test x is y + distance = haversine_distances(x) + distance.op._use_sklearn = use_sklearn + + result = distance.execute().fetch() + expected = sk_haversine_distances(raw_x, raw_x) + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py new file mode 100644 index 000000000..70541eea7 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.metrics.pairwise import manhattan_distances as sk_manhattan_distances + +from ..... import tensor as mt +from .. 
import manhattan_distances + + +def test_manhattan_distances(): + x = mt.random.randint(10, size=(10, 3), density=0.4) + y = mt.random.randint(10, size=(11, 3), density=0.5) + + with pytest.raises(TypeError): + manhattan_distances(x, y, sum_over_features=False) + + x = x.todense() + y = y.todense() + + d = manhattan_distances(x, y, sum_over_features=True) + assert d.shape == (10, 11) + d = manhattan_distances(x, y, sum_over_features=False) + assert d.shape == (110, 3) + + +raw_x = np.random.rand(20, 5) +raw_y = np.random.rand(21, 5) + +x1 = mt.tensor(raw_x, chunk_size=30) +y1 = mt.tensor(raw_y, chunk_size=30) + +x2 = mt.tensor(raw_x, chunk_size=11) +y2 = mt.tensor(raw_y, chunk_size=12) + +raw_sparse_x = sps.random(20, 5, density=0.4, format="csr", random_state=0) +raw_sparse_y = sps.random(21, 5, density=0.3, format="csr", random_state=0) + +x3 = mt.tensor(raw_sparse_x, chunk_size=30) +y3 = mt.tensor(raw_sparse_y, chunk_size=30) + +x4 = mt.tensor(raw_sparse_x, chunk_size=11) +y4 = mt.tensor(raw_sparse_y, chunk_size=12) + + +@pytest.mark.parametrize( + "x, y, is_sparse", + [(x1, y1, False), (x2, y2, False), (x3, y3, True), (x4, y4, True)], +) +def test_manhattan_distances_execution(setup, x, y, is_sparse): + if is_sparse: + rx, ry = raw_sparse_x, raw_sparse_y + else: + rx, ry = raw_x, raw_y + + sv = [True, False] if not is_sparse else [True] + + for sum_over_features in sv: + d = manhattan_distances(x, y, sum_over_features) + + result = d.execute().fetch() + expected = sk_manhattan_distances(rx, ry, sum_over_features=sum_over_features) + + np.testing.assert_almost_equal(result, expected) + + d = manhattan_distances(x, sum_over_features=sum_over_features) + + result = d.execute().fetch() + expected = sk_manhattan_distances(rx, sum_over_features=sum_over_features) + + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_pariwise_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_pariwise_distances.py new file mode 100644 index 000000000..5b61f3683 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_pariwise_distances.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import pairwise_distances as sk_pairwise_distances +from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors + +from ..... import tensor as mt +from .....session import execute, fetch +from ... 
import pairwise_distances, pairwise_distances_topk + + +def test_pairwise_distances_execution(setup): + raw_x = np.random.rand(20, 5) + raw_y = np.random.rand(21, 5) + + x = mt.tensor(raw_x, chunk_size=11) + y = mt.tensor(raw_y, chunk_size=12) + + d = pairwise_distances(x, y) + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, raw_y) + np.testing.assert_almost_equal(result, expected) + + # test precomputed + d2 = d.copy() + d2[0, 0] = -1 + d2 = pairwise_distances(d2, y, metric="precomputed") + with pytest.raises(ValueError): + _ = d2.execute().fetch() + + # test cdist + weight = np.random.rand(5) + d = pairwise_distances(x, y, metric="wminkowski", p=3, w=weight) + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, raw_y, metric="minkowski", p=3, w=weight) + np.testing.assert_almost_equal(result, expected) + + # test pdist + d = pairwise_distances(x, metric="hamming") + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, metric="hamming") + np.testing.assert_almost_equal(result, expected) + + # test function metric + m = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + d = pairwise_distances(x, y, metric=m) + result = d.execute().fetch() + expected = sk_pairwise_distances(raw_x, raw_y, metric=m) + np.testing.assert_almost_equal(result, expected) + + with pytest.warns(DataConversionWarning): + pairwise_distances(x, y, metric="jaccard") + + with pytest.raises(ValueError): + _ = pairwise_distances(x, y, metric="unknown") + + +def test_pairwise_distances_topk_execution(setup): + rs = np.random.RandomState(0) + raw_x = rs.rand(20, 5) + raw_y = rs.rand(21, 5) + + x = mt.tensor(raw_x, chunk_size=11) + y = mt.tensor(raw_y, chunk_size=12) + + d, i = pairwise_distances_topk(x, y, 3, metric="euclidean", return_index=True) + result = fetch(*execute(d, i)) + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="euclidean") + nn.fit(raw_y) + expected = nn.kneighbors(raw_x, return_distance=True) + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_array_equal(result[1], expected[1]) + + x = mt.tensor(raw_x, chunk_size=(11, 3)) + + d = pairwise_distances_topk(x, k=4, metric="euclidean", return_index=False) + result = d.execute().fetch() + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="euclidean") + nn.fit(raw_x) + expected = nn.kneighbors(return_distance=True)[0] + np.testing.assert_almost_equal(result[:, 1:], expected) + + y = mt.tensor(raw_y, chunk_size=21) + + d, i = pairwise_distances_topk( + x, y, 3, metric="cosine", return_index=True, working_memory="168" + ) + result = fetch(*execute(d, i)) + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="cosine") + nn.fit(raw_y) + expected = nn.kneighbors(raw_x, return_distance=True) + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_array_equal(result[1], expected[1]) + + d = pairwise_distances_topk(x, y, 3, metric="cosine", axis=0, return_index=False) + result = d.execute().fetch() + nn = SkNearestNeighbors(n_neighbors=3, algorithm="brute", metric="cosine") + nn.fit(raw_x) + expected = nn.kneighbors(raw_y, return_distance=True)[0] + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_rbf_kernel.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_rbf_kernel.py new file mode 100644 index 000000000..510ae679d --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_rbf_kernel.py @@ -0,0 +1,30 @@ +# Copyright 2022-2023 
XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sklearn.metrics.pairwise import rbf_kernel as sklearn_rbf_kernel + +from .. import rbf_kernel + + +def test_rbf_kernel(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 4) + raw_Y = rs.rand(11, 4) + + r = rbf_kernel(raw_X, raw_Y) + result = r.to_numpy() + expected = sklearn_rbf_kernel(raw_X, raw_Y) + + np.testing.assert_almost_equal(result, expected) diff --git a/python/xorbits/_mars/learn/metrics/tests/__init__.py b/python/xorbits/_mars/learn/metrics/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/metrics/tests/test_classification.py b/python/xorbits/_mars/learn/metrics/tests/test_classification.py new file mode 100644 index 000000000..6cb95cf96 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_classification.py @@ -0,0 +1,591 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn import datasets, svm +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import accuracy_score as sklearn_accuracy_score +from sklearn.utils import check_random_state +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from .... import execute, fetch +from .... import tensor as mt +from ....lib.sparse import SparseNDArray +from .. 
import accuracy_score, log_loss +from .._classification import ( + _check_targets, + f1_score, + fbeta_score, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, +) + +IND = "multilabel-indicator" +MC = "multiclass" +BIN = "binary" +CNT = "continuous" +MMC = "multiclass-multioutput" +MCN = "continuous-multioutput" +# all of length 3 +EXAMPLES = [ + (IND, np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]])), + # must not be considered binary + (IND, np.array([[0, 1], [1, 0], [1, 1]])), + (MC, [2, 3, 1]), + (BIN, [0, 1, 1]), + (CNT, [0.0, 1.5, 1.0]), + (MC, np.array([[2], [3], [1]])), + (BIN, np.array([[0], [1], [1]])), + (CNT, np.array([[0.0], [1.5], [1.0]])), + (MMC, np.array([[0, 2], [1, 3], [2, 3]])), + (MCN, np.array([[0.5, 2.0], [1.1, 3.0], [2.0, 3.0]])), +] +# expected type given input types, or None for error +# (types will be tried in either order) +EXPECTED = { + (IND, IND): IND, + (MC, MC): MC, + (BIN, BIN): BIN, + (MC, IND): None, + (BIN, IND): None, + (BIN, MC): MC, + # Disallowed types + (CNT, CNT): None, + (MMC, MMC): None, + (MCN, MCN): None, + (IND, CNT): None, + (MC, CNT): None, + (BIN, CNT): None, + (MMC, CNT): None, + (MCN, CNT): None, + (IND, MMC): None, + (MC, MMC): None, + (BIN, MMC): None, + (MCN, MMC): None, + (IND, MCN): None, + (MC, MCN): None, + (BIN, MCN): None, +} + + +############################################################################### +# Utilities for testing + + +def make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel="linear", probability=True, random_state=0) + probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? + probas_pred = probas_pred[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + return y_true, y_pred, probas_pred + + +@pytest.mark.parametrize("type1, y1", EXAMPLES) +@pytest.mark.parametrize("type2, y2", EXAMPLES) +def test__check_targets(setup, type1, y1, type2, y2): + # Check that _check_targets correctly merges target types, squeezes + # output and fails if input lengths differ. 
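+    # For example, EXPECTED[(BIN, MC)] == MC: binary targets are promoted to
+    # multiclass when mixed with multiclass ones, while EXPECTED[(MC, IND)] is
+    # None, i.e. mixing multiclass with multilabel-indicator must raise ValueError.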
+ try: + expected = EXPECTED[type1, type2] + except KeyError: + expected = EXPECTED[type2, type1] + if expected is None: + with pytest.raises(ValueError): + _check_targets(y1, y2).execute() + + if type1 != type2: + with pytest.raises(ValueError): + _check_targets(y1, y2).execute() + + else: + if type1 not in (BIN, MC, IND): + with pytest.raises(ValueError): + _check_targets(y1, y2).execute() + + else: + merged_type, y1out, y2out = _check_targets(y1, y2).execute().fetch() + assert merged_type == expected + if merged_type.startswith("multilabel"): + assert isinstance(y1out, SparseNDArray) + assert isinstance(y2out, SparseNDArray) + else: + np.testing.assert_array_equal(y1out, np.squeeze(y1)) + np.testing.assert_array_equal(y2out, np.squeeze(y2)) + with pytest.raises(ValueError): + _check_targets(y1[:-1], y2).execute() + + +def test_accuracy_score(setup): + y_pred = [0, 2, 1, 3] + y_true = [0, 1, 2, 3] + + score = accuracy_score(y_true, y_pred) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred) + assert pytest.approx(result) == expected + + score = accuracy_score(y_true, y_pred, normalize=False) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred, normalize=False) + assert pytest.approx(result) == expected + + y_pred = np.array([[0, 1], [1, 1]]) + y_true = np.ones((2, 2)) + score = accuracy_score(y_true, y_pred) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred) + assert pytest.approx(result) == expected + + sample_weight = [0.7, 0.3] + score = accuracy_score(y_true, y_pred, sample_weight=sample_weight) + result = score.execute().fetch() + expected = sklearn_accuracy_score(y_true, y_pred, sample_weight=sample_weight) + assert pytest.approx(result) == expected + + score = accuracy_score( + mt.tensor(y_true), + mt.tensor(y_pred), + sample_weight=mt.tensor(sample_weight), + normalize=False, + ) + result = score.execute().fetch() + expected = sklearn_accuracy_score( + y_true, y_pred, sample_weight=sample_weight, normalize=False + ) + assert pytest.approx(result) == expected + + +def test_log_loss(setup): + # binary case with symbolic labels ("no" < "yes") + y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = mt.array( + [[0.5, 0.5], [0.1, 0.9], [0.01, 0.99], [0.9, 0.1], [0.75, 0.25], [0.001, 0.999]] + ) + loss = log_loss(y_true, y_pred).fetch() + assert_almost_equal(loss, 1.8817971) + + # multiclass case; adapted from http://bit.ly/RJJHWA + y_true = [1, 0, 2] + y_pred = [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]] + loss = log_loss(y_true, y_pred, normalize=True).fetch() + assert_almost_equal(loss, 0.6904911) + + # check that we got all the shapes and axes right + # by doubling the length of y_true and y_pred + y_true *= 2 + y_pred *= 2 + loss = log_loss(y_true, y_pred, normalize=False).fetch() + assert_almost_equal(loss, 0.6904911 * 6, decimal=6) + + # check eps and handling of absolute zero and one probabilities + y_pred = np.asarray(y_pred) > 0.5 + loss = log_loss(y_true, y_pred, normalize=True, eps=0.1).fetch() + assert_almost_equal(loss, log_loss(y_true, np.clip(y_pred, 0.1, 0.9)).fetch()) + + # raise error if number of classes are not equal. 
+ y_true = [1, 0, 2] + y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1]] + with pytest.raises(ValueError): + log_loss(y_true, y_pred) + with pytest.raises(ValueError): + log_loss(y_true, y_pred, labels=[0, 1, 2]) + + # case when y_true is a string array object + y_true = ["ham", "spam", "spam", "ham"] + y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]] + loss = log_loss(y_true, y_pred).fetch() + assert_almost_equal(loss, 1.0383217, decimal=6) + + # test labels option + + y_true = [2, 2] + y_pred = [[0.2, 0.7], [0.6, 0.5]] + y_score = np.array([[0.1, 0.9], [0.1, 0.9]]) + error_str = ( + r"y_true contains only one label \(2\). Please provide " + r"the true labels explicitly through the labels argument." + ) + with pytest.raises(ValueError, match=error_str): + log_loss(y_true, y_pred) + error_str = ( + r"The labels array needs to contain at least two " + r"labels for log_loss, got \[1\]." + ) + with pytest.raises(ValueError, match=error_str): + log_loss(y_true, y_pred, labels=[1]) + + # works when the labels argument is used + + true_log_loss = -np.mean(np.log(y_score[:, 1])) + calculated_log_loss = log_loss(y_true, y_score, labels=[1, 2]).fetch() + assert_almost_equal(calculated_log_loss, true_log_loss) + + # ensure labels work when len(np.unique(y_true)) != y_pred.shape[1] + y_true = [1, 2, 2] + y_score2 = [[0.2, 0.7, 0.3], [0.6, 0.5, 0.3], [0.3, 0.9, 0.1]] + loss = log_loss(y_true, y_score2, labels=[1, 2, 3]).fetch() + assert_almost_equal(loss, 1.0630345, decimal=6) + + +def test_log_loss_pandas_input(setup): + # case when input is a pandas series and dataframe gh-5715 + y_tr = np.array(["ham", "spam", "spam", "ham"]) + y_pr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]) + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TrueInputType, PredInputType in types: + # y_pred dataframe, y_true series + y_true, y_pred = TrueInputType(y_tr), PredInputType(y_pr) + loss = log_loss(y_true, y_pred).fetch() + assert_almost_equal(loss, 1.0383217, decimal=6) + + +def test_multilabel_confusion_matrix_binary(setup): + # Test multilabel confusion matrix - binary classification case + y_true, y_pred, _ = make_prediction(binary=True) + y_true = mt.tensor(y_true, chunk_size=40) + y_pred = mt.tensor(y_pred, chunk_size=40) + + def run_test(y_true, y_pred): + cm = multilabel_confusion_matrix(y_true, y_pred).fetch() + assert_array_equal(cm, [[[17, 8], [3, 22]], [[22, 3], [8, 17]]]) + + run_test(y_true, y_pred) + run_test(y_true.astype(str), y_pred.astype(str)) + + +def test_multilabel_confusion_matrix_multiclass(setup): + # Test multilabel confusion matrix - multi-class case + y_true, y_pred, _ = make_prediction(binary=False) + y_true = mt.tensor(y_true, chunk_size=40) + y_pred = mt.tensor(y_pred, chunk_size=40) + + def run_test(y_true, y_pred, string_type=False): + # compute confusion matrix with default labels introspection + cm = multilabel_confusion_matrix(y_true, y_pred).fetch() + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[38, 6], [28, 3]], [[30, 25], [2, 18]]] + ) + + # compute confusion matrix with explicit label ordering + labels = ["0", "2", "1"] if string_type else [0, 2, 1] + cm = multilabel_confusion_matrix(y_true, y_pred, labels=labels).fetch() + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[30, 25], [2, 18]], [[38, 6], [28, 3]]] + ) + + # compute confusion matrix with super set of present labels + labels = ["0", "2", "1", "3"] if string_type else [0, 2, 1, 3] + cm 
= multilabel_confusion_matrix(y_true, y_pred, labels=labels).fetch() + assert_array_equal( + cm, + [ + [[47, 4], [5, 19]], + [[30, 25], [2, 18]], + [[38, 6], [28, 3]], + [[75, 0], [0, 0]], + ], + ) + + run_test(y_true, y_pred) + run_test(y_true.astype(str), y_pred.astype(str), string_type=True) + + +def test_multilabel_confusion_matrix_multilabel(setup): + # Test multilabel confusion matrix - multilabel-indicator case + from scipy.sparse import csc_matrix, csr_matrix + + y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + y_true_csr = csr_matrix(y_true) + y_pred_csr = csr_matrix(y_pred) + y_true_csc = csc_matrix(y_true) + y_pred_csc = csc_matrix(y_pred) + + y_true_t = mt.tensor(y_true) + y_pred_t = mt.tensor(y_pred) + + # cross test different types + sample_weight = np.array([2, 1, 3]) + real_cm = [[[1, 0], [1, 1]], [[1, 0], [1, 1]], [[0, 2], [1, 0]]] + trues = [y_true_t, y_true_csr, y_true_csc] + preds = [y_pred_t, y_pred_csr, y_pred_csc] + + for y_true_tmp in trues: + for y_pred_tmp in preds: + cm = multilabel_confusion_matrix(y_true_tmp, y_pred_tmp).fetch() + assert_array_equal(cm, real_cm) + + # test support for samplewise + cm = multilabel_confusion_matrix(y_true_t, y_pred_t, samplewise=True).fetch() + assert_array_equal(cm, [[[1, 0], [1, 1]], [[1, 1], [0, 1]], [[0, 1], [2, 0]]]) + + # test support for labels + cm = multilabel_confusion_matrix(y_true_t, y_pred_t, labels=[2, 0]).fetch() + assert_array_equal(cm, [[[0, 2], [1, 0]], [[1, 0], [1, 1]]]) + + # test support for labels with samplewise + cm = multilabel_confusion_matrix( + y_true_t, y_pred_t, labels=[2, 0], samplewise=True + ).fetch() + assert_array_equal(cm, [[[0, 0], [1, 1]], [[1, 1], [0, 0]], [[0, 1], [1, 0]]]) + + # test support for sample_weight with sample_wise + cm = multilabel_confusion_matrix( + y_true_t, y_pred_t, sample_weight=sample_weight, samplewise=True + ).fetch() + assert_array_equal(cm, [[[2, 0], [2, 2]], [[1, 1], [0, 1]], [[0, 3], [6, 0]]]) + + +def test_multilabel_confusion_matrix_errors(setup): + y_true = mt.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = mt.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + + # Bad sample_weight + with pytest.raises(ValueError, match="inconsistent numbers of samples"): + multilabel_confusion_matrix(y_true, y_pred, sample_weight=[1, 2]) + with pytest.raises(ValueError, match="should be a 1d array"): + multilabel_confusion_matrix( + y_true, y_pred, sample_weight=[[1, 2, 3], [2, 3, 4], [3, 4, 5]] + ) + + # Bad labels + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[-1]) + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[3]) + + # Using samplewise outside multilabel + with pytest.raises(ValueError, match="Samplewise metrics"): + multilabel_confusion_matrix([0, 1, 2], [1, 2, 0], samplewise=True) + + # Bad y_type + err_msg = "multiclass-multioutput is not supported" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix([[0, 1, 2], [2, 1, 0]], [[1, 2, 0], [1, 0, 2]]) + + +@pytest.mark.parametrize("average", ["macro", "micro", "weighted", "samples"]) +def test_precision_recall_f1_no_labels_check_warnings(setup, average): + y_true = mt.zeros((20, 3)) + y_pred = mt.zeros_like(y_true) + + func = precision_recall_fscore_support + with pytest.warns(UndefinedMetricWarning): + p, r, f, s = func(y_true, 
y_pred, average=average, beta=1.0) + p, r, f = fetch(execute(p, r, f)) + + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert s is None + + with pytest.warns(UndefinedMetricWarning): + fbeta = fetch(execute(fbeta_score(y_true, y_pred, average=average, beta=1.0))) + + assert_almost_equal(fbeta, 0) + + +def test_precision_recall_f1_score_multiclass(setup): + # Test Precision Recall and F1 Score for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + y_true = mt.tensor(y_true, chunk_size=40) + y_pred = mt.tensor(y_pred, chunk_size=40) + + # compute scores with default labels introspection + p, r, f, s = fetch( + execute(precision_recall_fscore_support(y_true, y_pred, average=None)) + ) + assert_array_almost_equal(p, [0.83, 0.33, 0.42], 2) + assert_array_almost_equal(r, [0.79, 0.09, 0.90], 2) + assert_array_almost_equal(f, [0.81, 0.15, 0.57], 2) + assert_array_equal(s, [24, 31, 20]) + + # averaging tests + ps = fetch(execute(precision_score(y_true, y_pred, pos_label=1, average="micro"))) + assert_array_almost_equal(ps, 0.53, 2) + + rs = fetch(execute(recall_score(y_true, y_pred, average="micro"))) + assert_array_almost_equal(rs, 0.53, 2) + + fs = fetch(execute(f1_score(y_true, y_pred, average="micro"))) + assert_array_almost_equal(fs, 0.53, 2) + + ps = fetch(execute(precision_score(y_true, y_pred, average="macro"))) + assert_array_almost_equal(ps, 0.53, 2) + + rs = fetch(execute(recall_score(y_true, y_pred, average="macro"))) + assert_array_almost_equal(rs, 0.60, 2) + + fs = fetch(execute(f1_score(y_true, y_pred, average="macro"))) + assert_array_almost_equal(fs, 0.51, 2) + + ps = fetch(execute(precision_score(y_true, y_pred, average="weighted"))) + assert_array_almost_equal(ps, 0.51, 2) + + rs = fetch(execute(recall_score(y_true, y_pred, average="weighted"))) + assert_array_almost_equal(rs, 0.53, 2) + + fs = fetch(execute(f1_score(y_true, y_pred, average="weighted"))) + assert_array_almost_equal(fs, 0.47, 2) + + with pytest.raises(ValueError): + precision_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + recall_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + f1_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + fbeta_score(y_true, y_pred, average="samples", beta=0.5) + + # same prediction but with and explicit label ordering + p, r, f, s = fetch( + execute( + precision_recall_fscore_support( + y_true, y_pred, labels=[0, 2, 1], average=None + ) + ) + ) + assert_array_almost_equal(p, [0.83, 0.41, 0.33], 2) + assert_array_almost_equal(r, [0.79, 0.90, 0.10], 2) + assert_array_almost_equal(f, [0.81, 0.57, 0.15], 2) + assert_array_equal(s, [24, 20, 31]) + + +@pytest.mark.parametrize("average", ["samples", "micro", "macro", "weighted", None]) +def test_precision_refcall_f1_score_multilabel_unordered_labels(setup, average): + # test that labels need not be sorted in the multilabel case + y_true = mt.array([[1, 1, 0, 0]]) + y_pred = mt.array([[0, 0, 1, 1]]) + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average + ) + p, r, f = fetch(execute(p, r, f)) + assert_array_equal(p, 0) + assert_array_equal(r, 0) + assert_array_equal(f, 0) + if average is None: + assert_array_equal(s, [0, 1, 1, 0]) + + +def test_precision_recall_f1_score_binary_averaged(setup): + y_true = mt.array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1], chunk_size=10) + y_pred = mt.array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 
1, 0, 1], chunk_size=10) + + # compute scores with default labels introspection + ps, rs, fs, _ = precision_recall_fscore_support(y_true, y_pred, average=None) + ps, rs, fs = fetch(execute(ps, rs, fs)) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="macro") + p, r, f = fetch(execute(p, r, f)) + assert p == np.mean(ps) + assert r == np.mean(rs) + assert f == np.mean(fs) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted") + p, r, f = fetch(execute(p, r, f)) + support = np.bincount(y_true).execute().fetch() + assert p == np.average(ps, weights=support) + assert r == np.average(rs, weights=support) + assert f == np.average(fs, weights=support) + + +def test_zero_precision_recall(setup): + # Check that pathological cases do not bring NaNs + + old_error_settings = np.seterr(all="raise") + + try: + y_true = mt.array([0, 1, 2, 0, 1, 2], chunk_size=4) + y_pred = mt.array([2, 0, 1, 1, 2, 0], chunk_size=4) + + assert_almost_equal( + precision_score(y_true, y_pred, average="macro").execute().fetch(), 0.0, 2 + ) + assert_almost_equal( + recall_score(y_true, y_pred, average="macro").execute().fetch(), 0.0, 2 + ) + assert_almost_equal( + f1_score(y_true, y_pred, average="macro").execute().fetch(), 0.0, 2 + ) + + finally: + np.seterr(**old_error_settings) + + +def test_precision_recall_f_binary_single_class(setup): + # Test precision, recall and F-scores behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert 1.0 == fetch(execute(precision_score([1, 1], [1, 1]))) + assert 1.0 == fetch(execute(recall_score([1, 1], [1, 1]))) + assert 1.0 == fetch(execute(f1_score([1, 1], [1, 1]))) + assert 1.0 == fetch(execute(fbeta_score([1, 1], [1, 1], beta=0))) + + assert 0.0 == fetch(execute(precision_score([-1, -1], [-1, -1]))) + assert 0.0 == fetch(execute(recall_score([-1, -1], [-1, -1]))) + assert 0.0 == fetch(execute(f1_score([-1, -1], [-1, -1]))) + assert 0.0 == fetch(execute(fbeta_score([-1, -1], [-1, -1], beta=float("inf")))) + assert fetch( + execute(fbeta_score([-1, -1], [-1, -1], beta=float("inf"))) + ) == pytest.approx(fetch(execute(fbeta_score([-1, -1], [-1, -1], beta=1e5)))) diff --git a/python/xorbits/_mars/learn/metrics/tests/test_ranking.py b/python/xorbits/_mars/learn/metrics/tests/test_ranking.py new file mode 100644 index 000000000..9ce5d73c8 --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_ranking.py @@ -0,0 +1,699 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +import warnings + +import numpy as np +import pandas as pd +import pytest +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import accuracy_score as sklearn_accuracy_score +from sklearn.metrics import auc as sklearn_auc +from sklearn.metrics import roc_curve as sklearn_roc_curve +from sklearn.metrics.tests.test_ranking import _auc, make_prediction +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal + +from .... import dataframe as md +from .... import tensor as mt +from ...utils.extmath import softmax +from .. import accuracy_score, auc, roc_auc_score, roc_curve + + +def _partial_roc_auc_score(y_true, y_predict, max_fpr): + """Alternative implementation to check for correctness of `roc_auc_score` + with `max_fpr` set. + """ + + def _partial_roc(y_true, y_predict, max_fpr): + fpr, tpr, _ = sklearn_roc_curve(y_true, y_predict) + new_fpr = fpr[fpr <= max_fpr] + new_fpr = np.append(new_fpr, max_fpr) + new_tpr = tpr[fpr <= max_fpr] + idx_out = np.argmax(fpr > max_fpr) + idx_in = idx_out - 1 + x_interp = [fpr[idx_in], fpr[idx_out]] + y_interp = [tpr[idx_in], tpr[idx_out]] + new_tpr = np.append(new_tpr, np.interp(max_fpr, x_interp, y_interp)) + return (new_fpr, new_tpr) + + new_fpr, new_tpr = _partial_roc(y_true, y_predict, max_fpr) + partial_auc = sklearn_auc(new_fpr, new_tpr) + + # Formula (5) from McClish 1989 + fpr1 = 0 + fpr2 = max_fpr + min_area = 0.5 * (fpr2 - fpr1) * (fpr2 + fpr1) + max_area = fpr2 - fpr1 + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +@pytest.mark.parametrize("drop", [True, False]) +def test_roc_curve(setup, drop): + # Test Area under Receiver Operating Characteristic (ROC) curve + y_true, _, probas_pred = make_prediction(binary=True) + expected_auc = _auc(y_true, probas_pred) + + fpr, tpr, thresholds = ( + roc_curve(y_true, probas_pred, drop_intermediate=drop).execute().fetch() + ) + roc_auc = auc(fpr, tpr).to_numpy() + np.testing.assert_array_almost_equal(roc_auc, expected_auc, decimal=2) + np.testing.assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_end_points(setup): + # Make sure that roc_curve returns a curve start at 0 and ending and + # 1 even in corner cases + rng = np.random.RandomState(0) + y_true = np.array([0] * 50 + [1] * 50) + y_pred = rng.randint(3, size=100) + fpr, tpr, thr = roc_curve(y_true, y_pred, drop_intermediate=True).fetch() + assert fpr[0] == 0 + assert fpr[-1] == 1 + assert fpr.shape == tpr.shape + assert fpr.shape == thr.shape + + +def test_roc_returns_consistency(setup): + # Test whether the returned threshold matches up with tpr + # make small toy dataset + y_true, _, probas_pred = make_prediction(binary=True) + fpr, tpr, thresholds = roc_curve(y_true, probas_pred).fetch() + + # use the given thresholds to determine the tpr + tpr_correct = [] + for t in thresholds: + tp = np.sum((probas_pred >= t) & y_true) + p = np.sum(y_true) + tpr_correct.append(1.0 * tp / p) + + # compare tpr and tpr_correct to see if the thresholds' order was correct + np.testing.assert_array_almost_equal(tpr, tpr_correct, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_multi(setup): + # roc_curve not applicable for multi-class problems + y_true, _, probas_pred = make_prediction(binary=False) + + with pytest.raises(ValueError): + roc_curve(y_true, probas_pred) + 
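Editorial note: the helper _partial_roc_auc_score above rescales the raw partial area with Formula (5) from McClish 1989, so that 0.5 always corresponds to chance and 1.0 to a perfect ranking regardless of max_fpr. A minimal sketch with made-up numbers, included only to illustrate that rescaling (the values are hypothetical and not part of the patch):

    # Hypothetical inputs; they mirror the arithmetic in _partial_roc_auc_score
    # above but are not taken from any test in this file.
    max_fpr = 0.5
    partial_auc = 0.4                    # raw area under the ROC curve for fpr <= max_fpr
    min_area = 0.5 * max_fpr * max_fpr   # chance-level area up to max_fpr (here 0.125)
    max_area = max_fpr                   # perfect-classifier area up to max_fpr (here 0.5)
    standardized = 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area))
    print(round(standardized, 4))        # 0.8667; 0.5 would mean chance, 1.0 perfect

The tests further down (test_partial_roc_auc_score) check that roc_auc_score(..., max_fpr=...) agrees with this alternative computation.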
+ +def test_roc_curve_confidence(setup): + # roc_curve for confidence scores + y_true, _, probas_pred = make_prediction(binary=True) + + fpr, tpr, thresholds = roc_curve(y_true, probas_pred - 0.5) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.90, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_hard(setup): + # roc_curve for hard decisions + y_true, pred, probas_pred = make_prediction(binary=True) + + # always predict one + trivial_pred = np.ones(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # always predict zero + trivial_pred = np.zeros(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # hard decisions + fpr, tpr, thresholds = roc_curve(y_true, pred) + roc_auc = auc(fpr, tpr).fetch() + np.testing.assert_array_almost_equal(roc_auc, 0.78, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_one_label(setup): + y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + y_pred = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + # assert there are warnings + w = UndefinedMetricWarning + with pytest.warns(w): + fpr, tpr, thresholds = roc_curve(y_true, y_pred) + # all true labels, all fpr should be nan + np.testing.assert_array_equal(fpr.fetch(), np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # assert there are warnings + with pytest.warns(w): + fpr, tpr, thresholds = roc_curve([1 - x for x in y_true], y_pred) + # all negative labels, all tpr should be nan + np.testing.assert_array_equal(tpr.fetch(), np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_toydata(setup): + # Binary classification + y_true = [0, 1] + y_score = [0, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [0, 1] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1, 1]) + assert_array_almost_equal(fpr, [0, 0, 1]) + assert_almost_equal(roc_auc, 0.0) + + y_true = [1, 0] + y_score = [1, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + y_true = [1, 0] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [1, 0] + y_score = [0.5, 0.5] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + y_true = [0, 0] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no positive sample in y_true + expected_message = ( + "No positive 
samples in y_true, true positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0.0, 0.5, 1.0]) + assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan]) + + y_true = [1, 1] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no negative sample in y_true + expected_message = ( + "No negative samples in y_true, false positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [np.nan, np.nan, np.nan]) + assert_array_almost_equal(fpr, [0.0, 0.5, 1.0]) + + # Multi-label classification task + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [0, 1]]) + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="macro") + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.0) + + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="macro") + with pytest.raises(ValueError): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0.5, 0.5], [0.5, 0.5]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + +def test_roc_curve_drop_intermediate(setup): + # Test that drop_intermediate drops the correct thresholds + y_true = [0, 0, 0, 0, 1, 1] + y_score = [0.0, 0.2, 0.5, 0.6, 0.7, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + np.testing.assert_array_almost_equal(thresholds.fetch(), [2.0, 1.0, 0.7, 0.0]) + + # Test dropping thresholds with repeating scores + y_true = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + y_score = [0.0, 0.1, 0.6, 0.6, 0.7, 0.8, 0.9, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + np.testing.assert_array_almost_equal( + thresholds.fetch(), [2.0, 1.0, 0.9, 0.7, 0.6, 0.0] + ) + + +def test_roc_curve_fpr_tpr_increasing(setup): + # Ensure that fpr and tpr returned by roc_curve are increasing. + # Construct an edge case with float y_score and sample_weight + # when some adjacent values of fpr and tpr are actually the same. 
+ y_true = [0, 0, 1, 1, 1] + y_score = [0.1, 0.7, 0.3, 0.4, 0.5] + sample_weight = np.repeat(0.2, 5) + fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) + assert ((mt.diff(fpr) < 0).sum() == 0).to_numpy() + assert ((mt.diff(tpr) < 0).sum() == 0).to_numpy() + + +def test_auc(setup): + # Test Area Under Curve (AUC) computation + x = [0, 1] + y = [0, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + x = [1, 0] + y = [0, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + x = [1, 0, 0] + y = [0, 1, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + x = [0, 1] + y = [1, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 1) + x = [0, 0.5, 1] + y = [0, 0.5, 1] + np.testing.assert_array_almost_equal(auc(x, y).fetch(), 0.5) + + +def test_auc_errors(setup): + # Incompatible shapes + with pytest.raises(ValueError): + auc([0.0, 0.5, 1.0], [0.1, 0.2]) + + # Too few x values + with pytest.raises(ValueError): + auc([0.0], [0.1]) + + # x is not in order + x = [2, 1, 3, 4] + y = [5, 6, 7, 8] + error_message = f"x is neither increasing nor decreasing : {np.array(x)}" + with pytest.raises(ValueError, match=re.escape(error_message)): + auc(x, y) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 0, 2]), [0, 1, 2]), + (np.array([0, 1, 0, 2]), None), + (["a", "b", "a", "c"], ["a", "b", "c"]), + (["a", "b", "a", "c"], None), + ], +) +def test_multiclass_ovo_roc_auc_toydata(setup, y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2 + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2 + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2 + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + ovo_unweighted_score = (average_score_01 + average_score_02 + average_score_12) / 3 + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), + ovo_unweighted_score, + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_weighted_score, + ) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 2, 0, 2]), [0, 1, 2]), + (np.array(["a", "d", "a", "d"]), ["a", "b", "d"]), + ], +) +def test_multiclass_ovo_roc_auc_toydata_binary(setup, y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true + # + # on a small example, representative of an expected use case. 
+ y_scores = np.array( + [[0.2, 0.0, 0.8], [0.6, 0.0, 0.4], [0.55, 0.0, 0.45], [0.4, 0.0, 0.6]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1, 0], [0.2, 0.6, 0.55, 0.4]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0, 1], [0.8, 0.4, 0.45, 0.6]) + ovo_score = (score_01 + score_10) / 2 + + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), ovo_score + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_score, + ) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 2, 2]), None), + (["a", "b", "c", "c"], None), + ([0, 1, 2, 2], [0, 1, 2]), + (["a", "b", "c", "c"], ["a", "b", "c"]), + ], +) +def test_multiclass_ovr_roc_auc_toydata(setup, y_true, labels): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]] + ) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + result_unweighted = (out_0 + out_1 + out_2) / 3.0 + + assert_almost_equal( + roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels), + result_unweighted, + ) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2000) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score( + y_true, y_scores, multi_class="ovr", labels=labels, average="weighted" + ), + result_weighted, + ) + + +@pytest.mark.parametrize( + "msg, y_true, labels", + [ + ("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), + ( + "Parameter 'labels' must be unique", + np.array(["a", "b", "c", "c"]), + ["a", "a", "b"], + ), + ( + "Number of classes in y_true not equal to the number of columns " + "in 'y_score'", + np.array([0, 2, 0, 2]), + None, + ), + ( + "Parameter 'labels' must be ordered", + np.array(["a", "b", "c", "c"]), + ["a", "c", "b"], + ), + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3", + np.array([0, 1, 2, 2]), + [0, 1], + ), + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3", + np.array(["a", "b", "c", "c"]), + ["a", "b"], + ), + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3", + np.array([0, 1, 2, 2]), + [0, 1, 2, 3], + ), + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3", + np.array(["a", "b", "c", "c"]), + ["a", "b", "c", "d"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "e"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "d"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array([0, 1, 2, 3]), + [0, 1, 2], + ), + ], +) +@pytest.mark.parametrize("multi_class", ["ovo", "ovr"]) +def test_roc_auc_score_multiclass_labels_error(setup, msg, y_true, labels, multi_class): + y_scores = np.array( 
+ [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_scores, labels=labels, multi_class=multi_class) + + +@pytest.mark.parametrize( + "msg, kwargs", + [ + ( + ( + r"average must be one of \('macro', 'weighted'\) for " + r"multiclass problems" + ), + {"average": "samples", "multi_class": "ovo"}, + ), + ( + ( + r"average must be one of \('macro', 'weighted'\) for " + r"multiclass problems" + ), + {"average": "micro", "multi_class": "ovr"}, + ), + ( + ( + r"sample_weight is not supported for multiclass one-vs-one " + r"ROC AUC, 'sample_weight' must be None in this case" + ), + {"multi_class": "ovo", "sample_weight": []}, + ), + ( + ( + r"Partial AUC computation not available in multiclass setting, " + r"'max_fpr' must be set to `None`, received `max_fpr=0.5` " + r"instead" + ), + {"multi_class": "ovo", "max_fpr": 0.5}, + ), + ( + ( + r"multi_class='ovp' is not supported for multiclass ROC AUC, " + r"multi_class must be in \('ovo', 'ovr'\)" + ), + {"multi_class": "ovp"}, + ), + (r"multi_class must be in \('ovo', 'ovr'\)", {}), + ], +) +def test_roc_auc_score_multiclass_error(setup, msg, kwargs): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. + rng = check_random_state(404) + y_score = rng.rand(20, 3) + y_prob = softmax(y_score) + y_true = rng.randint(0, 3, size=20) + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_prob, **kwargs) + + +def test_auc_score_non_binary_class(setup): + # Test that roc_auc_score function returns an error when trying + # to compute AUC for non-binary class values. + rng = check_random_state(404) + y_pred = rng.rand(10) + # y_true contains only one class value + y_true = np.zeros(10, dtype="int") + err_msg = "ROC AUC score is not defined" + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.ones(10, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.full(10, -1, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + + with warnings.catch_warnings(record=True): + rng = check_random_state(404) + y_pred = rng.rand(10) + # y_true contains only one class value + y_true = np.zeros(10, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.ones(10, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + y_true = np.full(10, -1, dtype="int") + with pytest.raises(ValueError, match=err_msg): + roc_auc_score(y_true, y_pred) + + +def test_binary_clf_curve_multiclass_error(setup): + rng = check_random_state(404) + y_true = rng.randint(0, 3, size=10) + y_pred = rng.rand(10) + msg = "multiclass format is not supported" + + with pytest.raises(ValueError, match=msg): + roc_curve(y_true, y_pred) + + +def test_dataframe_roc_curve_auc(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": rs.randint(0, 10, (10,)), "b": rs.rand(10)}) + + df = md.DataFrame(raw) + y = df["a"].to_tensor().astype("int") + pred = df["b"].to_tensor().astype("float") + fpr, tpr, thresholds = roc_curve(y, pred, pos_label=2) + m = auc(fpr, tpr) + + sk_fpr, sk_tpr, sk_threshod = sklearn_roc_curve( + raw["a"].to_numpy().astype("int"), + raw["b"].to_numpy().astype("float"), + pos_label=2, + ) + expect_m = sklearn_auc(sk_fpr, sk_tpr) + assert pytest.approx(m.fetch()) 
== expect_m + + +def test_dataframe_accuracy_score(setup): + rs = np.random.RandomState(0) + raw = pd.DataFrame({"a": rs.randint(0, 10, (10,)), "b": rs.randint(0, 10, (10,))}) + + df = md.DataFrame(raw) + y = df["a"].to_tensor().astype("int") + pred = df["b"].astype("int") + + score = accuracy_score(y, pred) + expect = sklearn_accuracy_score( + raw["a"].to_numpy().astype("int"), raw["b"].to_numpy().astype("int") + ) + assert pytest.approx(score.fetch()) == expect + + +def test_partial_roc_auc_score(setup): + # Check `roc_auc_score` for max_fpr != `None` + y_true = np.array([0, 0, 1, 1]) + assert roc_auc_score(y_true, y_true, max_fpr=1) == 1 + assert roc_auc_score(y_true, y_true, max_fpr=0.001) == 1 + with pytest.raises(ValueError): + assert roc_auc_score(y_true, y_true, max_fpr=-0.1) + with pytest.raises(ValueError): + assert roc_auc_score(y_true, y_true, max_fpr=1.1) + with pytest.raises(ValueError): + assert roc_auc_score(y_true, y_true, max_fpr=0) + + y_scores = np.array([0.1, 0, 0.1, 0.01]) + roc_auc_with_max_fpr_one = roc_auc_score(y_true, y_scores, max_fpr=1) + unconstrained_roc_auc = roc_auc_score(y_true, y_scores) + assert roc_auc_with_max_fpr_one == unconstrained_roc_auc + assert roc_auc_score(y_true, y_scores, max_fpr=0.3) == 0.5 + + y_true, y_pred, _ = make_prediction(binary=True) + for max_fpr in np.linspace(1e-4, 1, 5): + assert_almost_equal( + roc_auc_score(y_true, y_pred, max_fpr=max_fpr), + _partial_roc_auc_score(y_true, y_pred, max_fpr), + ) diff --git a/python/xorbits/_mars/learn/metrics/tests/test_regression.py b/python/xorbits/_mars/learn/metrics/tests/test_regression.py new file mode 100644 index 000000000..28183873f --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_regression.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from .... import tensor as mt +from .. 
import r2_score +from .._regresssion import _check_reg_targets + + +def test__check_reg_targets(setup): + # All of length 3 + EXAMPLES = [ + ("continuous", [1, 2, 3], 1), + ("continuous", [[1], [2], [3]], 1), + ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2), + ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2), + ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3), + ] + + for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES, repeat=2): + if type1 == type2 and n_out1 == n_out2: + y_type, y_check1, y_check2, multioutput = _check_reg_targets(y1, y2, None) + assert type1 == y_type + if type1 == "continuous": + assert_array_equal(y_check1, np.reshape(y1, (-1, 1))) + assert_array_equal(y_check2, np.reshape(y2, (-1, 1))) + else: + assert_array_equal(y_check1, y1) + assert_array_equal(y_check2, y2) + else: + with pytest.raises(ValueError): + _check_reg_targets(y1, y2, None) + + +def test__check_reg_targets_exception(setup): + invalid_multioutput = "this_value_is_not_valid" + expected_message = ( + "Allowed 'multioutput' string values are.+" + "You provided multioutput={!r}".format(invalid_multioutput) + ) + with pytest.raises(ValueError, match=expected_message): + _check_reg_targets([1, 2, 3], [[1], [2], [3]], invalid_multioutput) + + with pytest.raises(ValueError): + _check_reg_targets([1, 2], [[1], [2]], multioutput=[0.4, 0.6]) + with pytest.raises(ValueError): + _check_reg_targets([[1, 2], [3, 4]], [[1, 2], [3, 4]], multioutput=[0.4]) + + +def test_r2_score(setup, n_samples=50): + y_true = mt.arange(n_samples) + y_pred = y_true + 1 + + assert_almost_equal(r2_score(y_true, y_pred).fetch(), 0.995, 2) + + y_true = mt.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = mt.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) + + error = r2_score(y_true, y_pred, multioutput="variance_weighted") + assert_almost_equal(error.fetch(), 1.0 - 5.0 / 2) + error = r2_score(y_true, y_pred, multioutput="uniform_average") + assert_almost_equal(error.fetch(), -0.875) + + assert_almost_equal(r2_score([0.0, 1], [0.0, 1]).fetch(), 1.00, 2) + assert_almost_equal( + r2_score([0.0, 1], [0.0, 1], sample_weight=[0.5, 0.5]).fetch(), 1.00, 2 + ) + + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + r = r2_score(y_true, y_pred, multioutput="raw_values") + + assert_array_almost_equal(r, [0.95, 0.93], decimal=2) + + # mean_absolute_error and mean_squared_error are equal because + # it is a binary problem. + y_true = [[0, 0]] * 4 + y_pred = [[1, 1]] * 4 + r = r2_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(r, [0.0, 0.0], decimal=2) + + r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values") + assert_array_almost_equal(r, [0, -3.5], decimal=2) + assert ( + np.mean(r.fetch()) + == r2_score( + [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="uniform_average" + ).fetch() + ) + + # Checking for the condition in which both numerator and denominator is + # zero. 
+ y_true = [[1, 3], [-1, 2]] + y_pred = [[1, 4], [-1, 1]] + r2 = r2_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(r2, [1.0, -3.0], decimal=2) + assert ( + np.mean(r2.fetch()) + == r2_score(y_true, y_pred, multioutput="uniform_average").fetch() + ) + + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6]) + + assert_almost_equal(rw.fetch(), 0.94, decimal=2) + + y_true = [0] + y_pred = [1] + warning_msg = "not well-defined with less than two samples." + + # Trigger the warning + with pytest.warns(UndefinedMetricWarning, match=warning_msg): + score = r2_score(y_true, y_pred) + assert np.isnan(score) diff --git a/python/xorbits/_mars/learn/metrics/tests/test_scorer.py b/python/xorbits/_mars/learn/metrics/tests/test_scorer.py new file mode 100644 index 000000000..3f30691de --- /dev/null +++ b/python/xorbits/_mars/learn/metrics/tests/test_scorer.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sklearn.metrics import r2_score + +from .. import get_scorer + + +def test_get_scorer(): + with pytest.raises(ValueError): + get_scorer("unknown") + + assert get_scorer("r2") is not None + assert get_scorer(r2_score) is not None diff --git a/python/xorbits/_mars/learn/model_selection/__init__.py b/python/xorbits/_mars/learn/model_selection/__init__.py new file mode 100644 index 000000000..2df377453 --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._split import KFold, train_test_split diff --git a/python/xorbits/_mars/learn/model_selection/_split.py b/python/xorbits/_mars/learn/model_selection/_split.py new file mode 100644 index 000000000..faf31984c --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/_split.py @@ -0,0 +1,459 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +from abc import ABCMeta, abstractmethod +from itertools import chain +from math import ceil, floor + +import numpy as np + +from ... import tensor as mt +from ...core import ExecutableTuple +from ...tensor.utils import check_random_state +from ..utils import shuffle as shuffle_arrays +from ..utils.validation import _num_samples, indexable + + +def train_test_split(*arrays, **options): + """Split arrays or matrices into random train and test subsets + + Parameters + ---------- + *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse + matrices or pandas dataframes. + + test_size : float, int or None, optional (default=None) + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.25. + + train_size : float, int, or None, (default=None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + shuffle : boolean, optional (default=True) + Whether or not to shuffle the data before splitting. If shuffle=False + then stratify must be None. + + stratify : array-like or None (default=None) + If not None, data is split in a stratified fashion, using this as + the class labels. + + Returns + ------- + splitting : list, length=2 * len(arrays) + List containing train-test split of inputs. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.model_selection import train_test_split + >>> X, y = mt.arange(10).reshape((5, 2)), range(5) + >>> X.execute() + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(y) + [0, 1, 2, 3, 4] + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... 
+ >>> X_train.execute() + array([[8, 9], + [0, 1], + [4, 5]]) + >>> y_train.execute() + array([4, 0, 2]) + >>> X_test.execute() + array([[2, 3], + [6, 7]]) + >>> y_test.execute() + array([1, 3]) + + >>> train_test_split(y, shuffle=False) + [array([0, 1, 2]), array([3, 4])] + + """ + + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + test_size = options.pop("test_size", None) + train_size = options.pop("train_size", None) + random_state = options.pop("random_state", None) + stratify = options.pop("stratify", None) + shuffle = options.pop("shuffle", True) + session = options.pop("session", None) + run_kwargs = options.pop("run_kwargs", None) + + if options: + raise TypeError(f"Invalid parameters passed: {options}") + + arrays = indexable(*arrays, session=session, run_kwargs=run_kwargs) + + n_samples = _num_samples(arrays[0]) + n_train, n_test = _validate_shuffle_split( + n_samples, test_size, train_size, default_test_size=0.25 + ) + + if shuffle is False: + if stratify is not None: # pragma: no cover + raise ValueError( + "Stratified train/test split is not implemented for shuffle=False" + ) + + iterables = ((a[:n_train], a[n_train : n_train + n_test]) for a in arrays) + else: + if stratify is not None: # pragma: no cover + raise NotImplementedError("stratify is not implemented yet") + else: + shuffled_arrays = shuffle_arrays(*arrays, random_state=random_state) + if not isinstance(shuffled_arrays, tuple): + shuffled_arrays = (shuffled_arrays,) + iterables = ( + (a[:n_train], a[n_train : n_train + n_test]) for a in shuffled_arrays + ) + + return list( + ExecutableTuple(chain.from_iterable(iterables)).execute( + session=session, **(run_kwargs or dict()) + ) + ) + + +def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): + """ + Validation helper to check if the test/test sizes are meaningful wrt to the + size of the data (n_samples) + """ + if test_size is None and train_size is None: + test_size = default_test_size + + test_size_type = np.asarray(test_size).dtype.kind + train_size_type = np.asarray(train_size).dtype.kind + + if ( + test_size_type == "i" + and (test_size >= n_samples or test_size <= 0) + or test_size_type == "f" + and (test_size <= 0 or test_size >= 1) + ): + raise ValueError( + f"test_size={test_size} should be either positive and smaller" + f" than the number of samples {n_samples} or a float in the " + "(0, 1) range" + ) + + if ( + train_size_type == "i" + and (train_size >= n_samples or train_size <= 0) + or train_size_type == "f" + and (train_size <= 0 or train_size >= 1) + ): + raise ValueError( + f"train_size={train_size} should be either positive and smaller" + f" than the number of samples {n_samples} or a float in the " + "(0, 1) range" + ) + + if train_size is not None and train_size_type not in ("i", "f"): # pragma: no cover + raise ValueError(f"Invalid value for train_size: {train_size}") + if test_size is not None and test_size_type not in ("i", "f"): + raise ValueError(f"Invalid value for test_size: {test_size}") + + if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1: + raise ValueError( + f"The sum of test_size and train_size = {train_size + test_size}, " + "should be in the (0, 1) range. Reduce test_size and/or train_size." 
+ ) + + if test_size_type == "f": + n_test = ceil(test_size * n_samples) + elif test_size_type == "i": + n_test = float(test_size) + + if train_size_type == "f": + n_train = floor(train_size * n_samples) + elif train_size_type == "i": # pragma: no cover + n_train = float(train_size) + + if train_size is None: + n_train = n_samples - n_test + elif test_size is None: + n_test = n_samples - n_train + + if n_train + n_test > n_samples: # pragma: no cover + raise ValueError( + f"The sum of train_size and test_size = {n_train + n_test}, " + f"should be smaller than the number of samples {n_samples}. " + "Reduce test_size and/or train_size." + ) + + n_train, n_test = int(n_train), int(n_test) + + if n_train == 0: # pragma: no cover + raise ValueError( + f"With n_samples={n_samples}, test_size={test_size} and " + f"train_size={train_size}, the resulting train set will " + f"be empty. Adjust any of the aforementioned parameters." + ) + + return n_train, n_test + + +class BaseCrossValidator(metaclass=ABCMeta): + """Base class for all cross-validators + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + """ + + def split(self, X, y=None, groups=None): # pragma: no cover + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + indices = mt.arange(_num_samples(X)) + for test_index in self._iter_test_masks(X, y, groups): + train_index = indices[mt.logical_not(test_index)] + test_index = indices[test_index] + yield train_index, test_index + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, X=None, y=None, groups=None): # pragma: no cover + """Generates boolean masks corresponding to test sets. + + By default, delegates to _iter_test_indices(X, y, groups) + """ + for test_index in self._iter_test_indices(X, y, groups): + test_mask = mt.zeros(_num_samples(X), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, X=None, y=None, groups=None): # pragma: no cover + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + @abstractmethod + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator""" + + +class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): + """Base class for KFold, GroupKFold, and StratifiedKFold""" + + @abstractmethod + def __init__(self, n_splits, *, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): + raise ValueError( + "The number of folds must be of Integral type. " + "%s of type %s was passed." 
% (n_splits, type(n_splits)) + ) + n_splits = int(n_splits) + + if n_splits <= 1: + raise ValueError( + "k-fold cross-validation requires at least one" + " train/test split by setting n_splits=2 or more," + " got n_splits={0}.".format(n_splits) + ) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False; got {0}".format(shuffle)) + + if not shuffle and random_state is not None: # None is the default + raise ValueError( + "Setting a random_state has no effect since shuffle is " + "False. You should leave " + "random_state to its default (None), or set shuffle=True.", + ) + + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +class KFold(_BaseKFold): + """K-Folds cross-validator + + Provides train/test indices to split data in train/test sets. Split + dataset into k consecutive folds (without shuffling by default). + + Each fold is then used once as a validation while the k - 1 remaining + folds form the training set. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the data before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.model_selection import KFold + >>> X = mt.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = mt.array([1, 2, 3, 4]) + >>> kf = KFold(n_splits=2) + >>> kf.get_n_splits(X) + 2 + >>> print(kf) + KFold(n_splits=2, random_state=None, shuffle=False) + >>> for train_index, test_index in kf.split(X): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [2 3] TEST: [0 1] + TRAIN: [0 1] TEST: [2 3] + + Notes + ----- + The first ``n_samples % n_splits`` folds have size + ``n_samples // n_splits + 1``, other folds have size + ``n_samples // n_splits``, where ``n_samples`` is the number of samples. + + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + StratifiedKFold : Takes group information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). + + GroupKFold : K-fold iterator variant with non-overlapping groups. + + RepeatedKFold : Repeats K-Fold n times. 
+ """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + if self.n_splits > n_samples: + raise ValueError( + ( + "Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}." + ).format(self.n_splits, n_samples) + ) + + indices = mt.arange(n_samples) + if self.shuffle: + check_random_state(self.random_state).shuffle(indices) + + n_splits = self.n_splits + fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) + fold_sizes[: n_samples % n_splits] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + train_index = mt.concatenate([indices[:start], indices[stop:]]) + yield train_index, indices[start:stop] + current = stop diff --git a/python/xorbits/_mars/learn/model_selection/tests/__init__.py b/python/xorbits/_mars/learn/model_selection/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/model_selection/tests/test_split.py b/python/xorbits/_mars/learn/model_selection/tests/test_split.py new file mode 100644 index 000000000..ccdf99370 --- /dev/null +++ b/python/xorbits/_mars/learn/model_selection/tests/test_split.py @@ -0,0 +1,317 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pandas as pd +import pytest + +try: + import scipy.sparse as sps +except ImportError: # pragma: no cover + sps = None + +from .... import dataframe as md +from .... 
import tensor as mt +from ....dataframe.core import DATAFRAME_TYPE +from ....lib.sparse import SparseNDArray +from ...utils.validation import _num_samples +from .. import KFold, train_test_split + + +def test_train_test_split_errors(setup): + pytest.raises(ValueError, train_test_split) + + pytest.raises(ValueError, train_test_split, range(3), train_size=1.1) + + pytest.raises(ValueError, train_test_split, range(3), test_size=0.6, train_size=0.6) + pytest.raises( + ValueError, + train_test_split, + range(3), + test_size=np.float32(0.6), + train_size=np.float32(0.6), + ) + pytest.raises(ValueError, train_test_split, range(3), test_size="wrong_type") + pytest.raises(ValueError, train_test_split, range(3), test_size=2, train_size=4) + pytest.raises(TypeError, train_test_split, range(3), some_argument=1.1) + pytest.raises(ValueError, train_test_split, range(3), range(42)) + pytest.raises(ValueError, train_test_split, range(10), shuffle=False, stratify=True) + + with pytest.raises( + ValueError, + match=r"train_size=11 should be either positive and " + r"smaller than the number of samples 10 or a " + r"float in the \(0, 1\) range", + ): + train_test_split(range(10), train_size=11, test_size=1) + + +def test_train_test_split_invalid_sizes1(setup): + for train_size, test_size in [ + (1.2, 0.8), + (1.0, 0.8), + (0.0, 0.8), + (-0.2, 0.8), + (0.8, 1.2), + (0.8, 1.0), + (0.8, 0.0), + (0.8, -0.2), + ]: + with pytest.raises(ValueError, match=r"should be .* in the \(0, 1\) range"): + train_test_split(range(10), train_size=train_size, test_size=test_size) + + +def test_train_test_split_invalid_sizes2(setup): + for train_size, test_size in [ + (-10, 0.8), + (0, 0.8), + (11, 0.8), + (0.8, -10), + (0.8, 0), + (0.8, 11), + ]: + with pytest.raises(ValueError, match=r"should be .* in the \(0, 1\) range"): + train_test_split(range(10), train_size=train_size, test_size=test_size) + + +def test_train_test_split(setup): + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + # simple test + split = train_test_split(X, y, test_size=None, train_size=0.5) + X_train, X_test, y_train, y_test = split + assert len(y_test) == len(y_train) + # test correspondence of X and y + np.testing.assert_array_equal(X_train[:, 0], y_train * 10) + np.testing.assert_array_equal(X_test[:, 0], y_test * 10) + + # allow nd-arrays + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + split = train_test_split(X_4d, y_3d) + assert split[0].shape == (7, 5, 3, 2) + assert split[1].shape == (3, 5, 3, 2) + assert split[2].shape == (7, 7, 11) + assert split[3].shape == (3, 7, 11) + + # test unshuffled split + y = np.arange(10) + for test_size in [2, 0.2]: + train, test = train_test_split(y, shuffle=False, test_size=test_size) + np.testing.assert_array_equal(test, [8, 9]) + np.testing.assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7]) + + +def test_train_test_split_dataframe(setup): + X = np.ones(10) + types = [pd.DataFrame, md.DataFrame] + for InputFeatureType in types: + # X dataframe + X_df = InputFeatureType(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, DATAFRAME_TYPE) + assert isinstance(X_test, DATAFRAME_TYPE) + + +@pytest.mark.skipif(sps is None, reason="scipy not installed") +def test_train_test_split_sparse(setup): + # check that train_test_split converts scipy sparse matrices + # to csr, as stated in the documentation + X = np.arange(100).reshape((10, 10)) + sparse_types = [sps.csr_matrix, sps.csc_matrix, sps.coo_matrix] + for InputFeatureType in 
sparse_types:
+        X_s = InputFeatureType(X)
+        for x in (X_s, mt.tensor(X_s, chunk_size=(2, 5))):
+            X_train, X_test = train_test_split(x)
+            assert isinstance(X_train.fetch(), SparseNDArray)
+            assert isinstance(X_test.fetch(), SparseNDArray)
+
+
+def test_train_test_split_list_input(setup):
+    # Check that when y is a list / list of string labels, it works.
+    X = np.ones(7)
+    y1 = ["1"] * 4 + ["0"] * 3
+    y2 = np.hstack((np.ones(4), np.zeros(3)))
+    y3 = y2.tolist()
+
+    for stratify in (False,):
+        X_train1, X_test1, y_train1, y_test1 = train_test_split(
+            X, y1, stratify=y1 if stratify else None, random_state=0
+        )
+        X_train2, X_test2, y_train2, y_test2 = train_test_split(
+            X, y2, stratify=y2 if stratify else None, random_state=0
+        )
+        X_train3, X_test3, y_train3, y_test3 = train_test_split(
+            X, y3, stratify=y3 if stratify else None, random_state=0
+        )
+
+        np.testing.assert_equal(X_train1, X_train2)
+        np.testing.assert_equal(y_train2, y_train3)
+        np.testing.assert_equal(X_test1, X_test3)
+        np.testing.assert_equal(y_test3, y_test2)
+
+
+def test_mixed_input_type_train_test_split(setup):
+    rs = np.random.RandomState(0)
+    df_raw = pd.DataFrame(rs.rand(10, 4))
+    df = md.DataFrame(df_raw, chunk_size=5)
+    X, y = df.iloc[:, :-1], df.iloc[:, -1]
+
+    for x_to_tensor, y_to_tensor in itertools.product(range(1), range(1)):
+        x = X
+        if x_to_tensor:
+            x = mt.tensor(x)
+        yy = y
+        if y_to_tensor:
+            yy = mt.tensor(yy)
+
+        x_train, x_test, y_train, y_test = train_test_split(
+            x, y, random_state=0, run_kwargs={"extra_config": {"check_nsplits": False}}
+        )
+        assert isinstance(x_train, type(x))
+        assert isinstance(x_test, type(x))
+        assert isinstance(y_train, type(yy))
+        assert isinstance(y_test, type(yy))
+
+
+def test_kfold_valueerrors():
+    X1 = np.array([[1, 2], [3, 4], [5, 6]])
+    # Check that errors are raised if there are not enough samples
+    with pytest.raises(ValueError):
+        next(KFold(4).split(X1))
+
+    # Error when number of folds is <= 1
+    with pytest.raises(ValueError):
+        KFold(0)
+    with pytest.raises(ValueError):
+        KFold(1)
+
+    # When n_splits is not integer:
+    with pytest.raises(ValueError):
+        KFold(1.5)
+    with pytest.raises(ValueError):
+        KFold(2.0)
+
+    # When shuffle is not a bool:
+    with pytest.raises(TypeError):
+        KFold(n_splits=4, shuffle=None)
+
+
+def check_valid_split(train, test, n_samples=None):
+    # Use python sets to get more informative assertion failure messages
+    train = train.execute().to_numpy()
+    test = test.execute().to_numpy()
+    train, test = set(train), set(test)
+
+    # Train and test split should not overlap
+    assert train.intersection(test) == set()
+
+    if n_samples is not None:
+        # Check that the union of train and test split covers all the indices
+        assert train.union(test) == set(range(n_samples))
+
+
+def check_cv_coverage(cv, X, y, groups, expected_n_splits):
+    n_samples = _num_samples(X)
+    # Check that all the samples appear at least once in a test fold
+    assert cv.get_n_splits(X, y, groups) == expected_n_splits
+
+    collected_test_samples = set()
+    iterations = 0
+    for train, test in cv.split(X, y, groups):
+        check_valid_split(train, test, n_samples=n_samples)
+        iterations += 1
+        collected_test_samples.update(test.execute().to_numpy())
+
+    # Check that the accumulated test samples cover the whole dataset
+    assert iterations == expected_n_splits
+    if n_samples is not None:
+        assert collected_test_samples == set(range(n_samples))
+
+
+def test_kfold_indices(setup):
+    # Check all indices are returned in the test folds
+    X1 = np.ones(18)
+    kf = KFold(3)
+    check_cv_coverage(kf, X1, 
y=None, groups=None, expected_n_splits=3) + + # Check all indices are returned in the test folds even when equal-sized + # folds are not possible + X2 = np.ones(17) + kf = KFold(3) + check_cv_coverage(kf, X2, y=None, groups=None, expected_n_splits=3) + + # Check if get_n_splits returns the number of folds + assert 5 == KFold(5).get_n_splits(X2) + + +def test_kfold_no_shuffle(setup): + # Manually check that KFold preserves the data ordering on toy datasets + X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + + splits = KFold(2).split(X2[:-1]) + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [0, 1]) + np.testing.assert_array_equal(train.execute().fetch(), [2, 3]) + + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [2, 3]) + np.testing.assert_array_equal(train.execute().fetch(), [0, 1]) + + splits = KFold(2).split(X2) + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [0, 1, 2]) + np.testing.assert_array_equal(train.execute().fetch(), [3, 4]) + + train, test = next(splits) + np.testing.assert_array_equal(test.execute().fetch(), [3, 4]) + np.testing.assert_array_equal(train.execute().fetch(), [0, 1, 2]) + + +def test_kfold_balance(setup): + # Check that KFold returns folds with balanced sizes + for i in range(11, 17): + kf = KFold(5).split(X=np.ones(i)) + sizes = [len(test) for _, test in kf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +def test_shuffle_kfold(setup): + # Check the indices are shuffled properly + kf = KFold(3) + kf2 = KFold(3, shuffle=True, random_state=0) + kf3 = KFold(3, shuffle=True, random_state=1) + + X = mt.ones(300) + + all_folds = np.zeros(300) + for (tr1, te1), (tr2, te2), (tr3, te3) in zip( + kf.split(X), kf2.split(X), kf3.split(X) + ): + for tr_a, tr_b in itertools.combinations((tr1, tr2, tr3), 2): + # Assert that there is no complete overlap + tr_a = tr_a.execute().fetch() + tr_b = tr_b.execute().fetch() + assert len(np.intersect1d(tr_a, tr_b)) != len(tr1) + + # Set all test indices in successive iterations of kf2 to 1 + all_folds[te2.execute().fetch()] = 1 + + # Check that all indices are returned in the different test folds + assert sum(all_folds) == 300 diff --git a/python/xorbits/_mars/learn/neighbors/__init__.py b/python/xorbits/_mars/learn/neighbors/__init__.py new file mode 100644 index 000000000..860298dbb --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
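For orientation, a minimal usage sketch of the `NearestNeighbors` estimator that this package exposes; it mirrors the doctest in `KNeighborsMixin.kneighbors` further below. It assumes the vendored package is importable as `xorbits._mars` and that a default session is available (the unit tests obtain one through the `setup` fixture):

    import numpy as np
    from xorbits._mars.learn.neighbors import NearestNeighbors

    samples = np.array([[0.0, 0.0, 0.0], [0.0, 0.5, 0.0], [1.0, 1.0, 0.5]])
    nn = NearestNeighbors(n_neighbors=1, algorithm="brute")
    nn.fit(samples)
    # kneighbors executes the graph and returns (distances, indices) tensors
    dist, ind = nn.kneighbors([[1.0, 1.0, 1.0]])
    print(dist.fetch(), ind.fetch())  # expected: [[0.5]] [[2]]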
+ +try: + from .unsupervised import NearestNeighbors +except ImportError: # pragma: no cover + pass + + +def register_op(): + from ._ball_tree import BallTree, BallTreeQuery + from ._kd_tree import KDTree, KDTreeQuery + + del BallTree, BallTreeQuery, KDTree, KDTreeQuery diff --git a/python/xorbits/_mars/learn/neighbors/_ball_tree.py b/python/xorbits/_mars/learn/neighbors/_ball_tree.py new file mode 100644 index 000000000..48b850e99 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_ball_tree.py @@ -0,0 +1,59 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from sklearn.neighbors import BallTree as SklearnBallTree +except ImportError: # pragma: no cover + SklearnBallTree = None + +from ... import opcodes as OperandDef +from ...utils import require_not_none +from .tree import TreeBase, TreeObject, TreeQueryBase + + +class BallTree(TreeObject): + pass + + +@require_not_none(SklearnBallTree) +class _BallTree(TreeBase): + _op_type_ = OperandDef.BALL_TREE_TRAIN + _tree_type = SklearnBallTree + + def __call__(self, a): + result = super().__call__(a) + return BallTree(result.data) + + +@require_not_none(SklearnBallTree) +class BallTreeQuery(TreeQueryBase): + _op_type_ = OperandDef.BALL_TREE_QUERY + _tree_type = SklearnBallTree + + +@require_not_none(SklearnBallTree) +def ball_tree_query(tree, data, n_neighbors, return_distance): + op = BallTreeQuery( + tree=tree, n_neighbors=n_neighbors, return_distance=return_distance + ) + ret = op(data) + if not return_distance: + return ret[0] + return ret + + +@require_not_none(SklearnBallTree) +def create_ball_tree(X, leaf_size, metric=None, **metric_params): + op = _BallTree(leaf_size=leaf_size, metric=metric, **metric_params) + return op(X) diff --git a/python/xorbits/_mars/learn/neighbors/_faiss.py b/python/xorbits/_mars/learn/neighbors/_faiss.py new file mode 100644 index 000000000..167c13c5e --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_faiss.py @@ -0,0 +1,806 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +import os +import tempfile +from enum import Enum + +import numpy as np + +try: + import faiss +except ImportError: # pragma: no cover + faiss = None + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + BoolField, + Int8Field, + Int32Field, + Int64Field, + KeyField, + StringField, +) +from ...tensor import tensor as astensor +from ...tensor.array_utils import as_same_device, device +from ...tensor.core import TensorOrder +from ...tensor.random import RandomState +from ...tensor.utils import check_random_state, gen_random_seeds +from ...utils import has_unknown_shape, require_not_none +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class MemoryRequirementGrade(Enum): + minimum = 0 + low = 1 + high = 2 + maximum = 3 + + +if faiss is not None: + METRIC_TO_FAISS_METRIC_TYPE = { + "l2": faiss.METRIC_L2, + "euclidean": faiss.METRIC_L2, + "innerproduct": faiss.METRIC_INNER_PRODUCT, + "cosine": faiss.METRIC_INNER_PRODUCT, + } +else: # pragma: no cover + METRIC_TO_FAISS_METRIC_TYPE = {} + + +@require_not_none(faiss) +class FaissBuildIndex(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.FAISS_BUILD_INDEX + + _input = KeyField("input") + _metric = StringField("metric") + _faiss_index = StringField("faiss_index") + _n_sample = Int64Field("n_sample") + _seed = Int32Field("seed") + _same_distribution = BoolField("same_distribution") + _accuracy = BoolField("accuracy") + _memory_require = Int8Field( + "memory_require", + on_serialize=operator.attrgetter("value"), + on_deserialize=MemoryRequirementGrade, + ) + + def __init__( + self, + metric=None, + faiss_index=None, + n_sample=None, + seed=None, + same_distribution=None, + accuracy=None, + memory_require=None, + output_types=None, + **kw, + ): + super().__init__( + _metric=metric, + _faiss_index=faiss_index, + _n_sample=n_sample, + _seed=seed, + _same_distribution=same_distribution, + _accuracy=accuracy, + _memory_require=memory_require, + _output_types=output_types, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def input(self): + return self._input + + @property + def metric(self): + return self._metric + + @property + def faiss_metric_type(self): + return METRIC_TO_FAISS_METRIC_TYPE[self._metric] + + @property + def faiss_index(self): + return self._faiss_index + + @property + def n_sample(self): + return self._n_sample + + @property + def seed(self): + return self._seed + + @property + def same_distribution(self): + return self._same_distribution + + @property + def accuracy(self): + return self._accuracy + + @property + def memory_require(self): + return self._memory_require + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, X): + return self.new_tileable([X]) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + in_tensor = yield from recursive_tile(astensor(op.input, np.dtype(np.float32))) + if op.faiss_index == "auto": + faiss_index, n_sample = _gen_index_string_and_sample_count( + in_tensor.shape, + op.n_sample, + op.accuracy, + op.memory_require, + gpu=op.gpu, + **op.extra_params, + ) + op._n_sample = n_sample + else: + faiss_index, n_sample = op.faiss_index, op.n_sample + + if len(in_tensor.chunks) == 1: + return cls._tile_one_chunk(op, faiss_index, n_sample) + + if in_tensor.chunk_shape[1] != 1: + # make sure axis 1 has 1 chunk + in_tensor = yield from recursive_tile( + in_tensor.rechunk({1: in_tensor.shape[1]}) + ) + return (yield from cls._tile_chunks(op, in_tensor, faiss_index, n_sample)) + + 
@classmethod
+    def _tile_one_chunk(cls, op, faiss_index, n_sample):
+        in_chunk = op.input.chunks[0]
+        chunk_op = op.copy().reset_key()
+        chunk_op._faiss_index = faiss_index
+        chunk_op._n_sample = n_sample
+        chunk = chunk_op.new_chunk([in_chunk], index=in_chunk.index)
+
+        new_op = op.copy()
+        kw = op.outputs[0].params
+        kw["chunks"] = [chunk]
+        kw["nsplits"] = ((1,),)
+        return new_op.new_tileables(op.inputs, kws=[kw])
+
+    @classmethod
+    def _tile_chunks(cls, op, in_tensor, faiss_index, n_sample):
+        """
+        If the distribution on each chunk is the same,
+        refer to:
+        https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-distribute-index-building-on-several-machines
+
+        1. train an IndexIVF* on a representative sample of the data, store it.
+        2. for each node, load the trained index, add the local data to it, store the resulting populated index
+        3. on a central node, load all the populated indexes and merge them.
+        """
+        faiss_index_ = faiss.index_factory(
+            in_tensor.shape[1], faiss_index, op.faiss_metric_type
+        )
+        # Train on sample data when two conditions are met:
+        # 1. the index type requires training, e.g. Flat does not
+        # 2. distributions of chunks are the same; if not,
+        #    train separately on each chunk's data
+        need_sample_train = not faiss_index_.is_trained and op.same_distribution
+        need_merge_index = (
+            hasattr(faiss_index_, "merge_from") if need_sample_train else False
+        )
+
+        train_chunk = None
+        if need_sample_train:
+            # sample data to train
+            rs = RandomState(op.seed)
+            sampled_index = rs.choice(
+                in_tensor.shape[0], size=n_sample, replace=False, chunk_size=n_sample
+            )
+            sample_tensor = yield from recursive_tile(in_tensor[sampled_index])
+            assert len(sample_tensor.chunks) == 1
+            sample_chunk = sample_tensor.chunks[0]
+            train_op = FaissTrainSampledIndex(faiss_index=faiss_index, metric=op.metric)
+            train_chunk = train_op.new_chunk([sample_chunk])
+        elif op.gpu:  # pragma: no cover
+            # if no training is needed and running on gpu, just merge data into one chunk
+            in_tensor = yield from recursive_tile(in_tensor.rechunk(in_tensor.shape))
+
+        # build index for each input chunk
+        build_index_chunks = []
+        for i, chunk in enumerate(in_tensor.chunks):
+            build_index_op = op.copy().reset_key()
+            build_index_op.stage = OperandStage.map
+            build_index_op._faiss_index = faiss_index
+            if train_chunk is not None:
+                build_index_chunk = build_index_op.new_chunk(
+                    [chunk, train_chunk], index=(i,)
+                )
+            else:
+                build_index_chunk = build_index_op.new_chunk([chunk], index=(i,))
+            build_index_chunks.append(build_index_chunk)
+
+        out_chunks = []
+        if need_merge_index:
+            assert op.n_sample is not None
+            # merge all indices into one; do this only when trained on sample data
+            out_chunk_op = op.copy().reset_key()
+            out_chunk_op._faiss_index = faiss_index
+            out_chunk_op.stage = OperandStage.agg
+            out_chunk = out_chunk_op.new_chunk(build_index_chunks, index=(0,))
+            out_chunks.append(out_chunk)
+        else:
+            out_chunks.extend(build_index_chunks)
+
+        new_op = op.copy()
+        return new_op.new_tileables(
+            op.inputs, chunks=out_chunks, nsplits=((len(out_chunks),),)
+        )
+
+    @classmethod
+    def _execute_one_chunk(cls, ctx, op):
+        (inp,), device_id, xp = as_same_device(
+            [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True
+        )
+
+        with device(device_id):
+            inp = inp.astype(np.float32, copy=False)
+            # create index
+            index = faiss.index_factory(
+                inp.shape[1], op.faiss_index, op.faiss_metric_type
+            )
+            # GPU
+            if device_id >= 0:  # pragma: no cover
+                index = _index_to_gpu(index, device_id)
+
+            # train 
index
+            if not index.is_trained:
+                assert op.n_sample is not None
+                sample_indices = xp.random.choice(
+                    inp.shape[0], size=op.n_sample, replace=False
+                )
+                sampled = inp[sample_indices]
+                index.train(sampled)
+
+            if op.metric == "cosine":
+                # faiss does not support cosine distances directly,
+                # data needs to be normalized before adding to the index,
+                # refer to:
+                # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
+                faiss.normalize_L2(inp)
+            # add vectors to index
+            if device_id >= 0:  # pragma: no cover
+                # gpu
+                index.add_c(inp.shape[0], _swig_ptr_from_cupy_float32_array(inp))
+            else:
+                index.add(inp)
+
+            ctx[op.outputs[0].key] = _store_index(index, device_id)
+
+    @classmethod
+    def _execute_map(cls, ctx, op):
+        (data,), device_id, xp = as_same_device(
+            [ctx[op.inputs[0].key]], device=op.device, ret_extra=True
+        )
+        index = ctx[op.inputs[1].key] if len(op.inputs) == 2 else None
+
+        with device(device_id):
+            data = xp.ascontiguousarray(data)
+            if index is not None:
+                # fetch the trained index
+                trained_index = _load_index(index, device_id)
+            else:
+                trained_index = faiss.index_factory(
+                    data.shape[1], op.faiss_index, op.faiss_metric_type
+                )
+                if op.same_distribution:
+                    # no need to train, just create index
+                    pass
+                else:
+                    # distributions are not the same, train on each chunk
+                    trained_index.train(data)
+
+            if device_id >= 0:  # pragma: no cover
+                trained_index = _index_to_gpu(trained_index, device_id)
+            if op.metric == "cosine":
+                # faiss does not support cosine distances directly,
+                # data needs to be normalized before adding to the index,
+                # refer to:
+                # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
+                faiss.normalize_L2(data)
+
+            # add data into index
+            if device_id >= 0:  # pragma: no cover
+                # gpu
+                trained_index.add_c(
+                    data.shape[0], _swig_ptr_from_cupy_float32_array(data)
+                )
+            else:
+                trained_index.add(data)
+
+            ctx[op.outputs[0].key] = _store_index(trained_index, device_id)
+
+    @classmethod
+    def _execute_agg(cls, ctx, op):
+        device_id = op.device
+        if device_id is None:
+            device_id = -1
+        inputs = [ctx[inp.key] for inp in op.inputs]
+
+        with device(device_id):
+            merged_index = None
+            indexes = []
+            for index in inputs:
+                index = _load_index(index, device_id)
+                indexes.append(index)
+                assert hasattr(index, "merge_from")
+                if merged_index is None:
+                    merged_index = index
+                else:
+                    merged_index.merge_from(index, index.ntotal)
+
+            ctx[op.outputs[0].key] = _store_index(merged_index, device_id)
+
+    @classmethod
+    def execute(cls, ctx, op):
+        if op.stage == OperandStage.map:
+            cls._execute_map(ctx, op)
+        elif op.stage == OperandStage.agg:
+            cls._execute_agg(ctx, op)
+        else:
+            assert op.stage is None
+            cls._execute_one_chunk(ctx, op)
+
+
+def _store_index(index, device_id):
+    if device_id >= 0:  # pragma: no cover
+        # for gpu, convert to cpu first
+        index = faiss.index_gpu_to_cpu(index)
+    # distributed, save to file, then return in-memory bytes
+    fn = tempfile.mkstemp(".index", prefix="faiss_")[1]
+    faiss.write_index(index, fn)
+    try:
+        with open(fn, "rb") as f:
+            return f.read()
+    finally:
+        os.remove(fn)
+
+
+def _load_index(index, device_id):
+    # distributed
+    fn = tempfile.mkstemp(".index", prefix="faiss_")[1]
+    with open(fn, "wb") as f:
+        f.write(index)
+    index = faiss.read_index(f.name)
+    if device_id >= 0:  # pragma: no cover
+        index = _index_to_gpu(index, device_id)
+    return index
+
+
+def _index_to_gpu(index, device_id):  # pragma: no cover
+    res = faiss.StandardGpuResources()
+    return 
faiss.index_cpu_to_gpu(res, device_id, index) + + +def _swig_ptr_from_cupy_float32_array(x): # pragma: no cover + assert x.flags.c_contiguous + assert x.dtype == np.float32 + data_ptr = x.__cuda_array_interface__["data"][0] + return faiss.cast_integer_to_float_ptr(data_ptr) + + +def _swig_ptr_from_cupy_int64_array(x): # pragma: no cover + assert x.flags.c_contiguous + assert x.dtype == np.int64 + data_ptr = x.__cuda_array_interface__["data"][0] + return faiss.cast_integer_to_long_ptr(data_ptr) + + +@require_not_none(faiss) +class FaissTrainSampledIndex(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.FAISS_TRAIN_SAMPLED_INDEX + + _input = KeyField("input") + _metric = StringField("metric") + _faiss_index = StringField("faiss_index") + + def __init__(self, faiss_index=None, metric=None, output_types=None, **kw): + super().__init__( + _faiss_index=faiss_index, _metric=metric, _output_types=output_types, **kw + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def input(self): + return self._input + + @property + def metric(self): + return self._metric + + @property + def faiss_metric_type(self): + return METRIC_TO_FAISS_METRIC_TYPE[self.metric] + + @property + def faiss_index(self): + return self._faiss_index + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def execute(cls, ctx, op): + (data,), device_id, _ = as_same_device( + [ctx[op.input.key]], device=op.device, ret_extra=True + ) + + with device(device_id): + index = faiss.index_factory( + data.shape[1], op.faiss_index, op.faiss_metric_type + ) + + if device_id >= 0: # pragma: no cover + # GPU + index = _index_to_gpu(index, device_id) + index.train_c(data.shape[0], _swig_ptr_from_cupy_float32_array(data)) + else: + index.train(data) + + ctx[op.outputs[0].key] = _store_index(index, device_id) + + +def _gen_index_string_and_sample_count( + shape, n_sample, accuracy, memory_require, gpu=None, **kw +): + """ + Generate index string and sample count according to guidance of faiss: + https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + """ + size, dim = shape + memory_require = _get_memory_require(memory_require) + + if accuracy or size < 10**5: + # Flat is the only index that guarantees exact results + # no need to train, thus sample count is None + return "Flat", None + + if memory_require == MemoryRequirementGrade.maximum and not gpu: + x = kw.get("M", 32) # get medium number by default + if x < 4 or x > 64: + raise ValueError(f"HNSWx requires M that between 4 and 64, got {x}") + return f"HNSW{x}", None + + if memory_require in (MemoryRequirementGrade.high, MemoryRequirementGrade.maximum): + basement = "{},Flat" + elif memory_require == MemoryRequirementGrade.low: + x = kw.get("dim", dim // 2) + basement = f"PCAR{x},{{}},SQ8" + elif memory_require == MemoryRequirementGrade.minimum: + x = kw.get("M", min(64, dim // 2)) + if x > 64: + raise ValueError(f"PQx requires M <= 64, got {x}") + y = kw.get("dim", None) + if y is not None and y % x != 0: + raise ValueError( + f"OPQx_y requires dim is a multiple of M({x}), got dim: {y}" + ) + y = min(dim, 4 * x) + y = x * (y // x) # make sure y is a multiple of x + basement = f"OPQ{x}_{y},{{}},PQ{x}" + else: # pragma: no cover + raise ValueError("unknown memory require") + + # now choose the clustering options + if size < 10**6 or (size < 10**7 and gpu): + # < 1M, or <10M but need GPU + k = kw.get("k", 5 * int(np.sqrt(size))) + if k < 4 * int(np.sqrt(size)) or 
k > 16 * int(np.sqrt(size)): + raise ValueError( + f"k should be between 4 * sqrt(N) and 16 * sqrt(N), got {k}" + ) + index_str = basement.format(f"IVF{k}") + if n_sample is None: + # 30 * k - 256 * k + n_sample = min(30 * k, size) + elif size < 10**7 and not gpu: + # 1M - 10M + index_str = basement.format("IVF65536_HNSW32") + if n_sample is None: + # between 30 * 65536 and 256 * 65536 + n_sample = 32 * 65536 + elif size < 10**8: + index_str = basement.format("IVF65536_HNSW32") + n_sample = 64 * 65536 if n_sample is None else n_sample + else: + index_str = basement.format("IVF1048576_HNSW32") + n_sample = 64 * 65536 if n_sample is None else n_sample + + return index_str, n_sample + + +def _get_memory_require(memory_require): + if isinstance(memory_require, str): + return getattr(MemoryRequirementGrade, memory_require) + elif isinstance(memory_require, MemoryRequirementGrade): + return memory_require + return MemoryRequirementGrade(memory_require) + + +@require_not_none(faiss) +def build_faiss_index( + X, + index_name="auto", + n_sample=None, + metric="euclidean", + random_state=None, + same_distribution=True, + accuracy=False, + memory_require=None, + **kw, +): + X = astensor(X) + + if metric not in METRIC_TO_FAISS_METRIC_TYPE: + raise ValueError(f"unknown metric: {metric}") + if index_name != "auto": + try: + faiss.index_factory( + X.shape[1], index_name, METRIC_TO_FAISS_METRIC_TYPE[metric] + ) + except RuntimeError: + raise ValueError(f"illegal faiss index: {index_name}") + + rs = check_random_state(random_state) + if isinstance(rs, RandomState): + rs = rs.to_numpy() + seed = gen_random_seeds(1, rs)[0] + if memory_require is None: + memory_require = MemoryRequirementGrade.low + else: + memory_require = _get_memory_require(memory_require) + op = FaissBuildIndex( + faiss_index=index_name, + metric=metric, + n_sample=n_sample, + gpu=X.op.gpu, + seed=seed, + same_distribution=same_distribution, + accuracy=accuracy, + memory_require=memory_require, + **kw, + ) + return op(X) + + +class FaissQuery(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.FAISS_QUERY + + _input = KeyField("input") + _faiss_index = KeyField("faiss_index") + _metric = StringField("metric") + _n_neighbors = Int32Field("n_neighbors") + _return_distance = BoolField("return_distance") + _nprobe = Int64Field("nprobe") + + def __init__( + self, + faiss_index=None, + metric=None, + n_neighbors=None, + return_distance=None, + nprobe=None, + output_types=None, + gpu=None, + **kw, + ): + super().__init__( + _faiss_index=faiss_index, + _n_neighbors=n_neighbors, + _metric=metric, + _return_distance=return_distance, + _output_types=output_types, + _nprobe=nprobe, + gpu=gpu, + **kw, + ) + if self.output_types is None: + self.output_types = [OutputType.tensor] * self.output_limit + + @property + def input(self): + return self._input + + @property + def faiss_index(self): + return self._faiss_index + + @property + def metric(self): + return self._metric + + @property + def n_neighbors(self): + return self._n_neighbors + + @property + def nprobe(self): + return self._nprobe + + @property + def return_distance(self): + return self._return_distance + + @property + def output_limit(self): + return 2 if self._return_distance else 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if self._faiss_index is not None: + self._faiss_index = self._inputs[1] + + def __call__(self, y): + kws = [] + if self._return_distance: + kws.append( + { + "shape": (y.shape[0], self._n_neighbors), + 
"dtype": np.dtype(np.float32), + "order": TensorOrder.C_ORDER, + "type": "distance", + } + ) + kws.append( + { + "shape": (y.shape[0], self._n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "type": "indices", + } + ) + return self.new_tileables([y, self._faiss_index], kws=kws) + + @classmethod + def tile(cls, op): + in_tensor = astensor(op.input) + + if in_tensor.chunk_shape[1] != 1: + if has_unknown_shape(in_tensor): + yield + in_tensor = yield from recursive_tile( + in_tensor.rechunk({1: in_tensor.shape[1]}) + ) + + out_chunks = [], [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_kws = [] + if op.return_distance: + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.float32), + "order": TensorOrder.C_ORDER, + "index": chunk.index, + "type": "distance", + } + ) + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": chunk.index, + "type": "indices", + } + ) + in_chunks = [chunk] + in_chunks.extend(op.faiss_index.chunks) + chunks = chunk_op.new_chunks(in_chunks, kws=chunk_kws) + if op.return_distance: + out_chunks[0].append(chunks[0]) + out_chunks[1].append(chunks[-1]) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + if op.return_distance: + kws[0]["chunks"] = out_chunks[0] + kws[0]["nsplits"] = (in_tensor.nsplits[0], (op.n_neighbors,)) + kws[-1]["chunks"] = out_chunks[1] + kws[-1]["nsplits"] = (in_tensor.nsplits[0], (op.n_neighbors,)) + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + (y,), device_id, xp = as_same_device( + [ctx[op.input.key]], device=op.device, ret_extra=True + ) + indexes = [_load_index(ctx[index.key], device_id) for index in op.inputs[1:]] + + with device(device_id): + y = xp.ascontiguousarray(y, dtype=np.float32) + + if len(indexes) == 1: + index = indexes[0] + else: + index = faiss.IndexShards(indexes[0].d) + [index.add_shard(ind) for ind in indexes] + + if op.metric == "cosine": + # faiss does not support cosine distances directly, + # data needs to be normalize before searching, + # refer to: + # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance + faiss.normalize_L2(y) + + if op.nprobe is not None: + index.nprobe = op.nprobe + + if device_id >= 0: # pragma: no cover + n = y.shape[0] + k = op.n_neighbors + distances = xp.empty((n, k), dtype=xp.float32) + indices = xp.empty((n, k), dtype=xp.int64) + index.search_c( + n, + _swig_ptr_from_cupy_float32_array(y), + k, + _swig_ptr_from_cupy_float32_array(distances), + _swig_ptr_from_cupy_int64_array(indices), + ) + else: + distances, indices = index.search(y, op.n_neighbors) + if op.return_distance: + if index.metric_type == faiss.METRIC_L2: + # make it equivalent to `pairwise.euclidean_distances` + distances = xp.sqrt(distances, out=distances) + elif op.metric == "cosine": + # make it equivalent to `pairwise.cosine_distances` + distances = xp.subtract(1, distances, out=distances) + ctx[op.outputs[0].key] = distances + ctx[op.outputs[-1].key] = indices + + +@require_not_none(faiss) +def faiss_query(faiss_index, data, n_neighbors, return_distance=True, nprobe=None): + data = astensor(data) + op = FaissQuery( + faiss_index=faiss_index, + n_neighbors=n_neighbors, + metric=faiss_index.op.metric, + return_distance=return_distance, + nprobe=nprobe, + gpu=data.op.gpu, + ) + ret = op(data) + if not return_distance: + return ret[0] + 
return ret diff --git a/python/xorbits/_mars/learn/neighbors/_kd_tree.py b/python/xorbits/_mars/learn/neighbors/_kd_tree.py new file mode 100644 index 000000000..f810adf72 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_kd_tree.py @@ -0,0 +1,61 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from sklearn.neighbors import KDTree as SklearnKDTree +except ImportError: # pragma: no cover + SklearnKDTree = None + +from ... import opcodes as OperandDef +from ...utils import require_not_none +from .tree import TreeBase, TreeObject, TreeQueryBase + + +class KDTree(TreeObject): + pass + + +@require_not_none(SklearnKDTree) +class _KDTree(TreeBase): + _op_type_ = OperandDef.KD_TREE_TRAIN + _tree_type = SklearnKDTree + + def __call__(self, a): + result = super().__call__(a) + return KDTree(result.data) + + +@require_not_none(SklearnKDTree) +class KDTreeQuery(TreeQueryBase): + _op_type_ = OperandDef.KD_TREE_QUERY + _tree_type = SklearnKDTree + + +@require_not_none(SklearnKDTree) +def kd_tree_query(tree, data, n_neighbors, return_distance): + op = KDTreeQuery( + tree=tree, n_neighbors=n_neighbors, return_distance=return_distance + ) + ret = op(data) + if not return_distance: + return ret[0] + return ret + + +@require_not_none(SklearnKDTree) +def create_kd_tree(X, leaf_size, metric=None, **metric_params): + # kd_tree cannot accept callable metric + assert not callable(metric) + op = _KDTree(leaf_size=leaf_size, metric=metric, **metric_params) + return op(X) diff --git a/python/xorbits/_mars/learn/neighbors/_kneighbors_graph.py b/python/xorbits/_mars/learn/neighbors/_kneighbors_graph.py new file mode 100644 index 000000000..a5364524d --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_kneighbors_graph.py @@ -0,0 +1,134 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...lib.sparse.array import SparseNDArray, get_sparse_module +from ...serialization.serializables import Int64Field, KeyField +from ...tensor.array_utils import as_same_device, device +from ...tensor.core import TensorOrder +from ...tensor.utils import decide_unify_split +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class KNeighborsGraph(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.KNEIGHBORS_GRAPH + + _a_data = KeyField("a_data") + _a_ind = KeyField("a_ind") + _n_neighbors = Int64Field("n_neighbors") + + def __init__(self, a_data=None, a_ind=None, n_neighbors=None, **kw): + super().__init__(_a_data=a_data, _a_ind=a_ind, _n_neighbors=n_neighbors, **kw) + self.output_types = [OutputType.tensor] + + @property + def a_data(self): + return self._a_data + + @property + def a_ind(self): + return self._a_ind + + @property + def n_neighbors(self): + return self._n_neighbors + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._a_data is not None: + self._a_data = self._inputs[0] + self._a_ind = self._inputs[-1] + + def __call__(self, A_data, A_ind, shape): + inputs = [] + if A_data is not None: + inputs.append(A_data) + inputs.append(A_ind) + return self.new_tileable( + inputs, dtype=np.dtype(np.float64), shape=shape, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + A_data, A_ind = op.a_data, op.a_ind + out = op.outputs[0] + + shape1 = A_ind.shape[1] + if A_data is not None: + # mode == 'distance' + axis0_chunk_sizes = decide_unify_split(A_data.nsplits[0], A_ind.nsplits[0]) + A_data = yield from recursive_tile( + A_data.rechunk({0: axis0_chunk_sizes, 1: shape1}) + ) + A_ind = yield from recursive_tile( + A_ind.rechunk({0: axis0_chunk_sizes, 1: shape1}) + ) + else: + # mode == 'connectivity' + A_ind = yield from recursive_tile(A_ind.rechunk({1: shape1})) + + out_chunks = [] + for i, ind_c in enumerate(A_ind.chunks): + chunk_op = op.copy().reset_key() + chunk_inputs = [ind_c] + if A_data is not None: + data_c = A_data.cix[i, 0] + chunk_inputs.insert(0, data_c) + out_chunk = chunk_op.new_chunk( + chunk_inputs, + dtype=out.dtype, + shape=(ind_c.shape[0], out.shape[1]), + order=out.order, + index=(i, 0), + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + params = out.params + params["chunks"] = out_chunks + params["nsplits"] = (A_ind.nsplits[0], (out.shape[1],)) + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + out = op.outputs[0] + n_samples1, n_samples2 = out.shape + n_neighbors = op.n_neighbors + n_nonzero = n_samples1 * n_neighbors + + with device(device_id): + A_ind = inputs[-1] + A_indptr = xp.arange(0, n_nonzero + 1, n_neighbors) + + if op.a_data is None: + # mode == 'connectivity + A_data = xp.ones(n_samples1 * n_neighbors) + else: + # mode == 'distance' + A_data = xp.ravel(inputs[0]) + + xps = get_sparse_module(A_ind) + graph = xps.csr_matrix( + (A_data, A_ind.ravel(), A_indptr), shape=(n_samples1, n_samples2) + ) + ctx[out.key] = SparseNDArray(graph) diff --git a/python/xorbits/_mars/learn/neighbors/_proxima.py b/python/xorbits/_mars/learn/neighbors/_proxima.py new file mode 100644 index 000000000..7ead22086 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/_proxima.py @@ -0,0 
+1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..proxima.simple_index import build_index, search_index + +METRIC_TO_PROXIMA_METRIC_TYPE = { + "l2": "Euclidean", + "euclidean": "Euclidean", + "canberra": "Canberra", + "chebyshev": "Chebyshev", + "sqeuclidean": "SquaredEuclidean", + "innerproduct": "InnerProduct", + "manhattan": "Manhattan", +} + +build_proxima_index = build_index +proxima_query = search_index diff --git a/python/xorbits/_mars/learn/neighbors/base.py b/python/xorbits/_mars/learn/neighbors/base.py new file mode 100644 index 000000000..39f74a45b --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/base.py @@ -0,0 +1,575 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from abc import ABCMeta, abstractmethod + +import numpy as np +from sklearn.base import BaseEstimator, MultiOutputMixin + +from ... 
import tensor as mt +from ...tensor.reshape.reshape import _reshape as reshape_unchecked +from ..metrics import pairwise_distances_topk +from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..utils import check_array +from ..utils.validation import check_is_fitted +from ._ball_tree import SklearnBallTree, ball_tree_query, create_ball_tree +from ._faiss import METRIC_TO_FAISS_METRIC_TYPE, build_faiss_index, faiss_query +from ._kd_tree import SklearnKDTree, create_kd_tree, kd_tree_query +from ._kneighbors_graph import KNeighborsGraph +from ._proxima import METRIC_TO_PROXIMA_METRIC_TYPE, build_proxima_index, proxima_query + +VALID_METRICS = dict( + ball_tree=SklearnBallTree.valid_metrics, + kd_tree=SklearnKDTree.valid_metrics, + # The following list comes from the + # sklearn.metrics.pairwise doc string + brute=( + list(PAIRWISE_DISTANCE_FUNCTIONS.keys()) + + [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "kulsinski", + "mahalanobis", + "matching", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalmichener", + "sokalsneath", + "sqeuclidean", + "yule", + "wminkowski", + ] + ), + faiss=list(METRIC_TO_FAISS_METRIC_TYPE), + proxima=list(METRIC_TO_PROXIMA_METRIC_TYPE), +) + + +VALID_METRICS_SPARSE = dict( + ball_tree=[], kd_tree=[], brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - {"haversine"}) +) + + +class NeighborsBase(BaseEstimator, MultiOutputMixin, metaclass=ABCMeta): + """Base class for nearest neighbors estimators.""" + + @abstractmethod + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.metric_params = metric_params + self.p = p + self.n_jobs = n_jobs + self._check_algorithm_metric() + + def _check_algorithm_metric(self): + if self.algorithm not in [ + "auto", + "brute", + "kd_tree", + "ball_tree", + "faiss", + "proxima", + ]: + raise ValueError(f"unrecognized algorithm: '{self.algorithm}'") + + if self.algorithm == "auto": + if self.metric == "precomputed": + alg_check = "brute" + elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: + alg_check = "ball_tree" + else: + alg_check = "brute" + else: + alg_check = self.algorithm + + if callable(self.metric): + if self.algorithm == "kd_tree": + # callable metric is only valid for brute force and ball_tree + raise ValueError( + "kd_tree algorithm does not support callable metric '%s'" + % self.metric + ) + elif self.metric not in VALID_METRICS[alg_check]: + raise ValueError( + "Metric '%s' not valid. Use " + "sorted(sklearn.neighbors.VALID_METRICS['%s']) " + "to get valid options. " + "Metric can also be a callable function." % (self.metric, alg_check) + ) + + if self.metric_params is not None and "p" in self.metric_params: + warnings.warn( + "Parameter p is found in metric_params. 
" + "The corresponding parameter from __init__ " + "is ignored.", + SyntaxWarning, + stacklevel=3, + ) + effective_p = self.metric_params["p"] + else: + effective_p = self.p + + if self.metric in ["wminkowski", "minkowski"] and effective_p < 1: + raise ValueError("p must be greater than one for minkowski metric") + + def _fit(self, X, session=None, run_kwargs=None): + self._check_algorithm_metric() + if self.metric_params is None: + self.effective_metric_params_ = {} + else: + self.effective_metric_params_ = self.metric_params.copy() + + effective_p = self.effective_metric_params_.get("p", self.p) + if self.metric in ["wminkowski", "minkowski"]: + self.effective_metric_params_["p"] = effective_p + + self.effective_metric_ = self.metric + # For minkowski distance, use more efficient methods where available + if self.metric == "minkowski": + p = self.effective_metric_params_.pop("p", 2) + if p < 1: # pragma: no cover + raise ValueError("p must be greater than one for minkowski metric") + elif p == 1: + self.effective_metric_ = "manhattan" + elif p == 2: + self.effective_metric_ = "euclidean" + elif p == np.inf: + self.effective_metric_ = "chebyshev" + else: + self.effective_metric_params_["p"] = p + + if isinstance(X, NeighborsBase): + self._fit_X = X._fit_X + self._tree = X._tree + self._fit_method = X._fit_method + return self + + elif isinstance(X, SklearnBallTree): + self._fit_X = mt.tensor(X.data) + self._tree = X + self._fit_method = "ball_tree" + return self + + elif isinstance(X, SklearnKDTree): + self._fit_X = mt.tensor(X.data) + self._tree = X + self._fit_method = "kd_tree" + return self + + X = check_array(X, accept_sparse=True) + + if np.isnan(X.size): + # if X has unknown shape, execute it first + X.execute(session=session, **(run_kwargs or dict())) + + if X.issparse(): + if self.algorithm not in ("auto", "brute"): + warnings.warn("cannot use tree with sparse input: using brute force") + if self.effective_metric_ not in VALID_METRICS_SPARSE[ + "brute" + ] and not callable(self.effective_metric_): + raise ValueError( + "Metric '%s' not valid for sparse input. " + "Use sorted(sklearn.neighbors." + "VALID_METRICS_SPARSE['brute']) " + "to get valid options. " + "Metric can also be a callable function." 
% (self.effective_metric_) + ) + self._fit_X = X.copy() + self._tree = None + self._fit_method = "brute" + return self + + self._fit_method = self.algorithm + self._fit_X = X + + if self._fit_method == "auto": + # A tree approach is better for small number of neighbors, + # and KDTree is generally faster when available + if ( + self.n_neighbors is None or self.n_neighbors < self._fit_X.shape[0] // 2 + ) and self.metric != "precomputed": + if self.effective_metric_ in VALID_METRICS["kd_tree"]: + self._fit_method = "kd_tree" + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): + self._fit_method = "ball_tree" + else: + self._fit_method = "brute" + else: + self._fit_method = "brute" + + if self._fit_method == "ball_tree": + self._tree = tree = create_ball_tree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + tree.execute(session=session, **(run_kwargs or dict())) + elif self._fit_method == "kd_tree": + self._tree = tree = create_kd_tree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + tree.execute(session=session, **(run_kwargs or dict())) + elif self._fit_method == "brute": + self._tree = None + elif self._fit_method == "faiss": + faiss_index = build_faiss_index( + X, metric=self.effective_metric_, **self.effective_metric_params_ + ) + faiss_index.execute(session=session, **(run_kwargs or dict())) + self._faiss_index = faiss_index + elif self._fit_method == "proxima": # pragma: no cover + proxima_metric = METRIC_TO_PROXIMA_METRIC_TYPE[self.effective_metric_] + proxima_index = build_proxima_index( + X, + distance_metric=proxima_metric, + topk=self.n_neighbors, + session=session, + run_kwargs=run_kwargs, + **self.effective_metric_params_, + ) + self._proxima_index = proxima_index + else: # pragma: no cover + raise ValueError("algorithm = '%s' not recognized" % self.algorithm) + + if self.n_neighbors is not None: + if self.n_neighbors <= 0: + raise ValueError(f"Expected n_neighbors > 0. Got {self.n_neighbors}") + else: + if not np.issubdtype(type(self.n_neighbors), np.integer): + raise TypeError( + f"n_neighbors does not take {type(self.n_neighbors)} value, " + "enter integer value" + ) + + return self + + +class KNeighborsMixin: + """Mixin for k-neighbors searches""" + + def kneighbors( + self, + X=None, + n_neighbors=None, + return_distance=True, + session=None, + run_kwargs=None, + **kw, + ): + """Finds the K-neighbors of a point. + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int + Number of neighbors to get (default is the value + passed to the constructor). + + return_distance : boolean, optional. Defaults to True. + If False, distances will not be returned + + Returns + ------- + dist : Tensor + Array representing the lengths to points, only present if + return_distance=True + + ind : Tensor + Indices of the nearest points in the population matrix. 
+ + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from a tensor representing our data set and ask who's + the closest point to [1,1,1] + + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from mars.learn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=1) + >>> neigh.fit(samples) # doctest: +ELLIPSIS + NearestNeighbors(algorithm='auto', leaf_size=30, ...) + >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS + (array([[0.5]]), array([[2]])) + + As you can see, it returns [[0.5]], and [[2]], which means that the + element is at distance 0.5 and is the third element of samples + (indexes start at 0). You can also query for multiple points: + + >>> X = [[0., 1., 0.], [1., 0., 1.]] + >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS + array([[1], + [2]]...) + + """ + check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + + if n_neighbors is None: + n_neighbors = self.n_neighbors + elif n_neighbors <= 0: + raise ValueError(f"Expected n_neighbors > 0. Got {n_neighbors}") + else: + if not np.issubdtype(type(n_neighbors), np.integer): + raise TypeError( + f"n_neighbors does not take {type(n_neighbors)} value, " + "enter integer value" + ) + + if X is not None: + query_is_train = False + X = check_array(X, accept_sparse=True) + else: + query_is_train = True + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + + if X.key == self._fit_X.key and X is not self._fit_X: + X = self._fit_X + if np.isnan(X.size): + # has unknown size, execute first + X.execute(session=session, **(run_kwargs or dict())) + + train_size = self._fit_X.shape[0] + if n_neighbors > train_size: + raise ValueError( + "Expected n_neighbors <= n_samples, " + f"but n_samples = {train_size}, n_neighbors = {n_neighbors}" + ) + n_samples, _ = X.shape + sample_range = mt.arange(n_samples)[:, None] + + if self._fit_method == "brute": + # for efficiency, use squared euclidean distances + kwds = ( + {"squared": True} + if self.effective_metric_ == "euclidean" + else self.effective_metric_params_ + ) + + neigh_dist, neigh_ind = pairwise_distances_topk( + X, self._fit_X, k=n_neighbors, metric=self.effective_metric_, **kwds + ) + if return_distance: + if self.effective_metric_ == "euclidean": + result = mt.sqrt(neigh_dist), neigh_ind + else: + result = neigh_dist, neigh_ind + else: + result = neigh_ind + elif self._fit_method in ["ball_tree", "kd_tree"]: + if X.issparse(): + raise ValueError( + f"{self._fit_method} does not work with sparse matrices. " + "Densify the data, or set algorithm='brute'" + ) + + query = ( + ball_tree_query if self._fit_method == "ball_tree" else kd_tree_query + ) + result = query(self._tree, X, n_neighbors, return_distance) + elif self._fit_method == "faiss": + if X.issparse(): + raise ValueError( + f"{self._fit_method} does not work with sparse matrices. " + "Densify the data, or set algorithm='brute'" + ) + result = faiss_query( + self._faiss_index, X, n_neighbors, return_distance, **kw + ) + elif self._fit_method == "proxima": # pragma: no cover + if X.issparse(): + raise ValueError( + f"{self._fit_method} does not work with sparse matrices. 
" + "Densify the data, or set algorithm='brute'" + ) + ind, dis = proxima_query( + X, n_neighbors, index=self._proxima_index, run=False, **kw + ) + if not return_distance: + result = ind + else: + result = (dis, ind) + else: # pragma: no cover + raise ValueError("internal: _fit_method not recognized") + + if not query_is_train: + if isinstance(result, (tuple, list)): + result = mt.ExecutableTuple(result) + result.execute(session=session, **(run_kwargs or dict())) + return result + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + dist, neigh_ind = result + else: + neigh_ind = result + + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = mt.all(sample_mask, axis=1) + sample_mask[:, 0] = mt.where(dup_gr_nbrs, False, sample_mask[:, 0]) + + neigh_ind = reshape_unchecked( + neigh_ind[sample_mask], (n_samples, n_neighbors - 1) + ) + + if return_distance: + dist = reshape_unchecked( + dist[sample_mask], (n_samples, n_neighbors - 1) + ) + ret = mt.ExecutableTuple([dist, neigh_ind]) + ret.execute(session=session, **(run_kwargs or dict())) + return ret + neigh_ind.execute(session=session, **(run_kwargs or dict())) + return neigh_ind + + def kneighbors_graph( + self, + X=None, + n_neighbors=None, + mode="connectivity", + session=None, + run_kwargs=None, + ): + """Computes the (weighted) graph of k-Neighbors for points in X + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int + Number of neighbors for each sample. + (default is value passed to the constructor). + + mode : {'connectivity', 'distance'}, optional + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are Euclidean distance between points. + + Returns + ------- + A : SparseTensor, shape = [n_samples, n_samples_fit] + n_samples_fit is the number of samples in the fitted data + A[i, j] is assigned the weight of edge that connects i to j. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from mars.learn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2) + >>> neigh.fit(X) # doctest: +ELLIPSIS + NearestNeighbors(algorithm='auto', leaf_size=30, ...) + >>> A = neigh.kneighbors_graph(X) + >>> A.fetch().toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + + See also + -------- + NearestNeighbors.radius_neighbors_graph + """ + check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + if n_neighbors is None: + n_neighbors = self.n_neighbors + + # kneighbors does the None handling. 
+ if X is not None: + X = check_array(X, accept_sparse=True) + n_samples1 = X.shape[0] + else: + n_samples1 = self._fit_X.shape[0] + + n_samples2 = self._fit_X.shape[0] + + if mode == "connectivity": + A_data = None + A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + + elif mode == "distance": + A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity" ' + f'or "distance" but got {mode} instead' + ) + + op = KNeighborsGraph( + a_data=A_data, a_ind=A_ind, n_neighbors=n_neighbors, sparse=True + ) + graph = op(A_data, A_ind, shape=(n_samples1, n_samples2)) + graph.execute(session=session, **(run_kwargs or dict())) + return graph + + +class UnsupervisedMixin: + def fit(self, X, y=None, session=None, run_kwargs=None): + """Fit the model using X as training data + + Parameters + ---------- + X : {array-like, tensor, BallTree, KDTree} + Training data. If tensor, shape [n_samples, n_features], + or [n_samples, n_samples] if metric='precomputed'. + """ + return self._fit(X, session=session, run_kwargs=run_kwargs) diff --git a/python/xorbits/_mars/learn/neighbors/tests/__init__.py b/python/xorbits/_mars/learn/neighbors/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/neighbors/tests/test_faiss.py b/python/xorbits/_mars/learn/neighbors/tests/test_faiss.py new file mode 100644 index 000000000..8628fb1ef --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tests/test_faiss.py @@ -0,0 +1,241 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +try: + import faiss +except ImportError: # pragma: no cover + faiss = None + +from .... import tensor as mt +from ....core import tile +from ....session import execute, fetch +from .. 
import NearestNeighbors +from .._faiss import ( + _gen_index_string_and_sample_count, + _load_index, + build_faiss_index, + faiss_query, +) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_manual_build_faiss_index(setup): + d = 8 + n = 50 + n_test = 10 + x = np.random.RandomState(0).rand(n, d).astype(np.float32) + y = np.random.RandomState(0).rand(n_test, d).astype(np.float32) + + nn = NearestNeighbors(algorithm="kd_tree") + nn.fit(x) + _, expected_indices = nn.kneighbors(y, 5) + + # test brute-force search + X = mt.tensor(x, chunk_size=10) + index = build_faiss_index(X, "Flat", None, random_state=0, same_distribution=True) + faiss_index = index.execute().fetch() + + index_shards = faiss.IndexShards(d) + for ind in faiss_index: + shard = _load_index(ind, -1) + index_shards.add_shard(shard) + faiss_index = index_shards + + faiss_index.nprob = 10 + _, indices = faiss_index.search(y, k=5) + + np.testing.assert_array_equal(indices, expected_indices.fetch()) + + # test one chunk, brute force + X = mt.tensor(x, chunk_size=50) + index = build_faiss_index(X, "Flat", None, random_state=0, same_distribution=True) + faiss_index = _load_index(index.execute().fetch(), -1) + + faiss_index.nprob = 10 + _, indices = faiss_index.search(y, k=5) + + np.testing.assert_array_equal(indices, expected_indices.fetch()) + + # test train, same distribution + X = mt.tensor(x, chunk_size=10) + index = build_faiss_index( + X, "IVF30,Flat", 30, random_state=0, same_distribution=True + ) + faiss_index = _load_index(index.execute().fetch(), -1) + + assert isinstance(faiss_index, faiss.IndexIVFFlat) + assert faiss_index.ntotal == n + assert len(tile(index).chunks) == 1 + + # test train, distributions are variant + X = mt.tensor(x, chunk_size=10) + index = build_faiss_index( + X, "IVF10,Flat", None, random_state=0, same_distribution=False + ) + faiss_index = index.execute().fetch() + + assert len(faiss_index) == 5 + for ind in faiss_index: + ind = _load_index(ind, -1) + assert isinstance(ind, faiss.IndexIVFFlat) + assert ind.ntotal == 10 + + # test more index type + index = build_faiss_index(X, "PCAR6,IVF8_HNSW32,SQ8", 10, random_state=0) + faiss_index = index.execute().fetch() + + assert len(faiss_index) == 5 + for ind in faiss_index: + ind = _load_index(ind, -1) + assert isinstance(ind, faiss.IndexPreTransform) + assert ind.ntotal == 10 + + # test one chunk, train + X = mt.tensor(x, chunk_size=50) + index = build_faiss_index( + X, "IVF30,Flat", 30, random_state=0, same_distribution=True + ) + faiss_index = _load_index(index.execute().fetch(), -1) + + assert isinstance(faiss_index, faiss.IndexIVFFlat) + assert faiss_index.ntotal == n + + # test wrong index + with pytest.raises(ValueError): + build_faiss_index(X, "unknown_index", None) + + # test unknown metric + with pytest.raises(ValueError): + build_faiss_index(X, "Flat", None, metric="unknown_metric") + + +d = 8 +n = 50 +n_test = 10 +x = np.random.RandomState(0).rand(n, d).astype(np.float32) +y = np.random.RandomState(1).rand(n_test, d).astype(np.float32) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +@pytest.mark.parametrize( + "X, Y", + [ + # multi chunks + (mt.tensor(x, chunk_size=(20, 5)), mt.tensor(y, chunk_size=5)), + # one chunk + (mt.tensor(x, chunk_size=50), mt.tensor(y, chunk_size=10)), + ], +) +@pytest.mark.parametrize("metric", ["l2", "cosine"]) +def test_faiss_query(setup, X, Y, metric): + faiss_index = build_faiss_index(X, "Flat", None, metric=metric, random_state=0) + d, i = faiss_query(faiss_index, 
Y, 5, nprobe=10) + distance, indices = fetch(*execute(d, i)) + + nn = NearestNeighbors(metric=metric) + nn.fit(x) + expected_distance, expected_indices = nn.kneighbors(y, 5) + + np.testing.assert_array_equal(indices, expected_indices.fetch()) + np.testing.assert_almost_equal(distance, expected_distance.fetch(), decimal=4) + + # test other index + X2 = X.astype(np.float64) + Y2 = y.astype(np.float64) + faiss_index = build_faiss_index( + X2, "PCAR6,IVF8_HNSW32,SQ8", 10, random_state=0, return_index_type="object" + ) + d, i = faiss_query(faiss_index, Y2, 5, nprobe=10) + # test execute only + execute(d, i) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_gen_index_string_and_sample_count(setup): + d = 32 + + # accuracy=True, could be Flat only + ret = _gen_index_string_and_sample_count((10**9, d), None, True, "minimum") + assert ret == ("Flat", None) + + # no memory concern + ret = _gen_index_string_and_sample_count((10**5, d), None, False, "maximum") + assert ret == ("HNSW32", None) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is True + + # memory concern not much + ret = _gen_index_string_and_sample_count((10**5, d), None, False, "high") + assert ret == ("IVF1580,Flat", 47400) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + # memory quite important + ret = _gen_index_string_and_sample_count((5 * 10**6, d), None, False, "low") + assert ret == ("PCAR16,IVF65536_HNSW32,SQ8", 32 * 65536) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + # memory very important + ret = _gen_index_string_and_sample_count((10**8, d), None, False, "minimum") + assert ret == ("OPQ16_32,IVF1048576_HNSW32,PQ16", 64 * 65536) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + ret = _gen_index_string_and_sample_count((10**10, d), None, False, "low") + assert ret == ("PCAR16,IVF1048576_HNSW32,SQ8", 64 * 65536) + index = faiss.index_factory(d, ret[0]) + assert index.is_trained is False + + with pytest.raises(ValueError): + # M > 64 raise error + _gen_index_string_and_sample_count((10**5, d), None, False, "maximum", M=128) + + with pytest.raises(ValueError): + # M > 64 + _gen_index_string_and_sample_count((10**5, d), None, False, "minimum", M=128) + + with pytest.raises(ValueError): + # dim should be multiple of M + _gen_index_string_and_sample_count( + (10**5, d), None, False, "minimum", M=16, dim=17 + ) + + with pytest.raises(ValueError): + _gen_index_string_and_sample_count((10**5, d), None, False, "low", k=5) + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_auto_index(setup): + d = 8 + n = 50 + n_test = 10 + x = np.random.RandomState(0).rand(n, d).astype(np.float32) + y = np.random.RandomState(1).rand(n_test, d).astype(np.float32) + + for chunk_size in (50, 20): + X = mt.tensor(x, chunk_size=chunk_size) + + faiss_index = build_faiss_index(X, random_state=0, return_index_type="object") + d, i = faiss_query(faiss_index, y, 5, nprobe=10) + indices = i.execute().fetch() + + nn = NearestNeighbors() + nn.fit(x) + expected_indices = nn.kneighbors(y, 5, return_distance=False) + + np.testing.assert_array_equal(indices, expected_indices) diff --git a/python/xorbits/_mars/learn/neighbors/tests/test_nearest_neighbors.py b/python/xorbits/_mars/learn/neighbors/tests/test_nearest_neighbors.py new file mode 100644 index 000000000..423918a1e --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tests/test_nearest_neighbors.py @@ -0,0 +1,414 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +try: + import faiss +except ImportError: # pragma: no cover + faiss = None +try: + from sklearn.neighbors import BallTree as SkBallTree + from sklearn.neighbors import KDTree as SkKDTree + from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors +except ImportError: # pragma: no cover + SkNearestNeighbors = None + +from .... import tensor as mt +from ....core import tile +from ....lib.sparse import SparseNDArray +from ....tests.core import require_cupy +from ....utils import lazy_import +from ...proxima.core import proxima +from .. import NearestNeighbors + +cupy = lazy_import("cupy") + + +def test_nearest_neighbors(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + X = mt.tensor(raw_X) + Y = mt.tensor(raw_Y) + + raw_sparse_x = sps.random(10, 5, density=0.5, format="csr", random_state=rs) + raw_sparse_y = sps.random(8, 5, density=0.4, format="csr", random_state=rs) + + X_sparse = mt.tensor(raw_sparse_x) + Y_sparse = mt.tensor(raw_sparse_y) + + metric_func = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + + _ = NearestNeighbors(algorithm="auto", metric="precomputed", metric_params={}) + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="unknown") + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="kd_tree", metric=metric_func) + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="auto", metric="unknown") + + with pytest.warns(SyntaxWarning): + NearestNeighbors(metric_params={"p": 1}) + + with pytest.raises(ValueError): + _ = NearestNeighbors(metric="wminkowski", p=0) + + with pytest.raises(ValueError): + _ = NearestNeighbors(algorithm="auto", metric="minkowski", p=0) + + nn = NearestNeighbors(algorithm="auto", metric="minkowski", p=1) + nn.fit(X) + assert nn.effective_metric_ == "manhattan" + + nn = NearestNeighbors(algorithm="auto", metric="minkowski", p=2) + nn.fit(X) + assert nn.effective_metric_ == "euclidean" + + nn = NearestNeighbors(algorithm="auto", metric="minkowski", p=np.inf) + nn.fit(X) + assert nn.effective_metric_ == "chebyshev" + + nn2 = NearestNeighbors(algorithm="auto", metric="minkowski") + nn2.fit(nn) + assert nn2._fit_method == nn._fit_method + + nn = NearestNeighbors(algorithm="auto", metric="minkowski") + ball_tree = SkBallTree(raw_X) + nn.fit(ball_tree) + assert nn._fit_method == "ball_tree" + + nn = NearestNeighbors(algorithm="auto", metric="minkowski") + kd_tree = SkKDTree(raw_X) + nn.fit(kd_tree) + assert nn._fit_method == "kd_tree" + + with pytest.raises(ValueError): + nn = NearestNeighbors() + nn.fit(np.random.rand(0, 10)) + + nn = NearestNeighbors(algorithm="ball_tree") + with pytest.warns(UserWarning): + nn.fit(X_sparse) + + nn = NearestNeighbors(metric="haversine") + with pytest.raises(ValueError): + nn.fit(X_sparse) + + nn = NearestNeighbors(metric=metric_func, n_neighbors=1) + nn.fit(X) + assert nn._fit_method == "ball_tree" + 
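The test_faiss.py module above exercises the lower-level faiss helpers directly; as a rough sketch (assuming the optional faiss dependency is installed, a default local session exists, and the same module path the tests import from), build_faiss_index and faiss_query can be combined like this:

import numpy as np
from xorbits._mars import tensor as mt
from xorbits._mars.learn.neighbors._faiss import build_faiss_index, faiss_query

x = np.random.RandomState(0).rand(50, 8).astype(np.float32)
y = np.random.RandomState(1).rand(5, 8).astype(np.float32)

# Build a brute-force ("Flat") index over a chunked tensor, then query it lazily.
index = build_faiss_index(mt.tensor(x, chunk_size=20), "Flat", None,
                          metric="l2", random_state=0)
dist, ind = faiss_query(index, y, 5, nprobe=10)
print(ind.execute().fetch())    # indices of the 5 nearest training samples per query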
+ nn = NearestNeighbors(metric="sqeuclidean", n_neighbors=1) + nn.fit(X) + assert nn._fit_method == "brute" + + with pytest.raises(ValueError): + nn = NearestNeighbors(n_neighbors=-1) + nn.fit(X) + + with pytest.raises(TypeError): + nn = NearestNeighbors(n_neighbors=1.3) + nn.fit(X) + + nn = NearestNeighbors() + nn.fit(X) + with pytest.raises(ValueError): + nn.kneighbors(Y, n_neighbors=-1) + with pytest.raises(TypeError): + nn.kneighbors(Y, n_neighbors=1.3) + with pytest.raises(ValueError): + nn.kneighbors(Y, n_neighbors=11) + + nn = NearestNeighbors(algorithm="ball_tree") + nn.fit(X) + with pytest.raises(ValueError): + nn.kneighbors(Y_sparse) + + +def test_nearest_neighbors_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + X = mt.tensor(raw_X, chunk_size=7) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + for algo in ["brute", "ball_tree", "kd_tree", "auto"]: + for metric in ["minkowski", "manhattan"]: + nn = NearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + if nn._tree is not None: + assert isinstance(nn._tree.fetch(), type(snn._tree)) + + # test return_distance=False + ret = nn.kneighbors(Y, return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test y is x + ret = nn.kneighbors() + + expected = snn.kneighbors() + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test y is x, and return_distance=False + ret = nn.kneighbors(return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test callable metric + metric = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + for algo in ["brute", "ball_tree"]: + nn = NearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric) + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test sparse + raw_sparse_x = sps.random(10, 5, density=0.5, format="csr", random_state=rs) + raw_sparse_y = sps.random(8, 5, density=0.4, format="csr", random_state=rs) + + X = mt.tensor(raw_sparse_x, chunk_size=7) + Y = mt.tensor(raw_sparse_y, chunk_size=5) + + nn = NearestNeighbors(n_neighbors=3) + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3) + snn.fit(raw_sparse_x) + expected = snn.kneighbors(raw_sparse_y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test input with unknown shape + X = mt.tensor(raw_X, chunk_size=7) + X = X[X[:, 0] > 0.1] + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + Y = Y[Y[:, 0] > 0.1] + + nn = NearestNeighbors(n_neighbors=3) + nn.fit(X) + + ret = nn.kneighbors(Y) + + x2 = raw_X[raw_X[:, 0] > 0.1] + y2 = raw_Y[raw_Y[:, 0] > 0.1] + snn = SkNearestNeighbors(n_neighbors=3) + snn.fit(x2) + expected = snn.kneighbors(y2) + + result = ret.fetch() + assert nn._fit_method == 
snn._fit_method + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test fit a sklearn tree + nn = NearestNeighbors(n_neighbors=3) + nn.fit(snn._tree) + + ret = nn.kneighbors(Y) + result = ret.fetch() + assert nn._fit_method == snn._fit_method + np.testing.assert_almost_equal(result[0], expected[0]) + np.testing.assert_almost_equal(result[1], expected[1]) + + +def test_k_neighbors_graph_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + X = mt.tensor(raw_X, chunk_size=7) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + neigh = NearestNeighbors(n_neighbors=3) + neigh.fit(X) + sklearn_neigh = SkNearestNeighbors(n_neighbors=3) + sklearn_neigh.fit(raw_X) + + for mode in ["connectivity", "distance"]: + graph = neigh.kneighbors_graph(Y, mode=mode) + result = graph.fetch() + + assert isinstance(result, SparseNDArray) + assert len(tile(graph).chunks) > 1 + + expected = sklearn_neigh.kneighbors_graph(raw_Y, mode=mode) + + np.testing.assert_array_equal(result.toarray(), expected.toarray()) + + graph2 = neigh.kneighbors_graph(mode=mode) + result2 = graph2.fetch() + + assert isinstance(result2, SparseNDArray) + + expected2 = sklearn_neigh.kneighbors_graph(mode=mode) + + np.testing.assert_array_equal(result2.toarray(), expected2.toarray()) + + X = [[0], [3], [1]] + + neigh = NearestNeighbors(n_neighbors=2) + sklearn_neigh = SkNearestNeighbors(n_neighbors=2) + neigh.fit(X) + sklearn_neigh.fit(X) + + A = neigh.kneighbors_graph(X).fetch() + expected_A = sklearn_neigh.kneighbors_graph(X) + np.testing.assert_array_equal(A.toarray(), expected_A.toarray()) + + # test wrong mode + with pytest.raises(ValueError): + _ = neigh.kneighbors_graph(mode="unknown") + + +@pytest.mark.skipif(faiss is None, reason="faiss not installed") +def test_faiss_nearest_neighbors_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + # test faiss execution + X = mt.tensor(raw_X, chunk_size=7) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + nn = NearestNeighbors(n_neighbors=3, algorithm="faiss", metric="l2") + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm="auto", metric="l2") + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0], decimal=6) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test return_distance=False + ret = nn.kneighbors(Y, return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test y is x + ret = nn.kneighbors() + + expected = snn.kneighbors() + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0], decimal=5) + np.testing.assert_almost_equal(result[1], expected[1]) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_proxima_nearest_neighbors_execution(setup): + rs = np.random.RandomState(0) + raw_X = rs.rand(10, 5).astype("float32") + raw_Y = rs.rand(8, 5).astype("float32") + + # test faiss execution + X = mt.tensor(raw_X, chunk_size=6) + Y = mt.tensor(raw_Y, chunk_size=(5, 3)) + + nn = NearestNeighbors(n_neighbors=3, algorithm="proxima", metric="l2") + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm="auto", metric="l2") + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + 
np.testing.assert_almost_equal(result[0], expected[0], decimal=6) + np.testing.assert_almost_equal(result[1], expected[1]) + + # test return_distance=False + ret = nn.kneighbors(Y, return_distance=False) + + result = ret.fetch() + np.testing.assert_almost_equal(result, expected[1]) + + # test y is x + ret = nn.kneighbors() + + expected = snn.kneighbors() + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0], expected[0], decimal=5) + np.testing.assert_almost_equal(result[1], expected[1]) + + +@require_cupy +@pytest.mark.skipif( + cupy is None or faiss is None, reason="either cupy or faiss not installed" +) +def test_gpu_faiss_nearest_neighbors_execution(setup_gpu): + rs = np.random.RandomState(0) + + raw_X = rs.rand(10, 5) + raw_Y = rs.rand(8, 5) + + # test faiss execution + X = mt.tensor(raw_X, chunk_size=7).to_gpu() + Y = mt.tensor(raw_Y, chunk_size=8).to_gpu() + + nn = NearestNeighbors(n_neighbors=3, algorithm="faiss", metric="l2") + nn.fit(X) + + ret = nn.kneighbors(Y) + + snn = SkNearestNeighbors(n_neighbors=3, algorithm="auto", metric="l2") + snn.fit(raw_X) + expected = snn.kneighbors(raw_Y) + + result = [r.fetch() for r in ret] + np.testing.assert_almost_equal(result[0].get(), expected[0], decimal=6) + np.testing.assert_almost_equal(result[1].get(), expected[1]) diff --git a/python/xorbits/_mars/learn/neighbors/tree.py b/python/xorbits/_mars/learn/neighbors/tree.py new file mode 100644 index 000000000..a6ae58999 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/tree.py @@ -0,0 +1,280 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
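tree.py below implements the ball_tree/kd_tree query path that the tests above exercise; a small sketch of selecting it explicitly, under the same session and import-path assumptions as before (a prebuilt scikit-learn tree may also be handed to fit(), as the tests show):

import numpy as np
from sklearn.neighbors import KDTree
from xorbits._mars import tensor as mt
from xorbits._mars.learn.neighbors import NearestNeighbors

raw = np.random.RandomState(0).rand(30, 3)

nn = NearestNeighbors(n_neighbors=2, algorithm="kd_tree")
nn.fit(mt.tensor(raw, chunk_size=10))
dist, ind = nn.kneighbors(raw[:4])     # queries may be plain numpy arrays
print(ind.fetch())

nn2 = NearestNeighbors(n_neighbors=2).fit(KDTree(raw))   # reuse an existing sklearn KDTree
print(nn2.kneighbors(raw[:4], return_distance=False).fetch())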
+ +import cloudpickle +import numpy as np + +from ...core import OBJECT_CHUNK_TYPE, OBJECT_TYPE, Object, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + Int32Field, + KeyField, +) +from ...tensor.core import TensorOrder +from ...utils import has_unknown_shape, tokenize +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class TreeObject(Object): + def fetch(self, session=None, **kw): + result = self._data.fetch(session=session, **kw) + return cloudpickle.loads(result) if isinstance(result, bytes) else result + + +class TreeBase(LearnOperand, LearnOperandMixin): + _input = KeyField("input") + _leaf_size = Int32Field("leaf_size") + _metric = AnyField("metric") + + _metric_params = DictField("metric_params") + + def __init__( + self, leaf_size=None, metric=None, metric_params=None, output_types=None, **kw + ): + super().__init__( + _leaf_size=leaf_size, + _metric=metric, + _metric_params=metric_params, + _output_types=output_types, + **kw + ) + if self.output_types is None: + self.output_types = [OutputType.object] + + @property + def input(self): + return self._input + + @property + def leaf_size(self): + return self._leaf_size + + @property + def metric(self): + return self._metric + + @property + def metric_params(self): + return self._metric_params + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + return self.new_tileable([a]) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + # ball tree and kd tree requires the full data, + # thus rechunk input tensor into 1 chunk + inp = op.input.rechunk({ax: s for ax, s in enumerate(op.input.shape)}) + inp = yield from recursive_tile(inp) + out = op.outputs[0] + + chunk_op = op.copy().reset_key() + kw = out.params + kw["index"] = inp.chunks[0].index + chunk = chunk_op.new_chunk([inp.chunks[0]], kws=[kw]) + + new_op = op.copy() + tileable_kw = out.params + tileable_kw["nsplits"] = ((1,),) + tileable_kw["chunks"] = [chunk] + return new_op.new_tileables(op.inputs, kws=[tileable_kw]) + + @classmethod + def execute(cls, ctx, op): + if op.gpu: # pragma: no cover + raise NotImplementedError( + "Does not support tree-based nearest neighbors on GPU" + ) + + a = ctx[op.input.key] + tree = cls._tree_type( + a, op.leaf_size, metric=op.metric, **(op.metric_params or dict()) + ) + ctx[op.outputs[0].key] = tree + + +def _on_serialize_tree(tree): + return cloudpickle.dumps(tree) if not hasattr(tree, "key") else tree + + +def _on_deserialize_tree(ser): + return cloudpickle.loads(ser) if isinstance(ser, bytes) else ser + + +class TreeQueryBase(LearnOperand, LearnOperandMixin): + _input = KeyField("input") + _tree = AnyField( + "tree", on_serialize=_on_serialize_tree, on_deserialize=_on_deserialize_tree + ) + _n_neighbors = Int32Field("n_neighbors") + _return_distance = BoolField("return_distance") + + def __init__( + self, tree=None, n_neighbors=None, return_distance=None, output_types=None, **kw + ): + super().__init__( + _tree=tree, + _n_neighbors=n_neighbors, + _return_distance=return_distance, + _output_types=output_types, + **kw + ) + if self.output_types is None: + self.output_types = [OutputType.tensor] * self.output_limit + + @property + def input(self): + return self._input + + @property + def tree(self): + return self._tree + + @property + def n_neighbors(self): + return self._n_neighbors + + @property + def return_distance(self): + return self._return_distance + + @property + 
def output_limit(self): + return 2 if self._return_distance else 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if isinstance(self._tree, (OBJECT_TYPE, OBJECT_CHUNK_TYPE)): + self._tree = self._inputs[1] + + def _update_key(self): + values = [] + for value in self._values_: + if isinstance(value, self._tree_type): + values.append(cloudpickle.dumps(value)) + else: + values.append(value) + self._obj_set("_key", tokenize(type(self).__name__, *values)) + return self + + def __call__(self, x): + kws = [] + if self._return_distance: + kws.append( + { + "shape": (x.shape[0], self._n_neighbors), + "dtype": np.dtype(np.float64), + "order": x.order, + "type": "distance", + } + ) + kws.append( + { + "shape": (x.shape[0], self._n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "type": "indices", + } + ) + inputs = [x] + if isinstance(self._tree, OBJECT_TYPE): + inputs.append(self._tree) + return self.new_tileables(inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def tile(cls, op): + inp = op.input + + if inp.chunk_shape[1] != 1: + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk({1: inp.shape[1]})) + + tree_chunk = None + if isinstance(op.tree, OBJECT_TYPE): + tree_chunk = op.tree.chunks[0] + out_chunks = [[] for _ in range(len(op.outputs))] + for chunk in inp.chunks: + chunk_op = op.copy().reset_key() + if tree_chunk is not None: + chunk_op._tree = tree_chunk + chunk_kws = [] + if op.return_distance: + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.float64), + "order": chunk.order, + "index": chunk.index, + "type": "distance", + } + ) + chunk_kws.append( + { + "shape": (chunk.shape[0], op.n_neighbors), + "dtype": np.dtype(np.int64), + "order": TensorOrder.C_ORDER, + "index": chunk.index, + "type": "indices", + } + ) + chunk_inputs = [chunk] + if tree_chunk is not None: + chunk_inputs.append(tree_chunk) + chunks = chunk_op.new_chunks( + chunk_inputs, kws=chunk_kws, output_limit=len(chunk_kws) + ) + for cs, c in zip(out_chunks, chunks): + cs.append(c) + + kws = [o.params for o in op.outputs] + nsplits = list(inp.nsplits) + nsplits[1] = (op.n_neighbors,) + if op.return_distance: + kws[0]["chunks"] = out_chunks[0] + kws[0]["nsplits"] = tuple(nsplits) + kws[-1]["chunks"] = out_chunks[-1] + kws[-1]["nsplits"] = tuple(nsplits) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def execute(cls, ctx, op): + if op.gpu: # pragma: no cover + raise NotImplementedError( + "Does not support tree-based nearest neighbors on GPU" + ) + + x = ctx[op.input.key] + if len(op.inputs) == 2: + tree = ctx[op.tree.key] + else: + tree = op.tree + tree = cloudpickle.loads(tree) if isinstance(tree, bytes) else tree + ret = tree.query(x, op.n_neighbors, op.return_distance) + if op.return_distance: + ctx[op.outputs[0].key] = ret[0] + ctx[op.outputs[1].key] = ret[1] + else: + ctx[op.outputs[0].key] = ret diff --git a/python/xorbits/_mars/learn/neighbors/unsupervised.py b/python/xorbits/_mars/learn/neighbors/unsupervised.py new file mode 100644 index 000000000..bb949cd31 --- /dev/null +++ b/python/xorbits/_mars/learn/neighbors/unsupervised.py @@ -0,0 +1,39 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import KNeighborsMixin, NeighborsBase, UnsupervisedMixin + + +class NearestNeighbors(NeighborsBase, KNeighborsMixin, UnsupervisedMixin): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + **kwargs + ): + super().__init__( + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + **kwargs + ) diff --git a/python/xorbits/_mars/learn/operands.py b/python/xorbits/_mars/learn/operands.py new file mode 100644 index 000000000..0c5e007c3 --- /dev/null +++ b/python/xorbits/_mars/learn/operands.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import OutputType +from ..core.operand import ( + Fuse, + FuseChunkMixin, + Operand, + ShuffleProxy, + TileableOperandMixin, +) +from ..dataframe.core import CHUNK_TYPE as DATAFRAME_CHUNK_TYPE +from ..dataframe.core import TILEABLE_TYPE as DATAFRAME_TYPE +from ..dataframe.operands import DataFrameFuseChunk, DataFrameOperandMixin +from ..tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..tensor.fuse import TensorFuseChunk +from ..tensor.operands import TensorOperandMixin + +LearnOperand = Operand + + +class LearnOperandMixin(TileableOperandMixin): + __slots__ = () + _op_module_ = "learn" + + @classmethod + def concat_tileable_chunks(cls, tileable): + if isinstance(tileable, TENSOR_TYPE): + return TensorOperandMixin.concat_tileable_chunks(tileable) + elif isinstance(tileable, DATAFRAME_TYPE): + return DataFrameOperandMixin.concat_tileable_chunks(tileable) + else: + # op has to implement its logic of `concat_tileable_chunks` + raise NotImplementedError + + @classmethod + def create_tileable_from_chunks(cls, chunks, inputs=None, **kw): + if isinstance(chunks[0], TENSOR_CHUNK_TYPE): + return TensorOperandMixin.create_tileable_from_chunks( + chunks, inputs=inputs, **kw + ) + elif isinstance(chunks[0], DATAFRAME_CHUNK_TYPE): + return DataFrameOperandMixin.create_tileable_from_chunks( + chunks, inputs=inputs, **kw + ) + else: + # op has to implement its logic of `create_tileable_from_chunks` + raise NotImplementedError + + def get_fuse_op_cls(self, obj): + if isinstance(obj, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + return TensorFuseChunk + elif isinstance(obj, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)): + return DataFrameFuseChunk + else: + return LearnObjectFuseChunk + + +class LearnObjectFuseChunkMixin(FuseChunkMixin, LearnOperandMixin): + __slots__ = () + + _output_type_ = OutputType.object + + +class 
LearnObjectFuseChunk(LearnObjectFuseChunkMixin, Fuse): + pass + + +class LearnShuffleProxy(ShuffleProxy, LearnOperandMixin): + def __init__(self, output_types=None, **kw): + super().__init__(_output_types=output_types, **kw) + if not self.output_types: + self.output_types = [OutputType.object] + + @classmethod + def execute(cls, ctx, op): + pass diff --git a/python/xorbits/_mars/learn/preprocessing/__init__.py b/python/xorbits/_mars/learn/preprocessing/__init__.py new file mode 100644 index 000000000..7162b0e27 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._data import MinMaxScaler, minmax_scale +from ._label import LabelBinarizer, LabelEncoder, label_binarize +from .normalize import normalize diff --git a/python/xorbits/_mars/learn/preprocessing/_data.py b/python/xorbits/_mars/learn/preprocessing/_data.py new file mode 100644 index 000000000..7b33815e8 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/_data.py @@ -0,0 +1,400 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sklearn.base import TransformerMixin +from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted + +from ... import tensor as mt +from ...tensor.core import TENSOR_TYPE +from ..base import BaseEstimator +from ..utils.validation import check_array + + +def _handle_zeros_in_scale(scale, copy=True): + """Makes sure that whenever scale is zero, we handle it correctly. + + This happens in most scalers when we have constant features. + """ + + # if we are fitting on 1D arrays, scale might be a scalar + if np.isscalar(scale): # pragma: no cover + if scale == 0.0: + scale = 1.0 + return scale + elif hasattr(scale, "ndim") and scale.ndim == 0: # pragma: no cover + # scalar that is tensor + return mt.where(scale == 0.0, 1.0, scale) + elif isinstance(scale, (np.ndarray, TENSOR_TYPE)): + if copy: + # New array to avoid side-effects + scale = scale.copy() + scale[scale == 0.0] = 1.0 + return scale + + +class MinMaxScaler(TransformerMixin, BaseEstimator): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, e.g. between + zero and one. 
+ + The transformation is given by:: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + clip: bool, default=False + Set to True to clip transformed values of held-out data to + provided `feature range`. + + Attributes + ---------- + min_ : Tensor of shape (n_features,) + Per feature adjustment for minimum. Equivalent to + ``min - X.min(axis=0) * self.scale_`` + + scale_ : Tensor of shape (n_features,) + Per feature relative scaling of the data. Equivalent to + ``(max - min) / (X.max(axis=0) - X.min(axis=0))`` + + data_min_ : ndarray of shape (n_features,) + Per feature minimum seen in the data + + data_max_ : ndarray of shape (n_features,) + Per feature maximum seen in the data + + data_range_ : ndarray of shape (n_features,) + Per feature range ``(data_max_ - data_min_)`` seen in the data + + n_samples_seen_ : int + The number of samples processed by the estimator. + It will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + Examples + -------- + >>> from mars.learn.preprocessing import MinMaxScaler + >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] + >>> scaler = MinMaxScaler() + >>> print(scaler.fit(data)) + MinMaxScaler() + >>> print(scaler.data_max_) + [ 1. 18.] + >>> print(scaler.transform(data)) + [[0. 0. ] + [0.25 0.25] + [0.5 0.5 ] + [1. 1. ]] + >>> print(scaler.transform([[2, 2]])) + [[1.5 0. ]] + + See Also + -------- + minmax_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + For a comparison of the different scalers, transformers, and normalizers, + see :ref:`examples/preprocessing/plot_all_scaling.py + `. + """ + + def __init__(self, feature_range=(0, 1), copy=True, clip=False): + self.feature_range = feature_range + self.copy = copy + self.clip = clip + + def _reset(self): # pragma: no cover + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.min_ + del self.n_samples_seen_ + del self.data_min_ + del self.data_max_ + del self.data_range_ + + def fit(self, X, y=None, session=None, run_kwargs=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y, session=session, run_kwargs=run_kwargs) + + def partial_fit(self, X, y=None, session=None, run_kwargs=None): + """Online computation of min and max on X for later scaling. + + All of X is processed as a single batch. 
This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError( + "Minimum of desired feature range must be smaller" + " than maximum. Got %s." % str(feature_range) + ) + + if mt.tensor(X).issparse(): # pragma: no cover + raise TypeError( + "MinMaxScaler does not support sparse input. " + "Consider using MaxAbsScaler instead." + ) + + first_pass = not hasattr(self, "n_samples_seen_") + X = self._validate_data( + X, + reset=first_pass, + estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite="allow-nan", + ) + + if np.isnan(X.shape[0]): # pragma: no cover + X.execute(session=session, **(run_kwargs or dict())) + + data_min = mt.nanmin(X, axis=0) + data_max = mt.nanmax(X, axis=0) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + data_min = mt.minimum( + self.data_min_, data_min + ) # pylint: disable=access-member-before-definition + data_max = mt.maximum( + self.data_max_, data_max + ) # pylint: disable=access-member-before-definition + self.n_samples_seen_ += X.shape[0] + + data_range = data_max - data_min + self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale( + data_range + ) + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_min_ = data_min + self.data_max_ = data_max + self.data_range_ = data_range + mt.ExecutableTuple( + [self.scale_, self.min_, self.data_min_, self.data_max_, self.data_range_] + ).execute(session=session, **(run_kwargs or dict())) + return self + + def transform(self, X, session=None, run_kwargs=None): + """Scale features of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + X = self._validate_data( + X, + copy=self.copy, + dtype=FLOAT_DTYPES, + force_all_finite="allow-nan", + reset=False, + ) + + X *= self.scale_ + X += self.min_ + if self.clip: + X = mt.clip(X, self.feature_range[0], self.feature_range[1]) + return X.execute(session=session, **(run_kwargs or dict())) + + def inverse_transform(self, X, session=None, run_kwargs=None): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. It cannot be sparse. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + X = check_array( + X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" + ) + + X -= self.min_ + X /= self.scale_ + return X.execute(session=session, **(run_kwargs or dict())) + + def _more_tags(self): # pylint: disable=no-self-use + return {"allow_nan": True} + + +def minmax_scale( + X, feature_range=(0, 1), *, axis=0, copy=True, session=None, run_kwargs=None +): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. 
+ + The transformation is given by (when ``axis=0``):: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + The transformation is calculated as (when ``axis=0``):: + + X_scaled = scale * X + min - X.min(axis=0) * scale + where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + *minmax_scale* function interface + to :class:`~sklearn.preprocessing.MinMaxScaler`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + axis : int, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + Set to False to perform inplace scaling and avoid a copy (if the input + is already a numpy array). + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MinMaxScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`. + + See Also + -------- + MinMaxScaler : Performs scaling to a given range using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see :ref:`examples/preprocessing/plot_all_scaling.py + `. + """ # noqa + # Unlike the scaler object, this function allows 1d input. + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MinMaxScaler(feature_range=feature_range, copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X.execute(session=session, **(run_kwargs or dict())) diff --git a/python/xorbits/_mars/learn/preprocessing/_label.py b/python/xorbits/_mars/learn/preprocessing/_label.py new file mode 100644 index 000000000..83d537233 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/_label.py @@ -0,0 +1,864 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
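The _data.py code above also supports incremental fitting: partial_fit merges per-feature minima and maxima across batches, and minmax_scale (unlike the scaler object) accepts 1-D input. A small sketch under the same local-session assumption:

import numpy as np
from xorbits._mars import tensor as mt
from xorbits._mars.learn.preprocessing import MinMaxScaler, minmax_scale

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.partial_fit(mt.tensor(np.array([[-1.0, 2.0], [-0.5, 6.0]])))   # first batch
scaler.partial_fit(mt.tensor(np.array([[0.0, 10.0], [1.0, 18.0]])))   # second batch merges min/max
print(scaler.data_min_.fetch(), scaler.data_max_.fetch())             # [-1.  2.] [ 1. 18.]
print(scaler.transform([[2.0, 2.0]]).fetch())                         # [[1.5 0. ]]; no clipping by default

print(minmax_scale([1.0, 5.0, 9.0]).fetch())                          # [0.  0.5 1. ]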
+ +from typing import Union + +import numpy as np +import scipy.sparse as sp +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing import label_binarize as sklearn_label_binarize +from sklearn.utils.sparsefuncs import min_max_axis + +from ... import execute as execute_tileable +from ... import fetch as fetch_tileable +from ... import opcodes +from ... import tensor as mt +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...core.context import Context, get_context +from ...lib.sparse import SparseNDArray +from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField +from ...tensor.core import TensorOrder +from ...typing import TileableType +from ..operands import LearnOperand, LearnOperandMixin +from ..utils import column_or_1d +from ..utils._encode import _encode, _unique +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.validation import _num_samples, check_array, check_is_fitted + + +class LabelEncoder(TransformerMixin, BaseEstimator): + """Encode target labels with value between 0 and n_classes-1. + + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + + Read more in the :ref:`User Guide `. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn import preprocessing + >>> le = preprocessing.LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = preprocessing.LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + ['tokyo', 'tokyo', 'paris'] + """ + + def fit(self, y, session=None, run_kwargs=None, execute=True): + """Fit label encoder. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted label encoder. + """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + if execute: + self.classes_ = execute_tileable( + self.classes_, session=session, **(run_kwargs or dict()) + ) + return self + + def fit_transform(self, y, session=None, run_kwargs=None): + """Fit label encoder and return encoded labels. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Encoded labels. + """ + y = column_or_1d(y, warn=True) + self.classes_, y = execute_tileable( + _unique(y, return_inverse=True), session=session, **(run_kwargs or dict()) + ) + return y + + def transform(self, y, session=None, run_kwargs=None, execute=True): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Labels as normalized encodings. 
+ """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return mt.array([]) + + t = _encode(y, uniques=self.classes_) + if execute: + t = t.execute(session=session, **(run_kwargs or dict())) + return t + + def inverse_transform(self, y, session=None, run_kwargs=None): + """Transform labels back to original encoding. + + Parameters + ---------- + y : ndarray of shape (n_samples,) + Target values. + + Returns + ------- + y : ndarray of shape (n_samples,) + Original encoding. + """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return mt.array([]) + + def _class_checker(chunk_data, classes_data): + diff = np.setdiff1d(chunk_data, np.arange(len(classes_data))) + if len(diff): + raise ValueError("y contains previously unseen labels: %s" % str(diff)) + return chunk_data + + y = mt.asarray(y).map_chunk(_class_checker, args=(self.classes_,)) + return self.classes_[y].execute(session=session, **(run_kwargs or dict())) + + def _more_tags(self): + return {"X_types": ["1dlabels"]} + + +class LabelBinarizer(TransformerMixin, BaseEstimator): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). LabelBinarizer makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. LabelBinarizer makes this easy + with the inverse_transform method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False + True if the returned array from transform is desired to be in sparse + CSR format. + + Attributes + ---------- + + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + y_type_ : str + Represents the type of the target data as evaluated by + utils.multiclass.type_of_target. Possible type are 'continuous', + 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'. + + sparse_input_ : bool + True if the input data to transform is given as a sparse matrix, False + otherwise. 
+ + Examples + -------- + >>> from mars.learn import preprocessing + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit([1, 2, 6, 4, 2]) + LabelBinarizer() + >>> lb.classes_ + array([1, 2, 4, 6]) + >>> lb.transform([1, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + Binary targets transform to a column vector + + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + Passing a 2D matrix for multilabel classification + + >>> import numpy as np + >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]])) + LabelBinarizer() + >>> lb.classes_ + array([0, 1, 2]) + >>> lb.transform([0, 1, 2, 1]) + array([[1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, 1, 0]]) + + See Also + -------- + label_binarize : Function to perform the transform operation of + LabelBinarizer with fixed classes. + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + """ + + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than " + "pos_label={1}.".format(neg_label, pos_label) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + self.neg_label = neg_label + self.pos_label = pos_label + self.sparse_output = sparse_output + + def fit(self, y, session=None, run_kwargs=None): + """Fit label binarizer. + + Parameters + ---------- + y : ndarray of shape (n_samples,) or (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. + + Returns + ------- + self : returns an instance of self. + """ + self.y_type_ = fetch_tileable( + execute_tileable( + type_of_target(y), session=session, **(run_kwargs or dict()) + ) + ) + if "multioutput" in self.y_type_: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if _num_samples(y) == 0: # pragma: no cover + raise ValueError("y has 0 samples: %r" % y) + + self.sparse_input_ = mt.tensor(y).issparse() + self.classes_ = unique_labels(y).execute( + session=session, **(run_kwargs or dict()) + ) + return self + + def fit_transform(self, y, session=None, run_kwargs=None): + """Fit label binarizer and transform multi-class labels to binary + labels. + + The output of transform is sometimes referred to as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {ndarray, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + return self.fit(y, session=session, run_kwargs=run_kwargs).transform( + y, session=session, run_kwargs=run_kwargs + ) + + def transform(self, y, session=None, run_kwargs=None): + """Transform multi-class labels to binary labels. + + The output of transform is sometimes referred to by some authors as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {array, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. 
Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + check_is_fitted(self) + + target = fetch_tileable( + execute_tileable( + type_of_target(y), session=session, **(run_kwargs or dict()) + ) + ) + y_is_multilabel = target.startswith("multilabel") + if y_is_multilabel and not self.y_type_.startswith("multilabel"): + raise ValueError("The object was not fitted with multilabel input.") + + return label_binarize( + y, + classes=self.classes_, + pos_label=self.pos_label, + neg_label=self.neg_label, + sparse_output=self.sparse_output, + ) + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels. + + Parameters + ---------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Target values. All sparse matrices are converted to CSR before + inverse transformation. + + threshold : float, default=None + Threshold used in the binary and multi-label cases. + + Use 0 when ``Y`` contains the output of decision_function + (classifier). + Use 0.5 when ``Y`` contains the output of predict_proba. + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y : {ndarray, sparse matrix} of shape (n_samples,) + Target values. Sparse matrix will be of CSR format. + + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), inverse_transform chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's decision_function method directly as the input + of inverse_transform. + """ + check_is_fitted(self) + + if threshold is None: + threshold = (self.pos_label + self.neg_label) / 2.0 + + Y = mt.asarray(Y) + if self.y_type_ == "multiclass": + y_inv = Y.map_chunk( + _inverse_binarize_multiclass, + args=(self.classes_,), + dtype=self.classes_.dtype, + shape=(Y.shape[0],), + ) + else: + shape = (Y.shape[0],) if self.y_type_ != "multilabel-indicator" else Y.shape + y_inv = Y.map_chunk( + _inverse_binarize_thresholding, + args=(self.y_type_, self.classes_, threshold), + dtype=self.classes_.dtype, + shape=shape, + ) + + if self.sparse_input_: + y_inv = y_inv.tosparse() + elif y_inv.issparse(): + y_inv = y_inv.todense() + + return y_inv + + def _more_tags( + self, + ): # pragma: no cover # noqa: R0201 # pylint: disable=no-self-use + return {"X_types": ["1dlabels"]} + + +class LabelBinarize(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.LABEL_BINARIZE + + y = AnyField("y") + classes = AnyField("classes") + neg_label = Int32Field("neg_label") + pos_label = Int32Field("pos_label") + sparse_output = BoolField("sparse_output") + # for chunk + y_type = StringField("y_type") + pos_switch = BoolField("pos_switch") + + def __call__(self, y: TileableType, classes: TileableType): + inputs = [] + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + self.y = y = check_array(y, accept_sparse=True, ensure_2d=False, dtype=None) + if isinstance(y, ENTITY_TYPE): + inputs.append(y) + if isinstance(classes, ENTITY_TYPE): + inputs.append(classes) + self.sparse = self.sparse_output + self.output_types = [OutputType.tensor] + if len(classes) == 2: + n_dim1 = 1 + else: + n_dim1 = len(classes) + return self.new_tileable( + inputs, + shape=(np.nan, n_dim1), + dtype=np.dtype(int), + order=TensorOrder.C_ORDER, + ) + + def 
_set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.y, ENTITY_TYPE): + self.y = self._inputs[0] + if isinstance(self.classes, ENTITY_TYPE): + self.classes = self._inputs[-1] + + @classmethod + def tile(cls, op: "LabelBinarize"): + y = op.y + classes = op.classes + neg_label = op.neg_label + pos_label = op.pos_label + sparse_output = op.sparse_output + out = op.outputs[0] + ctx = get_context() + + if isinstance(y, list): + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + + if len(op.inputs) == 0: + # no entity input + r = sklearn_label_binarize( + op.y, + classes=op.classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + return (yield from recursive_tile(mt.tensor(r))) + else: + # trigger execution + yield + + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than " + "pos_label={1}.".format(neg_label, pos_label) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = yield from recursive_tile(type_of_target(y)) + yield y_type.chunks + y_type = ctx.get_chunks_result([y_type.chunks[0].key])[0] + y_type = y_type.item() if hasattr(y_type, "item") else y_type + if "multioutput" in y_type: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if y_type == "unknown": + raise ValueError("The type of target data is not known") + + n_samples = mt.tensor(y).shape[0] + n_classes = len(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return ( + yield from recursive_tile( + mt.zeros((n_samples, 1), dtype=int, sparse=True) + ) + ) + else: + Y = mt.zeros((len(y), 1), dtype=int) + Y += neg_label + return (yield from recursive_tile(Y)) + elif len(classes) >= 3: + y_type = "multiclass" + + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0]) + if mt.tensor(classes).size != y_n_classes: + raise ValueError( + "classes {0} mismatch with the labels {1}" + " found in the data".format(classes, unique_labels(y)) + ) + + if y_type in ("binary", "multiclass"): + y = yield from recursive_tile(column_or_1d(y)) + if y_type == "binary": + out_shape = (n_samples, 1) + else: + out_shape = (n_samples, n_classes) + elif y_type == "multilabel-indicator": + out_shape = y.shape + else: + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + out_chunks = [] + for y_chunk in y.chunks: + chunk_inputs = [y_chunk] + classes_chunk = classes + if isinstance(classes, ENTITY_TYPE): + chunk_inputs.append(classes.chunks[0]) + classes_chunk = classes.chunks[0] + chunk_op = LabelBinarize( + y=y_chunk, + classes=classes_chunk, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + y_type=y_type, + pos_switch=pos_switch, + _output_types=op.output_types, + ) + if len(out_shape) == 2: + chunk_shape = (y_chunk.shape[0], out_shape[1]) + chunk_index = (y_chunk.index[0], 0) + else: # pragma: no cover + chunk_shape = (y_chunk.shape[0],) + chunk_index = (y_chunk.index[0],) + out_chunk = chunk_op.new_chunk( + chunk_inputs, + shape=chunk_shape, + dtype=out.dtype, + order=out.order, + index=chunk_index, + ) + 
out_chunks.append(out_chunk) + + params = out.params.copy() + params["chunks"] = out_chunks + params["shape"] = out_shape + if len(out_shape) == 2: + nsplits = (y.nsplits[0], (out_shape[1],)) + else: # pragma: no cover + nsplits = (y.nsplits[0],) + params["nsplits"] = nsplits + return op.copy().new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "LabelBinarize"): + y = ctx[op.y.key] + if hasattr(y, "raw"): + # SparseNDArray + y = y.raw + if isinstance(op.classes, ENTITY_TYPE): + classes = ctx[op.classes.key] + else: + classes = op.classes + y_type = op.y_type + sparse_output = op.sparse_output + pos_label = op.pos_label + neg_label = op.neg_label + pos_switch = op.pos_switch + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + sorted_class = np.sort(classes) + + if y_type in ("binary", "multiclass"): + # pick out the known labels from y + y_in_classes = np.in1d(y, classes) + y_seen = y[y_in_classes] + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: # pragma: no cover + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + if not sparse_output: + Y = Y.toarray() + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == "binary": + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) + + if sp.issparse(Y): + Y = SparseNDArray(Y) + ctx[op.outputs[0].key] = Y + + +def label_binarize( + y, *, classes, neg_label=0, pos_label=1, sparse_output=False, execute=True +): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + This function makes it possible to compute this transformation for a + fixed set of class labels known ahead of time. + + Parameters + ---------- + y : array-like + Sequence of integer labels or multilabel data to encode. + + classes : array-like of shape (n_classes,) + Uniquely holds the label for each class. + + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False, + Set to true if output binary array is desired in CSR sparse format. + + Returns + ------- + Y : {tensor, sparse tensor} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. 
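+        Sparse tensor will be of CSR format.
+
+    Notes
+    -----
+    When ``execute`` is False (it defaults to True), the binarized tensor is
+    returned without being executed, so it can be chained with further
+    operations before computation is triggered.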
+ + Examples + -------- + >>> from mars.learn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + See Also + -------- + LabelBinarizer : Class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation. + """ + op = LabelBinarize( + y=y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + result = op(y, classes) + return result.execute() if execute else result + + +def _inverse_binarize_multiclass(y, classes): # pragma: no cover + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. + """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i] : y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding( + y, output_type, classes, threshold +): # pragma: no cover + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError( + "The number of class is not equal to the number of dimension of y." 
+ ) + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ("csr", "csc"): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=int) + else: + y = np.array(y > threshold, dtype=int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) diff --git a/python/xorbits/_mars/learn/preprocessing/normalize.py b/python/xorbits/_mars/learn/preprocessing/normalize.py new file mode 100644 index 000000000..6108cec32 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/normalize.py @@ -0,0 +1,345 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn.preprocessing import normalize as sklearn_normalize +except ImportError: # pragma: no cover + sklearn_normalize = None + +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import ExecutableTuple, recursive_tile +from ...serialization.serializables import BoolField, Int32Field, KeyField, StringField +from ...tensor.array_utils import as_same_device, device, issparse, sparse +from ...tensor.core import TensorOrder +from ...tensor.operands import TensorOperand, TensorOperandMixin +from ..utils import check_array + + +class TensorNormalize(TensorOperand, TensorOperandMixin): + _op_module_ = "learn" + _op_type_ = OperandDef.NORMALIZE + + _input = KeyField("input") + _norm = StringField("norm") + _axis = Int32Field("axis") + _return_norm = BoolField("return_norm") + # for test purpose + _use_sklearn = BoolField("use_sklearn") + + def __init__(self, norm=None, axis=None, return_norm=None, use_sklearn=None, **kw): + super().__init__( + _norm=norm, + _axis=axis, + _return_norm=return_norm, + _use_sklearn=use_sklearn, + **kw, + ) + if self._use_sklearn is None: + # force to use sklearn if not specified + self._use_sklearn = True + + @property + def input(self): + return self._input + + @property + def norm(self): + return self._norm + + @property + def axis(self): + return self._axis + + @property + def return_norm(self): + return self._return_norm + + @property + def use_sklearn(self): + return self._use_sklearn + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @property + def output_limit(self): + return 2 if self._return_norm else 1 + + def __call__(self, x, copy=True): + x = check_array( + x, + accept_sparse=True, + estimator="the normalize function", + dtype=(np.float64, np.float32, np.float16), + ) + + normed = None + if not self._return_norm: + res = self.new_tensor([x], shape=x.shape, order=x.order) + else: + kws = [ + {"shape": x.shape, "order": x.order}, + { + "shape": (x.shape[0] if self._axis == 1 else x.shape[1],), + "order": TensorOrder.C_ORDER, + }, + ] + res, normed = self.new_tensors([x], kws=kws, output_limit=2) + + if not copy and self._axis == 1: + # follow the behaviour of sklearn + x.data = res.data + + if normed is None: + return res + return ExecutableTuple([res, normed]) + + @classmethod + def _tile_one_chunk(cls, op): + outs = op.outputs + chunk_op = op.copy().reset_key() + kws = [{"shape": outs[0].shape, "order": outs[0].order, "index": (0, 0)}] + if len(outs) == 2: + kws.append({"shape": outs[1].shape, "order": outs[1].order, "index": (0,)}) + chunks = chunk_op.new_chunks( + [op.input.chunks[0]], kws=kws, output_limit=len(outs) + ) + + tensor_kws = [ + { + "shape": outs[0].shape, + "order": outs[0].order, + "chunks": [chunks[0]], + "nsplits": tuple((s,) for s in outs[0].shape), + } + ] + if len(outs) == 2: + tensor_kws.append( + { + "shape": outs[1].shape, + "order": outs[1].order, + "chunks": [chunks[1]], + "nsplits": tuple((s,) for s in outs[1].shape), + } + ) + + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=tensor_kws, output_limit=len(outs)) + + @classmethod + def _need_tile_into_chunks(cls, op): + # if true, try to tile into chunks + # whose implementation is based on sklearn itself + x = op.input + if op.gpu: # pragma: no cover + return False + if x.issparse() and op.return_norm and op.norm in ("l1", "l2"): + # sklearn cannot handle + return False + if x.chunk_shape[op.axis] > 1: + return False + return True + + @classmethod + def _tile_chunks(cls, op): + assert op.input.chunk_shape[op.axis] == 1 + x = op.input + axis = op.axis + outs = op.outputs + + out_chunks = [], [] + for i, c in enumerate(x.chunks): + chunk_op = 
op.copy().reset_key() + kws = [{"shape": c.shape, "order": c.order, "index": c.index}] + if op.return_norm: + kws.append( + { + "shape": (c.shape[1 - axis],), + "order": TensorOrder.C_ORDER, + "index": (i,), + } + ) + chunks = chunk_op.new_chunks([c], kws=kws, output_limit=op.output_limit) + out_chunks[0].append(chunks[0]) + if len(chunks) == 2: + out_chunks[1].append(chunks[1]) + + tensor_kws = [ + { + "shape": outs[0].shape, + "order": outs[0].order, + "chunks": out_chunks[0], + "nsplits": x.nsplits, + } + ] + if len(outs) == 2: + tensor_kws.append( + { + "shape": outs[1].shape, + "order": outs[1].order, + "chunks": out_chunks[1], + "nsplits": (x.nsplits[1 - axis],), + } + ) + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=tensor_kws, output_limit=len(outs)) + + @classmethod + def tile(cls, op): + x = op.input + norm = op.norm + axis = op.axis + + if len(x.chunks) == 1: + return cls._tile_one_chunk(op) + + if cls._need_tile_into_chunks(op): + return cls._tile_chunks(op) + else: + if norm == "l1": + norms = mt.abs(x).sum(axis=axis) + elif norm == "l2": + norms = mt.sqrt((x**2).sum(axis=axis)) + else: + assert norm == "max" + # sparse.max will still be a sparse, + # force to convert to dense + norms = mt.max(x, axis=axis).todense() + norms = mt.where(mt.equal(norms, 0.0), 1.0, norms) + if axis == 1: + x = x / norms[:, mt.newaxis] + else: + x = x / norms[mt.newaxis, :] + + ret = [(yield from recursive_tile(x))] + if op.return_norm: + ret.append((yield from recursive_tile(norms))) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + for i, r in enumerate(ret): + kws[i]["chunks"] = r.chunks + kws[i]["nsplits"] = r.nsplits + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + axis = op.axis + return_norm = op.return_norm + norm = op.norm + outs = op.outputs + + with device(device_id): + if device_id < 0 and op.use_sklearn and sklearn_normalize is not None: + # no GPU + try: + if xp is sparse: + if axis == 0: + xm = x.raw.tocsc() + else: + xm = x.raw + else: + xm = x + ret = sklearn_normalize( + xm, norm=norm, axis=axis, return_norm=return_norm + ) + normed = None + if return_norm: + ret, normed = ret + if issparse(ret): + ret = sparse.SparseNDArray(ret) + ctx[outs[0].key] = ret + if normed is not None: + ctx[outs[1].key] = normed + return + except NotImplementedError: + pass + + # fall back + if axis == 0: + x = x.T + + if norm == "l1": + norms = xp.abs(x).sum(axis=1) + elif norm == "l2": + norms = xp.sqrt((x**2).sum(axis=1)) + else: + norms = xp.max(x, axis=1) + if issparse(norms): + norms = norms.toarray() + norms[norms == 0.0] = 1.0 + x = x / norms[:, np.newaxis] + + if axis == 0: + x = x.T + + ctx[outs[0].key] = x + if return_norm: + ctx[outs[1].key] = norms + + +def normalize(X, norm="l2", axis=1, copy=True, return_norm=False): + """ + Scale input vectors individually to unit norm (vector length). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape [n_samples, n_features] + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + norm : 'l1', 'l2', or 'max', optional ('l2' by default) + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : 0 or 1, optional (1 by default) + axis used to normalize the data along. 
If 1, independently normalize + each sample, otherwise (if 0) normalize each feature. + + copy : boolean, optional, default True + set to False to perform inplace row normalization and avoid a + copy (if the input is already a tensor and if axis is 1). + + return_norm : boolean, default False + whether to return the computed norms + + Returns + ------- + X : {array-like, sparse matrix}, shape [n_samples, n_features] + Normalized input X. + + norms : Tensor, shape [n_samples] if axis=1 else [n_features] + A tensor of norms along given axis for X. + When X is sparse, a NotImplementedError will be raised + for norm 'l1' or 'l2'. + + See also + -------- + Normalizer: Performs normalization using the ``Transformer`` API + (e.g. as part of a preprocessing :class:`mars.learn.pipeline.Pipeline`). + """ + if norm not in ("l1", "l2", "max"): + raise ValueError(f"'{norm}' is not a supported norm") + if axis not in (0, 1): + raise ValueError(f"'{axis}' is not a supported axis") + op = TensorNormalize(norm=norm, axis=axis, return_norm=return_norm, dtype=X.dtype) + return op(X, copy=copy) diff --git a/python/xorbits/_mars/learn/preprocessing/tests/__init__.py b/python/xorbits/_mars/learn/preprocessing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/preprocessing/tests/test_data.py b/python/xorbits/_mars/learn/preprocessing/tests/test_data.py new file mode 100644 index 000000000..fe33b9256 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/tests/test_data.py @@ -0,0 +1,216 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sklearn.datasets import load_iris +from sklearn.utils import gen_batches +from sklearn.utils._testing import assert_allclose, assert_array_almost_equal + +from .... import tensor as mt +from .. 
import MinMaxScaler, minmax_scale + + +def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen): + if batch_stop != n: + assert (i + 1) * chunk_size == n_samples_seen + else: + assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen + + +def _check_dim_1axis(a): + return mt.asarray(a).shape[0] + + +rng = mt.random.RandomState(0) +n_features = 30 +n_samples = 1000 +offsets = rng.uniform(-1, 1, size=n_features) +scales = rng.uniform(1, 10, size=n_features) +X_2d = rng.randn(n_samples, n_features) * scales + offsets +X_1row = X_2d[0, :].reshape(1, n_features) +X_1col = X_2d[:, 0].reshape(n_samples, 1) +iris = mt.tensor(load_iris().data) + + +@pytest.mark.parametrize("chunk_size", [200, X_2d.shape[0], X_2d.shape[0] + 42]) +def test_min_max_scaler_partial_fit(setup, chunk_size): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d + n = X.shape[0] + + # Test mean at the end of the process + scaler_batch = MinMaxScaler().fit(X) + + scaler_incr = MinMaxScaler() + for batch in gen_batches(n_samples, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_batch = MinMaxScaler().fit(X[batch0]) + scaler_incr = MinMaxScaler().partial_fit(X[batch0]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std until the end of partial fits, and + _ = MinMaxScaler().fit(X) + scaler_incr = MinMaxScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n_samples, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + +def test_min_max_scaler_iris(setup): + X = iris + scaler = MinMaxScaler() + # default params + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 0) + assert_array_almost_equal(X_trans.max(axis=0), 1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # not default params: min=1, max=2 + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 1) + assert_array_almost_equal(X_trans.max(axis=0), 2) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # min=-.5, max=.6 + scaler = MinMaxScaler(feature_range=(-0.5, 0.6)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), -0.5) + assert_array_almost_equal(X_trans.max(axis=0), 0.6) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, 
X_trans_inv) + + # raises on invalid range + scaler = MinMaxScaler(feature_range=(2, 1)) + with pytest.raises(ValueError): + scaler.fit(X) + + +def test_min_max_scaler_zero_variance_features(setup): + # Check min max scaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] + + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + + # default params + scaler = MinMaxScaler() + X_trans = scaler.fit_transform(X) + X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]] + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + X_trans_new = scaler.transform(X_new) + X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]] + assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) + + # not default params + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]] + assert_array_almost_equal(X_trans, X_expected_1_2) + + # function interface + X_trans = minmax_scale(X) + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans = minmax_scale(X, feature_range=(1, 2)) + assert_array_almost_equal(X_trans, X_expected_1_2) + + +def test_minmax_scale_axis1(setup): + X = iris + X_trans = minmax_scale(X, axis=1) + assert_array_almost_equal(mt.min(X_trans, axis=1), 0) + assert_array_almost_equal(mt.max(X_trans, axis=1), 1) + + +def test_min_max_scaler1d(setup): + X_list_1row = X_1row.to_numpy().tolist() + X_list_1col = X_1col.to_numpy().tolist() + + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1col]: + scaler = MinMaxScaler(copy=True) + X_scaled = scaler.fit(X).transform(X) + + if isinstance(X, list): + X = mt.array(X) # cast only after scaling done + + if _check_dim_1axis(X) == 1: + assert_array_almost_equal(X_scaled.min(axis=0), mt.zeros(n_features)) + assert_array_almost_equal(X_scaled.max(axis=0), mt.zeros(n_features)) + else: + assert_array_almost_equal(X_scaled.min(axis=0), 0.0) + assert_array_almost_equal(X_scaled.max(axis=0), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = mt.ones((5, 1)) + scaler = MinMaxScaler() + X_scaled = scaler.fit(X).transform(X) + assert X_scaled.min().to_numpy() >= 0.0 + assert X_scaled.max().to_numpy() <= 1.0 + assert scaler.n_samples_seen_ == X.shape[0] + + # Function interface + X_1d = X_1row.ravel() + min_ = X_1d.min() + max_ = X_1d.max() + assert_array_almost_equal( + (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True) + ) + + +@pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)]) +def test_minmax_scaler_clip(setup, feature_range): + # test behaviour of the parameter 'clip' in MinMaxScaler + X = iris + scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X) + X_min, X_max = mt.min(X, axis=0), mt.max(X, axis=0) + X_test = [mt.r_[X_min[:2] - 10, X_max[2:] + 10]] + X_transformed = scaler.transform(X_test) + assert_allclose( + X_transformed, + [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]], + ) diff --git a/python/xorbits/_mars/learn/preprocessing/tests/test_label.py b/python/xorbits/_mars/learn/preprocessing/tests/test_label.py new file mode 100644 index 000000000..1d4cf0425 --- /dev/null +++ 
b/python/xorbits/_mars/learn/preprocessing/tests/test_label.py @@ -0,0 +1,381 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sp +from sklearn.preprocessing._label import ( + _inverse_binarize_multiclass, + _inverse_binarize_thresholding, +) +from sklearn.utils._testing import assert_array_equal, ignore_warnings +from sklearn.utils.multiclass import type_of_target + +from .... import tensor as mt +from .. import LabelBinarizer, LabelEncoder, label_binarize + + +def test_label_binarizer(setup): + # one-class case defaults to negative label + # For dense case: + inp = ["pos", "pos", "pos", "pos"] + lb = LabelBinarizer(sparse_output=False) + expected = np.array([[0, 0, 0, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # For sparse case: + lb = LabelBinarizer(sparse_output=True) + got = lb.fit_transform(inp) + assert got.issparse() + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got.fetch().toarray()) + assert_array_equal(lb.inverse_transform(got.todense()), inp) + + lb = LabelBinarizer(sparse_output=False) + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["neg", "pos"]) + assert_array_equal(expected, got) + + to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]]) + assert_array_equal(lb.inverse_transform(to_invert), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array( + [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]] + ) + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_set_label_encoding(setup): + lb = LabelBinarizer(neg_label=-2, pos_label=0) + + # two-class case with pos_label=0 + inp = np.array([0, 1, 1, 0]) + expected = np.array([[-2, 0, 0, -2]]).T + got = lb.fit_transform(mt.tensor(inp)) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # multi-class case + inp = np.array([3, 2, 1, 2, 0]) + expected = np.array( + [ + [-2, -2, -2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2], + ] + ) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +@ignore_warnings +def test_label_binarizer_errors(setup): + # Check that invalid arguments yield ValueError + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + + multi_label = [(2, 3), (0,), (0, 2)] + with pytest.raises(ValueError): + lb.transform(multi_label) + + lb = LabelBinarizer() + with pytest.raises(ValueError): + lb.transform([]) + with 
pytest.raises(ValueError): + lb.inverse_transform([]) + + with pytest.raises(ValueError): + LabelBinarizer(neg_label=2, pos_label=1) + with pytest.raises(ValueError): + LabelBinarizer(neg_label=2, pos_label=2) + + with pytest.raises(ValueError): + LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + + # Sequence of seq type should raise ValueError + y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] + with pytest.raises(ValueError): + LabelBinarizer().fit_transform(y_seq_of_seqs) + + # Fail on multioutput data + with pytest.raises(ValueError): + LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) + with pytest.raises(ValueError): + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) + + +def test_label_binarize_with_class_order(setup): + out = label_binarize([1, 6], classes=[1, 2, 4, 6]) + expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]]) + assert_array_equal(out, expected) + + # Modified class order + out = label_binarize([1, 6], classes=[1, 6, 4, 2]) + expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]]) + assert_array_equal(out, expected) + + out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1]) + expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]]) + assert_array_equal(out, expected) + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def check_binarized_results(y, classes, pos_label, neg_label, expected): + for sparse_output in [True, False]: + if (pos_label == 0 or neg_label != 0) and sparse_output: + with pytest.raises(ValueError): + label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + continue + + # check label_binarize + binarized = label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + binarized = binarized.fetch() + if hasattr(binarized, "raw"): + binarized = binarized.raw + assert_array_equal(toarray(binarized), expected) + assert sp.issparse(binarized) == sparse_output + + # check inverse + y_type = type_of_target(y) + if y_type == "multiclass": + inversed = _inverse_binarize_multiclass(binarized, classes=classes) + + else: + inversed = _inverse_binarize_thresholding( + binarized.copy(), # https://github.com/mars-project/mars/issues/3268 + output_type=y_type, + classes=classes, + threshold=((neg_label + pos_label) / 2.0), + ) + + assert_array_equal(toarray(inversed), toarray(y)) + + # Check label binarizer + lb = LabelBinarizer( + neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output + ) + binarized = lb.fit_transform(y) + assert_array_equal(toarray(binarized), expected) + assert binarized.issparse() == sparse_output + inverse_output = lb.inverse_transform(binarized) + assert_array_equal(toarray(inverse_output), toarray(y)) + assert inverse_output.issparse() == sp.issparse(y) + + +def test_label_binarize_binary(setup): + y = [0, 1, 0] + classes = [0, 1] + pos_label = 2 + neg_label = -1 + expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + # Binary case where sparse_output = True will not result in a ValueError + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + +def test_label_binarize_multiclass(setup): + y = [0, 1, 2] + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = 2 * np.eye(3) + 
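+    # pos_label=2 and neg_label=0 encode each class as 2 on the diagonal and
+    # 0 elsewhere, hence the expected matrix 2 * np.eye(3)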
+ check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +def test_label_binarize_multilabel(setup): + y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]]) + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = pos_label * y_ind + y_sparse = [sp.csr_matrix(y_ind)] + + for y in [y_ind] + y_sparse: + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +def test_invalid_input_label_binarize(setup): + with pytest.raises(ValueError): + label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1) + with pytest.raises(ValueError, match="continuous target data is not "): + label_binarize([1.2, 2.7], classes=[0, 1]) + with pytest.raises(ValueError, match="mismatch with the labels"): + label_binarize([[1, 3]], classes=[1, 2, 3]) + + +@pytest.mark.parametrize( + "values, classes, unknown", + [ + ( + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array([1, 2, 3], dtype="int64"), + np.array([4], dtype="int64"), + ), + ( + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + np.array(["d"], dtype=object), + ), + ( + np.array(["b", "a", "c", "a", "c"]), + np.array(["a", "b", "c"]), + np.array(["d"]), + ), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder(setup, values, classes, unknown): + # Test LabelEncoder's transform, fit_transform and + # inverse_transform methods + values_t = mt.tensor(values) + + le = LabelEncoder() + le.fit(values_t) + assert_array_equal(le.classes_.fetch(), classes) + assert_array_equal(le.transform(values_t).fetch(), [1, 0, 2, 0, 2]) + assert_array_equal(le.inverse_transform(mt.tensor([1, 0, 2, 0, 2])).fetch(), values) + + le = LabelEncoder() + ret = le.fit_transform(values) + assert_array_equal(ret.fetch(), [1, 0, 2, 0, 2]) + + with pytest.raises(ValueError, match="unseen labels"): + le.transform(unknown) + + +def test_label_encoder_missing_values_numeric(setup): + values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float) + values_t = mt.tensor(values) + le = LabelEncoder() + assert_array_equal(le.fit_transform(values_t).fetch(), [1, 0, 3, 2, 1, 3]) + + +def test_label_encoder_negative_ints(setup): + le = LabelEncoder() + le.fit(mt.tensor([1, 1, 4, 5, -1, 0])) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal( + le.transform(mt.tensor([0, 1, 4, 4, 5, -1, -1])), [1, 2, 3, 3, 4, 0, 0] + ) + assert_array_equal( + le.inverse_transform(mt.tensor([1, 2, 3, 3, 4, 0, 0])), [0, 1, 4, 4, 5, -1, -1] + ) + with pytest.raises(ValueError): + le.transform(mt.tensor([0, 6])) + + +@pytest.mark.parametrize("dtype", ["str", "object"]) +def test_label_encoder_str_bad_shape(setup, dtype): + le = LabelEncoder() + le.fit(mt.tensor(np.array(["apple", "orange"], dtype=dtype))) + msg = "should be a 1d array" + with pytest.raises(ValueError, match=msg): + le.transform("apple") + + +def test_label_encoder_errors(setup): + # Check that invalid arguments yield ValueError + le = LabelEncoder() + with pytest.raises(ValueError): + le.transform([]) + with pytest.raises(ValueError): + le.inverse_transform([]) + + # Fail on unseen labels + le = LabelEncoder() + le.fit(mt.tensor([1, 2, 3, -1, 1])) + msg = "contains previously unseen labels" + with pytest.raises(ValueError, match=msg): + 
le.inverse_transform(mt.tensor([-2])) + with pytest.raises(ValueError, match=msg): + le.inverse_transform(mt.tensor([-2, -3, -4])) + + # Fail on inverse_transform("") + msg = r"should be a 1d array.+shape \(\)" + with pytest.raises(ValueError, match=msg): + le.inverse_transform("") + + +@pytest.mark.parametrize( + "values", + [ + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["b", "a", "c", "a", "c"]), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder_empty_array(setup, values): + values_t = mt.tensor(values) + + le = LabelEncoder() + le.fit(values_t) + # test empty transform + transformed = le.transform(mt.array([])) + assert_array_equal(np.array([]), transformed) + # test empty inverse transform + inverse_transformed = le.inverse_transform([]) + assert_array_equal(np.array([]), inverse_transformed) diff --git a/python/xorbits/_mars/learn/preprocessing/tests/test_normalize.py b/python/xorbits/_mars/learn/preprocessing/tests/test_normalize.py new file mode 100644 index 000000000..017a81b38 --- /dev/null +++ b/python/xorbits/_mars/learn/preprocessing/tests/test_normalize.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps +from sklearn.preprocessing import normalize as sk_normalize + +from .... import tensor as mt +from .. 
import normalize + + +def test_normalize_op(): + with pytest.raises(ValueError): + normalize(mt.random.random(10, 3), norm="unknown") + + with pytest.raises(ValueError): + normalize(mt.random.random(10, 3), axis=-1) + + with pytest.raises(ValueError): + normalize(mt.random.rand(10, 3, 3)) + + +def test_normalize_execution(setup): + raw_dense = np.random.rand(10, 10) + raw_sparse = sps.random(10, 10, density=0.4, format="csr") + + for chunk_size in [10, 6, (10, 6), (6, 10)]: + for raw, x in [ + (raw_dense, mt.tensor(raw_dense, chunk_size=chunk_size)), + (raw_sparse, mt.tensor(raw_sparse, chunk_size=chunk_size)), + ]: + for norm in ["l1", "l2", "max"]: + for axis in (0, 1): + for use_sklearn in [True, False]: + n = normalize(x, norm=norm, axis=axis, return_norm=False) + n.op._use_sklearn = use_sklearn + + result = n.execute().fetch() + expected = sk_normalize( + raw, norm=norm, axis=axis, return_norm=False + ) + + if sps.issparse(expected): + expected = expected.A + np.testing.assert_almost_equal(np.asarray(result), expected) + + raw_dense = np.random.rand(10, 10) + raw_sparse = sps.random(10, 10, density=0.4, format="csr") + + # test copy and return_normalize + for axis in (0, 1): + for chunk_size in (10, 6, (6, 10)): + for raw in (raw_dense, raw_sparse): + x = mt.tensor(raw, chunk_size=chunk_size) + n = normalize(x, axis=axis, copy=False, return_norm=True) + + results = n.execute().fetch() + raw_copy = raw.copy() + try: + expects = sk_normalize( + raw_copy, axis=axis, copy=False, return_norm=True + ) + except NotImplementedError: + continue + + if sps.issparse(expects[0]): + expected = expects[0].A + else: + expected = expects[0] + np.testing.assert_almost_equal(np.asarray(results[0]), expected) + np.testing.assert_almost_equal(results[1], expects[1]) diff --git a/python/xorbits/_mars/learn/proxima/__init__.py b/python/xorbits/_mars/learn/proxima/__init__.py new file mode 100644 index 000000000..ea47818fc --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def register_op(): + from .simple_index.builder import ProximaBuilder + from .simple_index.searcher import ProximaSearcher + + del ProximaBuilder + del ProximaSearcher diff --git a/python/xorbits/_mars/learn/proxima/core.py b/python/xorbits/_mars/learn/proxima/core.py new file mode 100644 index 000000000..4ac67d904 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/core.py @@ -0,0 +1,178 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + import pyproxima2 as proxima +except ImportError: # pragma: no cover + proxima = None + +from ... import tensor as mt +from ...tensor.indexing import TensorSlice +from ...tensor.merge import TensorConcatenate + +available_numpy_dtypes = [ + np.dtype(np.float16), + np.dtype(np.float32), + np.dtype(np.int8), + np.dtype(np.int16), +] + + +if proxima: + _proxima_types = [ + proxima.IndexMeta.FT_FP16, + proxima.IndexMeta.FT_FP32, + proxima.IndexMeta.FT_INT8, + proxima.IndexMeta.FT_INT16, + ] + assert len(_proxima_types) == len(available_numpy_dtypes) + _type_mapping = { + numpy_dtype: proxima_type + for numpy_dtype, proxima_type in zip(available_numpy_dtypes, _proxima_types) + } + + +def rechunk_tensor(tensor, chunk_size): + # TODO(hks): Provide a unify rechunk logic with mmap. + cur_chunks = [] + + out_nchunks = tensor.shape[0] // chunk_size + row_nsplits = [chunk_size] * out_nchunks + rest = tensor.shape[0] % chunk_size + if rest >= out_nchunks: + row_nsplits.append(rest) + else: + for i in range(tensor.shape[0] % chunk_size): + row_nsplits[-i - 1] += 1 + + tensor_cumnrows = np.cumsum([0] + list(tensor.nsplits[0])) + offset = 0 + out_groups = [] + for split in row_nsplits: + start_chunk_index = int(tensor_cumnrows.searchsorted(offset)) + start_chunk_index = start_chunk_index - 1 if start_chunk_index != 0 else 0 + end_chunk_index = int(tensor_cumnrows.searchsorted(offset + split) - 1) + if start_chunk_index == end_chunk_index: + t = tensor.chunks[start_chunk_index] + slice_op = TensorSlice( + [ + slice( + offset - tensor_cumnrows[start_chunk_index], + split + offset - tensor_cumnrows[end_chunk_index], + ), + slice(None), + ], + dtype=t.dtype, + ) + out_groups.append( + [ + slice_op.new_chunk( + [t], + shape=(split, t.shape[1]), + index=(len(cur_chunks), 0), + order=t.order, + ) + ] + ) + else: + chunks = [] + start_chunk = tensor.chunks[start_chunk_index] + start_slice = int(offset - tensor_cumnrows[start_chunk_index]) + slice_op = TensorSlice( + [slice(start_slice, None), slice(None)], dtype=start_chunk.dtype + ) + chunks.append( + slice_op.new_chunk( + [start_chunk], + shape=(start_chunk.shape[0] - start_slice, start_chunk.shape[1]), + index=(0, 0), + order=start_chunk.order, + ) + ) + chunks.extend(tensor.chunks[start_chunk_index + 1 : end_chunk_index]) + end_chunk = tensor.chunks[end_chunk_index] + end_slice = int(split + offset - tensor_cumnrows[end_chunk_index]) + slice_op_end = TensorSlice( + [slice(None, end_slice), slice(None)], dtype=start_chunk.dtype + ) + chunks.append( + slice_op_end.new_chunk( + [end_chunk], + shape=(end_slice, end_chunk.shape[1]), + index=(end_chunk_index - start_chunk_index, 0), + order=end_chunk.order, + ) + ) + out_groups.append(chunks) + + offset += split + + return out_groups + + +def build_mmap_chunks(chunks, worker, file_prefix): + write_mmap_chunks = [] + nrows = sum(c.shape[0] for c in chunks) + array_shape = (nrows, chunks[0].shape[1]) + array_dtype = chunks[0].dtype + create_mmap_op = TensorConcatenate( + mmap=True, + create_mmap_file=True, + total_shape=array_shape, + file_prefix=file_prefix, + dtype=array_dtype, + ) + create_mmap_op.expect_worker = worker + create_mmap_chunk = create_mmap_op.new_chunk( + None, index=(0,), shape=(), dtype=array_dtype + ) + start_index = 0 + for j, chk in enumerate(chunks): + s = slice(start_index, start_index + chk.shape[0]) + start_index += chk.shape[0] + write_mmap_op = 
TensorConcatenate( + mmap=True, + create_mmap_file=False, + total_shape=array_shape, + partition_slice=s, + dtype=array_dtype, + ) + write_mmap_op.expect_worker = worker + write_mmap_chunk = write_mmap_op.new_chunk( + [create_mmap_chunk, chk], index=(j + 1, 0), shape=(), dtype=array_dtype + ) + write_mmap_chunks.append(write_mmap_chunk) + return write_mmap_chunks + + +def validate_tensor(tensor): + if hasattr(tensor, "to_tensor"): + tensor = tensor.to_tensor() + else: + tensor = mt.tensor(tensor) + if tensor.ndim != 2: + raise ValueError("Input tensor should be 2-d") + return tensor + + +def get_proxima_type(np_dtype): + try: + return _type_mapping[np_dtype] + except KeyError: + raise TypeError( + f"Does not support {np_dtype}, available types include " + f"{', '.join(t.name for t in _type_mapping)}" + ) diff --git a/python/xorbits/_mars/learn/proxima/simple_index/__init__.py b/python/xorbits/_mars/learn/proxima/simple_index/__init__.py new file mode 100644 index 000000000..42818f9a0 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .builder import build_index +from .searcher import search_index +from .recall import recall # isort: skip diff --git a/python/xorbits/_mars/learn/proxima/simple_index/builder.py b/python/xorbits/_mars/learn/proxima/simple_index/builder.py new file mode 100644 index 000000000..1ab2f37fc --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/builder.py @@ -0,0 +1,438 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import os +import pickle # nosec # pylint: disable=import_pickle +import tempfile +import uuid + +import numpy as np + +from .... import opcodes +from .... 
import tensor as mt +from ....core import OutputType +from ....core.context import get_context +from ....core.operand import OperandStage +from ....lib.filesystem import get_fs +from ....serialization.serializables import ( + BytesField, + DataTypeField, + DictField, + Int32Field, + Int64Field, + StringField, + TupleField, +) +from ....utils import Timer, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin +from ..core import ( + available_numpy_dtypes, + build_mmap_chunks, + get_proxima_type, + proxima, + rechunk_tensor, + validate_tensor, +) + +logger = logging.getLogger(__name__) + +DEFAULT_INDEX_SIZE = 5 * 10**6 + + +class ProximaBuilder(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.PROXIMA_SIMPLE_BUILDER + + _distance_metric = StringField("distance_metric") + _dimension = Int32Field("dimension") + _column_number = Int64Field("column_number") + _index_path = StringField("index_path") + _index_builder = StringField("index_builder") + _index_builder_params = DictField("index_builder_params") + _index_converter = StringField("index_converter") + _index_converter_params = DictField("index_converter_params") + _topk = Int32Field("topk") + _storage_options = BytesField( + "storage_options", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + + # only for chunk + _array_shape = TupleField("array_shape") + _array_dtype = DataTypeField("array_dtype") + _offset = Int64Field("offset") + + def __init__( + self, + distance_metric=None, + index_path=None, + dimension=None, + column_number=None, + index_builder=None, + index_builder_params=None, + index_converter=None, + index_converter_params=None, + array_shape=None, + array_dtype=None, + offset=None, + topk=None, + storage_options=None, + output_types=None, + **kw, + ): + super().__init__( + _distance_metric=distance_metric, + _index_path=index_path, + _dimension=dimension, + _column_number=column_number, + _index_builder=index_builder, + _index_builder_params=index_builder_params, + _array_shape=array_shape, + _array_dtype=array_dtype, + _offset=offset, + _index_converter=index_converter, + _index_converter_params=index_converter_params, + _topk=topk, + _storage_options=storage_options, + _output_types=output_types, + **kw, + ) + if self._output_types is None: + self._output_types = [OutputType.object] + + @property + def distance_metric(self): + return self._distance_metric + + @property + def column_number(self): + return self._column_number + + @property + def index_path(self): + return self._index_path + + @property + def dimension(self): + return self._dimension + + @property + def index_builder(self): + return self._index_builder + + @property + def index_builder_params(self): + return self._index_builder_params + + @property + def index_converter(self): + return self._index_converter + + @property + def index_converter_params(self): + return self._index_converter_params + + @property + def topk(self): + return self._topk + + @property + def storage_options(self): + return self._storage_options + + @property + def array_shape(self): + return self._array_shape + + @property + def array_dtype(self): + return self._array_dtype + + @property + def offset(self): + return self._offset + + def __call__(self, tensor): + return self.new_tileable([tensor]) + + @classmethod + def _get_atleast_topk_nsplit(cls, nsplit, topk): + new_nsplit = [] + i = 0 + while i < len(nsplit): + cur = nsplit[i] + i += 1 + if cur >= topk: + new_nsplit.append(cur) + else: + while i < len(nsplit): + cur += nsplit[i] + i += 1 + if 
cur >= topk: + break + if cur < topk and len(new_nsplit) > 0: + new_nsplit[-1] += cur + elif cur >= topk: + new_nsplit.append(cur) + new_nsplit = tuple(new_nsplit) + assert sum(new_nsplit) == sum( + nsplit + ), f"sum of nsplit not equal, old: {nsplit}, new: {new_nsplit}" + + return new_nsplit + + @classmethod + def tile(cls, op): + tensor = op.inputs[0] + out = op.outputs[0] + index_path = op.index_path + ctx = get_context() + fs = None + if index_path is not None: + fs = get_fs(index_path, op.storage_options) + + if index_path is not None: + # check if the index path is empty + try: + files = [f for f in fs.ls(index_path) if "proxima_" in f] + if files: + raise ValueError( + f"Directory {index_path} contains built proxima index, " + f"clean them to perform new index building" + ) + except FileNotFoundError: + # if not exist, create directory + fs.mkdir(index_path) + + # make sure all inputs have known chunk sizes + if has_unknown_shape(*op.inputs): + yield + + if op.column_number: + index_chunk_size = op.inputs[0].shape[0] // op.column_number + else: + worker_num = len(ctx.get_worker_addresses() or []) + if worker_num > 0: + index_chunk_size = max( + op.inputs[0].shape[0] // worker_num, DEFAULT_INDEX_SIZE + ) + else: + index_chunk_size = DEFAULT_INDEX_SIZE + + if op.topk is not None: + index_chunk_size = cls._get_atleast_topk_nsplit(index_chunk_size, op.topk) + + # build chunks for writing tensors to mmap files. + worker_iter = iter(itertools.cycle(ctx.get_worker_addresses() or [None])) + chunk_groups = rechunk_tensor(tensor, index_chunk_size) + out_chunks = [] + offsets = [] + offset = 0 + for chunk_group in chunk_groups: + offsets.append(offset) + file_prefix = f"proxima-build-{str(uuid.uuid4())}" + out_chunks.append( + build_mmap_chunks( + chunk_group, next(worker_iter), file_prefix=file_prefix + ) + ) + offset += sum(c.shape[0] for c in chunk_group) + + final_out_chunks = [] + for j, chunks in enumerate(out_chunks): + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_op.expect_worker = chunks[0].op.expect_worker + chunk_op._array_shape = chunks[0].op.total_shape + chunk_op._array_dtype = chunks[0].dtype + chunk_op._offset = offsets[j] + out_chunk = chunk_op.new_chunk(chunks, index=(j,)) + final_out_chunks.append(out_chunk) + + logger.warning(f"index chunks count: {len(final_out_chunks)} ") + + params = out.params + params["chunks"] = final_out_chunks + params["nsplits"] = ((1,) * len(final_out_chunks),) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _execute_map(cls, ctx, op: "ProximaBuilder"): + mmap_path = ctx[op.inputs[0].key] + out = op.outputs[0] + + data = np.memmap( + mmap_path, dtype=op.array_dtype, mode="r", shape=op.array_shape + ) + + proxima_type = get_proxima_type(op.array_dtype) + offset = op.offset + + # holder + with Timer() as timer: + holder = proxima.IndexHolder( + type=proxima_type, dimension=op.dimension, shallow=True + ) + holder.mount(data, key_base=offset) + + logger.warning(f"Holder({op.key}) costs {timer.duration} seconds") + + # converter + meta = proxima.IndexMeta( + proxima_type, dimension=op.dimension, measure_name=op.distance_metric + ) + if op.index_converter is not None: + with Timer() as timer: + converter = proxima.IndexConverter( + name=op.index_converter, meta=meta, params=op.index_converter_params + ) + converter.train_and_transform(holder) + holder = converter.result() + meta = converter.meta() + + logger.warning(f"Converter({op.key}) costs {timer.duration} seconds") + 
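+        # When a converter is configured, ``holder`` and ``meta`` now refer to
+        # the converted data; the builder below therefore always consumes the
+        # (possibly transformed) holder.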
+ # builder + with Timer() as timer: + builder = proxima.IndexBuilder( + name=op.index_builder, meta=meta, params=op.index_builder_params + ) + builder = builder.train_and_build(holder) + + logger.warning(f"Builder({op.key}) costs {timer.duration} seconds") + + # remove mmap file + os.remove(mmap_path) + + # dumper + with Timer() as timer: + path = tempfile.mkstemp(prefix="proxima-", suffix=".index")[1] + dumper = proxima.IndexDumper(name="FileDumper", path=path) + builder.dump(dumper) + dumper.close() + + logger.warning(f"Dumper({op.key}) costs {timer.duration} seconds") + + if op.index_path is None: + ctx[out.key] = path + else: + # write to external file + with Timer() as timer: + fs = get_fs(op.index_path, op.storage_options) + filename = f"proxima_{out.index[0]}_index" + out_path = f'{op.index_path.rstrip("/")}/{filename}' + + def write_index(): + with fs.open(out_path, "wb") as out_f: + with open(path, "rb") as in_f: + # 128M + chunk_bytes = 128 * 1024**2 + while True: + data = in_f.read(chunk_bytes) + if data: + out_f.write(data) + else: + break + + # retry 3 times + for _ in range(3): + try: + write_index() + break + except: # noqa: E722 # nosec # pylint: disable=bare-except + fs.delete(out_path) + continue + + logger.warning( + f"WritingToVolume({op.key}), out path: {out_path}, " + f"size {os.path.getsize(path)}, " + f"costs {timer.duration} seconds " + f"speed {round(os.path.getsize(path) / (1024 ** 2) / timer.duration, 2)} MB/s" + ) + + ctx[out.key] = filename + + @classmethod + def _execute_agg(cls, ctx, op: "ProximaBuilder"): + paths = [ctx[inp.key] for inp in op.inputs] + ctx[op.outputs[0].key] = paths + + @classmethod + def execute(cls, ctx, op: "ProximaBuilder"): + if op.stage != OperandStage.agg: + return cls._execute_map(ctx, op) + else: + return cls._execute_agg(ctx, op) + + @classmethod + def concat_tileable_chunks(cls, tileable): + assert not tileable.is_coarse() + + op = cls(stage=OperandStage.agg) + chunk = cls(stage=OperandStage.agg).new_chunk(tileable.chunks) + return op.new_tileable([tileable], chunks=[chunk], nsplits=((1,),)) + + +def build_index( + tensor, + dimension=None, + index_path=None, + column_number=None, + need_shuffle=False, + distance_metric="SquaredEuclidean", + index_builder="SsgBuilder", + index_builder_params=None, + index_converter=None, + index_converter_params=None, + topk=None, + storage_options=None, + run=True, + session=None, + run_kwargs=None, +): + tensor = validate_tensor(tensor) + if tensor.dtype not in available_numpy_dtypes: + raise ValueError( + f"Dtype to build index should be one of {available_numpy_dtypes}, " + f"got {tensor.dtype}" + ) + + if dimension is None: + dimension = tensor.shape[1] + if index_builder_params is None: + index_builder_params = {} + if index_converter_params is None: + index_converter_params = {} + + if need_shuffle: + tensor = mt.random.permutation(tensor) + + op = ProximaBuilder( + distance_metric=distance_metric, + index_path=index_path, + dimension=dimension, + column_number=column_number, + index_builder=index_builder, + index_builder_params=index_builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + topk=topk, + storage_options=storage_options, + ) + result = op(tensor) + if run: + return result.execute(session=session, **(run_kwargs or dict())) + else: + return result diff --git a/python/xorbits/_mars/learn/proxima/simple_index/knn.py b/python/xorbits/_mars/learn/proxima/simple_index/knn.py new file mode 100644 index 000000000..1c3fe1b40 --- /dev/null 
+++ b/python/xorbits/_mars/learn/proxima/simple_index/knn.py @@ -0,0 +1,140 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import pandas as pd + +from .... import dataframe as md +from .... import tensor as mt +from . import build_index, search_index + + +def sample_data(query, sample_count=10000): + if sample_count > query.shape[0]: + sample_count = query.shape[0] + + idx = random.sample(range(query.shape[0]), sample_count) + sample_query = query[idx, :] + return sample_query, idx + + +def linear_build_and_search( + doc, + query, + topk, + column_number=None, + row_number=None, + dimension=None, + measure_name=None, + threads=4, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + + index = build_index( + tensor=doc, + dimension=dimension, + column_number=column_number, + distance_metric=measure_name, + index_builder="LinearBuilder", + ) + + pk_l, distance_l = search_index( + tensor=query, + threads=threads, + row_number=row_number, + distance_metric=measure_name, + dimension=dimension, + topk=topk, + index=index, + ) + + return pk_l, distance_l + + +def build_and_search( + doc, + query, + topk, + doc_chunk, + query_chunk, + index_path=None, + threads=4, + dimension=None, + measure_name=None, + need_shuffle=False, + storage_options=None, + index_builder=None, + builder_params=None, + index_converter=None, + index_converter_params=None, + index_searcher=None, + searcher_params=None, + index_reformer=None, + index_reformer_params=None, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + if index_builder is None: + index_builder = "SsgBuilder" + if builder_params is None: + builder_params = {} + if index_converter_params is None: + index_converter_params = {} + if index_searcher is None: + index_searcher = "" + if searcher_params is None: + searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + + doc = md.DataFrame(pd.DataFrame(doc), chunk_size=(doc_chunk, dimension)) + query = mt.tensor(query, chunk_size=(query_chunk, dimension)) + + index = build_index( + doc, + dimension, + index_path, + need_shuffle, + measure_name, + index_builder, + builder_params, + index_converter, + index_converter_params, + topk, + storage_options, + ) + + pk2, distance = search_index( + query, + topk, + index, + threads, + dimension, + measure_name, + index_searcher, + searcher_params, + index_reformer, + index_reformer_params, + storage_options, + ) + + return pk2, distance diff --git a/python/xorbits/_mars/learn/proxima/simple_index/recall.py b/python/xorbits/_mars/learn/proxima/simple_index/recall.py new file mode 100644 index 000000000..b43099065 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/recall.py @@ -0,0 +1,153 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np + +from .... import remote as mr +from .knn import linear_build_and_search, sample_data + + +def recall_one(linear_score, ann_score, topk_ids, epsilon=1e-6): + topk_matchs = {} + for ids in topk_ids: + topk_matchs[ids] = 0 + length = len(linear_score) + match = 0 + idx, ann_item = 0, 0 + while idx < length: + cur_topk = idx + 1 + if ann_item < len(ann_score): + if math.fabs(linear_score[idx] - ann_score[ann_item]) < epsilon: + ann_item += 1 + idx += 1 + match += 1 + else: + if linear_score[idx] < ann_score[ann_item]: + idx += 1 # linear + else: + ann_item += 1 # ann + else: + idx += 1 + + if cur_topk in topk_ids: + topk_matchs[cur_topk] = match / cur_topk + + return topk_matchs + + +def recall_one_byid(linear_key, ann_key, ann_score, topk_ids): + idx, length = 0, len(linear_key) + topk_matchs, result_topk_matchs = {}, {} + + for ids in topk_ids: + topk_matchs[ids] = 0 + result_topk_matchs[ids] = 0 + + while idx < length: + for k in topk_ids: + dynamic_size = k + while dynamic_size + 1 < length: + if math.isclose(ann_score[dynamic_size - 1], ann_score[dynamic_size]): + dynamic_size += 1 + else: + break + + items = 0 + while items < len(ann_score) and items < dynamic_size: + if linear_key[idx] == ann_key[items]: + topk_matchs[k] += 1 + break + else: + items += 1 + + idx += 1 + if idx in topk_ids: + result_topk_matchs[idx] = topk_matchs[idx] / idx + + return result_topk_matchs + + +def compute_recall( + pk_l, distance_l, pk_p, distance_p, topk_ids, method="BYID", epsilon=1e-6 +): + pk_l, distance_l, pk_p, distance_p = ( + np.array(pk_l), + np.array(distance_l), + np.array(pk_p), + np.array(distance_p), + ) + topk_matchs = {} + for ids in topk_ids: + topk_matchs[ids] = 0 + for linear_res_k, linear_res_s, knn_res_k, knn_res_s in zip( + pk_l, distance_l, pk_p, distance_p + ): + if method == "BYID": + res_t = recall_one_byid(linear_res_k, knn_res_k, knn_res_s, topk_ids) + else: + res_t = recall_one(linear_res_s, knn_res_s, topk_ids, epsilon) + for k, v in res_t.items(): + topk_matchs[k] += v + + length = len(pk_l) + for k, v in topk_matchs.items(): + topk_matchs[k] = min(v / length, 1) + return topk_matchs + + +def recall( + doc, + query, + topk, + sample_count, + pk_p, + distance_p, + row_number=None, + column_number=None, + topk_ids=None, + method=None, + epsilon=1e-6, + session=None, + run_kwargs=None, +): + if topk_ids is None: + topk_ids = [topk] + if method is None: + method = "BYSCORE" + + query_sample, idx = sample_data(query=query, sample_count=sample_count) + pk_p_sample, distance_p_sample = pk_p[idx, :], distance_p[idx, :] + pk_l, distance_l = linear_build_and_search( + doc=doc, + query=query_sample, + topk=topk, + row_number=row_number, + column_number=column_number, + ) + + r = mr.spawn( + compute_recall, + args=( + pk_l, + distance_l, + pk_p_sample, + distance_p_sample, + topk_ids, + method, + epsilon, + ), + ) + return r.execute(session=session, **(run_kwargs or dict())).fetch() diff --git 
a/python/xorbits/_mars/learn/proxima/simple_index/searcher.py b/python/xorbits/_mars/learn/proxima/simple_index/searcher.py new file mode 100644 index 000000000..76ec1ad3e --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/searcher.py @@ -0,0 +1,553 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +import os +import pickle # nosec # pylint: disable=import_pickle +import random +from collections import defaultdict +from hashlib import md5 + +import numpy as np + +from .... import opcodes +from .... import tensor as mt +from ....config import options +from ....core import ENTITY_TYPE, OutputType, recursive_tile +from ....core.context import get_context +from ....core.operand import OperandStage +from ....lib.filesystem import FileSystem, get_fs +from ....serialization.serializables import ( + AnyField, + BoolField, + BytesField, + DictField, + Int32Field, + Int64Field, + KeyField, + StringField, +) +from ....tensor.core import TensorOrder +from ....utils import Timer, ceildiv, has_unknown_shape +from ...operands import LearnOperand, LearnOperandMixin +from ..core import get_proxima_type, proxima, validate_tensor + +logger = logging.getLogger(__name__) + + +class ProximaSearcher(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.PROXIMA_SIMPLE_SEARCHER + _tensor = KeyField("tensor") + _distance_metric = StringField("distance_metric") + _dimension = Int32Field("dimension") + _row_number = Int64Field("row_number") + _topk = Int32Field("topk") + _threads = Int32Field("threads") + _index = AnyField("index") + _index_searcher = StringField("index_searcher") + _index_searcher_params = DictField("index_searcher_params") + _index_reformer = StringField("index_reformer") + _index_reformer_params = DictField("index_reformer_params") + _download_index = BoolField("download_index") + _storage_options = BytesField( + "storage_options", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + + def __init__( + self, + tensor=None, + distance_metric=None, + dimension=None, + row_number=None, + topk=None, + index=None, + threads=None, + index_searcher=None, + index_searcher_params=None, + index_reformer=None, + index_reformer_params=None, + download_index=None, + storage_options=None, + output_types=None, + stage=None, + **kw, + ): + super().__init__( + _tensor=tensor, + _distance_metric=distance_metric, + _row_number=row_number, + _dimension=dimension, + _index=index, + _threads=threads, + _index_searcher=index_searcher, + _index_searcher_params=index_searcher_params, + _index_reformer=index_reformer, + _index_reformer_params=index_reformer_params, + _download_index=download_index, + _output_types=output_types, + _topk=topk, + _storage_options=storage_options, + **kw, + ) + if self._output_types is None: + self._output_types = [OutputType.tensor, OutputType.tensor] + + @property + def tensor(self): + return self._tensor + + @property + def distance_metric(self): + return self._distance_metric + + @property + def 
dimension(self): + return self._dimension + + @property + def row_number(self): + return self._row_number + + @property + def topk(self): + return self._topk + + @property + def threads(self): + return self._threads + + @property + def index(self): + return self._index + + @property + def index_searcher(self): + return self._index_searcher + + @property + def index_searcher_params(self): + return self._index_searcher_params + + @property + def index_reformer(self): + return self._index_reformer + + @property + def index_reformer_params(self): + return self._index_reformer_params + + @property + def download_index(self): + return self._download_index + + @property + def storage_options(self): + return self._storage_options + + @property + def output_limit(self): + return 1 if self._download_index else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage != OperandStage.agg and not self._download_index: + self._tensor = self._inputs[0] + if isinstance(self._index, ENTITY_TYPE): + self._index = self._inputs[-1] + + def __call__(self, tensor, index): + kws = [ + { + "dtype": np.dtype(np.uint64), + "shape": (tensor.shape[0], self._topk), + "order": TensorOrder.C_ORDER, + }, + { + "dtype": np.dtype(np.float32), + "shape": (tensor.shape[0], self._topk), + "order": TensorOrder.C_ORDER, + }, + ] + inputs = [tensor] + if hasattr(index, "op"): + inputs.append(index) + return mt.ExecutableTuple(self.new_tileables(inputs, kws=kws)) + + @classmethod + def _build_download_chunks(cls, op, indexes): + ctx = get_context() + workers = ctx.get_worker_addresses() or [None] + if len(workers) < len(indexes): + workers = [workers[i % len(workers)] for i in range(len(indexes))] + indexes_iter = iter(itertools.cycle(indexes)) + + download_chunks = defaultdict(list) + for i, worker in enumerate(workers): + download_op = op.copy().reset_key() + download_op.stage = OperandStage.map + download_op.expect_worker = worker + download_op._download_index = True + download_op._tensor = None + download_op._index = next(indexes_iter) + download_chunks[i % len(indexes)].append( + download_op.new_chunk( + None, index=(i,), shape=(), dtype=op.inputs[0].dtype + ) + ) + return download_chunks + + @classmethod + def tile(cls, op: "ProximaSearcher"): + tensor = op.tensor + index = op.index + topk = op.topk + outs = op.outputs + row_number = op.row_number + + ctx = get_context() + + # make sure all inputs have known chunk sizes + if has_unknown_shape(*op.inputs): + yield + + rechunk_size = dict() + if tensor.chunk_shape[1] > 1: + rechunk_size[1] = tensor.shape[1] + if row_number is not None: + rechunk_size[0] = tensor.shape[0] // row_number + if len(rechunk_size) > 0: + tensor = yield from recursive_tile(tensor.rechunk(rechunk_size)) + + logger.warning(f"query chunks count: {len(tensor.chunks)} ") + + if hasattr(index, "op"): + built_indexes = [index.chunks] * len(tensor.chunks) + else: + # index path + fs: FileSystem = get_fs(index, op.storage_options) + index_paths = [ + f for f in fs.ls(index) if f.rsplit("/", 1)[-1].startswith("proxima_") + ] + download_chunks = cls._build_download_chunks(op, index_paths) + iters = [iter(itertools.cycle(i)) for i in download_chunks.values()] + built_indexes = [] + for _ in range(len(tensor.chunks)): + built_indexes.append([next(it) for it in iters]) + + if hasattr(index, "op"): + index_chunks_workers = [ + m["bands"][0][0] + for m in ctx.get_chunks_meta( + [c.key for c in index.chunks], fields=["bands"] + ) + ] + else: + index_chunks_workers = [None] * 
len(built_indexes[0]) + + out_chunks = [], [] + for i, tensor_chunk in enumerate(tensor.chunks): + pk_chunks, distance_chunks = [], [] + for j, chunk_index, worker in zip( + itertools.count(), built_indexes[i], index_chunks_workers + ): + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + if hasattr(index, "op"): + chunk_op.expect_worker = worker + else: + chunk_op.expect_worker = chunk_index.op.expect_worker + chunk_op._index = chunk_index + chunk_op._tensor = None + chunk_kws = [ + { + "index": (tensor_chunk.index[0], j), + "dtype": outs[0].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": TensorOrder.C_ORDER, + }, + { + "index": (tensor_chunk.index[0], j), + "dtype": outs[1].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": TensorOrder.C_ORDER, + }, + ] + chunk_inputs = [tensor_chunk, chunk_index] + pk_chunk, distance_chunk = chunk_op.new_chunks( + chunk_inputs, kws=chunk_kws + ) + pk_chunks.append(pk_chunk) + distance_chunks.append(distance_chunk) + + if len(pk_chunks) == 1: + out_chunks[0].append(pk_chunks[0]) + out_chunks[1].append(distance_chunks[0]) + continue + + # combine topk results + combine_size = options.combine_size + + tensor_out_chunks = [pk_chunks, distance_chunks] + while True: + chunk_size = ceildiv(len(tensor_out_chunks[0]), combine_size) + cur_out_chunks = [[], []] + for k in range(chunk_size): + to_combine_pks = tensor_out_chunks[0][ + k * combine_size : (k + 1) * combine_size + ] + to_combine_distances = tensor_out_chunks[1][ + k * combine_size : (k + 1) * combine_size + ] + + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.agg + chunk_op._tensor = None + chunk_op._index = None + agg_chunk_kws = [ + { + "index": (i, 0), + "dtype": outs[0].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": outs[0].order, + }, + { + "index": (i, 0), + "dtype": outs[1].dtype, + "shape": (tensor_chunk.shape[0], topk), + "order": outs[1].order, + }, + ] + pk_result_chunk, distance_result_chunk = chunk_op.new_chunks( + to_combine_pks + to_combine_distances, kws=agg_chunk_kws + ) + cur_out_chunks[0].append(pk_result_chunk) + cur_out_chunks[1].append(distance_result_chunk) + tensor_out_chunks = cur_out_chunks + if len(tensor_out_chunks[0]) == 1: + break + out_chunks[0].append(tensor_out_chunks[0][0]) + out_chunks[1].append(tensor_out_chunks[1][0]) + + kws = [] + pk_params = outs[0].params + pk_params["chunks"] = out_chunks[0] + pk_params["nsplits"] = (tensor.nsplits[0], (topk,)) + kws.append(pk_params) + distance_params = outs[1].params + distance_params["chunks"] = out_chunks[1] + distance_params["nsplits"] = (tensor.nsplits[0], (topk,)) + kws.append(distance_params) + new_op = op.copy() + return new_op.new_tileables(op.inputs, kws=kws) + + @classmethod + def _execute_download(cls, ctx, op: "ProximaSearcher"): + index_path = op.index + with Timer() as timer: + fs = get_fs(index_path, op.storage_options) + + # TODO + dirs = os.environ.get("MARS_SPILL_DIRS") + if dirs: + temp_dir = random.choice(dirs.split(":")) + else: + temp_dir = "/tmp/proxima-index/" + + local_path = os.path.join( + temp_dir, md5(str(index_path).encode("utf-8")).hexdigest() + ) # noqa: B303 # nosec + exist_state = True + if not os.path.exists(local_path): + exist_state = False + if not os.path.exists(local_path.rsplit("/", 1)[0]): + os.mkdir(local_path.rsplit("/", 1)[0]) + with open(local_path, "wb") as out_f: + with fs.open(index_path, "rb") as in_f: + # 32M + chunk_bytes = 32 * 1024**2 + while True: + data = in_f.read(chunk_bytes) + if data: + 
out_f.write(data) + else: + break + + logger.warning( + f"ReadingFromVolume({op.key}), index path: {index_path}, " + f"local_path {local_path}" + f"size {os.path.getsize(local_path)}, " + f"already exist {exist_state}, " + f"costs {timer.duration} seconds " + f"speed {round(os.path.getsize(local_path) / (1024 ** 2) / timer.duration, 2)} MB/s" + ) + ctx[op.outputs[0].key] = local_path + + @classmethod + def _execute_map(cls, ctx, op: "ProximaSearcher"): + if op.download_index: + cls._execute_download(ctx, op) + return + + inp = ctx[op.tensor.key] + index_path = ctx[op.inputs[-1].key] + + with Timer() as timer: + flow = proxima.IndexFlow( + container_name="MMapFileContainer", + container_params={}, + searcher_name=op.index_searcher, + searcher_params=op.index_searcher_params, + measure_name="", + measure_params={}, + reformer_name=op.index_reformer, + reformer_params=op.index_reformer_params, + ) + + flow.load(index_path) + vecs = np.ascontiguousarray(inp) + + logger.warning( + f"LoadIndex({op.key}) index path: {index_path} costs {timer.duration} seconds" + ) + logger.warning(f"threads count:{op.threads} vecs count:{len(vecs)}") + + with Timer() as timer: + batch = 10000 + s_idx = 0 + e_idx = min(s_idx + batch, len(vecs)) + result_pks, result_distances = None, None + while s_idx < len(vecs): + with Timer() as timer_s: + tp = get_proxima_type(vecs.dtype) + result_pks_b, result_distances_b = proxima.IndexUtility.ann_search( + searcher=flow, + type=tp, + query=vecs[s_idx:e_idx], + topk=op.topk, + threads=op.threads, + ) + if result_pks is None: + result_pks = np.asarray(result_pks_b) + result_distances = np.asarray(result_distances_b) + else: + result_pks = np.concatenate( + (result_pks, np.asarray(result_pks_b)) + ) + result_distances = np.concatenate( + (result_distances, np.asarray(result_distances_b)) + ) + + s_idx = e_idx + e_idx = min(s_idx + batch, len(vecs)) + logger.warning( + f"Search({op.key}) count {s_idx}/{len(vecs)}:{round(s_idx * 100 / len(vecs), 2)}%" + f" costs {round(timer_s.duration, 2)} seconds" + ) + logger.warning(f"Search({op.key}) costs {timer.duration} seconds") + + ctx[op.outputs[0].key] = np.asarray(result_pks) + ctx[op.outputs[1].key] = np.asarray(result_distances) + + @classmethod + def _execute_agg(cls, ctx, op: "ProximaSearcher"): + inputs_data = [ctx[inp.key] for inp in op.inputs] + + chunk_num = len(inputs_data) // 2 + pks = np.concatenate(inputs_data[:chunk_num], axis=1) + distances = np.concatenate(inputs_data[chunk_num:], axis=1) + + n_doc = len(pks) + topk = op.topk + + # calculate topk on rows + if op.distance_metric == "InnerProduct": + inds = np.argsort(distances, axis=1)[:, -1 : -topk - 1 : -1] + else: + inds = np.argsort(distances, axis=1)[:, :topk] + + result_pks = np.empty((n_doc, topk), dtype=pks.dtype) + result_distances = np.empty((n_doc, topk), dtype=distances.dtype) + rng = np.arange(n_doc) + for i in range(topk): + ind = inds[:, i] + result_pks[:, i] = pks[rng, ind] + result_distances[:, i] = distances[rng, ind] + del rng + + ctx[op.outputs[0].key] = result_pks + ctx[op.outputs[1].key] = result_distances + + @classmethod + def execute(cls, ctx, op: "ProximaSearcher"): + if op.stage != OperandStage.agg: + return cls._execute_map(ctx, op) + else: + return cls._execute_agg(ctx, op) + + +def search_index( + tensor, + topk, + index, + threads=4, + row_number=None, + dimension=None, + distance_metric=None, + index_searcher=None, + index_searcher_params=None, + index_reformer=None, + index_reformer_params=None, + storage_options=None, + run=True, + 
session=None, + run_kwargs=None, +): + tensor = validate_tensor(tensor) + + if dimension is None: + dimension = tensor.shape[1] + if index_searcher is None: + index_searcher = "" + if index_searcher_params is None: + index_searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + if distance_metric is None: + distance_metric = "" + if hasattr(index, "op") and index.op.index_path is not None: + storage_options = storage_options or index.op.storage_options + index = index.op.index_path + + op = ProximaSearcher( + tensor=tensor, + distance_metric=distance_metric, + dimension=dimension, + row_number=row_number, + topk=topk, + index=index, + threads=threads, + index_searcher=index_searcher, + index_searcher_params=index_searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + storage_options=storage_options, + ) + result = op(tensor, index) + if run: + return result.execute(session=session, **(run_kwargs or dict())) + else: + return result diff --git a/python/xorbits/_mars/learn/proxima/simple_index/tests/__init__.py b/python/xorbits/_mars/learn/proxima/simple_index/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/proxima/simple_index/tests/test_simple_index.py b/python/xorbits/_mars/learn/proxima/simple_index/tests/test_simple_index.py new file mode 100644 index 000000000..1746c9ed2 --- /dev/null +++ b/python/xorbits/_mars/learn/proxima/simple_index/tests/test_simple_index.py @@ -0,0 +1,799 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import tensor as mt +from ...core import proxima +from .. 
import build_index, recall, search_index + + +def proxima_build_and_query( + doc, + query, + topk, + measure_name=None, + dimension=None, + index_builder=None, + builder_params=None, + index_converter=None, + index_converter_params=None, + index_searcher=None, + searcher_params=None, + index_reformer=None, + index_reformer_params=None, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + if index_builder is None: + index_builder = "SsgBuilder" + if builder_params is None: + builder_params = {} + if index_converter_params is None: + index_converter_params = {} + if index_searcher is None: + index_searcher = "" + if searcher_params is None: + searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + + map_dtype = { + np.dtype(np.float32): proxima.IndexMeta.FT_FP32, + np.dtype(np.int16): proxima.IndexMeta.FT_INT16, + } + # holder + holder = proxima.IndexHolder(type=map_dtype[doc.dtypes[0]], dimension=dimension) + holder.mount(np.array(doc)) # add batch data, pk starts from 0 + + # converter + meta = proxima.IndexMeta( + map_dtype[doc.dtypes[0]], dimension=dimension, measure_name=measure_name + ) + if index_converter is not None: + if index_converter == "MipsConverter": + measure_name = "" + converter = proxima.IndexConverter( + name=index_converter, meta=meta, params=index_converter_params + ) + converter.train_and_transform(holder) + holder = converter.result() + meta = converter.meta() + + # builder && dumper + builder = proxima.IndexBuilder(name=index_builder, meta=meta, params=builder_params) + builder = builder.train_and_build(holder) + dumper = proxima.IndexDumper(name="MemoryDumper", path="test.index") + builder.dump(dumper) + dumper.close() + + # indexflow for search + flow = proxima.IndexFlow( + container_name="MemoryContainer", + container_params={}, + searcher_name=index_searcher, + searcher_params=searcher_params, + measure_name=measure_name, + measure_params={}, + reformer_name=index_reformer, + reformer_params=index_reformer_params, + ) + flow.load("test.index") + keys, scores = proxima.IndexUtility.ann_search( + searcher=flow, query=query, topk=topk, threads=1 + ) + return np.asarray(keys), np.asarray(scores) + + +def gen_data(doc_count, query_count, dimension, dtype=np.float32): + if dtype == np.float32: + rs = np.random.RandomState(0) + doc = pd.DataFrame(rs.rand(doc_count, dimension).astype(dtype)) + query = rs.rand(query_count, dimension).astype(dtype) + elif dtype == np.int32: + rs = np.random.RandomState(0) + doc = pd.DataFrame((rs.rand(doc_count, dimension) * 1000).astype(dtype)) + query = (rs.rand(query_count, dimension) * 1000).astype(dtype) + else: + raise ValueError(f"Unsupported dtype {dtype}") + return doc, query + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def build_and_query( + doc, + query, + topk, + column_number, + row_number, + threads=1, + dimension=None, + measure_name=None, + index_builder=None, + builder_params=None, + index_converter=None, + index_converter_params=None, + index_searcher=None, + searcher_params=None, + index_reformer=None, + index_reformer_params=None, +): + if measure_name is None: + measure_name = "SquaredEuclidean" + if dimension is None: + dimension = doc.shape[1] + if index_builder is None: + index_builder = "SsgBuilder" + if builder_params is None: + builder_params = {} + if index_converter_params is None: + index_converter_params = {} + if index_searcher is 
None: + index_searcher = "" + if searcher_params is None: + searcher_params = {} + if index_reformer is None: + index_reformer = "" + if index_reformer_params is None: + index_reformer_params = {} + + doc = md.DataFrame(pd.DataFrame(doc)) + query = mt.tensor(query) + + index = build_index( + tensor=doc, + need_shuffle=False, + column_number=column_number, + distance_metric=measure_name, + dimension=dimension, + index_builder=index_builder, + index_builder_params=builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + ) + paths = index.fetch() + if not isinstance(paths, list): + paths = [paths] + + try: + for path in paths: + with open(path, "rb") as f: + assert len(f.read()) > 0 + + pk2, distance = search_index( + tensor=query, + threads=threads, + row_number=row_number, + distance_metric=measure_name, + dimension=dimension, + topk=topk, + index=index, + index_searcher=index_searcher, + index_searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + assert pk2.shape == (len(query), topk) + assert distance.shape == (len(query), topk) + return pk2, distance + finally: + for path in paths: + os.remove(path) + + +def consistency_checking( + doc, + query, + dimension, + topk, + measure_name, + column_number, + row_number, + threads, + index_builder, + builder_params, + index_converter, + index_converter_params, + index_searcher, + searcher_params, + index_reformer, + index_reformer_params, + decimal=6, +): + # proxima_data + pk_p, distance_p = proxima_build_and_query( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + measure_name=measure_name, + index_builder=index_builder, + builder_params=builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + index_searcher=index_searcher, + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + + # mars_data + pk_m, distance_m = build_and_query( + doc, + query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params, + index_converter=index_converter, + index_converter_params=index_converter_params, + index_searcher=index_searcher, + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + + # testing + np.testing.assert_array_equal(pk_p, pk_m) + np.testing.assert_array_almost_equal(distance_p, distance_m, decimal=decimal) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_build_and_search_index(setup): + # for now, test SquaredEuclidean and Euclidean only, + # TODO: add more tests for "Canberra", "Chebyshev" + # "Manhattan" when ready + + # L2 space + # params + doc_count, query_count, dimension, topk = 200, 15, 5, 3 + threads, column_number, row_number = 4, 2, 2 + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + {"proxima.qc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + 
searcher_params = {} + index_converter, index_converter_params = None, {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # test + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter, + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + ) + + # L2 space with HalfFloatConverter + # params + doc_count, query_count, dimension, topk = 200, 15, 5, 3 + threads, column_number, row_number = 4, 2, 2 + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + {"proxima.qc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + index_converter_lists = [ + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + "HalfFloatConverter", + ] + searcher_params = {} + index_converter, index_converter_params = None, {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # test + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=7, + ) + + # L2 space with Int8QuantizerConverter + # params + doc_count, query_count, dimension, topk = 2000, 1, 32, 5 + threads, column_number, row_number = 4, 2, 1 + + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + { + "proxima.qc.builder.centroid_count": "16", + "proxima.qc.builder.quantizer_class": "Int8QuantizerConverter", + }, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + searcher_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.searcher.scan_ratio": 1}, + {"proxima.gc.searcher.scan_ratio": 1}, + {"proxima.qc.searcher.scan_ratio": 1}, + ] + index_converter_lists = 
[ + "Int8QuantizerConverter", + "Int8QuantizerConverter", + "Int8QuantizerConverter", + "Int8QuantizerConverter", + "Int8QuantizerConverter", + None, + ] + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # test + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params_lists[i], + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=2, + ) + + # L2 space with Int4QuantizerConverter + # params + doc_count, query_count, dimension, topk = 2000, 1, 32, 5 + threads, column_number, row_number = 4, 2, 1 + + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + { + "proxima.qc.builder.centroid_count": "16", + "proxima.qc.builder.quantizer_class": "Int4QuantizerConverter", + }, + ] + index_searcher_lists = [ + "SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + searcher_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.searcher.scan_ratio": 1}, + {"proxima.gc.searcher.scan_ratio": 1}, + {"proxima.qc.searcher.scan_ratio": 1}, + ] + index_converter_lists = [ + "Int4QuantizerConverter", + "Int4QuantizerConverter", + "Int4QuantizerConverter", + "Int4QuantizerConverter", + "Int4QuantizerConverter", + None, + ] + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, + query_count=query_count, + dimension=dimension, + dtype=np.float32, + ) + + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc=doc, + query=query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params_lists[i], + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=2, + ) + + # L2 space with NormalizeConverter + # params + doc_count, query_count, dimension, topk = 2000, 1, 32, 5 + threads, column_number, row_number = 4, 2, 1 + + measure_name_lists = ["SquaredEuclidean", "Euclidean"] + index_builder_lists = [ + "SsgBuilder", + "HnswBuilder", + "LinearBuilder", + "ClusteringBuilder", + "GcBuilder", + "QcBuilder", + ] + builder_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + {"proxima.qc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + 
"SsgSearcher", + "HnswSearcher", + "LinearSearcher", + "ClusteringSearcher", + "GcSearcher", + "QcSearcher", + ] + searcher_params_lists = [ + {}, + {}, + {}, + {"proxima.hc.searcher.scan_ratio": 1}, + {"proxima.gc.searcher.scan_ratio": 1}, + {"proxima.qc.searcher.scan_ratio": 1}, + ] + index_converter_lists = [ + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + "NormalizeConverter", + ] + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + # data + doc, query = gen_data( + doc_count=doc_count, + query_count=query_count, + dimension=dimension, + dtype=np.float32, + ) + + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc, + query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params_lists[i], + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=2, + ) + + # InnerProduct space + # params + doc_count, query_count, dimension, topk = 200, 15, 5, 2 + threads, column_number, row_number = 4, 2, 2 + + measure_name_lists = ["InnerProduct"] + index_builder_lists = [ + "LinearBuilder", + "QcBuilder", + "HnswBuilder", + "SsgBuilder", + "ClusteringBuilder", + "GcBuilder", + ] + builder_params_lists = [ + {}, + {"proxima.qc.builder.centroid_count": "16"}, + {}, + {}, + {"proxima.hc.builder.max_document_count": doc_count}, + {"proxima.gc.builder.centroid_count": "16"}, + ] + index_searcher_lists = [ + "LinearSearcher", + "QcSearcher", + "HnswSearcher", + "SsgSearcher", + "ClusteringSearcher", + "GcSearcher", + ] + index_converter_lists = [ + None, + None, + "MipsConverter", + "MipsConverter", + "MipsConverter", + "MipsConverter", + ] + + searcher_params = {} + index_converter_params = {} + index_reformer, index_reformer_params = "", {} + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + for i, index_builder in enumerate(index_builder_lists): + for measure_name in measure_name_lists: + consistency_checking( + doc, + query, + dimension=dimension, + topk=topk, + threads=threads, + measure_name=measure_name, + column_number=column_number, + row_number=row_number, + index_builder=index_builder, + builder_params=builder_params_lists[i], + index_converter=index_converter_lists[i], + index_converter_params=index_converter_params, + index_searcher=index_searcher_lists[i], + searcher_params=searcher_params, + index_reformer=index_reformer, + index_reformer_params=index_reformer_params, + decimal=5, + ) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_build_and_search_index_with_filesystem(setup): + with tempfile.TemporaryDirectory() as f: + # params + doc_count, query_count, dimension = 2000, 50, 10 + topk = 10 + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + df = md.DataFrame(pd.DataFrame(doc)) + q = mt.tensor(query) + + index = build_index(tensor=df, index_path=f, column_number=2) + + assert len(os.listdir(f)) > 0 + + # proxima_data + pk_p, distance_p = proxima_build_and_query(doc, query, topk) + pk_m, distance_m = 
search_index(tensor=q, topk=topk, index=index, row_number=5) + + # testing + np.testing.assert_array_equal(pk_p, pk_m) + np.testing.assert_array_equal(distance_p, distance_m) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_build_and_search_index_with_filesystem_download(setup): + with tempfile.TemporaryDirectory() as f: + # params + doc_count, query_count, dimension = 2000, 15, 10 + topk = 10 + doc_chunk, query_chunk = 1000, 5 + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + df = md.DataFrame(pd.DataFrame(doc), chunk_size=(doc_chunk, dimension)) + q = mt.tensor(query, chunk_size=(query_chunk, dimension)) + + index = build_index(tensor=df, index_path=f, column_number=2) + + assert len(os.listdir(f)) > 0 + + search_index(q[0:5], topk, index) + search_index(q[5:10], topk, index) + search_index(q[10:15], topk, index) + + +@pytest.mark.skipif(proxima is None, reason="proxima not installed") +def test_recall(setup): + # params + doc_count, query_count, dimension = 2000, 150, 20 + topk = 100 + sample_count = 100 + + # data + doc, query = gen_data( + doc_count=doc_count, query_count=query_count, dimension=dimension + ) + + # proxima_data + pk_p, distance_p = build_and_query( + doc, + query, + dimension=dimension, + topk=topk, + threads=1, + column_number=2, + row_number=3, + ) + assert isinstance( + recall( + doc=doc, + query=query, + topk=topk, + sample_count=sample_count, + pk_p=pk_p, + distance_p=distance_p, + column_number=2, + row_number=2, + ), + dict, + ) diff --git a/python/xorbits/_mars/learn/semi_supervised/__init__.py b/python/xorbits/_mars/learn/semi_supervised/__init__.py new file mode 100644 index 000000000..919f15491 --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._label_propagation import LabelPropagation diff --git a/python/xorbits/_mars/learn/semi_supervised/_label_propagation.py b/python/xorbits/_mars/learn/semi_supervised/_label_propagation.py new file mode 100644 index 000000000..a7c9904d9 --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/_label_propagation.py @@ -0,0 +1,368 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from abc import ABCMeta, abstractmethod + +from sklearn.base import BaseEstimator +from sklearn.exceptions import ConvergenceWarning + +from ... 
import tensor as mt +from ...core import ExecutableTuple +from ..base import ClassifierMixin +from ..metrics.pairwise import rbf_kernel +from ..neighbors.unsupervised import NearestNeighbors +from ..utils import check_array +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted, check_X_y + + +class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for label propagation module. + + Parameters + ---------- + kernel : {'knn', 'rbf', callable} + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape [n_samples, n_features], + and return a [n_samples, n_samples] shaped weight matrix + + gamma : float + Parameter for rbf kernel + + n_neighbors : integer > 0 + Parameter for knn kernel + + alpha : float + Clamping factor + + max_iter : integer + Change maximum number of iterations allowed + + tol : float + Convergence tolerance: threshold to consider the system at steady + state + """ + + def __init__( + self, kernel="rbf", gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=1e-3 + ): + self.max_iter = max_iter + self.tol = tol + + # kernel parameters + self.kernel = kernel + self.gamma = gamma + self.n_neighbors = n_neighbors + + # clamping factor + self.alpha = alpha + + self.nn_fit = None + + def _get_kernel(self, X, y=None): + if self.kernel == "rbf": + if y is None: + return rbf_kernel(X, X, gamma=self.gamma) + else: + return rbf_kernel(X, y, gamma=self.gamma) + elif self.kernel == "knn": + if self.nn_fit is None: + self.nn_fit = NearestNeighbors(self.n_neighbors).fit(X) + if y is None: + return self.nn_fit.kneighbors_graph( + self.nn_fit._fit_X, self.n_neighbors, mode="connectivity" + ) + else: + return self.nn_fit.kneighbors(y, return_distance=False) + elif callable(self.kernel): + if y is None: + return self.kernel(X, X) + else: + return self.kernel(X, y) + else: # pragma: no cover + raise ValueError( + f"{self.kernel} is not a valid kernel. Only rbf and knn" + " or an explicit function " + " are supported at this time." + ) + + @abstractmethod + def _build_graph(self): # pragma: no cover + raise NotImplementedError( + "Graph construction must be implemented" + " to fit a label propagation model." + ) + + def predict(self, X, session=None, run_kwargs=None): + """Performs inductive inference across the model. + + Parameters + ---------- + X : array_like, shape = [n_samples, n_features] + + Returns + ------- + y : array_like, shape = [n_samples] + Predictions for input data + """ + probas = self.predict_proba(X, session=session, run_kwargs=run_kwargs) + result = mt.tensor(self.classes_)[mt.argmax(probas, axis=1)].ravel() + result.execute(session=session, **(run_kwargs or dict())) + return result + + def predict_proba(self, X, session=None, run_kwargs=None): + """Predict probability for each possible outcome. + + Compute the probability estimates for each single sample in X + and each possible outcome seen during training (categorical + distribution). 
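+
+        With the ``knn`` kernel, the estimate for each sample is the sum of
+        the label distributions of its nearest training points; with ``rbf``
+        or a callable kernel, it is the kernel-weighted combination of the
+        training label distributions. Rows are normalized to sum to one.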
+ + Parameters + ---------- + X : array_like, shape = [n_samples, n_features] + + Returns + ------- + probabilities : Tensor, shape = [n_samples, n_classes] + Normalized probability distributions across + class labels + """ + + check_is_fitted(self, "X_") + + X_2d = check_array(X, accept_sparse=True) + weight_matrices = self._get_kernel(self.X_, X_2d) + if self.kernel == "knn": + probabilities = mt.array( + [ + mt.sum(self.label_distributions_[weight_matrix], axis=0) + for weight_matrix in weight_matrices + ] + ) + else: + weight_matrices = weight_matrices.T + probabilities = mt.dot(weight_matrices, self.label_distributions_) + normalizer = mt.atleast_2d(mt.sum(probabilities, axis=1)).T + probabilities /= normalizer + probabilities.execute(session=session, **(run_kwargs or dict())) + return probabilities + + def fit(self, X, y, session=None, run_kwargs=None): + """Fit a semi-supervised label propagation model based + + All the input data is provided matrix X (labeled and unlabeled) + and corresponding label matrix y with a dedicated marker value for + unlabeled samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + A {n_samples by n_samples} size matrix will be created from this + + y : array_like, shape = [n_samples] + n_labeled_samples (unlabeled points are marked as -1) + All unlabeled samples will be transductively assigned labels + + Returns + ------- + self : returns an instance of self. + """ + X, y = check_X_y(X, y) + self.X_ = X + to_run = [check_classification_targets(y)] + + # actual graph construction (implementations should override this) + graph_matrix = self._build_graph() + + # label construction + # construct a categorical distribution for classification only + classes = mt.unique(y, aggregate_size=1).to_numpy( + session=session, **(run_kwargs or dict()) + ) + classes = classes[classes != -1] + self.classes_ = classes + + n_samples, n_classes = len(y), len(classes) + + alpha = self.alpha + # add check when we support LabelSpreading + # if self._variant == 'spreading' and \ + # (alpha is None or alpha <= 0.0 or alpha >= 1.0): + # raise ValueError('alpha=%s is invalid: it must be inside ' + # 'the open interval (0, 1)' % alpha) + y = mt.asarray(y) + unlabeled = y == -1 + + # initialize distributions + self.label_distributions_ = mt.zeros((n_samples, n_classes)) + for label in classes: + self.label_distributions_[y == label, classes == label] = 1 + + y_static = mt.copy(self.label_distributions_) + if self._variant == "propagation": + # LabelPropagation + y_static[unlabeled] = 0 + else: # pragma: no cover + # LabelSpreading + y_static *= 1 - alpha + + l_previous = mt.zeros((self.X_.shape[0], n_classes)) + + unlabeled = unlabeled[:, mt.newaxis] + + for self.n_iter_ in range(self.max_iter): + cond = mt.abs(self.label_distributions_ - l_previous).sum() < self.tol + + to_run.append(cond) + ExecutableTuple(to_run).execute(session=session, **(run_kwargs or dict())) + # clear + to_run = [] + + if cond.fetch(session=session): + break + + l_previous = self.label_distributions_ + self.label_distributions_ = graph_matrix.dot(self.label_distributions_) + + if self._variant == "propagation": + normalizer = mt.sum(self.label_distributions_, axis=1)[:, mt.newaxis] + self.label_distributions_ /= normalizer + self.label_distributions_ = mt.where( + unlabeled, self.label_distributions_, y_static + ) + else: # pragma: no cover + # clamp + self.label_distributions_ = ( + mt.multiply(alpha, self.label_distributions_) + y_static + ) + + 
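+            # Defer execution: the freshly updated distributions are queued so
+            # that they are evaluated together with the next iteration's
+            # convergence check in a single batch.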
to_run.append(self.label_distributions_) + else: + warnings.warn( + f"max_iter={self.max_iter} was reached without convergence.", + category=ConvergenceWarning, + ) + self.n_iter_ += 1 + + normalizer = mt.sum(self.label_distributions_, axis=1)[:, mt.newaxis] + self.label_distributions_ /= normalizer + + # set the transduction item + transduction = mt.tensor(self.classes_)[ + mt.argmax(self.label_distributions_, axis=1) + ] + self.transduction_ = transduction.ravel() + ExecutableTuple([self.label_distributions_, self.transduction_]).execute( + session=session, **(run_kwargs or dict()) + ) + return self + + +class LabelPropagation(BaseLabelPropagation): + """Label Propagation classifier + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf', callable} + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape [n_samples, n_features], + and return a [n_samples, n_samples] shaped weight matrix. + + gamma : float + Parameter for rbf kernel + + n_neighbors : integer > 0 + Parameter for knn kernel + + max_iter : integer + Change maximum number of iterations allowed + + tol : float + Convergence tolerance: threshold to consider the system at steady + state + + Attributes + ---------- + X_ : array, shape = [n_samples, n_features] + Input array. + + classes_ : array, shape = [n_classes] + The distinct labels used in classifying instances. + + label_distributions_ : array, shape = [n_samples, n_classes] + Categorical distribution for each item. + + transduction_ : array, shape = [n_samples] + Label assigned to each item via the transduction. + + n_iter_ : int + Number of iterations run. + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from mars.learn.semi_supervised import LabelPropagation + >>> label_prop_model = LabelPropagation() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelPropagation(...) + + References + ---------- + Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled data + with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon + University, 2002 http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf + + See Also + -------- + LabelSpreading : Alternate label propagation strategy more robust to noise + """ + + _variant = "propagation" + + def __init__(self, kernel="rbf", gamma=20, n_neighbors=7, max_iter=1000, tol=1e-3): + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + max_iter=max_iter, + tol=tol, + alpha=None, + ) + + def _build_graph(self): + """Matrix representing a fully connected graph between each sample + + This basic implementation creates a non-stochastic affinity matrix, so + class distributions will exceed 1 (normalization may be desired). 
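+ + Returns + ------- + affinity_matrix : Tensor of shape (n_samples, n_samples) + The normalized affinity matrix used to propagate labels.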
+ """ + if self.kernel == "knn": + self.nn_fit = None + affinity_matrix = self._get_kernel(self.X_) + normalizer = affinity_matrix.sum(axis=0) + affinity_matrix /= normalizer[:, mt.newaxis] + return affinity_matrix + + def fit(self, X, y, session=None, run_kwargs=None): + return super().fit(X, y, session=session, run_kwargs=run_kwargs) diff --git a/python/xorbits/_mars/learn/semi_supervised/tests/__init__.py b/python/xorbits/_mars/learn/semi_supervised/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/semi_supervised/tests/test_label_propagation.py b/python/xorbits/_mars/learn/semi_supervised/tests/test_label_propagation.py new file mode 100644 index 000000000..494b2447d --- /dev/null +++ b/python/xorbits/_mars/learn/semi_supervised/tests/test_label_propagation.py @@ -0,0 +1,144 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split +from sklearn.utils._testing import assert_no_warnings + +from .... import tensor as mt +from ...metrics.pairwise import rbf_kernel +from ...neighbors import NearestNeighbors +from .. 
import LabelPropagation + +estimators = [ + (LabelPropagation, {"kernel": "rbf"}), + (LabelPropagation, {"kernel": "knn", "n_neighbors": 2}), + (LabelPropagation, {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}), +] + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_fit_transduction(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + assert clf.transduction_[2].fetch() == 1 + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_distribution(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + if parameters["kernel"] == "knn": + return # unstable test; changes in k-NN ordering break it + else: + np.testing.assert_array_almost_equal( + np.asarray(clf.label_distributions_[2]), np.array([0.5, 0.5]), 2 + ) + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_predict(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + np.testing.assert_array_equal(clf.predict([[0.5, 2.5]]).fetch(), np.array([1])) + + +@pytest.mark.parametrize("estimator, parameters", estimators) +def test_predict_proba(setup, estimator, parameters): + samples = [[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]] + labels = [0, 1, -1] + clf = estimator(**parameters).fit(samples, labels) + np.testing.assert_almost_equal( + clf.predict_proba([[1.0, 1.0]]).fetch(), np.array([[0.5, 0.5]]) + ) + + +def test_label_propagation_closed_form(setup): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + y[::3] = -1 + Y = np.zeros((len(y), n_classes + 1)) + Y[np.arange(len(y)), y] = 1 + unlabelled_idx = Y[:, (-1,)].nonzero()[0] + labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0] + + clf = LabelPropagation(max_iter=10000, gamma=0.1) + clf.fit(X, y) + # adopting notation from Zhu et al 2002 + T_bar = clf._build_graph().to_numpy() + Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx, indexing="ij"))] + Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx, indexing="ij"))] + Y = Y[:, :-1] + Y_l = Y[labelled_idx, :] + Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l) + + expected = Y.copy() + expected[unlabelled_idx, :] = Y_u + expected /= expected.sum(axis=1)[:, np.newaxis] + + np.testing.assert_array_almost_equal(expected, clf.label_distributions_.fetch(), 4) + + +def test_convergence_warning(setup): + # This is a non-regression test for #5774 + X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]]) + y = np.array([0, 1, -1]) + + mdl = LabelPropagation(kernel="rbf", max_iter=1) + with pytest.warns(ConvergenceWarning): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = LabelPropagation(kernel="rbf", max_iter=500) + assert_no_warnings(mdl.fit, X, y) + + +def test_predict_sparse_callable_kernel(setup): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric="euclidean", n_jobs=-1) + nn.fit(X) + W = -1 * mt.power(nn.kneighbors_graph(Y, mode="distance"), 2) * gamma + W = mt.exp(W) + assert W.issparse() + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, y = make_classification( + n_classes=n_classes, + n_samples=n_samples, + n_features=20, + 
n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0, + ) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=0 + ) + + model = LabelPropagation(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test).fetch() >= 0.9 diff --git a/python/xorbits/_mars/learn/tests/__init__.py b/python/xorbits/_mars/learn/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/tests/test_wrappers.py b/python/xorbits/_mars/learn/tests/test_wrappers.py new file mode 100644 index 000000000..c0707ae58 --- /dev/null +++ b/python/xorbits/_mars/learn/tests/test_wrappers.py @@ -0,0 +1,107 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from sklearn.datasets import make_classification +from sklearn.decomposition import PCA +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.linear_model import LinearRegression, LogisticRegression + +from ... import tensor as mt +from ..wrappers import ParallelPostFit + + +def test_parallel_post_fit_basic(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + clf = ParallelPostFit(GradientBoostingClassifier()) + clf.fit(X, y) + + assert isinstance(clf.predict(X), mt.Tensor) + assert isinstance(clf.predict_proba(X), mt.Tensor) + + result = clf.score(X, y) + expected = clf.estimator.score(X, y) + assert result.fetch() == expected + + clf = ParallelPostFit(LinearRegression()) + clf.fit(X, y) + with pytest.raises( + AttributeError, match="The wrapped estimator (.|\n)* 'predict_proba' method." 
+ ): + clf.predict_proba(X) + + +def test_parallel_post_fit_predict(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs") + wrap = ParallelPostFit(LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")) + + base.fit(X, y) + wrap.fit(X, y) + + result = wrap.predict(X) + expected = base.predict(X) + np.testing.assert_allclose(result, expected) + + result = wrap.predict_proba(X) + expected = base.predict_proba(X) + np.testing.assert_allclose(result, expected) + + result = wrap.predict_log_proba(X) + expected = base.predict_log_proba(X) + np.testing.assert_allclose(result, expected) + + +def test_parallel_post_fit_transform(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + base = PCA(random_state=0) + wrap = ParallelPostFit(PCA(random_state=0)) + + base.fit(raw_x, raw_y) + wrap.fit(X, y) + + result = base.transform(X) + expected = wrap.transform(X) + np.testing.assert_allclose(result, expected, atol=0.1) + + +def test_parallel_post_fit_multiclass(setup): + raw_x, raw_y = make_classification(n_samples=1000) + X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100) + raw_x, raw_y = make_classification(n_classes=3, n_informative=4) + X, y = mt.tensor(raw_x, chunk_size=50), mt.tensor(raw_y, chunk_size=50) + + clf = ParallelPostFit( + LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs", multi_class="auto") + ) + + clf.fit(X, y) + result = clf.predict(X) + expected = clf.estimator.predict(X) + + np.testing.assert_allclose(result, expected) + + result = clf.predict_proba(X) + expected = clf.estimator.predict_proba(X) + + np.testing.assert_allclose(result, expected) + + result = clf.predict_log_proba(X) + expected = clf.estimator.predict_log_proba(X) + + np.testing.assert_allclose(result, expected) diff --git a/python/xorbits/_mars/learn/utils/__init__.py b/python/xorbits/_mars/learn/utils/__init__.py new file mode 100644 index 000000000..107fc1d3a --- /dev/null +++ b/python/xorbits/_mars/learn/utils/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
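+ +# Re-export the commonly used helpers of the learn.utils package; gen_batches +# comes directly from scikit-learn, the remaining utilities are local modules.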
+ +# noinspection PyUnresolvedReferences +from sklearn.utils import gen_batches + +from .collect_ports import collect_ports +from .core import ( + concat_chunks, + convert_to_tensor_or_dataframe, + copy_learned_attributes, + get_chunk_n_rows, +) +from .shuffle import shuffle +from .validation import ( + assert_all_finite, + check_array, + check_consistent_length, + check_X_y, + column_or_1d, +) diff --git a/python/xorbits/_mars/learn/utils/_cython_blas.pxd b/python/xorbits/_mars/learn/utils/_cython_blas.pxd new file mode 100644 index 000000000..3667d2889 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/_cython_blas.pxd @@ -0,0 +1,41 @@ +from cython cimport floating + + +cpdef enum BLAS_Order: + RowMajor # C contiguous + ColMajor # Fortran contiguous + + +cpdef enum BLAS_Trans: + NoTrans = 110 # correspond to 'n' + Trans = 116 # correspond to 't' + + +# BLAS Level 1 ################################################################ +cdef floating _dot(int, floating*, int, floating*, int) nogil + +cdef floating _asum(int, floating*, int) nogil + +cdef void _axpy(int, floating, floating*, int, floating*, int) nogil + +cdef floating _nrm2(int, floating*, int) nogil + +cdef void _copy(int, floating*, int, floating*, int) nogil + +cdef void _scal(int, floating, floating*, int) nogil + +cdef void _rotg(floating*, floating*, floating*, floating*) nogil + +cdef void _rot(int, floating*, int, floating*, int, floating, floating) nogil + +# BLAS Level 2 ################################################################ +cdef void _gemv(BLAS_Order, BLAS_Trans, int, int, floating, floating*, int, + floating*, int, floating, floating*, int) nogil + +cdef void _ger(BLAS_Order, int, int, floating, floating*, int, floating*, int, + floating*, int) nogil + +# BLASLevel 3 ################################################################ +cdef void _gemm(BLAS_Order, BLAS_Trans, BLAS_Trans, int, int, int, floating, + floating*, int, floating*, int, floating, floating*, + int) nogil diff --git a/python/xorbits/_mars/learn/utils/_cython_blas.pyx b/python/xorbits/_mars/learn/utils/_cython_blas.pyx new file mode 100644 index 000000000..c15e66ee0 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/_cython_blas.pyx @@ -0,0 +1,226 @@ +from cython cimport floating + +from scipy.linalg.cython_blas cimport sdot, ddot +from scipy.linalg.cython_blas cimport sasum, dasum +from scipy.linalg.cython_blas cimport saxpy, daxpy +from scipy.linalg.cython_blas cimport snrm2, dnrm2 +from scipy.linalg.cython_blas cimport scopy, dcopy +from scipy.linalg.cython_blas cimport sscal, dscal +from scipy.linalg.cython_blas cimport srotg, drotg +from scipy.linalg.cython_blas cimport srot, drot +from scipy.linalg.cython_blas cimport sgemv, dgemv +from scipy.linalg.cython_blas cimport sger, dger +from scipy.linalg.cython_blas cimport sgemm, dgemm + + +################ +# BLAS Level 1 # +################ + +cdef floating _dot(int n, floating *x, int incx, + floating *y, int incy) nogil: + """x.T.y""" + if floating is float: + return sdot(&n, x, &incx, y, &incy) + else: + return ddot(&n, x, &incx, y, &incy) + + +cpdef _dot_memview(floating[::1] x, floating[::1] y): + return _dot(x.shape[0], &x[0], 1, &y[0], 1) + + +cdef floating _asum(int n, floating *x, int incx) nogil: + """sum(|x_i|)""" + if floating is float: + return sasum(&n, x, &incx) + else: + return dasum(&n, x, &incx) + + +cpdef _asum_memview(floating[::1] x): + return _asum(x.shape[0], &x[0], 1) + + +cdef void _axpy(int n, floating alpha, floating *x, int incx, + floating *y, int incy) 
nogil: + """y := alpha * x + y""" + if floating is float: + saxpy(&n, &alpha, x, &incx, y, &incy) + else: + daxpy(&n, &alpha, x, &incx, y, &incy) + + +cpdef _axpy_memview(floating alpha, floating[::1] x, floating[::1] y): + _axpy(x.shape[0], alpha, &x[0], 1, &y[0], 1) + + +cdef floating _nrm2(int n, floating *x, int incx) nogil: + """sqrt(sum((x_i)^2))""" + if floating is float: + return snrm2(&n, x, &incx) + else: + return dnrm2(&n, x, &incx) + + +cpdef _nrm2_memview(floating[::1] x): + return _nrm2(x.shape[0], &x[0], 1) + + +cdef void _copy(int n, floating *x, int incx, floating *y, int incy) nogil: + """y := x""" + if floating is float: + scopy(&n, x, &incx, y, &incy) + else: + dcopy(&n, x, &incx, y, &incy) + + +cpdef _copy_memview(floating[::1] x, floating[::1] y): + _copy(x.shape[0], &x[0], 1, &y[0], 1) + + +cdef void _scal(int n, floating alpha, floating *x, int incx) nogil: + """x := alpha * x""" + if floating is float: + sscal(&n, &alpha, x, &incx) + else: + dscal(&n, &alpha, x, &incx) + + +cpdef _scal_memview(floating alpha, floating[::1] x): + _scal(x.shape[0], alpha, &x[0], 1) + + +cdef void _rotg(floating *a, floating *b, floating *c, floating *s) nogil: + """Generate plane rotation""" + if floating is float: + srotg(a, b, c, s) + else: + drotg(a, b, c, s) + + +cpdef _rotg_memview(floating a, floating b, floating c, floating s): + _rotg(&a, &b, &c, &s) + return a, b, c, s + + +cdef void _rot(int n, floating *x, int incx, floating *y, int incy, + floating c, floating s) nogil: + """Apply plane rotation""" + if floating is float: + srot(&n, x, &incx, y, &incy, &c, &s) + else: + drot(&n, x, &incx, y, &incy, &c, &s) + + +cpdef _rot_memview(floating[::1] x, floating[::1] y, floating c, floating s): + _rot(x.shape[0], &x[0], 1, &y[0], 1, c, s) + + +################ +# BLAS Level 2 # +################ + +cdef void _gemv(BLAS_Order order, BLAS_Trans ta, int m, int n, floating alpha, + floating *A, int lda, floating *x, int incx, + floating beta, floating *y, int incy) nogil: + """y := alpha * op(A).x + beta * y""" + cdef char ta_ = ta + if order == RowMajor: + ta_ = NoTrans if ta == Trans else Trans + if floating is float: + sgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy) + else: + dgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy) + else: + if floating is float: + sgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy) + else: + dgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy) + + +cpdef _gemv_memview(BLAS_Trans ta, floating alpha, floating[:, :] A, + floating[::1] x, floating beta, floating[::1] y): + cdef: + int m = A.shape[0] + int n = A.shape[1] + BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor + int lda = m if order == ColMajor else n + + _gemv(order, ta, m, n, alpha, &A[0, 0], lda, &x[0], 1, beta, &y[0], 1) + + +cdef void _ger(BLAS_Order order, int m, int n, floating alpha, floating *x, + int incx, floating *y, int incy, floating *A, int lda) nogil: + """A := alpha * x.y.T + A""" + if order == RowMajor: + if floating is float: + sger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda) + else: + dger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda) + else: + if floating is float: + sger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda) + else: + dger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda) + + +cpdef _ger_memview(floating alpha, floating[::1] x, floating[::] y, + floating[:, :] A): + cdef: + int m = A.shape[0] + int n = A.shape[1] + BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor + int lda = 
m if order == ColMajor else n + + _ger(order, m, n, alpha, &x[0], 1, &y[0], 1, &A[0, 0], lda) + + +################ +# BLAS Level 3 # +################ + +cdef void _gemm(BLAS_Order order, BLAS_Trans ta, BLAS_Trans tb, int m, int n, + int k, floating alpha, floating *A, int lda, floating *B, + int ldb, floating beta, floating *C, int ldc) nogil: + """C := alpha * op(A).op(B) + beta * C""" + cdef: + char ta_ = ta + char tb_ = tb + if order == RowMajor: + if floating is float: + sgemm(&tb_, &ta_, &n, &m, &k, &alpha, B, + &ldb, A, &lda, &beta, C, &ldc) + else: + dgemm(&tb_, &ta_, &n, &m, &k, &alpha, B, + &ldb, A, &lda, &beta, C, &ldc) + else: + if floating is float: + sgemm(&ta_, &tb_, &m, &n, &k, &alpha, A, + &lda, B, &ldb, &beta, C, &ldc) + else: + dgemm(&ta_, &tb_, &m, &n, &k, &alpha, A, + &lda, B, &ldb, &beta, C, &ldc) + + +cpdef _gemm_memview(BLAS_Trans ta, BLAS_Trans tb, floating alpha, + floating[:, :] A, floating[:, :] B, floating beta, + floating[:, :] C): + cdef: + int m = A.shape[0] if ta == NoTrans else A.shape[1] + int n = B.shape[1] if tb == NoTrans else B.shape[0] + int k = A.shape[1] if ta == NoTrans else A.shape[0] + int lda, ldb, ldc + BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor + + if order == RowMajor: + lda = k if ta == NoTrans else m + ldb = n if tb == NoTrans else k + ldc = n + else: + lda = m if ta == NoTrans else k + ldb = k if tb == NoTrans else n + ldc = m + + _gemm(order, ta, tb, m, n, k, alpha, &A[0, 0], + lda, &B[0, 0], ldb, beta, &C[0, 0], ldc) diff --git a/python/xorbits/_mars/learn/utils/_encode.py b/python/xorbits/_mars/learn/utils/_encode.py new file mode 100644 index 000000000..da5e4c417 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/_encode.py @@ -0,0 +1,300 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import NamedTuple + +import numpy as np + +from ... import dataframe as md +from ... import tensor as mt +from .core import is_scalar_nan + + +def _unique(values, *, return_inverse=False): + """Helper function to find unique values with support for python objects. + + Uses pure python method for object dtype, and numpy method for + all other dtypes. + + Parameters + ---------- + values : ndarray + Values to check for unknowns. + + return_inverse : bool, default=False + If True, also return the indices of the unique values. + + Returns + ------- + unique : ndarray + The sorted unique values. + + unique_inverse : ndarray + The indices to reconstruct the original array from the unique array. + Only provided if `return_inverse` is True. 
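+ + Notes + ----- + Duplicate missing values are collapsed so that at most one NaN is kept at + the end of `uniques`; when `return_inverse` is True the inverse indices + are remapped accordingly.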
+ """ + if values.dtype == object: + series_unique = md.Series(md.Series(values).unique()).sort_values().values + if return_inverse: + return series_unique, _map_to_integer(values, series_unique) + else: + return series_unique + + out = mt.unique(values, return_inverse=return_inverse) + + if return_inverse: + uniques, inverse = out + else: + uniques = out + + # np.unique will have duplicate missing values at the end of `uniques` + # here we clip the nans and remove it from uniques + uniques = uniques.rechunk(tuple((s,) for idx, s in enumerate(uniques.shape))) + nan_idx = mt.searchsorted(uniques, mt.nan) + uniques = uniques.map_chunk( + lambda c, idx: c[: idx + 1], + args=(nan_idx,), + dtype=uniques.dtype, + shape=(np.nan,) * uniques.ndim, + ) + if return_inverse: + + def inv_mapper(c, idx): + if c.flags.writeable: + c[c > idx] = idx + else: # pragma: no cover + # If c is got from the shared memory, it is immutable. + c = np.select([c <= idx], [c], idx) + return c + + inverse = inverse.map_chunk( + inv_mapper, + args=(nan_idx,), + dtype=inverse.dtype, + shape=((np.nan,),) * inverse.ndim, + ) + + return uniques, inverse + return uniques + + +class MissingValues(NamedTuple): # pragma: no cover + """Data class for missing data information""" + + nan: bool + none: bool + + def to_list(self): + """Convert tuple to a list where None is always first.""" + output = [] + if self.none: + output.append(None) + if self.nan: + output.append(np.nan) + return output + + +def _extract_missing(values): # pragma: no cover + """Extract missing values from `values`. + + Parameters + ---------- + values: set + Set of values to extract missing from. + + Returns + ------- + output: set + Set with missing values extracted. + + missing_values: MissingValues + Object with missing value information. 
+ """ + missing_values_set = { + value for value in values if value is None or is_scalar_nan(value) + } + + if not missing_values_set: + return values, MissingValues(nan=False, none=False) + + if None in missing_values_set: + if len(missing_values_set) == 1: + output_missing_values = MissingValues(nan=False, none=True) + else: + # If there is more than one missing value, then it has to be + # float('nan') or np.nan + output_missing_values = MissingValues(nan=True, none=True) + else: + output_missing_values = MissingValues(nan=True, none=False) + + # create set without the missing values + output = values - missing_values_set + return output, output_missing_values + + +class _nandict(dict): # pragma: no cover + """Dictionary with support for nans.""" + + def __init__(self, mapping): + super().__init__(mapping) + for key, value in mapping.items(): + if is_scalar_nan(key): + self.nan_value = value + break + + def __missing__(self, key): + if hasattr(self, "nan_value") and is_scalar_nan(key): + return self.nan_value + raise KeyError(key) + + +def _map_to_integer(values, uniques, check_unknown=True): + """Map values based on its position in uniques.""" + + def mapper(values_data, uniques_data): + if values_data.dtype.kind in "OUS": + try: + table = _nandict({val: i for i, val in enumerate(uniques_data)}) + return np.array([table[v] for v in values_data]) + except KeyError as e: + raise ValueError(f"y contains previously unseen labels: {str(e)}") + else: + if check_unknown: + diff = _check_unknown(values_data, uniques_data) + if diff: + raise ValueError( + f"y contains previously unseen labels: {str(diff)}" + ) + return np.searchsorted(uniques_data, values_data) + + return values.map_chunk( + mapper, args=(uniques,), dtype=np.dtype(np.int64), shape=values.shape + ) + + +def _check_unknown(values, known_values, return_mask=False): # pragma: no cover + """ + Helper function to check for unknowns in values to be encoded. + + Uses pure python method for object dtype, and numpy method for + all other dtypes. + + Parameters + ---------- + values : array + Values to check for unknowns. + known_values : array + Known values. Must be unique. + return_mask : bool, default=False + If True, return a mask of the same shape as `values` indicating + the valid values. + + Returns + ------- + diff : list + The unique values present in `values` and not in `know_values`. + valid_mask : boolean array + Additionally returned if ``return_mask=True``. 
+ + """ + valid_mask = None + + if values.dtype.kind in "OUS": + values_set = set(values) + values_set, missing_in_values = _extract_missing(values_set) + + uniques_set = set(known_values) + uniques_set, missing_in_uniques = _extract_missing(uniques_set) + diff = values_set - uniques_set + + nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan + none_in_diff = missing_in_values.none and not missing_in_uniques.none + + def is_valid(value): + return ( + value in uniques_set + or missing_in_uniques.none + and value is None + or missing_in_uniques.nan + and is_scalar_nan(value) + ) + + if return_mask: + if diff or nan_in_diff or none_in_diff: + valid_mask = np.array([is_valid(value) for value in values]) + else: + valid_mask = np.ones(len(values), dtype=bool) + + diff = list(diff) + if none_in_diff: + diff.append(None) + if nan_in_diff: + diff.append(np.nan) + else: + unique_values = np.unique(values) + diff = np.setdiff1d(unique_values, known_values, assume_unique=True) + if return_mask: + if diff.size: + valid_mask = np.in1d(values, known_values) + else: + valid_mask = np.ones(len(values), dtype=bool) + + # check for nans in the known_values + if np.isnan(known_values).any(): + diff_is_nan = np.isnan(diff) + if diff_is_nan.any(): + # removes nan from valid_mask + if diff.size and return_mask: + is_nan = np.isnan(values) + valid_mask[is_nan] = 1 + + # remove nan from diff + diff = diff[~diff_is_nan] + diff = list(diff) + + if return_mask: + return diff, valid_mask + return diff + + +def _encode(values, *, uniques, check_unknown=True): + """Helper function to encode values into [0, n_uniques - 1]. + + Uses pure python method for object dtype, and numpy method for + all other dtypes. + The numpy method has the limitation that the `uniques` need to + be sorted. Importantly, this is not checked but assumed to already be + the case. The calling method needs to ensure this for all non-object + values. + + Parameters + ---------- + values : tensor + Values to encode. + uniques : tensor + The unique values in `values`. If the dtype is not object, then + `uniques` needs to be sorted. + check_unknown : bool, default=True + If True, check for values in `values` that are not in `unique` + and raise an error. This is ignored for object dtype, and treated as + True in this case. This parameter is useful for + _BaseEncoder._transform() to avoid calling _check_unknown() + twice. + + Returns + ------- + encoded : tensor + Encoded values + """ + return _map_to_integer(values, uniques, check_unknown=check_unknown) diff --git a/python/xorbits/_mars/learn/utils/checks.py b/python/xorbits/_mars/learn/utils/checks.py new file mode 100644 index 000000000..bd436a35e --- /dev/null +++ b/python/xorbits/_mars/learn/utils/checks.py @@ -0,0 +1,474 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + from sklearn import get_config as get_sklearn_config +except ImportError: # pragma: no cover + get_sklearn_config = None + +from ... 
import opcodes as OperandDef +from ... import tensor as mt +from ...config import options +from ...core import ENTITY_TYPE, get_output_types, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + BoolField, + DataTypeField, + KeyField, + StringField, +) +from ...tensor.array_utils import as_same_device, device, get_array_module, issparse +from ...tensor.core import TENSOR_CHUNK_TYPE, TensorOrder +from ...utils import ceildiv +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class CheckBase(LearnOperand, LearnOperandMixin): + _input = KeyField("input") + _value = KeyField("value") + _err_msg = StringField("err_msg") + + def __init__(self, input=None, value=None, err_msg=None, output_types=None, **kw): + super().__init__( + _input=input, + _value=value, + _err_msg=err_msg, + _output_types=output_types, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def value(self): + return self._value + + @property + def err_msg(self): + return self._err_msg + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._input is not None: + self._input = self._inputs[0] + if self._value is not None: + self._value = self._inputs[-1] + + def __call__(self, x, value=None): + # output input if value not specified + self._value = value = value if value is not None else x + self.output_types = get_output_types(value) + self.stage = OperandStage.agg + return self.new_tileable([x, value], kws=[value.params]) + + @classmethod + def tile(cls, op): + combine_size = options.combine_size + x, value = op.input, op.value + check_chunks = [] + for i, chunk in enumerate(x.chunks): + chunk_op = cls( + err_msg=op.err_msg, + stage=OperandStage.map, + output_types=[OutputType.tensor], + ) + check_chunk = chunk_op.new_chunk( + [chunk], + shape=(), + index=(i,), + dtype=np.dtype(bool), + order=TensorOrder.C_ORDER, + ) + check_chunks.append(check_chunk) + + while len(check_chunks) > 1: + prev_check_chunks = check_chunks + check_chunks = [] + chunk_size = ceildiv(len(prev_check_chunks), combine_size) + for i in range(chunk_size): + chunks = prev_check_chunks[i * combine_size : (i + 1) * combine_size] + chunk_op = cls( + err_msg=op.err_msg, + stage=OperandStage.combine, + output_types=[OutputType.tensor], + ) + check_chunk = chunk_op.new_chunk( + chunks, + shape=(), + index=(i,), + dtype=np.dtype(bool), + order=TensorOrder.C_ORDER, + ) + check_chunks.append(check_chunk) + + check_chunk = check_chunks[0] + out_chunks = [] + for val_chunk in value.chunks: + chunk_op = cls( + value=val_chunk, + err_msg=op.err_msg, + stage=OperandStage.agg, + output_types=op.output_types, + ) + out_chunk = chunk_op.new_chunk( + [check_chunk, val_chunk], kws=[val_chunk.params] + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + kw = op.outputs[0].params + kw["chunks"] = out_chunks + kw["nsplits"] = value.nsplits + return new_op.new_tileables(op.inputs, kws=[kw]) + + +class CheckNonNegative(CheckBase): + _op_type_ = OperandDef.CHECK_NON_NEGATIVE + + _whom = StringField("whom") + + def __init__( + self, + input=None, + value=None, + whom=None, + err_msg=None, + stage=None, + gpu=None, + output_types=None, + **kw, + ): + super().__init__( + input=input, + value=value, + _whom=whom, + err_msg=err_msg, + stage=stage, + output_types=output_types, + gpu=gpu, + **kw, + ) + if self._err_msg is None and self._whom is not None: + self._err_msg = f"Negative values in data passed to {self._whom}" + + @property + def whom(self): + return 
self._whom + + @classmethod + def _execute_tensor(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if issparse(x) and x.nnz == 0: + x_min = 0 + else: + x_min = xp.min(x) + + if x_min < 0: + raise ValueError(op.err_msg) + + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def _execute_df(cls, ctx, op): + x = ctx[op.inputs[0].key] + x_min = x.min().min() + if x_min < 0: + raise ValueError(op.err_msg) + + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def _execute_map(cls, ctx, op): + if isinstance(op.inputs[0], TENSOR_CHUNK_TYPE): + return cls._execute_tensor(ctx, op) + else: + return cls._execute_df(ctx, op) + + @classmethod + def _execute_combine(cls, ctx, op): + # just pass value cuz all inputs executed successfully + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def _execute_agg(cls, ctx, op): + ctx[op.outputs[0].key] = ctx[op.value.key] + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + return cls._execute_combine(ctx, op) + else: + assert op.stage == OperandStage.agg + return cls._execute_agg(ctx, op) + + +def check_non_negative_then_return_value(to_check, value, whom): + op = CheckNonNegative(input=to_check, value=value, whom=whom) + return op(to_check, value) + + +class AssertAllFinite(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.ASSERT_ALL_FINITE + + _x = KeyField("x") + _allow_nan = BoolField("allow_nan") + _msg_dtype = DataTypeField("msg_dtype") + _check_only = BoolField("check_only") + # chunks + _is_finite = KeyField("is_finite") + _check_nan = KeyField("check_nan") + + def __init__( + self, + x=None, + allow_nan=None, + msg_dtype=None, + check_only=None, + is_finite=None, + check_nan=None, + output_types=None, + **kw, + ): + super().__init__( + _x=x, + _allow_nan=allow_nan, + _msg_dtype=msg_dtype, + _check_only=check_only, + _is_finite=is_finite, + _check_nan=check_nan, + _output_types=output_types, + **kw, + ) + + @property + def x(self): + return self._x + + @property + def allow_nan(self): + return self._allow_nan + + @property + def msg_dtype(self): + return self._msg_dtype + + @property + def check_only(self): + return self._check_only + + @property + def is_finite(self): + return self._is_finite + + @property + def check_nan(self): + return self._check_nan + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + for attr in ("_x", "_is_finite", "_check_nan"): + if getattr(self, attr) is not None: + setattr(self, attr, next(inputs_iter)) + + @classmethod + def _assume_finite(cls): + assume_finite = options.learn.assume_finite + if assume_finite is None and get_sklearn_config is not None: + # get config from scikit-learn + assume_finite = get_sklearn_config()["assume_finite"] + if assume_finite is None: # pragma: no cover + assume_finite = False + + return assume_finite + + def __call__(self, x): + if self._assume_finite(): + # skip check + if self._check_only: + return + else: + return x + + if self._check_only: + return self.new_tileable( + [x], dtype=np.dtype(bool), shape=(), order=TensorOrder.C_ORDER + ) + else: + return self.new_tileable([x], kws=[x.params]) + + @classmethod + def tile(cls, op): + from .extmath import _safe_accumulator_op + + x = op.x + out = op.outputs[0] + is_float = x.dtype.kind in "fc" + combine_size = options.combine_size + + 
is_finite_chunk = check_nan_chunk = None + if is_float: + is_finite_chunk = ( + yield from recursive_tile(mt.isfinite(_safe_accumulator_op(mt.sum, x))) + ).chunks[0] + elif x.dtype == np.dtype(object) and not op.allow_nan: + check_nan_chunk = (yield from recursive_tile((x != x).any())).chunks[0] + + map_chunks = [] + for c in x.chunks: + chunk_op = op.copy().reset_key() + chunk_op.stage = OperandStage.map + chunk_op._is_finite = is_finite_chunk + chunk_op._check_nan = check_nan_chunk + chunk_inputs = [c] + if is_finite_chunk is not None: + chunk_inputs.append(is_finite_chunk) + if check_nan_chunk is not None: + chunk_inputs.append(check_nan_chunk) + chunk_params = c.params + if op.check_only: + chunk_params["dtype"] = np.dtype(bool) + chunk_params["shape"] = () + if len(x.chunks) == 1: + chunk_params["index"] = () + map_chunk = chunk_op.new_chunk(chunk_inputs, kws=[chunk_params]) + map_chunks.append(map_chunk) + + new_op = op.copy() + if not op.check_only: + params = out.params + params["nsplits"] = x.nsplits + params["chunks"] = map_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + out_chunks = map_chunks + # if check only, we use tree reduction to aggregate to one chunk + while len(out_chunks) > 1: + size = ceildiv(len(out_chunks), combine_size) + new_out_chunks = [] + for i in range(size): + chunk_op = AssertAllFinite( + check_only=True, + output_types=op.output_types, + stage=OperandStage.combine if size > 1 else OperandStage.agg, + ) + chunk_index = (i,) if size > 1 else () + out_chunk = chunk_op.new_chunk( + out_chunks[combine_size * i : combine_size * (i + 1)], + dtype=out.dtype, + shape=(), + index=chunk_index, + order=out.order, + ) + new_out_chunks.append(out_chunk) + out_chunks = new_out_chunks + + params = out.params + params["nsplits"] = () + params["chunks"] = out_chunks + return new_op.new_tileables(op.inputs, kws=[params]) + + @classmethod + def _execute_map(cls, ctx, op): + allow_nan = op.allow_nan + msg_dtype = op.msg_dtype + raw = x = ctx[op.x.key] + xp = get_array_module(x, nosparse=True) + + if issparse(x): + x = x.data + # First try an O(n) time, O(1) space solution for the common case that + # everything is finite; fall back to O(n) space np.isfinite to prevent + # false positives from overflow in sum method. The sum is also calculated + # safely to reduce dtype induced overflows. + is_float = x.dtype.kind in "fc" + if is_float and ctx[op.is_finite.key]: + pass + elif is_float: + msg_err = "Input contains {} or a value too large for {!r}." 
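+ # the pre-computed global finiteness flag is False here, so scan the chunk + # explicitly to tell NaN from infinity apart and raise a precise error message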
+ if ( + allow_nan + and xp.isinf(x).any() + or not allow_nan + and not xp.isfinite(x).all() + ): + type_err = "infinity" if allow_nan else "NaN, infinity" + raise ValueError( + msg_err.format( + type_err, msg_dtype if msg_dtype is not None else x.dtype + ) + ) + # for object dtype data, we only check for NaNs + elif x.dtype == np.dtype(object) and not allow_nan: + if ctx[op.check_nan.key]: + raise ValueError("Input contains NaN") + + if op.check_only: + result = np.array(True) + else: + result = raw + ctx[op.outputs[0].key] = result + + @classmethod + def _execute_combine_reduce(cls, ctx, op): + # just return True + ctx[op.outputs[0].key] = np.array(True) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + else: + assert op.stage in (OperandStage.combine, OperandStage.agg) + return cls._execute_combine_reduce(ctx, op) + + +def assert_all_finite(X, allow_nan=False, msg_dtype=None, check_only=True): + if not isinstance(X, ENTITY_TYPE): + X = mt.asarray(X) + + if ( + isinstance(X.op, AssertAllFinite) + and X.op.allow_nan == allow_nan + and X.op.msg_dtype == msg_dtype + and X.op.check_only == check_only + ): + return X + + if check_only: + output_types = [OutputType.tensor] + sparse = False + else: + output_types = get_output_types(X) + sparse = X.issparse() + + op = AssertAllFinite( + x=X, + allow_nan=allow_nan, + msg_dtype=msg_dtype, + check_only=check_only, + sparse=sparse, + output_types=output_types, + ) + return op(X) diff --git a/python/xorbits/_mars/learn/utils/collect_ports.py b/python/xorbits/_mars/learn/utils/collect_ports.py new file mode 100644 index 000000000..0bdfa959f --- /dev/null +++ b/python/xorbits/_mars/learn/utils/collect_ports.py @@ -0,0 +1,111 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import socket + +import numpy as np + +from ... 
import opcodes +from ...core.operand import OperandStage +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + ListField, + StringField, +) +from ...tensor.merge import TensorConcatenate +from ...utils import get_next_port +from ..operands import LearnOperand, LearnOperandMixin, OutputType + + +class CollectPorts(LearnOperand, LearnOperandMixin): + _op_code_ = opcodes.COLLECT_PORTS + + _socket_type = Int32Field("socket_type") + _index = Int32Field("index") + _workers = ListField("workers", FieldTypes.string) + _tileable_key = StringField("tileable_key") + + def __init__( + self, workers=None, socket_type=None, tileable_key=None, index=None, **kw + ): + super().__init__( + _socket_type=socket_type, + _workers=workers, + _tileable_key=tileable_key, + _index=index, + _pure_depends=[True], + **kw + ) + + @property + def socket_type(self): + return self._socket_type + + @property + def workers(self): + return self._workers + + @property + def tileable_key(self): + return self._tileable_key + + def __call__(self, dep=None): + self._output_types = [OutputType.tensor] + if dep: + deps = [dep] + else: + deps = None + return self.new_tileable(deps, shape=(len(self.workers),), dtype=np.dtype(int)) + + @classmethod + def tile(cls, op: "CollectPorts"): + chunks = [] + if op.inputs: + chunk_iter = itertools.cycle(op.inputs[0].chunks) + else: + chunk_iter = itertools.repeat(None) + for idx, (worker, inp) in enumerate(zip(op.workers, chunk_iter)): + new_op = op.copy().reset_key() + new_op._workers = [worker] + new_op.expect_worker = worker + new_op.stage = OperandStage.map + new_op._tileable_key = op.outputs[0].key + new_op._index = idx + new_op._pure_depends = [True] + inps = [inp] if inp else None + chunks.append( + new_op.new_chunk(inps, index=(idx,), shape=(1,), dtype=np.dtype(int)) + ) + + concat_op = TensorConcatenate(axis=0, dtype=chunks[0].dtype) + concat_chunk = concat_op.new_chunk(chunks, shape=(len(op.workers),), index=(0,)) + + new_op = op.copy().reset_key() + params = op.outputs[0].params + params.update(dict(chunks=[concat_chunk], nsplits=((len(op.workers),),))) + return new_op.new_tileables(op.inputs, **params) + + @classmethod + def execute(cls, ctx, op): + assert ctx.band[0] == op.expect_worker + socket_type = op.socket_type or socket.SOCK_STREAM + port_num = get_next_port(socket_type, occupy=False) + ctx[op.outputs[0].key] = np.array([port_num], dtype=int) + + +def collect_ports(workers, input_tileable=None): + op = CollectPorts(workers=workers) + return op(input_tileable) diff --git a/python/xorbits/_mars/learn/utils/core.py b/python/xorbits/_mars/learn/utils/core.py new file mode 100644 index 000000000..e12c0b3aa --- /dev/null +++ b/python/xorbits/_mars/learn/utils/core.py @@ -0,0 +1,155 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import numbers +import warnings +from typing import List + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator + +try: + from sklearn import get_config as sklearn_get_config +except ImportError: # pragma: no cover + sklearn_get_config = None + +from ... import options +from ...core import enter_mode +from ...dataframe import DataFrame, Series +from ...dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ...tensor import tensor as astensor +from ...typing import TileableType +from ...utils import parse_readable_size + + +def convert_to_tensor_or_dataframe(item): + if isinstance(item, (DATAFRAME_TYPE, pd.DataFrame)): + item = DataFrame(item) + elif isinstance(item, (SERIES_TYPE, pd.Series)): + item = Series(item) + else: + item = astensor(item) + return item + + +def concat_chunks(chunks): + tileable = chunks[0].op.create_tileable_from_chunks(chunks) + return tileable.op.concat_tileable_chunks(tileable).chunks[0] + + +def copy_learned_attributes(from_estimator: BaseEstimator, to_estimator: BaseEstimator): + attrs = { + k: v + for k, v in vars(from_estimator).items() + if k.endswith("_") or k.startswith("_") + } + for k, v in attrs.items(): + setattr(to_estimator, k, v) + + +def is_scalar_nan(x): + """Tests if x is NaN. + + This function is meant to overcome the issue that np.isnan does not allow + non-numerical types as input, and that np.nan is not float('nan'). + + Parameters + ---------- + x : any type + + Returns + ------- + boolean + + Examples + -------- + >>> is_scalar_nan(np.nan) + True + >>> is_scalar_nan(float("nan")) + True + >>> is_scalar_nan(None) + False + >>> is_scalar_nan("") + False + >>> is_scalar_nan([np.nan]) + False + """ + return isinstance(x, numbers.Real) and math.isnan(x) + + +def get_chunk_n_rows(row_bytes, max_n_rows=None, working_memory=None): + """Calculates how many rows can be processed within working_memory + + Parameters + ---------- + row_bytes : int + The expected number of bytes of memory that will be consumed + during the processing of each row. + max_n_rows : int, optional + The maximum return value. + working_memory : int or float, optional + The number of rows to fit inside this number of MiB will be returned. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + Returns + ------- + int or the value of n_samples + + Warns + ----- + Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB. + """ + + if working_memory is None: # pragma: no cover + working_memory = options.learn.working_memory + if working_memory is None and sklearn_get_config is not None: + working_memory = sklearn_get_config()["working_memory"] + elif working_memory is None: + working_memory = 1024 + + if isinstance(working_memory, int): + working_memory *= 2**20 + else: + working_memory = parse_readable_size(working_memory)[0] + + chunk_n_rows = int(working_memory // row_bytes) + if max_n_rows is not None: + chunk_n_rows = min(chunk_n_rows, max_n_rows) + if chunk_n_rows < 1: # pragma: no cover + warnings.warn( + "Could not adhere to working_memory config. " + "Currently %.0fMiB, %.0fMiB required." 
+ % (working_memory, np.ceil(row_bytes * 2**-20)) + ) + chunk_n_rows = 1 + return chunk_n_rows + + +@enter_mode(build=True) +def sort_by( + tensors: List[TileableType], by: TileableType, ascending: bool = True +) -> List[TileableType]: + # sort tensors by another tensor + i_to_tensors = {i: t for i, t in enumerate(tensors)} + if by not in tensors: + by_name = len(i_to_tensors) + i_to_tensors[by_name] = by + else: + by_name = tensors.index(by) + df = DataFrame(i_to_tensors) + sorted_df = df.sort_values(by_name, ascending=ascending) + return [sorted_df[i].to_tensor() for i in range(len(tensors))] diff --git a/python/xorbits/_mars/learn/utils/extmath.py b/python/xorbits/_mars/learn/utils/extmath.py new file mode 100644 index 000000000..76c8d07fc --- /dev/null +++ b/python/xorbits/_mars/learn/utils/extmath.py @@ -0,0 +1,107 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import tensor as mt + + +# Use at least float64 for the accumulating functions to avoid precision issue +# see https://github.com/numpy/numpy/issues/9393. The float64 is also retained +# as it is in case the float overflows +def _safe_accumulator_op(op, x, *args, **kwargs): + """ + This function provides numpy accumulator functions with a float64 dtype + when used on a floating point input. This prevents accumulator overflow on + smaller floating point dtypes. + + Parameters + ---------- + op : function + A accumulator function such as np.mean or np.sum + x : numpy array + A tensor to apply the accumulator function + *args : positional arguments + Positional arguments passed to the accumulator function after the + input x + **kwargs : keyword arguments + Keyword arguments passed to the accumulator function + + Returns + ------- + result : The output of the accumulator function passed to this function + """ + if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8: + result = op(x, *args, **kwargs, dtype=np.float64) + else: + result = op(x, *args, **kwargs) + return result + + +def row_norms(X, squared=False): + """Row-wise (squared) Euclidean norm of X. + + Performs no input validation. + + Parameters + ---------- + X : array_like + The input tensor + squared : bool, optional (default = False) + If True, return squared norms. + + Returns + ------- + array_like + The row-wise (squared) Euclidean norm of X. + """ + + norms = (X * X).sum(axis=1) + if not squared: + norms = mt.sqrt(norms) + return norms + + +def softmax(X, copy=True): + """ + Calculate the softmax function. + + The softmax function is calculated by + np.exp(X) / np.sum(np.exp(X), axis=1) + + This will cause overflow when large values are exponentiated. + Hence the largest value in each row is subtracted from each data + point to prevent this. + + Parameters + ---------- + X : array-like of float of shape (M, N) + Argument to the logistic function. + + copy : bool, default=True + Copy X or not. 
+ + Returns + ------- + out : ndarray of shape (M, N) + Softmax function evaluated at every point in x. + """ + if copy: + X = mt.copy(X) + max_prob = mt.max(X, axis=1).reshape((-1, 1)) + X = X - max_prob + X = mt.exp(X) + sum_prob = mt.sum(X, axis=1).reshape((-1, 1)) + X = X / sum_prob + return X diff --git a/python/xorbits/_mars/learn/utils/multiclass.py b/python/xorbits/_mars/learn/utils/multiclass.py new file mode 100644 index 000000000..854e37211 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/multiclass.py @@ -0,0 +1,465 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Sequence +from typing import List + +import numpy as np +from scipy.sparse.base import spmatrix +from sklearn.utils.multiclass import is_multilabel as sklearn_is_multilabel +from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target + +from ... import opcodes as OperandDef +from ... import tensor as mt +from ...core import ENTITY_TYPE, TILEABLE_TYPE, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, ListField +from ...tensor.core import TENSOR_TYPE, TensorOrder +from ...typing import TileableType +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin, OutputType +from ..utils import assert_all_finite +from .validation import check_array + + +def _unique_multiclass(y): + if hasattr(y, "__array__") or hasattr(y, "__mars_tensor__"): + return mt.unique(mt.asarray(y)) + else: + return set(y) + + +def _unique_indicator(y): + return mt.arange(check_array(y, accept_sparse=True).shape[1]) + + +_FN_UNIQUE_LABELS = { + "binary": _unique_multiclass, + "multiclass": _unique_multiclass, + "multilabel-indicator": _unique_indicator, +} + + +class UniqueLabels(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.UNIQUE_LABELS + + ys = ListField("ys") + + def __call__(self, ys: List[TileableType]): + self._output_types = [OutputType.tensor] + inputs = [y for y in ys if isinstance(y, TILEABLE_TYPE)] + return self.new_tileable( + inputs, + shape=(np.nan,), + dtype=mt.tensor(ys[0]).dtype, + order=TensorOrder.C_ORDER, + ) + + @classmethod + def tile(cls, op: "UniqueLabels"): + ys = op.ys + ctx = get_context() + + target_types = yield from recursive_tile([type_of_target(x) for x in ys]) + # yield chunks of target_types for execution + chunks = list(itertools.chain(*(t.chunks for t in target_types))) + yield chunks + + ys_types = set( + [it.item() for it in ctx.get_chunks_result([c.key for c in chunks])] + ) + if ys_types == {"binary", "multiclass"}: + ys_types = {"multiclass"} + + if len(ys_types) > 1: + raise ValueError("Mix type of y not allowed, got types %s" % ys_types) + + label_type = ys_types.pop() + + # Check consistency for the indicator format + if label_type == "multilabel-indicator": + check_arrays = [] + chunks = [] + for y in ys: + arr = yield from recursive_tile(check_array(y, accept_sparse=True)) + 
check_arrays.append(arr) + chunks.extend(arr.chunks) + yield check_arrays + chunks + if len(set(arr.shape[1] for arr in check_arrays)) > 1: + raise ValueError( + "Multi-label binary indicator input with " + "different numbers of labels" + ) + + # Get the unique set of labels + _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) + if not _unique_labels: + raise ValueError("Unknown label type: %s" % repr(ys)) + + labels = [_unique_labels(y) for y in ys] + labels_chunks = [] + ys_labels = set() + for label in labels: + if isinstance(label, ENTITY_TYPE): + label = yield from recursive_tile(label) + labels_chunks.extend(label.chunks) + else: + ys_labels.update(label) + yield labels_chunks + ys_labels.update( + itertools.chain.from_iterable( + ctx.get_chunks_result([c.key for c in labels_chunks]) + ) + ) + + # Check that we don't mix string type with number type + if len(set(isinstance(label, str) for label in ys_labels)) > 1: + raise ValueError("Mix of label input types (string and number)") + + return (yield from recursive_tile(mt.array(sorted(ys_labels)))) + + +def unique_labels(*ys): + """ + Extract an ordered array of unique labels. + + We don't allow: + - mix of multilabel and multiclass (single label) targets + - mix of label indicator matrix and anything else, + because there are no explicit labels) + - mix of label indicator matrices of different sizes + - mix of string and integer labels + + At the moment, we also don't allow "multiclass-multioutput" input type. + + Parameters + ---------- + *ys : array-likes + + Returns + ------- + out : ndarray of shape (n_unique_labels,) + An ordered array of unique labels. + + Examples + -------- + >>> from mars.learn.utils.multiclass import unique_labels + >>> unique_labels([3, 5, 5, 5, 7, 7]).execute() + array([3, 5, 7]) + >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]).execute() + array([1, 2, 3, 4]) + >>> unique_labels([1, 2, 10], [5, 11]).execute() + array([ 1, 2, 5, 10, 11]) + """ + if not ys: + raise ValueError("No argument has been passed.") + + ys = list(ys) + op = UniqueLabels(ys=ys) + return op(ys) + + +class IsMultilabel(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.IS_MULTILABEL + + y = AnyField("y") + + def __call__(self, y): + self._output_types = [OutputType.tensor] + inputs = [y] if isinstance(y, ENTITY_TYPE) else [] + return self.new_tileable( + inputs, shape=(), dtype=np.dtype(bool), order=TensorOrder.C_ORDER + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._inputs: + self.y = self._inputs[0] + + @classmethod + def _tile(cls, op: "IsMultilabel"): + y = op.y + + if not isinstance(y, ENTITY_TYPE): + return sklearn_is_multilabel(y) + + ctx = get_context() + + if has_unknown_shape(y): # pragma: no cover + yield + + if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1): + return False + + labels = yield from recursive_tile(mt.unique(y)) + yield labels.chunks + [labels] + + if len(labels) < 3: + if y.dtype.kind in "biu": + return True + if y.dtype.kind == "f": + is_integral_float = yield from recursive_tile( + mt.all(mt.equal(y.astype(int), y)) + ) + yield is_integral_float.chunks + is_integral_float = ctx.get_chunks_result( + [is_integral_float.chunks[0].key] + )[0] + if is_integral_float: + return True + + return False + + @classmethod + def tile(cls, op: "IsMultilabel"): + result = yield from cls._tile(op) + return (yield from recursive_tile(mt.array(result))) + + +def is_multilabel(y): + """ + Check if ``y`` is in a multilabel format. 
+ + Parameters + ---------- + y : numpy array of shape [n_samples] + Target values. + + Returns + ------- + out : bool, + Return ``True``, if ``y`` is in a multilabel format, else ```False``. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.utils.multiclass import is_multilabel + >>> is_multilabel([0, 1, 0, 1]).execute() + False + >>> is_multilabel([[1], [0, 2], []]).execute() + False + >>> is_multilabel(mt.array([[1, 0], [0, 0]])).execute() + True + >>> is_multilabel(mt.array([[1], [0], [0]])).execute() + False + >>> is_multilabel(mt.array([[1, 0, 0]])).execute() + True + """ + if not isinstance(y, ENTITY_TYPE): + if hasattr(y, "__array__") or isinstance(y, Sequence): + y = np.asarray(y) + yt = None + else: + yt = y = mt.tensor(y) + + op = IsMultilabel(y=y) + return op(yt) + + +class TypeOfTarget(LearnOperand, LearnOperandMixin): + _op_type_ = OperandDef.TYPE_OF_TARGET + + y = AnyField("y") + + def __call__(self, y: TileableType): + self._output_types = [OutputType.tensor] + inputs = [y] if isinstance(y, ENTITY_TYPE) else [] + return self.new_tileable( + inputs, shape=(), order=TensorOrder.C_ORDER, dtype=np.dtype(object) + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._inputs: + self.y = self._inputs[0] + + @classmethod + def _tile(cls, op: "TypeOfTarget"): + y = op.y + + # y is ndarray + if not isinstance(y, ENTITY_TYPE): + return sklearn_type_of_target(y) + else: + # make sure y executed + yield + + ctx = get_context() + + multilabel = yield from recursive_tile(is_multilabel(y)) + yield multilabel.chunks + multilabel = ctx.get_chunks_result([multilabel.chunks[0].key])[0] + if multilabel: + return "multilabel-indicator" + + # Invalid inputs + if y.ndim > 2: + return "unknown" + if y.dtype == object and len(y): + # [[[1, 2]]] or [obj_1] and not ["label_1"] + first_val = ctx.get_chunks_result([y.chunks[0].key])[0].flat[0] + if not isinstance(first_val, str): + return "unknown" + + if y.ndim == 2 and y.shape[1] == 0: + return "unknown" # [[]] + + if y.ndim == 2 and y.shape[1] > 1: + suffix = "-multioutput" # [[1, 2], [1, 2]] + else: + suffix = "" # [1, 2, 3] or [[1], [2], [3]] + + if y.dtype.kind == "f": + # check float and contains non-integer float values + contain_float_values = yield from recursive_tile(mt.any(y != y.astype(int))) + yield contain_float_values.chunks + contain_float_values = ctx.get_chunks_result( + [contain_float_values.chunks[0].key] + )[0] + # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] + if contain_float_values: + yield from recursive_tile(assert_all_finite(y)) + return "continuous" + suffix + + unique_y = yield from recursive_tile(mt.unique(y)) + yield unique_y.chunks + [unique_y] + if (len(unique_y) > 2) or (y.ndim >= 2 and len(y[0]) > 1): + return "multiclass" + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] + else: + return "binary" # [1, 2] or [["a"], ["b"]] + + @classmethod + def tile(cls, op: "TypeOfTarget"): + result = yield from cls._tile(op) + return (yield from recursive_tile(mt.array(result))) + + +def type_of_target(y): + """ + Determine the type of data indicated by the target. + + Note that this type is the most specific type that can be inferred. + For example: + + * ``binary`` is more specific but compatible with ``multiclass``. + * ``multiclass`` of integers is more specific but compatible with + ``continuous``. + * ``multilabel-indicator`` is more specific but compatible with + ``multiclass-multioutput``. 
+ + Parameters + ---------- + y : array-like + + Returns + ------- + target_type : string + One of: + + * 'continuous': `y` is an array-like of floats that are not all + integers, and is 1d or a column vector. + * 'continuous-multioutput': `y` is a 2d tensor of floats that are + not all integers, and both dimensions are of size > 1. + * 'binary': `y` contains <= 2 discrete values and is 1d or a column + vector. + * 'multiclass': `y` contains more than two discrete values, is not a + sequence of sequences, and is 1d or a column vector. + * 'multiclass-multioutput': `y` is a 2d tensor that contains more + than two discrete values, is not a sequence of sequences, and both + dimensions are of size > 1. + * 'multilabel-indicator': `y` is a label indicator matrix, a tensor + of two dimensions with at least two columns, and at most 2 unique + values. + * 'unknown': `y` is array-like but none of the above, such as a 3d + tensor, sequence of sequences, or a tensor of non-sequence objects. + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.learn.utils.multiclass import type_of_target + >>> type_of_target([0.1, 0.6]).execute() + 'continuous' + >>> type_of_target([1, -1, -1, 1]).execute() + 'binary' + >>> type_of_target(['a', 'b', 'a']).execute() + 'binary' + >>> type_of_target([1.0, 2.0]).execute() + 'binary' + >>> type_of_target([1, 0, 2]).execute() + 'multiclass' + >>> type_of_target([1.0, 0.0, 3.0]).execute() + 'multiclass' + >>> type_of_target(['a', 'b', 'c']).execute() + 'multiclass' + >>> type_of_target(mt.array([[1, 2], [3, 1]])).execute() + 'multiclass-multioutput' + >>> type_of_target([[1, 2]]).execute() + 'multiclass-multioutput' + >>> type_of_target(mt.array([[1.5, 2.0], [3.0, 1.6]])).execute() + 'continuous-multioutput' + >>> type_of_target(mt.array([[0, 1], [1, 1]])).execute() + 'multilabel-indicator' + """ + if isinstance(y, TENSOR_TYPE): + y = mt.tensor(y) + + valid_types = (Sequence, spmatrix) if spmatrix is not None else (Sequence,) + valid = ( + isinstance(y, valid_types) + or hasattr(y, "__array__") + or hasattr(y, "__mars_tensor__") + ) and not isinstance(y, str) + + if not valid: + raise ValueError(f"Expected array-like (array or non-string sequence), got {y}") + + sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"] + if sparse_pandas: # pragma: no cover + raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") + + if isinstance(y, ENTITY_TYPE): + y = mt.tensor(y) + + op = TypeOfTarget(y=y) + return op(y) + + +def check_classification_targets(y): + """ + Ensure that target y is of a non-regression type. + + Only the following target types (as defined in type_of_target) are allowed: + 'binary', 'multiclass', 'multiclass-multioutput', + 'multilabel-indicator', 'multilabel-sequences' + + Parameters + ---------- + y : array-like + """ + y_type = type_of_target(y) + + def check(t): + if t not in [ + "binary", + "multiclass", + "multiclass-multioutput", + "multilabel-indicator", + "multilabel-sequences", + ]: + raise ValueError("Unknown label type: %r" % t) + return t + + y_type = y_type.map_chunk(check, dtype=y_type.dtype) + return y_type diff --git a/python/xorbits/_mars/learn/utils/shuffle.py b/python/xorbits/_mars/learn/utils/shuffle.py new file mode 100644 index 000000000..8b8a3d646 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/shuffle.py @@ -0,0 +1,490 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections import defaultdict +from collections.abc import Iterable +from functools import reduce + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple, get_output_types, recursive_tile +from ...core.operand import MapReduceOperand, OperandStage +from ...dataframe.utils import parse_index +from ...lib import sparse +from ...serialization.serializables import FieldTypes, KeyField, TupleField +from ...tensor.array_utils import get_array_module +from ...tensor.utils import ( + check_random_state, + decide_unify_split, + gen_random_seeds, + validate_axis, +) +from ...utils import has_unknown_shape, lazy_import, tokenize +from ..operands import LearnOperandMixin, LearnShuffleProxy, OutputType +from ..utils import convert_to_tensor_or_dataframe + +cudf = lazy_import("cudf") + + +def _shuffle_index_value(op, index_value, chunk_index=None): + key = tokenize((op._values_, chunk_index, index_value.key)) + return parse_index(pd.Index([], index_value.to_pandas().dtype), key=key) + + +def _safe_slice(obj, slc, output_type): + if output_type == OutputType.tensor: + return obj[slc] + else: + return obj.iloc[slc] + + +class LearnShuffle(MapReduceOperand, LearnOperandMixin): + _op_type_ = OperandDef.PERMUTATION + + _axes = TupleField("axes", FieldTypes.int32) + _seeds = TupleField("seeds", FieldTypes.uint32) + + _input = KeyField("input") + _reduce_sizes = TupleField("reduce_sizes", FieldTypes.uint32) + + def __init__( + self, axes=None, seeds=None, output_types=None, reduce_sizes=None, **kw + ): + super().__init__( + _axes=axes, + _seeds=seeds, + _output_types=output_types, + _reduce_sizes=reduce_sizes, + **kw, + ) + + @property + def axes(self): + return self._axes + + @property + def seeds(self): + return self._seeds + + @property + def input(self): + return self._input + + @property + def reduce_sizes(self): + return self._reduce_sizes + + @property + def output_limit(self): + if self.stage is None: + return len(self.output_types) + return 1 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, arrays): + params = self._calc_params([ar.params for ar in arrays]) + return self.new_tileables(arrays, kws=params) + + def _shuffle_index_value(self, index_value): + return _shuffle_index_value(self, index_value) + + def _shuffle_dtypes(self, dtypes): + seed = self.seeds[self.axes.index(1)] + rs = np.random.RandomState(seed) + shuffled_dtypes = dtypes[rs.permutation(np.arange(len(dtypes)))] + return shuffled_dtypes + + def _calc_params(self, params): + axes = set(self.axes) + for i, output_type, param in zip(itertools.count(0), self.output_types, params): + if output_type == OutputType.dataframe: + if 0 in axes: + param["index_value"] = self._shuffle_index_value( + param["index_value"] + ) + if 1 in axes: + dtypes = param["dtypes"] = self._shuffle_dtypes(param["dtypes"]) + param["columns_value"] = 
parse_index(dtypes.index, store_data=True) + elif output_type == OutputType.series: + if 0 in axes: + param["index_value"] = self._shuffle_index_value( + param["index_value"] + ) + param["_position_"] = i + return params + + @staticmethod + def _safe_rechunk(tileable, ax_nsplit): + do_rechunk = False + for ax, nsplit in ax_nsplit.items(): + if ax >= tileable.ndim: + continue + if tuple(tileable.nsplits[ax]) != tuple(nsplit): + do_rechunk = True + if do_rechunk: + return (yield from recursive_tile(tileable.rechunk(ax_nsplit))) + else: + return tileable + + @classmethod + def _calc_chunk_params( + cls, + in_chunk, + axes, + chunk_shape, + output, + output_type, + chunk_op, + no_shuffle: bool, + ): + params = {"index": in_chunk.index} + if output_type == OutputType.tensor: + shape_c = list(in_chunk.shape) + for ax in axes: + if not no_shuffle and chunk_shape[ax] > 1: + shape_c[ax] = np.nan + params["shape"] = tuple(shape_c) + params["dtype"] = in_chunk.dtype + params["order"] = output.order + elif output_type == OutputType.dataframe: + shape_c = list(in_chunk.shape) + if 0 in axes: + if not no_shuffle and chunk_shape[0] > 1: + shape_c[0] = np.nan + params["shape"] = tuple(shape_c) + if 1 not in axes: + params["dtypes"] = in_chunk.dtypes + params["columns_value"] = in_chunk.columns_value + else: + params["dtypes"] = output.dtypes + params["columns_value"] = output.columns_value + params["index_value"] = _shuffle_index_value( + chunk_op, in_chunk.index_value, in_chunk.index + ) + else: + assert output_type == OutputType.series + if no_shuffle: + params["shape"] = in_chunk.shape + else: + params["shape"] = (np.nan,) + params["name"] = in_chunk.name + params["index_value"] = _shuffle_index_value( + chunk_op, in_chunk.index_value, in_chunk.index + ) + params["dtype"] = in_chunk.dtype + return params + + @classmethod + def tile(cls, op): + inputs = op.inputs + if has_unknown_shape(inputs): + yield + axis_to_nsplits = defaultdict(list) + has_dataframe = any( + output_type == OutputType.dataframe for output_type in op.output_types + ) + for ax in op.axes: + if has_dataframe and ax == 1: + # if DataFrame exists, for the columns axis, + # we only allow 1 chunk to ensure the columns consistent + axis_to_nsplits[ax].append((inputs[0].shape[ax],)) + continue + for inp in inputs: + if ax < inp.ndim: + axis_to_nsplits[ax].append(inp.nsplits[ax]) + ax_nsplit = {ax: decide_unify_split(*ns) for ax, ns in axis_to_nsplits.items()} + rechunked_inputs = [] + for inp in inputs: + inp_ax_nsplit = {ax: ns for ax, ns in ax_nsplit.items() if ax < inp.ndim} + inp = yield from cls._safe_rechunk(inp, inp_ax_nsplit) + rechunked_inputs.append(inp) + inputs = rechunked_inputs + + mapper_seeds = [None] * len(op.axes) + reducer_seeds = [None] * len(op.axes) + for i, ax in enumerate(op.axes): + rs = np.random.RandomState(op.seeds[i]) + size = len(ax_nsplit[ax]) + if size > 1: + mapper_seeds[i] = gen_random_seeds(size, rs) + reducer_seeds[i] = gen_random_seeds(size, rs) + else: + mapper_seeds[i] = reducer_seeds[i] = [op.seeds[i]] * size + out_chunks = [] + out_nsplits = [] + for output_type, inp, oup in zip(op.output_types, inputs, op.outputs): + inp_axes = tuple(ax for ax in op.axes if ax < inp.ndim) + reduce_sizes = tuple(inp.chunk_shape[ax] for ax in inp_axes) + output_types = [output_type] + + if len(inp_axes) == 0: + continue + + nsplits = list(inp.nsplits) + for ax in inp_axes: + cs = len(nsplits[ax]) + if cs > 1: + nsplits[ax] = (np.nan,) * cs + out_nsplits.append(tuple(nsplits)) + + if all(reduce_size == 1 for 
reduce_size in reduce_sizes): + # no need to do shuffle + chunks = [] + for c in inp.chunks: + chunk_op = LearnShuffle( + axes=inp_axes, + seeds=op.seeds[: len(inp_axes)], + output_types=output_types, + ) + params = cls._calc_chunk_params( + c, inp_axes, inp.chunk_shape, oup, output_type, chunk_op, True + ) + out_chunk = chunk_op.new_chunk([c], kws=[params]) + chunks.append(out_chunk) + out_chunks.append(chunks) + continue + + if inp.ndim > 1: + left_chunk_shape = [ + s for ax, s in enumerate(inp.chunk_shape) if ax not in inp_axes + ] + idx_iter = itertools.product(*[range(s) for s in left_chunk_shape]) + else: + idx_iter = [()] + reduce_chunks = [] + out_chunks.append(reduce_chunks) + for idx in idx_iter: + map_chunks = [] + for reducer_inds in itertools.product( + *[range(s) for s in reduce_sizes] + ): + inp_index = list(idx) + for ax, reducer_ind in zip(inp_axes, reducer_inds): + inp_index.insert(ax, reducer_ind) + inp_index = tuple(inp_index) + in_chunk = inp.cix[inp_index] + params = in_chunk.params + map_chunk_op = LearnShuffle( + stage=OperandStage.map, + output_types=output_types, + axes=inp_axes, + seeds=tuple( + mapper_seeds[j][in_chunk.index[ax]] + for j, ax in enumerate(inp_axes) + ), + reduce_sizes=reduce_sizes, + ) + map_chunk = map_chunk_op.new_chunk([in_chunk], **params) + map_chunks.append(map_chunk) + + map_chunk_kw = {} + if output_type == OutputType.tensor: + map_chunk_kw = {"dtype": inp.dtype, "shape": ()} + proxy_chunk = LearnShuffleProxy( + _tileable_keys=[inp.key], output_types=[output_type] + ).new_chunk(map_chunks, **map_chunk_kw) + + reduce_axes = tuple( + ax for j, ax in enumerate(inp_axes) if reduce_sizes[j] > 1 + ) + reduce_sizes_ = tuple(rs for rs in reduce_sizes if rs > 1) + for c in map_chunks: + chunk_op = LearnShuffle( + stage=OperandStage.reduce, + output_types=output_types, + axes=reduce_axes, + seeds=tuple( + reducer_seeds[j][c.index[ax]] + for j, ax in enumerate(inp_axes) + if reduce_sizes[j] > 1 + ), + reduce_sizes=reduce_sizes_, + n_reducers=len(map_chunks), + ) + params = cls._calc_chunk_params( + c, inp_axes, inp.chunk_shape, oup, output_type, chunk_op, False + ) + reduce_chunk = chunk_op.new_chunk([proxy_chunk], kws=[params]) + reduce_chunks.append(reduce_chunk) + + new_op = op.copy() + params = [out.params for out in op.outputs] + if len(out_chunks) < len(op.outputs): + # axes are all higher than its ndim + for i, inp in enumerate(op.inputs): + if all(ax >= inp.ndim for ax in op.axes): + out_chunks.insert(i, inp.chunks) + out_nsplits.insert(i, inp.nsplits) + assert len(out_chunks) == len(op.outputs) + for i, param, chunks, ns in zip( + itertools.count(), params, out_chunks, out_nsplits + ): + param["chunks"] = chunks + param["nsplits"] = ns + param["_position_"] = i + return new_op.new_tileables(op.inputs, kws=params) + + @classmethod + def execute_single(cls, ctx, op): + x = ctx[op.inputs[0].key] + conv = lambda x: x + if op.output_types[0] == OutputType.tensor: + xp = get_array_module(x) + if xp is sparse: + conv = lambda x: x + else: + conv = ( + xp.ascontiguousarray + if op.outputs[0].order.value == "C" + else xp.asfortranarray + ) + + for axis, seed in zip(op.axes, op.seeds): + size = x.shape[axis] + ind = np.random.RandomState(seed).permutation(np.arange(size)) + slc = (slice(None),) * axis + (ind,) + x = _safe_slice(x, slc, op.output_types[0]) + + ctx[op.outputs[0].key] = conv(x) + + @classmethod + def execute_map(cls, ctx, op): + out = op.outputs[0] + x = ctx[op.input.key] + axes, seeds, reduce_sizes = op.axes, op.seeds, op.reduce_sizes 
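+        # Map-phase overview: an axis whose reduce size is 1 (a single chunk
+        # along that shuffle axis) is permuted locally right here, while every
+        # remaining shuffle axis gets a pseudo-random reducer id per position;
+        # the selected slices are then emitted per reducer so that the reduce
+        # phase can concatenate them and apply the final permutation.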
+ if 1 in set(op.reduce_sizes): + # if chunk size on shuffle axis == 0 + inds = [slice(None) for _ in range(x.ndim)] + extra_axes, extra_seeds, extra_reduce_sizes = [], [], [] + for ax, seed, reduce_size in zip(axes, seeds, reduce_sizes): + rs = np.random.RandomState(seed) + if reduce_size == 1: + inds[ax] = rs.permutation(np.arange(x.shape[ax])) + else: + extra_axes.append(ax) + extra_seeds.append(seed) + extra_reduce_sizes.append(reduce_size) + # for the reduce == 1 + # do shuffle on the map phase + x = _safe_slice(x, tuple(inds), op.output_types[0]) + axes, seeds, reduce_sizes = extra_axes, extra_seeds, extra_reduce_sizes + + to_hash_inds = [] + for ax, seed, reduce_size in zip(axes, seeds, reduce_sizes): + rs = np.random.RandomState(seed) + to_hash_inds.append(rs.randint(reduce_size, size=x.shape[ax])) + + for reduce_index in itertools.product(*(range(rs) for rs in reduce_sizes)): + index = list(out.index) + for ax, ind in zip(axes, reduce_index): + index[ax] = ind + selected = x + for ax, to_hash_ind in zip(axes, to_hash_inds): + slc = (slice(None),) * ax + (to_hash_ind == index[ax],) + selected = _safe_slice(selected, slc, op.output_types[0]) + ctx[out.key, tuple(index)] = (ctx.get_current_chunk().index, selected) + + @classmethod + def execute_reduce(cls, ctx, op: "LearnShuffle"): + inputs_grid = np.empty(op.reduce_sizes, dtype=object) + for input_index, inp in op.iter_mapper_data(ctx): + reduce_index = tuple(input_index[ax] for ax in op.axes) + inputs_grid[reduce_index] = inp + ret = cls._concat_grid(inputs_grid, op.axes, op.output_types[0]) + for ax, seed in zip(op.axes, op.seeds): + ind = np.random.RandomState(seed).permutation(np.arange(ret.shape[ax])) + slc = (slice(None),) * ax + (ind,) + ret = _safe_slice(ret, slc, op.output_types[0]) + ctx[op.outputs[0].key] = ret + + @classmethod + def _concat_grid(cls, grid, axes, output_type): + if output_type == OutputType.tensor: + return cls._concat_tensor_grid(grid, axes) + elif output_type == OutputType.dataframe: + return cls._concat_dataframe_grid(grid, axes) + else: + assert output_type == OutputType.series + return cls._concat_series_grid(grid, axes) + + @classmethod + def _concat_dataframe_grid(cls, grid, axes): + xdf = pd if isinstance(grid.ravel()[0], pd.DataFrame) else cudf + # if 1 exists in axes, the shuffle would have been done in map phase + assert len(axes) == 1 + return xdf.concat(grid, axis=axes[0]) + + @classmethod + def _concat_series_grid(cls, grid, axes): + assert axes == (0,) and grid.ndim == 1 + + return reduce(lambda a, b: a.append(b), grid) + + @classmethod + def _concat_tensor_grid(cls, grid, axes): + cur = grid + xp = get_array_module(grid.ravel()[0]) + for ax, i in zip(axes[:0:-1], range(len(axes) - 1, 0, -1)): + new_shape = grid.shape[:i] + new_grid = np.empty(new_shape, dtype=object) + for idx in itertools.product(*(range(s) for s in new_shape)): + new_grid[idx] = xp.concatenate(cur[idx], axis=ax) + cur = new_grid + return xp.concatenate(cur, axis=axes[0]) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls.execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls.execute_reduce(ctx, op) + else: + cls.execute_single(ctx, op) + + +def shuffle(*arrays, **options): + arrays = [convert_to_tensor_or_dataframe(ar) for ar in arrays] + axes = options.pop("axes", (0,)) + if not isinstance(axes, Iterable): + axes = (axes,) + elif not isinstance(axes, tuple): + axes = tuple(axes) + random_state = check_random_state(options.pop("random_state", None)).to_numpy() + if options: 
+ raise TypeError( + f"shuffle() got an unexpected keyword argument {next(iter(options))}" + ) + + max_ndim = max(ar.ndim for ar in arrays) + axes = tuple(np.unique([validate_axis(max_ndim, ax) for ax in axes]).tolist()) + seeds = gen_random_seeds(len(axes), random_state) + + # verify shape + for ax in axes: + shapes = {ar.shape[ax] for ar in arrays if ax < ar.ndim} + if len(shapes) > 1: + raise ValueError(f"arrays do not have same shape on axis {ax}") + + op = LearnShuffle(axes=axes, seeds=seeds, output_types=get_output_types(*arrays)) + shuffled_arrays = op(arrays) + if len(arrays) == 1: + return shuffled_arrays[0] + else: + return ExecutableTuple(shuffled_arrays) diff --git a/python/xorbits/_mars/learn/utils/sparsefuncs.py b/python/xorbits/_mars/learn/utils/sparsefuncs.py new file mode 100644 index 000000000..1033edb51 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/sparsefuncs.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import numpy as np + +from ... import opcodes +from ... import tensor as mt +from ...core import OutputType, recursive_tile +from ...serialization.serializables import Int16Field, ReferenceField +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin + + +class LearnCountNonzero(LearnOperand, LearnOperandMixin): + _op_code_ = opcodes.COUNT_NONZERO + + axis = Int16Field("axis") + sample_weight = ReferenceField("sample_weight") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.sample_weight is not None: + self.sample_weight = inputs[-1] + + def __call__(self, x, sample_weight=None): + self.sample_weight = sample_weight + self._output_types = [ + OutputType.scalar if self.axis is None else OutputType.tensor + ] + dtype = np.dtype(int) + inputs = [x] + if sample_weight is not None: + dtype = sample_weight.dtype + inputs = [x, sample_weight] + + if self.axis is None: + shape = () + else: + shape = (x.shape[1 - self.axis],) + + return self.new_tileable(inputs, shape=shape, dtype=dtype) + + @classmethod + def tile(cls, op: "LearnCountNonzero"): + input_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + if op.sample_weight is not None: + if has_unknown_shape(input_tensor): + yield + sample_weight = yield from recursive_tile( + op.sample_weight.rechunk({0: input_tensor.nsplits[0]}) + ) + else: + sample_weight = None + + chunks = [] + for input_chunk in input_tensor.chunks: + if sample_weight is None: + weight_chunk = None + else: + weight_chunk = sample_weight.cix[(input_chunk.index[0],)] + + new_op = op.copy().reset_key() + new_op.sample_weight = weight_chunk + + inputs = [input_chunk] if not weight_chunk else [input_chunk, weight_chunk] + if op.axis is None: + shape = (1, 1) + elif op.axis == 0: + shape = (1, input_chunk.shape[1]) + else: + shape = (input_chunk.shape[0], 1) + chunks.append( + new_op.new_chunk( + inputs, shape=shape, dtype=out_tensor.dtype, index=input_chunk.index + ) + ) + + new_op = 
op.copy().reset_key() + if op.axis is None: + nsplits = tuple((1,) * len(split) for split in input_tensor.nsplits) + shape = tuple(len(split) for split in input_tensor.nsplits) + elif op.axis == 0: + nsplits = ((1,) * len(input_tensor.nsplits[0]), input_tensor.nsplits[1]) + shape = (len(input_tensor.nsplits[0]), input_tensor.shape[1]) + else: + nsplits = (input_tensor.nsplits[0], (1,) * len(input_tensor.nsplits[1])) + shape = (input_tensor.shape[0], len(input_tensor.nsplits[1])) + + tileable = new_op.new_tileable( + out_tensor.inputs, + chunks=chunks, + nsplits=nsplits, + shape=shape, + dtype=out_tensor.dtype, + ) + return [(yield from recursive_tile(mt.sum(tileable, axis=op.axis)))] + + @classmethod + def execute(cls, ctx, op: "LearnCountNonzero"): + axis = op.axis + X = ctx[op.inputs[0].key] + sample_weight = ( + ctx[op.sample_weight.key] if op.sample_weight is not None else None + ) + + # We rely here on the fact that np.diff(Y.indptr) for a CSR + # will return the number of nonzero entries in each row. + # A bincount over Y.indices will return the number of nonzeros + # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14. + if axis is None: + if sample_weight is None: + res = X.nnz + else: + res = np.dot(np.diff(X.indptr), sample_weight) + elif axis == 1: + out = np.diff(X.indptr) + if sample_weight is None: + # astype here is for consistency with axis=0 dtype + res = out.astype("intp") + else: + res = out * sample_weight + else: + if sample_weight is None: + res = np.bincount(X.indices, minlength=X.shape[1]) + else: + weights = np.repeat(sample_weight, np.diff(X.indptr)) + res = np.bincount(X.indices, minlength=X.shape[1], weights=weights) + if np.isscalar(res): + res = np.array([res]) + out_shape = op.outputs[0].shape + if any(np.isnan(s) for s in out_shape): + new_shape = list(out_shape) + for i, s in enumerate(out_shape): + if np.isnan(s): + new_shape[i] = -1 + out_shape = tuple(new_shape) + ctx[op.outputs[0].key] = res.reshape(out_shape) + + +def count_nonzero(X, axis: Optional[int] = None, sample_weight=None): + """A variant of X.getnnz() with extension to weighting on axis 0 + + Useful in efficiently calculating multilabel metrics. + + Parameters + ---------- + X : CSR sparse matrix of shape (n_samples, n_labels) + Input data. + + axis : None, 0 or 1 + The axis on which the data is aggregated. + + sample_weight : array-like of shape (n_samples,), default=None + Weight for each row of X. + """ + if axis == -1: + axis = 1 + elif axis == -2: + axis = 0 + if axis is not None and axis not in (0, 1): + raise ValueError(f"Unsupported axis: {axis}") + + X = mt.asarray(X) + if sample_weight is not None: + sample_weight = mt.asarray(sample_weight) + + op = LearnCountNonzero(axis=axis) + return op(X, sample_weight=sample_weight) diff --git a/python/xorbits/_mars/learn/utils/tests/__init__.py b/python/xorbits/_mars/learn/utils/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/learn/utils/tests/test_checks.py b/python/xorbits/_mars/learn/utils/tests/test_checks.py new file mode 100644 index 000000000..1bdd8273a --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_checks.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse as sps + +from .... import dataframe as md +from .... import tensor as mt +from ....config import option_context +from ..checks import assert_all_finite, check_non_negative_then_return_value + + +def test_check_non_negative_then_return_value_execution(setup): + raw = np.random.randint(10, size=(10, 5)) + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + + raw = raw.copy() + raw[1, 3] = -1 + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + with pytest.raises(ValueError): + _ = r.execute().fetch() + + raw = sps.random(10, 5, density=0.3, format="csr") + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + result = r.execute().fetch() + np.testing.assert_array_equal(result.toarray(), raw.A) + + raw = raw.copy() + raw[1, 3] = -1 + c = mt.tensor(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + with pytest.raises(ValueError): + _ = r.execute().fetch() + + raw = pd.DataFrame(np.random.rand(10, 4)) + c = md.DataFrame(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + result = r.execute().fetch() + + pd.testing.assert_frame_equal(result, raw) + + raw = raw.copy() + raw.iloc[1, 3] = -1 + c = md.DataFrame(raw, chunk_size=(3, 2)) + + r = check_non_negative_then_return_value(c, c, "sth") + with pytest.raises(ValueError): + _ = r.execute().fetch() + + +def test_assert_all_finite(setup): + raw = np.array([2.3, np.inf], dtype=np.float64) + x = mt.tensor(raw) + + with pytest.raises(ValueError): + r = assert_all_finite(x) + r.execute() + + raw = np.array([2.3, np.nan], dtype=np.float64) + x = mt.tensor(raw) + + with pytest.raises(ValueError): + r = assert_all_finite(x, allow_nan=False) + r.execute() + + max_float32 = np.finfo(np.float32).max + raw = [max_float32] * 2 + assert not np.isfinite(np.sum(raw)) + x = mt.tensor(raw) + + r = assert_all_finite(x) + result = r.execute().fetch() + assert result is True + + raw = np.array([np.nan, "a"], dtype=object) + x = mt.tensor(raw) + + with pytest.raises(ValueError): + r = assert_all_finite(x) + r.execute() + + raw = np.random.rand(10) + x = mt.tensor(raw, chunk_size=2) + + r = assert_all_finite(x, check_only=False) + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + + r = assert_all_finite(x) + result = r.execute().fetch() + assert result is 
True + + with option_context() as options: + options.learn.assume_finite = True + + assert assert_all_finite(x) is None + assert assert_all_finite(x, check_only=False) is x + + # test sparse + s = sps.random( + 10, 3, density=0.1, format="csr", random_state=np.random.RandomState(0) + ) + s[0, 2] = np.nan + + with pytest.raises(ValueError): + r = assert_all_finite(s) + r.execute() diff --git a/python/xorbits/_mars/learn/utils/tests/test_collect_ports.py b/python/xorbits/_mars/learn/utils/tests/test_collect_ports.py new file mode 100644 index 000000000..1642f158b --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_collect_ports.py @@ -0,0 +1,24 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..collect_ports import collect_ports + + +def test_collect_ports(setup_cluster): + session = setup_cluster + workers = [ + pool.external_address for pool in session._session.client._cluster._worker_pools + ] + # make sure assert works inside execution of collect ports + collect_ports(workers * 2).execute(session=session) diff --git a/python/xorbits/_mars/learn/utils/tests/test_core.py b/python/xorbits/_mars/learn/utils/tests/test_core.py new file mode 100644 index 000000000..a6dc71862 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_core.py @@ -0,0 +1,41 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .... import tensor as mt +from ..core import sort_by + + +def test_sort_by(setup): + rs = np.random.RandomState(0) + raw1 = rs.rand(10) + raw2 = rs.rand(10) + raw3 = rs.rand(10) + + a1 = mt.tensor(raw1, chunk_size=4) + a2 = mt.tensor(raw2, chunk_size=4) + a3 = mt.tensor(raw3, chunk_size=4) + + s1, s2 = sort_by([a1, a2], by=a3) + ind = np.argsort(raw3) + e1, e2 = raw1[ind], raw2[ind] + np.testing.assert_array_equal(s1, e1) + np.testing.assert_array_equal(s2, e2) + + s1, s2 = sort_by([a1, a2], by=a2, ascending=False) + ind = np.argsort(raw2)[::-1] + e1, e2 = raw1[ind], raw2[ind] + np.testing.assert_array_equal(s1, e1) + np.testing.assert_array_equal(s2, e2) diff --git a/python/xorbits/_mars/learn/utils/tests/test_extmath.py b/python/xorbits/_mars/learn/utils/tests/test_extmath.py new file mode 100644 index 000000000..48f8c24a2 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_extmath.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from numpy.testing import assert_array_almost_equal + +from ..extmath import softmax + + +@pytest.mark.parametrize("copy", [True, False]) +def test_softmax(setup, copy): + x = [[1, 2, 3], [2, 3, 4]] + ref = [[0.09003057, 0.24472847, 0.66524096], [0.09003057, 0.24472847, 0.66524096]] + x_ = softmax(x, copy=copy) + assert_array_almost_equal(ref, x_) diff --git a/python/xorbits/_mars/learn/utils/tests/test_multiclass.py b/python/xorbits/_mars/learn/utils/tests/test_multiclass.py new file mode 100644 index 000000000..d778ac339 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_multiclass.py @@ -0,0 +1,262 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +import scipy.sparse as sps +from scipy.sparse import csr_matrix +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.estimator_checks import _NotAnArray +from sklearn.utils.multiclass import is_multilabel as sklearn_is_multilabel +from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target + +from .... 
import tensor as mt +from ..multiclass import is_multilabel, type_of_target, unique_labels + +EXAMPLES = { + "multilabel-indicator": [ + # valid when the data is formatted as sparse or dense, identified + # by CSR format when the testing takes place + csr_matrix(np.random.RandomState(42).randint(2, size=(10, 10))), + [[0, 1], [1, 0]], + [[0, 1]], + csr_matrix(np.array([[0, 1], [1, 0]])), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=bool)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.int8)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.uint8)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=float)), + csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32)), + csr_matrix(np.array([[0, 0], [0, 0]])), + csr_matrix(np.array([[0, 1]])), + # Only valid when data is dense + [[-1, 1], [1, -1]], + np.array([[-1, 1], [1, -1]]), + np.array([[-3, 3], [3, -3]]), + _NotAnArray(np.array([[-3, 3], [3, -3]])), + ], + "multiclass": [ + [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], + np.array([1, 0, 2]), + np.array([1, 0, 2], dtype=np.int8), + np.array([1, 0, 2], dtype=np.uint8), + np.array([1, 0, 2], dtype=float), + np.array([1, 0, 2], dtype=np.float32), + np.array([[1], [0], [2]]), + _NotAnArray(np.array([1, 0, 2])), + [0, 1, 2], + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array(["a", "b", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + ], + "multiclass-multioutput": [ + [[1, 0, 2, 2], [1, 4, 2, 4]], + [["a", "b"], ["c", "d"]], + np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]], dtype=object), + np.array([[1, 0, 2]]), + _NotAnArray(np.array([[1, 0, 2]])), + ], + "binary": [ + [0, 1], + [1, 1], + [], + [0], + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), + np.array([[0], [1]]), + _NotAnArray(np.array([[0], [1]])), + [1, -1], + [3, 5], + ["a"], + ["a", "b"], + ["abc", "def"], + np.array(["abc", "def"]), + ["a", "b"], + np.array(["abc", "def"], dtype=object), + ], + "continuous": [ + [1e-5], + [0, 0.5], + np.array([[0], [0.5]]), + np.array([[0], [0.5]], dtype=np.float32), + ], + "continuous-multioutput": [ + np.array([[0, 0.5], [0.5, 0]]), + np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), + np.array([[0, 0.5]]), + ], + "unknown": [ + [[]], + [()], + # sequence of sequences that weren't supported even before deprecation + np.array([np.array([]), np.array([1, 2, 3])], dtype=object), + # [np.array([]), np.array([1, 2, 3])], # deprecated in numpy v1.24 + [{1, 2, 3}, {1, 2}], + [frozenset([1, 2, 3]), frozenset([1, 2])], + # and also confusable as sequences of sequences + [{0: "a", 1: "b"}, {0: "a"}], + # empty second dimension + np.array([[], []]), + # 3d + np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), + ], +} + +NON_ARRAY_LIKE_EXAMPLES = [ + {1, 2, 3}, + {0: "a", 1: "b"}, + {0: [5], 1: [5]}, + "abc", + frozenset([1, 2, 3]), + None, +] + + +def test_unique_labels(setup): + # Empty iterable + with pytest.raises(ValueError): + unique_labels() + + # Multiclass problem + 
assert_array_equal(unique_labels(range(10)), np.arange(10)) + assert_array_equal(unique_labels(np.arange(10)), np.arange(10)) + assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4])) + + # Multilabel indicator + assert_array_equal( + unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3) + ) + + assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3)) + + # Several arrays passed + assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5)) + assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3)) + + # Border line case with binary indicator matrix + with pytest.raises(ValueError): + unique_labels([4, 0, 2], np.ones((5, 5))).execute() + with pytest.raises(ValueError): + unique_labels(np.ones((5, 4)), np.ones((5, 5))).execute() + + assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5)) + + +def test_unique_labels_non_specific(setup): + # Test unique_labels with a variety of collected examples + + # Smoke test for all supported format + for format in ["binary", "multiclass", "multilabel-indicator"]: + for y in EXAMPLES[format]: + unique_labels(y).execute() + + # We don't support those format at the moment + for example in NON_ARRAY_LIKE_EXAMPLES: + with pytest.raises(ValueError): + unique_labels(example).execute() + + for y_type in [ + "unknown", + "continuous", + "continuous-multioutput", + "multiclass-multioutput", + ]: + for example in EXAMPLES[y_type]: + with pytest.raises(ValueError): + unique_labels(example).execute() + + +def test_unique_labels_mixed_types(setup): + # Mix with binary or multiclass and multilabel + mix_clf_format = product( + EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"] + ) + + for y_multilabel, y_multiclass in mix_clf_format: + with pytest.raises(ValueError): + unique_labels(y_multiclass, y_multilabel).execute() + with pytest.raises(ValueError): + unique_labels(y_multilabel, y_multiclass).execute() + + with pytest.raises(ValueError): + unique_labels([[1, 2]], [["a", "d"]]).execute() + + with pytest.raises(ValueError): + unique_labels(["1", 2]).execute() + + with pytest.raises(ValueError): + unique_labels([["1", 2], [1, 3]]).execute() + + with pytest.raises(ValueError): + unique_labels([["1", "2"], [2, 3]]).execute() + + +def test_is_multilabel(setup): + raws = [ + [[1, 2]], + [0, 1, 0, 1], + # [[1], [0, 2], []], # deprecated in numpy v1.24 + np.array([[1, 0], [0, 0]]), + np.array([[1], [0], [0]]), + np.array([[1, 0, 0]]), + np.array([[1.0, 0.0], [0.0, 0.0]]), + sps.csr_matrix([[1, 0], [0, 1]]), + ] + + for raw in raws: + assert is_multilabel(raw).to_numpy() == sklearn_is_multilabel(raw) + + t = mt.tensor(raws[3], chunk_size=1) + assert is_multilabel(t).to_numpy() == sklearn_is_multilabel(raws[3]) + + +def test_type_of_target(setup): + raws = [ + np.array([[0, 1], [0, 0]]), # multilabel + np.random.randint(2, size=(5, 3, 3)), # ndim > 2, unknown + np.array([[]]), # ndim == 2, shape[1] == 0, unknown + np.array([[1, 2], [1, 2]]), + np.array([1, 2, 3]), + np.array([0.1, 0.2, 3]), + np.array([[0.1, 0.2, 3]]), + np.array([[1.0, 0.2]]), + np.array([[1.0, 2.0, 3]]), + np.array([[1, 2]]), + np.array([1, 2]), + np.array([["a"], ["b"]], dtype=object), + [[1, 2]], + [], # empty list + ] + + for raw in raws: + assert type_of_target(raw).to_numpy() == sklearn_type_of_target(raw) + + t = mt.tensor(raws[0], chunk_size=1) + assert type_of_target(t).to_numpy() == sklearn_type_of_target(raws[0]) + + with pytest.raises(ValueError): + 
type_of_target("sth") diff --git a/python/xorbits/_mars/learn/utils/tests/test_shuffle.py b/python/xorbits/_mars/learn/utils/tests/test_shuffle.py new file mode 100644 index 000000000..139df2e25 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_shuffle.py @@ -0,0 +1,161 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import tensor as mt +from ....core import tile +from .. import shuffle +from ..shuffle import LearnShuffle + + +def test_shuffle_expr(): + a = mt.random.rand(10, 3, chunk_size=2) + b = md.DataFrame(mt.random.rand(10, 5), chunk_size=2) + + new_a, new_b = shuffle(a, b, random_state=0) + + assert new_a.op is new_b.op + assert isinstance(new_a.op, LearnShuffle) + assert new_a.shape == a.shape + assert new_b.shape == b.shape + assert b.index_value.key != new_b.index_value.key + + new_a, new_b = tile(new_a, new_b) + + assert len(new_a.chunks) == 10 + assert np.isnan(new_a.chunks[0].shape[0]) + assert len(new_b.chunks) == 15 + assert np.isnan(new_b.chunks[0].shape[0]) + assert new_b.chunks[0].index_value.key != new_b.chunks[1].index_value.key + assert new_a.chunks[0].op.seeds == new_b.chunks[0].op.seeds + + c = mt.random.rand(10, 5, 3, chunk_size=2) + d = md.DataFrame(mt.random.rand(10, 5), chunk_size=(2, 5)) + + new_c, new_d = shuffle(c, d, axes=(0, 1), random_state=0) + + assert new_c.op is new_d.op + assert isinstance(new_c.op, LearnShuffle) + assert new_c.shape == c.shape + assert new_d.shape == d.shape + assert d.index_value.key != new_d.index_value.key + assert not np.all(new_d.dtypes.index[:-1] < new_d.dtypes.index[1:]) + pd.testing.assert_series_equal(d.dtypes, new_d.dtypes.sort_index()) + + new_c, new_d = tile(new_c, new_d) + + assert len(new_c.chunks) == 5 * 1 * 2 + assert np.isnan(new_c.chunks[0].shape[0]) + assert len(new_d.chunks) == 5 + assert np.isnan(new_d.chunks[0].shape[0]) + assert new_d.chunks[0].shape[1] == 5 + assert new_d.chunks[0].index_value.key != new_d.chunks[1].index_value.key + pd.testing.assert_series_equal(new_d.chunks[0].dtypes.sort_index(), d.dtypes) + assert new_c.chunks[0].op.seeds == new_d.chunks[0].op.seeds + assert len(new_c.chunks[0].op.seeds) == 1 + assert new_c.chunks[0].op.reduce_sizes == (5,) + + with pytest.raises(ValueError): + a = mt.random.rand(10, 5) + b = mt.random.rand(10, 4, 3) + shuffle(a, b, axes=1) + + with pytest.raises(TypeError): + shuffle(a, b, unknown_param=True) + + assert isinstance(shuffle(mt.random.rand(10, 5)), mt.Tensor) + + +def _sort(data, axes): + cur = data + for ax in axes: + if ax < data.ndim: + cur = np.sort(cur, axis=ax) + return cur + + +def test_shuffle_execution(setup): + # test consistency + s1 = np.arange(9).reshape(3, 3) + s2 = np.arange(1, 10).reshape(3, 3) + ts1 = mt.array(s1, chunk_size=2) + ts2 = mt.array(s2, chunk_size=2) + + ret = shuffle(ts1, ts2, axes=[0, 1], random_state=0) + res1, res2 = ret.execute().fetch() + + # calc row index + s1_col_0 = 
s1[:, 0].tolist() + rs1_col_0 = [res1[:, i] for i in range(3) if set(s1_col_0) == set(res1[:, i])][0] + row_index = [s1_col_0.index(j) for j in rs1_col_0] + # calc col index + s1_row_0 = s1[0].tolist() + rs1_row_0 = [res1[i] for i in range(3) if set(s1_row_0) == set(res1[i])][0] + col_index = [s1_row_0.index(j) for j in rs1_row_0] + np.testing.assert_array_equal(res2, s2[row_index][:, col_index]) + + # tensor + tensor + raw1 = np.random.rand(10, 15, 20) + t1 = mt.array(raw1, chunk_size=8) + raw2 = np.random.rand(10, 15, 20) + t2 = mt.array(raw2, chunk_size=5) + + for axes in [(0,), (0, 1), (0, 2), (1, 2), (0, 1, 2)]: + ret = shuffle(t1, t2, axes=axes, random_state=0) + res1, res2 = ret.execute().fetch() + + assert res1.shape == raw1.shape + assert res2.shape == raw2.shape + np.testing.assert_array_equal(_sort(raw1, axes), _sort(res1, axes)) + np.testing.assert_array_equal(_sort(raw2, axes), _sort(res2, axes)) + + # tensor + tensor(more dimension) + raw3 = np.random.rand(10, 15) + t3 = mt.array(raw3, chunk_size=(8, 15)) + raw4 = np.random.rand(10, 15, 20) + t4 = mt.array(raw4, chunk_size=(5, 15, 10)) + + for axes in [(1,), (0, 1), (1, 2)]: + ret = shuffle(t3, t4, axes=axes, random_state=0) + res3, res4 = ret.execute().fetch() + + assert res3.shape == raw3.shape + assert res4.shape == raw4.shape + np.testing.assert_array_equal(_sort(raw3, axes), _sort(res3, axes)) + np.testing.assert_array_equal(_sort(raw4, axes), _sort(res4, axes)) + + # tensor + dataframe + series + raw5 = np.random.rand(10, 15, 20) + t5 = mt.array(raw5, chunk_size=8) + t6 = mt.array(raw5[:, 0, 0], chunk_size=6) + raw6 = pd.DataFrame(np.random.rand(10, 15)) + df = md.DataFrame(raw6, chunk_size=(8, 15)) + raw7 = pd.Series(np.random.rand(10)) + series = md.Series(raw7, chunk_size=8) + + for axes in [(0,), (1,), (0, 1), (1, 2), [0, 1, 2]]: + ret = shuffle(t5, df, series, t6, axes=axes, random_state=0) + # skip check nsplits because it's updated + res5, res_df, res_series, res6 = ret.execute( + extra_config={"check_nsplits": False} + ).fetch(extra_config={"check_nsplits": False}) + + assert res5.shape == raw5.shape + assert res_df.shape == df.shape + assert res_series.shape == series.shape + assert res6.shape == (raw5.shape[0],) diff --git a/python/xorbits/_mars/learn/utils/tests/test_sparsefuncs.py b/python/xorbits/_mars/learn/utils/tests/test_sparsefuncs.py new file mode 100644 index 000000000..6952d3763 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_sparsefuncs.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sp + +from .... 
import tensor as mt +from ..sparsefuncs import count_nonzero + + +def test_count_nonzero(setup): + X = np.array( + [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64 + ) + X_nonzero = X != 0 + + X_csr = sp.csr_matrix(X) + X_csr_t = mt.tensor(X_csr, chunk_size=3) + + sample_weight = [0.5, 0.2, 0.3, 0.1, 0.1] + X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None] + + for axis in [0, 1, -1, -2, None]: + np.testing.assert_array_almost_equal( + count_nonzero(X_csr_t, axis=axis).execute().fetch(), + X_nonzero.sum(axis=axis), + ) + np.testing.assert_array_almost_equal( + count_nonzero(X_csr_t, axis=axis, sample_weight=sample_weight) + .execute() + .fetch(), + X_nonzero_weighted.sum(axis=axis), + ) + + with pytest.raises(ValueError): + count_nonzero(X_csr_t, axis=2).execute() + + assert count_nonzero(X_csr_t, axis=0).dtype == count_nonzero(X_csr_t, axis=1).dtype + assert ( + count_nonzero(X_csr_t, axis=0, sample_weight=sample_weight).dtype + == count_nonzero(X_csr_t, axis=1, sample_weight=sample_weight).dtype + ) + + # Check dtypes with large sparse matrices too + # XXX: test fails on 32bit (Windows/Linux) + try: + X_csr.indices = X_csr.indices.astype(np.int64) + X_csr.indptr = X_csr.indptr.astype(np.int64) + X_csr_t = mt.tensor(X_csr, chunk_size=3) + + assert ( + count_nonzero(X_csr_t, axis=0).dtype == count_nonzero(X_csr_t, axis=1).dtype + ) + assert ( + count_nonzero(X_csr_t, axis=0, sample_weight=sample_weight).dtype + == count_nonzero(X_csr_t, axis=1, sample_weight=sample_weight).dtype + ) + except TypeError as e: + assert "according to the rule 'safe'" in e.args[0] and np.intp().nbytes < 8, e diff --git a/python/xorbits/_mars/learn/utils/tests/test_validation.py b/python/xorbits/_mars/learn/utils/tests/test_validation.py new file mode 100644 index 000000000..f9283de69 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/tests/test_validation.py @@ -0,0 +1,256 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product + +import numpy as np +import pytest +import scipy.sparse as sp +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import assert_raise_message, assert_raises_regex +from sklearn.utils.estimator_checks import _NotAnArray + +from .... import dataframe as md +from .... import tensor as mt +from ....tensor.core import Tensor +from ..validation import check_array, check_consistent_length + + +def test_ordering(): + # Check that ordering is enforced correctly by validation utilities. + # We need to check each validation utility, because a 'copy' without + # 'order=K' will kill the ordering. 
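+    # check_array(..., order="C") should hand back a C-contiguous tensor and
+    # order="F" an F-contiguous one, for both X and its transpose, whether or
+    # not a copy is requested.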
+ X = mt.ones((10, 5)) + for A in X, X.T: + for copy in (True, False): + B = check_array(A, order="C", copy=copy) + assert B.flags["C_CONTIGUOUS"] is True + B = check_array(A, order="F", copy=copy) + assert B.flags["F_CONTIGUOUS"] is True + if copy: + assert A is not B + + +def test_check_array(setup): + # accept_sparse == False + # raise error on sparse inputs + X = [[1, 2], [3, 4]] + X_csr = sp.csr_matrix(X) + with pytest.raises(TypeError): + check_array(X_csr) + X_csr = mt.tensor(sp.csr_matrix(X)) + with pytest.raises(TypeError): + check_array(X_csr) + # ensure_2d=False + X_array = check_array([0, 1, 2], ensure_2d=False) + assert X_array.ndim == 1 + # ensure_2d=True with 1d array + assert_raise_message( + ValueError, + "Expected 2D array, got 1D array instead", + check_array, + [0, 1, 2], + ensure_2d=True, + ) + assert_raise_message( + ValueError, + "Expected 2D array, got 1D array instead", + check_array, + mt.tensor([0, 1, 2]), + ensure_2d=True, + ) + # ensure_2d=True with scalar array + assert_raise_message( + ValueError, + "Expected 2D array, got scalar array instead", + check_array, + 10, + ensure_2d=True, + ) + # don't allow ndim > 3 + X_ndim = mt.arange(8).reshape(2, 2, 2) + with pytest.raises(ValueError): + check_array(X_ndim) + check_array(X_ndim, allow_nd=True) # doesn't raise + + # dtype and order enforcement. + X_C = mt.arange(4).reshape(2, 2).copy("C") + X_F = X_C.copy("F") + X_int = X_C.astype(mt.int) + X_float = X_C.astype(mt.float) + Xs = [X_C, X_F, X_int, X_float] + dtypes = [mt.int32, mt.int, mt.float, mt.float32, None, mt.bool, object] + orders = ["C", "F", None] + copy_flags = [True, False] + + for X, dtype, order, copy in product(Xs, dtypes, orders, copy_flags): + X_checked = check_array( + X, dtype=dtype, order=order, copy=copy, force_all_finite=False + ) + if dtype is not None: + assert X_checked.dtype == dtype + else: + assert X_checked.dtype == X.dtype + if order == "C": + assert X_checked.flags["C_CONTIGUOUS"] + assert not X_checked.flags["F_CONTIGUOUS"] + elif order == "F": + assert X_checked.flags["F_CONTIGUOUS"] + assert not X_checked.flags["C_CONTIGUOUS"] + if copy: + assert X is not X_checked + else: + # doesn't copy if it was already good + if ( + X.dtype == X_checked.dtype + and X_checked.flags["C_CONTIGUOUS"] == X.flags["C_CONTIGUOUS"] + and X_checked.flags["F_CONTIGUOUS"] == X.flags["F_CONTIGUOUS"] + ): + assert X is X_checked + + # other input formats + # convert lists to arrays + X_dense = check_array([[1, 2], [3, 4]]) + assert isinstance(X_dense, Tensor) + # raise on too deep lists + with pytest.raises(ValueError): + check_array(X_ndim.to_numpy().tolist()) + check_array(X_ndim.to_numpy().tolist(), allow_nd=True) # doesn't raise + # convert weird stuff to arrays + X_no_array = _NotAnArray(X_dense.to_numpy()) + result = check_array(X_no_array) + assert isinstance(result, Tensor) + + # deprecation warning if string-like array with dtype="numeric" + expected_warn_regex = r"converted to decimal numbers if dtype='numeric'" + X_str = [["11", "12"], ["13", "xx"]] + for X in [X_str, mt.array(X_str, dtype="U"), mt.array(X_str, dtype="S")]: + with pytest.warns(FutureWarning, match=expected_warn_regex): + check_array(X, dtype="numeric") + + # deprecation warning if byte-like array with dtype="numeric" + X_bytes = [[b"a", b"b"], [b"c", b"d"]] + for X in [X_bytes, mt.array(X_bytes, dtype="V1")]: + with pytest.warns(FutureWarning, match=expected_warn_regex): + check_array(X, dtype="numeric") + + # test finite + X = [[1.0, np.nan], [2.0, 3.0]] + with 
pytest.raises(ValueError): + _ = check_array(X).execute() + + +def test_check_array_pandas_dtype_object_conversion(): + # test that data-frame like objects with dtype object + # get converted + X = mt.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=mt.object) + X_df = MockDataFrame(X) + assert check_array(X_df).dtype.kind == "f" + assert check_array(X_df, ensure_2d=False).dtype.kind == "f" + # smoke-test against dataframes with column named "dtype" + X_df.dtype = "Hans" + assert check_array(X_df, ensure_2d=False).dtype.kind == "f" + + +def test_check_array_from_dataframe(): + X = md.DataFrame({"a": [1.0, 2.0, 3.0]}) + assert check_array(X).dtype.kind == "f" + + +def test_check_array_accept_sparse_type_exception(): + X = [[1, 2], [3, 4]] + X_csr = sp.csr_matrix(X) + + msg = ( + "A sparse tensor was passed, but dense data is required. " + "Use X.todense() to convert to a dense tensor." + ) + assert_raise_message(TypeError, msg, check_array, X_csr, accept_sparse=False) + + msg = ( + "When providing 'accept_sparse' as a tuple or list, " + "it must contain at least one string value." + ) + assert_raise_message( + ValueError, msg.format([]), check_array, X_csr, accept_sparse=[] + ) + assert_raise_message( + ValueError, msg.format(()), check_array, X_csr, accept_sparse=() + ) + + with pytest.raises(ValueError): + check_array(X_csr, accept_sparse=object) + + +def test_check_array_accept_sparse_no_exception(): + X = [[1, 2], [3, 4]] + X_csr = sp.csr_matrix(X) + + array = check_array(X_csr, accept_sparse=True) + assert isinstance(array, Tensor) + assert array.issparse() is True + + +def test_check_array_min_samples_and_features_messages(): + # empty list is considered 2D by default: + msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required." + assert_raise_message(ValueError, msg, check_array, [[]]) + + # If considered a 1D collection when ensure_2d=False, then the minimum + # number of samples will break: + msg = "0 sample(s) (shape=(0,)) while a minimum of 1 is required." + assert_raise_message(ValueError, msg, check_array, [], ensure_2d=False) + + # Invalid edge case when checking the default minimum sample of a scalar + msg = "Singleton array array(42) cannot be considered a valid collection." 
+ assert_raise_message(TypeError, msg, check_array, 42, ensure_2d=False) + + +def test_check_array_complex_data_error(): + X = mt.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # list of lists + X = [[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]] + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # tuple of tuples + X = ((1 + 2j, 3 + 4j, 5 + 7j), (2 + 3j, 4 + 5j, 6 + 7j)) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # list of np arrays + X = [mt.array([1 + 2j, 3 + 4j, 5 + 7j]), mt.array([2 + 3j, 4 + 5j, 6 + 7j])] + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # tuple of np arrays + X = (mt.array([1 + 2j, 3 + 4j, 5 + 7j]), mt.array([2 + 3j, 4 + 5j, 6 + 7j])) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # dataframe + X = MockDataFrame(mt.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]])) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + # sparse matrix + X = sp.coo_matrix([[0, 1 + 2j], [0, 0]]) + assert_raises_regex(ValueError, "Complex data not supported", check_array, X) + + +def test_check_consistent_length(setup): + t = mt.random.RandomState(0).rand(10, 5) + t2 = t[t[:, 0] < 0.5] + t3 = t[t[:, 1] < 0.1] + + check_consistent_length(t2, t2.copy()) + with pytest.raises(ValueError): + check_consistent_length(t2, t3) diff --git a/python/xorbits/_mars/learn/utils/validation.py b/python/xorbits/_mars/learn/utils/validation.py new file mode 100644 index 000000000..c2cd68792 --- /dev/null +++ b/python/xorbits/_mars/learn/utils/validation.py @@ -0,0 +1,727 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +import warnings + +import numpy as np +from numpy.core.numeric import ComplexWarning + +try: + from sklearn.exceptions import DataConversionWarning + from sklearn.utils.validation import check_is_fitted +except ImportError: # pragma: no cover + check_is_fitted = None + DataConversionWarning = UserWarning + +from ... import dataframe as md +from ... import tensor as mt +from ...core import ExecutableTuple +from ...lib.sparse import issparse +from ...tensor import Tensor +from .checks import ( + AssertAllFinite, + assert_all_finite, + check_non_negative_then_return_value, +) + +FLOAT_DTYPES = (mt.float64, mt.float32, mt.float16) + +# --------------------------------------------------------- +# Original implementation is in `sklearn.utils.validation`. +# --------------------------------------------------------- + +assert_all_finite = _assert_all_finite = assert_all_finite + + +def _num_samples(x): + """Return number of samples in array-like x.""" + if hasattr(x, "fit") and callable(x.fit): + # Don't get num_samples from an ensembles length! 
+ raise TypeError(f"Expected sequence or array-like, got estimator {x}") + if not hasattr(x, "__len__") and not hasattr(x, "shape"): + if hasattr(x, "__array__"): + x = mt.asarray(x) + else: + raise TypeError(f"Expected sequence or array-like, got {type(x)}") + if hasattr(x, "shape"): + if len(x.shape) == 0: + if isinstance(x.op, AssertAllFinite): + x = x.op.x + if hasattr(x.op, "data") and x.op.data is not None: + x = np.asarray(x.op.data) + raise TypeError( + f"Singleton array {x!r} cannot be considered a valid collection." + ) + # Check that shape is returning an integer or default to len + if isinstance(x.shape[0], numbers.Integral): + return x.shape[0] + elif np.isnan(x.shape[0]): + return x.shape[0] + else: + return len(x) + else: + return len(x) + + +def check_consistent_length(*arrays, session=None, run_kwargs=None): + """Check that all arrays have consistent first dimensions. + + Checks whether all objects in arrays have the same shape or length. + + Parameters + ---------- + *arrays : list or tuple of input objects. + Objects that will be checked for consistent length. + """ + + new_arrays = [] + lengths = [] + to_execute = [] + for X in arrays: + if X is not None: + n = _num_samples(X) + if np.isnan(n): + to_execute.append(X) + new_arrays.append(X) + lengths.append(n) + # unknown length exists + if len(to_execute) > 0: + # update shape + ExecutableTuple(to_execute).execute(session=session, **(run_kwargs or dict())) + # get length again + lengths = [_num_samples(X) for X in new_arrays] + + uniques = np.unique(lengths) + if len(uniques) > 1: + raise ValueError( + "Found input variables with inconsistent numbers of" + f" samples: {[int(length) for length in lengths]}" + ) + + +def _make_indexable(iterable): + """Ensure iterable supports indexing or convert to an indexable variant. + + Convert sparse matrices to csr and other non-indexable iterable to arrays. + Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged. + + Parameters + ---------- + iterable : {list, dataframe, array, sparse} or None + Object to be converted to an indexable iterable. + """ + if issparse(iterable): + return mt.tensor(iterable) + elif hasattr(iterable, "iloc"): + if iterable.ndim == 1: + return md.Series(iterable) + else: + return md.DataFrame(iterable) + elif hasattr(iterable, "__getitem__"): + return mt.tensor(iterable) + elif iterable is None: + return iterable + return mt.tensor(iterable) + + +def indexable(*iterables, session=None, run_kwargs=None): + """Make arrays indexable for cross-validation. + + Checks consistent length, passes through None, and ensures that everything + can be indexed by converting sparse matrices to csr and converting + non-interable objects to arrays. + + Parameters + ---------- + *iterables : lists, dataframes, arrays, sparse matrices + List of objects to ensure sliceability. + """ + result = [_make_indexable(X) for X in iterables] + check_consistent_length(*result, session=session, run_kwargs=run_kwargs) + return result + + +def _ensure_no_complex_data(array): + if ( + hasattr(array, "dtype") + and array.dtype is not None + and hasattr(array.dtype, "kind") + and array.dtype.kind == "c" + ): + raise ValueError(f"Complex data not supported\n{array}\n") + + +def _ensure_sparse_format( + spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse +): + """Convert a sparse matrix to a given format. + + Checks the sparse format of spmatrix and converts if necessary. 
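The helpers defined above (`indexable`, `check_consistent_length`) are the entry points most callers use; a minimal usage sketch, assuming the vendored package is importable as `xorbits._mars` and a Mars session is available as in the tests earlier in this diff:

```python
import numpy as np
import scipy.sparse as sp

from xorbits._mars import tensor as mt
from xorbits._mars.learn.utils.validation import check_consistent_length, indexable

# Mixed inputs: a plain list, a scipy sparse matrix and a Mars tensor.
X_list = [[1, 2], [3, 4], [5, 6]]
X_sparse = sp.csr_matrix(np.eye(3))
y = mt.tensor([0, 1, 0], chunk_size=2)

# indexable() converts each input to a sliceable Mars object and verifies
# that the first dimensions agree (3 samples each here).
X_list, X_sparse, y = indexable(X_list, X_sparse, y)

# check_consistent_length() on its own raises ValueError on a mismatch.
check_consistent_length(y, mt.zeros(3))    # passes
# check_consistent_length(y, mt.zeros(4))  # would raise ValueError
```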
+ + Parameters + ---------- + spmatrix : scipy sparse matrix + Input to validate and convert. + + accept_sparse : string, boolean or list/tuple of strings + String[s] representing allowed sparse matrix formats ('csc', + 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but + not in the allowed format, it will be converted to the first listed + format. True allows the input to be any format. False means + that a sparse matrix input will raise an error. + + dtype : string, type or None + Data type of result. If None, the dtype of the input is preserved. + + copy : boolean + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. The possibilities + are: + + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. + - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + Returns + ------- + spmatrix_converted : scipy sparse matrix. + Matrix that is ensured to have an allowed type. + """ + if dtype is None: + dtype = spmatrix.dtype + + changed_format = False + + if isinstance(accept_sparse, str): + accept_sparse = [accept_sparse] + + # Indices dtype validation + # _check_large_sparse(spmatrix, accept_large_sparse) + + if accept_sparse is False: + raise TypeError( + "A sparse tensor was passed, but dense " + "data is required. Use X.todense() to " + "convert to a dense tensor." + ) + elif isinstance(accept_sparse, (list, tuple)): + if len(accept_sparse) == 0: + raise ValueError( + "When providing 'accept_sparse' " + "as a tuple or list, it must contain at " + "least one string value." + ) + # # ensure correct sparse format + # if spmatrix.format not in accept_sparse: + # # create new with correct sparse + # spmatrix = spmatrix.asformat(accept_sparse[0]) + # changed_format = True + elif accept_sparse is not True: + # any other type + raise ValueError( + "Parameter 'accept_sparse' should be a string, " + "boolean or list of strings. You provided " + f"'accept_sparse={accept_sparse}'." + ) + + if dtype != spmatrix.dtype: + # convert dtype + spmatrix = spmatrix.astype(dtype) + elif copy and not changed_format: + # force copy + spmatrix = spmatrix.copy() + + if force_all_finite: + spmatrix = assert_all_finite( + spmatrix, allow_nan=force_all_finite == "allow-nan", check_only=False + ) + + return spmatrix + + +def check_array( + array, + accept_sparse=False, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + ensure_min_samples=1, + ensure_min_features=1, + estimator=None, +) -> Tensor: + """Input validation on a tensor, list, sparse matrix or similar. + + By default, the input is checked to be a non-empty 2D array containing + only finite values. If the dtype of the tensor is object, attempt + converting to float, raising on failure. + + Parameters + ---------- + array : object + Input object to check / convert. + + accept_sparse : string, boolean or list/tuple of strings (default=False) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. 
+ + accept_large_sparse : bool (default=True) + If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by + accept_sparse, accept_large_sparse=False will cause it to be accepted + only if its indices are stored with a 32-bit dtype. + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether a tenor will be forced to be fortran or c-style. + When order is None (default), then if copy=False, nothing is ensured + about the memory layout of the output tensor; otherwise (copy=True) + the memory layout of the returned tensor is kept as close as possible + to the original tensor. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in tensor. The + possibilities are: + + - True: Force all values of tensor to be finite. + - False: accept both np.inf and np.nan in tensor. + - 'allow-nan': accept only np.nan values in tensor. Values cannot + be infinite. + + For object dtyped data, only np.nan is checked and not np.inf. + + ensure_2d : boolean (default=True) + Whether to raise a value error if tensor is not 2D. + + allow_nd : boolean (default=False) + Whether to allow tensor.ndim > 2. + + ensure_min_samples : int (default=1) + Make sure that the tensor has a minimum number of samples in its first + axis (rows for a 2D tensor). Setting to 0 disables this check. + + ensure_min_features : int (default=1) + Make sure that the 2D tensor has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when the input data has effectively 2 + dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 + disables this check. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + array_converted : object + The converted and validated tensor. + """ + + # store whether originally we wanted numeric dtype + dtype_numeric = isinstance(dtype, str) and dtype == "numeric" + + dtype_orig = getattr(array, "dtype", None) + if not hasattr(dtype_orig, "kind"): + # not a data type (e.g. a column named dtype in a pandas DataFrame) + dtype_orig = None + + if dtype_numeric: + if dtype_orig is not None and dtype_orig.kind == "O": + # if input is object, convert to float. + dtype = np.float64 + else: + dtype = None + + if isinstance(dtype, (list, tuple)): + if dtype_orig is not None and dtype_orig in dtype: + # no dtype conversion required + dtype = None + else: + # dtype conversion required. Let's select the first element of the + # list of accepted types. + dtype = dtype[0] + + if force_all_finite not in (True, False, "allow-nan"): + raise ValueError( + 'force_all_finite should be a bool or "allow-nan"' + f". 
Got {force_all_finite!r} instead" + ) + + if estimator is not None: + if isinstance(estimator, str): + estimator_name = estimator + else: + estimator_name = estimator.__class__.__name__ + else: + estimator_name = "Estimator" + context = f" by {estimator_name}" if estimator is not None else "" + + if (hasattr(array, "issparse") and array.issparse()) or issparse(array): + _ensure_no_complex_data(array) + array = mt.asarray(array) + array = _ensure_sparse_format( + array, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + force_all_finite=force_all_finite, + accept_large_sparse=accept_large_sparse, + ) + else: + # If np.array(..) gives ComplexWarning, then we convert the warning + # to an error. This is needed because specifying a non complex + # dtype to the function converts complex to real dtype, + # thereby passing the test made in the lines following the scope + # of warnings context manager. + with warnings.catch_warnings(): + try: + warnings.simplefilter("error", ComplexWarning) + array = mt.asarray(array, dtype=dtype, order=order) + except ComplexWarning: + raise ValueError(f"Complex data not supported\n{array}\n") + + # It is possible that the np.array(..) gave no warning. This happens + # when no dtype conversion happened, for example dtype = None. The + # result is that np.array(..) produces an array of complex dtype + # and we need to catch and raise exception for such cases. + _ensure_no_complex_data(array) + + if ensure_2d: + # If input is scalar raise error + if array.ndim == 0: + raise ValueError( + f"Expected 2D array, got scalar array instead:\narray={array}.\n" + "Reshape your data either using array.reshape(-1, 1) if " + "your data has a single feature or array.reshape(1, -1) " + "if it contains a single sample." + ) + # If input is 1D raise error + if array.ndim == 1: + raise ValueError( + f"Expected 2D array, got 1D array instead:\narray={array}.\n" + "Reshape your data either using array.reshape(-1, 1) if " + "your data has a single feature or array.reshape(1, -1) " + "if it contains a single sample." + ) + + # in the future np.flexible dtypes will be handled like object dtypes + if dtype_numeric and np.issubdtype(array.dtype, np.flexible): + warnings.warn( + "Beginning in version 0.22, arrays of bytes/strings will be " + "converted to decimal numbers if dtype='numeric'. " + "It is recommended that you convert the array to " + "a float dtype before using it in scikit-learn, " + "for example by using " + "your_array = your_array.astype(np.float64).", + FutureWarning, + ) + + # make sure we actually converted to numeric: + if dtype_numeric and array.dtype.kind == "O": + array = array.astype(np.float64) + if not allow_nd and array.ndim >= 3: + raise ValueError( + "Found array with dim %d. %s expected <= 2." + % (array.ndim, estimator_name) + ) + if force_all_finite: + array = _assert_all_finite( + array, allow_nan=force_all_finite == "allow-nan", check_only=False + ) + + if ensure_min_samples > 0: + n_samples = _num_samples(array) + if n_samples < ensure_min_samples: + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required%s." + % (n_samples, array.shape, ensure_min_samples, context) + ) + + if ensure_min_features > 0 and array.ndim == 2: + n_features = array.shape[1] + if n_features < ensure_min_features: + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required%s." 
+ % (n_features, array.shape, ensure_min_features, context) + ) + + if copy: + array = mt.array(array, dtype=dtype, order=order) + + return array + + +def check_X_y( + X, + y, + accept_sparse=False, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + multi_output=False, + ensure_min_samples=1, + ensure_min_features=1, + y_numeric=False, + estimator=None, +): + """Input validation for standard estimators. + + Checks X and y for consistent length, enforces X to be 2D and y 1D. By + default, X is checked to be non-empty and containing only finite values. + Standard input checks are also applied to y, such as checking that y + does not have np.nan or np.inf targets. For multi-label y, set + multi_output=True to allow 2D and sparse y. If the dtype of X is + object, attempt converting to float, raising on failure. + + Parameters + ---------- + X : tensor, list or sparse tensor + Input data. + + y : tensor, list or sparse tensor + Labels. + + accept_sparse : string, boolean or list of string (default=False) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + accept_large_sparse : bool (default=True) + If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by + accept_sparse, accept_large_sparse will cause it to be accepted only + if its indices are stored with a 32-bit dtype. + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. + - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + ensure_2d : boolean (default=True) + Whether to raise a value error if X is not 2D. + + allow_nd : boolean (default=False) + Whether to allow X.ndim > 2. + + multi_output : boolean (default=False) + Whether to allow 2D y (array or sparse matrix). If false, y will be + validated as a vector. y cannot have np.nan or np.inf values if + multi_output=True. + + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + y_numeric : boolean (default=False) + Whether to ensure that y has a numeric type. If dtype of y is object, + it is converted to float64. 
Should only be used for regression + algorithms. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + X_converted : object + The converted and validated X. + + y_converted : object + The converted and validated y. + """ + if y is None: + raise ValueError("y cannot be None") + + X = check_array( + X, + accept_sparse=accept_sparse, + accept_large_sparse=accept_large_sparse, + dtype=dtype, + order=order, + copy=copy, + force_all_finite=force_all_finite, + ensure_2d=ensure_2d, + allow_nd=allow_nd, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + estimator=estimator, + ) + if multi_output: + y = check_array(y, True, force_all_finite=True, ensure_2d=False, dtype=None) + else: + y = column_or_1d(y, warn=True) + y = _assert_all_finite(y, check_only=False) + if y_numeric and y.dtype.kind == "O": + y = y.astype(np.float64) + + check_consistent_length(X, y) + + return X, y + + +def check_non_negative(X, whom): + """ + Check if there is any negative value in a tensor. + + Parameters + ---------- + X : array-like or sparse matrix + Input data. + + whom : string + Who passed X to this function. + """ + return check_non_negative_then_return_value(X, X, whom) + + +def column_or_1d(y, warn=False): + """Ravel column or 1d numpy array, else raises an error + + Parameters + ---------- + y : array-like + + warn : boolean, default False + To control display of warnings. + + Returns + ------- + y : array + + """ + y = mt.tensor(y) + shape = y.shape + if len(shape) == 1: + return mt.ravel(y) + if len(shape) == 2 and shape[1] == 1: + if warn: + warnings.warn( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples, ), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) + return mt.ravel(y) + + raise ValueError( + "y should be a 1d array, got an array of shape {} instead.".format(shape) + ) + + +check_is_fitted = check_is_fitted + + +def _check_sample_weight(sample_weight, X, dtype=None): + """Validate sample weights. + + Note that passing sample_weight=None will output an array of ones. + Therefore, in some cases, you may want to protect the call with: + if sample_weight is not None: + sample_weight = _check_sample_weight(...) + + Parameters + ---------- + sample_weight : {ndarray, Number or None}, shape (n_samples,) + Input sample weights. + + X : nd-array, list or sparse matrix + Input data. + + dtype: dtype + dtype of the validated `sample_weight`. + If None, and the input `sample_weight` is an array, the dtype of the + input is preserved; otherwise an array with the default numpy dtype + is be allocated. If `dtype` is not one of `float32`, `float64`, + `None`, the output will be of dtype `float64`. + + Returns + ------- + sample_weight : ndarray, shape (n_samples,) + Validated sample weight. It is guaranteed to be "C" contiguous. 
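A short sketch of the three input forms described above for `sample_weight` (import path assumed to be `xorbits._mars.learn.utils.validation`; illustrative only):

```python
import numpy as np

from xorbits._mars import tensor as mt
from xorbits._mars.learn.utils.validation import _check_sample_weight

X = mt.random.rand(5, 3)

# None -> one weight of 1.0 per sample.
w = _check_sample_weight(None, X, dtype=np.float64)   # shape (5,), all ones

# A scalar -> that value broadcast to every sample.
w = _check_sample_weight(0.5, X)                       # shape (5,), all 0.5

# An array-like is validated: it must be 1D and match X's first dimension.
w = _check_sample_weight([1, 2, 3, 4, 5], X)
# _check_sample_weight([1, 2, 3], X)  # would raise ValueError (shape mismatch)
```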
+ """ + n_samples = _num_samples(X) + + if dtype is not None and dtype not in [np.float32, np.float64]: + dtype = np.float64 + + if sample_weight is None or isinstance(sample_weight, numbers.Number): + if sample_weight is None: + sample_weight = mt.ones(n_samples, dtype=dtype) + else: + sample_weight = mt.full(n_samples, sample_weight, dtype=dtype) + else: + if dtype is None: + dtype = [np.float64, np.float32] + sample_weight = check_array( + sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, order="C" + ) + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError( + f"sample_weight.shape == {sample_weight.shape}, " + f"expected {(n_samples,)}!" + ) + return sample_weight diff --git a/python/xorbits/_mars/learn/wrappers.py b/python/xorbits/_mars/learn/wrappers.py new file mode 100644 index 000000000..79ef22e8d --- /dev/null +++ b/python/xorbits/_mars/learn/wrappers.py @@ -0,0 +1,341 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Union + +import numpy as np +from sklearn.base import BaseEstimator as SklearnBaseEstimator +from sklearn.base import ClassifierMixin as SklearnClassifierMixin +from sklearn.base import MetaEstimatorMixin +from sklearn.base import RegressorMixin as SklearnRegressorMixin + +from .. import remote as mr +from .. import tensor as mt +from .base import BaseEstimator, ClassifierMixin, RegressorMixin +from .metrics import get_scorer +from .utils import check_array, copy_learned_attributes + + +def _wrap(estimator: SklearnBaseEstimator, method, X, y, **kwargs): + return getattr(estimator, method)(X, y, **kwargs) + + +class ParallelPostFit(BaseEstimator, MetaEstimatorMixin): + """ + Meta-estimator for parallel predict and transform. + + Parameters + ---------- + estimator : Estimator + The underlying estimator that is fit. + + scoring : string or callable, optional + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + + For evaluating multiple metrics, either give a list of (unique) + strings or a dict with names as keys and callables as values. + + NOTE that when using custom scorers, each scorer should return a + single value. Metric functions returning a list/array of values + can be wrapped into multiple scorers that return one value each. + + See :ref:`multimetric_grid_search` for an example. + + .. warning:: + + If None, the estimator's default scorer (if available) is used. + Most scikit-learn estimators will convert large Mars tensors to + a single NumPy array, which may exhaust the memory of your worker. + You probably want to always specify `scoring`. + + Notes + ----- + + .. warning:: + + This class is not appropriate for parallel or distributed *training* + on large datasets. For that, see :class:`Incremental`, which provides + distributed (but sequential) training. 
If you're doing distributed + hyperparameter optimization on larger-than-memory datasets, see + :class:`mars.learn.model_selection.IncrementalSearch`. + + This estimator does not parallelize the training step. This simply calls + the underlying estimators's ``fit`` method called and copies over the + learned attributes to ``self`` afterwards. + + It is helpful for situations where your training dataset is relatively + small (fits on a single machine) but you need to predict or transform + a much larger dataset. ``predict``, ``predict_proba`` and ``transform`` + will be done in parallel (potentially distributed if you've connected + to a Mars cluster). + + Note that many scikit-learn estimators already predict and transform in + parallel. This meta-estimator may still be useful in those cases when your + dataset is larger than memory, as the distributed scheduler will ensure the + data isn't all read into memory at once. + + See Also + -------- + Incremental + mars.learn.model_selection.IncrementalSearch + + Examples + -------- + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> from sklearn.datasets import make_classification + >>> import mars.tensor as mt + >>> from mars.learn.wrappers import ParallelPostFit + + Make a small 1,000 sample 2 training dataset and fit normally. + + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> clf = ParallelPostFit(estimator=GradientBoostingClassifier(), + ... scoring='accuracy') + >>> clf.fit(X, y) + ParallelPostFit(estimator=GradientBoostingClassifier(...)) + + >>> clf.classes_ + array([0, 1]) + + Transform and predict return Mars outputs for Mars inputs. + + >>> X_big, y_big = make_classification(n_samples=100000, + random_state=0) + >>> X_big, y_big = mt.tensor(X_big), mt.tensor(y_big) + >>> clf.predict(X_big) + array([1, 0, 0, ..., 1, 0, 0]) + + Which can be computed in parallel. + + >>> clf.predict_proba(X_big) + array([[0.01780031, 0.98219969], + [0.62199242, 0.37800758], + [0.89059934, 0.10940066], + ..., + [0.03249968, 0.96750032], + [0.951434 , 0.048566 ], + [0.99527114, 0.00472886]]) + """ + + def __init__( + self, + estimator: SklearnBaseEstimator = None, + scoring: Union[str, Callable] = None, + ): + self.estimator = estimator + self.scoring = scoring + + def _make_fit(self, method): + def _fit(X, y=None, **kwargs): + result = ( + mr.spawn( + _wrap, + args=(self.estimator, method, X, y), + kwargs=kwargs, + resolve_tileable_input=True, + ) + .execute() + .fetch() + ) + + copy_learned_attributes(result, self) + copy_learned_attributes(result, self.estimator) + return self + + return _fit + + def fit(self, X, y=None, **kwargs): + """ + Fit the underlying estimator. + + Parameters + ---------- + X, y : array-like + **kwargs + Additional fit-kwargs for the underlying estimator. + + Returns + ------- + self : object + """ + return self._make_fit("fit")(X, y=y, **kwargs) + + def partial_fit(self, X, y=None, **kwargs): # pragma: no cover + return self._make_fit("partial_fit")(X, y=y, **kwargs) + + def _check_method(self, method): + """ + Check if self.estimator has 'method'. + + Raises + ------ + AttributeError + """ + estimator = self.estimator + if not hasattr(estimator, method): + msg = "The wrapped estimator '{}' does not have a '{}' method.".format( + estimator, method + ) + raise AttributeError(msg) + return getattr(estimator, method) + + def transform(self, X): + """ + Transform block or partition-wise for Mars inputs. + + For Mars inputs, a Mars tensor is returned. 
For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + If the underlying estimator does not have a ``transform`` method, then + an ``AttributeError`` is raised. + + Parameters + ---------- + X : array-like + + Returns + ------- + transformed : array-like + """ + self._check_method("transform") + X = check_array(X) + dtype = self.estimator.transform(np.zeros((1, X.shape[1]), dtype=X.dtype)).dtype + return X.map_chunk(self.estimator.transform, dtype=dtype) + + def score(self, X, y): + """ + Returns the score on the given data. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Input data, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + + Returns + ------- + score : float + return self.estimator.score(X, y) + """ + + scoring = self.scoring + X = check_array(X) + y = check_array(y, ensure_2d=False) + + if not scoring: + if type(self.estimator).score in ( + RegressorMixin.score, + SklearnRegressorMixin.score, + ): # pragma: no cover + scoring = "r2" + elif type(self.estimator).score in ( + ClassifierMixin.score, + SklearnClassifierMixin.score, + ): + scoring = "accuracy" + else: # pragma: no cover + scoring = self.scoring + + if scoring: + scorer = get_scorer(scoring) + return scorer(self, X, y).execute() + else: # pragma: no cover + return mr.spawn(self.estimator.score, args=(X, y)).execute().fetch() + + def predict(self, X, execute=True): + """ + Predict for X. + + For Mars inputs, a Mars tensor is returned. For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + Parameters + ---------- + X : array-like + + Returns + ------- + y : array-like + """ + + self._check_method("predict") + X = check_array(X) + + result = X.map_chunk(self.estimator.predict, dtype="int", shape=X.shape[:1]) + if execute: + result.execute() + return result + + def predict_proba(self, X, execute=True): + """ + Probability estimates. + + For Mars inputs, a Mars tensor is returned. For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + If the underlying estimator does not have a ``predict_proba`` + method, then an ``AttributeError`` is raised. + + Parameters + ---------- + X : array or dataframe + + Returns + ------- + y : array-like + """ + self._check_method("predict_proba") + X = check_array(X) + result = X.map_chunk( + self.estimator.predict_proba, + dtype="float", + shape=(X.shape[0], len(self.estimator.classes_)), + ) + if execute: + result.execute() + return result + + def predict_log_proba(self, X, execute=True): + """ + Log of probability estimates. + + For Mars inputs, a Mars tensor is returned. For other + inputs (NumPy array, pandas dataframe, scipy sparse matrix), the + regular return value is returned. + + If the underlying estimator does not have a ``predict_proba`` + method, then an ``AttributeError`` is raised. 
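A condensed version of the workflow from the class docstring, for orientation (sketch only; assumes a running Mars session and that the module is importable as `xorbits._mars.learn.wrappers`):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from xorbits._mars import tensor as mt
from xorbits._mars.learn.wrappers import ParallelPostFit

# Training happens on a small in-memory dataset and is NOT parallelized.
X_small, y_small = make_classification(n_samples=1_000, random_state=0)
clf = ParallelPostFit(LogisticRegression(max_iter=1000), scoring="accuracy")
clf.fit(X_small, y_small)

# Inference runs chunk-wise on a much larger Mars tensor, potentially
# distributed across a cluster.
X_big = mt.tensor(np.random.RandomState(0).rand(100_000, 20), chunk_size=10_000)
labels = clf.predict(X_big)                      # Mars tensor, eagerly executed
proba = clf.predict_proba(X_big, execute=False)  # lazy; call .execute() later
```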
+ + Parameters + ---------- + X : array or dataframe + + Returns + ------- + y : array-like + """ + + self._check_method("predict_log_proba") + result = mt.log(self.predict_proba(X, execute=False)) + if execute: + result.execute() + return result diff --git a/python/xorbits/_mars/lib/__init__.py b/python/xorbits/_mars/lib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/aio/__init__.py b/python/xorbits/_mars/lib/aio/__init__.py new file mode 100644 index 000000000..43ff1f846 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import contextlib +import sys + +from .file import AioFileObject, AioFilesystem +from .isolation import Isolation, get_isolation, new_isolation, stop_isolation +from .lru import alru_cache +from .parallelism import AioEvent + +if sys.version_info[:2] < (3, 9): + from ._threads import to_thread + + asyncio.to_thread = to_thread diff --git a/python/xorbits/_mars/lib/aio/_runners.py b/python/xorbits/_mars/lib/aio/_runners.py new file mode 100644 index 000000000..b45828e45 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/_runners.py @@ -0,0 +1,162 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Backport of the asyncio.runners module from Python 3.7. +""" +# Source: +# https://github.com/python/cpython/blob/a4afcdfa55ddffa4b9ae3b0cf101628c7bff4102/Lib/asyncio/runners.py + +# Modifications: +# * removed relative imports of .coroutines, .events, .tasks +# * replaced `coroutines`, `events`, `tasks` with `asyncio`. +# * replaced `tasks.all_tasks` with `asyncio.Task.all_tasks` because it is +# backwards compatible. 
+# * Use private function `asyncio.events._get_running_loop` directly in +# Python 3.6 + +import asyncio +import weakref +from typing import Any, Awaitable, Coroutine, TypeVar, Union + +try: + from asyncio import get_running_loop # noqa Python >=3.7 +except ImportError: # pragma: no cover + from asyncio.events import _get_running_loop as get_running_loop # pragma: no cover + +__all__ = ("run", "get_running_loop") +_T = TypeVar("_T") + + +def _patch_loop(loop): + """ + This function is designed to work around https://bugs.python.org/issue36607 + + It's job is to keep a thread safe variable tasks up to date with any tasks that + are created for the given loop. This then lets you cancel them as _all_tasks + was intended for. + + We also need to patch the {get,set}_task_factory functions because we can't allow + Other users of it to overwrite our factory function. This function will pretend + like there is no factory set but in reality our factory is always set and we will + call the provided one set + """ + tasks = weakref.WeakSet() + + task_factory = [None] + + def _set_task_factory(factory): + task_factory[0] = factory + + def _get_task_factory(): + return task_factory[0] + + def _safe_task_factory(loop, coro): + if task_factory[0] is None: + # These lines are copied from the standard library because they don't have + # this inside a default factory function for me to call. + # https://github.com/python/cpython/blob/3.6/Lib/asyncio/base_events.py#L304 + task = asyncio.Task(coro, loop=loop) + if task._source_traceback: + del task._source_traceback[-1] # pragma: no cover + else: + task = task_factory[0](loop, coro) + tasks.add(task) + return task + + loop.set_task_factory(_safe_task_factory) + loop.set_task_factory = _set_task_factory + loop.get_task_factory = _get_task_factory + + return tasks + + +def run( + main: Union[Coroutine[Any, None, _T], Awaitable[_T]], *, debug: bool = False +) -> _T: + """Run a coroutine. + + This function runs the passed coroutine, taking care of + managing the asyncio event loop and finalizing asynchronous + generators. + + This function cannot be called when another asyncio event loop is + running in the same thread. + + If debug is True, the event loop will be run in debug mode. + + This function always creates a new event loop and closes it at the end. + It should be used as a main entry point for asyncio programs, and should + ideally only be called once. 
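Concretely, the point of `_patch_loop` is that tasks spawned inside the coroutine are tracked and cancelled when `run()` tears the loop down; a small sketch (relevant only on the old interpreters this backport targets, where `asyncio.run` is unavailable):

```python
import asyncio

from xorbits._mars.lib.aio._runners import run

async def background():
    await asyncio.sleep(3600)   # never finishes on its own

async def main():
    # Created through the patched task factory, so run() knows about this task
    # and cancels it after main() returns.
    asyncio.ensure_future(background())
    await asyncio.sleep(0.1)
    return "done"

print(run(main(), debug=True))  # -> "done"; the leftover task is cancelled cleanly
```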
+ + Example: + + async def main(): + await asyncio.sleep(1) + print('hello') + + asyncio.run(main()) + """ + # Python 3.7+ raises RuntimeError while <3.6 returns None + try: + loop = get_running_loop() + except RuntimeError: + loop = None + if loop is not None: + raise RuntimeError("asyncio.run() cannot be called from a running event loop") + + if not asyncio.iscoroutine(main): + raise ValueError("a coroutine was expected, got {!r}".format(main)) + + loop = asyncio.new_event_loop() + tasks = _patch_loop(loop) + + try: + asyncio.set_event_loop(loop) + loop.set_debug(debug) + return loop.run_until_complete(main) + finally: + try: + _cancel_all_tasks(loop, tasks) + loop.run_until_complete(loop.shutdown_asyncgens()) + finally: + asyncio.set_event_loop(None) # type: ignore + loop.close() + + +def _cancel_all_tasks(loop, tasks): + to_cancel = [task for task in tasks if not task.done()] + + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + loop.run_until_complete( + asyncio.gather(*to_cancel, loop=loop, return_exceptions=True) + ) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + loop.call_exception_handler( + { + "message": "unhandled exception during asyncio.run() shutdown", + "exception": task.exception(), + "task": task, + } + ) diff --git a/python/xorbits/_mars/lib/aio/_threads.py b/python/xorbits/_mars/lib/aio/_threads.py new file mode 100644 index 000000000..6324577ae --- /dev/null +++ b/python/xorbits/_mars/lib/aio/_threads.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextvars +import functools +from asyncio import events + +__all__ = ("to_thread",) + + +async def to_thread(func, *args, **kwargs): + """Asynchronously run function *func* in a separate thread. + + Any *args and **kwargs supplied for this function are directly passed + to *func*. Also, the current :class:`contextvars.Context` is propagated, + allowing context variables from the main thread to be accessed in the + separate thread. + + Return a coroutine that can be awaited to get the eventual result of *func*. + """ + loop = events.get_running_loop() + ctx = contextvars.copy_context() + func_call = functools.partial(ctx.run, func, *args, **kwargs) + return await loop.run_in_executor(None, func_call) diff --git a/python/xorbits/_mars/lib/aio/base.py b/python/xorbits/_mars/lib/aio/base.py new file mode 100644 index 000000000..db5557e09 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/base.py @@ -0,0 +1,82 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import asyncio +import functools +from concurrent.futures import Executor +from typing import Any, Type + + +def _make_delegate_method(attr): + async def method(self, *args, **kwargs): + func = functools.partial(getattr(self._file, attr), *args, **kwargs) + return await self._loop.run_in_executor(self._executor, func) + + return method + + +def _make_proxy_method(attr): + def method(self, *args, **kwargs): + return getattr(self._file, attr)(*args, **kwargs) + + return method + + +def _make_proxy_property(attr): + def proxy_property(self): + return getattr(self._file, attr) + + return property(proxy_property) + + +def delegate_to_executor(*attrs): + def wrap_cls(cls: Type): + for attr in attrs: + setattr(cls, attr, _make_delegate_method(attr)) + return cls + + return wrap_cls + + +def proxy_method_directly(*attrs): + def wrap_cls(cls: Type): + for attr in attrs: + setattr(cls, attr, _make_proxy_method(attr)) + return cls + + return wrap_cls + + +def proxy_property_directly(*attrs): + def wrap_cls(cls): + for attr in attrs: + setattr(cls, attr, _make_proxy_property(attr)) + return cls + + return wrap_cls + + +class AioBase: + def __init__( + self, file: Any, loop: asyncio.BaseEventLoop = None, executor: Executor = None + ): + if loop is None: + loop = asyncio.get_event_loop() + if isinstance(file, AioBase): + file = file._file + + self._file = file + self._loop = loop + self._executor = executor diff --git a/python/xorbits/_mars/lib/aio/file.py b/python/xorbits/_mars/lib/aio/file.py new file mode 100644 index 000000000..fd9d1b2d9 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/file.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools + +from .base import ( + AioBase, + delegate_to_executor, + proxy_method_directly, + proxy_property_directly, +) + + +@delegate_to_executor( + "close", + "flush", + "isatty", + "read", + "read1", + "readinto", + "readline", + "readlines", + "seek", + "seekable", + "tell", + "truncate", + "writable", + "write", + "writelines", +) +@proxy_method_directly("fileno", "readable") +@proxy_property_directly("closed", "name", "mode") +class AioFileObject(AioBase): + def __aiter__(self): + return self + + async def __anext__(self): + """Simulate normal file iteration.""" + line = await self.readline() + if line: + return line + else: + raise StopAsyncIteration + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + self._file = None + + +@delegate_to_executor( + "cat", + "ls", + "delete", + "disk_usage", + "stat", + "rm", + "mv", + "rename", + "mkdir", + "exists", + "isdir", + "isfile", + "read_parquet", + "walk", +) +@proxy_property_directly("pathsep") +class AioFilesystem(AioBase): + async def open(self, *args, **kwargs): + func = functools.partial(self._file.open, *args, **kwargs) + file = await self._loop.run_in_executor(self._executor, func) + return AioFileObject(file) diff --git a/python/xorbits/_mars/lib/aio/isolation.py b/python/xorbits/_mars/lib/aio/isolation.py new file mode 100644 index 000000000..7968588ad --- /dev/null +++ b/python/xorbits/_mars/lib/aio/isolation.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
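`AioFileObject` simply pushes each blocking file call onto an executor; a usage sketch mirroring `test_aio_file.py` further down in this diff (the file name is illustrative):

```python
import asyncio

from xorbits._mars.lib.aio import AioFileObject

async def main():
    # Wrap a regular synchronous file; delegated methods such as write(),
    # readline() and close() become awaitable and run in the default executor.
    async with AioFileObject(open("notes.txt", "w")) as f:
        await f.write("first line\nsecond line\n")

    async with AioFileObject(open("notes.txt")) as f:
        async for line in f:      # __anext__ awaits readline()
            print(line.rstrip())

asyncio.run(main())
```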
+ +import asyncio +import atexit +import threading +from typing import Dict, Optional + + +class Isolation: + loop: asyncio.AbstractEventLoop + _stopped: Optional[asyncio.Event] + _thread: Optional[threading.Thread] + + def __init__(self, loop: asyncio.AbstractEventLoop, threaded: bool = True): + self.loop = loop + self._threaded = threaded + + self._stopped = None + self._thread = None + self._thread_ident = None + + def _run(self): + asyncio.set_event_loop(self.loop) + self._stopped = asyncio.Event() + self.loop.run_until_complete(self._stopped.wait()) + + def start(self): + if self._threaded: + self._thread = thread = threading.Thread(target=self._run) + thread.daemon = True + thread.start() + self._thread_ident = thread.ident + + @property + def thread_ident(self): + return self._thread_ident + + async def _stop(self): + self._stopped.set() + + def stop(self): + if self._threaded: + asyncio.run_coroutine_threadsafe(self._stop(), self.loop).result() + self._thread.join() + + +_name_to_isolation: Dict[str, Isolation] = dict() + + +DEFAULT_ISOLATION = "oscar" + + +def new_isolation( + name: str = DEFAULT_ISOLATION, + loop: asyncio.AbstractEventLoop = None, + threaded: bool = True, +) -> Isolation: + if name in _name_to_isolation: + return _name_to_isolation[name] + + if loop is None: + loop = asyncio.new_event_loop() + + isolation = Isolation(loop, threaded=threaded) + isolation.start() + _name_to_isolation[name] = isolation + return isolation + + +def get_isolation(name: str = DEFAULT_ISOLATION): + isolation = _name_to_isolation[name] + if isolation.loop.is_closed(): # pragma: no cover + _name_to_isolation.pop(name) + raise KeyError(name) + return isolation + + +def stop_isolation(name: str = DEFAULT_ISOLATION): + if name in _name_to_isolation: + return _name_to_isolation.pop(name).stop() + + +atexit.register(stop_isolation) diff --git a/python/xorbits/_mars/lib/aio/lru.py b/python/xorbits/_mars/lib/aio/lru.py new file mode 100644 index 000000000..46f8ed232 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/lru.py @@ -0,0 +1,229 @@ +# The MIT License +# +# Copyright (c) 2018 aio-libs team https://github.com/aio-libs/ +# Copyright (c) 2017 Ocean S. A. https://ocean.io/ +# Copyright (c) 2016-2017 WikiBusiness Corporation http://wikibusiness.org/ +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
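The isolation registry defined above is how a dedicated event loop is kept alive on a background thread; a minimal sketch of the intended call pattern (import path assumed):

```python
import asyncio

from xorbits._mars.lib.aio.isolation import get_isolation, new_isolation, stop_isolation

# Create (or reuse) the default "oscar" isolation: an event loop running
# forever on a daemon thread.
isolation = new_isolation()

async def ping():
    await asyncio.sleep(0.1)
    return "pong"

# Schedule coroutines onto the isolated loop from the main thread.
future = asyncio.run_coroutine_threadsafe(ping(), isolation.loop)
print(future.result())               # -> "pong"

assert get_isolation() is isolation  # same registered instance
stop_isolation()                     # stops the loop and joins the thread
```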
+ +import asyncio +import os +import weakref +from collections import OrderedDict +from functools import _CacheInfo, _make_key, partial, wraps + + +__version__ = "1.0.2" + +__all__ = ("alru_cache", "clear_all_alru_caches") + +_is_ci = (os.environ.get("CI") or "0").lower() in ("1", "true") +_all_wrapped = weakref.WeakSet() + + +def clear_all_alru_caches(): + for wrapped in _all_wrapped: + wrapped.cache_clear() + + +def unpartial(fn): + while hasattr(fn, "func"): + fn = fn.func + + return fn + + +def _done_callback(fut, task): + if task.cancelled(): + fut.cancel() + return + + exc = task.exception() + if exc is not None: + fut.set_exception(exc) + return + + fut.set_result(task.result()) + + +def _cache_invalidate(wrapped, typed, *args, **kwargs): + key = _make_key(args, kwargs, typed) + + exists = key in wrapped._cache + + if exists: + wrapped._cache.pop(key) + + return exists + + +def _cache_clear(wrapped): + wrapped.hits = wrapped.misses = 0 + wrapped._cache = OrderedDict() + wrapped.tasks = set() + + +def _open(wrapped): + if not wrapped.closed: + raise RuntimeError("alru_cache is not closed") + + was_closed = ( + wrapped.hits == wrapped.misses == len(wrapped.tasks) == len(wrapped._cache) == 0 + ) + + if not was_closed: + raise RuntimeError("alru_cache was not closed correctly") + + wrapped.closed = False + + +def _close(wrapped, *, cancel=False, return_exceptions=True): + if wrapped.closed: + raise RuntimeError("alru_cache is closed") + + wrapped.closed = True + + if cancel: + for task in wrapped.tasks: + if not task.done(): # not sure is it possible + task.cancel() + + return _wait_closed(wrapped, return_exceptions=return_exceptions) + + +async def _wait_closed(wrapped, *, return_exceptions): + wait_closed = asyncio.gather(*wrapped.tasks, return_exceptions=return_exceptions) + + wait_closed.add_done_callback(partial(_close_waited, wrapped)) + + ret = await wait_closed + + # hack to get _close_waited callback to be executed + await asyncio.sleep(0) + + return ret + + +def _close_waited(wrapped, _): + wrapped.cache_clear() + + +def _cache_info(wrapped, maxsize): + return _CacheInfo( + wrapped.hits, + wrapped.misses, + maxsize, + len(wrapped._cache), + ) + + +def __cache_touch(wrapped, key): + try: + wrapped._cache.move_to_end(key) + except KeyError: # not sure is it possible + pass + + +def _cache_hit(wrapped, key): + wrapped.hits += 1 + __cache_touch(wrapped, key) + + +def _cache_miss(wrapped, key): + wrapped.misses += 1 + __cache_touch(wrapped, key) + + +def alru_cache( + fn=None, + maxsize=128, + typed=False, + *, + cache_exceptions=True, +): + def wrapper(fn): + _origin = unpartial(fn) + + if not asyncio.iscoroutinefunction(_origin): + raise RuntimeError("Coroutine function is required, got {}".format(fn)) + + # functools.partialmethod support + if hasattr(fn, "_make_unbound_method"): + fn = fn._make_unbound_method() + + @wraps(fn) + async def wrapped(*fn_args, **fn_kwargs): + if wrapped.closed: + raise RuntimeError("alru_cache is closed for {}".format(wrapped)) + + loop = asyncio.get_event_loop() + + key = _make_key(fn_args, fn_kwargs, typed) + + fut = wrapped._cache.get(key) + + if fut is not None: + if not fut.done(): + _cache_hit(wrapped, key) + return await asyncio.shield(fut) + + exc = fut._exception + + if exc is None or cache_exceptions: + _cache_hit(wrapped, key) + return fut.result() + + # exception here and cache_exceptions == False + wrapped._cache.pop(key) + + fut = loop.create_future() + task = loop.create_task(fn(*fn_args, **fn_kwargs)) + 
task.add_done_callback(partial(_done_callback, fut)) + + wrapped.tasks.add(task) + task.add_done_callback(wrapped.tasks.remove) + + wrapped._cache[key] = fut + + if maxsize is not None and len(wrapped._cache) > maxsize: + wrapped._cache.popitem(last=False) + + _cache_miss(wrapped, key) + return await asyncio.shield(fut) + + _cache_clear(wrapped) + wrapped._origin = _origin + wrapped.closed = False + wrapped.cache_info = partial(_cache_info, wrapped, maxsize) + wrapped.cache_clear = partial(_cache_clear, wrapped) + wrapped.invalidate = partial(_cache_invalidate, wrapped, typed) + wrapped.close = partial(_close, wrapped) + wrapped.open = partial(_open, wrapped) + + if _is_ci: + _all_wrapped.add(wrapped) + return wrapped + + if fn is None: + return wrapper + + if callable(fn) or hasattr(fn, "_make_unbound_method"): + return wrapper(fn) + + raise NotImplementedError("{} decorating is not supported".format(fn)) diff --git a/python/xorbits/_mars/lib/aio/parallelism.py b/python/xorbits/_mars/lib/aio/parallelism.py new file mode 100644 index 000000000..45c3e9308 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/parallelism.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import multiprocessing +import threading +from concurrent.futures import Executor +from typing import Union + +from .base import AioBase, delegate_to_executor, proxy_method_directly + +event_types = Union[threading.Event, multiprocessing.Event] + + +@delegate_to_executor("wait") +@proxy_method_directly("set", "is_set", "clear") +class AioEvent(AioBase): + def __init__( + self, + event: event_types = None, + loop: asyncio.BaseEventLoop = None, + executor: Executor = None, + ): + if event is None: + event = threading.Event() + super().__init__(event, loop=loop, executor=executor) diff --git a/python/xorbits/_mars/lib/aio/tests/__init__.py b/python/xorbits/_mars/lib/aio/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/aio/tests/test_aio_file.py b/python/xorbits/_mars/lib/aio/tests/test_aio_file.py new file mode 100644 index 000000000..fe8538007 --- /dev/null +++ b/python/xorbits/_mars/lib/aio/tests/test_aio_file.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import pytest + +from ...filesystem import LocalFileSystem +from .. import AioFileObject, AioFilesystem + + +@pytest.mark.asyncio +async def test_aio_filesystem(): + local_fs = LocalFileSystem.get_instance() + aio_fs = AioFilesystem(local_fs) + + assert aio_fs.pathsep == local_fs.pathsep + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test") + + with open(file_path, "wb") as f: + f.write(b"text for test") + + stat = await aio_fs.stat(tempdir) + assert stat["type"] == "directory" + + +@pytest.mark.asyncio +async def test_aio_file_object(): + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test") + + f = AioFileObject(open(file_path, "w")) + async with f: + assert f.readable() is False + assert f.mode == "w" + await f.write("text for test") + + f2 = AioFileObject(open(file_path)) + async with f2: + async for l in f2: + assert len(l) > 0 diff --git a/python/xorbits/_mars/lib/bloom_filter.py b/python/xorbits/_mars/lib/bloom_filter.py new file mode 100644 index 000000000..8faee0fa6 --- /dev/null +++ b/python/xorbits/_mars/lib/bloom_filter.py @@ -0,0 +1,572 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# pylint: disable=superfluous-parens,redefined-variable-type +# superfluous-parens: Sometimes extra parens are more clear + +"""Bloom Filter: Probabilistic set membership testing for large sets""" + +# Shamelessly borrowed (under MIT license) from +# https://code.activestate.com/recipes/577686-bloom-filter/ +# About Bloom Filters: https://en.wikipedia.org/wiki/Bloom_filter + +# Tweaked by Daniel Richard Stromberg, mostly to: +# 1) Give it a little nicer __init__ parameters. +# 2) Improve the hash functions to get a much lower rate of false positives. +# 3) Give it a selection of backends. +# 4) Make it pass pylint. 
+ +# In the literature: +# k is the number of probes - we call this num_probes_k +# m is the number of bits in the filter - we call this num_bits_m +# n is the ideal number of elements to eventually be stored in the filter - we +# call this ideal_num_elements_n +# p is the desired error rate when full - we call this error_rate_p + +import array +import math +import os +import random + +try: + import mmap as mmap_mod +except ImportError: + # Jython lacks mmap() + HAVE_MMAP = False +else: + HAVE_MMAP = True + + +class Mmap_backend(object): + """ + Backend storage for our "array of bits" using an mmap'd file. + Please note that this has only been tested on Linux so far. + """ + + effs = 2**8 - 1 + + def __init__(self, num_bits, filename): + if not HAVE_MMAP: + raise NotImplementedError("mmap is not available") + self.num_bits = num_bits + self.num_chars = (self.num_bits + 7) // 8 + flags = os.O_RDWR | os.O_CREAT + if hasattr(os, "O_BINARY"): + flags |= getattr(os, "O_BINARY") + self.file_ = os.open(filename, flags) + os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) + os.write(self.file_, b"\x00") + self.mmap = mmap_mod.mmap(self.file_, self.num_chars) + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + byteno, bit_within_wordno = divmod(bitno, 8) + mask = 1 << bit_within_wordno + byte = self.mmap[byteno] + return byte & mask + + def set(self, bitno): + """set bit number bitno to true""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + byte = self.mmap[byteno] + byte |= mask + self.mmap[byteno] = byte + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + byte = self.mmap[byteno] + byte &= Mmap_backend.effs - mask + self.mmap[byteno] = byte + + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for byteno in range(self.num_chars): + self.mmap[byteno] = self.mmap[byteno] & other.mmap[byteno] + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for byteno in range(self.num_chars): + self.mmap[byteno] = self.mmap[byteno] | other.mmap[byteno] + + return self + + def close(self): + """Close the file""" + os.close(self.file_) + + +class File_seek_backend(object): + """Backend storage for our "array of bits" using a file in which we seek""" + + effs = 2**8 - 1 + + def __init__(self, num_bits, filename): + self.num_bits = num_bits + self.num_chars = (self.num_bits + 7) // 8 + flags = os.O_RDWR | os.O_CREAT + if hasattr(os, "O_BINARY"): + flags |= getattr(os, "O_BINARY") + self.file_ = os.open(filename, flags) + os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) + os.write(self.file_, b"\x00") + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + byteno, bit_within_wordno = divmod(bitno, 8) + mask = 1 << bit_within_wordno + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + return byte & mask + + def set(self, bitno): + """set bit number bitno to true""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + byte |= mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = 
os.read(self.file_, 1)[0] + byte &= File_seek_backend.effs - mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + # These are quite slow ways to do iand and ior, but they should work, + # and a faster version is going to take more time + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) and other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) or other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def close(self): + """Close the file""" + os.close(self.file_) + + +class Array_then_file_seek_backend(object): + # pylint: disable=R0902 + # R0902: We kinda need a bunch of instance attributes + """ + Backend storage for our "array of bits" using a python array of integers up + to some maximum number of bytes, then spilling over to a file. + This is -not- a cache; we instead save the leftmost bits in RAM, and the + rightmost bits (if necessary) in a file. On open, we read from the file to + RAM. On close, we write from RAM to the file. + """ + + effs = 2**8 - 1 + + def __init__(self, num_bits, filename, max_bytes_in_memory): + self.num_bits = num_bits + num_chars = (self.num_bits + 7) // 8 + self.filename = filename + self.max_bytes_in_memory = max_bytes_in_memory + self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8) + self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0) + self.bytes_in_memory = (self.bits_in_memory + 7) // 8 + self.bytes_in_file = (self.bits_in_file + 7) // 8 + + self.array_ = array.array("B", [0]) * self.bytes_in_memory + flags = os.O_RDWR | os.O_CREAT + if hasattr(os, "O_BINARY"): + flags |= getattr(os, "O_BINARY") + self.file_ = os.open(filename, flags) + os.lseek(self.file_, num_chars + 1, os.SEEK_SET) + os.write(self.file_, b"\x00") + + os.lseek(self.file_, 0, os.SEEK_SET) + offset = 0 + intended_block_len = 2**17 + while True: + if offset + intended_block_len < self.bytes_in_memory: + block = os.read(self.file_, intended_block_len) + elif offset < self.bytes_in_memory: + block = os.read(self.file_, self.bytes_in_memory - offset) + else: + break + for index_in_block, byte in enumerate(block): + self.array_[offset + index_in_block] = byte + offset += intended_block_len + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + if byteno < self.bytes_in_memory: + return self.array_[byteno] & mask + else: + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + return byte & mask + + def set(self, bitno): + """set bit number bitno to true""" + byteno, bit_within_byteno = divmod(bitno, 8) + mask = 1 << bit_within_byteno + if byteno < self.bytes_in_memory: + self.array_[byteno] |= mask + else: + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + byte |= mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + byteno, bit_within_byteno = divmod(bitno, 8) + mask = Array_backend.effs - (1 << bit_within_byteno) + if byteno < self.bytes_in_memory: + self.array_[byteno] &= mask + else: + os.lseek(self.file_, byteno, os.SEEK_SET) + byte = os.read(self.file_, 1)[0] + byte &= 
File_seek_backend.effs - mask + os.lseek(self.file_, byteno, os.SEEK_SET) + os.write(self.file_, bytes([byte])) + + # These are quite slow ways to do iand and ior, but they should work, + # and a faster version is going to take more time + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) and other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for bitno in range(self.num_bits): + if self.is_set(bitno) or other.is_set(bitno): + self.set(bitno) + else: + self.clear(bitno) + + return self + + def close(self): + """ + Write the in-memory portion to disk, leave the already-on-disk portion + unchanged + """ + + os.lseek(self.file_, 0, os.SEEK_SET) + os.write(self.file_, bytes(self.array_[0 : self.bytes_in_memory])) + + os.close(self.file_) + + +class Array_backend(object): + """ + Backend storage for our "array of bits" using a python array of integers + """ + + # Note that this has now been split out into a bits_mod for the benefit of + # other projects. + effs = 2**32 - 1 + + def __init__(self, num_bits): + self.num_bits = num_bits + self.num_words = (self.num_bits + 31) // 32 + self.array_ = array.array("L", [0]) * self.num_words + + def is_set(self, bitno): + """Return true iff bit number bitno is set""" + wordno, bit_within_wordno = divmod(bitno, 32) + mask = 1 << bit_within_wordno + return self.array_[wordno] & mask + + def set(self, bitno): + """set bit number bitno to true""" + wordno, bit_within_wordno = divmod(bitno, 32) + mask = 1 << bit_within_wordno + self.array_[wordno] |= mask + + def clear(self, bitno): + """clear bit number bitno - set it to false""" + wordno, bit_within_wordno = divmod(bitno, 32) + mask = Array_backend.effs - (1 << bit_within_wordno) + self.array_[wordno] &= mask + + # It'd be nice to do __iand__ and __ior__ in a base class, but + # that'd be Much slower + + def __iand__(self, other): + assert self.num_bits == other.num_bits + + for wordno in range(self.num_words): + self.array_[wordno] &= other.array_[wordno] + + return self + + def __ior__(self, other): + assert self.num_bits == other.num_bits + + for wordno in range(self.num_words): + self.array_[wordno] |= other.array_[wordno] + + return self + + def close(self): + """Noop for compatibility with the file+seek backend""" + pass + + +def get_bitno_seed_rnd(bloom_filter, key): + """ + Apply num_probes_k hash functions to key. + Generate the array index and bitmask corresponding to each result. + """ + + # We're using key as a seed to a pseudorandom number generator + hasher = random.Random(key).randrange + for dummy in range(bloom_filter.num_probes_k): + bitno = hasher(bloom_filter.num_bits_m) + yield bitno % bloom_filter.num_bits_m + + +MERSENNES1 = [2**x - 1 for x in [17, 31, 127]] +MERSENNES2 = [2**x - 1 for x in [19, 67, 257]] + + +def simple_hash(int_list, prime1, prime2, prime3): + """Compute a hash value from a list of integers and 3 primes""" + result = 0 + for integer in int_list: + result += ((result + integer + prime1) * prime2) % prime3 + return result + + +def hash1(int_list): + """Basic hash function #1""" + return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2]) + + +def hash2(int_list): + """Basic hash function #2""" + return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2]) + + +def get_filter_bitno_probes(bloom_filter, key): + """ + Apply num_probes_k hash functions to key. 
+ Generate the array index and bitmask corresponding to each result + """ + + # This one assumes key is either bytes or str (or other list of integers) + + if hasattr(key, "__divmod__"): + int_list = [] + temp = key + while temp: + quotient, remainder = divmod(temp, 256) + int_list.append(remainder) + temp = quotient + elif isinstance(key, (list, tuple, str, bytes)) and not key: + int_list = [] + + elif isinstance(key, (list, tuple)): + int_list = [] + for v in key: + if isinstance(v, str): + int_list.extend([ord(char) for char in v]) + elif hasattr(v, "__divmod__"): + int_list.append(v) + else: + raise TypeError("Sorry, I do not know how to hash this type") + elif isinstance(key[0], str): + int_list = [ord(char) for char in key] + else: + raise TypeError("Sorry, I do not know how to hash this type") + + hash_value1 = hash1(int_list) + hash_value2 = hash2(int_list) + probe_value = hash_value1 + + for _ in range(1, bloom_filter.num_probes_k + 1): + probe_value *= hash_value1 + probe_value += hash_value2 + probe_value %= MERSENNES1[2] + yield probe_value % bloom_filter.num_bits_m + + +def try_unlink(filename): + """unlink a file. Don't complain if it's not there""" + try: + os.unlink(filename) + except OSError: + pass + return + + +class BloomFilter(object): + """Probabilistic set membership testing for large sets""" + + def __init__( + self, + max_elements=10000, + error_rate=0.1, + probe_bitnoer=get_filter_bitno_probes, + filename=None, + start_fresh=False, + ): + # pylint: disable=R0913 + # R0913: We want a few arguments + if max_elements <= 0: + raise ValueError("ideal_num_elements_n must be > 0") + if not (0 < error_rate < 1): + raise ValueError("error_rate_p must be between 0 and 1 exclusive") + + self.error_rate_p = error_rate + # With fewer elements, we should do very well. With more elements, our + # error rate "guarantee" drops rapidly. + self.ideal_num_elements_n = max_elements + + numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p) + denominator = math.log(2) ** 2 + real_num_bits_m = numerator / denominator + self.num_bits_m = int(math.ceil(real_num_bits_m)) + + if filename is None: + self.backend = Array_backend(self.num_bits_m) + elif isinstance(filename, tuple) and isinstance(filename[1], int): + if start_fresh: + try_unlink(filename[0]) + if filename[1] == -1: + self.backend = Mmap_backend(self.num_bits_m, filename[0]) + else: + self.backend = Array_then_file_seek_backend( + self.num_bits_m, + filename[0], + filename[1], + ) + else: + if start_fresh: + try_unlink(filename) + self.backend = File_seek_backend(self.num_bits_m, filename) + + # AKA num_offsetters + # Verified against + # https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives + real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2) + self.num_probes_k = int(math.ceil(real_num_probes_k)) + self.probe_bitnoer = probe_bitnoer + + def __repr__(self): + return ( + "BloomFilter(ideal_num_elements_n=%d, error_rate_p=%f, " + "num_bits_m=%d)" + ) % ( + self.ideal_num_elements_n, + self.error_rate_p, + self.num_bits_m, + ) + + def add(self, key): + """Add an element to the filter""" + for bitno in self.probe_bitnoer(self, key): + self.backend.set(bitno) + + def __iadd__(self, key): + self.add(key) + return self + + def _match_template(self, bloom_filter): + """ + Compare a sort of signature for two bloom filters. 
+ Used in preparation for binary operations + """ + return ( + self.num_bits_m == bloom_filter.num_bits_m + and self.num_probes_k == bloom_filter.num_probes_k + and self.probe_bitnoer == bloom_filter.probe_bitnoer + ) + + def union(self, bloom_filter): + """Compute the set union of two bloom filters""" + self.backend |= bloom_filter.backend + + def __ior__(self, bloom_filter): + self.union(bloom_filter) + return self + + def intersection(self, bloom_filter): + """Compute the set intersection of two bloom filters""" + self.backend &= bloom_filter.backend + + def __iand__(self, bloom_filter): + self.intersection(bloom_filter) + return self + + def __contains__(self, key): + for bitno in self.probe_bitnoer(self, key): + if not self.backend.is_set(bitno): + return False + return True + + def close(self): + self.backend.close() + self.backend = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + self.backend = None + + def __del__(self): + if self.backend is not None: + self.backend.close() + self.backend = None diff --git a/python/xorbits/_mars/lib/compression.py b/python/xorbits/_mars/lib/compression.py new file mode 100644 index 000000000..a24ff8c3a --- /dev/null +++ b/python/xorbits/_mars/lib/compression.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from gzip import GzipFile +from typing import BinaryIO + +try: + import lz4 + import lz4.frame +except ImportError: # pragma: no cover + lz4 = None + + +_compressions = {"gzip": lambda f: GzipFile(fileobj=f)} + +if lz4: + _compressions["lz4"] = lz4.frame.open + + +def compress(file: BinaryIO, compress_type: str) -> BinaryIO: + """ + Return a compressed file object. + + Parameters + ---------- + file: + file object. + compress_type: str + compression type. + + Returns + ------- + compressed_file: + compressed file object. + """ + try: + compress_ = _compressions[compress_type] + except KeyError: # pragma: no cover + raise ValueError( + f"Unknown compress type: {compress_type}, " + f'available include: {", ".join(_compressions)}' + ) + + return compress_(file) diff --git a/python/xorbits/_mars/lib/cython/__init__.py b/python/xorbits/_mars/lib/cython/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/cython/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/lib/cython/libcpp.pxd b/python/xorbits/_mars/lib/cython/libcpp.pxd new file mode 100644 index 000000000..f183fd433 --- /dev/null +++ b/python/xorbits/_mars/lib/cython/libcpp.pxd @@ -0,0 +1,30 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# complementary header for C++ STL libs not included in Cython + +from libc.stdint cimport uint_fast64_t + + +cdef extern from "" namespace "std" nogil: + cdef cppclass mt19937_64: + ctypedef uint_fast64_t result_type + + mt19937_64() except + + mt19937_64(result_type seed) except + + result_type operator()() except + + result_type min() except + + result_type max() except + + void discard(size_t z) except + + void seed(result_type seed) except + diff --git a/python/xorbits/_mars/lib/filesystem/__init__.py b/python/xorbits/_mars/lib/filesystem/__init__.py new file mode 100644 index 000000000..574aac3a7 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .azure import AzureBlobFileSystem +from .base import FileSystem +from .core import file_size, get_fs, glob, open_file, register_filesystem +from .fsmap import FSMap + +# noinspection PyUnresolvedReferences +from .hdfs import HadoopFileSystem +from .local import LocalFileSystem +from .s3 import S3FileSystem diff --git a/python/xorbits/_mars/lib/filesystem/_glob.py b/python/xorbits/_mars/lib/filesystem/_glob.py new file mode 100644 index 000000000..c859c473c --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_glob.py @@ -0,0 +1,173 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import fnmatch +import os +import re + +from .core import FileSystem + +magic_check = re.compile("([*?[])") +magic_check_bytes = re.compile(b"([*?[])") + + +def has_magic(s): + if isinstance(s, bytes): # pragma: no cover + match = magic_check_bytes.search(s) + else: + match = magic_check.search(s) + return match is not None + + +def _ishidden(path): + return path[0] in (".", b"."[0]) + + +def _isrecursive(pattern): + if isinstance(pattern, bytes): # pragma: no cover + return pattern == b"**" + else: + return pattern == "**" + + +class FileSystemGlob: + def __init__(self, fs: FileSystem): + self._fs = fs + + def glob(self, pathname, recursive=False): + """Return a list of paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + + If recursive is true, the pattern '**' will match any files and + zero or more directories and subdirectories. + """ + return list(self.iglob(pathname, recursive=recursive)) + + def iglob(self, pathname, recursive=False): + """Return an iterator which yields the paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + + If recursive is true, the pattern '**' will match any files and + zero or more directories and subdirectories. + """ + it = self._iglob(pathname, recursive, False) + if recursive and _isrecursive(pathname): # pragma: no cover + s = next(it) # skip empty string + assert not s + return it + + def _iglob(self, pathname, recursive, dironly): + dirname, basename = self._fs.path_split(pathname.replace(os.path.sep, "/")) + if not has_magic(pathname): + assert not dironly + if basename: + if self._fs.exists(pathname): + yield pathname + else: # pragma: no cover + # Patterns ending with a slash should match only directories + if self._fs.isdir(dirname): + yield pathname + return + if not dirname: # pragma: no cover + if recursive and _isrecursive(basename): + yield from self._glob2(dirname, basename, dironly) + else: + yield from self._glob1(dirname, basename, dironly) + return + # `os.path.split()` returns the argument itself as a dirname if it is a + # drive or UNC path. Prevent an infinite recursion if a drive or UNC path + # contains magic characters (i.e. r'\\?\C:'). + if dirname != pathname and has_magic(dirname): + dirs = self._iglob(dirname, recursive, True) + else: + dirs = [dirname] + if has_magic(basename): + if recursive and _isrecursive(basename): + glob_in_dir = self._glob2 + else: + glob_in_dir = self._glob1 + else: + glob_in_dir = self._glob0 + for dirname in dirs: + for name in glob_in_dir(dirname, basename, dironly): + if dirname: + yield self._fs.path_join(dirname, name) + else: + yield name + + # These 2 helper functions non-recursively glob inside a literal directory. + # They return a list of basenames. _glob1 accepts a pattern while _glob0 + # takes a literal basename (so it only has to check for its existence). 
+ + def _glob1(self, dirname, pattern, dironly): + names = list(self._iterdir(dirname, dironly)) + if not _ishidden(pattern): + names = (x for x in names if not _ishidden(x)) + return fnmatch.filter(names, pattern) + + def _glob0(self, dirname, basename, dironly): # pragma: no cover + if not basename: + # `os.path.split()` returns an empty basename for paths ending with a + # directory separator. 'q*x/' should match only directories. + if self._fs.isdir(dirname): + return [basename] + else: + if self._fs.exists(self._fs.path_join(dirname, basename)): + return [basename] + return [] + + # Following functions are not public but can be used by third-party code. + + def glob0(self, dirname, pattern): # pragma: no cover + return self._glob0(dirname, pattern, False) + + def glob1(self, dirname, pattern): # pragma: no cover + return self._glob1(dirname, pattern, False) + + # This helper function recursively yields relative pathnames inside a literal + # directory. + + def _glob2(self, dirname, pattern, dironly): # pragma: no cover + assert _isrecursive(pattern) + yield pattern[:0] + yield from self._rlistdir(dirname, dironly) + + # If dironly is false, yields all file names inside a directory. + # If dironly is true, yields only directory names. + def _iterdir(self, dirname, dironly): + if not dirname: # pragma: no cover + dirname = "" + if not self._fs.isdir(dirname): + return iter(()) + for entry in self._fs.ls(dirname): + if not dironly or self._fs.isdir(entry): + yield self._fs.path_split(entry)[-1] + + # Recursively yields relative pathnames inside a literal directory. + def _rlistdir(self, dirname, dironly): # pragma: no cover + names = list(self._iterdir(dirname, dironly)) + for x in names: + if not _ishidden(x): + yield x + path = self._fs.path_join(dirname, x) if dirname else x + for y in self._rlistdir(path, dironly): + yield self._fs.path_join(x, y) diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/__init__.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/common.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/common.py new file mode 100644 index 000000000..9fc5ec310 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/common.py @@ -0,0 +1,198 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import json +import os + +from ....utils import lazy_import +from ..base import path_type, stringify_path + +oss2 = lazy_import("oss2", placeholder=True) + +# OSS api time out +_oss_time_out = 10 + + +class OSSFileEntry: + def __init__( + self, path, *, is_dir=None, is_file=None, stat=None, storage_options=None + ): + self._path = path + self._name = os.path.basename(path) + self._is_file = is_file + self._is_dir = is_dir + self._stat = stat + self._storage_options = storage_options + + def is_dir(self): + if self._is_dir is None: + self._is_dir = oss_isdir(self._path) + return self._is_dir + + def is_file(self): + if self._is_file is None: + if self.is_dir() or not oss_exists(self._path): + self._is_file = False + else: + self._is_file = True + return self._is_file + + def stat(self): + if self._stat is None: + self._stat = oss_stat(self._path) + return self._stat + + @property + def name(self): + return self._name + + @property + def path(self): + return self._path + + +def parse_osspath(path: path_type): + # Extract OSS configuration from the encoded URL. + str_path = stringify_path(path) + parse_result = oss2.urlparse(str_path) + if parse_result.scheme != "oss": + raise ValueError( + f"Except scheme oss, but got scheme: {parse_result.scheme}" + f" in path: {str_path}" + ) + bucket = parse_result.hostname + if not (parse_result.username and parse_result.password): + raise RuntimeError(r"Please use build_oss_path to add OSS info") + param_dict = url_to_dict(parse_result.username) + access_key_id = param_dict["access_key_id"] + access_key_secret = parse_result.password + end_point = param_dict["end_point"] + key = parse_result.path + key = key[1:] if key.startswith("/") else key + return bucket, key, access_key_id, access_key_secret, end_point + + +def _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point): + oss_bucket = oss2.Bucket( + auth=oss2.Auth( + access_key_id=access_key_id, access_key_secret=access_key_secret + ), + endpoint=end_point, + bucket_name=bucket, + connect_timeout=_oss_time_out, + ) + return oss_bucket + + +def oss_exists(path: path_type): + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(path) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + return oss_bucket.object_exists(key) or oss_isdir(path) + + +def oss_isdir(path: path_type): + """ + OSS has no concept of directories, but we define + a ossurl is dir, When there is at least one object + at the ossurl that is the prefix(end with char "/"), + it is considered as a directory. 
+ """ + dirname = stringify_path(path) + if not dirname.endswith("/"): + dirname = dirname + "/" + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(dirname) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + isdir = False + for obj in oss2.ObjectIteratorV2(oss_bucket, prefix=key, max_keys=2): + if obj.key == key: + continue + isdir = True + break + return isdir + + +def oss_stat(path: path_type): + path = stringify_path(path) + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(path) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + if oss_isdir(path): + stat = dict(name=path, size=0, modified_time=-1) + stat["type"] = "directory" + else: + meta = oss_bucket.get_object_meta(key) + stat = dict( + name=path, + size=int(meta.headers["Content-Length"]), + modified_time=meta.headers["Last-Modified"], + ) + stat["type"] = "file" + return stat + + +def oss_scandir(dirname: path_type): + dirname = stringify_path(dirname) + if not dirname.endswith("/"): + dirname = dirname + "/" + bucket, key, access_key_id, access_key_secret, end_point = parse_osspath(dirname) + oss_bucket = _get_oss_bucket(bucket, access_key_id, access_key_secret, end_point) + dirname_set = set() + for obj in oss2.ObjectIteratorV2(oss_bucket, prefix=key): + rel_path = obj.key[len(key) :] + try: + inside_dirname, inside_filename = rel_path.split("/", 1) + except ValueError: + inside_dirname = None + inside_filename = rel_path + if inside_dirname is not None: + if inside_dirname in dirname_set: + continue + dirname_set.add(inside_dirname) + yield OSSFileEntry( + os.path.join(dirname, inside_dirname), + is_dir=True, + is_file=False, + stat={ + "name": os.path.join(dirname, inside_dirname), + "type": "directory", + "size": 0, + "modified_time": -1, + }, + ) + else: + yield OSSFileEntry( + os.path.join(dirname, inside_filename), + is_dir=False, + is_file=True, + stat={ + "name": os.path.join(dirname, inside_filename), + "type": "file", + "size": obj.size, + "modified_time": obj.last_modified, + }, + ) + + +def dict_to_url(param: dict): + # Encode the dictionary with url-safe-base64. + str_param = json.dumps(param) + url_param = base64.urlsafe_b64encode(bytes(str_param, encoding="utf8")) + return bytes.decode(url_param, encoding="utf8") + + +def url_to_dict(url_param: str): + # Decode url-safe-base64 encoded string. + bytes_param = bytes(url_param, encoding="utf8") + str_param = bytes.decode(base64.urlsafe_b64decode(bytes_param), encoding="utf8") + return json.loads(str_param) diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/glob.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/glob.py new file mode 100644 index 000000000..8b8d4f944 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/glob.py @@ -0,0 +1,147 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Filename globbing utility, modified from python glob. 
+ +obviously,this implementation is not optimal, it will cause too many +oss requests. Lately, We can then convert the glob expression into +a regular expression, and then match the oss key list. +But before that, we need to figure out how to deal with magic char +in oss key, such like oss glob: oss://bucket/[key]/*, the key +oss://bucket/[key]/a exactly exists. + +Notes: + OSS need a bucket to specify the file or dir, the "**" pattern is + not supported. So _isrecursive(pattern) is removed. +""" + +import fnmatch +import os +import re + +from .common import oss_exists, oss_isdir, oss_scandir + +__all__ = ["glob", "iglob", "escape"] + + +def glob(pathname, *, recursive=False): + """Return a list of paths matching a pathname pattern. + The pattern may contain simple shell-style wildcards a la + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + """ + return list(iglob(pathname, recursive=recursive)) + + +def iglob(pathname, *, recursive=False): + """Return an iterator which yields the paths matching a pathname pattern. + The pattern may contain simple shell-style wildcards like + fnmatch. However, unlike fnmatch, filenames starting with a + dot are special cases that are not matched by '*' and '?' + patterns. + """ + it = _iglob(pathname, recursive, False) + return it + + +def _iglob(pathname, recursive, dironly): + dirname, basename = os.path.split(pathname) + if not has_magic(pathname): + assert not dironly + if basename: + if oss_exists(pathname): + yield pathname + else: + # Patterns ending with a slash should match only directories + if oss_isdir(dirname): + yield pathname + return + # dirname will not be None in oss path. + # Prevent an infinite recursion if a drive or UNC path + # contains magic characters (i.e. r'\\?\C:'). + if dirname != pathname and has_magic(dirname): + dirs = _iglob(dirname, recursive, True) + else: + dirs = [dirname] + if has_magic(basename): + glob_in_dir = _glob1 + else: + glob_in_dir = _glob0 + for dirname in dirs: + for name in glob_in_dir(dirname, basename, dironly): + yield os.path.join(dirname, name) + + +# These 2 helper functions non-recursively glob inside a literal directory. +# They return a list of basenames. _glob1 accepts a pattern while _glob0 +# takes a literal basename (so it only has to check for its existence). + + +def _glob1(dirname, pattern, dironly): + names = list(_iterdir(dirname, dironly)) + if not _ishidden(pattern): + names = (x for x in names if not _ishidden(x)) + return fnmatch.filter(names, pattern) + + +def _glob0(dirname, basename, dironly): + if not basename: + # `os.path.split()` returns an empty basename for paths ending with a + # directory separator. 'q*x/' should match only directories. + if oss_isdir(dirname): + return [basename] + else: + if oss_exists(os.path.join(dirname, basename)): + return [basename] + return [] + + +# If dironly is false, yields all file names inside a directory. +# If dironly is true, yields only directory names. +# An oss path must contain a dirname. 
+def _iterdir(dirname, dironly): + for entry in oss_scandir(dirname): + if not dironly or entry.is_dir(): + yield entry.name + return + + +magic_check = re.compile("([*?[])") +magic_check_bytes = re.compile(b"([*?[])") + + +def has_magic(s): + if isinstance(s, bytes): + match = magic_check_bytes.search(s) + else: + match = magic_check.search(s) + return match is not None + + +def _ishidden(path): + return False + + +def escape(pathname): + """Escape all special characters.""" + # Escaping is done by wrapping any of "*?[" between square brackets. + # Metacharacters do not work in the drive part and shouldn't be escaped. + drive, pathname = os.path.splitdrive(pathname) + if isinstance(pathname, bytes): + pathname = magic_check_bytes.sub(rb"[\1]", pathname) + else: + pathname = magic_check.sub(r"[\1]", pathname) + return drive + pathname diff --git a/python/xorbits/_mars/lib/filesystem/_oss_lib/handle.py b/python/xorbits/_mars/lib/filesystem/_oss_lib/handle.py new file mode 100644 index 000000000..1eb93999e --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/_oss_lib/handle.py @@ -0,0 +1,156 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from io import IOBase + +from ....utils import lazy_import +from .common import oss_stat, parse_osspath + +oss2 = lazy_import("oss2", placeholder=True) + + +class OSSIOBase(IOBase): + def __init__(self, path, mode): + self._path = path + ( + self._bucket_name, + self._key_name, + self._access_key_id, + self._access_key_secret, + self._end_point, + ) = parse_osspath(self._path) + self._bucket = self._get_bucket() + self._current_pos = 0 + self._size = None + self._buffer = b"" + self._buffer_size = 1 * 1024 + self._mode = mode + + @property + def mode(self): + return self._mode + + def fileno(self) -> int: + raise AttributeError + + def _get_bucket(self): + return oss2.Bucket( + auth=oss2.Auth( + access_key_id=self._access_key_id, + access_key_secret=self._access_key_secret, + ), + endpoint=self._end_point, + bucket_name=self._bucket_name, + ) + + def _get_size(self): + if self._size is None: + self._size = int(oss_stat(self._path)["size"]) + return self._size + + def seek(self, pos, whence=0): + if whence == 0: + if pos < 0: + raise OSError("Invalid argument") + self._current_pos = pos + elif whence == 2: + self._current_pos = self._get_size() + pos + elif whence == 1: + check_pos = self._current_pos + pos + if check_pos < 0: + raise OSError("Invalid argument") + else: + self._current_pos = self._current_pos + pos + else: + raise ValueError('Parameter "whence" should be 0 or 1 or 2') + if pos > 0 and self._current_pos > self._get_size() - 1: + self._current_pos = self._get_size() + return self._current_pos + + def seekable(self): + return True + + def read(self, size=-1): + """ + Read and return up to size bytes, where size is an int. + + If the argument is omitted, None, or negative, reads and + returns all data until EOF. 
+ + If the argument is positive, multiple raw reads may be issued to satisfy + the byte count (unless EOF is reached first). + + Returns an empty bytes array on EOF. + """ + if self._current_pos == self._get_size() or size == 0: + return b"" + elif size < 0: + obj = self._bucket.get_object( + self._key_name, byte_range=(self._current_pos, None) + ) + self._current_pos = self._get_size() + else: + obj = self._bucket.get_object( + self._key_name, + byte_range=(self._current_pos, self._current_pos + size - 1), + ) + self._current_pos = self._current_pos + size + content = obj.read() + return content + + def readline(self, size=-1): + # For backwards compatibility, a (slowish) readline(). + def nreadahead(): + # Read to the beginning of the next line + read_to = min( + self._get_size() - 1, self._current_pos + self._buffer_size - 1 + ) + buffer = self._bucket.get_object( + self._key_name, byte_range=(self._current_pos, read_to) + ).read() + if not buffer: + return 1 + n = (buffer.find(b"\n") + 1) or len(buffer) + if size >= 0: + n = min(n, size) + return n + + if size is None: + size = -1 + else: + try: + size_index = size.__index__ + except AttributeError: + raise TypeError(f"{size!r} is not an integer") + else: + size = size_index() + res = bytearray() + while size < 0 or len(res) < size: + b = self.read(nreadahead()) + if not b: + break + res += b + if res.endswith(b"\n"): + break + return bytes(res) + + def readable(self): + return True + + def writable(self): + return False + + def close(self): + # already closed by oss + pass diff --git a/python/xorbits/_mars/lib/filesystem/arrow.py b/python/xorbits/_mars/lib/filesystem/arrow.py new file mode 100644 index 000000000..2960cf3c3 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/arrow.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +import weakref +from typing import BinaryIO, Dict, Iterator, List, TextIO, Tuple, Union +from urllib.parse import urlparse + +import pyarrow as pa +from pyarrow.fs import FileInfo, FileSelector +from pyarrow.fs import FileSystem as ArrowFileSystem +from pyarrow.fs import FileType +from pyarrow.fs import HadoopFileSystem as ArrowHadoopFileSystem +from pyarrow.fs import LocalFileSystem as ArrowLocalFileSystem + +from ...utils import implements, stringify_path +from .core import FileSystem, path_type + +__all__ = ("ArrowBasedLocalFileSystem", "HadoopFileSystem") + + +# When pyarrow.fs.FileSystem gc collected, +# the underlying connection will be closed, +# so we hold the reference to make sure +# FileSystem will not be gc collected before file object +_file_to_filesystems = weakref.WeakKeyDictionary() + + +class ArrowBasedFileSystem(FileSystem): + """ + FileSystem implemented with arrow fs API (>=2.0.0). 
+ """ + + def __init__(self, arrow_fs: ArrowFileSystem, sequential_read=False): + self._arrow_fs = arrow_fs + # for open('rb'), open a sequential reading only or not + self._sequential_read = sequential_read + + @staticmethod + def _process_path(path): + return stringify_path(path) + + @implements(FileSystem.cat) + def cat(self, path: path_type) -> bytes: + path = self._process_path(path) + file: pa.NativeFile = self._arrow_fs.open_input_stream(path) + return file.read() + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + path = self._process_path(path) + file_selector: FileSelector = FileSelector(path) + paths = [] + for file_info in self._arrow_fs.get_file_info(file_selector): + paths.append(file_info.path) + return paths + + def _get_file_info(self, path: path_type) -> FileInfo: + path = self._process_path(path) + file_info: FileInfo = self._arrow_fs.get_file_info([path])[0] + return file_info + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + path = self._process_path(path) + info = self._get_file_info(path) + if info.is_file: + self._arrow_fs.delete_file(path) + elif info.type == FileType.Directory: + if not recursive and len(self.ls(path)) > 0: + raise OSError(f"[Errno 66] Directory not empty: '{path}'") + self._arrow_fs.delete_dir(path) + else: # pragma: no cover + raise TypeError(f"path({path}) to delete must be a file or directory") + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + path = self._process_path(path) + new_path = self._process_path(new_path) + self._arrow_fs.move(path, new_path) + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + path = self._process_path(path) + info = self._get_file_info(path) + stat = dict(name=path, size=info.size, modified_time=info.mtime_ns / 1e9) + if info.type == FileType.File: + stat["type"] = "file" + elif info.type == FileType.Directory: + stat["type"] = "directory" + else: # pragma: no cover + stat["type"] = "other" + return stat + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + path = self._process_path(path) + self._arrow_fs.create_dir(path, recursive=create_parents) + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + path = self._process_path(path) + info = self._get_file_info(path) + return info.type == FileType.Directory + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + path = self._process_path(path) + info = self._get_file_info(path) + return info.is_file + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + return True + + @implements(FileSystem.exists) + def exists(self, path: path_type): + path = self._process_path(path) + info = self._get_file_info(path) + return info.type != FileType.NotFound + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + path = self._process_path(path) + is_binary = mode.endswith("b") + if not is_binary: # pragma: no cover + raise ValueError( + f"mode can only be binary for arrow based filesystem, got {mode}" + ) + mode = mode.rstrip("b") + if mode == "w": + file = self._arrow_fs.open_output_stream(path) + elif mode == "r": + if self._sequential_read: # pragma: no cover + file = self._arrow_fs.open_input_stream(path) + else: + file = self._arrow_fs.open_input_file(path) + elif mode == "a": + file = self._arrow_fs.open_append_stream(path) + else: # pragma: no 
cover + raise ValueError( + f'mode can only be "wb", "rb" and "ab" for ' + f"arrow based filesystem, got {mode}" + ) + + _file_to_filesystems[file] = self._arrow_fs + return file + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + path = self._process_path(path) + q = [path] + while q: + curr = q.pop(0) + file_selector: FileSelector = FileSelector(curr) + dirs, files = [], [] + for info in self._arrow_fs.get_file_info(file_selector): + if info.type == FileType.File: + files.append(info.base_name) + elif info.type == FileType.Directory: + dirs.append(info.base_name) + q.append(info.path) + else: # pragma: no cover + continue + yield curr, dirs, files + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + from ._glob import FileSystemGlob + + path = self._process_path(path) + return FileSystemGlob(self).glob(path, recursive=recursive) + + +class ArrowBasedLocalFileSystem(ArrowBasedFileSystem): + def __init__(self): + super().__init__(ArrowLocalFileSystem()) + + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = ArrowBasedLocalFileSystem() + return cls._instance + + +class HadoopFileSystem(ArrowBasedFileSystem): + def __init__( + self, + host="default", + port=0, + user=None, + kerb_ticket=None, + driver="libhdfs", + extra_conf=None, + ): + assert driver == "libhdfs" + if "HADOOP_HOME" in os.environ and "CLASSPATH" not in os.environ: + classpath_proc = subprocess.run( + [os.environ["HADOOP_HOME"] + "/bin/hdfs", "classpath", "--glob"], + stdout=subprocess.PIPE, + ) + os.environ["CLASSPATH"] = classpath_proc.stdout.decode().strip() + arrow_fs = ArrowHadoopFileSystem( + host=host, + port=port, + user=user, + kerb_ticket=kerb_ticket, + extra_conf=extra_conf, + ) + super().__init__(arrow_fs) + + @staticmethod + def _process_path(path): + path = ArrowBasedFileSystem._process_path(path) + # use urlparse to extract path from like: + # hdfs://localhost:8020/tmp/test/simple_test.csv, + # due to the reason that pa.fs.HadoopFileSystem cannot accept + # path with hdfs:// prefix + if path.startswith("hdfs://"): + return urlparse(path).path + else: + return path diff --git a/python/xorbits/_mars/lib/filesystem/azure.py b/python/xorbits/_mars/lib/filesystem/azure.py new file mode 100644 index 000000000..4c02155d0 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/azure.py @@ -0,0 +1,36 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +try: # pragma: no cover + # make sure adlfs is installed + from adlfs import AzureBlobFileSystem as _AzureBlobFileSystem + + # make sure fsspec is installed + from .fsspec_adapter import FsSpecAdapter + + del _AzureBlobFileSystem +except ImportError: + FsSpecAdapter = None + +if FsSpecAdapter is not None: # pragma: no cover + from .core import register_filesystem + + class AzureBlobFileSystem(FsSpecAdapter): + def __init__(self, **kwargs): + super().__init__("az", **kwargs) + + register_filesystem("az", AzureBlobFileSystem) + register_filesystem("abfs", AzureBlobFileSystem) +else: + AzureBlobFileSystem = None diff --git a/python/xorbits/_mars/lib/filesystem/base.py b/python/xorbits/_mars/lib/filesystem/base.py new file mode 100644 index 000000000..feb01657a --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/base.py @@ -0,0 +1,263 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from abc import ABC, abstractmethod +from typing import BinaryIO, Dict, Iterator, List, TextIO, Tuple, Union +from urllib.parse import urlparse + +from ...utils import stringify_path + +path_type = Union[str, os.PathLike] + + +class FileSystem(ABC): + """ + Abstract filesystem interface + """ + + @abstractmethod + def cat(self, path: path_type) -> bytes: + """ + Return contents of file as a bytes object + + Parameters + ---------- + path : str or path-like + File path to read content from. + + Returns + ------- + contents : bytes + """ + + @abstractmethod + def ls(self, path: path_type) -> List[path_type]: + """ + Return list of file paths + + Returns + ------- + paths : list + """ + + @abstractmethod + def delete(self, path: path_type, recursive: bool = False): + """ + Delete the indicated file or directory + + Parameters + ---------- + path : str + recursive : bool, default False + If True, also delete child paths for directories + """ + + def disk_usage(self, path: path_type) -> int: + """ + Compute bytes used by all contents under indicated path in file tree + + Parameters + ---------- + path : string + Can be a file path or directory + + Returns + ------- + usage : int + """ + path = stringify_path(path) + path_info = self.stat(path) + if path_info["type"] == "file": + return path_info["size"] + + total = 0 + for root, directories, files in self.walk(path): + for child_path in files: + abspath = self.path_join(root, child_path) + total += self.stat(abspath)["size"] + + return total + + def path_join(self, *args): + return self.pathsep.join(args) + + def path_split(self, path): + """ + Split a pathname. Returns tuple "(head, tail)" where "tail" is everything after the final slash. Either part + may be empty. + + Parameters + ---------- + path : string + Can be a file path or directory + + Returns + ------- + usage : int + """ + splits = path.rsplit(self.pathsep, 1) + if len(splits) == 1: + return "", splits[0] + else: + return splits + + @abstractmethod + def stat(self, path: path_type) -> Dict: + """ + Information about a filesystem entry. 
+ + Returns + ------- + stat : dict + """ + + def rm(self, path: path_type, recursive: bool = False): + """ + Alias for FileSystem.delete + """ + return self.delete(path, recursive=recursive) + + def mv(self, path, new_path): + """ + Alias for FileSystem.rename + """ + return self.rename(path, new_path) + + @abstractmethod + def rename(self, path: path_type, new_path: path_type): + """ + Rename file, like UNIX mv command + + Parameters + ---------- + path : string + Path to alter + new_path : string + Path to move to + """ + + @abstractmethod + def mkdir(self, path: path_type, create_parents: bool = True): + """ + Create a directory. + + Parameters + ---------- + path : str + Path to the directory. + create_parents : bool, default True + If the parent directories don't exists create them as well. + """ + + @abstractmethod + def exists(self, path: path_type): + """ + Return True if path exists. + + Parameters + ---------- + path : str + Path to check. + """ + + @abstractmethod + def isdir(self, path: path_type) -> bool: + """ + Return True if path is a directory. + + Parameters + ---------- + path : str + Path to check. + """ + + @abstractmethod + def isfile(self, path: path_type) -> bool: + """ + Return True if path is a file. + + Parameters + ---------- + path : str + Path to check. + """ + + @abstractmethod + def _isfilestore(self) -> bool: + """ + Returns True if this FileSystem is a unix-style file store with + directories. + """ + + @abstractmethod + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + """ + Open file for reading or writing. + """ + + @abstractmethod + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + """ + Directory tree generator. + + Parameters + ---------- + path : str + + Returns + ------- + generator + """ + + @abstractmethod + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + """ + Return a list of paths matching a pathname pattern. + + Parameters + ---------- + path : str + Pattern may contain simple shell-style wildcards + recursive : bool + If recursive is true, the pattern '**' will match any files and + zero or more directories and subdirectories. + + Returns + ------- + paths : List + """ + + @property + def pathsep(self) -> str: + return "/" + + @staticmethod + def parse_from_path(uri: str): + parsed_uri = urlparse(uri) + options = dict() + options["host"] = parsed_uri.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] + if parsed_uri.port: + options["port"] = parsed_uri.port + if parsed_uri.username: + options["user"] = parsed_uri.username + if parsed_uri.password: + options["password"] = parsed_uri.password + return options + + @classmethod + def get_storage_options(cls, storage_options: Dict, uri: str) -> Dict: + options = cls.parse_from_path(uri) + storage_options.update(options) + return storage_options diff --git a/python/xorbits/_mars/lib/filesystem/core.py b/python/xorbits/_mars/lib/filesystem/core.py new file mode 100644 index 000000000..db122537c --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/core.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob as glob_ +import os +from typing import Dict, List +from urllib.parse import urlparse + +from ..compression import compress +from .base import FileSystem, path_type +from .local import LocalFileSystem +from .oss import OSSFileSystem + +_filesystems = {"file": LocalFileSystem, "oss": OSSFileSystem} +_scheme_to_dependencies = { + "hdfs": ["pyarrow"], + "az": ["fsspec", "adlfs"], + "abfs": ["fsspec", "adlfs"], + "s3": ["fsspec", "s3fs"], +} + + +def register_filesystem(name: str, fs): + _filesystems[name] = fs + + +def get_fs(path: path_type, storage_options: Dict = None) -> FileSystem: + if storage_options is None: + storage_options = dict() + + # detect scheme + if os.path.exists(path) or glob_.glob(path): + scheme = "file" + else: + scheme = urlparse(path).scheme + if scheme == "" or len(scheme) == 1: # len == 1 for windows + scheme = "file" + + if scheme in _filesystems: + file_system_type = _filesystems[scheme] + if scheme == "file" or scheme == "oss": + # local file systems are singletons. + return file_system_type.get_instance() + else: + storage_options = file_system_type.get_storage_options( + storage_options, path + ) + return file_system_type(**storage_options) + elif scheme in _scheme_to_dependencies: # pragma: no cover + dependencies = ", ".join(_scheme_to_dependencies[scheme]) + raise ImportError(f"Need to install {dependencies} to access {scheme}.") + else: + raise ValueError( + f"Unknown file system type: {scheme}, " + f'available include: {", ".join(_scheme_to_dependencies.keys())}' + ) + + +def glob(path: path_type, storage_options: Dict = None) -> List[path_type]: + if "*" in path: + fs = get_fs(path, storage_options) + return fs.glob(path) + else: + return [path] + + +def file_size(path: path_type, storage_options: Dict = None) -> int: + fs = get_fs(path, storage_options) + return fs.stat(path)["size"] + + +def open_file( + path: path_type, + mode: str = "rb", + compression: str = None, + storage_options: Dict = None, +): + fs = get_fs(path, storage_options) + file = fs.open(path, mode=mode) + + if compression is not None: + file = compress(file, compression) + + return file diff --git a/python/xorbits/_mars/lib/filesystem/fsmap.py b/python/xorbits/_mars/lib/filesystem/fsmap.py new file mode 100644 index 000000000..82380824e --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/fsmap.py @@ -0,0 +1,164 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
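+
+"""
+A minimal usage sketch (the root directory below is a placeholder and assumes
+a POSIX-style local path): ``FSMap`` exposes a filesystem root as a mutable
+mapping whose keys are relative file paths and whose values are the file
+contents as bytes.
+
+>>> from xorbits._mars.lib.filesystem import FSMap, LocalFileSystem
+>>> fs = LocalFileSystem.get_instance()
+>>> fs_map = FSMap("/tmp/fsmap_root", fs, create=True)
+>>> fs_map["to/path/test_file"] = b"text for test"
+>>> fs_map["to/path/test_file"]
+b'text for test'
+>>> len(fs_map)
+1
+"""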
+ +from collections.abc import MutableMapping +from urllib.parse import urlparse + +from .local import LocalFileSystem + + +class FSMap(MutableMapping): + """ + Wrap a FileSystem instance as a mutable wrapping. + The keys of the mapping become files under the given root, and the + values (which must be bytes) the contents of those files. + + Parameters + ---------- + root: string + prefix for all the files + fs: FileSystem instance + check: bool (=True) + performs a touch at the location, to check for write access. + """ + + def __init__(self, root, fs, check=False, create=False): + self.fs = fs + self.root = self._get_path(fs, root) + if create: + if not self.fs.exists(root): + self.fs.mkdir(root) + if check: + if not self.fs.exists(root): + raise ValueError( + f"Path {root} does not exist. Create with the ``create=True`` keyword" + ) + with self.fs.open(fs.pathsep.join([root, "a"]), "w"): + pass + self.fs.rm(fs.pathsep.join([root, "a"])) + + @staticmethod + def _get_path(fs, path): + return path if isinstance(fs, LocalFileSystem) else urlparse(path).path + + @staticmethod + def _normalize_path(fs, path, lstrip=False, rstrip=False): + if fs.pathsep != "/": # pragma: no cover + path = path.replace("/", fs.pathsep) + if lstrip: + path = path.lstrip(fs.pathsep) + if rstrip: + path = path.rstrip(fs.pathsep) + return path + + @staticmethod + def _join_path(fs, paths): + if fs.pathsep == "/": + return "/".join(paths) + + new_paths = [] + for i, path in enumerate(paths): + path = FSMap._normalize_path( + fs, path, lstrip=i > 0, rstrip=i < len(paths) - 1 + ) + new_paths.append(path) + return fs.pathsep.join(new_paths) + + def clear(self): + """Remove all keys below root - empties out mapping""" + try: + self.fs.rm(self.root, True) + self.fs.mkdir(self.root) + except: # noqa: E722 # pragma: no cover + pass + + def _key_to_str(self, key): + """Generate full path for the key""" + if isinstance(key, (tuple, list)): + key = str(tuple(key)) + else: + key = str(key) + return self._join_path(self.fs, [self.root, key]) if self.root else key + + def _str_to_key(self, s): + """Strip path of to leave key name""" + key = self._normalize_path(self.fs, s[len(self.root) :], lstrip=True) + if self.fs.pathsep != "/": # pragma: no cover + key = key.replace(self.fs.pathsep, "/") + return key + + def __getitem__(self, key, default=None): + """Retrieve data""" + key = self._key_to_str(key) + try: + result = self.fs.cat(key) + except: # noqa: E722 + if default is not None: + return default + raise KeyError(key) + return result + + def pop(self, key, default=None): + result = self.__getitem__(key, default) + try: + del self[key] + except KeyError: + pass + return result + + @staticmethod + def _parent(fs, path): + path = FSMap._get_path(fs, path.rstrip(fs.pathsep)) + if fs.pathsep in path: + return path.rsplit(fs.pathsep, 1)[0] + else: # pragma: no cover + return "" + + def __setitem__(self, key, value): + """Store value in key""" + key = self._key_to_str(key) + try: + self.fs.mkdir(self._parent(self.fs, key)) + except FileExistsError: + pass + with self.fs.open(key, "wb") as f: + f.write(value) + + @staticmethod + def _find(fs, path): + out = set() + for path, dirs, files in fs.walk(path): + out.update(fs.pathsep.join([path, f]) for f in files) + if fs.isfile(path) and path not in out: + # walk works on directories, but find should also return [path] + # when path happens to be a file + out.add(path) + return sorted(out) + + def __iter__(self): + return (self._str_to_key(x) for x in self._find(self.fs, self.root)) + + 
def __len__(self): + return len(self._find(self.fs, self.root)) + + def __delitem__(self, key): + """Remove key""" + try: + self.fs.rm(self._key_to_str(key)) + except: # noqa: E722 + raise KeyError + + def __contains__(self, key): + """Does key exist in mapping?""" + return self.fs.exists(self._key_to_str(key)) diff --git a/python/xorbits/_mars/lib/filesystem/fsspec_adapter.py b/python/xorbits/_mars/lib/filesystem/fsspec_adapter.py new file mode 100644 index 000000000..e8b913078 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/fsspec_adapter.py @@ -0,0 +1,132 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Iterator, Tuple, Union, BinaryIO, TextIO, Dict +from urllib.parse import urlparse, urlunparse, ParseResult + +from fsspec import filesystem +from fsspec.core import stringify_path + +from ...utils import implements +from .core import FileSystem +from .core import path_type + + +class FsSpecAdapter(FileSystem): + def __init__(self, scheme: str, **kwargs): + self._fs = filesystem(scheme, **kwargs) + self._scheme = scheme + + @implements(FileSystem.cat) + def cat(self, path: path_type) -> bytes: + return self._fs.cat_file(self._normalize_path(path)) + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + entries = [] + for entry in self._fs.ls(self._normalize_path(path), detail=False): + if isinstance(entry, Dict): + entries.append(entry.get("name")) + elif isinstance(entry, str): + entries.append(entry) + else: # pragma: no cover + raise TypeError(f"Expect str or dict, but got {type(entry)}") + return self._append_scheme(entries) + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + raise NotImplementedError + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + return self._fs.info(self._normalize_path(path)) + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + raise NotImplementedError + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + raise NotImplementedError + + @implements(FileSystem.exists) + def exists(self, path: path_type): + return self._fs.exists(self._normalize_path(path)) + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + return self._fs.isdir(self._normalize_path(path)) + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + return self._fs.isfile(self._normalize_path(path)) + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + raise NotImplementedError + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + return self._fs.open(self._normalize_path(path), mode=mode) + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + for root, dirs, files in self._fs.walk(path): + yield self._append_scheme([root])[0], self._append_scheme( + dirs 
+ ), self._append_scheme(files) + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + from ._glob import FileSystemGlob + + return self._append_scheme( + FileSystemGlob(self).glob(self._normalize_path(path), recursive=recursive) + ) + + @staticmethod + def _normalize_path(path: path_type) -> str: + """ + Stringify path and remove its scheme. + """ + path_str = stringify_path(path) + parsed = urlparse(path_str) + if parsed.scheme: + return urlunparse( + ParseResult( + scheme="", + netloc=parsed.netloc, + path=parsed.path, + params="", + query="", + fragment="", + ) + ) + else: + return path_str + + def _append_scheme(self, paths: List[path_type]) -> List[path_type]: + return [ + urlunparse( + ParseResult( + scheme=self._scheme, + netloc="", + path=path, + params="", + query="", + fragment="", + ) + ) + for path in paths + ] diff --git a/python/xorbits/_mars/lib/filesystem/hdfs.py b/python/xorbits/_mars/lib/filesystem/hdfs.py new file mode 100644 index 000000000..91ae22403 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/hdfs.py @@ -0,0 +1,31 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from pyarrow.fs import HadoopFileSystem as _ArrowHadoopFileSystem + + from .arrow import HadoopFileSystem + + del _ArrowHadoopFileSystem +except ImportError: # pragma: no cover + try: + # pyarrow < 2.0.0 + from pyarrow import HadoopFileSystem + except ImportError: + HadoopFileSystem = None + +from .core import register_filesystem + +if HadoopFileSystem is not None: # pragma: no branch + register_filesystem("hdfs", HadoopFileSystem) diff --git a/python/xorbits/_mars/lib/filesystem/local.py b/python/xorbits/_mars/lib/filesystem/local.py new file mode 100644 index 000000000..e2a8ee6ef --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/local.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
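+
+"""
+A small usage sketch (the file path below is a placeholder): ``LocalFileSystem``
+is a process-wide singleton that maps the ``FileSystem`` interface onto the
+standard library (``os``, ``shutil``, ``glob``).
+
+>>> from xorbits._mars.lib.filesystem import LocalFileSystem
+>>> fs = LocalFileSystem.get_instance()
+>>> with fs.open("/tmp/example.txt", "wb") as f:
+...     _ = f.write(b"text for test")
+>>> fs.stat("/tmp/example.txt")["type"]
+'file'
+>>> fs.cat("/tmp/example.txt")
+b'text for test'
+"""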
+ +import glob +import os +import shutil +from typing import BinaryIO, Dict, Iterator, List, TextIO, Tuple, Union + +from ...utils import implements, stringify_path +from .base import FileSystem, path_type + + +class LocalFileSystem(FileSystem): + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = LocalFileSystem() + return cls._instance + + @implements(FileSystem.cat) + def cat(self, path: path_type): + with self.open(path, "rb") as f: + return f.read() + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + path = stringify_path(path) + return sorted(os.path.join(path, x) for x in os.listdir(path)) + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + if os.path.isfile(path): + os.remove(path) + elif not recursive: + os.rmdir(path) + else: + shutil.rmtree(path) + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + os.rename(path, new_path) + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + os_stat = os.stat(path) + stat = dict(name=path, size=os_stat.st_size, modified_time=os_stat.st_mtime) + if os.path.isfile(path): + stat["type"] = "file" + elif os.path.isdir(path): + stat["type"] = "directory" + else: # pragma: no cover + stat["type"] = "other" + return stat + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + path = stringify_path(path) + if create_parents: + os.makedirs(path) + else: + os.mkdir(path) + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + path = stringify_path(path) + return os.path.isdir(path) + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + path = stringify_path(path) + return os.path.isfile(path) + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + return True + + @implements(FileSystem.exists) + def exists(self, path: path_type): + path = stringify_path(path) + return os.path.exists(path) + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> Union[BinaryIO, TextIO]: + path = stringify_path(path) + return open(path, mode=mode) + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + path = stringify_path(path) + return os.walk(path) + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + path = stringify_path(path) + return glob.glob(path, recursive=recursive) + + @property + def pathsep(self) -> str: + return os.path.sep diff --git a/python/xorbits/_mars/lib/filesystem/oss.py b/python/xorbits/_mars/lib/filesystem/oss.py new file mode 100644 index 000000000..95cf29a0b --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/oss.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
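+
+"""
+A hypothetical usage sketch (bucket, endpoint and credentials are
+placeholders): OSS paths are first rewritten with ``build_oss_path`` so that
+the access key id, access key secret and endpoint travel inside the URL, and
+the resulting path can then be handed to the filesystem (or to ``read_csv``).
+
+>>> from xorbits._mars.lib.filesystem.oss import OSSFileSystem, build_oss_path
+>>> path = build_oss_path(
+...     "oss://bucket/example.csv",
+...     access_key_id="your_access_key_id",
+...     access_key_secret="your_access_key_secret",
+...     end_point="your_endpoint",
+... )
+>>> fs = OSSFileSystem.get_instance()
+>>> with fs.open(path) as f:
+...     content = f.read()
+"""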
+ +from typing import Dict, Iterator, List, Tuple +from urllib import parse + +from ...utils import implements, lazy_import +from ._oss_lib import common as oc +from ._oss_lib.glob import glob +from ._oss_lib.handle import OSSIOBase +from .base import FileSystem, path_type + +oss2 = lazy_import("oss2", placeholder=True) + +_oss_time_out = 10 + + +class OSSFileSystem(FileSystem): + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = OSSFileSystem() + return cls._instance + + @implements(FileSystem.cat) + def cat(self, path: path_type): + raise NotImplementedError + + @implements(FileSystem.ls) + def ls(self, path: path_type) -> List[path_type]: + file_list = [] + file_entry = oc.OSSFileEntry(path) + if not file_entry.is_dir(): + raise OSError("ls for file is not supported") + else: + bucket, key, access_key_id, access_key_secret, end_point = oc.parse_osspath( + path + ) + oss_bucket = oss2.Bucket( + auth=oss2.Auth( + access_key_id=access_key_id, access_key_secret=access_key_secret + ), + endpoint=end_point, + bucket_name=bucket, + connect_timeout=_oss_time_out, + ) + for obj in oss2.ObjectIteratorV2(oss_bucket, prefix=key): + if obj.key.endswith("/"): + continue + obj_path = rf"oss://{bucket}/{obj.key}" + file_list.append( + build_oss_path( + obj_path, access_key_id, access_key_secret, end_point + ) + ) + return file_list + + @implements(FileSystem.delete) + def delete(self, path: path_type, recursive: bool = False): + raise NotImplementedError + + @implements(FileSystem.rename) + def rename(self, path: path_type, new_path: path_type): + raise NotImplementedError + + @implements(FileSystem.stat) + def stat(self, path: path_type) -> Dict: + ofe = oc.OSSFileEntry(path) + return ofe.stat() + + @implements(FileSystem.mkdir) + def mkdir(self, path: path_type, create_parents: bool = True): + raise NotImplementedError + + @implements(FileSystem.isdir) + def isdir(self, path: path_type) -> bool: + file_entry = oc.OSSFileEntry(path) + return file_entry.is_dir() + + @implements(FileSystem.isfile) + def isfile(self, path: path_type) -> bool: + file_entry = oc.OSSFileEntry(path) + return file_entry.is_file() + + @implements(FileSystem._isfilestore) + def _isfilestore(self) -> bool: + raise NotImplementedError + + @implements(FileSystem.exists) + def exists(self, path: path_type): + return oc.oss_exists(path) + + @implements(FileSystem.open) + def open(self, path: path_type, mode: str = "rb") -> OSSIOBase: + file_handle = OSSIOBase(path, mode) + return file_handle + + @implements(FileSystem.walk) + def walk(self, path: path_type) -> Iterator[Tuple[str, List[str], List[str]]]: + raise NotImplementedError + + @implements(FileSystem.glob) + def glob(self, path: path_type, recursive: bool = False) -> List[path_type]: + return glob(path, recursive=recursive) + + +def build_oss_path(path: path_type, access_key_id, access_key_secret, end_point): + """ + Returns a path with oss info. + Used to register the access_key_id, access_key_secret and + endpoint of OSS. The access_key_id and endpoint are put + into the url with url-safe-base64 encoding. + + Parameters + ---------- + path : path_type + The original oss url. + + access_key_id : str + The access key id of oss. + + access_key_secret : str + The access key secret of oss. + + end_point : str + The endpoint of oss. + + Returns + ------- + path_type + Path include the encoded access key id, end point and + access key secret of oss. 
+ """ + if isinstance(path, (list, tuple)): + path = path[0] + param_dict = {"access_key_id": access_key_id, "end_point": end_point} + id_endpoint = oc.dict_to_url(param_dict) + password = access_key_secret + parse_result = parse.urlparse(path) + new_path = ( + f"{parse_result.scheme}://{id_endpoint}:{password}" + f"@{parse_result.netloc}{parse_result.path}" + ) + return new_path diff --git a/python/xorbits/_mars/lib/filesystem/s3.py b/python/xorbits/_mars/lib/filesystem/s3.py new file mode 100644 index 000000000..aac457241 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/s3.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Dict + +""" +An example to read csv from s3 +------------------------------ +>>> import mars +>>> import mars.dataframe as md +>>> +>>> mars.new_session() +>>> # Pass endpoint_url / aws_access_key_id / aws_secret_access_key to read_csv. +>>> mdf = md.read_csv("s3://bucket/example.csv", index_col=0, storage_options={ +>>> "client_kwargs": { +>>> "endpoint_url": "http://192.168.1.12:9000", +>>> "aws_access_key_id": "", +>>> "aws_secret_access_key": "", +>>> "aws_session_token": "", +>>> }}) +>>> # Export environment vars AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN. +>>> mdf = md.read_csv("s3://bucket/example.csv", index_col=0) +>>> r = mdf.head(1000).execute() +>>> print(r) +""" + +try: # pragma: no cover + # make sure s3fs is installed + from s3fs import S3FileSystem as _S3FileSystem + + # make sure fsspec is installed + from .fsspec_adapter import FsSpecAdapter + + del _S3FileSystem +except ImportError: + FsSpecAdapter = None + +if FsSpecAdapter is not None: # pragma: no cover + from .core import register_filesystem + + class S3FileSystem(FsSpecAdapter): + def __init__(self, **kwargs): + super().__init__("s3", **kwargs) + + @staticmethod + def parse_from_path(uri: str): + client_kwargs = { + "endpoint_url": os.environ.get("AWS_ENDPOINT_URL"), + "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"), + "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"), + "aws_session_token": os.environ.get("AWS_SESSION_TOKEN"), + } + client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None} + return {"client_kwargs": client_kwargs} + + @classmethod + def get_storage_options(cls, storage_options: Dict, uri: str) -> Dict: + options = cls.parse_from_path(uri) + for k, v in storage_options.items(): + if k == "client_kwargs": + options["client_kwargs"].update(v) + else: + options[k] = v + return options + + register_filesystem("s3", S3FileSystem) +else: + S3FileSystem = None diff --git a/python/xorbits/_mars/lib/filesystem/tests/__init__.py b/python/xorbits/_mars/lib/filesystem/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_filesystem.py b/python/xorbits/_mars/lib/filesystem/tests/test_filesystem.py new file mode 100644 index 000000000..d03696e97 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_filesystem.py @@ -0,0 +1,223 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob as _glob +import os +import tempfile + +import numpy as np +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from ....tests.core import require_hadoop +from ....utils import lazy_import +from .. import FileSystem, FSMap, LocalFileSystem, glob + +if pa is not None: + from ..arrow import ArrowBasedLocalFileSystem, HadoopFileSystem +else: # pragma: no cover + ArrowBasedLocalFileSystem = None + +fsspec_installed = lazy_import("fsspec") is not None + + +def test_path_parser(): + path = "hdfs://user:password@localhost:8080/test" + parsed_result = FileSystem.parse_from_path(path) + assert parsed_result["host"] == "localhost" + assert parsed_result["port"] == 8080 + assert parsed_result["user"] == "user" + assert parsed_result["password"] == "password" + + +def test_local_filesystem(): + local_fs1 = LocalFileSystem.get_instance() + local_fs2 = LocalFileSystem.get_instance() + assert local_fs1 is local_fs2 + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test") + + with open(file_path, "wb") as f: + f.write(b"text for test") + assert local_fs1.stat(tempdir)["type"] == "directory" + assert local_fs1.stat(file_path)["type"] == "file" + assert len(glob(tempdir + "*")) == 1 + + +@pytest.mark.parametrize( + "fs_type", + [LocalFileSystem, ArrowBasedLocalFileSystem] + if pa is not None + else [LocalFileSystem], +) +def test_filesystems(fs_type): + fs = fs_type.get_instance() + + with tempfile.TemporaryDirectory() as root: + test1_dir = os.path.join(root, "test1") + fs.mkdir(test1_dir, create_parents=False) + test2_dir = os.path.join(root, "test2") + sub_test2_dir = os.path.join(test2_dir, "sub_test2") + fs.mkdir(sub_test2_dir) + + sub_test2_dir_stat = fs.stat(sub_test2_dir) + assert sub_test2_dir_stat["type"] == "directory" + assert sub_test2_dir_stat["name"] == sub_test2_dir + assert fs.isdir(sub_test2_dir) + + test1_file = os.path.join(test1_dir, "test1") + with fs.open(test1_file, "wb") as f: + f.write(b"abc test") + with fs.open(test1_file, "ab") as f: + f.write(b"\nappend test") + with 
fs.open(test1_file, "rb") as f: + content = f.read() + with open(test1_file, "rb") as f2: + expected = f2.read() + assert content == expected + + assert fs.cat(test1_file) == expected + + assert fs.isfile(test1_file) + test1_file_stat = fs.stat(test1_file) + assert test1_file_stat["type"] == "file" + assert test1_file_stat["name"] == test1_file + assert test1_file_stat["size"] == os.stat(test1_file).st_size + np.testing.assert_almost_equal( + test1_file_stat["modified_time"], os.stat(test1_file).st_mtime, decimal=6 + ) + + walked = [ + (os.path.normpath(root), dirs, files) for root, dirs, files in fs.walk(root) + ] + expected = os.walk(root) + assert sorted(walked) == sorted(expected) + + test2_file = os.path.join(sub_test2_dir, "test2") + with fs.open(test2_file, "wb") as f: + f.write(b"def test") + + for recursive in [False, True]: + globs = [ + os.path.normpath(p) + for p in fs.glob(os.path.join(root, "*"), recursive=recursive) + ] + expected = [ + os.path.normpath(p) + for p in _glob.glob(os.path.join(root, "*"), recursive=recursive) + ] + assert sorted(globs) == sorted(expected) + + for path in [os.path.join(root, "*", "*"), test1_dir]: + globs = [os.path.normpath(p) for p in fs.glob(path)] + expected = [os.path.normpath(p) for p in _glob.glob(path)] + assert sorted(globs) == sorted(expected) + + test1_new_file = os.path.join(test1_dir, "test1_new") + fs.rename(test1_file, test1_new_file) + test1_new_file2 = os.path.join(test1_dir, "test1_new2") + fs.mv(test1_new_file, test1_new_file2) + assert fs.exists(test1_new_file2) + assert not fs.exists(test1_file) + + assert fs.disk_usage(test1_dir) > 0 + + fs.delete(test2_file) + assert not fs.exists(test2_file) + + assert fs._isfilestore() + + with pytest.raises(OSError): + fs.delete(test1_dir) + fs.delete(test1_dir, recursive=True) + assert not fs.exists(test1_dir) + + +@require_hadoop +def test_hadoop_filesystem(): + fs = HadoopFileSystem(host="localhost", port=8020) + + test_dir = "/tmp/test/test_hadoop_fs" + fs.mkdir(test_dir) + test_file = f"{test_dir}/my_file.txt" + test_file_content = b"text for text" + with fs.open(test_file, "wb") as f: + f.write(test_file_content) + with fs.open(test_file, "rb") as f: + assert test_file_content == f.read() + # test file with hdfs:// prefix + assert fs.exists(f"hdfs://{test_dir}") + + +def test_fsmap(): + fs = LocalFileSystem.get_instance() + with tempfile.TemporaryDirectory() as root: + fs_map = FSMap(root, fs, check=True) + + path = "/to/path/test_file" + test_content = b"text for test" + fs_map[path] = test_content + assert fs_map[path] == test_content + assert len(fs_map) == 1 + assert path in fs_map + + path2 = "/to/path2/test_file2" + fs_map[path2] = test_content + assert len(fs_map) == 2 + + del fs_map[path] + assert list(fs_map) == ["to/path2/test_file2"] + + path3 = "/to2/path3/test_file3" + fs_map[path3] = test_content + assert fs_map.pop(path3) == test_content + assert fs_map.pop(path3, "fake_content") == "fake_content" + with pytest.raises(KeyError): + fs_map.pop("not_exist") + + fs_map.clear() + assert len(fs_map) == 0 + + # test root not exist + with pytest.raises(ValueError): + _ = FSMap(root + "/path2", fs, check=True) + + # create root + fs_map = FSMap(root + "/path2", fs, create=True) + assert len(fs_map) == 0 + + +@pytest.mark.skipif(not fsspec_installed, reason="fsspec not installed") +def test_get_fs(): + from .. 
import get_fs, register_filesystem + from ..fsspec_adapter import FsSpecAdapter + + class InMemoryFileSystemAdapter(FsSpecAdapter): + def __init__(self, **kwargs): + super().__init__("memory", **kwargs) + + register_filesystem("memory", InMemoryFileSystemAdapter) + + assert isinstance(get_fs("file://"), LocalFileSystem) + assert isinstance(get_fs("memory://"), InMemoryFileSystemAdapter) + + try: + get_fs("unknown://") + except ValueError as e: + assert "Unknown file system type" in e.__str__() diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_fsspec_adapter.py b/python/xorbits/_mars/lib/filesystem/tests/test_fsspec_adapter.py new file mode 100644 index 000000000..c33c4d642 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_fsspec_adapter.py @@ -0,0 +1,164 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ....utils import lazy_import + +fsspec_installed = lazy_import("fsspec") is not None + + +@pytest.mark.skipif(not fsspec_installed, reason="fsspec not installed") +def test_fsspec_adapter(): + """ + Assuming the implementations follows fsspec strictly, we only need to test if the adapter + works correctly. + """ + from ..fsspec_adapter import FsSpecAdapter + + adapter = FsSpecAdapter(scheme="memory") + + fs = adapter._fs + # generate directories and files as follows: + # . 
+ # ├── dir + # │ ├── bar.txt + # │ └── subdir + # │ └── baz.txt + # └── foo.txt + with fs.open("foo.txt", mode="wb") as f: + f.write(str.encode("foo")) + fs.mkdir("dir") + fs.mkdirs("/dir/subdir") + with fs.open("/dir/bar.txt", mode="wb") as f: + f.write(str.encode("bar")) + with fs.open("/dir/subdir/baz.txt", mode="wb") as f: + f.write(str.encode("baz")) + + # open + f = adapter.open("test.txt", mode="wb") + f.write(str.encode("test")) + f.close() + + # cat + assert "test" == adapter.cat("test.txt").decode() + try: + adapter.cat("non-existent.txt") + pytest.fail() + except FileNotFoundError: + pass + + # ls + entries = adapter.ls("/") + assert 3 == len(entries) + assert "memory:/test.txt" in entries + assert "memory:/foo.txt" in entries + assert "memory:/dir" in entries + entries = adapter.ls("dir") + assert 2 == len(entries) + assert "memory:/dir/bar.txt" in entries + assert "memory:/dir/subdir" in entries + entries = adapter.ls("test.txt") + assert 1 == len(entries) + assert "memory:/test.txt" in entries + try: + adapter.ls("non-existent.txt") + pytest.fail() + except FileNotFoundError: + pass + + # stat + stat = adapter.stat("test.txt") + assert stat is not None + assert stat["name"] == "/test.txt" + assert stat["type"] == "file" + stat = adapter.stat("dir") + assert stat is not None + assert stat["name"] == "/dir" + assert stat["type"] == "directory" + try: + adapter.stat("non-existent.txt") + pytest.fail() + except FileNotFoundError: + pass + + # exists + assert adapter.exists("test.txt") + assert not adapter.exists("non-existent.txt") + + # isdir + assert adapter.isdir("dir") + assert not adapter.isdir("test.txt") + assert not adapter.isdir("non-existent.txt") + + # isfile + assert adapter.isfile("test.txt") + assert not adapter.isfile("dir") + assert not adapter.isfile("non-existent.txt") + + # walk + for root, dirs, files in adapter.walk("/"): + if root == "memory:": + assert dirs == ["memory:dir"] + assert files == ["memory:foo.txt", "memory:test.txt"] + elif root == "memory:/dir": + assert dirs == ["memory:subdir"] + assert files == ["memory:bar.txt"] + elif root == "memory:/dir/subdir": + assert len(dirs) == 0 + assert files == ["memory:baz.txt"] + else: + pytest.fail(f"unexpected dir: {root}") + + # glob + # the expected results come from built-in glob lib. 
+ expected = [ + "memory:foo.txt", + "memory:dir", + "memory:dir/subdir", + "memory:dir/subdir/baz.txt", + "memory:dir/bar.txt", + "memory:test.txt", + ] + expected.sort() + actual = adapter.glob("**", recursive=True) + actual.sort() + assert actual == expected + expected = ["memory:foo.txt"] + actual = adapter.glob("**/foo.txt", recursive=True) + assert actual == expected + expected = ["memory:dir/bar.txt"] + actual = adapter.glob("**/bar.txt", recursive=True) + assert actual == expected + expected = ["memory:dir/subdir/baz.txt"] + actual = adapter.glob("**/baz.txt", recursive=True) + assert actual == expected + expected = ["memory:dir/bar.txt", "memory:dir/subdir/baz.txt"] + actual = adapter.glob("**/ba[rz].txt", recursive=True) + assert actual == expected + actual = adapter.glob("**/ba?.txt", recursive=True) + assert actual == expected + expected = ["memory:foo.txt", "memory:test.txt", "memory:dir"] + expected.sort() + actual = adapter.glob("**", recursive=False) + actual.sort() + assert actual == expected + actual = adapter.glob("*", recursive=False) + actual.sort() + assert actual == expected + expected = ["memory:foo.txt", "memory:test.txt"] + expected.sort() + actual = adapter.glob("*.txt", recursive=False) + actual.sort() + assert actual == expected diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_oss.py b/python/xorbits/_mars/lib/filesystem/tests/test_oss.py new file mode 100644 index 000000000..5ce3be7cd --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_oss.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from io import BytesIO + +import pytest + +from .... import dataframe as md +from ....tests.core import mock +from .. import oss +from .._oss_lib import glob as og +from .._oss_lib.common import OSSFileEntry +from ..oss import build_oss_path + + +class OSSObjInfo: + def __init__(self, name, content): + self.key = name + # Use the current time as "Last-Modified" in the test. 
+ self.last_modified = int(time.time()) + self.size = len(content.encode("utf8")) + + +class ObjectMeta: + def __init__(self, key, obj_dict): + self.headers = {} + self.headers["Last-Modified"] = int(time.time()) + self.headers["Content-Length"] = len(obj_dict[key].encode("utf8")) + + +class MockObject: + def __init__(self, obj_dict, key, byte_range): + self._stream = BytesIO(obj_dict[key].encode("utf8")) + self._byte_range = byte_range + + def read(self): + self._stream.seek(self._byte_range[0]) + if self._byte_range[1] is None: + return self._stream.read() + else: + size = self._byte_range[1] - self._byte_range[0] + 1 + return self._stream.read(size) + + +class SideEffectBucket: + def __init__(self, *_, **__): + self.obj_dict = { + "file.csv": "id1,id2,id3\n1,2,3\n", + "dir/": "", + "dir/file1.csv": "2", + "dir/file2.csv": "3", + "dir/subdir/": "", + "dir/subdir/file3.csv": "s4", + "dir/subdir/file4.csv": "s5", + "dir2/": "", + "dir2/file6.csv": "6", + "dir2/file7.csv": "7", + } + + def get_object_meta(self, key): + return ObjectMeta(key, self.obj_dict) + + def object_exists(self, key): + return key in self.obj_dict.keys() + + def get_object(self, key, byte_range): + return MockObject(self.obj_dict, key, byte_range) + + +class SideEffectObjIter: + def __init__(self, *args, **kwargs): + self.bucket = args[0] + self.prefix = kwargs["prefix"] + + def __iter__(self): + for name, content in self.bucket.obj_dict.items(): + if name.startswith(self.prefix): + yield OSSObjInfo(name, content) + + +@mock.patch("oss2.Bucket", side_effect=SideEffectBucket) +@mock.patch("oss2.ObjectIteratorV2", side_effect=SideEffectObjIter) +def test_oss_filesystem(fake_obj_iter, fake_oss_bucket, setup): + access_key_id = "your_access_key_id" + access_key_secret = "your_access_key_secret" + end_point = "your_endpoint" + + file_path = f"oss://bucket/file.csv" + dir_path = f"oss://bucket/dir/" + dir_path_content_magic = f"oss://bucket/dir*/" + other_scheme_path = f"scheme://netloc/path" + not_exist_file_path = f"oss://bucket/not_exist.csv" + + fake_file_path = build_oss_path( + file_path, access_key_id, access_key_secret, end_point + ) + fake_dir_path = build_oss_path( + dir_path, access_key_id, access_key_secret, end_point + ) + fake_dir_path_contains_magic = build_oss_path( + dir_path_content_magic, access_key_id, access_key_secret, end_point + ) + fake_other_scheme_path = build_oss_path( + other_scheme_path, access_key_id, access_key_secret, end_point + ) + fake_not_exist_file_path = build_oss_path( + not_exist_file_path, access_key_id, access_key_secret, end_point + ) + fs = oss.OSSFileSystem.get_instance() + + # Test OSSFileSystem. 
+ assert len(fs.ls(fake_dir_path)) == 4 + assert not fs.isfile(fake_dir_path) + assert fs.isdir(fake_dir_path) + assert not fs.isdir(fake_file_path) + assert fs.isfile(fake_file_path) + assert fs.exists(fake_file_path) + assert not fs.exists(fake_not_exist_file_path) + assert fs.stat(fake_file_path)["type"] == "file" + assert fs.stat(fake_dir_path)["type"] == "directory" + assert fs.glob(fake_dir_path) == [fake_dir_path] + + with pytest.raises(ValueError) as e: + fs.exists(fake_other_scheme_path) + msg1 = e.value.args[0] + assert ( + msg1 == f"Except scheme oss, but got scheme: " + f"scheme in path: {fake_other_scheme_path}" + ) + + with pytest.raises(RuntimeError) as e: + fs.exists(file_path) + msg2 = e.value.args[0] + assert msg2 == "Please use build_oss_path to add OSS info" + + with pytest.raises(OSError): + print(fs.ls(fake_file_path)) + + assert len(fs.glob(fake_file_path)) == 1 + assert len(fs.glob(fake_dir_path + "*", recursive=True)) == 4 + assert len(fs.glob(fake_dir_path_contains_magic)) == 2 + + # Test the specific functions of glob. + assert og.has_magic(b"*") + assert og.escape(b"*") == b"[*]" + assert og.escape("*") == "[*]" + + # test OSSIOBase + with fs.open(fake_file_path) as f: + assert f.readline() == b"id1,id2,id3\n" + assert f.readline() == b"1,2,3\n" + f.seek(-1, 2) + assert f.readline() == b"\n" + with pytest.raises(AttributeError): + f.fileno() + with pytest.raises(OSError): + f.seek(-1) + with pytest.raises(OSError): + f.seek(-100, 1) + with pytest.raises(ValueError): + f.seek(1, 3) + f.seek(0) + assert f.read() == b"id1,id2,id3\n1,2,3\n" + f.seek(0) + assert f.readline(2) == b"id" + f.seek(0) + with pytest.raises(TypeError): + f.readline("2") + + fe = OSSFileEntry(fake_file_path) + assert fe.path == fake_file_path + + df = md.read_csv(fake_file_path).execute() + assert df.shape == (1, 3) diff --git a/python/xorbits/_mars/lib/filesystem/tests/test_s3.py b/python/xorbits/_mars/lib/filesystem/tests/test_s3.py new file mode 100644 index 000000000..d01a2be20 --- /dev/null +++ b/python/xorbits/_mars/lib/filesystem/tests/test_s3.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
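+
+# A short summary of the merge rule exercised by these tests (values are
+# placeholders): S3FileSystem.get_storage_options first fills client_kwargs
+# from the AWS_ENDPOINT_URL / AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY /
+# AWS_SESSION_TOKEN environment variables, then lets an explicitly passed
+# storage_options["client_kwargs"] override individual entries, e.g.:
+#
+#     os.environ["AWS_ENDPOINT_URL"] = "a"
+#     S3FileSystem.get_storage_options(
+#         {"client_kwargs": {"endpoint_url": "b"}}, "s3://bucket/example.csv"
+#     )
+#     # -> {"client_kwargs": {"endpoint_url": "b"}}
+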
+import os + +import pytest + +from ....dataframe import read_csv +from ..core import register_filesystem +from ..s3 import S3FileSystem + + +class KwArgsException(Exception): + def __init__(self, kwargs): + self.kwargs = kwargs + + +if S3FileSystem is not None: + + class TestS3FileSystem(S3FileSystem): + def __init__(self, **kwargs): + super().__init__(**kwargs) + raise KwArgsException(kwargs) + +else: + TestS3FileSystem = None + + +@pytest.mark.skipif(S3FileSystem is None, reason="S3 is not supported") +def test_client_kwargs(): + register_filesystem("s3", TestS3FileSystem) + + test_kwargs = { + "endpoint_url": "http://192.168.1.12:9000", + "aws_access_key_id": "test_id", + "aws_secret_access_key": "test_key", + "aws_session_token": "test_session_token", + } + + def _assert_true(): + # Pass endpoint_url / aws_access_key_id / aws_secret_access_key / aws_session_token to read_csv. + with pytest.raises(KwArgsException) as e: + read_csv( + "s3://bucket/example.csv", + index_col=0, + storage_options={"client_kwargs": test_kwargs}, + ) + assert e.value.kwargs == { + "client_kwargs": { + "endpoint_url": "http://192.168.1.12:9000", + "aws_access_key_id": "test_id", + "aws_secret_access_key": "test_key", + "aws_session_token": "test_session_token", + } + } + + _assert_true() + + test_env = { + "AWS_ENDPOINT_URL": "a", + "AWS_ACCESS_KEY_ID": "b", + "AWS_SECRET_ACCESS_KEY": "c", + "AWS_SESSION_TOKEN": "d", + } + for k, v in test_env.items(): + os.environ[k] = v + + try: + _assert_true() + + for k, v in test_kwargs.items(): + with pytest.raises(KwArgsException) as e: + read_csv( + "s3://bucket/example.csv", + index_col=0, + storage_options={"client_kwargs": {k: v}}, + ) + expect = { + "endpoint_url": "a", + "aws_access_key_id": "b", + "aws_secret_access_key": "c", + "aws_session_token": "d", + } + expect[k] = v + assert e.value.kwargs == {"client_kwargs": expect} + finally: + for k, v in test_env.items(): + os.environ.pop(k, None) diff --git a/python/xorbits/_mars/lib/groupby_wrapper.py b/python/xorbits/_mars/lib/groupby_wrapper.py new file mode 100644 index 000000000..f1477a086 --- /dev/null +++ b/python/xorbits/_mars/lib/groupby_wrapper.py @@ -0,0 +1,279 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
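+
+"""
+A minimal usage sketch (the frame below is illustrative): ``wrapped_groupby``
+mirrors ``DataFrame.groupby`` but returns a ``GroupByWrapper``, which supports
+column selection and attribute forwarding like a regular groupby object while
+also being picklable, so it can be shipped between processes.
+
+>>> import pandas as pd
+>>> from xorbits._mars.lib.groupby_wrapper import wrapped_groupby
+>>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+>>> wrapped = wrapped_groupby(df, by="a")
+>>> result = wrapped["b"].sum()        # behaves like df.groupby("a")["b"].sum()
+>>> import pickle
+>>> restored = pickle.loads(pickle.dumps(wrapped))  # survives serialization
+"""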
+ +import sys +from collections.abc import Iterable + +import cloudpickle +import numpy as np +from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy + +from ..utils import estimate_pandas_size, no_default, pd_release_version + +_HAS_SQUEEZE = pd_release_version < (1, 1, 0) +_HAS_DROPNA = pd_release_version >= (1, 1, 0) +_GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0) + +_default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True + + +class GroupByWrapper: + def __init__( + self, + obj, + groupby_obj=None, + keys=None, + axis=0, + level=None, + grouper=None, + exclusions=None, + selection=None, + as_index=True, + sort=True, + group_keys=_default_group_keys, + squeeze=False, + observed=False, + dropna=True, + mutated=False, + grouper_cache=None, + ): + def fill_value(v, key): + return ( + v if v is not None or groupby_obj is None else getattr(groupby_obj, key) + ) + + self.obj = obj + self.keys = fill_value(keys, "keys") + self.axis = fill_value(axis, "axis") + self.level = fill_value(level, "level") + self.exclusions = fill_value(exclusions, "exclusions") + self.selection = selection + self.as_index = fill_value(as_index, "as_index") + self.sort = fill_value(sort, "sort") + self.group_keys = fill_value(group_keys, "group_keys") + self.squeeze = fill_value(squeeze, "squeeze") + self.observed = fill_value(observed, "observed") + self.mutated = fill_value(mutated, "mutated") + self.dropna = fill_value(dropna, "dropna") + + if groupby_obj is None: + groupby_kw = dict( + keys=keys, + axis=axis, + level=level, + grouper=grouper, + exclusions=exclusions, + as_index=as_index, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + mutated=mutated, + dropna=dropna, + ) + if not _HAS_SQUEEZE: # pragma: no branch + groupby_kw.pop("squeeze") + if not _HAS_DROPNA: # pragma: no branch + groupby_kw.pop("dropna") + + if obj.ndim == 2: + self.groupby_obj = DataFrameGroupBy(obj, **groupby_kw) + else: + self.groupby_obj = SeriesGroupBy(obj, **groupby_kw) + else: + self.groupby_obj = groupby_obj + + if grouper_cache: + self.groupby_obj.grouper._cache = grouper_cache + if selection: + self.groupby_obj = self.groupby_obj[selection] + + self.is_frame = isinstance(self.groupby_obj, DataFrameGroupBy) + + def __getitem__(self, item): + return GroupByWrapper( + self.obj, + keys=self.keys, + axis=self.axis, + level=self.level, + grouper=self.groupby_obj.grouper, + exclusions=self.exclusions, + selection=item, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + observed=self.observed, + dropna=self.dropna, + mutated=self.mutated, + ) + + def __getattr__(self, item): + if item.startswith("_"): # pragma: no cover + return object.__getattribute__(self, item) + if item in getattr(self.obj, "columns", ()): + return self.__getitem__(item) + return getattr(self.groupby_obj, item) + + def __iter__(self): + return self.groupby_obj.__iter__() + + def __sizeof__(self): + return sys.getsizeof(self.obj) + sys.getsizeof( + getattr(self.groupby_obj.grouper, "_cache", None) + ) + + def estimate_size(self): + return estimate_pandas_size(self.obj) + estimate_pandas_size(self.obj.index) + + def __reduce__(self): + return ( + type(self).from_tuple, + (self.to_tuple(pickle_function=True, truncate=True),), + ) + + def __bool__(self): + return bool(np.prod(self.shape)) + + @property + def empty(self): + return self.obj.empty + + @property + def shape(self): + shape = list(self.groupby_obj.obj.shape) + if self.is_frame and self.selection: + shape[1] = 
len(self.selection) + return tuple(shape) + + @property + def _selected_obj(self): + return getattr(self.groupby_obj, "_selected_obj") + + def to_tuple(self, truncate=False, pickle_function=False): + if self.selection and truncate: + if isinstance(self.selection, Iterable) and not isinstance( + self.selection, str + ): + item_list = list(self.selection) + else: + item_list = [self.selection] + item_set = set(item_list) + + if isinstance(self.keys, list): + sel_keys = self.keys + elif self.keys in self.obj.columns: + sel_keys = [self.keys] + else: + sel_keys = [] + + all_items = item_list + [k for k in sel_keys or () if k not in item_set] + if set(all_items) == set(self.obj.columns): + obj = self.obj + else: + obj = self.obj[all_items] + else: + obj = self.obj + + if pickle_function and callable(self.keys): + keys = cloudpickle.dumps(self.keys) + else: + keys = self.keys + + return ( + obj, + keys, + self.axis, + self.level, + self.exclusions, + self.selection, + self.as_index, + self.sort, + self.group_keys, + self.squeeze, + self.observed, + self.dropna, + self.mutated, + getattr(self.groupby_obj.grouper, "_cache", dict()), + ) + + @classmethod + def from_tuple(cls, tp): + ( + obj, + keys, + axis, + level, + exclusions, + selection, + as_index, + sort, + group_keys, + squeeze, + observed, + dropna, + mutated, + grouper_cache, + ) = tp + + if isinstance(keys, (bytes, bytearray)): + keys = cloudpickle.loads(keys) + + return cls( + obj, + keys=keys, + axis=axis, + level=level, + exclusions=exclusions, + selection=selection, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + dropna=dropna, + mutated=mutated, + grouper_cache=grouper_cache, + ) + + +def wrapped_groupby( + obj, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=_default_group_keys, + squeeze=False, + observed=False, + dropna=True, +): + groupby_kw = dict( + by=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + dropna=dropna, + ) + if not _HAS_SQUEEZE: # pragma: no branch + groupby_kw.pop("squeeze") + if not _HAS_DROPNA: # pragma: no branch + groupby_kw.pop("dropna") + + groupby_obj = obj.groupby(**groupby_kw) + return GroupByWrapper(obj, groupby_obj=groupby_obj, as_index=as_index) diff --git a/python/xorbits/_mars/lib/mkl_interface.py b/python/xorbits/_mars/lib/mkl_interface.py new file mode 100644 index 000000000..c864ecf5c --- /dev/null +++ b/python/xorbits/_mars/lib/mkl_interface.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
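+
+# A small usage sketch: each helper below stays ``None`` when the MKL runtime
+# (libmkl_rt) cannot be located under sys.prefix, so callers should check
+# before use, e.g.:
+#
+#     from xorbits._mars.lib import mkl_interface
+#
+#     if mkl_interface.mkl_get_version is not None:
+#         version = mkl_interface.mkl_get_version()
+#         print(version.major, version.minor, version.update)
+#     if mkl_interface.mkl_mem_stat is not None:
+#         allocated_bytes, n_buffers = mkl_interface.mkl_mem_stat()
+#     if mkl_interface.mkl_free_buffers is not None:
+#         mkl_interface.mkl_free_buffers()   # release MKL memory buffers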
+ +import ctypes +import os +import sys + + +def _load_mkl_rt(lib_name): + """ + Load certain MKL library + """ + if sys.platform.startswith("win"): + lib_path = os.path.join(sys.prefix, "Library", "bin", lib_name + ".dll") + elif sys.platform == "darwin": + lib_path = os.path.join(sys.prefix, "lib", "lib" + lib_name + ".dylib") + else: + lib_path = os.path.join(sys.prefix, "lib", "lib" + lib_name + ".so") + if not os.path.exists(lib_path): + lib_path = None + + if lib_path: + return ctypes.cdll.LoadLibrary(lib_path) + + +class MKLVersion(ctypes.Structure): + _fields_ = [ + ("major", ctypes.c_int), + ("minor", ctypes.c_int), + ("update", ctypes.c_int), + ("product_status", ctypes.c_char_p), + ("build", ctypes.c_char_p), + ("processor", ctypes.c_char_p), + ("platform", ctypes.c_char_p), + ] + + +mkl_free_buffers = None +mkl_get_version = None +mkl_mem_stat = None + +mkl_rt = _load_mkl_rt("mkl_rt") +if mkl_rt: + try: + mkl_free_buffers = mkl_rt.mkl_free_buffers + mkl_free_buffers.argtypes = [] + mkl_free_buffers.restype = None + except AttributeError: # pragma: no cover + pass + + try: + _mkl_mem_stat = mkl_rt.mkl_mem_stat + _mkl_mem_stat.argtypes = [ctypes.POINTER(ctypes.c_int32)] + _mkl_mem_stat.restype = ctypes.c_int64 + + def mkl_mem_stat(): + n_bufs = ctypes.c_int32(0) + size = _mkl_mem_stat(ctypes.pointer(n_bufs)) + return size, n_bufs.value + + except AttributeError: # pragma: no cover + pass + + try: + _mkl_get_version = mkl_rt.mkl_get_version + _mkl_get_version.argtypes = [ctypes.POINTER(MKLVersion)] + _mkl_get_version.restype = None + + def mkl_get_version(): + version = MKLVersion() + _mkl_get_version(version) + return version + + except AttributeError: # pragma: no cover + pass diff --git a/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.cpp b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.cpp new file mode 100755 index 000000000..4c73436da --- /dev/null +++ b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.cpp @@ -0,0 +1,339 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && GNUC_MINOR >= 4)) +/* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. 
Don't inline for RHEL 5 gcc which is 4.1*/ +#define FORCE_INLINE inline __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, Py_ssize_t len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const Py_ssize_t nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(Py_ssize_t i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const Py_ssize_t len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const Py_ssize_t nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(Py_ssize_t i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= 
k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const Py_ssize_t len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const Py_ssize_t nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(Py_ssize_t i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- diff --git a/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.h b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.h new file mode 100755 
index 000000000..75e248c1b --- /dev/null +++ b/python/xorbits/_mars/lib/mmh3_src/MurmurHash3.h @@ -0,0 +1,43 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + + +// To handle 64-bit data; see https://docs.python.org/2.7/c-api/arg.html +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif +#include + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) +typedef unsigned __int8 uint8_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, Py_ssize_t len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, Py_ssize_t len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, Py_ssize_t len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/python/xorbits/_mars/lib/mmh3_src/mmh3module.cpp b/python/xorbits/_mars/lib/mmh3_src/mmh3module.cpp new file mode 100755 index 000000000..d5b17104d --- /dev/null +++ b/python/xorbits/_mars/lib/mmh3_src/mmh3module.cpp @@ -0,0 +1,387 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. mmh3 Python module was written by Hajime Senuma, +// and is also placed in the public domain. +// The authors hereby disclaim copyright to these source codes. 
+ +// To handle 64-bit data; see https://docs.python.org/2.7/c-api/arg.html +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif + +#include +#include +#include +#include "MurmurHash3.h" + +#if defined(_MSC_VER) +typedef signed __int8 int8_t; +typedef signed __int32 int32_t; +typedef signed __int64 int64_t; +typedef unsigned __int8 uint8_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +// Other compilers +#else // defined(_MSC_VER) +#include +#endif // !defined(_MSC_VER) + +static int +_GetMemoryViewDataAndSize(PyObject *mview, const char **target_str, + Py_ssize_t *target_str_len) { + Py_buffer *mview_buffer = NULL; + + if (!PyMemoryView_Check(mview)) { + PyErr_Format(PyExc_TypeError, "key must be byte-like object " + "or memoryview, not '%.200s'", + mview->ob_type->tp_name); + return 0; + } + + mview_buffer = PyMemoryView_GET_BUFFER(mview); + *target_str = (const char *)mview_buffer->buf; + *target_str_len = mview_buffer->len; + return 1; +} + +static PyObject * +mmh3_hash(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + int32_t result[1]; + long long_result = 0; + int is_signed = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"signed", NULL}; + +#ifndef _MSC_VER + static uint64_t mask[] = {0x0ffffffff, 0xffffffffffffffff}; +#endif + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IB", kwlist, + &target_str, &target_str_len, &seed, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IB", kwlist, + &target_mview, &seed, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + MurmurHash3_x86_32(target_str, target_str_len, seed, result); + + if (target_mview) { + Py_DECREF(target_mview); + } + +#if defined(_MSC_VER) + /* for Windows envs */ + long_result = result[0]; + if (is_signed == 1) { + return PyLong_FromLong(long_result); + } else { + return PyLong_FromUnsignedLong(long_result); + } +#else + /* for standard envs */ + long_result = result[0] & mask[is_signed]; + return PyLong_FromLong(long_result); +#endif +} + +static PyObject * +mmh3_hash_from_buffer(PyObject *self, PyObject *args, PyObject *keywds) +{ + Py_buffer target_buf; + Py_buffer *target_buf_ptr; + PyObject *target_mview = NULL; + uint32_t seed = 0; + int32_t result[1]; + long long_result = 0; + int is_signed = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"signed", NULL}; + +#ifndef _MSC_VER + static uint64_t mask[] = {0x0ffffffff, 0xffffffffffffffff}; +#endif + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s*|IB", kwlist, + &target_buf, &seed, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IB", kwlist, + &target_mview, &seed, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!PyMemoryView_Check(target_mview)) { + PyErr_Format(PyExc_TypeError, "key must be byte-like object " + "or memoryview, not '%.200s'", + target_mview->ob_type->tp_name); + Py_DECREF(target_mview); + return NULL; + } + + target_buf_ptr = PyMemoryView_GET_BUFFER(target_mview); + } else { + target_buf_ptr = &target_buf; + } + + MurmurHash3_x86_32(target_buf_ptr->buf, target_buf_ptr->len, seed, result); + + if (target_mview) { + Py_DECREF(target_mview); + } + +#if defined(_MSC_VER) + /* for Windows envs */ + long_result = 
result[0]; + if (is_signed == 1) { + return PyLong_FromLong(long_result); + } else { + return PyLong_FromUnsignedLong(long_result); + } +#else + /* for standard envs */ + long_result = result[0] & mask[is_signed]; + return PyLong_FromLong(long_result); +#endif +} + +static PyObject * +mmh3_hash64(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + uint64_t result[2]; + char x64arch = 1; + int is_signed = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"x64arch", (char *)"signed", NULL}; + + static char *valflag[] = {(char *) "KK", (char *) "LL"}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IBB", kwlist, + &target_str, &target_str_len, &seed, &x64arch, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IBB", kwlist, + &target_mview, &seed, &x64arch, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + if (x64arch == 1) { + MurmurHash3_x64_128(target_str, target_str_len, seed, result); + } else { + MurmurHash3_x86_128(target_str, target_str_len, seed, result); + } + + if (target_mview) { + Py_DECREF(target_mview); + } + + PyObject *retval = Py_BuildValue(valflag[is_signed], result[0], result[1]); + return retval; +} + +static PyObject * +mmh3_hash128(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + uint64_t result[2]; + char x64arch = 1; + char is_signed = 0; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"x64arch", (char *)"signed", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IBB", kwlist, + &target_str, &target_str_len, &seed, &x64arch, &is_signed)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IBB", kwlist, + &target_mview, &seed, &x64arch, &is_signed)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + if (x64arch == 1) { + MurmurHash3_x64_128(target_str, target_str_len, seed, result); + } else { + MurmurHash3_x86_128(target_str, target_str_len, seed, result); + } + + if (target_mview) { + Py_DECREF(target_mview); + } + + /** + * _PyLong_FromByteArray is not a part of official Python/C API + * and can be displaced (although it is practically stable). cf. 
+ * https://mail.python.org/pipermail/python-list/2006-August/372368.html + */ + PyObject *retval = _PyLong_FromByteArray((unsigned char *)result, 16, 1, is_signed); + + return retval; +} + +static PyObject * +mmh3_hash_bytes(PyObject *self, PyObject *args, PyObject *keywds) +{ + const char *target_str = NULL; + Py_ssize_t target_str_len; + PyObject *target_mview = NULL; + uint32_t seed = 0; + uint32_t result[4]; + char x64arch = 1; + + static char *kwlist[] = {(char *)"key", (char *)"seed", + (char *)"x64arch", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|IB", kwlist, + &target_str, &target_str_len, &seed, &x64arch)) { + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|IB", kwlist, + &target_mview, &seed, &x64arch)) { + return NULL; + } + PyErr_Clear(); + Py_INCREF(target_mview); + + if (!_GetMemoryViewDataAndSize(target_mview, &target_str, &target_str_len)) { + Py_DECREF(target_mview); + return NULL; + } + } + + if (x64arch == 1) { + MurmurHash3_x64_128(target_str, target_str_len, seed, result); + } else { + MurmurHash3_x86_128(target_str, target_str_len, seed, result); + } + + if (target_mview) { + Py_DECREF(target_mview); + } + + char bytes[16]; + memcpy(bytes, result, 16); + return PyBytes_FromStringAndSize(bytes, 16); +} + +struct module_state { + PyObject *error; +}; + +#if PY_MAJOR_VERSION >= 3 +#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else +#define GETSTATE(m) (&_state) +static struct module_state _state; +#endif + +static PyMethodDef Mmh3Methods[] = { + {"hash", (PyCFunction)mmh3_hash, METH_VARARGS | METH_KEYWORDS, + "hash(key[, seed=0, signed=True]) -> hash value\n Return a 32 bit integer."}, + {"hash_from_buffer", (PyCFunction)mmh3_hash_from_buffer, METH_VARARGS | METH_KEYWORDS, + "hash_from_buffer(key[, seed=0, signed=True]) -> hash value from a memory buffer\n Return a 32 bit integer. Designed for large memory-views such as numpy arrays."}, + {"hash64", (PyCFunction)mmh3_hash64, METH_VARARGS | METH_KEYWORDS, + "hash64(key[, seed=0, x64arch=True, signed=True]) -> (hash value 1, hash value 2)\n Return a tuple of two 64 bit integers for a string. Optimized for the x64 bit architecture when x64arch=True, otherwise for x86."}, + {"hash128", (PyCFunction)mmh3_hash128, METH_VARARGS | METH_KEYWORDS, + "hash128(key[, seed=0, x64arch=True, signed=False]]) -> hash value\n Return a 128 bit long integer. Optimized for the x64 bit architecture when x64arch=True, otherwise for x86."}, + {"hash_bytes", (PyCFunction)mmh3_hash_bytes, + METH_VARARGS | METH_KEYWORDS, + "hash_bytes(key[, seed=0, x64arch=True]) -> bytes\n Return a 128 bit hash value as bytes for a string. 
Optimized for the x64 bit architecture when x64arch=True, otherwise for the x86."}, + {NULL, NULL, 0, NULL} +}; + +#if PY_MAJOR_VERSION >= 3 + +static int mmh3_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int mmh3_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + return 0; +} + +static struct PyModuleDef mmh3module = { + PyModuleDef_HEAD_INIT, + "mmh3", + "mmh3 is a Python front-end to MurmurHash3, a fast and robust hash library created by Austin Appleby (http://code.google.com/p/smhasher/).\n Ported by Hajime Senuma \n Try hash('foobar') or hash('foobar', 1984).\n If you find any bugs, please submit an issue via https://github.com/hajimes/mmh3", + sizeof(struct module_state), + Mmh3Methods, + NULL, + mmh3_traverse, + mmh3_clear, + NULL +}; + +#define INITERROR return NULL + +extern "C" { +PyMODINIT_FUNC +PyInit_mmh3(void) + +#else // PY_MAJOR_VERSION >= 3 +#define INITERROR return + +extern "C" { +void +initmmh3(void) +#endif // PY_MAJOR_VERSION >= 3 + +{ +#if PY_MAJOR_VERSION >= 3 + PyObject *module = PyModule_Create(&mmh3module); +#else + PyObject *module = Py_InitModule("mmh3", Mmh3Methods); +#endif + + if (module == NULL) + INITERROR; + + PyModule_AddStringConstant(module, "__version__", "2.5.1"); + + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException((char *) "mmh3.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif +} +} // extern "C" diff --git a/python/xorbits/_mars/lib/nvutils.py b/python/xorbits/_mars/lib/nvutils.py new file mode 100644 index 000000000..d3af99bfa --- /dev/null +++ b/python/xorbits/_mars/lib/nvutils.py @@ -0,0 +1,713 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
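Editor's note: the mmh3module.cpp bindings above expose hash, hash_from_buffer, hash64, hash128 and hash_bytes with the signatures given in their docstrings. A small usage sketch follows; it assumes the extension is built and importable as xorbits._mars.lib.mmh3, which is inferred from the vendored file layout rather than stated in this hunk.

# Illustrative only -- the import path is an assumption based on the vendored layout.
from xorbits._mars.lib import mmh3

h32 = mmh3.hash("foobar")                              # signed 32-bit integer
h32u = mmh3.hash("foobar", seed=1984, signed=False)    # unsigned variant
lo64, hi64 = mmh3.hash64("foobar")                     # two 64-bit integers (x64 path by default)
h128 = mmh3.hash128("foobar")                          # single 128-bit integer
digest = mmh3.hash_bytes("foobar")                     # 16 raw bytes
assert len(digest) == 16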
+ +import dataclasses +import logging +import os +import subprocess +import sys +import uuid +from collections import namedtuple +from ctypes import ( + CDLL, + POINTER, + Structure, + byref, + c_char, + c_char_p, + c_int, + c_uint, + c_ulonglong, + create_string_buffer, +) +from typing import List, Optional, Tuple, Union + +from ..utils import parse_readable_size + +logger = logging.getLogger(__name__) + +# Some constants taken from cuda.h +CUDA_SUCCESS = 0 +CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16 +CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39 +CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13 +CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33 +CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34 +CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36 + +CU_NO_CUDA_CAPABLE_DEVICE_DETECTED = 100 + +# nvml constants +NVML_SUCCESS = 0 +NVML_ERROR_UNINITIALIZED = 1 +NVML_ERROR_INVALID_ARGUMENT = 2 +NVML_ERROR_NOT_SUPPORTED = 3 +NVML_ERROR_NO_PERMISSION = 4 +NVML_ERROR_ALREADY_INITIALIZED = 5 +NVML_ERROR_NOT_FOUND = 6 +NVML_ERROR_INSUFFICIENT_SIZE = 7 +NVML_ERROR_INSUFFICIENT_POWER = 8 +NVML_ERROR_DRIVER_NOT_LOADED = 9 +NVML_ERROR_TIMEOUT = 10 +NVML_ERROR_IRQ_ISSUE = 11 +NVML_ERROR_LIBRARY_NOT_FOUND = 12 +NVML_ERROR_FUNCTION_NOT_FOUND = 13 +NVML_ERROR_CORRUPTED_INFOROM = 14 +NVML_ERROR_GPU_IS_LOST = 15 +NVML_ERROR_RESET_REQUIRED = 16 +NVML_ERROR_OPERATING_SYSTEM = 17 +NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18 +NVML_ERROR_IN_USE = 19 +NVML_ERROR_MEMORY = 20 +NVML_ERROR_NO_DATA = 21 +NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22 +NVML_ERROR_INSUFFICIENT_RESOURCES = 23 +NVML_ERROR_FREQ_NOT_SUPPORTED = 24 +NVML_ERROR_UNKNOWN = 999 +NVML_TEMPERATURE_GPU = 0 +NVML_DRIVER_NOT_LOADED = 9 +NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96 +NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1) +NVML_DEVICE_MIG_DISABLE = 0x0 +NVML_DEVICE_MIG_ENABLE = 0x1 + + +class _CUuuid_t(Structure): + _fields_ = [("bytes", c_char * 16)] + + +class _nvmlUtilization_t(Structure): + _fields_ = [ + ("gpu", c_uint), + ("memory", c_uint), + ] + + +class _struct_nvmlDevice_t(Structure): + pass # opaque handle + + +_nvmlDevice_t = POINTER(_struct_nvmlDevice_t) + + +class _nvmlBAR1Memory_t(Structure): + _fields_ = [ + ("total", c_ulonglong), + ("free", c_ulonglong), + ("used", c_ulonglong), + ] + + +class _nvmlProcessInfo_t(Structure): + _fields_ = [ + ("pid", c_uint), + ("usedGpuMemory", c_ulonglong), + ("gpuInstanceId", c_uint), + ("computeInstanceId", c_uint), + ] + + +## Alternative object +# Allows the object to be printed +# Allows mismatched types to be assigned +# - like None when the Structure variant requires c_uint +class nvmlFriendlyObject: + def __init__(self, dictionary): + for x in dictionary: + setattr(self, x, dictionary[x]) + + def __str__(self): + return self.__dict__.__str__() + + +def nvmlStructToFriendlyObject(struct): + d = {} + for x in struct._fields_: + key = x[0] + value = getattr(struct, key) + # only need to convert from bytes if bytes, no need to check python version. 
+ d[key] = value.decode() if isinstance(value, bytes) else value + obj = nvmlFriendlyObject(d) + return obj + + +@dataclasses.dataclass +class CudaDeviceInfo: + uuid: bytes = None + device_index: int = None + mig_index: int = None + + +@dataclasses.dataclass +class CudaContext: + has_context: bool + device_info: CudaDeviceInfo = None + + +_is_windows: bool = sys.platform.startswith("win") +_is_wsl: bool = "WSL_DISTRO_NAME" in os.environ + + +def _load_nv_library(*libnames): + for lib in libnames: + try: + return CDLL(lib) + except OSError: + continue + + +_cuda_lib = _nvml_lib = None + +_cu_device_info = namedtuple( + "_cu_device_info", "index uuid name multiprocessors cuda_cores threads" +) +_nvml_driver_info = namedtuple("_nvml_driver_info", "driver_version cuda_version") +_nvml_device_status = namedtuple( + "_nvml_device_status", + "gpu_util mem_util temperature fb_total_mem fb_used_mem fb_free_mem", +) + +_init_pid = None +_gpu_count = None +_driver_info = None +_device_infos = dict() + +_no_device_warned = False + + +class NVError(Exception): + def __init__(self, msg, *args, errno=None): + self._errno = errno + super().__init__(msg or "Unknown error", *args) + + def __str__(self): + return f"({self._errno}) {super().__str__()}" + + @property + def errno(self): + return self._errno + + @property + def message(self): + return super().__str__() + + +class NVDeviceAPIError(NVError): + pass + + +class NVMLAPIError(NVError): + pass + + +def _cu_check_error(result): + if result != CUDA_SUCCESS: + _error_str = c_char_p() + _cuda_lib.cuGetErrorString(result, byref(_error_str)) + err_value = _error_str.value.decode() if _error_str.value is not None else None + raise NVDeviceAPIError(err_value, errno=result) + + +_nvmlErrorString = None + + +def _nvml_check_error(result): + global _nvmlErrorString + if _nvmlErrorString is None: + _nvmlErrorString = _nvml_lib.nvmlErrorString + _nvmlErrorString.restype = c_char_p + + if result != NVML_SUCCESS: + _error_str = _nvmlErrorString(result) + raise NVMLAPIError(_error_str.decode(), errno=result) + + +_cu_process_var_to_cores = { + (1, 0): 8, + (1, 1): 8, + (1, 2): 8, + (1, 3): 8, + (2, 0): 32, + (2, 1): 48, +} + + +def _cu_get_processor_cores(major, minor): + return _cu_process_var_to_cores.get((major, minor), 192) + + +def _init_cp(): + global _cuda_lib, _no_device_warned + if _init_pid == os.getpid(): + return + + libcuda_paths = ["libcuda.so", "libcuda.dylib", "cuda.dll", "nvcuda.dll"] + if _is_wsl: + libcuda_paths = ["/usr/lib/wsl/lib/libcuda.so"] + libcuda_paths + _cuda_lib = _load_nv_library(*libcuda_paths) + + if _cuda_lib is None: + return + try: + _cu_check_error(_cuda_lib.cuInit(0)) + except NVDeviceAPIError as ex: + if ex.errno == CU_NO_CUDA_CAPABLE_DEVICE_DETECTED: + _cuda_lib = None + if not _no_device_warned: + logger.warning("No CUDA device detected") + _no_device_warned = True + else: + logger.exception("Failed to initialize libcuda.") + return + + +def _init_nvml(): + global _nvml_lib, _no_device_warned + if _init_pid == os.getpid(): + return + + nvml_paths = [ + "libnvidia-ml.so", + "libnvidia-ml.so.1", + "libnvidia-ml.dylib", + "nvml.dll", + ] + if _is_windows: + nvml_paths.append( + os.path.join( + os.getenv("ProgramFiles", "C:/Program Files"), + "NVIDIA Corporation/NVSMI/nvml.dll", + ) + ) + if _is_wsl: + nvml_paths = ["/usr/lib/wsl/lib/libnvidia-ml.so.1"] + nvml_paths + _nvml_lib = _load_nv_library(*nvml_paths) + + if _nvml_lib is None: + return + try: + _nvml_check_error(_nvml_lib.nvmlInit_v2()) + except NVMLAPIError as ex: + if 
ex.errno == NVML_DRIVER_NOT_LOADED: + _nvml_lib = None + if not _no_device_warned: + logger.warning( + "Failed to load libnvidia-ml: %s, no CUDA device will be enabled", + ex.message, + ) + _no_device_warned = True + else: + logger.exception("Failed to initialize libnvidia-ml.") + return + + +def _init(): + global _init_pid + + _init_cp() + _init_nvml() + + if _nvml_lib is not None and _cuda_lib is not None: + _init_pid = os.getpid() + + +def get_device_count() -> int: + global _gpu_count + + if _gpu_count is not None: + return _gpu_count + + _init_nvml() + if _nvml_lib is None: + return None + + if "CUDA_VISIBLE_DEVICES" in os.environ: + devices = os.environ["CUDA_VISIBLE_DEVICES"].strip() + if not devices or devices == "-1": + _gpu_count = 0 + else: + _gpu_count = len(devices.split(",")) + else: + n_gpus = c_uint() + _cu_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus))) + _gpu_count = n_gpus.value + return _gpu_count + + +def _get_all_device_count() -> int: + _init_nvml() + if _nvml_lib is None: + return None + + n_gpus = c_uint() + _cu_check_error(_nvml_lib.nvmlDeviceGetCount(byref(n_gpus))) + return n_gpus.value + + +def get_driver_info() -> _nvml_driver_info: + global _driver_info + + _init_nvml() + if _nvml_lib is None: + return None + if _driver_info is not None: + return _driver_info + + version_buf = create_string_buffer(100) + cuda_version = c_uint() + + _nvml_check_error( + _nvml_lib.nvmlSystemGetDriverVersion(version_buf, len(version_buf)) + ) + _nvml_check_error(_nvml_lib.nvmlSystemGetCudaDriverVersion(byref(cuda_version))) + + _driver_info = _nvml_driver_info( + driver_version=version_buf.value.decode(), + cuda_version=".".join(str(v) for v in divmod(cuda_version.value, 1000)), + ) + return _driver_info + + +def get_device_info(dev_index: int) -> _cu_device_info: + try: + return _device_infos[dev_index] + except KeyError: + pass + + _init() + if _init_pid is None: + return None + + device = c_int() + name_buf = create_string_buffer(100) + uuid_t = _CUuuid_t() + cc_major = c_int() + cc_minor = c_int() + cores = c_int() + threads_per_core = c_int() + + _cu_check_error(_cuda_lib.cuDeviceGet(byref(device), c_int(dev_index))) + _cu_check_error(_cuda_lib.cuDeviceGetName(name_buf, len(name_buf), device)) + _cu_check_error(_cuda_lib.cuDeviceGetUuid(byref(uuid_t), device)) + _cu_check_error( + _cuda_lib.cuDeviceComputeCapability(byref(cc_major), byref(cc_minor), device) + ) + _cu_check_error( + _cuda_lib.cuDeviceGetAttribute( + byref(cores), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device + ) + ) + _cu_check_error( + _cuda_lib.cuDeviceGetAttribute( + byref(threads_per_core), + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, + device, + ) + ) + + if "CUDA_VISIBLE_DEVICES" in os.environ: + real_dev_index = [ + int(s) for s in os.environ["CUDA_VISIBLE_DEVICES"].split(",") + ][dev_index] + else: + real_dev_index = dev_index + + info = _device_infos[dev_index] = _cu_device_info( + index=real_dev_index, + uuid=uuid.UUID(bytes=uuid_t.bytes), + name=name_buf.value.decode(), + multiprocessors=cores.value, + cuda_cores=cores.value + * _cu_get_processor_cores(cc_major.value, cc_minor.value), + threads=cores.value * threads_per_core.value, + ) + return info + + +def get_device_status(dev_index: int) -> _nvml_device_status: + _init() + if _init_pid is None: + return None + + c_device = _nvmlDevice_t() + c_utils = _nvmlUtilization_t() + c_temperature = c_uint() + c_memory_info = _nvmlBAR1Memory_t() + + dev_uuid = get_device_info(dev_index).uuid + + uuid_str = ("GPU-" + 
str(dev_uuid)).encode() + + if not _is_wsl: + _nvml_check_error( + _nvml_lib.nvmlDeviceGetHandleByUUID(uuid_str, byref(c_device)) + ) + + _nvml_check_error( + _nvml_lib.nvmlDeviceGetUtilizationRates(c_device, byref(c_utils)) + ) + gpu_util = c_utils.gpu + mem_util = c_utils.memory + + _nvml_check_error( + _nvml_lib.nvmlDeviceGetTemperature( + c_device, NVML_TEMPERATURE_GPU, byref(c_temperature) + ) + ) + temperature = c_temperature.value + + _nvml_check_error( + _nvml_lib.nvmlDeviceGetMemoryInfo(c_device, byref(c_memory_info)) + ) + fb_total_mem = c_memory_info.total + fb_free_mem = c_memory_info.free + fb_used_mem = c_memory_info.used + else: + import defusedxml + + proc = subprocess.Popen( + ["/usr/lib/wsl/lib/nvidia-smi", "-q", f"--id={dev_index}", "-x"], + stdout=subprocess.PIPE, + ) + proc.wait() + xml_result = defusedxml.ElementTree.fromstring(proc.stdout.read()) + gpu_node = xml_result.find("gpu") + + fb_node = gpu_node.find("fb_memory_usage") + fb_total_mem = int(parse_readable_size(fb_node.find("total").text)[0]) + fb_free_mem = int(parse_readable_size(fb_node.find("free").text)[0]) + fb_used_mem = int(parse_readable_size(fb_node.find("used").text)[0]) + + util_node = gpu_node.find("utilization") + if util_node.find("gpu_util").text == "N/A": + gpu_util = 0 + else: + gpu_util = int(util_node.find("gpu_util")) + if util_node.find("memory_util").text == "N/A": + mem_util = 0 + else: + mem_util = int(util_node.find("memory_util")) + + temperature = int(gpu_node.find("temperature").find("gpu_temp").text[:-1]) + + return _nvml_device_status( + gpu_util=gpu_util, + mem_util=mem_util, + temperature=temperature, + fb_total_mem=fb_total_mem, + fb_free_mem=fb_free_mem, + fb_used_mem=fb_used_mem, + ) + + +def get_handle_by_index(index: int) -> _nvmlDevice_t: + _init_nvml() + if _nvml_lib is None: + return None + + c_index = c_int(index) + device = _nvmlDevice_t() + _nvml_check_error(_nvml_lib.nvmlDeviceGetHandleByIndex_v2(c_index, byref(device))) + return device + + +def get_handle_by_uuid(uuid: bytes) -> _nvmlDevice_t: + _init_nvml() + if _nvml_lib is None: + return None + + c_uuid = c_char_p(uuid) + device = _nvmlDevice_t() + _nvml_check_error(_nvml_lib.nvmlDeviceGetHandleByUUID(c_uuid, byref(device))) + return device + + +def get_mig_mode(device: _nvmlDevice_t) -> Tuple[int, int]: + _init_nvml() + if _nvml_lib is None: + return None + + c_current_mode, c_pending_mode = c_uint(), c_uint() + _nvml_check_error( + _nvml_lib.nvmlDeviceGetMigMode( + device, byref(c_current_mode), byref(c_pending_mode) + ) + ) + return c_current_mode.value, c_pending_mode.value + + +def get_max_mig_device_count(device: _nvmlDevice_t) -> int: + _init_nvml() + if _nvml_lib is None: + return None + + c_count = c_uint() + _nvml_check_error(_nvml_lib.nvmlDeviceGetMaxMigDeviceCount(device, byref(c_count))) + return c_count.value + + +def get_mig_device_handle_by_index(device: _nvmlDevice_t, index: int) -> _nvmlDevice_t: + _init_nvml() + if _nvml_lib is None: + return None + + c_index = c_uint(index) + mig_device = _nvmlDevice_t() + _nvml_check_error( + _nvml_lib.nvmlDeviceGetMigDeviceHandleByIndex( + device, c_index, byref(mig_device) + ) + ) + return mig_device + + +def get_index(handle: _nvmlDevice_t) -> int: + _init_nvml() + if _nvml_lib is None: + return None + + c_index = c_uint() + _nvml_check_error(_nvml_lib.nvmlDeviceGetIndex(handle, byref(c_index))) + return c_index.value + + +def get_uuid(handle: _nvmlDevice_t) -> bytes: + _init_nvml() + if _nvml_lib is None: + return None + + c_uuid = 
create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE) + _nvml_check_error( + _nvml_lib.nvmlDeviceGetUUID( + handle, c_uuid, c_uint(NVML_DEVICE_UUID_V2_BUFFER_SIZE) + ) + ) + return c_uuid.value + + +def get_index_and_uuid(device: Union[int, bytes, str]) -> CudaDeviceInfo: + _init_nvml() + if _nvml_lib is None: + return None + + try: + device_index = int(device) + device_handle = get_handle_by_index(device_index) + uuid = get_uuid(device_handle) + except ValueError: + uuid = device if isinstance(device, bytes) else device.encode() + uuid_handle = get_handle_by_uuid(uuid) + device_index = get_index(uuid_handle) + uuid = get_uuid(uuid_handle) + + return CudaDeviceInfo(uuid=uuid, device_index=device_index) + + +def get_compute_running_processes(handle: _nvmlDevice_t) -> List[nvmlFriendlyObject]: + _init_nvml() + if _nvml_lib is None: + return None + + c_count = c_uint(0) + func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v3", None) + if func is None: + func = getattr(_nvml_lib, "nvmlDeviceGetComputeRunningProcesses_v2") + ret = func(handle, byref(c_count), None) + + if ret == NVML_SUCCESS: + # special case, no running processes + return [] + elif ret == NVML_ERROR_INSUFFICIENT_SIZE: + # typical case + # oversize the array in case more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = _nvmlProcessInfo_t * c_count.value + c_procs = proc_array() + + _nvml_check_error(func(handle, byref(c_count), c_procs)) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value: + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + _nvml_check_error(ret) + + +def _running_process_matches(handle: _nvmlDevice_t) -> bool: + """Check whether the current process is same as that of handle + Parameters + ---------- + handle : _nvmlDevice_t + NVML handle to CUDA device + Returns + ------- + out : bool + Whether the device handle has a CUDA context on the running process. 
+ """ + return any(os.getpid() == o.pid for o in get_compute_running_processes(handle)) + + +def get_cuda_context() -> CudaContext: + """Check whether the current process already has a CUDA context created.""" + + _init() + if _init_pid is None: + return CudaContext(has_context=False) + + for index in range(_get_all_device_count()): + handle = get_handle_by_index(index) + try: + mig_current_mode, mig_pending_mode = get_mig_mode(handle) + except NVMLAPIError as e: + if e.errno == NVML_ERROR_NOT_SUPPORTED: + mig_current_mode = NVML_DEVICE_MIG_DISABLE + else: + raise + if mig_current_mode == NVML_DEVICE_MIG_ENABLE: + for mig_index in range(get_max_mig_device_count(handle)): + try: + mig_handle = get_mig_device_handle_by_index(handle, mig_index) + except NVMLAPIError as e: + if e.errno == NVML_ERROR_NOT_FOUND: + # No MIG device with that index + continue + else: + raise + if _running_process_matches(mig_handle): + return CudaContext( + has_context=True, + device_info=CudaDeviceInfo( + uuid=get_uuid(handle), + device_index=index, + mig_index=mig_index, + ), + ) + else: + if _running_process_matches(handle): + return CudaContext( + has_context=True, + device_info=CudaDeviceInfo( + uuid=get_uuid(handle), device_index=index + ), + ) + + return CudaContext(has_context=False) diff --git a/python/xorbits/_mars/lib/ordered_set.pyx b/python/xorbits/_mars/lib/ordered_set.pyx new file mode 100644 index 000000000..0f9055c74 --- /dev/null +++ b/python/xorbits/_mars/lib/ordered_set.pyx @@ -0,0 +1,517 @@ +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 +# cython: annotate = True +# Copy from https://github.com/simonpercivall/orderedset/blob/master/lib/orderedset/_orderedset.pyx +import sys + +if sys.version_info[0] == 2: + from itertools import izip + from collections import Set, MutableSet, Iterable +else: + izip = zip + from collections.abc import Set, MutableSet, Iterable + +from cpython cimport PyDict_Contains, PyIndex_Check + + +cdef extern from "Python.h": + int PySlice_GetIndicesEx(slice, ssize_t length, ssize_t *start, + ssize_t *stop, ssize_t *step, ssize_t *slicelength) except -1 + + +__all__ = ["OrderedSet"] + + +cdef class entry: + cdef object key + cdef entry prev + cdef entry next + + +cdef inline void _add(_OrderedSet oset, object key): + cdef entry end = oset.end + cdef dict map = oset.map + cdef entry next + + if not PyDict_Contains(map, key): + next = entry() + next.key, next.prev, next.next = key, end.prev, end + end.prev.next = end.prev = map[key] = next + oset.os_used += 1 + + +cdef void _discard(_OrderedSet oset, object key): + cdef dict map = oset.map + cdef entry _entry + + if PyDict_Contains(map, key): + _entry = map.pop(key) + _entry.prev.next = _entry.next + _entry.next.prev = _entry.prev + oset.os_used -= 1 + + +cdef inline object _isorderedsubset(seq1, seq2): + if not len(seq1) <= len(seq2): + return False + for self_elem, other_elem in izip(seq1, seq2): + if not self_elem == other_elem: + return False + return True + + +cdef class OrderedSetIterator(object): + cdef _OrderedSet oset + cdef entry curr + cdef ssize_t si_used + + def __cinit__(self, _OrderedSet oset): + self.oset = oset + self.curr = oset.end + self.si_used = oset.os_used + + def __iter__(self): + return self + + def __next__(self): + cdef entry item + + if self.si_used != self.oset.os_used: + # make this state sticky + self.si_used = -1 + raise RuntimeError('%s changed size during iteration' % type(self.oset).__name__) + + item = self.curr.next + if item == self.oset.end: 
+ raise StopIteration() + self.curr = item + return item.key + + +cdef class OrderedSetReverseIterator(object): + cdef _OrderedSet oset + cdef entry curr + cdef ssize_t si_used + + def __cinit__(self, _OrderedSet oset): + self.oset = oset + self.curr = oset.end + self.si_used = oset.os_used + + def __iter__(self): + return self + + def __next__(self): + cdef entry item + + if self.si_used != self.oset.os_used: + # make this state sticky + self.si_used = -1 + raise RuntimeError('%s changed size during iteration' % type(self.oset).__name__) + + item = self.curr.prev + if item is self.oset.end: + raise StopIteration() + self.curr = item + return item.key + + +cdef class _OrderedSet(object): + cdef dict map + cdef entry end + cdef ssize_t os_used + + def __cinit__(self): + self.map = {} + self.os_used = 0 + self.end = end = entry() + end.prev = end.next = end + + def __init__(self, object iterable=None): + cdef dict map = self.map + cdef entry end = self.end + cdef entry next + + if iterable is not None: + for elem in iterable: + if not PyDict_Contains(map, elem): + next = entry() + next.key, next.prev, next.next = elem, end.prev, end + end.prev.next = end.prev = map[elem] = next + self.os_used += 1 + + @classmethod + def _from_iterable(cls, it): + return cls(it) + + ## + # set methods + ## + cpdef add(self, elem): + """Add element `elem` to the set.""" + _add(self, elem) + + cpdef discard(self, elem): + """Remove element `elem` from the ``OrderedSet`` if it is present.""" + _discard(self, elem) + + cpdef pop(self, last=True): + """Remove last element. Raises ``KeyError`` if the ``OrderedSet`` is empty.""" + if not self: + raise KeyError('OrderedSet is empty') + key = self.end.prev.key if last else self.end.next.key + _discard(self, key) + return key + + def remove(self, elem): + """ + Remove element `elem` from the ``set``. + Raises :class:`KeyError` if `elem` is not contained in the set. + """ + if elem not in self: + raise KeyError(elem) + _discard(self, elem) + + def clear(self): + """Remove all elements from the `set`.""" + cdef entry end = self.end + end.next.prev = end.next = None + + # reinitialize + self.map = {} + self.os_used = 0 + self.end = end = entry() + end.prev = end.next = end + + def copy(self): + """ + :rtype: OrderedSet + :return: a new ``OrderedSet`` with a shallow copy of self. + """ + return self._from_iterable(self) + + def difference(self, other): + """``OrderedSet - other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements in the set that are not in the others. + """ + return self - other + + def difference_update(self, other): + """``OrderedSet -= other`` + + Update the ``OrderedSet``, removing elements found in others. + """ + self -= other + + def __sub__(self, other): + """ + :rtype: OrderedSet + """ + ostyp = type(self if isinstance(self, OrderedSet) else other) + + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Set): + if not isinstance(other, Iterable): + return NotImplemented + other = ostyp._from_iterable(other) + + return ostyp._from_iterable(value for value in self if value not in other) + + def __isub__(self, other): + if other is self: + self.clear() + else: + for value in other: + self.discard(value) + return self + + def intersection(self, other): + """``OrderedSet & other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements common to the set and all others. 
+ """ + return self & other + + def intersection_update(self, other): + """``OrderedSet &= other`` + + Update the ``OrderedSet``, keeping only elements found in it and all others. + """ + self &= other + + def __and__(self, other): + """ + :rtype: OrderedSet + """ + ostyp = type(self if isinstance(self, OrderedSet) else other) + + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Set): + if not isinstance(other, Iterable): + return NotImplemented + other = ostyp._from_iterable(other) + + return ostyp._from_iterable(value for value in self if value in other) + + def __iand__(self, it): + for value in (self - it): + self.discard(value) + return self + + def isdisjoint(self, other): + """ + Return True if the set has no elements in common with other. + Sets are disjoint if and only if their intersection is the empty set. + + :rtype: bool + """ + for value in other: + if value in self: + return False + return True + + def issubset(self, other): + """``OrderedSet <= other`` + + :rtype: bool + + Test whether the ``OrderedSet`` is a proper subset of other, that is, + ``OrderedSet <= other and OrderedSet != other``. + """ + return self <= other + + def issuperset(self, other): + """``OrderedSet >= other`` + + :rtype: bool + + Test whether every element in other is in the set. + """ + return other <= self + + def isorderedsubset(self, other): + return _isorderedsubset(self, other) + + def isorderedsuperset(self, other): + return _isorderedsubset(other, self) + + def symmetric_difference(self, other): + """``OrderedSet ^ other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements in either the set or other but not both. + """ + return self ^ other + + def symmetric_difference_update(self, other): + """``OrderedSet ^= other`` + + Update the ``OrderedSet``, keeping only elements found in either set, but not in both. + """ + self ^= other + + def __xor__(self, other): + """ + :rtype: OrderedSet + """ + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Iterable): + return NotImplemented + + return (self - other) | (other - self) + + def __ixor__(self, other): + if other is self: + self.clear() + else: + if not isinstance(other, Set): + other = self._from_iterable(other) + for value in other: + if value in self: + self.discard(value) + else: + self.add(value) + return self + + def union(self, other): + """``OrderedSet | other`` + + :rtype: OrderedSet + :return: a new ``OrderedSet`` with elements from the set and all others. + """ + return self | other + + def update(self, other): + """``OrderedSet |= other`` + + Update the ``OrderedSet``, adding elements from all others. + """ + self |= other + + def __or__(self, other): + """ + :rtype: OrderedSet + """ + ostyp = type(self if isinstance(self, OrderedSet) else other) + + if not isinstance(self, Iterable): + return NotImplemented + if not isinstance(other, Iterable): + return NotImplemented + chain = (e for s in (self, other) for e in s) + return ostyp._from_iterable(chain) + + def __ior__(self, other): + for elem in other: + _add(self, elem) + return self + + ## + # list methods + ## + def index(self, elem): + """Return the index of `elem`. 
Rases :class:`ValueError` if not in the OrderedSet.""" + if elem not in self: + raise ValueError("%s is not in %s" % (elem, type(self).__name__)) + cdef entry curr = self.end.next + cdef ssize_t index = 0 + while curr.key != elem: + curr = curr.next + index += 1 + return index + + cdef _getslice(self, slice item): + cdef ssize_t start, stop, step, slicelength, place, i + cdef entry curr + cdef _OrderedSet result + PySlice_GetIndicesEx(item, len(self), &start, &stop, &step, &slicelength) + + result = type(self)() + place = start + curr = self.end + + if slicelength <= 0: + pass + elif step > 0: + # normal forward slice + i = 0 + while slicelength > 0: + while i <= place: + curr = curr.next + i += 1 + _add(result, curr.key) + place += step + slicelength -= 1 + else: + # we're going backwards + i = len(self) + while slicelength > 0: + while i > place: + curr = curr.prev + i -= 1 + _add(result, curr.key) + place += step + slicelength -= 1 + return result + + cdef _getindex(self, ssize_t index): + cdef ssize_t _len = len(self) + if index >= _len or (index < 0 and abs(index) > _len): + raise IndexError("list index out of range") + + cdef entry curr + if index >= 0: + curr = self.end.next + while index: + curr = curr.next + index -= 1 + else: + index = abs(index) - 1 + curr = self.end.prev + while index: + curr = curr.prev + index -= 1 + return curr.key + + def __getitem__(self, item): + """Return the `elem` at `index`. Raises :class:`IndexError` if `index` is out of range.""" + if isinstance(item, slice): + return self._getslice(item) + if not PyIndex_Check(item): + raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(item))) + return self._getindex(item) + + ## + # sequence methods + ## + def __len__(self): + return len(self.map) + + def __contains__(self, elem): + return elem in self.map + + def __iter__(self): + return OrderedSetIterator(self) + + def __reversed__(self): + return OrderedSetReverseIterator(self) + + def __reduce__(self): + items = list(self) + inst_dict = vars(self).copy() + return self.__class__, (items, ), inst_dict + + +class OrderedSet(_OrderedSet, MutableSet): + """ + An ``OrderedSet`` object is an ordered collection of distinct hashable objects. + + It works like the :class:`set` type, but remembers insertion order. + + It also supports :meth:`__getitem__` and :meth:`index`, like the + :class:`list` type. 
+ """ + def __repr__(self): + if not self: + return '%s()' % (self.__class__.__name__,) + return '%s(%r)' % (self.__class__.__name__, list(self)) + + def __eq__(self, other): + if isinstance(other, (_OrderedSet, list)): + return len(self) == len(other) and list(self) == list(other) + elif isinstance(other, Set): + return set(self) == set(other) + return NotImplemented + + def __le__(self, other): + if isinstance(other, Set): + return len(self) <= len(other) and set(self) <= set(other) + elif isinstance(other, list): + return len(self) <= len(other) and list(self) <= list(other) + return NotImplemented + + def __lt__(self, other): + if isinstance(other, Set): + return len(self) < len(other) and set(self) < set(other) + elif isinstance(other, list): + return len(self) < len(other) and list(self) < list(other) + return NotImplemented + + def __ge__(self, other): + ret = self < other + if ret is NotImplemented: + return ret + return not ret + + def __gt__(self, other): + ret = self <= other + if ret is NotImplemented: + return ret + return not ret diff --git a/python/xorbits/_mars/lib/sparse/__init__.py b/python/xorbits/_mars/lib/sparse/__init__.py new file mode 100644 index 000000000..a7fa00d1b --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/__init__.py @@ -0,0 +1,859 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
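Editor's note: the ordered_set.pyx hunk above provides an OrderedSet that behaves like a set but keeps insertion order and additionally supports list-style index() and __getitem__. A short sketch of that behaviour, assuming the Cython module builds as xorbits._mars.lib.ordered_set (the import path is an assumption for illustration):

# Illustrative only -- import path assumed from the vendored file location.
from xorbits._mars.lib.ordered_set import OrderedSet

s = OrderedSet([3, 1, 2, 1])                     # duplicates collapse, first-insertion order kept
assert list(s) == [3, 1, 2]
s.add(5)
assert s.index(5) == 3                           # list-like position lookup
assert s[1] == 1 and list(s[1:3]) == [1, 2]      # integer and slice indexing
assert (s & {1, 2, 9}) == OrderedSet([1, 2])     # set algebra returns an OrderedSet
assert s.pop() == 5                              # pop() removes the last element by default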
+ +import builtins +import operator +from collections.abc import Iterable +from functools import partial, reduce + +from .array import SparseNDArray, call_sparse +from .core import get_sparse_module, issparse +from .matrix import SparseMatrix +from .vector import SparseVector + + +def asarray(x, shape=None): + from .core import issparse + + if issparse(x): + return SparseNDArray(x, shape=shape) + + return x + + +def add(a, b, **_): + try: + return a + b + except TypeError: + if hasattr(b, "__radd__"): + return b.__radd__(a) + raise + + +def subtract(a, b, **_): + try: + return a - b + except TypeError: + if hasattr(b, "__rsub__"): + return b.__rsub__(a) + raise + + +def multiply(a, b, **_): + try: + return a * b + except TypeError: + if hasattr(b, "__rmul__"): + return b.__rmul__(a) + raise + + +def divide(a, b, **_): + try: + return a / b + except TypeError: + if hasattr(b, "__rdiv__"): + return b.__rdiv__(a) + raise + + +def true_divide(a, b, **_): + try: + return a / b + except TypeError: + if hasattr(b, "__rtruediv__"): + return b.__rtruediv__(a) + raise + + +def floor_divide(a, b, **_): + try: + return a // b + except TypeError: + if hasattr(b, "__rfloordiv__"): + return b.__rfloordiv__(a) + raise + + +def power(a, b, **_): + try: + return a**b + except TypeError: + if hasattr(b, "__rpow__"): + return b.__rpow__(a) + raise + + +def mod(a, b, **_): + try: + return a % b + except TypeError: + if hasattr(b, "__rmod__"): + return b.__rmod__(a) + raise + + +def _call_bin(method, a, b, **kwargs): + from .core import cp, get_array_module, issparse + + # order does not take effect for sparse + kwargs.pop("order", None) + if hasattr(a, method): + res = getattr(a, method)(b, **kwargs) + elif get_array_module(a).isscalar(a): + res = call_sparse(method, a, b, **kwargs) + else: + assert get_array_module(a) == get_array_module(b) + xp = get_array_module(a) + try: + res = getattr(xp, method)(a, b, **kwargs) + except TypeError: + if xp is cp and issparse(b): + res = getattr(xp, method)(a, b.toarray(), **kwargs) + else: + raise + + if res is NotImplemented: + raise NotImplementedError + + return res + + +def _call_unary(method, x, *args, **kwargs): + from .core import get_array_module + + # order does not take effect for sparse + kwargs.pop("order", None) + if hasattr(x, method): + res = getattr(x, method)(*args, **kwargs) + else: + xp = get_array_module(x) + res = getattr(xp, method)(x, *args, **kwargs) + + if res is NotImplemented: + raise NotImplementedError + + return res + + +def float_power(a, b, **kw): + return _call_bin("float_power", a, b, **kw) + + +def fmod(a, b, **kw): + return _call_bin("fmod", a, b, **kw) + + +def logaddexp(a, b, **kw): + return _call_bin("logaddexp", a, b, **kw) + + +def logaddexp2(a, b, **kw): + return _call_bin("logaddexp2", a, b, **kw) + + +def negative(x, **_): + return -x + + +def positive(x, **_): + return operator.pos(x) + + +def absolute(x, **_): + return builtins.abs(x) + + +abs = absolute + + +fabs = partial(_call_unary, "fabs") + + +def rint(x, **kw): + return _call_unary("rint", x, **kw) + + +def sign(x, **kw): + return _call_unary("sign", x, **kw) + + +def conj(x, **kw): + return _call_unary("conj", x, **kw) + + +def exp(x, **kw): + return _call_unary("exp", x, **kw) + + +def exp2(x, **kw): + return _call_unary("exp2", x, **kw) + + +def log(x, **kw): + return _call_unary("log", x, **kw) + + +def log2(x, **kw): + return _call_unary("log2", x, **kw) + + +def log10(x, **kw): + return _call_unary("log10", x, **kw) + + +def expm1(x, **kw): + return 
_call_unary("expm1", x, **kw) + + +def log1p(x, **kw): + return _call_unary("log1p", x, **kw) + + +def sqrt(x, **kw): + return _call_unary("sqrt", x, **kw) + + +def square(x, **kw): + return _call_unary("square", x, **kw) + + +def cbrt(x, **kw): + return _call_unary("cbrt", x, **kw) + + +def reciprocal(x, **kw): + return _call_unary("reciprocal", x, **kw) + + +gamma = partial(_call_unary, "gamma") +gammaln = partial(_call_unary, "gammaln") +loggamma = partial(_call_unary, "loggamma") +gammasgn = partial(_call_unary, "gammasgn") +gammainc = partial(_call_bin, "gammainc") +gammaincinv = partial(_call_bin, "gammaincinv") +gammaincc = partial(_call_bin, "gammaincc") +gammainccinv = partial(_call_bin, "gammainccinv") +beta = partial(_call_bin, "beta") +betaln = partial(_call_bin, "betaln") +betainc = partial(call_sparse, "betainc") +betaincinv = partial(call_sparse, "betaincinv") +psi = partial(_call_unary, "psi") +rgamma = partial(_call_unary, "rgamma") +polygamma = partial(_call_bin, "polygamma") +multigammaln = partial(_call_bin, "multigammaln") +digamma = partial(_call_unary, "digamma") +poch = partial(_call_bin, "poch") + +entr = partial(_call_unary, "entr") +rel_entr = partial(_call_bin, "rel_entr") +kl_div = partial(_call_bin, "kl_div") + +xlogy = partial(_call_bin, "xlogy") + +erf = partial(_call_unary, "erf") +erfc = partial(_call_unary, "erfc") +erfcx = partial(_call_unary, "erfcx") +erfi = partial(_call_unary, "erfi") +erfinv = partial(_call_unary, "erfinv") +erfcinv = partial(_call_unary, "erfcinv") +wofz = partial(_call_unary, "wofz") +dawsn = partial(_call_unary, "dawsn") +voigt_profile = partial(call_sparse, "voigt_profile") + +jv = partial(_call_bin, "jv") +jve = partial(_call_bin, "jve") +yn = partial(_call_bin, "yn") +yv = partial(_call_bin, "yv") +yve = partial(_call_bin, "yve") +kn = partial(_call_bin, "kn") +kv = partial(_call_bin, "kv") +kve = partial(_call_bin, "kve") +iv = partial(_call_bin, "iv") +ive = partial(_call_bin, "ive") +hankel1 = partial(_call_bin, "hankel1") +hankel1e = partial(_call_bin, "hankel1e") +hankel2 = partial(_call_bin, "hankel2") +hankel2e = partial(_call_bin, "hankel2e") + +hyp2f1 = partial(call_sparse, "hyp2f1") +hyp1f1 = partial(call_sparse, "hyp1f1") +hyperu = partial(call_sparse, "hyperu") +hyp0f1 = partial(_call_bin, "hyp0f1") + +ellip_harm = partial(call_sparse, "ellip_harm") +ellip_harm_2 = partial(call_sparse, "ellip_harm_2") +ellip_normal = partial(call_sparse, "ellip_normal") + +ellipk = partial(_call_unary, "ellipk") +ellipkm1 = partial(_call_unary, "ellipkm1") +ellipkinc = partial(_call_bin, "ellipkinc") +ellipe = partial(_call_unary, "ellipe") +ellipeinc = partial(_call_bin, "ellipeinc") +elliprc = partial(_call_bin, "elliprc") +elliprd = partial(call_sparse, "elliprd") +elliprf = partial(call_sparse, "elliprf") +elliprg = partial(call_sparse, "elliprg") +elliprj = partial(call_sparse, "elliprj") + +airy = partial(_call_unary, "airy") +airye = partial(_call_unary, "airye") +itairy = partial(_call_unary, "itairy") + + +def equal(a, b, **_): + try: + return a == b + except TypeError: + return b == a + + +def not_equal(a, b, **_): + try: + return a != b + except TypeError: + return b != a + + +def less(a, b, **_): + try: + return a < b + except TypeError: + return b > a + + +def less_equal(a, b, **_): + try: + return a <= b + except TypeError: + return b >= a + + +def greater(a, b, **_): + try: + return a > b + except TypeError: + return b < a + + +def greater_equal(a, b, **_): + try: + return a >= b + except TypeError: + return b <= a + 
+ +def logical_and(a, b, **kw): + return _call_bin("logical_and", a, b, **kw) + + +def logical_or(a, b, **kw): + return _call_bin("logical_or", a, b, **kw) + + +def logical_xor(a, b, **kw): + return _call_bin("logical_xor", a, b, **kw) + + +def logical_not(x, **kw): + return _call_unary("logical_not", x, **kw) + + +def isclose(a, b, **kw): + return _call_bin("isclose", a, b, **kw) + + +def bitwise_and(a, b, **_): + try: + return a & b + except TypeError: + return b & a + + +def bitwise_or(a, b, **_): + try: + return a | b + except TypeError: + return b | a + + +def bitwise_xor(a, b, **_): + try: + return operator.xor(a, b) + except TypeError: + return operator.xor(b, a) + + +def invert(x, **_): + return ~x + + +def left_shift(a, b, **_): + return a << b + + +def right_shift(a, b, **_): + return a >> b + + +def sin(x, **kw): + return _call_unary("sin", x, **kw) + + +def cos(x, **kw): + return _call_unary("cos", x, **kw) + + +def tan(x, **kw): + return _call_unary("tan", x, **kw) + + +def arcsin(x, **kw): + return _call_unary("arcsin", x, **kw) + + +def arccos(x, **kw): + return _call_unary("arccos", x, **kw) + + +def arctan(x, **kw): + return _call_unary("arctan", x, **kw) + + +def arctan2(a, b, **kw): + return _call_bin("arctan2", a, b, **kw) + + +def hypot(a, b, **kw): + return _call_bin("hypot", a, b, **kw) + + +def sinh(x, **kw): + return _call_unary("sinh", x, **kw) + + +def cosh(x, **kw): + return _call_unary("cosh", x, **kw) + + +def tanh(x, **kw): + return _call_unary("tanh", x, **kw) + + +def arcsinh(x, **kw): + return _call_unary("arcsinh", x, **kw) + + +def arccosh(x, **kw): + return _call_unary("arccosh", x, **kw) + + +def around(x, **kw): + return _call_unary("around", x, **kw) + + +def arctanh(x, **kw): + return _call_unary("arctanh", x, **kw) + + +def deg2rad(x, **kw): + return _call_unary("deg2rad", x, **kw) + + +def rad2deg(x, **kw): + return _call_unary("rad2deg", x, **kw) + + +def angle(x, **kw): + return _call_unary("angle", x, **kw) + + +def isinf(x, **kw): + return _call_unary("isinf", x, **kw) + + +def isnan(x, **kw): + return _call_unary("isnan", x, **kw) + + +def signbit(x, **kw): + return _call_unary("signbit", x, **kw) + + +def dot(a, b, sparse=True, **_): + from .core import issparse + + if not issparse(a): + ret = a.dot(b) + if not sparse: + return ret + else: + xps = get_sparse_module(ret) + return SparseNDArray(xps.csr_matrix(ret), shape=ret.shape) + + return a.dot(b, sparse=sparse) + + +def tensordot(a, b, axes=2, sparse=True): + if isinstance(axes, Iterable): + a_axes, b_axes = axes + else: + a_axes = tuple(range(a.ndim - 1, a.ndim - axes - 1, -1)) + b_axes = tuple(range(0, axes)) + + if isinstance(a_axes, Iterable): + a_axes = tuple(a_axes) + else: + a_axes = (a_axes,) + if isinstance(b_axes, Iterable): + b_axes = tuple(b_axes) + else: + b_axes = (b_axes,) + + if a_axes == (a.ndim - 1,) and b_axes == (b.ndim - 2,): + return dot(a, b, sparse=sparse) + + if a.ndim == b.ndim == 2: + if a_axes == (a.ndim - 1,) and b_axes == (b.ndim - 1,): + # inner product of multiple dims + return dot(a, b.T, sparse=sparse) + + if a.ndim == 1 or b.ndim == 1: + return dot(a, b, sparse=sparse) + + raise NotImplementedError + + +def matmul(a, b, sparse=True, **_): + return dot(a, b, sparse=sparse) + + +def concatenate(tensors, axis=0): + return reduce(lambda a, b: _call_bin("concatenate", a, b, axis=axis), tensors) + + +def transpose(tensor, axes=None): + return _call_unary("transpose", tensor, axes=axes) + + +def swapaxes(tensor, axis1, axis2): + return _call_unary("swapaxes", 
tensor, axis1, axis2) + + +def sum(tensor, axis=None, **kw): + return _call_unary("sum", tensor, axis=axis, **kw) + + +def prod(tensor, axis=None, **kw): + return _call_unary("prod", tensor, axis=axis, **kw) + + +def amax(tensor, axis=None, **kw): + return _call_unary("amax", tensor, axis=axis, **kw) + + +max = amax + + +def amin(tensor, axis=None, **kw): + return _call_unary("amin", tensor, axis=axis, **kw) + + +min = amin + + +def all(tensor, axis=None, **kw): + return _call_unary("all", tensor, axis=axis, **kw) + + +def any(tensor, axis=None, **kw): + return _call_unary("any", tensor, axis=axis, **kw) + + +def mean(tensor, axis=None, **kw): + return _call_unary("mean", tensor, axis=axis, **kw) + + +def nansum(tensor, axis=None, **kw): + return _call_unary("nansum", tensor, axis=axis, **kw) + + +def nanprod(tensor, axis=None, **kw): + return _call_unary("nanprod", tensor, axis=axis, **kw) + + +def nanmax(tensor, axis=None, **kw): + return _call_unary("nanmax", tensor, axis=axis, **kw) + + +def nanmin(tensor, axis=None, **kw): + return _call_unary("nanmin", tensor, axis=axis, **kw) + + +def argmax(tensor, axis=None, **kw): + return _call_unary("argmax", tensor, axis=axis, **kw) + + +def nanargmax(tensor, axis=None, **kw): + return _call_unary("nanargmax", tensor, axis=axis, **kw) + + +def argmin(tensor, axis=None, **kw): + return _call_unary("argmin", tensor, axis=axis, **kw) + + +def nanargmin(tensor, axis=None, **kw): + return _call_unary("nanargmin", tensor, axis=axis, **kw) + + +def var(tensor, axis=None, **kw): + return _call_unary("var", tensor, axis=axis, **kw) + + +def cumsum(tensor, axis=None, **kw): + return _call_unary("cumsum", tensor, axis=axis, **kw) + + +def cumprod(tensor, axis=None, **kw): + return _call_unary("cumprod", tensor, axis=axis, **kw) + + +def nancumsum(tensor, axis=None, **kw): + return _call_unary("nancumsum", tensor, axis=axis, **kw) + + +def nancumprod(tensor, axis=None, **kw): + return _call_unary("nancumprod", tensor, axis=axis, **kw) + + +def count_nonzero(tensor, axis=None, **kw): + return _call_unary("count_nonzero", tensor, axis=axis, **kw) + + +def maximum(a, b, **kw): + return _call_bin("maximum", a, b, **kw) + + +def minimum(a, b, **kw): + return _call_bin("minimum", a, b, **kw) + + +def fmax(a, b, **kw): + return _call_bin("fmax", a, b, **kw) + + +def fmin(a, b, **kw): + return _call_bin("fmin", a, b, **kw) + + +def floor(x, **kw): + return _call_unary("floor", x, **kw) + + +def ceil(x, **kw): + return _call_unary("ceil", x, **kw) + + +def trunc(x, **kw): + return _call_unary("trunc", x, **kw) + + +def degrees(x, **kw): + return _call_unary("degrees", x, **kw) + + +def radians(x, **kw): + return _call_unary("radians", x, **kw) + + +def clip(a, a_max, a_min, **kw): + from .core import get_array_module + + if hasattr(a, "clip"): + res = getattr(a, "clip")(a_max, a_min, **kw) + else: + xp = get_array_module(a) + res = getattr(xp, "clip")(a, a_max, a_min, **kw) + + if res is NotImplemented: + raise NotImplementedError + + return res + + +def iscomplex(x, **kw): + return _call_unary("iscomplex", x, **kw) + + +def real(x, **_): + return x.real + + +def imag(x, **_): + return x.imag + + +def fix(x, **kw): + return _call_unary("fix", x, **kw) + + +def i0(x, **kw): + return _call_unary("i0", x, **kw) + + +def nan_to_num(x, **kw): + return _call_unary("nan_to_num", x, **kw) + + +def copysign(a, b, **kw): + return _call_bin("copysign", a, b, **kw) + + +def nextafter(a, b, **kw): + return _call_bin("nextafter", a, b, **kw) + + +def spacing(x, **kw): + return 
_call_unary("spacing", x, **kw) + + +def ldexp(a, b, **kw): + return _call_bin("ldexp", a, b, **kw) + + +def frexp(x, **kw): + return _call_unary("frexp", x, **kw) + + +def modf(x, **kw): + return _call_unary("modf", x, **kw) + + +def sinc(x, **kw): + return _call_unary("sinc", x, **kw) + + +def isfinite(x, **kw): + return _call_unary("isfinite", x, **kw) + + +def isreal(x, **kw): + return _call_unary("isreal", x, **kw) + + +def isfortran(x, **kw): + return call_sparse("isfortran", x, **kw) + + +def where(cond, x, y): + if any([i.ndim not in (0, 2) for i in (cond, x, y)]): + raise NotImplementedError + + from .matrix import where as matrix_where + + return matrix_where(cond, x, y) + + +def digitize(x, bins, right=False): + return _call_unary("digitize", x, bins, right) + + +def repeat(a, repeats, axis=None): + return _call_unary("repeat", a, repeats, axis=axis) + + +def fill_diagonal(a, val, wrap=False): + return _call_unary("fill_diagonal", a, val, wrap=wrap) + + +def unique(a, return_index=False, return_inverse=False, return_counts=False, axis=None): + return _call_unary( + "unique", + a, + return_index=return_index, + return_inverse=return_inverse, + return_counts=return_counts, + axis=axis, + ) + + +def zeros(shape, dtype=float, gpu=False): + if len(shape) == 2: + from .matrix import zeros_sparse_matrix + + return zeros_sparse_matrix(shape, dtype=dtype, gpu=gpu) + + raise NotImplementedError + + +def ones_like(x): + from .core import get_array_module + + return get_array_module(x).ones(x.shape) + + +def diag(v, k=0, gpu=False): + assert v.ndim in {1, 2} + + from .matrix import diag_sparse_matrix + + return diag_sparse_matrix(v, k=k, gpu=gpu) + + +def eye(N, M=None, k=0, dtype=float, gpu=False): + from .matrix import eye_sparse_matrix + + return eye_sparse_matrix(N, M=M, k=k, dtype=dtype, gpu=gpu) + + +def triu(m, k=0, gpu=False): + if m.ndim == 2: + from .matrix import triu_sparse_matrix + + return triu_sparse_matrix(m, k=k, gpu=gpu) + + raise NotImplementedError + + +def tril(m, k=0, gpu=False): + if m.ndim == 2: + from .matrix import tril_sparse_matrix + + return tril_sparse_matrix(m, k=k, gpu=gpu) + + raise NotImplementedError + + +def lu(m): + from .matrix import lu_sparse_matrix + + return lu_sparse_matrix(m) + + +def solve_triangular(a, b, lower=False, sparse=True): + from .matrix import solve_triangular_sparse_matrix + + return solve_triangular_sparse_matrix(a, b, lower=lower, sparse=sparse) + + +def block(arrs): + arr = arrs[0] + while isinstance(arr, list): + arr = arr[0] + if arr.ndim != 2: # pragma: no cover + raise NotImplementedError + + from .matrix import block + + return block(arrs) diff --git a/python/xorbits/_mars/lib/sparse/array.py b/python/xorbits/_mars/lib/sparse/array.py new file mode 100644 index 000000000..5f986657a --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/array.py @@ -0,0 +1,1603 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import partialmethod + +from ...utils import ceildiv +from .core import ( + cp, + cps, + get_array_module, + get_sparse_module, + is_cupy, + issparse, + naked, + np, +) + + +class SparseNDArray: + __slots__ = ("__weakref__",) + __array_priority__ = 21 + + def __new__(cls, *args, **kwargs): + shape = kwargs.get("shape", None) + if shape is not None and len(shape) == 1: + from .vector import SparseVector + + return object.__new__(SparseVector) + if len(args) == 1 and issparse(args[0]) and args[0].ndim == 2: + from .matrix import SparseMatrix + + return object.__new__(SparseMatrix) + + else: + if cls is not SparseNDArray: + return object.__new__(cls) + else: + raise ValueError( + f"The construct params of {cls.__name__} are invalid: " + f"args={args}, kwargs={kwargs}" + ) + + @property + def raw(self): + raise NotImplementedError + + +def call_sparse(method, *args, **kwargs): + new_args = [] + make_dense = False + matrix = None + for arg in args: + if hasattr(arg, "spmatrix"): + # todo add support for multiple sparse arrays + if make_dense or matrix is not None: + make_dense = True + matrix = arg + new_args.append(matrix.spmatrix.data) + else: + if isinstance(arg, np.ndarray): + make_dense = True + new_args.append(arg) + + spmatrix = matrix.spmatrix + if make_dense: + new_args = [arg.toarray() if hasattr(arg, "spmatrix") else arg for arg in args] + + xp = get_array_module(spmatrix) + try: + new_data = getattr(xp, method)(*new_args, **kwargs) + except AttributeError: + if xp is np: + from scipy import special + else: + from cupyx.scipy import special + new_data = getattr(special, method)(*new_args, **kwargs) + + if not make_dense: + new_spmatrix = get_sparse_module(spmatrix).csr_matrix( + (new_data, spmatrix.indices, spmatrix.indptr), spmatrix.shape + ) + else: + new_spmatrix = get_sparse_module(spmatrix).csr_matrix(new_data) + return SparseNDArray(new_spmatrix, shape=matrix.shape) + + +class SparseArray(SparseNDArray): + __slots__ = ("spmatrix",) + + @property + def ndim(self): + return len(self.shape) + + def tocsr(self): + return self + + def toarray(self): + if self.shape != self.spmatrix.shape: + return self.spmatrix.toarray().reshape(self.shape) + else: + return self.spmatrix.toarray() + + def todense(self): + return self.toarray() + + def ascupy(self): + is_cp = get_array_module(self.spmatrix) is cp + if is_cp: + return self + mat_tuple = ( + cp.asarray(self.data), + cp.asarray(self.indices), + cp.asarray(self.indptr), + ) + return SparseNDArray( + cps.csr_matrix(mat_tuple, shape=self.spmatrix.shape), shape=self.shape + ) + + def asscipy(self): + is_cp = get_array_module(self.spmatrix) is cp + if not is_cp: + return self + return SparseNDArray(self.spmatrix.get(), shape=self.shape) + + def __array__(self, dtype=None): + x = self.toarray() + if dtype and x.dtype != dtype: + return x.astype(dtype) + return x + + @property + def nbytes(self): + return ( + self.spmatrix.data.nbytes + + self.spmatrix.indptr.nbytes + + self.spmatrix.indices.nbytes + ) + + @property + def raw(self): + return self.spmatrix + + @property + def data(self): + return self.spmatrix.data + + @property + def indptr(self): + return self.spmatrix.indptr + + @property + def indices(self): + return self.spmatrix.indices + + @property + def nnz(self): + return self.spmatrix.nnz + + @property + def shape(self): + raise self.spmatrix.shape + + @property + def dtype(self): + return self.spmatrix.dtype + + def copy(self): + return SparseNDArray(self.spmatrix.copy(), shape=self.shape) + + @property + def 
real(self): + xps = get_sparse_module(self.spmatrix) + return SparseNDArray( + xps.csr_matrix( + (self.spmatrix.data.real, self.spmatrix.indices, self.spmatrix.indptr), + self.spmatrix.shape, + ), + shape=self.shape, + ) + + @real.setter + def real(self, r): + xps = get_sparse_module(self.spmatrix) + x = self.spmatrix.toarray() + if issparse(r): + r = r.toarray() + x.real = r + self.spmatrix = xps.csr_matrix(x) + + @property + def imag(self): + xps = get_sparse_module(self.spmatrix) + return SparseNDArray( + xps.csr_matrix( + (self.spmatrix.data.imag, self.spmatrix.indices, self.spmatrix.indptr), + self.spmatrix.shape, + ), + shape=self.shape, + ) + + @imag.setter + def imag(self, imag): + xps = get_sparse_module(self.spmatrix) + x = self.spmatrix.toarray() + if issparse(imag): + imag = imag.toarray() + x.imag = imag + self.spmatrix = xps.csr_matrix(x) + + def __getattr__(self, attr): + is_cp = get_array_module(self.spmatrix) is cp + if attr == "device" and is_cp: + try: + return self.spmatrix.device + except NotImplementedError: + return cp.cuda.Device(0) + if attr == "get" and is_cp: + return lambda: SparseNDArray(self.spmatrix.get(), shape=self.shape) + + return super().__getattribute__(attr) + + def __getstate__(self): + return self.spmatrix + + def __setstate__(self, state): + self.spmatrix = state + + def astype(self, dtype, **_): + dtype = np.dtype(dtype) + if self.dtype == dtype: + return self + return SparseNDArray(self.spmatrix.astype(dtype), shape=self.shape) + + def transpose(self, axes=None): + raise NotImplementedError + + def swapaxes(self, axis1, axis2): + if axis1 == 0 and axis2 == 1: + return self + + assert axis1 == 1 and axis2 == 0 + return self.transpose() + + def reshape(self, shape, **_): + sp_shape = shape if len(shape) == 2 else (1, shape[0]) + spmatrix = self.spmatrix.tolil().reshape(sp_shape) + return SparseNDArray(spmatrix, shape=shape) + + def broadcast_to(self, shape): + # TODO(jisheng): implement broadcast_to + raise NotImplementedError + + def squeeze(self, axis=None): + # TODO(jisheng): implement squeeze + raise NotImplementedError + + @property + def T(self): + raise NotImplementedError + + # ---------------- arithmetic ---------------------- + + def __add__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("add", self, naked_other) + if issparse(naked_other): + x = self.spmatrix + naked_other + else: + x = self.toarray() + naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __radd__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("add", naked_other, self) + if issparse(naked_other): + x = self.spmatrix + naked_other + else: + x = self.toarray() + naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __sub__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("subtract", self, naked_other) + if issparse(naked_other): + x = self.spmatrix - naked_other + else: + x = self.toarray() - naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + 
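+    # The arithmetic dunders in this class share one pattern: unwrap the other
+    # operand with naked(), route scalars through call_sparse() so the ufunc
+    # only touches the stored (nonzero) entries, use the underlying spmatrix
+    # for sparse-sparse cases, fall back to toarray() for dense operands, and
+    # re-wrap the result as a SparseNDArray whenever it is still sparse.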
+ def __rsub__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("subtract", naked_other, self) + if issparse(naked_other): + x = naked_other - self.spmatrix + else: + x = naked_other - self.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __mul__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if is_cupy(self.spmatrix): + if not cp.isscalar(naked_other): + # TODO(jisheng): cupy does not implement multiply method + is_other_sparse = issparse(naked_other) + if ( + is_other_sparse + and self.spmatrix.nnz == naked_other.nnz + and cp.all(self.spmatrix.indptr == naked_other.indptr) + and cp.all(self.spmatrix.indices == naked_other.indices) + ): + x = cps.csr_matrix( + ( + self.spmatrix.data * naked_other.data, + self.spmatrix.indices, + self.spmatrix.indptr, + ), + self.spmatrix.shape, + ) + else: + if is_other_sparse: + naked_other = other.toarray() + dense = self.spmatrix.toarray() + res = cp.multiply(dense, naked_other, out=dense) + x = cps.csr_matrix(res) + else: + x = self.spmatrix * naked_other + else: + x = self.spmatrix.multiply(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rmul__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if is_cupy(self.spmatrix): + if not cp.isscalar(naked_other): + # TODO(jisheng): cupy does not implement multiply method + is_other_sparse = issparse(naked_other) + if ( + is_other_sparse + and self.spmatrix.nnz == naked_other.nnz + and cp.all(self.spmatrix.indptr == naked_other.indptr) + and cp.all(self.spmatrix.indices == naked_other.indices) + ): + x = cps.csr_matrix( + ( + naked_other.data * self.spmatrix.data, + self.spmatrix.indices, + self.spmatrix.indptr, + ), + self.spmatrix.shape, + ) + else: + if is_other_sparse: + naked_other = other.toarray() + dense = self.spmatrix.toarray() + res = cp.multiply(naked_other, dense, out=dense) + x = cps.csr_matrix(res) + else: + x = naked_other * self.spmatrix + else: + x = self.spmatrix.multiply(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __matmul__(self, other): + from . import matmul + + return matmul(self, other) + + def __rmatmul__(self, other): + from . 
import matmul + + return matmul(other, self) + + def __div__(self, other): + return self.__truediv__(other) + + def __rdiv__(self, other): + return self.__rtruediv__(other) + + def __truediv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + x = self.spmatrix / naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rtruediv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + try: + x = naked_other / self.spmatrix + except TypeError: + x = naked_other / self.spmatrix.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __floordiv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("floor_divide", self, naked_other) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + self.toarray() // naked_other + ) + else: + x = self.toarray() // naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rfloordiv__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("floor_divide", naked_other, self) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + naked_other // self.toarray() + ) + else: + x = naked_other // self.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __pow__(self, other, modulo=None): + if modulo is not None: + return NotImplemented + + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if get_array_module(naked_other).isscalar(naked_other): + try: + x = self.spmatrix.power(naked_other) + except ValueError as e: # pragma: no cover + # https://github.com/mars-project/mars/issues/3268 + # https://github.com/scipy/scipy/issues/8678 + assert "WRITEBACKIFCOPY" in e.args[0] + self.spmatrix = self.spmatrix.copy() + x = self.spmatrix.power(naked_other) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = self.toarray() ** naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rpow__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if issparse(naked_other): + naked_other = other.toarray() + x = naked_other ** self.toarray() + return get_array_module(x).asarray(x) + + def float_power(self, other): + ret = self.__pow__(other) + ret = naked(ret).astype(float) + if issparse(ret): + return SparseNDArray(ret, shape=self.shape) + return ret + + def __mod__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + if get_array_module(naked_other).isscalar(naked_other): + data = self.spmatrix.data % naked_other + x = get_sparse_module(self.spmatrix).csr_matrix( + (data, self.spmatrix.indices, self.spmatrix.indptr), self.spmatrix.shape + ) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + self.toarray() % naked_other + ) + if issparse(x): + return 
SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __rmod__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + is_sparse = issparse(naked_other) + if issparse(naked_other): + naked_other = other.toarray() + if get_array_module(naked_other).isscalar(naked_other): + data = naked_other % self.spmatrix.data + x = get_sparse_module(self.spmatrix).csr_matrix( + (data, self.spmatrix.indices, self.spmatrix.indptr), self.spmatrix.shape + ) + else: + x = naked_other % self.toarray() + if is_sparse: + x = get_sparse_module(self.spmatrix).csr_matrix(x) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def fmod(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("fmod", self, naked_other) + else: + if issparse(naked_other): + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + xp.fmod(self.toarray(), naked_other) + ) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logaddexp(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("logaddexp", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + return xp.logaddexp(self.toarray(), naked_other) + + def logaddexp2(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("logaddexp2", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + return xp.logaddexp2(self.toarray(), naked_other) + + def __neg__(self): + return SparseNDArray(-self.spmatrix, shape=self.shape) + + def __pos__(self): + return SparseNDArray(self.spmatrix.copy(), shape=self.shape) + + def __abs__(self): + return SparseNDArray(abs(self.spmatrix), shape=self.shape) + + def fabs(self): + xp = get_array_module(self.spmatrix) + return SparseNDArray( + get_sparse_module(self.spmatrix).csr_matrix( + xp.abs(self.spmatrix), dtype="f8" + ), + shape=self.shape, + ) + + def rint(self): + return SparseNDArray(self.spmatrix.rint(), shape=self.shape) + + def sign(self): + return SparseNDArray(self.spmatrix.sign(), shape=self.shape) + + def conj(self): + return SparseNDArray(self.spmatrix.conj(), shape=self.shape) + + def exp(self): + return call_sparse("exp", self) + + def exp2(self): + return call_sparse("exp2", self) + + def log(self): + return call_sparse("log", self) + + def log2(self): + return call_sparse("log2", self) + + def log10(self): + return call_sparse("log10", self) + + def expm1(self): + return SparseNDArray(self.spmatrix.expm1(), shape=self.shape) + + def log1p(self): + return SparseNDArray(self.spmatrix.log1p(), shape=self.shape) + + def sqrt(self): + return SparseNDArray(self.spmatrix.sqrt(), shape=self.shape) + + def square(self): + return call_sparse("square", self) + + def cbrt(self): + return call_sparse("cbrt", self) + + def reciprocal(self): + return call_sparse("reciprocal", self) + + def _scipy_unary(self, func_name): + spmatrix = self.spmatrix + xp = get_array_module(spmatrix) + if xp is np: + from scipy import special + else: + 
from cupyx.scipy import special + + new_data = getattr(special, func_name)(spmatrix.data) + new_spmatrix = get_sparse_module(spmatrix).csr_matrix( + (new_data, spmatrix.indices, spmatrix.indptr), spmatrix.shape + ) + return SparseNDArray(new_spmatrix, shape=self.shape) + + def _scipy_binary(self, func_name, other): + try: + naked_other = naked(other) + except TypeError: # pragma: no cover + return NotImplemented + + xp = get_array_module(self.spmatrix) + + if xp is np: + from scipy import special + else: # pragma: no cover + from cupyx.scipy import special + + func = getattr(special, func_name) + + if get_array_module(naked_other).isscalar(naked_other): # pragma: no cover + return call_sparse(func, self, naked_other) + else: + if issparse(naked_other): # pragma: no cover + naked_other = other.toarray() + x = get_sparse_module(self.spmatrix).csr_matrix( + func(self.toarray(), naked_other) + ) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + gamma = partialmethod(_scipy_unary, "gamma") + gammaln = partialmethod(_scipy_unary, "gammaln") + loggamma = partialmethod(_scipy_unary, "loggamma") + gammasgn = partialmethod(_scipy_unary, "gammasgn") + gammainc = partialmethod(_scipy_binary, "gammainc") + gammaincinv = partialmethod(_scipy_binary, "gammaincinv") + gammaincc = partialmethod(_scipy_binary, "gammaincc") + gammainccinv = partialmethod(_scipy_binary, "gammainccinv") + beta = partialmethod(_scipy_binary, "beta") + betaln = partialmethod(_scipy_binary, "betaln") + psi = partialmethod(_scipy_unary, "psi") + rgamma = partialmethod(_scipy_unary, "rgamma") + polygamma = partialmethod(_scipy_binary, "polygamma") + multigammaln = partialmethod(_scipy_binary, "multigammaln") + digamma = partialmethod(_scipy_unary, "digamma") + poch = partialmethod(_scipy_binary, "poch") + + erf = partialmethod(_scipy_unary, "erf") + erfc = partialmethod(_scipy_unary, "erfc") + erfcx = partialmethod(_scipy_unary, "erfcx") + erfi = partialmethod(_scipy_unary, "erfi") + erfinv = partialmethod(_scipy_unary, "erfinv") + erfcinv = partialmethod(_scipy_unary, "erfcinv") + wofz = partialmethod(_scipy_unary, "wofz") + dawsn = partialmethod(_scipy_unary, "dawsn") + entr = partialmethod(_scipy_unary, "entr") + + ellipk = partialmethod(_scipy_unary, "ellipk") + ellipkm1 = partialmethod(_scipy_unary, "ellipkm1") + ellipkinc = partialmethod(_scipy_binary, "ellipkinc") + ellipe = partialmethod(_scipy_unary, "ellipe") + ellipeinc = partialmethod(_scipy_binary, "ellipeinc") + elliprc = partialmethod(_scipy_binary, "elliprc") + + rel_entr = partialmethod(_scipy_binary, "rel_entr") + kl_div = partialmethod(_scipy_binary, "kl_div") + xlogy = partialmethod(_scipy_binary, "xlogy") + + jv = partialmethod(_scipy_binary, "jv") + jve = partialmethod(_scipy_binary, "jve") + yn = partialmethod(_scipy_binary, "yn") + yv = partialmethod(_scipy_binary, "yv") + yve = partialmethod(_scipy_binary, "yve") + kn = partialmethod(_scipy_binary, "kn") + kv = partialmethod(_scipy_binary, "kv") + kve = partialmethod(_scipy_binary, "kve") + iv = partialmethod(_scipy_binary, "iv") + ive = partialmethod(_scipy_binary, "ive") + hankel1 = partialmethod(_scipy_binary, "hankel1") + hankel1e = partialmethod(_scipy_binary, "hankel1e") + hankel2 = partialmethod(_scipy_binary, "hankel2") + hankel2e = partialmethod(_scipy_binary, "hankel2e") + + hyp0f1 = partialmethod(_scipy_binary, "hyp0f1") + + airy = partialmethod(_scipy_unary, "airy") + airye = partialmethod(_scipy_unary, "airye") + itairy = 
partialmethod(_scipy_unary, "itairy") + + def __eq__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix == naked_other + else: + x = self.toarray() == other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __ne__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("not_equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix != naked_other + else: + x = self.toarray() != other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __lt__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("less", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix < naked_other + else: + x = self.toarray() < other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __le__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("less_equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix <= naked_other + else: + x = self.toarray() <= other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __gt__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("greater", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix > naked_other + else: + x = self.toarray() > other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __ge__(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("greater_equal", self, naked_other) + if is_cupy(self.spmatrix): + return NotImplemented + else: + if issparse(naked_other): + x = self.spmatrix >= naked_other + else: + x = self.toarray() >= other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_and(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if is_cupy(self.spmatrix): + return NotImplemented + else: + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + naked_other = other_xp.array(naked_other).astype(bool) + else: + naked_other = naked_other.astype(bool) + x = self.spmatrix.astype(bool).multiply(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_or(self, other): + try: + naked_other = naked(other) + except TypeError: + 
return NotImplemented + + if is_cupy(self.spmatrix): + return NotImplemented + else: + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + if naked_other != 0: + x = np.logical_and(self.toarray(), naked_other) + else: + x = self.spmatrix.astype(bool) + else: + naked_other = naked_other.astype(bool) + x = (self.spmatrix.astype(bool) + naked_other).astype(bool) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_xor(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if is_cupy(self.spmatrix): + return NotImplemented + else: + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + naked_other = other_xp.array(naked_other).astype(bool) + else: + naked_other = naked_other.astype(bool) + x = self.spmatrix.astype(bool) != naked_other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def logical_not(self): + return call_sparse("logical_not", self) + + @staticmethod + def _bitwise(this, other, method_name): + try: + naked_this = naked(this) + except TypeError: + return NotImplemented + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if not issparse(naked_this): + return SparseArray._bitwise(naked_other, naked_this, method_name) + + if issparse(naked_other): + naked_other = other.toarray() + + xp = get_array_module(naked_this) + xps = get_sparse_module(naked_this) + return SparseNDArray( + xps.csr_matrix(getattr(xp, method_name)(this.toarray(), naked_other)), + shape=naked_this.shape, + ) + + def __and__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_and", self, other) + return self._bitwise(self.spmatrix, other, "bitwise_and") + + def __rand__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_and", other, self) + return self._bitwise(other, self.spmatrix, "bitwise_and") + + def __or__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_or", self, other) + return self._bitwise(self.spmatrix, other, "bitwise_or") + + def __ror__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_or", other, self) + return self._bitwise(other, self.spmatrix, "bitwise_or") + + def __xor__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_xor", self, other) + return self._bitwise(self.spmatrix, other, "bitwise_xor") + + def __rxor__(self, other): + if get_array_module(other).isscalar(other): + return call_sparse("bitwise_xor", other, self) + return self._bitwise(other, self.spmatrix, "bitwise_xor") + + def isclose(self, other, **kw): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(naked_other) + if issparse(naked_other): + naked_other = other.toarray() + return xp.isclose(self.toarray(), naked_other, **kw) + + def __invert__(self): + return call_sparse("invert", self) + + @staticmethod + def _shift(this, other, method_name): + try: + naked_this = naked(this) + except TypeError: + return NotImplemented + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xps = get_sparse_module(naked_this) + xp = get_array_module(naked_this) + + if xp.isscalar(naked_this): + other_xp = get_array_module(naked_other) + data = getattr(other_xp, method_name)(naked_this, naked_other.data) + indices, indptr, 
shape = ( + naked_other.indices, + naked_other.indptr, + naked_other.shape, + ) + elif isinstance(naked_this, xp.ndarray): + # dense + return getattr(xp, method_name)(naked_this, other.toarray()) + else: + tp = ( + np.int32 if is_cupy(naked_this) else np.bool_ + ) # cupy.sparse does not support bool + mask = xps.csr_matrix( + ( + (naked_this.data > 0).astype(tp), + naked_this.indices, + naked_this.indptr, + ), + naked_this.shape, + ) + naked_other = mask.multiply(naked_other) + indices, indptr, shape = ( + naked_this.indices, + naked_this.indptr, + naked_this.shape, + ) + data = getattr(xp, method_name)(naked_this.data, naked_other.data) + + return SparseNDArray( + xps.csr_matrix((data, indices, indptr), shape), shape=shape + ) + + def __lshift__(self, other): + return self._shift(self.spmatrix, other, "left_shift") + + def __rlshift__(self, other): + return self._shift(other, self.spmatrix, "left_shift") + + def __rshift__(self, other): + return self._shift(self.spmatrix, other, "right_shift") + + def __rrshift__(self, other): + return self._shift(other, self.spmatrix, "right_shift") + + def sin(self): + return SparseNDArray(self.spmatrix.sin(), shape=self.shape) + + def cos(self): + return call_sparse("cos", self) + + def tan(self): + return SparseNDArray(self.spmatrix.tan(), shape=self.shape) + + def arcsin(self): + return SparseNDArray(self.spmatrix.arcsin(), shape=self.shape) + + def arccos(self): + return call_sparse("arccos", self) + + def arctan(self): + return SparseNDArray(self.spmatrix.arctan(), shape=self.shape) + + def arctan2(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("arctan2", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + x = xp.arctan2(self.toarray(), naked_other) + return SparseNDArray(get_sparse_module(x).csr_matrix(x), shape=self.shape) + + def hypot(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + xp = get_array_module(self.spmatrix) + other_xp = get_array_module(naked_other) + if other_xp.isscalar(naked_other): + return call_sparse("hypot", self, naked_other) + if issparse(naked_other): + naked_other = other.toarray() + x = xp.hypot(self.toarray(), naked_other) + return SparseNDArray(get_sparse_module(x).csr_matrix(x), shape=self.shape) + + def sinh(self): + return SparseNDArray(self.spmatrix.sinh(), shape=self.shape) + + def cosh(self): + xp = get_array_module(self.spmatrix) + return xp.cosh(self.toarray()) + + def tanh(self): + return SparseNDArray(self.spmatrix.tanh(), shape=self.shape) + + def arcsinh(self): + return SparseNDArray(self.spmatrix.arcsinh(), shape=self.shape) + + def arccosh(self): + return call_sparse("arccosh", self) + + def arctanh(self): + return SparseNDArray(self.spmatrix.arctanh(), shape=self.shape) + + def around(self, decimals=0): + return call_sparse("around", self, decimals=decimals) + + def deg2rad(self): + return SparseNDArray(self.spmatrix.deg2rad(), shape=self.shape) + + def rad2deg(self): + return SparseNDArray(self.spmatrix.rad2deg(), shape=self.shape) + + def angle(self, deg=0): + return call_sparse("angle", self, deg=deg) + + def dot(self, other, sparse=True): + raise NotImplementedError + + def concatenate(self, other, axis=0): + raise NotImplementedError + + def _reduction( + self, method_name, axis=None, dtype=None, keepdims=None, todense=False, **kw + ): + 
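+        # Layout-specific subclasses provide the implementation; for example
+        # SparseMatrix._reduction reduces the spmatrix (or the dense array when
+        # todense=True) and reshapes the result according to axis/keepdims.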
raise NotImplementedError + + def sum(self, axis=None, dtype=None, keepdims=None): + return self._reduction("sum", axis=axis, dtype=dtype, keepdims=keepdims) + + def prod(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "sum", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def amax(self, axis=None, dtype=None, keepdims=None): + return self._reduction("max", axis=axis, dtype=dtype, keepdims=keepdims) + + def amin(self, axis=None, dtype=None, keepdims=None): + return self._reduction("min", axis=axis, dtype=dtype, keepdims=keepdims) + + def all(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "all", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def any(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "any", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def mean(self, axis=None, dtype=None, keepdims=None): + return self._reduction("mean", axis=axis, dtype=dtype, keepdims=keepdims) + + def nansum(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nansum", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def nanprod(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanprod", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def nanmax(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "nanmax", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def nanmin(self, axis=None, dtype=None, keepdims=None): + ret = self._reduction( + "nanmin", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + if not issparse(ret): + if get_array_module(ret).isscalar(ret): + return ret + xps = get_sparse_module(self.spmatrix) + ret = SparseNDArray(xps.csr_matrix(ret)) + return ret + return ret + + def nanmean(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanmean", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def argmax(self, axis=None, dtype=None, keepdims=None): + return self._reduction("argmax", axis=axis, dtype=dtype, keepdims=keepdims) + + def nanargmax(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanargmax", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def argmin(self, axis=None, dtype=None, keepdims=None): + return self._reduction("argmin", axis=axis, dtype=dtype, keepdims=keepdims) + + def nanargmin(self, axis=None, dtype=None, keepdims=None): + return self._reduction( + "nanargmin", axis=axis, dtype=dtype, keepdims=keepdims, todense=True + ) + + def var(self, axis=None, dtype=None, ddof=0, keepdims=None): + return self._reduction( + "var", axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, todense=True + ) + + def cumsum(self, axis=None, dtype=None): + return self.toarray().cumsum(axis=axis) + + def cumprod(self, axis=None, dtype=None): + return self.toarray().cumprod(axis=axis) + + def nancumsum(self, axis=None, 
dtype=None): + xp = get_array_module(self.spmatrix) + return xp.nancumsum(self.toarray(), axis=axis) + + def nancumprod(self, axis=None, dtype=None): + xp = get_array_module(self.spmatrix) + return xp.nancumprod(self.toarray(), axis=axis) + + def count_nonzero(self, axis=None, dtype=None, keepdims=None): + if axis is None: + return get_array_module(self.spmatrix).array( + [self.spmatrix.count_nonzero()] + )[0] + else: + return get_array_module(self.spmatrix).count_nonzero( + self.toarray(), axis=axis + ) + + def __getitem__(self, item): + if isinstance(item, SparseArray): + item = item.spmatrix + if isinstance(item, list): + item = tuple(item) + + x = self.spmatrix[item] + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def __setitem__(self, key, value): + if is_cupy(self.spmatrix): + return NotImplemented + else: + x = self.spmatrix.tolil() + x[key] = value + x = x.tocsr() + self.spmatrix = x + + def _maximum_minimum(self, other, method_name): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if is_cupy(self.spmatrix): + # TODO(jisheng): cupy does not implement sparse maximum and minimum + return NotImplemented + + xps = get_sparse_module(self.spmatrix) + xp = get_array_module(self.spmatrix) + has_nan = xps.csr_matrix( + (xp.isnan(self.spmatrix.data), self.spmatrix.indices, self.spmatrix.indptr), + self.spmatrix.shape, + ) + if issparse(naked_other): + has_nan += xps.csr_matrix( + (xp.isnan(naked_other.data), naked_other.indices, naked_other.indptr), + naked_other.shape, + ) + + if issparse(naked_other): + x = getattr(self.spmatrix, method_name)(naked_other) + else: + x = getattr(xp, method_name)(self.toarray(), naked_other) + + if has_nan.sum() > 0: + x = x + (has_nan * np.nan) + + if issparse(x): + return SparseNDArray(x, shape=self.shape) + + return get_array_module(x).asarray(x) + + def maximum(self, other): + return self._maximum_minimum(other, "maximum") + + def minimum(self, other): + return self._maximum_minimum(other, "minimum") + + def fmax(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + x = self.spmatrix.maximum(naked_other) + if issparse(x): + return SparseArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def fmin(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + x = self.spmatrix.minimum(naked_other) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + return get_array_module(x).asarray(x) + + def isinf(self): + return call_sparse("isinf", self) + + def isnan(self): + return call_sparse("isnan", self) + + def signbit(self): + return call_sparse("signbit", self) + + def floor(self): + return SparseNDArray(self.spmatrix.floor(), shape=self.shape) + + def ceil(self): + return SparseNDArray(self.spmatrix.ceil(), shape=self.shape) + + def trunc(self): + return SparseNDArray(self.spmatrix.trunc(), shape=self.shape) + + def degrees(self): + return call_sparse("degrees", self) + + def radians(self): + return call_sparse("radians", self) + + def clip(self, a_min, a_max): + try: + a_min = naked(a_min) + except TypeError: + return NotImplemented + + try: + a_max = naked(a_max) + except TypeError: + return NotImplemented + + x = self.spmatrix.maximum(a_min) + if issparse(x): + x = x.minimum(a_max) + elif issparse(a_max): + x = a_max.minimum(x) + else: + xp = get_array_module(x) + x = xp.minimum(x, a_max) + if issparse(x): + return SparseNDArray(x, shape=self.shape) + 
return get_array_module(x).asarray(x) + + def iscomplex(self): + return call_sparse("iscomplex", self) + + def fix(self): + return call_sparse("fix", self) + + def i0(self): + xp = get_array_module(self.spmatrix) + data = xp.i0(self.spmatrix.data).reshape(self.spmatrix.data.shape) + x = get_sparse_module(self.spmatrix).csr_matrix( + (data, self.spmatrix.indices, self.spmatrix.indptr), self.spmatrix.shape + ) + return SparseNDArray(x, shape=self.shape) + + def nan_to_num(self): + return call_sparse("nan_to_num", self) + + def copysign(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("copysign", self, naked_other) + + if issparse(naked_other): + naked_other = other.toarray() + + xp = get_array_module(self.spmatrix) + return xp.copysign(self.toarray(), naked_other) + + def nextafter(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + ret_sparse = False + if issparse(naked_other): + ret_sparse = True + naked_other = other.toarray() + + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + + x = xp.nextafter(self.toarray(), naked_other) + if ret_sparse: + return SparseNDArray(xps.csr_matrix(x), shape=self.shape) + return x + + def spacing(self): + if is_cupy(self.spmatrix): + raise NotImplementedError + return call_sparse("spacing", self) + + def ldexp(self, other): + try: + naked_other = naked(other) + except TypeError: + return NotImplemented + + if get_array_module(naked_other).isscalar(naked_other): + return call_sparse("ldexp", self, naked_other) + + if issparse(naked_other): + naked_other = other.toarray() + + return SparseNDArray(self.spmatrix.multiply(2**naked_other)) + + def frexp(self, **kw): + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + x, y = xp.frexp(self.toarray(), **kw) + return ( + SparseNDArray(xps.csr_matrix(x), shape=self.shape), + SparseNDArray(xps.csr_matrix(y), shape=self.shape), + ) + + def modf(self, **kw): + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + x, y = xp.modf(self.toarray(), **kw) + return ( + SparseNDArray(xps.csr_matrix(x), shape=self.shape), + SparseNDArray(xps.csr_matrix(y), shape=self.shape), + ) + + def sinc(self): + return call_sparse("sinc", self) + + def isfinite(self): + return call_sparse("isfinite", self) + + def isreal(self): + return call_sparse("isreal", self) + + def digitize(self, bins, right=False): + return call_sparse("digitize", self, bins=bins, right=right) + + def repeat(self, repeats, axis=None): + if axis is None: + raise NotImplementedError + + xp = get_array_module(self.spmatrix) + xps = get_sparse_module(self.spmatrix) + r = xp.repeat(self.toarray(), repeats, axis=axis) + x = xps.csr_matrix(r) + return SparseNDArray(x, shape=r.shape) + + @staticmethod + def _expand_val(val, expect_val_size, xp): + if val.size > expect_val_size: + val = val[:expect_val_size] + elif val.size < expect_val_size: + n_repeat = ceildiv(expect_val_size, val.size) + val = xp.tile(val, n_repeat)[:expect_val_size] + return val + + def fill_diagonal(self, val, wrap=False): + lil_matrix = self.spmatrix.tolil() + + xp = get_array_module(self.spmatrix) + val = xp.asarray(val) + if val.ndim > 1: + val = val.ravel() + is_tall_matrix = lil_matrix.shape[0] > lil_matrix.shape[1] + 1 + n_rows, n_cols = lil_matrix.shape + + if not wrap or not is_tall_matrix: + if val.ndim > 0: + # check if val is long enough 
+ expect_val_size = min(n_rows, n_cols) + val = self._expand_val(val, expect_val_size, xp) + lil_matrix.setdiag(val) + matrix = lil_matrix + else: + block_size = n_cols + 1 + + n_block = n_rows // block_size + n_vals = n_cols * n_block + if n_rows % block_size > 0: + # 1 chunk left + n_block += 1 + n_vals += min(n_rows % block_size, n_cols) + + if val.ndim > 0: + val = self._expand_val(val, n_vals, xp) + + sub_matrices = [] + for i in range(n_block): + sub_lil_matrix = lil_matrix[i * block_size : (i + 1) * block_size] + if val.ndim > 0: + sub_val = val[i * n_cols : (i + 1) * n_cols] + else: + sub_val = val + sub_lil_matrix.setdiag(sub_val) + sub_matrices.append(sub_lil_matrix) + + xps = get_sparse_module(self.spmatrix) + matrix = SparseArray(xps.vstack(sub_matrices, format="csr")) + + self.spmatrix = matrix.tocsr() + + def unique( + self, return_index=False, return_inverse=False, return_counts=False, axis=None + ): + if return_inverse or return_index: # pragma: no cover + raise NotImplementedError + if self.ndim == 2 and axis is not None: # pragma: no cover + raise NotImplementedError + + xp = get_array_module(self.spmatrix) + return xp.unique(self.spmatrix.data, return_counts=return_counts) diff --git a/python/xorbits/_mars/lib/sparse/core.py b/python/xorbits/_mars/lib/sparse/core.py new file mode 100644 index 000000000..1e65b2323 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/core.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
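+#
+# Shared helpers for the sparse package: issparse()/is_sparse_or_dense()
+# classify inputs, get_array_module() picks numpy or cupy for a given array,
+# get_sparse_module() returns the matching scipy.sparse or cupy.sparse
+# namespace, and naked() unwraps a SparseNDArray to its raw spmatrix, raising
+# TypeError for anything that is neither sparse nor dense.  For example
+# (illustrative only), get_sparse_module(np.ones(3)) is scipy.sparse.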
+ +import numpy as np + +try: + import scipy.sparse as sps + import scipy.sparse.linalg as splinalg +except ImportError: # pragma: no cover + sps = None + splinalg = None + +from ...utils import lazy_import + +splinalg = splinalg +cp = lazy_import("cupy", rename="cp") +cps = lazy_import("cupy.sparse", rename="cps") + + +def issparse(x): + if cps and cps.issparse(x): + # is cupy.sparse + return True + if sps and sps.issparse(x): + # is scipy.sparse + return True + if np and isinstance(x, np.ndarray): + return False + if cp and isinstance(x, cp.ndarray): + return False + + from .array import SparseNDArray + + return isinstance(x, SparseNDArray) + + +def is_sparse_or_dense(x): + if issparse(x): + return True + m = get_array_module(x) + if m.isscalar(x): + return True + return isinstance(x, m.ndarray) + + +def get_dense_module(x): + from .array import SparseNDArray + + if cp: + if isinstance(x, SparseNDArray): + return get_array_module(x.raw) + return get_array_module(x) + + return np + + +def get_array_module(x): + if cp: + return cp.get_array_module(x) + return np + + +def get_sparse_module(x): + m = get_array_module(x) + if m is np: + return sps + return cps + + +def is_cupy(x): + return get_array_module(x) is cp + + +def naked(x): + if hasattr(x, "spmatrix"): + return x.spmatrix + if not is_sparse_or_dense(x): + raise TypeError("only sparse matrix or ndarray accepted") + return x diff --git a/python/xorbits/_mars/lib/sparse/matrix.py b/python/xorbits/_mars/lib/sparse/matrix.py new file mode 100644 index 000000000..b4f401bc2 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/matrix.py @@ -0,0 +1,239 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
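+#
+# Construction and linear-algebra helpers for 2-d sparse data: the
+# *_sparse_matrix constructors (zeros/diag/eye/triu/tril), LU factorization
+# and triangular solves backed by scipy.sparse.linalg, block assembly via
+# sps.bmat, and the SparseMatrix class, which adds shape, transpose, dot,
+# concatenate and the _reduction implementation on top of SparseArray.  For
+# instance (illustrative only), eye_sparse_matrix(3) wraps a 3x3 CSR identity
+# and lu_sparse_matrix(m) returns the (P, L, U) factors as SparseMatrix
+# instances.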
+ +from collections.abc import Iterable +from typing import List + +import numpy as np + +from .array import SparseArray, SparseNDArray +from .core import ( + cp, + cps, + get_array_module, + get_sparse_module, + issparse, + naked, + splinalg, + sps, +) + + +def zeros_sparse_matrix(shape, dtype=float, gpu=False): + m = sps if not gpu else cps + return SparseMatrix(m.csr_matrix(shape, dtype=np.dtype(dtype))) + + +def diag_sparse_matrix(v, k=0, gpu=False): + v = naked(v) + if gpu and get_array_module(v) is not cp: + v = cp.asarray(v) + if not gpu and get_array_module(v) is not np: + v = v.get() + + if v.ndim == 1: + sparse_m = sps if not gpu else cps + m = n = v.size + k + mat = sparse_m.spdiags(v[None], [k], m, n, format="csr") + return SparseMatrix(mat) + else: + assert v.ndim == 2 + sparse_m = sps if not gpu else cps + sparse_eye = sparse_m.eye(v.shape[0], v.shape[1], k=k) + mat = sparse_eye.multiply(v).tocoo() + size = sparse_eye.nnz + col = mat.col - max(k, 0) + row = get_array_module(col).zeros((len(col),)) + return SparseNDArray( + sparse_m.csr_matrix((mat.data, (row, col)), shape=(1, size)), shape=(size,) + ) + + +def eye_sparse_matrix(N, M=None, k=0, dtype=float, gpu=False): + m = sps if not gpu else cps + return SparseMatrix(m.eye(N, n=M, k=k, dtype=dtype, format="csr")) + + +def triu_sparse_matrix(m, k=0, gpu=False): + m = naked(m) + if gpu and get_array_module(m) is not cp: + m = cp.asarray(m) + if not gpu and get_array_module(m) is not np: + m = m.get() + + sparse_m = sps if not gpu else cps + mat = sparse_m.triu(m, k=k) + return SparseMatrix(mat) + + +def tril_sparse_matrix(m, k=0, gpu=False): + m = naked(m) + if gpu and get_array_module(m) is not cp: + m = cp.asarray(m) + if not gpu and get_array_module(m) is not np: + m = m.get() + + sparse_m = sps if not gpu else cps + mat = sparse_m.tril(m, k=k) + return SparseMatrix(mat) + + +def where(cond, x, y): + cond, x, y = [SparseMatrix(i) if issparse(i) else i for i in (cond, x, y)] + return cond * x + (cond * (-y) + y) + + +def lu_sparse_matrix(a): + a = naked(a) + a = a.tocsc() + super_lu = splinalg.splu( + a, permc_spec="NATURAL", diag_pivot_thresh=0, options={"SymmetricMode": True} + ) + l_ = super_lu.L + u = super_lu.U + p = sps.lil_matrix(a.shape) + p[super_lu.perm_r.copy(), np.arange(a.shape[1])] = 1 + return ( + SparseMatrix(p), + SparseMatrix(l_), + SparseMatrix(u), + ) + + +def solve_triangular_sparse_matrix(a, b, lower=False, sparse=True): + a = naked(a) + b = b.toarray() if issparse(b) else b + + x = splinalg.spsolve_triangular(a, b, lower=lower) + if sparse: + spx = ( + sps.csr_matrix(x).reshape(x.shape[0], 1) + if len(x.shape) == 1 + else sps.csr_matrix(x) + ) + return SparseNDArray(spx, shape=x.shape) + else: + return x + + +def block(arrs: List[List[SparseArray]]) -> SparseArray: + mats = [] + for dim_arrs in arrs: + mats.append([naked(a) for a in dim_arrs]) + return SparseNDArray(sps.bmat(mats, format="csr")) + + +class SparseMatrix(SparseArray): + __slots__ = ("spmatrix",) + + def __init__(self, spmatrix, shape=()): + if shape and len(shape) != 2: + raise ValueError("Only accept 2-d array") + if isinstance(spmatrix, SparseMatrix): + self.spmatrix = spmatrix.spmatrix + else: + self.spmatrix = spmatrix.tocsr() + + @property + def shape(self): + return self.spmatrix.shape + + @property + def size(self): + return int(np.prod(self.shape)) + + def transpose(self, axes=None): + assert axes is None or tuple(axes) == (1, 0) + return SparseMatrix(self.spmatrix.transpose()) + + @property + def T(self): + return 
SparseMatrix(self.spmatrix.T) + + def dot(self, other, sparse=True): + other_shape = other.shape + try: + other = naked(other) + except TypeError: + return NotImplemented + + if sparse: + if len(other_shape) == 1: + x = self.spmatrix.dot(other.T) + else: + x = self.spmatrix.dot(other) + else: + a = self.spmatrix.toarray() + if issparse(other): + other = other.toarray().reshape(other_shape) + x = a.dot(other) + if issparse(x): + shape = (x.shape[0],) if len(other_shape) == 1 else x.shape + return SparseNDArray(x, shape=shape) + return get_array_module(x).asarray(x) + + def concatenate(self, other, axis=0): + try: + other = naked(other) + except TypeError: + return NotImplemented + + if issparse(other): + xps = get_sparse_module(self.spmatrix) + if axis not in (0, 1): + raise ValueError("axis can only be 0 or 1") + method = xps.vstack if axis == 0 else xps.hstack + x = method((self.spmatrix, other)) + else: + xp = get_array_module(self.spmatrix) + x = xp.concatenate((self.spmatrix.toarray(), other), axis=axis) + + if issparse(x): + return SparseMatrix(x) + return get_array_module(x).asarray(x) + + def _reduction( + self, method_name, axis=None, dtype=None, keepdims=None, todense=False, **kw + ): + # TODO: support keepdims + if isinstance(axis, tuple): + if sorted(axis) != [0, 1]: + assert len(axis) == 1 + axis = axis[0] + else: + axis = None + + if todense: + x = self.spmatrix.toarray() + x = getattr(get_array_module(x), method_name)(x, axis=axis, **kw) + else: + x = getattr(self.spmatrix, method_name)(axis=axis, **kw) + if not isinstance(axis, Iterable): + axis = (axis,) + axis = list(range(len(self.shape))) if axis is None else axis + shape = tuple( + s if i not in axis else 1 + for i, s in enumerate(self.shape) + if keepdims or i not in axis + ) + m = get_array_module(x) + if issparse(x): + return SparseNDArray(x, shape=shape) + if m.isscalar(x): + if keepdims: + return m.array([x])[0].reshape((1,) * self.ndim) + else: + return m.array([x])[0] + else: + return m.asarray(x).reshape(shape) diff --git a/python/xorbits/_mars/lib/sparse/tests/__init__.py b/python/xorbits/_mars/lib/sparse/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/lib/sparse/tests/test_sparse.py b/python/xorbits/_mars/lib/sparse/tests/test_sparse.py new file mode 100644 index 000000000..2abf150f5 --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/tests/test_sparse.py @@ -0,0 +1,474 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle + +import numpy as np +import pytest +import scipy.sparse as sps + +from ... import sparse as mls +from .. import SparseMatrix, SparseNDArray, SparseVector +from ..core import issparse + +s1_data = sps.csr_matrix([[1, 0, 1], [0, 0, 1]]) +s2_data = sps.csr_matrix([[0, 1, 1], [1, 0, 1]]) +v1_data = np.random.rand(3) +v1 = sps.csr_matrix(v1_data) +v2_data = np.random.rand(2) +v2 = sps.csr_matrix(v2_data) +d1 = np.array([1, 2, 3]) + + +def assert_array_equal(a, b, almost=False): + if issparse(a): + a = a.toarray() + else: + a = np.asarray(a) + if issparse(b): + b = b.toarray() + else: + b = np.asarray(b) + if not almost: + np.testing.assert_array_equal(a, b) + else: + np.testing.assert_almost_equal(a, b) + + +def test_sparse_creation(): + with pytest.raises(ValueError): + SparseNDArray() + + s = SparseNDArray(s1_data) + assert s.ndim == 2 + assert isinstance(s, SparseMatrix) + assert_array_equal(s.toarray(), s1_data.A) + assert_array_equal(s.todense(), s1_data.A) + + ss = pickle.loads(pickle.dumps(s)) + assert s == ss + assert_array_equal(ss.toarray(), s1_data.A) + assert_array_equal(ss.todense(), s1_data.A) + + v = SparseNDArray(v1, shape=(3,)) + assert s.ndim + assert isinstance(v, SparseVector) + assert v.shape == (3,) + assert_array_equal(v.todense(), v1_data) + assert_array_equal(v.toarray(), v1_data) + assert_array_equal(v, v1_data) + + vv = pickle.loads(pickle.dumps(v)) + assert v == vv + assert_array_equal(vv.todense(), v1_data) + assert_array_equal(vv.toarray(), v1_data) + assert_array_equal(vv, v1_data) + + +def test_sparse_add(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 + s2, s1 + s2) + assert_array_equal(s1 + d1, s1 + d1) + assert_array_equal(d1 + s1, d1 + s1) + r = sps.csr_matrix(((s1.data + 1), s1.indices, s1.indptr), s1.shape) + assert_array_equal(s1 + 1, r) + r = sps.csr_matrix(((1 + s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(1 + s1, r) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v + v, v1_data + v1_data) + assert_array_equal(v + d1, v1_data + d1) + assert_array_equal(d1 + v, d1 + v1_data) + r = sps.csr_matrix(((v1.data + 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v + 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 + v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 + v, r.toarray().reshape(3)) + + +def test_sparse_subtract(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 - s2, s1 - s2) + assert_array_equal(s1 - d1, s1 - d1) + assert_array_equal(d1 - s1, d1 - s1) + r = sps.csr_matrix(((s1.data - 1), s1.indices, s1.indptr), s1.shape) + assert_array_equal(s1 - 1, r) + r = sps.csr_matrix(((1 - s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(1 - s1, r) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v - v, v1_data - v1_data) + assert_array_equal(v - d1, v1_data - d1) + assert_array_equal(d1 - v, d1 - v1_data) + r = sps.csr_matrix(((v1.data - 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v - 1, 
r.toarray().reshape(3)) + r = sps.csr_matrix(((1 - v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 - v, r.toarray().reshape(3)) + + +def test_sparse_multiply(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 * s2, s1_data.multiply(s2_data)) + assert_array_equal(s1 * d1, s1_data.multiply(d1)) + assert_array_equal(d1 * s1, s1_data.multiply(d1)) + assert_array_equal(s1 * 2, s1 * 2) + assert_array_equal(2 * s1, s1 * 2) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v * v, v1_data * v1_data) + assert_array_equal(v * d1, v1_data * d1) + assert_array_equal(d1 * v, d1 * v1_data) + r = sps.csr_matrix(((v1.data * 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v * 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 * v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 * v, r.toarray().reshape(3)) + + +def test_sparse_divide(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 / s2, s1 / s2) + assert_array_equal(s1 / d1, s1 / d1) + assert_array_equal(d1 / s1, d1 / s1.toarray()) + assert_array_equal(s1 / 2, s1 / 2) + assert_array_equal(2 / s1, 2 / s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v / v, v1_data / v1_data) + assert_array_equal(v / d1, v1_data / d1) + assert_array_equal(d1 / v, d1 / v1_data) + r = sps.csr_matrix(((v1.data / 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v / 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 / v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 / v, r.toarray().reshape(3)) + + +def test_sparse_floor_divide(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 // s2, s1.toarray() // s2.toarray()) + assert_array_equal(s1 // d1, s1.toarray() // d1) + assert_array_equal(d1 // s1, d1 // s1.toarray()) + assert_array_equal(s1 // 2, s1.toarray() // 2) + assert_array_equal(2 // s1, 2 // s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v // v, v1_data // v1_data) + assert_array_equal(v // d1, v1_data // d1) + assert_array_equal(d1 // v, d1 // v1_data) + r = sps.csr_matrix(((v1.data // 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v // 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 // v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 // v, r.toarray().reshape(3)) + + +def test_sparse_power(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1**s2, s1.toarray() ** s2.toarray()) + assert_array_equal(s1**d1, s1.toarray() ** d1) + assert_array_equal(d1**s1, d1 ** s1.toarray()) + assert_array_equal(s1**2, s1_data.power(2)) + assert_array_equal(2**s1, 2 ** s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v**v, v1_data**v1_data) + assert_array_equal(v**d1, v1_data**d1) + assert_array_equal(d1**v, d1**v1_data) + r = sps.csr_matrix(((v1.data**1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v**1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1**v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1**v, r.toarray().reshape(3)) + + +def test_sparse_mod(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + assert_array_equal(s1 % s2, s1.toarray() % s2.toarray()) + assert_array_equal(s1 % d1, s1.toarray() % d1) + assert_array_equal(d1 % s1, d1 % s1.toarray()) + assert_array_equal(s1 % 2, s1.toarray() % 2) + 
assert_array_equal(2 % s1, 2 % s1.toarray()) + + # test sparse vector + v = SparseNDArray(v1, shape=(3,)) + assert_array_equal(v % v, v1_data % v1_data) + assert_array_equal(v % d1, v1_data % d1) + assert_array_equal(d1 % v, d1 % v1_data) + r = sps.csr_matrix(((v1.data % 1), v1.indices, v1.indptr), v1.shape) + assert_array_equal(v % 1, r.toarray().reshape(3)) + r = sps.csr_matrix(((1 % v1.data), v1.indices, v1.indptr), v1.shape) + assert_array_equal(1 % v, r.toarray().reshape(3)) + + +def test_sparse_bin(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + v = SparseNDArray(v1, shape=(3,)) + + for method in ( + "fmod", + "logaddexp", + "logaddexp2", + "equal", + "not_equal", + "less", + "less_equal", + "greater", + "greater_equal", + "hypot", + "arctan2", + ): + lm, rm = getattr(mls, method), getattr(np, method) + assert_array_equal(lm(s1, s2), rm(s1.toarray(), s2.toarray())) + assert_array_equal(lm(s1, d1), rm(s1.toarray(), d1)) + assert_array_equal(lm(d1, s1), rm(d1, s1.toarray())) + r1 = sps.csr_matrix((rm(s1.data, 2), s1.indices, s1.indptr), s1.shape) + assert_array_equal(lm(s1, 2), r1) + r2 = sps.csr_matrix((rm(2, s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(lm(2, s1), r2) + + # test sparse + assert_array_equal(lm(v, v), rm(v1_data, v1_data)) + assert_array_equal(lm(v, d1), rm(v1_data, d1)) + assert_array_equal(lm(d1, v), rm(d1, v1_data)) + assert_array_equal(lm(v, 2), rm(v1_data, 2)) + assert_array_equal(lm(2, v), rm(2, v1_data)) + + +def test_sparse_unary(): + s1 = SparseNDArray(s1_data) + v = SparseNDArray(v1, shape=(3,)) + + for method in ( + "negative", + "positive", + "absolute", + "abs", + "fabs", + "rint", + "sign", + "conj", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "cbrt", + "reciprocal", + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "arcsinh", + "arccosh", + "arctanh", + "deg2rad", + "rad2deg", + "angle", + "isnan", + "isinf", + "signbit", + "sinc", + "isreal", + "isfinite", + ): + lm, rm = getattr(mls, method), getattr(np, method) + r = sps.csr_matrix((rm(s1.data), s1.indices, s1.indptr), s1.shape) + assert_array_equal(lm(s1), r) + assert_array_equal(lm(v), rm(v1_data)) + + +def test_sparse_dot(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + v1_s = SparseNDArray(v1, shape=(3,)) + v2_s = SparseNDArray(v2, shape=(2,)) + + assert_array_equal(mls.dot(s1, s2.T), s1.dot(s2.T)) + assert_array_equal(s1.dot(d1), s1.dot(d1)) + assert_array_equal(d1.dot(s1.T), d1.dot(s1.T.toarray())) + + assert_array_equal(s1 @ s2.T, s1_data @ s2_data.T) + + assert_array_equal(mls.tensordot(s1, s2.T, axes=(1, 0)), s1.dot(s2.T)) + assert_array_equal(mls.tensordot(s1, d1, axes=(1, -1)), s1.dot(d1)) + assert_array_equal(mls.tensordot(d1, s1.T, axes=(0, 0)), d1.dot(s1.T.toarray())) + + assert_array_equal(mls.dot(s1, v1_s), s1.dot(v1_data)) + assert_array_equal(mls.dot(s2, v1_s), s2.dot(v1_data)) + assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.A)) + assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.A)) + assert_array_equal(mls.dot(v1_s, v1_s), v1_data.dot(v1_data), almost=True) + assert_array_equal(mls.dot(v2_s, v2_s), v2_data.dot(v2_data), almost=True) + + assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.A)) + assert_array_equal(mls.dot(v1_s, v1_s, sparse=False), v1_data.dot(v1_data)) + + +def test_sparse_sum(): + s1 = SparseNDArray(s1_data) + v = SparseNDArray(v1, shape=(3,)) + assert s1.sum() == s1.sum() + 
np.testing.assert_array_equal(s1.sum(axis=1), np.asarray(s1.sum(axis=1)).reshape(2)) + np.testing.assert_array_equal(s1.sum(axis=0), np.asarray(s1.sum(axis=0)).reshape(3)) + np.testing.assert_array_equal(v.sum(), np.asarray(v1_data.sum())) + + +def test_sparse_setitem(): + s1 = SparseNDArray(s1_data.copy()) + s1[1:2, 1] = [2] + ss1 = s1_data.tolil() + ss1[1:2, 1] = [2] + np.testing.assert_array_equal(s1.toarray(), ss1.toarray()) + + v = SparseVector(v1, shape=(3,)) + v[1:2] = [2] + vv1 = v1_data.copy() + vv1[1:2] = [2] + np.testing.assert_array_equal(v.toarray(), vv1) + + +def test_sparse_maximum(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + np.testing.assert_array_equal(s1.maximum(s2).toarray(), s1.maximum(s2).toarray()) + + v = SparseVector(v1, shape=(3,)) + np.testing.assert_array_equal(v.maximum(d1), np.maximum(v1_data, d1)) + + +def test_sparse_minimum(): + s1 = SparseNDArray(s1_data) + s2 = SparseNDArray(s2_data) + + np.testing.assert_array_equal(s1.minimum(s2).toarray(), s1.minimum(s2).toarray()) + + v = SparseVector(v1, shape=(3,)) + np.testing.assert_array_equal(v.minimum(d1), np.minimum(v1_data, d1)) + + +def test_sparse_fill_diagonal(): + s1 = sps.random(100, 11, density=0.3, format="csr", random_state=0) + + # fill scalar + arr = SparseNDArray(s1) + arr.fill_diagonal(3) + + expected = s1.copy().A + np.fill_diagonal(expected, 3) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill scalar, wrap=True + arr = SparseNDArray(s1) + arr.fill_diagonal(3, wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, 3, wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill list + arr = SparseNDArray(s1) + arr.fill_diagonal([1, 2, 3]) + + expected = s1.copy().A + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill list, wrap=True + arr = SparseNDArray(s1) + arr.fill_diagonal([1, 2, 3], wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, [1, 2, 3], wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill long list + val = np.random.RandomState(0).rand(101) + arr = SparseNDArray(s1) + arr.fill_diagonal(val) + + expected = s1.copy().A + np.fill_diagonal(expected, val) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill long list, wrap=True + val = np.random.RandomState(0).rand(101) + arr = SparseNDArray(s1) + arr.fill_diagonal(val, wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, val, wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill ndarray + val = np.random.RandomState(0).rand(3, 4) + arr = SparseNDArray(s1) + arr.fill_diagonal(val) + + expected = s1.copy().A + np.fill_diagonal(expected, val) + + np.testing.assert_array_equal(arr.toarray(), expected) + + # fill ndarray, wrap=True + val = np.random.RandomState(0).rand(3, 4) + arr = SparseNDArray(s1) + arr.fill_diagonal(val, wrap=True) + + expected = s1.copy().A + np.fill_diagonal(expected, val, wrap=True) + + np.testing.assert_array_equal(arr.toarray(), expected) + + +def test_sparse_block(): + r1 = sps.rand(10, 5) + r2 = sps.rand(10, 3) + r3 = sps.rand(3, 5) + r4 = sps.rand(3, 3) + + result = mls.block( + [[SparseNDArray(r1), SparseNDArray(r2)], [SparseNDArray(r3), SparseNDArray(r4)]] + ) + expected = sps.bmat([[r1, r2], [r3, r4]]) + assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/lib/sparse/vector.py b/python/xorbits/_mars/lib/sparse/vector.py new file mode 100644 index 
000000000..86ad51e9f --- /dev/null +++ b/python/xorbits/_mars/lib/sparse/vector.py @@ -0,0 +1,148 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .array import SparseArray, SparseNDArray +from .core import get_array_module, get_sparse_module, is_cupy, issparse, naked, np + + +class SparseVector(SparseArray): + __slots__ = ("spmatrix",) + + def __init__(self, spvector, shape=()): + if shape and len(shape) != 1: + raise ValueError("Only accept 1-d array") + if isinstance(spvector, SparseVector): + self.spmatrix = spvector.spmatrix + else: + spvector = spvector.reshape(1, shape[0]) + self.spmatrix = spvector.tocsr() + + @property + def shape(self): + return (self.spmatrix.shape[1],) + + def transpose(self, axes=None): + assert axes is None or tuple(axes) == (0,) + return self + + @property + def T(self): + return self + + def __truediv__(self, other): + try: + other = naked(other) + except TypeError: + return NotImplemented + x = self.spmatrix / other + if issparse(x): + return SparseNDArray(x, shape=self.shape) + if x.shape != self.shape: + x = np.asarray(x).reshape(self.shape) + return get_array_module(x).asarray(x) + + def __rtruediv__(self, other): + try: + other = naked(other) + except TypeError: + return NotImplemented + try: + x = other / self.spmatrix + except TypeError: + x = other / self.spmatrix.toarray() + if issparse(x): + return SparseNDArray(x, shape=self.shape) + if x.shape != self.shape: + x = np.asarray(x).reshape(self.shape) + return get_array_module(x).asarray(x) + + def dot(self, other, sparse=True): + other_shape = other.shape + try: + other = naked(other) + except TypeError: + return NotImplemented + + if not sparse: + a = self.toarray() + if issparse(other): + other = other.toarray().reshape(other_shape) + + x = a.dot(other) + else: + if len(other_shape) == 1: + x = self.spmatrix.dot(other.T) + else: + x = self.spmatrix.dot(other) + if issparse(x): + if x.shape == (1, 1): + # return scalar + return x.toarray()[0, 0] + shape = (x.shape[1],) + return SparseNDArray(x, shape=shape) + return get_array_module(x).asarray(x) + + def concatenate(self, other, axis=0): + if other.ndim != 1: + raise ValueError("all the input arrays must have same number of dimensions") + + try: + other = naked(other) + except TypeError: + return NotImplemented + + if issparse(other): + xps = get_sparse_module(self.spmatrix) + if axis != 0: + raise ValueError("axis can only be 0") + other = other.reshape(1, other.shape[0]) if other.shape[0] != 1 else other + x = xps.hstack((self.spmatrix.reshape(1, self.shape[0]), other)) + else: + xp = get_array_module(self.spmatrix) + x = xp.concatenate( + (self.spmatrix.toarray().reshape(self.shape), other), axis=axis + ) + + if issparse(x): + return SparseNDArray(x, shape=(x.shape[1],)) + return get_array_module(x).asarray(x) + + def _reduction( + self, method_name, axis=None, dtype=None, keepdims=None, todense=False, **kw + ): + if not todense: + assert keepdims is None or keepdims is False + + if isinstance(axis, 
tuple): + assert axis == (0,) + axis = None + + if todense: + x = self.spmatrix.toarray() + x = getattr(get_array_module(x), method_name)(x, axis=axis, **kw) + else: + x = getattr(self.spmatrix, method_name)(axis=axis, **kw) + + m = get_array_module(x) + return m.array([x])[0] + + def __setitem__(self, key, value): + if is_cupy(self.spmatrix): + return NotImplemented + else: + x = self.spmatrix.tolil() + key = (0,) + (key,) + x[key] = value + x = x.tocsr() + self.spmatrix = x diff --git a/python/xorbits/_mars/lib/tbcode.py b/python/xorbits/_mars/lib/tbcode.py new file mode 100644 index 000000000..34c637d0c --- /dev/null +++ b/python/xorbits/_mars/lib/tbcode.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This utility module dumps code of remote traceback and loads them +into local linecache. This enables displaying codes of remote +tracebacks correctly. +""" + +import linecache +import os +import types +from collections import defaultdict + + +def dump_traceback_code(tb: types.TracebackType, number_of_lines_of_context: int = 5): + """ + Dump codes before and after lines of tracebacks. + + Parameters + ---------- + tb: types.TracebackType + Traceback object + number_of_lines_of_context: int + Total number of lines around the code + Returns + ------- + result: dict + Dumped code lines of traceback + """ + results = defaultdict(lambda: dict(fragments=[])) + + while tb: + file_name = tb.tb_frame.f_code.co_filename + if linecache.getline(file_name, tb.tb_lineno): # pragma: no branch + code_lines = linecache.cache[file_name][2] + left_range = max(tb.tb_lineno - number_of_lines_of_context // 2 - 1, 0) + right_range = min(left_range + number_of_lines_of_context, len(code_lines)) + + cache_data = linecache.cache[file_name] + fragment = cache_data[2][left_range:right_range] + results[file_name]["fragments"].append( + dict(left=left_range, right=right_range, code=fragment) + ) + results[file_name].update( + dict(size=cache_data[0], lines=len(cache_data[2])) + ) + tb = tb.tb_next + return dict(results) + + +def load_traceback_code(code_frags: dict, cache: dict = None): + """ + Load dumped codes for remote tracebacks. + + Parameters + ---------- + code_frags: dict + Dumped codes for remote traceback. + cache: dict + Target for codes to be dumped, for test purpose only. + Production code should keep this field as None. 
+ """ + if cache is not None: + real_cache = False + else: + real_cache = True + cache = linecache.cache + + for file_name, profile in code_frags.items(): + if real_cache and os.path.exists(file_name): + # skip rewriting caches of existing files + continue + + if file_name not in cache: + # keep field 1 (mtime) as None to ensure lazy cache + cache[file_name] = ( + profile["size"], + None, + [""] * profile["lines"], + file_name, + ) + for fragment in profile["fragments"]: + left_range, right_range = fragment["left"], fragment["right"] + cache[file_name][2][left_range:right_range] = fragment["code"] diff --git a/python/xorbits/_mars/lib/tblib/__init__.py b/python/xorbits/_mars/lib/tblib/__init__.py new file mode 100644 index 000000000..69a5e28e0 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/__init__.py @@ -0,0 +1,329 @@ +import re +import sys +from types import CodeType +from types import FrameType +from types import TracebackType + +try: + from __pypy__ import tproxy +except ImportError: + tproxy = None +try: + from .cpython import tb_set_next +except ImportError: + tb_set_next = None + +if not tb_set_next and not tproxy: + raise ImportError("Cannot use tblib. Runtime not supported.") + +__version__ = "1.7.0" +__all__ = "Traceback", "TracebackParseError", "Frame", "Code" + +PY3 = sys.version_info[0] == 3 +FRAME_RE = re.compile( + r'^\s*File "(?P.+)", line (?P\d+)(, in (?P.+))?$' +) + + +class _AttrDict(dict): + __slots__ = () + + def __getattr__(self, name): + try: + return self[name] + except KeyError: + raise AttributeError(name) + + +# noinspection PyPep8Naming +class __traceback_maker(Exception): + pass + + +class TracebackParseError(Exception): + pass + + +class Code(object): + """ + Class that replicates just enough of the builtin Code object to enable serialization and traceback rendering. + """ + + co_code = None + + def __init__(self, code): + self.co_filename = code.co_filename + self.co_name = code.co_name + self.co_argcount = 0 + self.co_kwonlyargcount = 0 + self.co_varnames = () + self.co_nlocals = 0 + self.co_stacksize = 0 + self.co_flags = 64 + self.co_firstlineno = 0 + + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + return getattr(self, args[0]) + else: + return getattr(self, operation)(*args, **kwargs) + + +class Frame(object): + """ + Class that replicates just enough of the builtin Frame object to enable serialization and traceback rendering. + """ + + def __init__(self, frame): + self.f_locals = {} + self.f_globals = { + k: v for k, v in frame.f_globals.items() if k in ("__file__", "__name__") + } + self.f_code = Code(frame.f_code) + self.f_lineno = frame.f_lineno + + def clear(self): + """ + For compatibility with PyPy 3.5; + clear() was added to frame in Python 3.4 + and is called by traceback.clear_frames(), which + in turn is called by unittest.TestCase.assertRaises + """ + + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + if args[0] == "f_code": + return tproxy(CodeType, self.f_code.__tproxy__) + else: + return getattr(self, args[0]) + else: + return getattr(self, operation)(*args, **kwargs) + + +class Traceback(object): + """ + Class that wraps builtin Traceback objects. 
+ """ + + tb_next = None + + def __init__(self, tb): + self.tb_frame = Frame(tb.tb_frame) + # noinspection SpellCheckingInspection + self.tb_lineno = int(tb.tb_lineno) + + # Build in place to avoid exceeding the recursion limit + tb = tb.tb_next + prev_traceback = self + cls = type(self) + while tb is not None: + traceback = object.__new__(cls) + traceback.tb_frame = Frame(tb.tb_frame) + traceback.tb_lineno = int(tb.tb_lineno) + prev_traceback.tb_next = traceback + prev_traceback = traceback + tb = tb.tb_next + + def as_traceback(self): + """ + Convert to a builtin Traceback object that is usable for raising or rendering a stacktrace. + """ + if tproxy: + return tproxy(TracebackType, self.__tproxy__) + if not tb_set_next: + raise RuntimeError("Unsupported Python interpreter!") + + current = self + top_tb = None + tb = None + while current: + f_code = current.tb_frame.f_code + code = compile( + "\n" * (current.tb_lineno - 1) + "raise __traceback_maker", + current.tb_frame.f_code.co_filename, + "exec", + ) + if hasattr(code, "replace"): + # Python 3.8 and newer + code = code.replace( + co_argcount=0, + co_filename=f_code.co_filename, + co_name=f_code.co_name, + co_freevars=(), + co_cellvars=(), + ) + elif PY3: + code = CodeType( + 0, + code.co_kwonlyargcount, + code.co_nlocals, + code.co_stacksize, + code.co_flags, + code.co_code, + code.co_consts, + code.co_names, + code.co_varnames, + f_code.co_filename, + f_code.co_name, + code.co_firstlineno, + code.co_lnotab, + (), + (), + ) + else: + code = CodeType( + 0, + code.co_nlocals, + code.co_stacksize, + code.co_flags, + code.co_code, + code.co_consts, + code.co_names, + code.co_varnames, + f_code.co_filename.encode(), + f_code.co_name.encode(), + code.co_firstlineno, + code.co_lnotab, + (), + (), + ) + + # noinspection PyBroadException + try: + exec(code, dict(current.tb_frame.f_globals), {}) + except Exception: + next_tb = sys.exc_info()[2].tb_next + if top_tb is None: + top_tb = next_tb + if tb is not None: + tb_set_next(tb, next_tb) + tb = next_tb + del next_tb + + current = current.tb_next + try: + return top_tb + finally: + del top_tb + del tb + + to_traceback = as_traceback + + # noinspection SpellCheckingInspection + def __tproxy__(self, operation, *args, **kwargs): + """ + Necessary for PyPy's tproxy. + """ + if operation in ("__getattribute__", "__getattr__"): + if args[0] == "tb_next": + return self.tb_next and self.tb_next.as_traceback() + elif args[0] == "tb_frame": + return tproxy(FrameType, self.tb_frame.__tproxy__) + else: + return getattr(self, args[0]) + else: + return getattr(self, operation)(*args, **kwargs) + + def as_dict(self): + """ + Converts to a dictionary representation. You can serialize the result to JSON as it only has + builtin objects like dicts, lists, ints or strings. + """ + if self.tb_next is None: + tb_next = None + else: + tb_next = self.tb_next.to_dict() + + code = { + "co_filename": self.tb_frame.f_code.co_filename, + "co_name": self.tb_frame.f_code.co_name, + } + frame = { + "f_globals": self.tb_frame.f_globals, + "f_code": code, + "f_lineno": self.tb_frame.f_lineno, + } + return { + "tb_frame": frame, + "tb_lineno": self.tb_lineno, + "tb_next": tb_next, + } + + to_dict = as_dict + + @classmethod + def from_dict(cls, dct): + """ + Creates an instance from a dictionary with the same structure as ``.as_dict()`` returns. 
+ """ + if dct["tb_next"]: + tb_next = cls.from_dict(dct["tb_next"]) + else: + tb_next = None + + code = _AttrDict( + co_filename=dct["tb_frame"]["f_code"]["co_filename"], + co_name=dct["tb_frame"]["f_code"]["co_name"], + ) + frame = _AttrDict( + f_globals=dct["tb_frame"]["f_globals"], + f_code=code, + f_lineno=dct["tb_frame"]["f_lineno"], + ) + tb = _AttrDict( + tb_frame=frame, + tb_lineno=dct["tb_lineno"], + tb_next=tb_next, + ) + return cls(tb) + + @classmethod + def from_string(cls, string, strict=True): + """ + Creates an instance by parsing a stacktrace. Strict means that parsing stops when lines are not indented by at least two spaces + anymore. + """ + frames = [] + header = strict + + for line in string.splitlines(): + line = line.rstrip() + if header: + if line == "Traceback (most recent call last):": + header = False + continue + frame_match = FRAME_RE.match(line) + if frame_match: + frames.append(frame_match.groupdict()) + elif line.startswith(" "): + pass + elif strict: + break # traceback ended + + if frames: + previous = None + for frame in reversed(frames): + previous = _AttrDict( + frame, + tb_frame=_AttrDict( + frame, + f_globals=_AttrDict( + __file__=frame["co_filename"], + __name__="?", + ), + f_code=_AttrDict(frame), + f_lineno=int(frame["tb_lineno"]), + ), + tb_next=previous, + ) + return cls(previous) + else: + raise TracebackParseError("Could not find any frames in %r." % string) diff --git a/python/xorbits/_mars/lib/tblib/cpython.py b/python/xorbits/_mars/lib/tblib/cpython.py new file mode 100644 index 000000000..06d898364 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/cpython.py @@ -0,0 +1,83 @@ +""" +Taken verbatim from Jinja2. +https://github.com/mitsuhiko/jinja2/blob/master/jinja2/debug.py#L267 +""" +import platform +import sys + + +def _init_ugly_crap(): + """This function implements a few ugly things so that we can patch the + traceback objects. The function returned allows resetting `tb_next` on + any python traceback object. 
Do not attempt to use this on non cpython + interpreters + """ + import ctypes + from types import TracebackType + + # figure out side of _Py_ssize_t + if hasattr(ctypes.pythonapi, "Py_InitModule4_64"): + _Py_ssize_t = ctypes.c_int64 + else: + _Py_ssize_t = ctypes.c_int + + # regular python + class _PyObject(ctypes.Structure): + pass + + _PyObject._fields_ = [ + ("ob_refcnt", _Py_ssize_t), + ("ob_type", ctypes.POINTER(_PyObject)), + ] + + # python with trace + if hasattr(sys, "getobjects"): + + class _PyObject(ctypes.Structure): + pass + + _PyObject._fields_ = [ + ("_ob_next", ctypes.POINTER(_PyObject)), + ("_ob_prev", ctypes.POINTER(_PyObject)), + ("ob_refcnt", _Py_ssize_t), + ("ob_type", ctypes.POINTER(_PyObject)), + ] + + class _Traceback(_PyObject): + pass + + _Traceback._fields_ = [ + ("tb_next", ctypes.POINTER(_Traceback)), + ("tb_frame", ctypes.POINTER(_PyObject)), + ("tb_lasti", ctypes.c_int), + ("tb_lineno", ctypes.c_int), + ] + + def tb_set_next(tb, next): + """Set the tb_next attribute of a traceback object.""" + if not ( + isinstance(tb, TracebackType) + and (next is None or isinstance(next, TracebackType)) + ): + raise TypeError("tb_set_next arguments must be traceback objects") + obj = _Traceback.from_address(id(tb)) + if tb.tb_next is not None: + old = _Traceback.from_address(id(tb.tb_next)) + old.ob_refcnt -= 1 + if next is None: + obj.tb_next = ctypes.POINTER(_Traceback)() + else: + next = _Traceback.from_address(id(next)) + next.ob_refcnt += 1 + obj.tb_next = ctypes.pointer(next) + + return tb_set_next + + +tb_set_next = None +try: + if platform.python_implementation() == "CPython": + tb_set_next = _init_ugly_crap() +except Exception as exc: + sys.stderr.write("Failed to initialize cpython support: {!r}".format(exc)) +del _init_ugly_crap diff --git a/python/xorbits/_mars/lib/tblib/decorators.py b/python/xorbits/_mars/lib/tblib/decorators.py new file mode 100644 index 000000000..77778bc97 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/decorators.py @@ -0,0 +1,44 @@ +import sys +from functools import wraps + +from . import Traceback + + +class Error(object): + def __init__(self, exc_type, exc_value, traceback): + self.exc_type = exc_type + self.exc_value = exc_value + self.__traceback = Traceback(traceback) + + @property + def traceback(self): + return self.__traceback.as_traceback() + + def reraise(self): + raise self.exc_value.with_traceback(self.traceback) from None + + +def return_error(func, exc_type=Exception): + @wraps(func) + def return_exceptions_wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except exc_type: + return Error(*sys.exc_info()) + + return return_exceptions_wrapper + + +returns_error = ( + return_errors +) = returns_errors = return_error # cause I make too many typos + + +@return_error +def apply_with_return_error(args): + """ + args is a tuple where the first argument is a callable. + eg:: + apply_with_return_error((func, 1, 2, 3)) - this will call func(1, 2, 3) + """ + return args[0](*args[1:]) diff --git a/python/xorbits/_mars/lib/tblib/pickling_support.py b/python/xorbits/_mars/lib/tblib/pickling_support.py new file mode 100644 index 000000000..995f4c620 --- /dev/null +++ b/python/xorbits/_mars/lib/tblib/pickling_support.py @@ -0,0 +1,91 @@ +import sys +from types import TracebackType + +from . import Frame +from . 
import Traceback + +if sys.version_info.major >= 3: + import copyreg +else: + import copy_reg as copyreg + + +def unpickle_traceback(tb_frame, tb_lineno, tb_next): + ret = object.__new__(Traceback) + ret.tb_frame = tb_frame + ret.tb_lineno = tb_lineno + ret.tb_next = tb_next + return ret.as_traceback() + + +def pickle_traceback(tb): + return unpickle_traceback, ( + Frame(tb.tb_frame), + tb.tb_lineno, + tb.tb_next and Traceback(tb.tb_next), + ) + + +def unpickle_exception(func, args, cause, tb): + inst = func(*args) + inst.__cause__ = cause + inst.__traceback__ = tb + return inst + + +def pickle_exception(obj): + # All exceptions, unlike generic Python objects, define __reduce_ex__ + # __reduce_ex__(4) should be no different from __reduce_ex__(3). + # __reduce_ex__(5) could bring benefits in the unlikely case the exception + # directly contains buffers, but PickleBuffer objects will cause a crash when + # running on protocol=4, and there's no clean way to figure out the current + # protocol from here. Note that any object returned by __reduce_ex__(3) will + # still be pickled with protocol 5 if pickle.dump() is running with it. + rv = obj.__reduce_ex__(3) + if isinstance(rv, str): + raise TypeError("str __reduce__ output is not supported") + assert isinstance(rv, tuple) and len(rv) >= 2 + + return (unpickle_exception, rv[:2] + (obj.__cause__, obj.__traceback__)) + rv[2:] + + +def _get_subclasses(cls): + # Depth-first traversal of all direct and indirect subclasses of cls + to_visit = [cls] + while to_visit: + this = to_visit.pop() + yield this + to_visit += list(this.__subclasses__()) + + +def install(*exc_classes_or_instances): + copyreg.pickle(TracebackType, pickle_traceback) + + if sys.version_info.major < 3: + # Dummy decorator? + if len(exc_classes_or_instances) == 1: + exc = exc_classes_or_instances[0] + if isinstance(exc, type) and issubclass(exc, BaseException): + return exc + return + + if not exc_classes_or_instances: + for exception_cls in _get_subclasses(BaseException): + copyreg.pickle(exception_cls, pickle_exception) + return + + for exc in exc_classes_or_instances: + if isinstance(exc, BaseException): + while exc is not None: + copyreg.pickle(type(exc), pickle_exception) + exc = exc.__cause__ + elif isinstance(exc, type) and issubclass(exc, BaseException): + copyreg.pickle(exc, pickle_exception) + # Allow using @install as a decorator for Exception classes + if len(exc_classes_or_instances) == 1: + return exc + else: + raise TypeError( + "Expected subclasses or instances of BaseException, got %s" + % (type(exc)) + ) diff --git a/python/xorbits/_mars/lib/tests/__init__.py b/python/xorbits/_mars/lib/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/lib/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
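A minimal usage sketch for the vendored tblib pickling support added above, kept outside the file hunks. It only exercises functions defined in pickling_support.py; the import path assumes the package introduced by this change is importable as xorbits._mars.

import pickle
import sys

from xorbits._mars.lib.tblib import pickling_support  # assumed import path

# Register picklers for TracebackType and all BaseException subclasses.
pickling_support.install()

try:
    1 / 0
except ZeroDivisionError:
    exc = sys.exc_info()[1]

# The traceback is serialized alongside the exception and rebuilt on load,
# so remote tracebacks can be rendered or re-raised locally.
restored = pickle.loads(pickle.dumps(exc))
assert restored.__traceback__ is not None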
diff --git a/python/xorbits/_mars/lib/tests/test_lib.py b/python/xorbits/_mars/lib/tests/test_lib.py new file mode 100644 index 000000000..56ab750c1 --- /dev/null +++ b/python/xorbits/_mars/lib/tests/test_lib.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle +import sys + +import numpy as np +import pandas as pd + +from ...tests.core import assert_groupby_equal +from ...utils import calc_data_size, estimate_pandas_size +from ..groupby_wrapper import wrapped_groupby +from ..tbcode import dump_traceback_code, load_traceback_code + + +def test_groupby_wrapper(): + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + }, + index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]), + ) + + conv_func = lambda x: pickle.loads(pickle.dumps(x)) + + grouped = conv_func(wrapped_groupby(df, level=0)) + assert_groupby_equal(grouped, df.groupby(level=0)) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + assert sys.getsizeof(grouped) > sys.getsizeof(grouped.groupby_obj) + assert calc_data_size(grouped) > sys.getsizeof(grouped.groupby_obj) + assert grouped.estimate_size() > estimate_pandas_size(grouped.groupby_obj) + + grouped = conv_func(wrapped_groupby(df, level=0).C) + assert_groupby_equal(grouped, df.groupby(level=0).C) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, "B")) + assert_groupby_equal(grouped, df.groupby("B")) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, "B").C) + assert_groupby_equal(grouped, df.groupby("B").C, with_selection=True) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, "B")[["C", "D"]]) + assert_groupby_equal(grouped, df.groupby("B")[["C", "D"]], with_selection=True) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, ["B", "C"])) + assert_groupby_equal(grouped, df.groupby(["B", "C"])) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, ["B", "C"]).C) + assert_groupby_equal(grouped, df.groupby(["B", "C"]).C, with_selection=True) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, ["B", "C"])[["A", "D"]]) + assert_groupby_equal( + grouped, df.groupby(["B", "C"])[["A", "D"]], with_selection=True + ) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, ["B", "C"])[["C", "D"]]) + assert_groupby_equal( + grouped, df.groupby(["B", "C"])[["C", "D"]], with_selection=True + ) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2)) + 
assert_groupby_equal(grouped, df.groupby(lambda x: x[-1] % 2), with_selection=True) + assert grouped.shape == (8, 4) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2).C) + assert_groupby_equal( + grouped, df.groupby(lambda x: x[-1] % 2).C, with_selection=True + ) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + grouped = conv_func(wrapped_groupby(df, lambda x: x[-1] % 2)[["C", "D"]]) + assert_groupby_equal( + grouped, df.groupby(lambda x: x[-1] % 2)[["C", "D"]], with_selection=True + ) + assert grouped.shape == (8, 2) + assert grouped.is_frame is True + + grouped = conv_func(wrapped_groupby(df.B, lambda x: x[-1] % 2)) + assert_groupby_equal( + grouped, df.B.groupby(lambda x: x[-1] % 2), with_selection=True + ) + assert grouped.shape == (8,) + assert grouped.is_frame is False + + +def test_traceback_code(): + def get_tb(): + try: + raise ValueError + except ValueError: + return sys.exc_info()[-1] + + tb = get_tb() + frags = dump_traceback_code(tb) + + target_dict = dict() + load_traceback_code(frags, target_dict) + code_lines = target_dict[__file__][2] + assert "raise" in code_lines[tb.tb_lineno - 1] + assert len([line for line in code_lines if line]) == 5 diff --git a/python/xorbits/_mars/lib/tests/test_nvutils.py b/python/xorbits/_mars/lib/tests/test_nvutils.py new file mode 100644 index 000000000..a514a645d --- /dev/null +++ b/python/xorbits/_mars/lib/tests/test_nvutils.py @@ -0,0 +1,38 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...tests.core import require_cupy +from ...utils import lazy_import +from .. import nvutils + + +cupy = lazy_import("cupy") + + +@require_cupy +def test_nvutil(): + device_info = nvutils.get_device_info(0) + assert device_info.uuid is not None + + # run something + _ = cupy.ones(10) + + handle = nvutils.get_handle_by_index(0) + assert nvutils._running_process_matches(handle) + assert nvutils.get_cuda_context().has_context + + info = nvutils.get_index_and_uuid(0) + info2 = nvutils.get_index_and_uuid(info.uuid) + assert info.device_index == info2.device_index + assert info.uuid == info2.uuid diff --git a/python/xorbits/_mars/lib/uhashring/__init__.py b/python/xorbits/_mars/lib/uhashring/__init__.py new file mode 100644 index 000000000..8cc5d9ba1 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/__init__.py @@ -0,0 +1,3 @@ +from .ring import HashRing + +__all__ = ["HashRing", "monkey"] diff --git a/python/xorbits/_mars/lib/uhashring/monkey.py b/python/xorbits/_mars/lib/uhashring/monkey.py new file mode 100644 index 000000000..bbe6edf11 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/monkey.py @@ -0,0 +1,40 @@ +from . import HashRing + +__all__ = ["patch_memcache"] + + +def patch_memcache(): + """Monkey patch python-memcached to implement our consistent hashring + in its node selection and operations. 
+ """ + + def _init(self, servers, *k, **kw): + self._old_init(servers, *k, **kw) + + nodes = {} + for server in self.servers: + conf = { + "hostname": server.ip, + "instance": server, + "port": server.port, + "weight": server.weight, + } + nodes[server.ip] = conf + self.uhashring = HashRing(nodes) + + def _get_server(self, key): + if isinstance(key, tuple): + return self._old_get_server(key) + + for i in range(self._SERVER_RETRIES): + for node in self.uhashring.range(key): + if node["instance"].connect(): + return node["instance"], key + + return None, None + + memcache = __import__("memcache") + memcache.Client._old_get_server = memcache.Client._get_server + memcache.Client._old_init = memcache.Client.__init__ + memcache.Client.__init__ = _init + memcache.Client._get_server = _get_server diff --git a/python/xorbits/_mars/lib/uhashring/ring.py b/python/xorbits/_mars/lib/uhashring/ring.py new file mode 100644 index 000000000..8fc19c03e --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/ring.py @@ -0,0 +1,341 @@ +from bisect import bisect + +from .ring_ketama import KetamaRing +from .ring_meta import MetaRing + + +class HashRing(object): + """Implement a consistent hashing ring.""" + + def __init__(self, nodes=[], **kwargs): + """Create a new HashRing given the implementation. + + :param nodes: nodes used to create the continuum (see doc for format). + :param hash_fn: use this callable function to hash keys, can be set to + 'ketama' to use the ketama compatible implementation. + :param vnodes: default number of vnodes per node. + :param weight_fn: use this function to calculate the node's weight. + """ + hash_fn = kwargs.get("hash_fn", None) + vnodes = kwargs.get("vnodes", None) + weight_fn = kwargs.get("weight_fn", None) + + if hash_fn == "ketama": + if vnodes is None: + vnodes = 40 + self.runtime = KetamaRing() + else: + if vnodes is None: + vnodes = 160 + self.runtime = MetaRing(hash_fn) + + self._default_vnodes = vnodes + self.hashi = self.runtime.hashi + + if weight_fn and not hasattr(weight_fn, "__call__"): + raise TypeError("weight_fn should be a callable function") + self._weight_fn = weight_fn + + if self._configure_nodes(nodes): + self.runtime._create_ring(self.runtime._nodes.items()) + + def _configure_nodes(self, nodes): + """Parse and set up the given nodes. + + :param nodes: nodes used to create the continuum (see doc for format). 
+ """ + if isinstance(nodes, str): + nodes = [nodes] + elif not isinstance(nodes, (dict, list)): + raise ValueError( + f"nodes configuration should be a list or a dict, got {type(nodes)}" + ) + + conf_changed = False + for node in nodes: + conf = { + "hostname": node, + "instance": None, + "nodename": node, + "port": None, + "vnodes": self._default_vnodes, + "weight": 1, + } + current_conf = self.runtime._nodes.get(node, {}) + nodename = node + # new node, trigger a ring update + if not current_conf: + conf_changed = True + # complex config + if isinstance(nodes, dict): + node_conf = nodes[node] + if isinstance(node_conf, int): + conf["weight"] = node_conf + elif isinstance(node_conf, dict): + for k, v in node_conf.items(): + if k in conf: + conf[k] = v + # changing those config trigger a ring update + if k in ["nodename", "vnodes", "weight"]: + if current_conf.get(k) != v: + conf_changed = True + else: + raise ValueError( + "node configuration should be a dict or an int," + f" got {type(node_conf)}" + ) + if self._weight_fn: + conf["weight"] = self._weight_fn(**conf) + # changing the weight of a node trigger a ring update + if current_conf.get("weight") != conf["weight"]: + conf_changed = True + self.runtime._nodes[nodename] = conf + return conf_changed + + def __delitem__(self, nodename): + """Remove the given node. + + :param nodename: the node name. + """ + self.runtime._remove_node(nodename) + + remove_node = __delitem__ + + def __getitem__(self, key): + """Returns the instance of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "instance") + + get_node_instance = __getitem__ + + def __setitem__(self, nodename, conf={"weight": 1}): + """Add the given node with its associated configuration. + + :param nodename: the node name. + :param conf: the node configuration. + """ + if self._configure_nodes({nodename: conf}): + self.runtime._create_ring([(nodename, self._nodes[nodename])]) + + add_node = __setitem__ + + def _get_pos(self, key): + """Get the index of the given key in the sorted key list. + + We return the position with the nearest hash based on + the provided key unless we reach the end of the continuum/ring + in which case we return the 0 (beginning) index position. + + :param key: the key to hash and look for. + """ + p = bisect(self.runtime._keys, self.hashi(key)) + if p == len(self.runtime._keys): + return 0 + else: + return p + + def _get(self, key, what): + """Generic getter magic method. + + The node with the nearest but not less hash value is returned. + + :param key: the key to look for. + :param what: the information to look for in, allowed values: + - instance (default): associated node instance + - nodename: node name + - pos: index of the given key in the ring + - tuple: ketama compatible (pos, name) tuple + - weight: node weight + """ + if not self.runtime._ring: + return None + + pos = self._get_pos(key) + if what == "pos": + return pos + + nodename = self.runtime._ring[self.runtime._keys[pos]] + if what in ["hostname", "instance", "port", "weight"]: + return self.runtime._nodes[nodename][what] + elif what == "dict": + return self.runtime._nodes[nodename] + elif what == "nodename": + return nodename + elif what == "tuple": + return (self.runtime._keys[pos], nodename) + + def get(self, key): + """Returns the node object dict matching the hashed key. + + :param key: the key to look for. 
+ """ + return self._get(key, "dict") + + def get_instances(self): + """Returns a list of the instances of all the configured nodes.""" + return [ + c.get("instance") for c in self.runtime._nodes.values() if c.get("instance") + ] + + def get_key(self, key): + """Alias of ketama hashi method, returns the hash of the given key. + + This method is present for hash_ring compatibility. + + :param key: the key to look for. + """ + return self.hashi(key) + + def get_node(self, key): + """Returns the node name of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "nodename") + + def get_node_hostname(self, key): + """Returns the hostname of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "hostname") + + def get_node_port(self, key): + """Returns the port of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "port") + + def get_node_pos(self, key): + """Returns the index position of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "pos") + + def get_node_weight(self, key): + """Returns the weight of the node matching the hashed key. + + :param key: the key to look for. + """ + return self._get(key, "weight") + + def get_nodes(self): + """Returns a list of the names of all the configured nodes.""" + return self.runtime._nodes.keys() + + def get_points(self): + """Returns a ketama compatible list of (position, nodename) tuples.""" + return [(k, self.runtime._ring[k]) for k in self.runtime._keys] + + def get_server(self, key): + """Returns a ketama compatible (position, nodename) tuple. + + :param key: the key to look for. + """ + return self._get(key, "tuple") + + def iterate_nodes(self, key, distinct=True): + """hash_ring compatibility implementation. + + Given a string key it returns the nodes as a generator that + can hold the key. + The generator iterates one time through the ring + starting at the correct position. + if `distinct` is set, then the nodes returned will be unique, + i.e. no virtual copies will be returned. + """ + if not self.runtime._ring: + yield None + else: + for node in self.range(key, unique=distinct): + yield node["nodename"] + + def print_continuum(self): + """Prints a ketama compatible continuum report.""" + numpoints = len(self.runtime._keys) + if numpoints: + print(f"Numpoints in continuum: {numpoints}") + else: + print("Continuum empty") + for p in self.get_points(): + point, node = p + print(f"{node} ({point})") + + def range(self, key, size=None, unique=True): + """Returns a generator of nodes' configuration available + in the continuum/ring. + + :param key: the key to look for. + :param size: limit the list to at most this number of nodes. + :param unique: a node may only appear once in the list (default True). 
+ """ + all_nodes = set() + if unique: + size = size or len(self.runtime._nodes) + else: + all_nodes = [] + + pos = self._get_pos(key) + for key in self.runtime._keys[pos:]: + nodename = self.runtime._ring[key] + if unique: + if nodename in all_nodes: + continue + all_nodes.add(nodename) + else: + all_nodes.append(nodename) + yield self.runtime._nodes[nodename] + if len(all_nodes) == size: + break + else: + for i, key in enumerate(self.runtime._keys): + if i < pos: + nodename = self.runtime._ring[key] + if unique: + if nodename in all_nodes: + continue + all_nodes.add(nodename) + else: + all_nodes.append(nodename) + yield self.runtime._nodes[nodename] + if len(all_nodes) == size: + break + + def regenerate(self): + self.runtime._create_ring(self.runtime._nodes.items()) + + @property + def conf(self): + return self.runtime._nodes + + nodes = conf + + @property + def distribution(self): + return self.runtime._distribution + + @property + def ring(self): + return self.runtime._ring + + continuum = ring + + @property + def size(self): + return len(self.runtime._ring) + + @property + def _ring(self): + return self.runtime._ring + + @property + def _nodes(self): + return self.runtime._nodes + + @property + def _keys(self): + return self.runtime._keys diff --git a/python/xorbits/_mars/lib/uhashring/ring_ketama.py b/python/xorbits/_mars/lib/uhashring/ring_ketama.py new file mode 100644 index 000000000..03e61e9c1 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/ring_ketama.py @@ -0,0 +1,81 @@ +from bisect import insort +from collections import Counter +from hashlib import md5 +from sys import version_info + + +class KetamaRing(object): + """Implement a ketama compatible consistent hashing ring.""" + + def __init__(self): + """Create a new HashRing.""" + self._distribution = Counter() + self._keys = [] + self._nodes = {} + self._replicas = 4 + self._ring = {} + + if version_info >= (3,): + self._listbytes = lambda x: x + + def hashi(self, key, replica=0): + """Returns a ketama compatible hash from the given key.""" + dh = self._listbytes(md5(str(key).encode("utf-8")).digest()) + rd = replica * 4 + return (dh[3 + rd] << 24) | (dh[2 + rd] << 16) | (dh[1 + rd] << 8) | dh[0 + rd] + + def _hashi_weight_generator(self, node_name, node_conf): + """Calculate the weight factor of the given node and + yield its hash key for every configured replica. + + :param node_name: the node name. + """ + ks = ( + node_conf["vnodes"] * len(self._nodes) * node_conf["weight"] + ) // self._weight_sum + for w in range(0, ks): + w_node_name = f"{node_name}-{w}" + for i in range(0, self._replicas): + yield self.hashi(w_node_name, replica=i) + + @staticmethod + def _listbytes(data): + """Python 2 compatible int iterator from str. + + :param data: the string to int iterate upon. + """ + return map(ord, data) + + def _create_ring(self, nodes): + """Generate a ketama compatible continuum/ring.""" + _weight_sum = 0 + for node_conf in self._nodes.values(): + _weight_sum += node_conf["weight"] + self._weight_sum = _weight_sum + + _distribution = Counter() + _keys = [] + _ring = {} + for node_name, node_conf in self._nodes.items(): + for h in self._hashi_weight_generator(node_name, node_conf): + _ring[h] = node_name + insort(_keys, h) + _distribution[node_name] += 1 + self._distribution = _distribution + self._keys = _keys + self._ring = _ring + + def _remove_node(self, node_name): + """Remove the given node from the continuum/ring. + + :param node_name: the node name. 
+ """ + try: + self._nodes.pop(node_name) + except Exception: + raise KeyError( + f"node '{node_name}' not found, " + f"available nodes: {list(self._nodes.keys())}" + ) + else: + self._create_ring(self._nodes) diff --git a/python/xorbits/_mars/lib/uhashring/ring_meta.py b/python/xorbits/_mars/lib/uhashring/ring_meta.py new file mode 100644 index 000000000..33bc3d5b2 --- /dev/null +++ b/python/xorbits/_mars/lib/uhashring/ring_meta.py @@ -0,0 +1,52 @@ +from collections import Counter +from hashlib import md5 + + +class MetaRing(object): + """Implement a tunable consistent hashing ring.""" + + def __init__(self, hash_fn): + """Create a new HashRing. + + :param hash_fn: use this callable function to hash keys. + """ + self._distribution = Counter() + self._keys = [] + self._nodes = {} + self._ring = {} + + if hash_fn and not hasattr(hash_fn, "__call__"): + raise TypeError("hash_fn should be a callable function") + self._hash_fn = hash_fn or ( + lambda key: int(md5(str(key).encode("utf-8")).hexdigest(), 16) + ) + + def hashi(self, key): + """Returns an integer derived from the md5 hash of the given key.""" + return self._hash_fn(key) + + def _create_ring(self, nodes): + """Generate a ketama compatible continuum/ring.""" + for node_name, node_conf in nodes: + for w in range(0, node_conf["vnodes"] * node_conf["weight"]): + self._distribution[node_name] += 1 + self._ring[self.hashi(f"{node_name}-{w}")] = node_name + self._keys = sorted(self._ring.keys()) + + def _remove_node(self, node_name): + """Remove the given node from the continuum/ring. + + :param node_name: the node name. + """ + try: + node_conf = self._nodes.pop(node_name) + except Exception: + raise KeyError( + f"node '{node_name}' not found, " + f"available nodes: {list(self._nodes.keys())}" + ) + else: + self._distribution.pop(node_name) + for w in range(0, node_conf["vnodes"] * node_conf["weight"]): + del self._ring[self.hashi(f"{node_name}-{w}")] + self._keys = sorted(self._ring.keys()) diff --git a/python/xorbits/_mars/lib/version.py b/python/xorbits/_mars/lib/version.py new file mode 100644 index 000000000..32457773a --- /dev/null +++ b/python/xorbits/_mars/lib/version.py @@ -0,0 +1,606 @@ +# File merged from these files: +# setuptools/pkg_resources/_vendor/packaging/_structures.py +# setuptools/pkg_resources/_vendor/packaging/_typing.py +# setuptools/pkg_resources/_vendor/packaging/version.py +# Originally released under Apache License, Version 2.0, and the BSD License. + +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. + +# Copyright Jason R. Coombs +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +import collections +import itertools +import re +import warnings +from typing import Callable, Iterator, List, Optional, SupportsInt, Tuple, Union + +__all__ = ["parse", "Version", "LegacyVersion", "InvalidVersion", "VERSION_PATTERN"] + + +class InfinityType(object): + def __repr__(self): + # type: () -> str + return "Infinity" + + def __hash__(self): + # type: () -> int + return hash(repr(self)) + + def __lt__(self, other): + # type: (object) -> bool + return False + + def __le__(self, other): + # type: (object) -> bool + return False + + def __eq__(self, other): + # type: (object) -> bool + return isinstance(other, self.__class__) + + def __ne__(self, other): + # type: (object) -> bool + return not isinstance(other, self.__class__) + + def __gt__(self, other): + # type: (object) -> bool + return True + + def __ge__(self, other): + # type: (object) -> bool + return True + + def __neg__(self): + # type: (object) -> NegativeInfinityType + return NegativeInfinity + + +Infinity = InfinityType() + + +class NegativeInfinityType(object): + def __repr__(self): + # type: () -> str + return "-Infinity" + + def __hash__(self): + # type: () -> int + return hash(repr(self)) + + def __lt__(self, other): + # type: (object) -> bool + return True + + def __le__(self, other): + # type: (object) -> bool + return True + + def __eq__(self, other): + # type: (object) -> bool + return isinstance(other, self.__class__) + + def __ne__(self, other): + # type: (object) -> bool + return not isinstance(other, self.__class__) + + def __gt__(self, other): + # type: (object) -> bool + return False + + def __ge__(self, other): + # type: (object) -> bool + return False + + def __neg__(self): + # type: (object) -> InfinityType + return Infinity + + +NegativeInfinity = NegativeInfinityType() + + +InfiniteTypes = Union[InfinityType, NegativeInfinityType] +PrePostDevType = Union[InfiniteTypes, Tuple[str, int]] +SubLocalType = Union[InfiniteTypes, int, str] +LocalType = Union[ + NegativeInfinityType, + Tuple[ + Union[ + SubLocalType, + Tuple[SubLocalType, str], + Tuple[NegativeInfinityType, SubLocalType], + ], + ..., + ], +] +CmpKey = Tuple[ + int, Tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType +] +LegacyCmpKey = Tuple[int, Tuple[str, ...]] +VersionComparisonMethod = Callable[ + [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool +] + +_Version = collections.namedtuple( + "_Version", ["epoch", "release", "dev", "pre", "post", "local"] +) + + +def parse(version: str) -> Union["LegacyVersion", "Version"]: + """ + Parse the given version string and return either a :class:`Version` object + or a :class:`LegacyVersion` object depending on if the given version is + a valid PEP 440 version or a legacy version. + """ + try: + return Version(version) + except InvalidVersion: + return LegacyVersion(version) + + +class InvalidVersion(ValueError): + """ + An invalid version was found, users should refer to PEP 440. + """ + + +class _BaseVersion: + _key: Union[CmpKey, LegacyCmpKey] + + def __hash__(self) -> int: + return hash(self._key) + + # Please keep the duplicated `isinstance` check + # in the six comparisons hereunder + # unless you find a way to avoid adding overhead function calls. 
+ def __lt__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key < other._key + + def __le__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key <= other._key + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key == other._key + + def __ge__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key >= other._key + + def __gt__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key > other._key + + def __ne__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key != other._key + + +class LegacyVersion(_BaseVersion): + def __init__(self, version: str) -> None: + self._version = str(version) + self._key = _legacy_cmpkey(self._version) + + warnings.warn( + "Creating a LegacyVersion has been deprecated and will be " + "removed in the next major release", + DeprecationWarning, + ) + + def __str__(self) -> str: + return self._version + + def __repr__(self) -> str: + return f"" + + @property + def public(self) -> str: + return self._version + + @property + def base_version(self) -> str: + return self._version + + @property + def epoch(self) -> int: + return -1 + + @property + def release(self) -> None: + return None + + @property + def pre(self) -> None: + return None + + @property + def post(self) -> None: + return None + + @property + def dev(self) -> None: + return None + + @property + def local(self) -> None: + return None + + @property + def is_prerelease(self) -> bool: + return False + + @property + def is_postrelease(self) -> bool: + return False + + @property + def is_devrelease(self) -> bool: + return False + + +_legacy_version_component_re = re.compile(r"(\d+ | [a-z]+ | \.| -)", re.VERBOSE) + +_legacy_version_replacement_map = { + "pre": "c", + "preview": "c", + "-": "final-", + "rc": "c", + "dev": "@", +} + + +def _parse_version_parts(s: str) -> Iterator[str]: + for part in _legacy_version_component_re.split(s): + part = _legacy_version_replacement_map.get(part, part) + + if not part or part == ".": + continue + + if part[:1] in "0123456789": + # pad for numeric comparison + yield part.zfill(8) + else: + yield "*" + part + + # ensure that alpha/beta/candidate are before final + yield "*final" + + +def _legacy_cmpkey(version: str) -> LegacyCmpKey: + # We hardcode an epoch of -1 here. A PEP 440 version can only have a epoch + # greater than or equal to 0. This will effectively put the LegacyVersion, + # which uses the defacto standard originally implemented by setuptools, + # as before all PEP 440 versions. + epoch = -1 + + # This scheme is taken from pkg_resources.parse_version setuptools prior to + # it's adoption of the packaging library. + parts: List[str] = [] + for part in _parse_version_parts(version.lower()): + if part.startswith("*"): + # remove "-" before a prerelease tag + if part < "*final": + while parts and parts[-1] == "*final-": + parts.pop() + + # remove trailing zeros from each series of numeric parts + while parts and parts[-1] == "00000000": + parts.pop() + + parts.append(part) + + return epoch, tuple(parts) + + +# Deliberately not anchored to the start and end of the string, to make it +# easier for 3rd party code to reuse +VERSION_PATTERN = r""" + v? 
+    (?:
+        (?:(?P<epoch>[0-9]+)!)?                           # epoch
+        (?P<release>[0-9]+(?:\.[0-9]+)*)                  # release segment
+        (?P<pre>                                          # pre-release
+            [-_\.]?
+            (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
+            [-_\.]?
+            (?P<pre_n>[0-9]+)?
+        )?
+        (?P<post>                                         # post release
+            (?:-(?P<post_n1>[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?P<post_l>post|rev|r)
+                [-_\.]?
+                (?P<post_n2>[0-9]+)?
+            )
+        )?
+        (?P<dev>                                          # dev release
+            [-_\.]?
+            (?P<dev_l>dev)
+            [-_\.]?
+            (?P<dev_n>[0-9]+)?
+        )?
+    )
+    (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+
+class Version(_BaseVersion):
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+
+    def __init__(self, version: str) -> None:
+        # Validate the version and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: '{version}'")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        return f""
+
+    def __str__(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join(str(x) for x in self.pre))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        _epoch: int = self._version.epoch
+        return _epoch
+
+    @property
+    def release(self) -> Tuple[int, ...]:
+        _release: Tuple[int, ...] = self._version.release
+        return _release
+
+    @property
+    def pre(self) -> Optional[Tuple[str, int]]:
+        _pre: Optional[Tuple[str, int]] = self._version.pre
+        return _pre
+
+    @property
+    def post(self) -> Optional[int]:
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> Optional[int]:
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> Optional[str]:
+        if self._version.local:
+            return ".".join(str(x) for x in self._version.local)
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        return self.release[2] if len(self.release) >= 3 else 0
+
+
+def _parse_letter_version(
+    letter: str, number: Union[str, bytes, SupportsInt]
+) -> Optional[Tuple[str, int]]:
+    if letter:
+        # We consider there to be an implicit 0 in a pre-release if there is
+        # not a numeral associated with it.
+        if number is None:
+            number = 0
+
+        # We normalize any letters to their lower case form
+        letter = letter.lower()
+
+        # We consider some words to be alternate spellings of other words and
+        # in those cases we want to normalize the spellings to our preferred
+        # spelling.
+        if letter == "alpha":
+            letter = "a"
+        elif letter == "beta":
+            letter = "b"
+        elif letter in ["c", "pre", "preview"]:
+            letter = "rc"
+        elif letter in ["rev", "r"]:
+            letter = "post"
+
+        return letter, int(number)
+    if not letter and number:
+        # We assume if we are given a number, but we are not given a letter
+        # then this is using the implicit post release syntax (e.g. 1.0-1)
+        letter = "post"
+
+        return letter, int(number)
+
+    return None
+
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: str) -> Optional[LocalType]:
+    """
+    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
+    """
+    if local is not None:
+        return tuple(
+            part.lower() if not part.isdigit() else int(part)
+            for part in _local_version_separators.split(local)
+        )
+    return None
+
+
+def _cmpkey(
+    epoch: int,
+    release: Tuple[int, ...],
+    pre: Optional[Tuple[str, int]],
+    post: Optional[Tuple[str, int]],
+    dev: Optional[Tuple[str, int]],
+    local: Optional[Tuple[SubLocalType]],
+) -> CmpKey:
+    # When we compare a release version, we want to compare it with all of the
+    # trailing zeros removed. So we'll reverse the list, drop all the now-leading
+    # zeros until we come to something non-zero, then re-reverse it back into the
+    # correct order and make it a tuple to use as our sorting key.
+    _release = tuple(
+        reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
+    )
+
+    # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
+    # We'll do this by abusing the pre segment, but we _only_ want to do this
+    # if there is not a pre or a post segment. If we have one of those then
+    # the normal sorting rules will handle this case correctly.
+    if pre is None and post is None and dev is not None:
+        _pre: PrePostDevType = NegativeInfinity
+    # Versions without a pre-release (except as noted above) should sort after
+    # those with one.
+    elif pre is None:
+        _pre = Infinity
+    else:
+        _pre = pre
+
+    # Versions without a post segment should sort before those with one.
+    if post is None:
+        _post: PrePostDevType = NegativeInfinity
+
+    else:
+        _post = post
+
+    # Versions without a development segment should sort after those with one.
+    if dev is None:
+        _dev: PrePostDevType = Infinity
+
+    else:
+        _dev = dev
+
+    if local is None:
+        # Versions without a local segment should sort before those with one.
+        _local: LocalType = NegativeInfinity
+    else:
+        # Versions with a local segment need that segment parsed to implement
+        # the sorting rules in PEP440.
+        # - Alpha numeric segments sort before numeric segments
+        # - Alpha numeric segments sort lexicographically
+        # - Numeric segments sort numerically
+        # - Shorter versions sort before longer versions when the prefixes
+        #   match exactly
+        _local = tuple(
+            (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
+        )
+
+    return epoch, _release, _pre, _post, _dev, _local
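For context, a minimal usage sketch of the vendored version module above; the import path
xorbits._mars.lib.version and the sample version strings are assumptions for illustration only.

    from xorbits._mars.lib.version import Version, parse

    v = Version("1.0.0rc1+local.1")           # parsed into epoch/release/pre/post/dev/local
    assert v.is_prerelease and v.local == "local.1"
    assert v.public == "1.0.0rc1" and v.base_version == "1.0.0"

    # PEP 440 ordering: dev releases sort before pre-releases, which sort before
    # final releases, which sort before post releases of the same release segment.
    assert parse("1.0.dev0") < parse("1.0a0") < parse("1.0") < parse("1.0.post1")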
diff --git a/python/xorbits/_mars/metrics/__init__.py b/python/xorbits/_mars/metrics/__init__.py
new file mode 100644
index 000000000..e53d36497
--- /dev/null
+++ b/python/xorbits/_mars/metrics/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .api import (
+    Metrics,
+    Percentile,
+    init_metrics,
+    record_time_cost_percentile,
+    shutdown_metrics,
+)
diff --git a/python/xorbits/_mars/metrics/api.py b/python/xorbits/_mars/metrics/api.py
new file mode 100644
index 000000000..b258997a1
--- /dev/null
+++ b/python/xorbits/_mars/metrics/api.py
@@ -0,0 +1,292 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import time
+import weakref
+from contextlib import contextmanager
+from enum import Enum
+from queue import PriorityQueue
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
+
+from .backends.console import console_metric
+from .backends.metric import AbstractMetric
+from .backends.prometheus import prometheus_metric
+from .backends.ray import ray_metric
+
+logger = logging.getLogger(__name__)
+
+_init = False
+_metric_backend = "console"
+_backends_cls = {
+    "console": console_metric,
+    "prometheus": prometheus_metric,
+    "ray": ray_metric,
+}
+
+
+_metrics_to_be_initialized = weakref.WeakSet()
+
+
+def init_metrics(backend="console", config: Dict[str, Any] = None):
+    global _init
+    if _init is True:
+        return
+
+    backend = backend or "console"
+    if backend not in _backends_cls:
+        raise NotImplementedError(f"Do not support metric backend {backend}")
+    global _metric_backend
+    _metric_backend = backend
+    if _metric_backend == "prometheus":
+        try:
+            from prometheus_client import start_http_server
+
+            from ..utils import get_next_port
+
+            port = config.get("port", 0) if config else 0
+            port = port or get_next_port()
+            start_http_server(port)
+            logger.warning(
+                "Started prometheus http server on port %d", port
+            )
+        except ImportError:
+            logger.warning(
+                "Failed to start prometheus http server because there is no prometheus_client"
+            )
+    _init = True
+    for m in _metrics_to_be_initialized:
+        cls = getattr(_backends_cls[_metric_backend], m.type)
+        metric = cls(m.name, m.description, m.tag_keys)
+        m.set_metric(metric)
+    logger.info("Finished initialize the metrics of backend: %s.", _metric_backend)
+
+
+def shutdown_metrics():
+    global _metric_backend
+    logger.info("Shutting down metrics of backend: %s.", _metric_backend)
+    _metric_backend = "console"
+    global _init
+    _init = False
+
+
+class _MetricWrapper(AbstractMetric):
+    _metric: AbstractMetric
+    _log_not_init_error: bool
+
+    def __init__(
+        self,
+        name: str,
+        description: str = "",
+        tag_keys: Optional[Tuple[str, ...]] = None,
+        metric_type: str = "Counter",
+    ):
+        self._name = name
+        self._description = description
+        self._tag_keys = tag_keys or tuple()
+        self._type = metric_type
+        self._metric = None
+        self._log_not_init_error = False
+
+    @property
+    def type(self):
+        return self._type
+
+    @property
+    def value(self):
+        assert (
+            self._metric is not None
+        ), "Metric is not initialized, please call `init_metrics()` before using metrics."
+        return self._metric.value
+
+    def set_metric(self, metric):
+        assert metric is not None, "Argument metric is None, please check it."
+        self._metric = metric
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        if self._metric is not None:
+            self._metric.record(value, tags)
+        elif not self._log_not_init_error:
+            self._log_not_init_error = True
+            logger.warning(
+                "Metric is not initialized, please call `init_metrics()` before using metrics."
+            )
+
+
+def gen_metric(func):
+    def wrapper(
+        name, descriptions: str = "", tag_keys: Optional[Tuple[str, ...]] = None
+    ):
+        if _init is True:
+            return func(name, descriptions, tag_keys)
+        else:
+            logger.info(
+                "Metric %s will be initialized when invoking `init_metrics()`.", name
+            )
+            metric = _MetricWrapper(
+                name, descriptions, tag_keys, func.__name__.capitalize()
+            )
+            _metrics_to_be_initialized.add(metric)
+            return metric
+
+    return wrapper
+
+
+class Metrics:
+    """
+    A factory to generate different types of metrics.
+
+    Note:
+        Counter, Meter and Histogram are not thread safe.
+
+    Examples
+    --------
+    >>> c1 = Metrics.counter('counter1', 'A counter')
+    >>> c1.record(1)
+
+    >>> c2 = Metrics.counter('counter2', 'A counter', ('service', 'tenant'))
+    >>> c2.record(1, {'service': 'mars', 'tenant': 'test'})
+
+    >>> g1 = Metrics.gauge('gauge1')
+    >>> g1.record(1)
+
+    >>> m1 = Metrics.meter('meter1')
+    >>> m1.record(1)
+
+    >>> h1 = Metrics.histogram('histogram1')
+    >>> h1.record(1)
+    """
+
+    @staticmethod
+    @gen_metric
+    def counter(
+        name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None
+    ):
+        logger.debug(
+            "Initializing a counter with name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Counter(name, description, tag_keys)
+
+    @staticmethod
+    @gen_metric
+    def gauge(name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None):
+        logger.debug(
+            "Initializing a gauge whose name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Gauge(name, description, tag_keys)
+
+    @staticmethod
+    @gen_metric
+    def meter(name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None):
+        logger.debug(
+            "Initializing a meter whose name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Meter(name, description, tag_keys)
+
+    @staticmethod
+    @gen_metric
+    def histogram(
+        name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None
+    ):
+        logger.debug(
+            "Initializing a histogram whose name: %s, tag keys: %s, backend: %s",
+            name,
+            tag_keys,
+            _metric_backend,
+        )
+        return _backends_cls[_metric_backend].Histogram(name, description, tag_keys)
+
+
+class Percentile:
+    class PercentileType(Enum):
+        P99 = 1
+        P95 = 2
+        P90 = 3
+
+    def __init__(self, capacity: int, window: int, callback: Callable[[float], None]):
+        self._capacity = capacity
+        self._window = window
+        self._callback = callback
+        self._min_heap = PriorityQueue()
+        self._cur_num = 0
+
+        if capacity <= 0 or window <= 0:
+            raise ValueError(
+                f"capacity or window expect to get a positive integer,"
+                f"but capacity got: {capacity} and window got: {window}"
+            )
+
+    def record_data(self, value):
+        store_value = -1 * value
+        if self._min_heap.qsize() < self._capacity:
+            self._min_heap.put(store_value)
+        else:
+            top_value = self._min_heap.get_nowait()
+            store_value = store_value if top_value < store_value else top_value
+            self._min_heap.put(store_value)
+
+        self._cur_num += 1
+        if self._cur_num % self._window == 0:
+            self._callback(-1 * self._min_heap.get_nowait())
+            self._cur_num = 0
+            self._min_heap = PriorityQueue()
+
+    @classmethod
+    def build_p99(cls, callback: Callable[[float], None], window: int):
+        return cls(int(window * 0.01), window, callback)
+
+    @classmethod
+    def build_p95(cls, callback: Callable[[float], None], window: int):
+        return cls(int(window * 0.05), window, callback)
+
+    @classmethod
+    def build_p90(cls, callback: Callable[[float], None], window: int):
+        return cls(int(window * 0.1), window, callback)
+
+
+_percentile_builder = {
+    Percentile.PercentileType.P99: Percentile.build_p99,
+    Percentile.PercentileType.P95: Percentile.build_p95,
+    Percentile.PercentileType.P90: Percentile.build_p90,
+}
+
+
+class PercentileArg(NamedTuple):
+    percentile_type: Percentile.PercentileType
+    callback: Callable[[float], None]
+    window: int
+
+
+@contextmanager
+def record_time_cost_percentile(percentile_args: List[PercentileArg]):
+    percentile_list = [
+        _percentile_builder[percentile_type](callback, window)
+        for percentile_type, callback, window in percentile_args
+    ]
+    st_time = time.time()
+
+    yield
+
+    cost_time = time.time() - st_time
+    for percentile in percentile_list:
+        percentile.record_data(cost_time)
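A minimal sketch of how the API above is meant to be used; the metric names and the
assumption that the package is importable as xorbits._mars.metrics are illustrative.

    from xorbits._mars.metrics import (
        Metrics,
        Percentile,
        init_metrics,
        record_time_cost_percentile,
    )
    from xorbits._mars.metrics.api import PercentileArg

    # Metrics created before init_metrics() are wrapped and only bound to the
    # configured backend once init_metrics() runs.
    requests = Metrics.counter("sketch_requests_total", "Requests seen", ("service",))
    requests.record(1, {"service": "web"})    # warns: metrics not initialized yet

    init_metrics("console")                   # binds all pending metrics to the console backend
    requests.record(1, {"service": "web"})    # now recorded through the console backend

    # Time a block and report the P90 of every 10 recorded costs via a callback.
    p90_costs = []
    with record_time_cost_percentile(
        [PercentileArg(Percentile.PercentileType.P90, p90_costs.append, 10)]
    ):
        sum(range(1000))                      # the timed work; callback fires once 10 costs accumulate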
diff --git a/python/xorbits/_mars/metrics/backends/__init__.py b/python/xorbits/_mars/metrics/backends/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/console/__init__.py b/python/xorbits/_mars/metrics/backends/console/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/console/console_metric.py b/python/xorbits/_mars/metrics/backends/console/console_metric.py
new file mode 100644
index 000000000..c76ecbbc7
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/console_metric.py
@@ -0,0 +1,78 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Dict, Optional, Tuple
+
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SimpleMetric:
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        self._name = name
+        self._description = description
+        self._tag_keys = tag_keys
+        self._value = 0
+
+    def update(self, value: float = 1.0, tags: Optional[Dict[str, str]] = None):
+        self._value = value
+        logger.debug(
+            "Reporting metric with name: %s, description: %s, value: %s, tags: %s",
+            self._name,
+            self._description,
+            value,
+            tags,
+        )
+
+    @property
+    def value(self):
+        return self._value
+
+
+class ConsoleMetricMixin(AbstractMetric):
+    @property
+    def value(self):
+        return self._metric.value
+
+    def _init(self):
+        self._metric = SimpleMetric(self._name, self._description, self._tag_keys)
+
+    def _record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._metric.update(value, tags)
+
+
+class Counter(ConsoleMetricMixin, AbstractCounter):
+    pass
+
+
+class Gauge(ConsoleMetricMixin, AbstractGauge):
+    pass
+
+
+class Meter(ConsoleMetricMixin, AbstractMeter):
+    pass
+
+
+class Histogram(ConsoleMetricMixin, AbstractHistogram):
+    pass
diff --git a/python/xorbits/_mars/metrics/backends/console/tests/__init__.py b/python/xorbits/_mars/metrics/backends/console/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/console/tests/test_console_metric.py b/python/xorbits/_mars/metrics/backends/console/tests/test_console_metric.py
new file mode 100644
index 000000000..3e41cc459
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/console/tests/test_console_metric.py
@@ -0,0 +1,63 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..console_metric import Counter, Gauge, Histogram, Meter
+
+
+def test_counter():
+    c = Counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    c.record(1, {"service": "mars", "tenant": "test"})
+    c.record(2, {"service": "mars", "tenant": "test"})
+    assert c.value == 3
+
+
+def test_gauge():
+    g = Gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    g.record(1)
+    assert g.value == 1
+    g.record(2)
+    assert g.value == 2
+
+
+def test_meter():
+    m = Meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    m.record(1)
+    assert m.value == 0
+    m.record(2001)
+    assert m.value > 0
+
+
+def test_histogram():
+    h = Histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    h.record(1)
+    assert h.value == 0
+    for i in range(2002):
+        h.record(1)
+    assert h.value > 0
diff --git a/python/xorbits/_mars/metrics/backends/metric.py b/python/xorbits/_mars/metrics/backends/metric.py
new file mode 100644
index 000000000..78f00b63a
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/metric.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from abc import ABC
+from typing import Dict, Optional, Tuple
+
+_THRESHOLD = 2000
+_RECORDED_INTERVAL_SECS = 1
+
+
+class AbstractMetric(ABC):
+    """Base class of metrics."""
+
+    _type = None
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        assert isinstance(name, str), "Argument name should be a str"
+        assert isinstance(description, str), "Argument description should be a str"
+        if tag_keys is not None:
+            assert isinstance(tag_keys, tuple) and all(
+                isinstance(tag, str) for tag in tag_keys
+            ), "Argument tag_keys should be a tuple and its elements should be str"
+        self._name = name
+        self._description = description
+        self._tag_keys = tag_keys or tuple()
+        self._init()
+
+    @property
+    def type(self):
+        return self._type
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def description(self):
+        return self._description
+
+    @property
+    def tag_keys(self):
+        return self._tag_keys
+
+    def _init(self):
+        """Some initialization in subclass."""
+        pass
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        """A public method called by users."""
+        pass
+
+    def _record(self, value: float = 1.0, tags: Optional[Dict[str, str]] = None):
+        """An internal method called by record() and should be
+        implemented by different metric backends.
+        """
+        pass
+
+
+class AbstractCounter(AbstractMetric):
+    """A counter records the counts of events."""
+
+    _type = "Counter"
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        super().__init__(name, description, tag_keys)
+        self._count = 0
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._count += value
+        self._record(self._count, tags)
+
+
+class AbstractGauge(AbstractMetric):
+    """A gauge represents a single numerical value that can be
+    arbitrarily set.
+    """
+
+    _type = "Gauge"
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._record(value, tags)
+
+
+class AbstractMeter(AbstractMetric):
+    """A meter measures the rate at which a set of events occur."""
+
+    _type = "Meter"
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        super().__init__(name, description, tag_keys)
+        self._count = 0
+        self._last_time = time.time()
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._count += value
+        now = time.time()
+        past = now - self._last_time
+        if self._count >= _THRESHOLD or past >= _RECORDED_INTERVAL_SECS:
+            qps = self._count / past
+            self._record(qps, tags)
+            self._last_time = now
+            self._count = 0
+
+
+class AbstractHistogram(AbstractMetric):
+    """A histogram measures the distribution of values in a stream of data."""
+
+    _type = "Histogram"
+
+    def __init__(
+        self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None
+    ):
+        super().__init__(name, description, tag_keys)
+        self._data = list()
+        self._last_time = time.time()
+
+    def record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        self._data.append(value)
+        now = time.time()
+        if (
+            len(self._data) >= _THRESHOLD
+            or now - self._last_time >= _RECORDED_INTERVAL_SECS
+        ):
+            avg = sum(self._data) / len(self._data)
+            self._record(avg, tags)
+            self._data.clear()
+            self._last_time = now
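To illustrate the contract above: record() does the aggregation (cumulative count, rate,
or running average) and _record() is the hook a backend implements. A hypothetical backend
might look like the following; the PrintingCounter class is an assumption for illustration.

    from typing import Dict, Optional

    from xorbits._mars.metrics.backends.metric import AbstractCounter


    class PrintingCounter(AbstractCounter):
        """A toy backend that just prints whatever record() hands to _record()."""

        def _record(self, value: float = 1.0, tags: Optional[Dict[str, str]] = None):
            print(f"{self.name} = {value} tags={tags or {}}")


    c = PrintingCounter("sketch_counter", "A counter that prints", ("service",))
    c.record(2, {"service": "web"})   # prints sketch_counter = 2 ...
    c.record(3, {"service": "web"})   # prints sketch_counter = 5 ... (counters are cumulative)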
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/__init__.py b/python/xorbits/_mars/metrics/backends/prometheus/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/prometheus_metric.py b/python/xorbits/_mars/metrics/backends/prometheus/prometheus_metric.py
new file mode 100644
index 000000000..e5eaa38da
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/prometheus_metric.py
@@ -0,0 +1,70 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import socket
+from typing import Dict, Optional
+
+from ....utils import lazy_import
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+pc = lazy_import("prometheus_client", rename="pc")
+
+
+class PrometheusMetricMixin(AbstractMetric):
+    def _init(self):
+        # Prometheus metric name must match the regex `[a-zA-Z_:][a-zA-Z0-9_:]*`
+        # `.` is a common character in metrics, so here replace it with `:`
+        self._name = self._name.replace(".", ":")
+        self._tag_keys = self._tag_keys + (
+            "host",
+            "pid",
+        )
+        self._tags = {"host": socket.gethostname(), "pid": os.getpid()}
+        try:
+            self._metric = (
+                pc.Gauge(self._name, self._description, self._tag_keys) if pc else None
+            )
+        except ValueError:  # pragma: no cover
+            self._metric = None
+
+    def _record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        if self._metric:
+            if tags is not None:
+                tags.update(self._tags)
+            else:
+                tags = self._tags
+            self._metric.labels(**tags).set(value)
+
+
+class Counter(PrometheusMetricMixin, AbstractCounter):
+    pass
+
+
+class Gauge(PrometheusMetricMixin, AbstractGauge):
+    pass
+
+
+class Meter(PrometheusMetricMixin, AbstractMeter):
+    pass
+
+
+class Histogram(PrometheusMetricMixin, AbstractHistogram):
+    pass
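A sketch of enabling the backend above (requires prometheus_client; the port 9090 and the
metric name are illustrative assumptions).

    from xorbits._mars.metrics import Metrics, init_metrics

    init_metrics("prometheus", {"port": 9090})   # starts the prometheus exposition server
    latency = Metrics.gauge("sketch.latency.seconds", "Request latency", ("service",))
    # Exported as sketch:latency:seconds ('.' replaced by ':') with extra host/pid labels.
    latency.record(0.42, {"service": "web"})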
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/tests/__init__.py b/python/xorbits/_mars/metrics/backends/prometheus/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/prometheus/tests/test_prometheus_metric.py b/python/xorbits/_mars/metrics/backends/prometheus/tests/test_prometheus_metric.py
new file mode 100644
index 000000000..9f903a6d6
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/prometheus/tests/test_prometheus_metric.py
@@ -0,0 +1,111 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+import pytest
+
+try:
+    import requests
+except ImportError:
+    requests = None
+
+try:
+    from prometheus_client import start_http_server
+except ImportError:
+    start_http_server = None
+
+from .....utils import get_next_port
+from ..prometheus_metric import Counter, Gauge, Histogram, Meter
+
+_PROMETHEUS_CLIENT_PORT = get_next_port()
+
+
+@pytest.fixture(scope="module")
+def start_prometheus_http_server():
+    if start_http_server:
+        start_http_server(_PROMETHEUS_CLIENT_PORT)
+
+
+def verify_metric(name, value, delta=1e-6):
+    if start_http_server is None or requests is None:
+        return
+    resp = requests.get("http://127.0.0.1:{}".format(_PROMETHEUS_CLIENT_PORT)).text
+    assert name in resp
+    lines = resp.splitlines()
+    for line in lines:
+        if line.startswith(name):
+            items = line.split(" ")
+            assert len(items) == 2
+            assert pytest.approx(float(items[1]), abs=delta) == value
+
+
+def test_counter(start_prometheus_http_server):
+    c = Counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert set(["host", "pid"]).issubset(set(c.tag_keys))
+    assert set(["service", "tenant"]).issubset(set(c.tag_keys))
+    assert c.type == "Counter"
+    c.record(1, {"service": "mars", "tenant": "test"})
+    verify_metric("test_counter", 1.0)
+    c.record(2, {"service": "mars", "tenant": "test"})
+    verify_metric("test_counter", 3.0)
+
+
+def test_gauge(start_prometheus_http_server):
+    g = Gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert set(["host", "pid"]).issubset(set(g.tag_keys))
+    assert g.type == "Gauge"
+    g.record(0.1)
+    verify_metric("test_gauge", 0.1)
+    g.record(1.1)
+    verify_metric("test_gauge", 1.1)
+
+
+def test_meter(start_prometheus_http_server):
+    m = Meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert set(["host", "pid"]).issubset(set(m.tag_keys))
+    assert m.type == "Meter"
+    num = 3
+    while num > 0:
+        m.record(1)
+        time.sleep(1)
+        num -= 1
+    verify_metric("test_meter", 1, 0.05)
+
+
+def test_histogram(start_prometheus_http_server):
+    h = Histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert set(["host", "pid"]).issubset(set(h.tag_keys))
+    assert h.type == "Histogram"
+    num = 3
+    while num > 0:
+        h.record(1)
+        h.record(2)
+        time.sleep(1)
+        num -= 1
+    verify_metric("test_histogram", 1.5, 0.15)
+    num = 3
+    while num > 0:
+        h.record(3)
+        time.sleep(1)
+        num -= 1
+    verify_metric("test_histogram", 3, 0.1)
diff --git a/python/xorbits/_mars/metrics/backends/ray/__init__.py b/python/xorbits/_mars/metrics/backends/ray/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/ray/ray_metric.py b/python/xorbits/_mars/metrics/backends/ray/ray_metric.py
new file mode 100644
index 000000000..51ee5f775
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/ray_metric.py
@@ -0,0 +1,76 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Optional
+
+from ....utils import lazy_import, lazy_import_on_load
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+ray_metrics = lazy_import("ray.util.metrics", rename="ray_metrics")
+
+_ray_gauge_set_available = None
+
+
+@lazy_import_on_load(ray_metrics)
+def _reload_ray_gauge_set_available():
+    """
+    Note: the Gauge `record` method is deprecated as of ray 1.3.0, so keep
+    this compatible with both old and new ray versions.
+    """
+    global _ray_gauge_set_available
+
+    if _ray_gauge_set_available is not None:
+        return _ray_gauge_set_available
+    _ray_gauge_set_available = (
+        True if ray_metrics and hasattr(ray_metrics.Gauge, "set") else False
+    )
+    return _ray_gauge_set_available
+
+
+class RayMetricMixin(AbstractMetric):
+    def _init(self):
+        _reload_ray_gauge_set_available()
+
+        if ray_metrics is not None:  # pragma: no branch
+            self._metric = ray_metrics.Gauge(
+                self._name, self._description, self._tag_keys
+            )
+
+    def _record(self, value=1, tags: Optional[Dict[str, str]] = None):
+        if _ray_gauge_set_available:
+            self._metric.set(value, tags)
+        elif ray_metrics is not None:  # pragma: no branch
+            self._metric.record(value, tags)
+
+
+class Counter(RayMetricMixin, AbstractCounter):
+    pass
+
+
+class Gauge(RayMetricMixin, AbstractGauge):
+    pass
+
+
+class Meter(RayMetricMixin, AbstractMeter):
+    pass
+
+
+class Histogram(RayMetricMixin, AbstractHistogram):
+    pass
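A sketch of the ray backend above (requires ray; the metric name is an illustrative
assumption). Every metric type is reported through ray.util.metrics.Gauge, using
Gauge.set() when available and falling back to the deprecated record().

    import ray

    from xorbits._mars.metrics import Metrics, init_metrics

    ray.init()
    init_metrics("ray")
    qps = Metrics.meter("sketch_qps", "Events per second", ("service",))
    qps.record(1, {"service": "web"})   # emitted once the meter's count/interval threshold is reached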
diff --git a/python/xorbits/_mars/metrics/backends/ray/tests/__init__.py b/python/xorbits/_mars/metrics/backends/ray/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/ray/tests/test_ray_metric.py b/python/xorbits/_mars/metrics/backends/ray/tests/test_ray_metric.py
new file mode 100644
index 000000000..75b4747bb
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/ray/tests/test_ray_metric.py
@@ -0,0 +1,62 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .....tests.core import require_ray
+from ..ray_metric import Counter, Gauge, Histogram, Meter
+
+
+@require_ray
+def test_record():
+    c = Counter("test_counter")
+    assert c.record(1) is None
+
+
+@require_ray
+def test_counter():
+    c = Counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    assert c.record(1, {"service": "mars", "tenant": "test"}) is None
+
+
+@require_ray
+def test_gauge():
+    g = Gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    assert g.record(1) is None
+
+
+@require_ray
+def test_meter():
+    m = Meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    assert m.record(1) is None
+
+
+@require_ray
+def test_histogram():
+    h = Histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    assert h.record(1) is None
diff --git a/python/xorbits/_mars/metrics/backends/tests/__init__.py b/python/xorbits/_mars/metrics/backends/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/backends/tests/test_metric.py b/python/xorbits/_mars/metrics/backends/tests/test_metric.py
new file mode 100644
index 000000000..96ab4c2e8
--- /dev/null
+++ b/python/xorbits/_mars/metrics/backends/tests/test_metric.py
@@ -0,0 +1,109 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from ..metric import (
+    AbstractCounter,
+    AbstractGauge,
+    AbstractHistogram,
+    AbstractMeter,
+    AbstractMetric,
+)
+
+
+def test_illegal_arguments():
+    class DummyMetric(AbstractMetric):
+        pass
+
+    DummyMetric.__abstractmethods__ = set()
+    with pytest.raises(AssertionError):
+        DummyMetric(1)
+
+    with pytest.raises(AssertionError):
+        DummyMetric("dummy_metric", 1)
+
+    with pytest.raises(AssertionError):
+        DummyMetric("dummy_metric", "A test metric", "service")
+
+    with pytest.raises(AssertionError):
+        DummyMetric("dummy_metric", "A test metric", ("service", 1))
+
+
+def test_dummy_metric():
+    class DummyMetric(AbstractMetric):
+        pass
+
+    DummyMetric.__abstractmethods__ = set()
+    m = DummyMetric("dummy_metric", "A test metric", ("service", "tenant"))
+    assert isinstance(m, AbstractMetric)
+    assert m.name == "dummy_metric"
+    assert m.description == "A test metric"
+    assert m.tag_keys == ("service", "tenant")
+    assert m.type is None
+    assert m._init() is None
+    assert m.record() is None
+    assert m._record() is None
+
+
+def test_counter():
+    class DummyCounter(AbstractCounter):
+        pass
+
+    DummyCounter.__abstractmethods__ = set()
+    c = DummyCounter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    assert c.record(1, {"service": "mars", "tenant": "test"}) is None
+
+
+def test_gauge():
+    class DummyGauge(AbstractGauge):
+        pass
+
+    DummyGauge.__abstractmethods__ = set()
+    g = DummyGauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    assert g.record(1) is None
+
+
+def test_meter():
+    class DummyMeter(AbstractMeter):
+        pass
+
+    DummyMeter.__abstractmethods__ = set()
+    m = DummyMeter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    assert m.record(1) is None
+
+
+def test_histogram():
+    class DummyHistogram(AbstractHistogram):
+        pass
+
+    DummyHistogram.__abstractmethods__ = set()
+    h = DummyHistogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    assert h.record(1) is None
diff --git a/python/xorbits/_mars/metrics/tests/__init__.py b/python/xorbits/_mars/metrics/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/metrics/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/metrics/tests/test_metric_api.py b/python/xorbits/_mars/metrics/tests/test_metric_api.py
new file mode 100644
index 000000000..138c6c01e
--- /dev/null
+++ b/python/xorbits/_mars/metrics/tests/test_metric_api.py
@@ -0,0 +1,168 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from random import random
+
+import pytest
+
+from .. import api
+from ..api import (
+    Metrics,
+    Percentile,
+    _percentile_builder,
+    init_metrics,
+    record_time_cost_percentile,
+    shutdown_metrics,
+)
+
+
+@pytest.fixture
+def init():
+    init_metrics()
+
+
+def test_init_metrics():
+    init_metrics()
+    assert api._metric_backend == "console"
+    shutdown_metrics()
+    init_metrics("console")
+    assert api._metric_backend == "console"
+    shutdown_metrics()
+    init_metrics(backend="console")
+    assert api._metric_backend == "console"
+    shutdown_metrics()
+    init_metrics("prometheus")
+    assert api._metric_backend == "prometheus"
+    shutdown_metrics()
+    init_metrics(backend="prometheus", config={"port": 0})
+    assert api._metric_backend == "prometheus"
+    shutdown_metrics()
+    init_metrics("ray")
+    assert api._metric_backend == "ray"
+    shutdown_metrics()
+    with pytest.raises(NotImplementedError):
+        init_metrics("not_exist")
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_counter(init_firstly):
+    if init_firstly:
+        init_metrics()
+    c = Metrics.counter("test_counter", "A test counter", ("service", "tenant"))
+    assert c.name == "test_counter"
+    assert c.description == "A test counter"
+    assert c.tag_keys == ("service", "tenant")
+    assert c.type == "Counter"
+    if not init_firstly:
+        init_metrics()
+    c.record(1, {"service": "mars", "tenant": "test"})
+    c.record(2, {"service": "mars", "tenant": "test"})
+    assert c.value == 3
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_gauge(init_firstly):
+    if init_firstly:
+        init_metrics()
+    g = Metrics.gauge("test_gauge", "A test gauge")
+    assert g.name == "test_gauge"
+    assert g.description == "A test gauge"
+    assert g.tag_keys == ()
+    assert g.type == "Gauge"
+    if not init_firstly:
+        init_metrics()
+    g.record(1)
+    assert g.value == 1
+    g.record(2)
+    assert g.value == 2
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_meter(init_firstly):
+    if init_firstly:
+        init_metrics()
+    m = Metrics.meter("test_meter")
+    assert m.name == "test_meter"
+    assert m.description == ""
+    assert m.tag_keys == ()
+    assert m.type == "Meter"
+    if not init_firstly:
+        init_metrics()
+    m.record(1)
+    assert m.value == 0
+    m.record(2001)
+    assert m.value > 0
+
+
+@pytest.mark.parametrize("init_firstly", [True, False])
+def test_histogram(init_firstly):
+    if init_firstly:
+        init_metrics()
+    h = Metrics.histogram("test_histogram")
+    assert h.name == "test_histogram"
+    assert h.description == ""
+    assert h.tag_keys == ()
+    assert h.type == "Histogram"
+    if not init_firstly:
+        init_metrics()
+    h.record(1)
+    assert h.value == 0
+    for i in range(2002):
+        h.record(1)
+    assert h.value > 0
+
+
+def test_percentile_report():
+    def gen_callback(data):
+        def callback(value):
+            data.append(value)
+
+        return callback
+
+    data90 = []
+    data95 = []
+    data99 = []
+
+    all_data = []
+    percentile_args = [
+        (Percentile.PercentileType.P90, gen_callback(data90), 100),
+        (Percentile.PercentileType.P95, gen_callback(data95), 100),
+        (Percentile.PercentileType.P99, gen_callback(data99), 100),
+    ]
+    percentile_list = [
+        _percentile_builder[percentile_type](callback, window)
+        for percentile_type, callback, window in percentile_args
+    ]
+    for _ in range(199):
+        data = random()
+        all_data.append(data)
+        for percentile in percentile_list:
+            percentile.record_data(data)
+    sub_data = sorted(all_data[:100])
+    print(sub_data[:10])
+    assert len(data90) == 1 and sub_data[10 - 1] == data90[0]
+    assert len(data95) == 1 and sub_data[5 - 1] == data95[0]
+    assert len(data99) == 1 and sub_data[1 - 1] == data99[0]
+
+
+def test_invalid_percentile_report():
+    with pytest.raises(ValueError):
+        Percentile(-1, 10, lambda x: ...)
+
+    with pytest.raises(ValueError):
+        Percentile(1, -1, lambda x: ...)
+
+    with pytest.raises(ValueError):
+        with record_time_cost_percentile([]):
+            raise ValueError
diff --git a/python/xorbits/_mars/opcodes.py b/python/xorbits/_mars/opcodes.py
new file mode 100644
index 000000000..5d42caaf6
--- /dev/null
+++ b/python/xorbits/_mars/opcodes.py
@@ -0,0 +1,584 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+NULL = 0
+
+# creation
+# tensor
+SCALAR = 1
+TENSOR_DATA_SOURCE = 2
+TENSOR_ONES = 3
+TENSOR_ONES_LIKE = 4
+TENSOR_ZEROS = 5
+TENSOR_ZEROS_LIKE = 6
+TENSOR_EMPTY = 7
+TENSOR_EMPTY_LIKE = 8
+TENSOR_FULL = 9
+TENSOR_FULL_LIKE = 25
+TENSOR_ARANGE = 10
+TENSOR_INDICES = 11
+TENSOR_DIAG = 12
+TENSOR_EYE = 13
+TENSOR_LINSPACE = 14
+TENSOR_TRIU = 15
+TENSOR_TRIL = 16
+# external storage
+TENSOR_FROM_TILEDB = 18
+TENSOR_STORE_TILEDB = 19
+TENSOR_STORE_TILEDB_CONSOLIDATE = 20
+TENSOR_FROM_DATAFRAME = 22
+TENSOR_FROM_HDF5 = 27
+TENSOR_STORE_HDF5 = 28
+TENSOR_FROM_ZARR = 29
+TENSOR_STORE_ZARR = 32
+
+# dataframe
+DATAFRAME_DATA_SOURCE = 17
+DATAFRAME_FROM_TENSOR = 21
+DATAFRAME_FROM_RECORDS = 24
+# series
+SERIES_DATA_SOURCE = 23
+SERIES_FROM_TENSOR = 26
+SERIES_FROM_INDEX = 39
+# index
+INDEX_DATA_SOURCE = 33
+DATE_RANGE = 34
+TIMEDELTA_RANGE = 35
+CHECK_MONOTONIC = 38
+# misc
+MEMORY_USAGE = 36
+REBALANCE = 37
+
+# GPU
+TO_GPU = 30
+TO_CPU = 31
+
+# random
+RAND_RAND = 41
+RAND_RANDN = 42
+RAND_RANDINT = 43
+RAND_RANDOM_INTEGERS = 44
+RAND_RANDOM_SAMPLE = 45
+RAND_RANDOM = 46
+RAND_RANF = 47
+RAND_SAMPLE = 48
+RAND_BYTES = 49
+
+# random distribution
+RAND_BETA = 50
+RAND_BINOMIAL = 51
+RAND_CHISQUARE = 52
+RAND_CHOICE = 53
+RAND_DIRICHLET = 54
+RAND_EXPONENTIAL = 55
+RAND_F = 56
+RAND_GAMMA = 57
+RAND_GEOMETRIC = 58
+RAND_GUMBEL = 59
+RAND_HYPERGEOMETRIC = 60
+RAND_LAPLACE = 61
+RAND_LOGISTIC = 62
+RAND_LOGNORMAL = 63
+RAND_LOGSERIES = 64
+RAND_MULTINOMIAL = 65
+RAND_MULTIVARIATE_NORMAL = 66
+RAND_NEGATIVE_BINOMIAL = 67
+RAND_NONCENTRAL_CHISQURE = 68
+RAND_NONCENTRAL_F = 69
+RAND_NORMAL = 70
+RAND_PARETO = 71
+RAND_PERMUTATION = 72
+RAND_POSSION = 73
+RAND_POWER = 74
+RAND_RAYLEIGH = 75
+RAND_SHUFFLE = 76
+RAND_STANDARD_CAUCHY = 77
+RAND_STANDARD_EXPONENTIAL = 78
+RAND_STANDARD_GAMMMA = 79
+RAND_STANDARD_NORMAL = 80
+RAND_STANDARD_T = 81
+RAND_TOMAXINT = 82
+RAND_TRIANGULAR = 83
+RAND_UNIFORM = 84
+RAND_VONMISES = 85
+RAND_WALD = 86
+RAND_WEIBULL = 87
+RAND_ZIPF = 88
+PERMUTATION = 89
+UNIQUE = 90
+
+# ufunc
+ADD = 101
+SUB = 102
+MUL = 103
+DIV = 104
+TRUEDIV = 105
+FLOORDIV = 106
+POW = 107
+MOD = 108
+FMOD = 109
+LOGADDEXP = 110
+LOGADDEXP2 = 111
+NEGATIVE = 112
+POSITIVE = 113
+ABSOLUTE = 114
+FABS = 115
+ABS = 116
+RINT = 117
+SIGN = 118
+CONJ = 119
+EXP = 120
+EXP2 = 121
+LOG = 122
+LOG2 = 123
+LOG10 = 124
+EXPM1 = 125
+LOG1P = 126
+SQRT = 127
+SQUARE = 128
+CBRT = 129
+RECIPROCAL = 130
+EQ = 131
+NE = 132
+LT = 133
+LE = 134
+GT = 135
+GE = 136
+SIN = 137
+COS = 138
+TAN = 139
+ARCSIN = 140
+ARCCOS = 141
+ARCTAN = 142
+ARCTAN2 = 143
+HYPOT = 144
+SINH = 145
+COSH = 146
+TANH = 147
+ARCSINH = 148
+ARCCOSH = 149
+ARCTANH = 150
+DEG2RAD = 151
+RAD2DEG = 152
+BITAND = 153
+BITOR = 154
+BITXOR = 155
+INVERT = 156
+LSHIFT = 157
+RSHIFT = 158
+AND = 159
+OR = 160
+XOR = 161
+NOT = 162
+MAXIMUM = 163
+MINIMUM = 164
+AROUND = 165
+FLOAT_POWER = 166
+FMAX = 167
+FMIN = 168
+ISFINITE = 169
+ISINF = 170
+ISNAN = 171
+SIGNBIT = 172
+COPYSIGN = 173
+NEXTAFTER = 174
+SPACING = 175
+LDEXP = 176
+FREXP = 177
+MODF = 178
+FLOOR = 179
+CEIL = 180
+TRUNC = 181
+DEGREES = 182
+RADIANS = 183
+CLIP = 184
+ISREAL = 185
+ISCOMPLEX = 186
+REAL = 187
+IMAG = 188
+FIX = 189
+I0 = 190
+SINC = 191
+NAN_TO_NUM = 192
+ISCLOSE = 193
+DIVMOD = 194
+ANGLE = 195
+SET_REAL = 196
+SET_IMAG = 197
+
+# special
+SPECIAL = 200
+
+# spatial
+PDIST = 231
+CDIST = 232
+SQUAREFORM = 233
+
+# tree operand
+TREE_ADD = 251
+TREE_MULTIPLY = 252
+TREE_OR = 253
+
+# reduction
+CUMSUM = 301
+CUMPROD = 302
+PROD = 303
+SUM = 304
+MAX = 305
+MIN = 306
+ALL = 307
+ANY = 308
+MEAN = 309
+ARGMAX = 310
+ARGMIN = 311
+NANSUM = 312
+NANMAX = 313
+NANMIN = 314
+NANPROD = 315
+NANMEAN = 316
+NANARGMAX = 317
+NANARGMIN = 318
+COUNT_NONZERO = 319
+MOMENT = 320
+NANMOMENT = 321
+VAR = 322
+STD = 323
+NANVAR = 324
+NANSTD = 325
+NANCUMSUM = 326
+NANCUMPROD = 327
+COUNT = 343
+CUMMAX = 344
+CUMMIN = 345
+CUMCOUNT = 346
+CORR = 347
+REDUCTION_SIZE = 348
+CUSTOM_REDUCTION = 349
+SKEW = 350
+KURTOSIS = 351
+SEM = 352
+STR_CONCAT = 353
+MAD = 354
+
+# tensor operand
+RESHAPE = 401
+SLICE = 402
+INDEX = 403
+INDEXSETVALUE = 404
+CONCATENATE = 405
+RECHUNK = 406
+ASTYPE = 407
+TRANSPOSE = 408
+SWAPAXES = 409
+BROADCAST_TO = 410
+STACK = 411
+WHERE = 412
+CHOOSE = 413
+NONZERO = 414
+ARGWHERE = 415
+UNRAVEL_INDEX = 416
+RAVEL_MULTI_INDEX = 417
+ARRAY_SPLIT = 418
+SQUEEZE = 419
+DIGITIZE = 420
+REPEAT = 421
+COPYTO = 422
+ISIN = 423
+SEARCHSORTED = 428
+SORT = 429
+HISTOGRAM = 430
+HISTOGRAM_BIN_EDGES = 431
+PARTITION = 432
+QUANTILE = 440
+FILL_DIAGONAL = 441
+NORMALIZE = 442
+TOPK = 443
+TRAPZ = 444
+GET_SHAPE = 445
+BINCOUNT = 446
+# fancy indexing: the distribute phase is a shuffle operation that
+# distributes the fancy indexes to the chunks being indexed;
+# the concat phase then concatenates the indexed chunks back together
+# according to the original order of the fancy index
+FANCY_INDEX_DISTRIBUTE = 424
+FANCY_INDEX_CONCAT = 425
+
+# linear algebra
+TENSORDOT = 501
+DOT = 502
+MATMUL = 503
+CHOLESKY = 510
+QR = 511
+SVD = 512
+LU = 513
+SOLVE_TRIANGULAR = 520
+INV = 521
+NORM = 530
+
+# fft
+FFT = 601
+IFFT = 602
+FFT2 = 603
+IFFT2 = 604
+FFTN = 605
+IFFTN = 606
+RFFT = 607
+IRFFT = 608
+RFFT2 = 609
+IRFFT2 = 610
+RFFTN = 611
+IRFFTN = 612
+HFFT = 613
+IHFFT = 614
+FFTFREQ = 615
+FFTFREQ_CHUNK = 616
+RFFTFREQ = 617
+FFTSHIFT = 618
+IFFTSHIFT = 619
+
+# einsum
+EINSUM = 630
+
+# sparse creation
+SPARSE_MATRIX_DATA_SOURCE = 701
+DENSE_TO_SPARSE = 702
+SPARSE_TO_DENSE = 703
+
+# DataFrame
+MAP = 710
+DESCRIBE = 712
+FILL_NA = 713
+AGGREGATE = 714
+STRING_METHOD = 715
+DATETIME_METHOD = 716
+APPLY = 717
+TRANSFORM = 718
+CHECK_NA = 719
+DROP_NA = 720
+NUNIQUE = 721
+CUT = 722
+SHIFT = 723
+DIFF = 724
+VALUE_COUNTS = 725
+TO_DATETIME = 726
+DATAFRAME_DROP = 727
+DROP_DUPLICATES = 728
+MELT = 729
+RENAME = 731
+INSERT = 732
+MAP_CHUNK = 733
+CARTESIAN_CHUNK = 734
+EXPLODE = 735
+REPLACE = 736
+RENAME_AXIS = 737
+DATAFRAME_EVAL = 738
+DUPLICATED = 739
+DELETE = 740
+ALIGN = 741
+
+FUSE = 801
+
+# table like input for tensor
+TABLE_COO = 1003
+# store tensor as coo format
+STORE_COO = 1004
+
+# shuffle
+SHUFFLE_PROXY = 2001
+DATAFRAME_INDEX_ALIGN = 2004
+
+# indexing
+DATAFRAME_SET_INDEX = 2020
+DATAFRAME_SET_AXIS = 730
+DATAFRAME_ILOC_GETITEM = 2021
+DATAFRAME_ILOC_SETITEM = 2022
+DATAFRAME_LOC_GETITEM = 2023
+DATAFRAME_LOC_SETITEM = 2024
+
+# merge
+DATAFRAME_MERGE = 2010
+DATAFRAME_SHUFFLE_MERGE_ALIGN = 2011
+
+# bloom filter
+DATAFRAME_BLOOM_FILTER = 2014
+
+# append
+APPEND = 2015
+
+# reset index
+RESET_INDEX = 2028
+# reindex
+REINDEX = 2029
+
+# groupby
+GROUPBY = 2030
+GROUPBY_AGG = 2033
+GROUPBY_CONCAT = 2034
+GROUPBY_HEAD = 2035
+GROUPBY_SAMPLE_ILOC = 2036
+GROUPBY_SORT_REGULAR_SAMPLE = 2037
+GROUPBY_SORT_PIVOT = 2038
+GROUPBY_SORT_SHUFFLE = 2039
+
+# parallel sorting by regular sampling
+PSRS_SORT_REGULAR_SMAPLE = 2040
+PSRS_CONCAT_PIVOT = 2041
+PSRS_SHUFFLE = 2042
+PSRS_ALIGN = 2043
+# partition
+CALC_PARTITIONS_INFO = 2046
+PARTITION_MERGED = 2047
+
+# dataframe sort
+SORT_VALUES = 2050
+SORT_INDEX = 2051
+
+# window
+ROLLING_AGG = 2060
+EXPANDING_AGG = 2061
+EWM_AGG = 2062
+
+# store
+READ_CSV = 2100
+TO_CSV = 2101
+READ_PARQUET = 2103
+TO_PARQUET = 2104
+READ_SQL = 2105
+TO_SQL = 2108
+READ_RAYDATASET = 2109
+READ_MLDATASET = 2106
+
+TO_CSV_STAT = 2102
+
+# standardize range index
+STANDARDIZE_RANGE_INDEX = 2107
+
+# successors exclusive
+SUCCESSORS_EXCLUSIVE = 2002
+
+# read images
+IMREAD = 2110
+
+# machine learning
+
+# pairwise distances
+PAIRWISE_EUCLIDEAN_DISTANCES = 2200
+PAIRWISE_MANHATTAN_DISTANCES = 2201
+PAIRWISE_COSINE_DISTANCES = 2202
+PAIRWISE_HAVERSINE_DISTANCES = 2203
+PAIRWISE_DISTANCES_TOPK = 2204
+
+# nearest neighbors
+KD_TREE_TRAIN = 2230
+KD_TREE_QUERY = 2231
+BALL_TREE_TRAIN = 2232
+BALL_TREE_QUERY = 2233
+FAISS_BUILD_INDEX = 2234
+FAISS_TRAIN_SAMPLED_INDEX = 2235
+FAISS_QUERY = 2236
+PROXIMA_SIMPLE_BUILDER = 2238
+PROXIMA_SIMPLE_SEARCHER = 2239
+KNEIGHBORS_GRAPH = 2237
+
+# cluster
+KMEANS_PLUS_PLUS_INIT = 2250
+KMEANS_SCALABLE_PLUS_PLUS_INIT = 2251
+KMEANS_ELKAN_INIT_BOUNDS = 2252
+KMEANS_ELKAN_UPDATE = 2253
+KMEANS_ELKAN_POSTPROCESS = 2254
+KMEANS_LLOYD_UPDATE = 2255
+KMEANS_LLOYD_POSTPROCESS = 2256
+KMEANS_INERTIA = 2257
+KMEANS_RELOCASTE_EMPTY_CLUSTERS = 2258
+
+# XGBoost
+XGBOOST_TRAIN = 3001
+XGBOOST_PREDICT = 3002
+TO_DMATRIX = 3003
+START_TRACKER = 3004
+
+# LightGBM
+LGBM_TRAIN = 3020
+LGBM_PREDICT = 3021
+LGBM_ALIGN = 3022
+
+# TensorFlow
+RUN_TENSORFLOW = 3010
+
+# PyTorch
+RUN_PYTORCH = 3011
+
+# statsmodels
+STATSMODELS_TRAIN = 3012
+STATSMODELS_PREDICT = 3013
+
+# learn
+# checks
+CHECK_NON_NEGATIVE = 3300
+# classifier check targets
+CHECK_TARGETS = 3301
+ASSERT_ALL_FINITE = 3302
+# multilabel
+IS_MULTILABEL = 3303
+# get type
+TYPE_OF_TARGET = 3304
+# classification
+ACCURACY_SCORE = 3305
+# port detection
+COLLECT_PORTS = 3306
+# unique labels
+UNIQUE_LABELS = 3307
+# preprocessing
+LABEL_BINARIZE = 3308
+# ensemble: blockwise
+BLOCKWISE_ENSEMBLE_FIT = 3309
+BLOCKWISE_ENSEMBLE_PREDICT = 3310
+# ensemble: bagging
+BAGGING_SHUFFLE_SAMPLE = 3400
+BAGGING_SHUFFLE_REINDEX = 3401
+BAGGING_FIT = 3402
+BAGGING_PREDICTION = 3403
+
+# Remote Functions and class
+REMOTE_FUNCATION = 5001
+RUN_SCRIPT = 5002
+
+# vineyard
+TENSOR_FROM_VINEYARD_CHUNK = 4000
+TENSOR_FROM_VINEYARD_META = 4001
+TENSOR_STORE_VINEYARD_CHUNK = 4002
+TENSOR_STORE_VINEYARD_META = 4003
+DATAFRAME_FROM_VINEYARD_CHUNK = 4004
+DATAFRAME_FROM_VINEYARD_META = 4005
+DATAFRAME_STORE_VINEYARD_CHUNK = 4006
+DATAFRAME_STORE_VINEYARD_META = 4007
+
+CHOLESKY_FUSE = 999988
+
+# fetches
+FETCH_SHUFFLE = 999998
+FETCH = 999999
+
+
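+# build a reverse mapping at import time to make sure that no two opcode
+# names share the same value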
+_val_to_dict = dict()
+for _var_name, _var_val in globals().copy().items():
+    if not isinstance(_var_val, int):
+        continue
+    if _var_val in _val_to_dict:  # pragma: no cover
+        raise ImportError(
+            f"Cannot import opcode: {_var_name} and "
+            f"{_val_to_dict[_var_val]} collides with value {_var_val}"
+        )
+    _val_to_dict[_var_val] = _var_name
+del _val_to_dict, _var_name, _var_val
diff --git a/python/xorbits/_mars/optimization/__init__.py b/python/xorbits/_mars/optimization/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/__init__.py b/python/xorbits/_mars/optimization/logical/__init__.py
new file mode 100644
index 000000000..852b4fe10
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import OptimizationRecords
diff --git a/python/xorbits/_mars/optimization/logical/chunk/__init__.py b/python/xorbits/_mars/optimization/logical/chunk/__init__.py
new file mode 100644
index 000000000..30b40aa92
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .column_pruning import ChunkGetitemPruneDataSource
+from .core import optimize
+from .head import ChunkHeadPushDown
diff --git a/python/xorbits/_mars/optimization/logical/chunk/column_pruning.py b/python/xorbits/_mars/optimization/logical/chunk/column_pruning.py
new file mode 100644
index 000000000..bb6f7b6c9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/column_pruning.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....dataframe.indexing.getitem import DataFrameIndex
+from ..common.column_pruning import GetitemPruneDataSource
+from .core import register_operand_based_optimization_rule
+
+
+@register_operand_based_optimization_rule([DataFrameIndex])
+class ChunkGetitemPruneDataSource(GetitemPruneDataSource):
+    """
+    Prune data source via getitem.
+    """
diff --git a/python/xorbits/_mars/optimization/logical/chunk/core.py b/python/xorbits/_mars/optimization/logical/chunk/core.py
new file mode 100644
index 000000000..702ce7905
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/core.py
@@ -0,0 +1,39 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Type
+
+from ....core import ChunkGraph
+from ....typing import OperandType
+from ..core import OperandBasedOptimizationRule, OptimizationRecords, Optimizer
+
+
+class ChunkOptimizer(Optimizer):
+    """
+    Chunk Optimizer
+    """
+
+
+def register_operand_based_optimization_rule(op_types: List[Type[OperandType]]):
+    def wrap(rule_type: Type[OperandBasedOptimizationRule]):
+        for op_type in op_types:
+            rule_type.register_operand(op_type)
+        ChunkOptimizer.register_rule(rule_type)
+        # return the decorated rule class so its name stays bound to the class
+        return rule_type
+
+    return wrap
+
+
+def optimize(chunk_graph: ChunkGraph) -> OptimizationRecords:
+    return ChunkOptimizer.optimize(chunk_graph)
diff --git a/python/xorbits/_mars/optimization/logical/chunk/head.py b/python/xorbits/_mars/optimization/logical/chunk/head.py
new file mode 100644
index 000000000..234aebf30
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/head.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
+from ..common.head import HeadPushDown
+from .core import register_operand_based_optimization_rule
+
+
+@register_operand_based_optimization_rule([DataFrameIlocGetItem, SeriesIlocGetItem])
+class ChunkHeadPushDown(HeadPushDown):
+    """
+    Head push down.
+    """
diff --git a/python/xorbits/_mars/optimization/logical/chunk/tests/__init__.py b/python/xorbits/_mars/optimization/logical/chunk/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/chunk/tests/test_column_pruning.py b/python/xorbits/_mars/optimization/logical/chunk/tests/test_column_pruning.py
new file mode 100644
index 000000000..03432910a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/tests/test_column_pruning.py
@@ -0,0 +1,70 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import pandas as pd
+import pytest
+
+from ..... import dataframe as md
+from .....core import (
+    ChunkGraphBuilder,
+    TileableGraph,
+    TileableGraphBuilder,
+    TileContext,
+    enter_mode,
+)
+from .. import optimize
+
+
+@pytest.fixture(scope="module")
+def gen_data1():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "a": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "b": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c": list("aabaaddce"),
+                "d": list("abaaaddce"),
+            }
+        )
+        yield df, tempdir
+
+
+@enter_mode(build=True)
+def test_groupby_read_csv(gen_data1):
+    pdf, tempdir = gen_data1
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path)
+
+    df1 = md.read_csv(file_path)
+    df2 = df1[["a", "b"]]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    context = TileContext()
+    chunk_graph_builder = ChunkGraphBuilder(
+        graph, fuse_enabled=False, tile_context=context
+    )
+    chunk_graph = next(chunk_graph_builder.build())
+    chunk1 = context[df1.data].chunks[0].data
+    chunk2 = context[df2.data].chunks[0].data
+    records = optimize(chunk_graph)
+    opt_chunk1 = records.get_optimization_result(chunk1)
+    assert opt_chunk1 is None
+    opt_chunk2 = records.get_optimization_result(chunk2)
+    assert opt_chunk2 is not None
+    assert opt_chunk2.op.usecols == ["a", "b"]
+    # the original chunk should not be modified
+    assert chunk2.inputs[0] is chunk1
diff --git a/python/xorbits/_mars/optimization/logical/chunk/tests/test_head.py b/python/xorbits/_mars/optimization/logical/chunk/tests/test_head.py
new file mode 100644
index 000000000..945644c70
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/chunk/tests/test_head.py
@@ -0,0 +1,68 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import pandas as pd
+import pytest
+
+from ..... import dataframe as md
+from .....core import (
+    ChunkGraphBuilder,
+    TileableGraph,
+    TileableGraphBuilder,
+    TileContext,
+    enter_mode,
+)
+from .. import optimize
+
+
+@pytest.fixture(scope="module")
+def gen_data1():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "a": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "b": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c": list("aabaaddce"),
+                "d": list("abaaaddce"),
+            }
+        )
+        yield df, tempdir
+
+
+@enter_mode(build=True)
+def test_read_csv_head(gen_data1):
+    pdf, tempdir = gen_data1
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path)
+
+    df1 = md.read_csv(file_path)
+    df2 = df1.head(5)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    context = TileContext()
+    chunk_graph_builder = ChunkGraphBuilder(
+        graph, fuse_enabled=False, tile_context=context
+    )
+    chunk_graph = next(chunk_graph_builder.build())
+    chunk1 = context[df1.data].chunks[0].data
+    chunk2 = context[df2.data].chunks[0].data
+    records = optimize(chunk_graph)
+    assert records.get_optimization_result(chunk1) is None
+    opt_chunk2 = records.get_optimization_result(chunk2)
+    assert opt_chunk2.op.nrows == 5
+    assert len(chunk_graph) == 1
+    assert opt_chunk2 in chunk_graph.results
diff --git a/python/xorbits/_mars/optimization/logical/common/__init__.py b/python/xorbits/_mars/optimization/logical/common/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/common/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/common/column_pruning.py b/python/xorbits/_mars/optimization/logical/common/column_pruning.py
new file mode 100644
index 000000000..a05709a21
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/common/column_pruning.py
@@ -0,0 +1,239 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABCMeta, abstractmethod
+from typing import Any, List
+
+from ....core import CHUNK_TYPE, OperandType, TileableType
+from ....dataframe.datasource.core import ColumnPruneSupportedDataSourceMixin
+from ....dataframe.utils import parse_index
+from ....utils import implements
+from ..core import (
+    OperandBasedOptimizationRule,
+    OptimizationRecord,
+    OptimizationRecordType,
+)
+
+
+class PruneDataSource(OperandBasedOptimizationRule, metaclass=ABCMeta):
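+    """
+    Base rule that prunes the columns read by a data source when every
+    successor of that source only needs a subset of its columns.
+    """
+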
+    def _all_successor_prune_pushdown(self, successors: List[TileableType]):
+        for succ in successors:
+            prune_rule_types = [
+                rule_type
+                for rule_type in self._rule_type_to_op_types
+                if issubclass(rule_type, PruneDataSource)
+                and isinstance(succ.op, tuple(self._rule_type_to_op_types[rule_type]))
+            ]
+            if not prune_rule_types:
+                return False
+
+            for rule_type in prune_rule_types:
+                rule = self._cached_rule(rule_type)
+                if not rule._need_prune(succ.op):
+                    return False
+        return True
+
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        node = op.outputs[0]
+        input_node = self._graph.predecessors(node)[0]
+        successors = self._graph.successors(input_node)
+        return self._all_successor_prune_pushdown(successors)
+
+    @abstractmethod
+    def _need_prune(self, op: OperandType) -> bool:
+        """
+        Check whether the data source can be pruned for this operand.
+
+        Returns
+        -------
+        need_prune : bool
+        """
+
+    @abstractmethod
+    def _get_selected_columns(self, op: OperandType) -> List[Any]:
+        """
+        Get selected columns to prune data source.
+
+        Parameters
+        ----------
+        op : OperandType
+            Operand.
+
+        Returns
+        -------
+        columns : list
+            Columns selected.
+        """
+
+    def _merge_selected_columns(self, selected_columns: List[Any], op: OperandType):
+        input_node = self._graph.predecessors(op.outputs[0])[0]
+        original_node = self._records.get_original_entity(input_node)
+        if original_node is None:
+            # not pruned before
+            original_all_columns = input_node.dtypes.index.tolist()
+            if set(selected_columns) != set(original_all_columns):
+                # only a subset of the columns is selected, so pruning applies
+                return [c for c in original_all_columns if c in selected_columns]
+            else:
+                return []
+        else:
+            # pruned before
+            original_all_columns = original_node.dtypes.index.tolist()
+            original_pruned_columns = input_node.op.get_columns()
+            pruned_columns_set = set(selected_columns) | set(original_pruned_columns)
+            # the data source has already been pruned and that cannot be
+            # reverted, so return the merged pruned columns even if the new
+            # selection would not prune anything further
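+            # e.g. if the source originally had columns ["a", "b", "c", "d"],
+            # was previously pruned to ["a", "b", "c"], and the new selection
+            # is ["a", "b"], the merged result keeps ["a", "b", "c"]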
+            return [c for c in original_all_columns if c in pruned_columns_set]
+
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        data_source_node = self._graph.predecessors(node)[0]
+
+        if (
+            isinstance(node, CHUNK_TYPE)
+            and self._graph.count_successors(data_source_node) == 1
+        ):
+            # merge into data source only for chunk
+            data_source_params = node.params.copy()
+            data_source_params.update(data_source_node.extra_params)
+            data_source_op = data_source_node.op.copy()
+            data_source_op._key = data_source_node.op.key
+            data_source_op._output_types = op.output_types
+            if node.ndim == 1:
+                data_source_op.set_pruned_columns(node.name, keep_order=True)
+            else:
+                data_source_op.set_pruned_columns(
+                    node.dtypes.index.tolist(), keep_order=True
+                )
+            new_entity = (
+                data_source_op.new_tileable
+                if not isinstance(node, CHUNK_TYPE)
+                else data_source_op.new_chunk
+            )
+            new_data_source_node = new_entity(
+                data_source_node.inputs, kws=[data_source_params]
+            ).data
+            new_data_source_node._key = node.key
+            new_data_source_node._id = node.id
+            # just remove the input data
+            self._graph.add_node(new_data_source_node)
+            for succ in self._graph.successors(node):
+                self._graph.add_edge(new_data_source_node, succ)
+            self._graph.remove_node(data_source_node)
+            self._graph.remove_node(node)
+
+            # mark optimization record
+            # the input node is removed
+            self._records.append_record(
+                OptimizationRecord(
+                    data_source_node, None, OptimizationRecordType.delete
+                )
+            )
+            self._records.append_record(
+                OptimizationRecord(
+                    node, new_data_source_node, OptimizationRecordType.replace
+                )
+            )
+            new_outputs = [new_data_source_node]
+        else:
+            selected_columns: List[Any] = self._get_selected_columns(op)
+            original_node = self._records.get_original_entity(data_source_node)
+            if original_node is not None:
+                # pruned before
+                dtypes = original_node.dtypes
+            else:
+                dtypes = data_source_node.dtypes
+            data_source_params = data_source_node.params.copy()
+            data_source_params["shape"] = (
+                data_source_node.shape[0],
+                len(selected_columns),
+            )
+            data_source_params["dtypes"] = dtypes = dtypes[selected_columns]
+            data_source_params["columns_value"] = parse_index(
+                dtypes.index, store_data=True
+            )
+            data_source_params.update(data_source_node.extra_params)
+            data_source_node_op = data_source_node.op.copy()
+            data_source_node_op._key = data_source_node.op.key
+            data_source_node_op.set_pruned_columns(selected_columns, keep_order=True)
+            new_data_source_node = data_source_node_op.new_tileable(
+                data_source_node_op.inputs, kws=[data_source_params]
+            ).data
+
+            self._replace_node(data_source_node, new_data_source_node)
+            # mark optimization record
+            self._records.append_record(
+                OptimizationRecord(
+                    data_source_node,
+                    new_data_source_node,
+                    OptimizationRecordType.replace,
+                )
+            )
+
+            new_op = op.copy()
+            new_op._key = op.key
+            kws = []
+            for out in op.outputs:
+                params = out.params.copy()
+                params.update(out.extra_params)
+                kws.append(params)
+            new_entity = (
+                new_op.new_tileables
+                if not isinstance(node, CHUNK_TYPE)
+                else new_op.new_chunks
+            )
+            new_outputs = [t.data for t in new_entity([new_data_source_node], kws=kws)]
+
+            for out, new_out in zip(op.outputs, new_outputs):
+                new_out._id = out.id
+                new_out._key = out.key
+                self._replace_node(out, new_out)
+                # mark optimization record
+                self._records.append_record(
+                    OptimizationRecord(out, new_out, OptimizationRecordType.replace)
+                )
+
+        for out, new_out in zip(op.outputs, new_outputs):
+            # replace the output in the graph results if it is one of them
+            try:
+                i = self._graph.results.index(out)
+                self._graph.results[i] = new_out
+            except ValueError:
+                pass
+
+
+class GetitemPruneDataSource(PruneDataSource):
+    def _need_prune(self, op: OperandType) -> bool:
+        data_source_node = self._graph.predecessors(op.outputs[0])[0]
+        input_can_be_pruned = isinstance(
+            data_source_node.op, ColumnPruneSupportedDataSourceMixin
+        )
+        if (
+            input_can_be_pruned
+            and data_source_node not in self._graph.results
+            and op.col_names is not None
+        ):
+            selected_columns = self._get_selected_columns(op)
+            if not isinstance(op.outputs[0], CHUNK_TYPE) and not selected_columns:
+                # no columns selected, skip
+                return False
+            return True
+        return False
+
+    def _get_selected_columns(self, op: OperandType) -> List[str]:
+        columns = op.col_names if isinstance(op.col_names, list) else [op.col_names]
+        return self._merge_selected_columns(columns, op)
diff --git a/python/xorbits/_mars/optimization/logical/common/head.py b/python/xorbits/_mars/optimization/logical/common/head.py
new file mode 100644
index 000000000..7e020294a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/common/head.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+
+from ....core import CHUNK_TYPE, OperandType, TileableType
+from ....dataframe.base.value_counts import DataFrameValueCounts
+from ....dataframe.datasource.core import HeadOptimizedDataSource
+from ....dataframe.sort.core import DataFrameSortOperand
+from ....dataframe.utils import parse_index
+from ....utils import implements
+from ..core import (
+    OperandBasedOptimizationRule,
+    OptimizationRecord,
+    OptimizationRecordType,
+)
+
+
+class HeadPushDown(OperandBasedOptimizationRule):
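+    """
+    Push an iloc-based head down into its input (a head-optimized data
+    source, a sort, or a value_counts) so that only the first ``nrows``
+    rows need to be produced.
+    """
+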
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        node = op.outputs[0]
+        input_node = self._graph.predecessors(node)[0]
+        successors = self._graph.successors(input_node)
+        return self._all_successor_head_pushdown(successors)
+
+    def _all_successor_head_pushdown(self, successors: List[TileableType]):
+        for succ in successors:
+            push_down_rule_types = [
+                rule_type
+                for rule_type in self._rule_type_to_op_types
+                if issubclass(rule_type, HeadPushDown)
+                and isinstance(succ.op, tuple(self._rule_type_to_op_types[rule_type]))
+            ]
+            if not push_down_rule_types:
+                return False
+
+            for rule_type in push_down_rule_types:
+                rule = self._cached_rule(rule_type)
+                if not rule._can_push_down(succ.op):
+                    return False
+        return True
+
+    def _can_push_down(self, op: OperandType) -> bool:
+        input_nodes = self._graph.predecessors(op.outputs[0])
+        accept_types = (
+            HeadOptimizedDataSource,
+            DataFrameSortOperand,
+            DataFrameValueCounts,
+        )
+        if (
+            len(input_nodes) == 1
+            and op.can_be_optimized()
+            and isinstance(input_nodes[0].op, accept_types)
+            and input_nodes[0] not in self._graph.results
+        ):
+            return True
+        return False
+
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        input_node = self._graph.predecessors(node)[0]
+        nrows = input_node.op.nrows or 0
+        head = op.indexes[0].stop
+
+        new_input_op = input_node.op.copy()
+        new_input_op._key = input_node.op.key
+        new_input_op.nrows = nrows = max(nrows, head)
+        new_input_params = input_node.params.copy()
+        new_input_params["shape"] = (nrows,) + input_node.shape[1:]
+        pandas_index = node.index_value.to_pandas()[:nrows]
+        new_input_params["index_value"] = parse_index(pandas_index, node)
+        new_input_params.update(input_node.extra_params)
+        new_entity = (
+            new_input_op.new_tileable
+            if not isinstance(node, CHUNK_TYPE)
+            else new_input_op.new_chunk
+        )
+        new_input_node = new_entity(input_node.inputs, kws=[new_input_params]).data
+
+        if (
+            new_input_node.op.nrows == head
+            and self._graph.count_successors(input_node) == 1
+        ):
+            new_input_node._key = node.key
+            new_input_node._id = node.id
+            # just remove the input data
+            self._graph.add_node(new_input_node)
+            for succ in self._graph.successors(node):
+                self._graph.add_edge(new_input_node, succ)
+            for pred in self._graph.predecessors(input_node):
+                self._graph.add_edge(pred, new_input_node)
+            self._graph.remove_node(input_node)
+            self._graph.remove_node(node)
+
+            # mark optimization record
+            # the input node is removed
+            self._records.append_record(
+                OptimizationRecord(input_node, None, OptimizationRecordType.delete)
+            )
+            self._records.append_record(
+                OptimizationRecord(node, new_input_node, OptimizationRecordType.replace)
+            )
+            new_node = new_input_node
+        else:
+            self._replace_node(input_node, new_input_node)
+            new_op = op.copy()
+            new_op._key = op.key
+            params = node.params.copy()
+            params.update(node.extra_params)
+            new_entity = (
+                new_op.new_tileable
+                if not isinstance(node, CHUNK_TYPE)
+                else new_op.new_chunk
+            )
+            new_node = new_entity([new_input_node], kws=[params]).data
+            self._replace_node(node, new_node)
+
+            # mark optimization record
+            self._records.append_record(
+                OptimizationRecord(
+                    input_node, new_input_node, OptimizationRecordType.replace
+                )
+            )
+            self._records.append_record(
+                OptimizationRecord(node, new_node, OptimizationRecordType.replace)
+            )
+
+        # replace the node in the graph results if it is one of them
+        try:
+            i = self._graph.results.index(node)
+            self._graph.results[i] = new_node
+        except ValueError:
+            pass
diff --git a/python/xorbits/_mars/optimization/logical/core.py b/python/xorbits/_mars/optimization/logical/core.py
new file mode 100644
index 000000000..6e5cffeb2
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/core.py
@@ -0,0 +1,285 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import weakref
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Optional, Set, Type
+
+from ...core import EntityType, OperandType, enter_mode
+from ...core.graph import EntityGraph
+from ...utils import implements
+
+
+class OptimizationRecordType(Enum):
+    replace = 0
+    new = 1
+    delete = 2
+
+
+@dataclass
+class OptimizationRecord:
+    original_entity: EntityType = None
+    new_entity: EntityType = None
+    record_type: OptimizationRecordType = None
+
+
+class OptimizationRecords:
+    _records: List[OptimizationRecord]
+    _original_entity_to_records: Dict[EntityType, OptimizationRecord]
+
+    def __init__(self):
+        self._records = list()
+        self._original_entity_to_records = dict()
+        self._optimized_entity_to_records = dict()
+
+    def append_record(self, record: OptimizationRecord):
+        self._records.append(record)
+        if record.record_type in (
+            OptimizationRecordType.replace,
+            OptimizationRecordType.delete,
+        ):
+            self._original_entity_to_records[record.original_entity] = record
+        if record.record_type in (
+            OptimizationRecordType.new,
+            OptimizationRecordType.replace,
+        ):
+            self._optimized_entity_to_records[record.new_entity] = record
+
+    def get_optimization_result(
+        self, original_entity: EntityType, default: Optional[EntityType] = None
+    ) -> EntityType:
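+        # follow the chain of replace records to the newest entity; return
+        # ``default`` if the entity was never optimized, or None if it was deleted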
+        entity = original_entity
+        if entity not in self._original_entity_to_records:
+            return default
+        while entity in self._original_entity_to_records:
+            record = self._original_entity_to_records[entity]
+            if record.record_type == OptimizationRecordType.replace:
+                entity = record.new_entity
+            else:
+                assert record.record_type == OptimizationRecordType.delete
+                return None
+        return entity
+
+    def get_original_entity(
+        self, optimized_entity: EntityType, default: Optional[EntityType] = None
+    ) -> EntityType:
+        entity = optimized_entity
+        if entity not in self._optimized_entity_to_records:
+            return default
+        while entity in self._optimized_entity_to_records:
+            record = self._optimized_entity_to_records[entity]
+            if record.record_type == OptimizationRecordType.replace:
+                entity = record.original_entity
+            else:
+                assert record.record_type == OptimizationRecordType.new
+                return None
+        return entity
+
+
+class OptimizationRule(ABC):
+    _preds_to_remove = weakref.WeakKeyDictionary()
+
+    def __init__(
+        self,
+        graph: EntityGraph,
+        records: OptimizationRecords,
+        optimizer_cls: Type["Optimizer"],
+    ):
+        self._graph = graph
+        self._records = records
+        self._optimizer_cls = optimizer_cls
+        self._cached_rule = functools.lru_cache(maxsize=None)(
+            lambda _rule_type: _rule_type(
+                self._graph, self._records, self._optimizer_cls
+            )
+        )
+
+    @abstractmethod
+    def apply(self) -> bool:
+        """
+        Apply the rule to the graph.
+
+        Returns
+        -------
+        bool
+            True if the graph was optimized by this rule.
+        """
+        pass
+
+    def _replace_node(self, original_node: EntityType, new_node: EntityType):
+        predecessors = self._graph.predecessors(original_node)
+        successors = self._graph.successors(original_node)
+        self._graph.remove_node(original_node)
+        self._graph.add_node(new_node)
+        for pred in predecessors:
+            self._graph.add_edge(pred, new_node)
+        for succ in successors:
+            self._graph.add_edge(new_node, succ)
+
+    def _add_collapsable_predecessor(self, node: EntityType, predecessor: EntityType):
+        pred_original = self._records.get_original_entity(predecessor, predecessor)
+        if pred_original not in self._preds_to_remove:
+            self._preds_to_remove[pred_original] = {node}
+        else:
+            self._preds_to_remove[pred_original].add(node)
+
+    def _remove_collapsable_predecessors(self, node: EntityType):
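+        # drop predecessors registered via ``_add_collapsable_predecessor`` once
+        # all of their successors have been collapsed and they are not results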
+        node = self._records.get_optimization_result(node) or node
+        preds_opt_to_remove = []
+        for pred in self._graph.predecessors(node):
+            pred_original = self._records.get_original_entity(pred, pred)
+            pred_opt = self._records.get_optimization_result(pred, pred)
+
+            if pred_opt in self._graph.results or pred_original in self._graph.results:
+                continue
+            affect_succ = self._preds_to_remove.get(pred_original) or []
+            affect_succ_opt = [
+                self._records.get_optimization_result(s, s) for s in affect_succ
+            ]
+            if all(s in affect_succ_opt for s in self._graph.successors(pred)):
+                preds_opt_to_remove.append((pred_original, pred_opt))
+
+        for pred_original, pred_opt in preds_opt_to_remove:
+            self._graph.remove_node(pred_opt)
+            self._records.append_record(
+                OptimizationRecord(pred_original, None, OptimizationRecordType.delete)
+            )
+
+
+class OperandBasedOptimizationRule(OptimizationRule):
+    """
+    Optimization rule that optimizes matched operands of the graph in topological order.
+    """
+
+    _rule_type_to_op_types: Dict[
+        Type[OptimizationRule], Set[Type[OperandType]]
+    ] = defaultdict(set)
+
+    @implements(OptimizationRule.apply)
+    def apply(self) -> bool:
+        visited = set()
+        optimized = False
+        for entity in list(self._graph.topological_iter()):
+            op = entity.op
+            if op in visited:
+                continue
+            visited.add(op)
+
+            if entity not in self._graph:  # pragma: no cover
+                # maybe removed during optimization
+                continue
+            op_types = self._rule_type_to_op_types[type(self)]
+            if isinstance(op, tuple(op_types)) and self.match_operand(op):
+                optimized = True
+                self.apply_to_operand(op)
+
+        return optimized
+
+    @abstractmethod
+    def apply_to_operand(self, op: OperandType) -> None:
+        """
+        Apply this rule to the given operand.
+
+        Parameters
+        ----------
+        op : OperandType
+            Operand.
+        """
+        pass
+
+    @abstractmethod
+    def match_operand(self, op: OperandType) -> bool:
+        """
+        Whether this operand matches this rule.
+
+        Parameters
+        ----------
+        op : OperandType
+            Operand.
+
+        Returns
+        -------
+        bool
+            Whether this operand matches this rule.
+        """
+        pass
+
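+    # Register an operand type (and its direct subclasses) as targets of this rule.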
+    @classmethod
+    def register_operand(cls, op_type: Type[OperandType]):
+        cls._rule_type_to_op_types[cls].add(op_type)
+        for derived in op_type.__subclasses__():
+            cls._rule_type_to_op_types[cls].add(derived)
+
+
+class Optimizer(ABC):
+    _rule_types: List[Type[OptimizationRule]]
+
+    @classmethod
+    def register_rule(cls, rule_type: Type[OptimizationRule]):
+        if not hasattr(cls, "_rule_types"):
+            cls._rule_types = []
+        cls._rule_types.append(rule_type)
+
+    @classmethod
+    def _replace_inputs(cls, graph: EntityGraph, records: OptimizationRecords):
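+        # Point the inputs of every successor at the optimized versions of its
+        # input tileables recorded during optimization.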
+        for node in graph:
+            for succ in graph.successors(node):
+                input_optimized = False
+                new_inputs = []
+                for inp in succ.inputs:
+                    optimized = records.get_optimization_result(inp)
+                    if optimized is None:
+                        optimized = inp
+                    if optimized is not inp:
+                        input_optimized = True
+                    new_inputs.append(optimized)
+                if input_optimized:
+                    succ.inputs = new_inputs
+
+    @classmethod
+    @enter_mode(build=True)
+    def optimize(cls, graph: EntityGraph) -> OptimizationRecords:
+        """
+        Optimize a graph.
+
+        Parameters
+        ----------
+        graph : EntityGraph
+            Tileable or chunk graph.
+
+        Returns
+        -------
+        optimization_records : OptimizationRecords
+            Optimization records.
+        """
+        records = OptimizationRecords()
+        cached_rule = functools.lru_cache(maxsize=None)(
+            lambda _rule_type: _rule_type(graph, records, cls)
+        )
+
+        for rule_type in cls._rule_types:
+            rule = cached_rule(rule_type)
+            if rule.apply():
+                cls._replace_inputs(graph, records)
+                new_results = []
+                for result in graph.results:
+                    new_results.append(
+                        records.get_optimization_result(result, default=result)
+                    )
+                graph.results = new_results
+
+        return records
diff --git a/python/xorbits/_mars/optimization/logical/tileable/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/__init__.py
new file mode 100644
index 000000000..f28460357
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .arithmetic_query import SeriesArithmeticToEval
+
+# TODO: the order in which optimization rules are applied depends on the import
+# order; column pruning must be applied first for now.
+from .column_pruning import ColumnPruningRule
+from .core import optimize
+from .head import HeadPushDown
diff --git a/python/xorbits/_mars/optimization/logical/tileable/arithmetic_query.py b/python/xorbits/_mars/optimization/logical/tileable/arithmetic_query.py
new file mode 100644
index 000000000..56a580aef
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/arithmetic_query.py
@@ -0,0 +1,366 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import weakref
+from typing import NamedTuple, Optional
+
+import numpy as np
+from pandas.api.types import is_scalar
+
+from .... import dataframe as md
+from ....core import ENTITY_TYPE, Tileable, get_output_types
+from ....dataframe.arithmetic.core import DataFrameBinopUfunc, DataFrameUnaryUfunc
+from ....dataframe.base.eval import DataFrameEval
+from ....dataframe.indexing.getitem import DataFrameIndex
+from ....dataframe.indexing.setitem import DataFrameSetitem
+from ....typing import OperandType
+from ....utils import implements
+from ..core import OptimizationRecord, OptimizationRecordType
+from ..tileable.core import register_operand_based_optimization_rule
+from .core import OperandBasedOptimizationRule
+
+
+class EvalExtractRecord(NamedTuple):
+    tileable: Optional[Tileable] = None
+    expr: Optional[str] = None
+    variables: Optional[dict] = None
+
+
+def _get_binop_builder(op_str: str):
+    def builder(lhs: str, rhs: str):
+        return f"({lhs}) {op_str} ({rhs})"
+
+    return builder
+
+
+_func_name_to_builder = {
+    "add": _get_binop_builder("+"),
+    "sub": _get_binop_builder("-"),
+    "mul": _get_binop_builder("*"),
+    "floordiv": _get_binop_builder("//"),
+    "truediv": _get_binop_builder("/"),
+    "pow": _get_binop_builder("**"),
+    "eq": _get_binop_builder("=="),
+    "ne": _get_binop_builder("!="),
+    "lt": _get_binop_builder("<"),
+    "le": _get_binop_builder("<="),
+    "gt": _get_binop_builder(">"),
+    "ge": _get_binop_builder(">="),
+    "__and__": _get_binop_builder("&"),
+    "__or__": _get_binop_builder("|"),
+    "__xor__": _get_binop_builder("^"),
+    "negative": lambda expr: f"-({expr})",
+    "__invert__": lambda expr: f"~({expr})",
+}
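+# For example, _func_name_to_builder["add"]("`a`", "1") yields "(`a`) + (1)";
+# the resulting expression is later evaluated by a DataFrameEval operand.
+
+# Cache of extraction results keyed weakly by tileable, so cached entries are
+# released together with their tileables.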
+_extract_result_cache = weakref.WeakKeyDictionary()
+
+
+@register_operand_based_optimization_rule([DataFrameUnaryUfunc, DataFrameBinopUfunc])
+class SeriesArithmeticToEval(OperandBasedOptimizationRule):
+    _var_counter = 0
+
+    @classmethod
+    def _next_var_id(cls):
+        cls._var_counter += 1
+        return cls._var_counter
+
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        if op.gpu:
+            return False
+        _, expr, _ = self._extract_eval_expression(op.outputs[0])
+        return expr is not None
+
+    @staticmethod
+    def _is_select_dataframe_column(tileable) -> bool:
+        if not isinstance(tileable, md.Series) or not isinstance(
+            tileable.op, DataFrameIndex
+        ):
+            return False
+
+        input_df = tileable.inputs[0]
+        index_op: DataFrameIndex = tileable.op
+        if (
+            not isinstance(input_df, md.DataFrame)
+            or input_df.dtypes is None
+            or not input_df.dtypes.index.is_unique
+            or any(not isinstance(v, str) for v in input_df.dtypes.keys())
+        ):
+            return False
+
+        return (
+            isinstance(input_df, md.DataFrame)
+            and input_df.dtypes is not None
+            and index_op.col_names is not None
+            and index_op.col_names in input_df.dtypes
+            and index_op.mask is None
+        )
+
+    def _extract_eval_expression(self, tileable) -> EvalExtractRecord:
+        if is_scalar(tileable):
+            if isinstance(tileable, (int, bool, str, bytes, np.integer, np.bool_)):
+                return EvalExtractRecord(expr=repr(tileable))
+            else:
+                var_name = f"__eval_scalar_var{self._next_var_id()}"
+                var_dict = {var_name: tileable}
+                return EvalExtractRecord(expr=f"@{var_name}", variables=var_dict)
+
+        if not isinstance(tileable, ENTITY_TYPE):  # pragma: no cover
+            return EvalExtractRecord()
+
+        if tileable in _extract_result_cache:
+            return _extract_result_cache[tileable]
+
+        if self._is_select_dataframe_column(tileable):
+            result = self._extract_column_select(tileable)
+        elif isinstance(tileable.op, DataFrameUnaryUfunc):
+            result = self._extract_unary(tileable)
+        elif isinstance(tileable.op, DataFrameBinopUfunc):
+            if tileable.op.fill_value is not None or tileable.op.level is not None:
+                result = EvalExtractRecord()
+            else:
+                result = self._extract_binary(tileable)
+        else:
+            result = EvalExtractRecord()
+
+        _extract_result_cache[tileable] = result
+        return result
+
+    @classmethod
+    def _extract_column_select(cls, tileable) -> EvalExtractRecord:
+        return EvalExtractRecord(tileable.inputs[0], f"`{tileable.op.col_names}`")
+
+    def _extract_unary(self, tileable) -> EvalExtractRecord:
+        op = tileable.op
+        func_name = getattr(op, "_func_name", None) or getattr(
+            op, "_bit_func_name", None
+        )
+        if func_name not in _func_name_to_builder:  # pragma: no cover
+            return EvalExtractRecord()
+
+        in_tileable, expr, variables = self._extract_eval_expression(op.inputs[0])
+        if in_tileable is None:
+            return EvalExtractRecord()
+
+        self._add_collapsable_predecessor(tileable, op.inputs[0])
+        return EvalExtractRecord(
+            in_tileable, _func_name_to_builder[func_name](expr), variables
+        )
+
+    def _extract_binary(self, tileable) -> EvalExtractRecord:
+        op = tileable.op
+        func_name = getattr(op, "_func_name", None) or getattr(op, "_bit_func_name")
+        if func_name not in _func_name_to_builder:  # pragma: no cover
+            return EvalExtractRecord()
+
+        lhs_tileable, lhs_expr, lhs_vars = self._extract_eval_expression(op.lhs)
+        if lhs_tileable is not None:
+            self._add_collapsable_predecessor(tileable, op.lhs)
+        rhs_tileable, rhs_expr, rhs_vars = self._extract_eval_expression(op.rhs)
+        if rhs_tileable is not None:
+            self._add_collapsable_predecessor(tileable, op.rhs)
+
+        if lhs_expr is None or rhs_expr is None:
+            return EvalExtractRecord()
+        if (
+            lhs_tileable is not None
+            and rhs_tileable is not None
+            and lhs_tileable.key != rhs_tileable.key
+        ):
+            return EvalExtractRecord()
+
+        variables = (lhs_vars or dict()).copy()
+        variables.update(rhs_vars or dict())
+        in_tileable = next(t for t in [lhs_tileable, rhs_tileable] if t is not None)
+        return EvalExtractRecord(
+            in_tileable, _func_name_to_builder[func_name](lhs_expr, rhs_expr), variables
+        )
+
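+    # Collapse the matched arithmetic subgraph into a single DataFrameEval node
+    # that evaluates the extracted expression on the source dataframe.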
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        in_tileable, expr, variables = self._extract_eval_expression(node)
+        opt_in_tileable = self._records.get_optimization_result(
+            in_tileable, in_tileable
+        )
+
+        new_op = DataFrameEval(
+            _key=node.op.key,
+            _output_types=get_output_types(node),
+            expr=expr,
+            variables=variables or dict(),
+            parser="pandas",
+            is_query=False,
+        )
+        new_node = new_op.new_tileable(
+            [opt_in_tileable], _key=node.key, _id=node.id, **node.params
+        ).data
+
+        self._remove_collapsable_predecessors(node)
+        self._replace_node(node, new_node)
+        self._graph.add_edge(opt_in_tileable, new_node)
+
+        self._records.append_record(
+            OptimizationRecord(node, new_node, OptimizationRecordType.replace)
+        )
+
+        # if the node is one of the graph results, replace it with the new node
+        try:
+            i = self._graph.results.index(node)
+            self._graph.results[i] = new_node
+        except ValueError:
+            pass
+
+
+class _DataFrameEvalRewriteRule(OperandBasedOptimizationRule):
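+    """
+    Base class for rules that rewrite an operand into a ``DataFrameEval`` node
+    when its columnar input has already been optimized into an eval expression.
+    """
+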
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: OperandType) -> bool:
+        optimized_eval_op = self._get_optimized_eval_op(op)
+        if (
+            op.gpu
+            or not isinstance(optimized_eval_op, DataFrameEval)
+            or optimized_eval_op.is_query
+            or optimized_eval_op.inputs[0].key != op.inputs[0].key
+        ):
+            return False
+        return True
+
+    def _build_new_eval_op(self, op: OperandType):
+        raise NotImplementedError
+
+    def _get_optimized_eval_op(self, op: OperandType) -> OperandType:
+        in_columnar_node = self._get_input_columnar_node(op)
+        optimized = self._records.get_optimization_result(in_columnar_node)
+        return optimized.op if optimized is not None else in_columnar_node.op
+
+    def _get_input_columnar_node(self, op: OperandType) -> ENTITY_TYPE:
+        raise NotImplementedError
+
+    def _update_op_node(self, old_node: ENTITY_TYPE, new_node: ENTITY_TYPE):
+        self._replace_node(old_node, new_node)
+        for in_tileable in new_node.inputs:
+            self._graph.add_edge(in_tileable, new_node)
+
+        original_node = self._records.get_original_entity(old_node, old_node)
+        self._records.append_record(
+            OptimizationRecord(original_node, new_node, OptimizationRecordType.replace)
+        )
+
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: OperandType):
+        node = op.outputs[0]
+        in_tileable = op.inputs[0]
+        in_columnar_node = self._get_input_columnar_node(op)
+        opt_in_tileable = self._records.get_optimization_result(
+            in_tileable, in_tileable
+        )
+
+        new_op = self._build_new_eval_op(op)
+        new_node = new_op.new_tileable(
+            [opt_in_tileable], _key=node.key, _id=node.id, **node.params
+        ).data
+
+        self._add_collapsable_predecessor(node, in_columnar_node)
+        self._remove_collapsable_predecessors(node)
+        self._update_op_node(node, new_node)
+
+
+@register_operand_based_optimization_rule([DataFrameIndex])
+class DataFrameBoolEvalToQuery(_DataFrameEvalRewriteRule):
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: DataFrameIndex) -> bool:
+        if (
+            op.col_names is not None
+            or not isinstance(op.mask, md.Series)
+            or op.mask.dtype != bool
+        ):
+            return False
+        return super().match_operand(op)
+
+    def _get_input_columnar_node(self, op: OperandType) -> ENTITY_TYPE:
+        return op.mask
+
+    def _build_new_eval_op(self, op: OperandType):
+        in_eval_op = self._get_optimized_eval_op(op)
+        return DataFrameEval(
+            _key=op.key,
+            _output_types=get_output_types(op.outputs[0]),
+            expr=in_eval_op.expr,
+            variables=in_eval_op.variables,
+            parser="pandas",
+            is_query=True,
+        )
+
+
+@register_operand_based_optimization_rule([DataFrameSetitem])
+class DataFrameEvalSetItemToEval(_DataFrameEvalRewriteRule):
+    @implements(OperandBasedOptimizationRule.match_operand)
+    def match_operand(self, op: DataFrameSetitem):
+        if not isinstance(op.indexes, str) or not isinstance(op.value, md.Series):
+            return False
+        return super().match_operand(op)
+
+    def _get_input_columnar_node(self, op: DataFrameSetitem) -> ENTITY_TYPE:
+        return op.value
+
+    def _build_new_eval_op(self, op: DataFrameSetitem):
+        in_eval_op = self._get_optimized_eval_op(op)
+        return DataFrameEval(
+            _key=op.key,
+            _output_types=get_output_types(op.outputs[0]),
+            expr=f"`{op.indexes}` = {in_eval_op.expr}",
+            variables=in_eval_op.variables,
+            parser="pandas",
+            is_query=False,
+            self_target=True,
+        )
+
+    @implements(OperandBasedOptimizationRule.apply_to_operand)
+    def apply_to_operand(self, op: DataFrameSetitem):
+        super().apply_to_operand(op)
+
+        node = op.outputs[0]
+        opt_node = self._records.get_optimization_result(node, node)
+        if not isinstance(opt_node.op, DataFrameEval):  # pragma: no cover
+            return
+
+        # when encountering consecutive SetItems, their expressions can be merged
+        # into a single multiline expression
+        pred_opt_node = opt_node.inputs[0]
+        if (
+            isinstance(pred_opt_node.op, DataFrameEval)
+            and opt_node.op.parser == pred_opt_node.op.parser == "pandas"
+            and not opt_node.op.is_query
+            and not pred_opt_node.op.is_query
+            and opt_node.op.self_target
+            and pred_opt_node.op.self_target
+        ):
+            new_expr = pred_opt_node.op.expr + "\n" + opt_node.op.expr
+            new_variables = (pred_opt_node.op.variables or dict()).copy()
+            new_variables.update(opt_node.op.variables or dict())
+
+            new_op = DataFrameEval(
+                _key=op.key,
+                _output_types=get_output_types(op.outputs[0]),
+                expr=new_expr,
+                variables=new_variables,
+                parser="pandas",
+                is_query=False,
+                self_target=True,
+            )
+            new_node = new_op.new_tileable(
+                pred_opt_node.inputs, _key=node.key, _id=node.id, **node.params
+            ).data
+
+            self._add_collapsable_predecessor(opt_node, pred_opt_node)
+            self._remove_collapsable_predecessors(opt_node)
+            self._update_op_node(opt_node, new_node)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/__init__.py
new file mode 100644
index 000000000..0ef066397
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .column_pruning_rule import ColumnPruningRule
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py
new file mode 100644
index 000000000..41927a78a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py
@@ -0,0 +1,241 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Dict, Set, Any, Type, Union, Optional
+
+import pandas as pd
+
+from .input_column_selector import InputColumnSelector
+from .self_column_selector import SelfColumnSelector
+from ..core import register_optimization_rule
+from ...core import (
+    OptimizationRecord,
+    OptimizationRecordType,
+    OptimizationRule,
+    OptimizationRecords,
+    Optimizer,
+)
+from .....core import TileableData
+from .....core.graph import EntityGraph
+from .....dataframe.core import (
+    parse_index,
+    BaseSeriesData,
+    BaseDataFrameData,
+)
+from .....dataframe.datasource.core import ColumnPruneSupportedDataSourceMixin
+from .....dataframe.groupby.aggregation import DataFrameGroupByAgg
+from .....dataframe.indexing.getitem import DataFrameIndex
+from .....dataframe.merge import DataFrameMerge
+from .....utils import implements
+
+OPTIMIZABLE_OP_TYPES = (DataFrameMerge, DataFrameGroupByAgg)
+
+
+@register_optimization_rule()
+class ColumnPruningRule(OptimizationRule):
+    def __init__(
+        self,
+        graph: EntityGraph,
+        records: OptimizationRecords,
+        optimizer_cls: Type["Optimizer"],
+    ):
+        super().__init__(graph, records, optimizer_cls)
+        self._context: Dict[TileableData, Dict[TileableData, Set[Any]]] = {}
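+        # Maps each tileable to the columns it requires from each of its
+        # predecessors; filled in reverse topological order by _build_context.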
+
+    def _get_successor_required_columns(self, data: TileableData) -> Set[Any]:
+        """
+        Get columns required by the successors of the given tileable data.
+        """
+        successors = self._get_successors(data)
+        if successors:
+            return set().union(
+                *[self._context[successor][data] for successor in successors]
+            )
+        else:
+            return self._get_all_columns(data)
+
+    @staticmethod
+    def _get_self_required_columns(data: TileableData) -> Set[Any]:
+        return SelfColumnSelector.select(data)
+
+    def _get_required_columns(self, data: TileableData) -> Optional[Set[Any]]:
+        required_columns = set()
+        successor_required_columns = self._get_successor_required_columns(data)
+        if successor_required_columns is None:
+            return None
+        required_columns.update(successor_required_columns)
+        self_required_columns = self._get_self_required_columns(data)
+        required_columns.update(self_required_columns)
+        return required_columns
+
+    @staticmethod
+    def _get_all_columns(data: TileableData) -> Union[Set[Any], None]:
+        """
+        Return all the columns of the given tileable data. If it is neither
+        BaseDataFrameData nor BaseSeriesData, None is returned, indicating that
+        column pruning is not available for it.
+        """
+        if isinstance(data, BaseDataFrameData) and data.dtypes is not None:
+            return set(data.dtypes.index)
+        elif isinstance(data, BaseSeriesData):
+            return {data.name}
+        else:
+            return None
+
+    def _get_successors(self, data: TileableData) -> List[TileableData]:
+        """
+        Get successors of the given tileable data.
+
+        Column pruning is available only when every successor is available for column pruning
+        (i.e. appears in the context).
+        """
+        successors = list(self._graph.successors(data))
+        if all(successor in self._context for successor in successors):
+            return successors
+        else:
+            return []
+
+    def _build_context(self) -> None:
+        """
+        Select required columns for each tileable data in the graph.
+        """
+        for data in self._graph.topological_iter(reverse=True):
+            if self._is_skipped_type(data):
+                continue
+            self._context[data] = InputColumnSelector.select(
+                data, self._get_successor_required_columns(data)
+            )
+
+    def _prune_columns(self) -> List[TileableData]:
+        pruned_nodes: List[TileableData] = []
+        datasource_nodes: List[TileableData] = []
+
+        node_list = list(self._graph.topological_iter())
+        for data in node_list:
+            if self._is_skipped_type(data):
+                continue
+
+            op = data.op
+
+            successor_required_columns = self._get_successor_required_columns(data)
+            if (
+                isinstance(op, ColumnPruneSupportedDataSourceMixin)
+                and successor_required_columns is not None
+                and set(successor_required_columns) != self._get_all_columns(data)
+            ):
+                op.set_pruned_columns(list(successor_required_columns))
+                self.effective = True
+                pruned_nodes.append(data)
+                datasource_nodes.append(data)
+                continue
+
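+            # For merge and groupby-agg operands, insert a getitem (DataFrameIndex)
+            # node in front of each prunable predecessor so that only the required
+            # columns are passed on.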
+            if isinstance(op, OPTIMIZABLE_OP_TYPES):
+                predecessors = list(self._graph.predecessors(data))
+                for predecessor in predecessors:
+                    if (
+                        self._is_skipped_type(predecessor)
+                        or predecessor in datasource_nodes
+                        # if the group by key is a series, no need to do column pruning
+                        or isinstance(predecessor, BaseSeriesData)
+                    ):
+                        continue
+
+                    pruned_columns = list(self._context[data][predecessor])
+                    if set(pruned_columns) == self._get_all_columns(predecessor):
+                        continue
+
+                    # new node init
+                    new_node_op = DataFrameIndex(
+                        col_names=pruned_columns,
+                    )
+                    new_params = predecessor.params.copy()
+                    new_params["shape"] = (
+                        new_params["shape"][0],
+                        len(pruned_columns),
+                    )
+                    new_params["dtypes"] = new_params["dtypes"][pruned_columns]
+                    new_params["columns_value"] = parse_index(
+                        new_params["dtypes"].index, store_data=True
+                    )
+                    new_node = new_node_op.new_dataframe(
+                        [predecessor], **new_params
+                    ).data
+
+                    # update context
+                    del self._context[data][predecessor]
+                    self._context[new_node] = {predecessor: set(pruned_columns)}
+                    self._context[data][new_node] = set(pruned_columns)
+
+                    # change edges and nodes
+                    self._graph.remove_edge(predecessor, data)
+                    self._graph.add_node(new_node)
+                    self._graph.add_edge(predecessor, new_node)
+                    self._graph.add_edge(new_node, data)
+
+                    self._records.append_record(
+                        OptimizationRecord(
+                            predecessor, new_node, OptimizationRecordType.new
+                        )
+                    )
+                    # update inputs
+                    data.inputs[data.inputs.index(predecessor)] = new_node
+                    self.effective = True
+                    pruned_nodes.extend([predecessor])
+        return pruned_nodes
+
+    def _update_tileable_params(self, pruned_nodes: List[TileableData]) -> None:
+        # change dtypes and columns_value
+        queue = list(pruned_nodes)
+        affected_nodes = set()
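+        # Breadth-first walk from the pruned nodes to collect every downstream
+        # tileable whose metadata (dtypes, columns, shape) may need updating.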
+        while len(queue) > 0:
+            node = queue.pop(0)
+            if isinstance(node.op, ColumnPruneSupportedDataSourceMixin):
+                affected_nodes.add(node)
+            for successor in self._graph.successors(node):
+                if successor not in affected_nodes:
+                    queue.append(successor)
+                    if not self._is_skipped_type(successor):
+                        affected_nodes.add(successor)
+
+        for node in affected_nodes:
+            required_columns = self._get_required_columns(node)
+            if (
+                isinstance(node, BaseDataFrameData)
+                and required_columns is not None
+                and set(required_columns) != set(node.dtypes.index)
+            ):
+                new_dtypes = pd.Series(
+                    dict(
+                        (col, dtype)
+                        for col, dtype in node.dtypes.items()
+                        if col in required_columns
+                    )
+                )
+                new_columns_value = parse_index(new_dtypes.index, store_data=True)
+                node._dtypes = new_dtypes
+                node._columns_value = new_columns_value
+                node._shape = (node.shape[0], len(new_dtypes))
+
+    @implements(OptimizationRule.apply)
+    def apply(self) -> bool:
+        self._build_context()
+        pruned_nodes = self._prune_columns()
+        self._update_tileable_params(pruned_nodes)
+        return len(pruned_nodes) > 0
+
+    @staticmethod
+    def _is_skipped_type(data: TileableData) -> bool:
+        """
+        If column pruning should be applied to the given tileable data.
+        """
+        return not isinstance(data, (BaseSeriesData, BaseDataFrameData))
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py
new file mode 100644
index 000000000..4481dca91
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py
@@ -0,0 +1,255 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import Callable, Dict, Any, Set
+
+from .....core import TileableData
+from .....dataframe import NamedAgg
+from .....dataframe.arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
+from .....dataframe.core import (
+    BaseDataFrameData,
+    BaseSeriesData,
+)
+from .....dataframe.groupby.aggregation import DataFrameGroupByAgg
+from .....dataframe.indexing.getitem import DataFrameIndex
+from .....dataframe.indexing.setitem import DataFrameSetitem
+from .....dataframe.merge import DataFrameMerge
+from .....typing import OperandType
+from .utils import get_cols_exclude_index
+
+
+class InputColumnSelector:
+    _OP_TO_SELECT_FUNCTION = {}
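+    # Maps operand classes to functions that compute, for a tileable, the columns
+    # it requires from each of its inputs.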
+
+    @staticmethod
+    def select_all_input_columns(
+        tileable_data: TileableData, _required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        ret = {}
+        for inp in tileable_data.op.inputs:
+            if isinstance(inp, BaseDataFrameData):
+                ret[inp] = set(inp.dtypes.index)
+            elif isinstance(inp, BaseSeriesData):
+                ret[inp] = {inp.name}
+        return ret
+
+    @staticmethod
+    def select_required_input_columns(
+        tileable_data: TileableData, required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        ret = {}
+        for inp in tileable_data.op.inputs:
+            if isinstance(inp, BaseDataFrameData):
+                ret[inp] = required_cols.intersection(set(inp.dtypes.index))
+            elif isinstance(inp, BaseSeriesData):
+                ret[inp] = {inp.name}
+        return ret
+
+    @classmethod
+    def register(
+        cls,
+        op_cls: OperandType,
+        func: Callable[[TileableData, Set[Any]], Dict[TileableData, Set[Any]]],
+        replace: bool = False,
+    ) -> None:
+        if op_cls not in cls._OP_TO_SELECT_FUNCTION or replace:
+            cls._OP_TO_SELECT_FUNCTION[op_cls] = func
+        else:
+            raise ValueError(f"key {op_cls} exists.")
+
+    @classmethod
+    def unregister(cls, op_cls: OperandType) -> None:
+        if op_cls in cls._OP_TO_SELECT_FUNCTION:
+            del cls._OP_TO_SELECT_FUNCTION[op_cls]
+
+    @classmethod
+    def select(
+        cls, tileable_data: TileableData, required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        """
+        Get the column pruning results of given tileable data.
+
+        Parameters
+        ----------
+        tileable_data : TileableData
+            The tileable data to be processed.
+        required_cols: List[Any]
+            Names of columns required by the successors of the given tileable data. If required_cols is None, all the
+            input columns will be selected.
+        Returns
+        -------
+        Dict[TileableData: List[Any]]
+            A dictionary that represents the column pruning results. For every key-value pairs in the dictionary, the
+            key is a predecessor of the given tileable data, and the value is a list of column names that the given
+            tileable data depends on.
+        """
+        if required_cols is None:
+            return cls.select_all_input_columns(tileable_data, set())
+
+        op_type = type(tileable_data.op)
+        if op_type in cls._OP_TO_SELECT_FUNCTION:
+            return cls._OP_TO_SELECT_FUNCTION[op_type](tileable_data, required_cols)
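+        # Fall back to a selector registered for a base class and cache it under
+        # the concrete operand type to speed up subsequent lookups.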
+        for op_cls in op_type.__mro__:
+            if op_cls in cls._OP_TO_SELECT_FUNCTION:
+                cls._OP_TO_SELECT_FUNCTION[op_type] = cls._OP_TO_SELECT_FUNCTION[op_cls]
+                return cls._OP_TO_SELECT_FUNCTION[op_cls](tileable_data, required_cols)
+        return cls.select_all_input_columns(tileable_data, required_cols)
+
+
+def register_selector(op_type: OperandType) -> Callable:
+    def wrap(selector_func: Callable):
+        InputColumnSelector.register(op_type, selector_func)
+        return selector_func
+
+    return wrap
+
+
+@register_selector(DataFrameMerge)
+def df_merge_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    op: DataFrameMerge = tileable_data.op
+    assert len(op.inputs) == 2
+    assert isinstance(op.inputs[0], BaseDataFrameData)
+    assert isinstance(op.inputs[1], BaseDataFrameData)
+    left_data: BaseDataFrameData = op.inputs[0]
+    right_data: BaseDataFrameData = op.inputs[1]
+
+    ret = defaultdict(set)
+    for df, suffix in zip([left_data, right_data], op.suffixes):
+        for col in df.dtypes.index:
+            if col in required_cols:
+                ret[df].add(col)
+            else:
+                # TODO: this does not work when col is a tuple.
+                suffix_col = str(col) + suffix
+                if suffix_col in required_cols:
+                    ret[df].add(col)
+                    # The column in the other dataframe has to be selected as well;
+                    # otherwise there will be no column with the suffix at runtime.
+                    other_data = right_data if df is left_data else left_data
+                    ret[other_data].add(col)
+
+    if op.on is not None:
+        ret[left_data].update(get_cols_exclude_index(left_data, op.on))
+        ret[right_data].update(get_cols_exclude_index(right_data, op.on))
+    if op.left_on is not None:
+        ret[left_data].update(get_cols_exclude_index(left_data, op.left_on))
+    if op.right_on is not None:
+        ret[right_data].update(get_cols_exclude_index(right_data, op.right_on))
+
+    return ret
+
+
+@register_selector(DataFrameGroupByAgg)
+def df_groupby_agg_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    op: DataFrameGroupByAgg = tileable_data.op
+    assert isinstance(op.inputs[0], (BaseDataFrameData, BaseSeriesData))
+    inp: BaseDataFrameData = op.inputs[0]
+    by = op.groupby_params["by"]
+    selection = op.groupby_params.get("selection", None)
+    raw_func = op.raw_func
+
+    ret = {}
+    # group by a series
+    groupby_series = False
+    if isinstance(by, list) and len(by) == 1 and isinstance(by[0], BaseSeriesData):
+        groupby_series = True
+        ret[by[0]] = {by[0].name}
+
+    if isinstance(inp, BaseSeriesData):
+        ret[inp] = {inp.name}
+    else:
+        selected_cols = set()
+        # group by keys should be included
+        if not groupby_series:
+            selected_cols.update(get_cols_exclude_index(inp, by))
+        # add agg columns
+        if op.raw_func is not None:
+            if op.raw_func == "size":
+                # special for size, its return value is always series
+                pass
+            elif isinstance(raw_func, dict):
+                selected_cols.update(set(raw_func.keys()))
+            else:
+                # no specified agg columns
+                # required_cols should always be a subset of selection
+                for col in required_cols:
+                    # col is a tuple when required col is a MultiIndex
+                    if isinstance(col, tuple):
+                        for c in col:
+                            selected_cols.add(c)
+                    selected_cols.add(col)
+                if selection is not None:
+                    if isinstance(selection, (list, tuple)):
+                        selected_cols.update(set(selection))
+                    else:
+                        selected_cols.add(selection)
+        elif op.raw_func_kw:
+            # add renamed columns
+            for _, origin in op.raw_func_kw.items():
+                if isinstance(origin, NamedAgg):
+                    selected_cols.add(origin.column)
+                else:
+                    assert isinstance(origin, tuple)
+                    selected_cols.add(origin[0])
+
+        ret[inp] = selected_cols.intersection(inp.dtypes.index)
+    return ret
+
+
+@register_selector(DataFrameSetitem)
+def df_setitem_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    if len(tileable_data.inputs) == 1:
+        # if value is not a Mars object, return required input columns
+        return InputColumnSelector.select_required_input_columns(
+            tileable_data, required_cols
+        )
+    else:
+        df, value = tileable_data.inputs
+        ret = {df: required_cols.intersection(set(df.dtypes.index))}
+        # if value is a Mars object, return all its columns so that setitem can be executed
+        if isinstance(value, BaseDataFrameData):
+            value_cols = set(value.dtypes.index)
+            ret[value] = value_cols
+        elif isinstance(value, BaseSeriesData):
+            value_cols = {value.name}
+            ret[value] = value_cols
+        return ret
+
+
+@register_selector(DataFrameIndex)
+def df_getitem_select_function(
+    tileable_data: TileableData, required_cols: Set[Any]
+) -> Dict[TileableData, Set[Any]]:
+    if tileable_data.op.col_names:
+        return InputColumnSelector.select_required_input_columns(
+            tileable_data, required_cols
+        )
+    else:
+        return InputColumnSelector.select_all_input_columns(
+            tileable_data, required_cols
+        )
+
+
+SELECT_REQUIRED_OP_TYPES = [DataFrameBinOp, DataFrameUnaryOp]
+for op_type in SELECT_REQUIRED_OP_TYPES:
+    InputColumnSelector.register(
+        op_type, InputColumnSelector.select_required_input_columns
+    )
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/self_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/self_column_selector.py
new file mode 100644
index 000000000..6c2ec83e2
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/self_column_selector.py
@@ -0,0 +1,156 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Set, Any, Callable
+
+from .utils import get_cols_exclude_index
+from .....core import TileableData
+from .....dataframe.core import BaseDataFrameData, BaseSeriesData
+from .....dataframe.groupby.aggregation import DataFrameGroupByAgg
+from .....dataframe.indexing.getitem import DataFrameIndex
+from .....dataframe.indexing.setitem import DataFrameSetitem
+from .....dataframe.merge import DataFrameMerge
+from .....typing import OperandType
+
+
+class SelfColumnSelector:
+    _OP_TO_SELECT_FUNCTION = {}
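+    # Maps operand classes to functions that compute the columns a tileable itself
+    # must keep (e.g. group-by or merge keys).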
+
+    @classmethod
+    def register(
+        cls,
+        op_cls: OperandType,
+        func: Callable[[TileableData], Set[Any]],
+        replace: bool = False,
+    ) -> None:
+        if op_cls not in cls._OP_TO_SELECT_FUNCTION or replace:
+            cls._OP_TO_SELECT_FUNCTION[op_cls] = func
+        else:
+            raise ValueError(f"key {op_cls} exists.")
+
+    @classmethod
+    def select(cls, tileable_data: TileableData) -> Set[Any]:
+        """
+        TODO: docstring
+        """
+        op_type = type(tileable_data.op)
+        if op_type in cls._OP_TO_SELECT_FUNCTION:
+            return cls._OP_TO_SELECT_FUNCTION[op_type](tileable_data)
+        for op_cls in op_type.__mro__:
+            if op_cls in cls._OP_TO_SELECT_FUNCTION:
+                cls._OP_TO_SELECT_FUNCTION[op_type] = cls._OP_TO_SELECT_FUNCTION[op_cls]
+                return cls._OP_TO_SELECT_FUNCTION[op_cls](tileable_data)
+        return set()
+
+
+def register_selector(op_type: OperandType) -> Callable:
+    def wrap(selector_func: Callable):
+        SelfColumnSelector.register(op_type, selector_func)
+        return selector_func
+
+    return wrap
+
+
+@register_selector(DataFrameSetitem)
+def df_setitem_select_function(tileable_data: TileableData) -> Set[Any]:
+    if isinstance(tileable_data.op.indexes, list):
+        return set(tileable_data.op.indexes)
+    else:
+        return {tileable_data.op.indexes}
+
+
+@register_selector(DataFrameIndex)
+def df_getitem_select_function(tileable_data: TileableData) -> Set[Any]:
+    if tileable_data.op.col_names is not None:
+        col_names = tileable_data.op.col_names
+        if isinstance(col_names, list):
+            return set(tileable_data.op.col_names)
+        else:
+            return {tileable_data.op.col_names}
+    else:
+        if isinstance(tileable_data, BaseDataFrameData):
+            return set(tileable_data.dtypes.index)
+        elif isinstance(tileable_data, BaseSeriesData):
+            return {tileable_data.name}
+        else:
+            return set()
+
+
+@register_selector(DataFrameGroupByAgg)
+def df_groupby_agg_select_function(tileable_data: TileableData) -> Set[Any]:
+    """
+    Make sure the "group by columns" are preserved.
+    """
+
+    op: DataFrameGroupByAgg = tileable_data.op
+    by = op.groupby_params["by"]
+
+    if isinstance(tileable_data, BaseDataFrameData):
+        return get_cols_exclude_index(tileable_data, by)
+    elif isinstance(tileable_data, BaseSeriesData):
+        return {tileable_data.name}
+    else:
+        return set()
+
+
+@register_selector(DataFrameMerge)
+def df_merge_select_function(tileable_data: TileableData) -> Set[Any]:
+    """
+    Make sure the merge keys are preserved.
+    """
+
+    op: DataFrameMerge = tileable_data.op
+    on = op.on
+    if on is not None:
+        return get_cols_exclude_index(tileable_data, on)
+
+    ret = set()
+    left_data: BaseDataFrameData = op.inputs[0]
+    right_data: BaseDataFrameData = op.inputs[1]
+    left_index = op.left_index
+    right_index = op.right_index
+    left_on = op.left_on if isinstance(op.left_on, list) else [op.left_on]
+    right_on = op.right_on if isinstance(op.right_on, list) else [op.right_on]
+
+    if left_index and right_index:
+        return ret
+
+    if left_index:
+        for col in right_data.dtypes.index:
+            if col in right_on:
+                ret.add(col)
+        return ret
+    if right_index:
+        for col in left_data.dtypes.index:
+            if col in left_on:
+                ret.add(col)
+        return ret
+
+    for data, merge_keys, suffix in zip(
+        [left_data, right_data], [left_on, right_on], op.suffixes
+    ):
+        if merge_keys is None:
+            continue
+        for col in data.dtypes.index:
+            if col in merge_keys:
+                other_data = right_data if data is left_data else left_data
+                other_merge_keys = right_on if merge_keys is left_on else left_on
+
+                if col in other_data.dtypes.index and col not in other_merge_keys:
+                    # if the merge key exists in the other dataframe but not in the other
+                    # dataframe's merge keys, suffixes will be added.
+                    # TODO: this does not work when col is a tuple.
+                    suffix_col = str(col) + suffix
+                    ret.add(suffix_col)
+                else:
+                    ret.add(col)
+    return ret
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/__init__.py
new file mode 100644
index 000000000..313d6ba7a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py
new file mode 100644
index 000000000..7190d9287
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py
@@ -0,0 +1,592 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import pandas as pd
+import pytest
+
+from ...... import dataframe as md
+from ...... import tensor as mt
+from ......dataframe.arithmetic import DataFrameMul
+from ......dataframe.base.eval import DataFrameEval
+from ......dataframe.base.isin import DataFrameIsin
+from ......dataframe.core import DataFrameData, SeriesData, DataFrameGroupByData
+from ......dataframe.datasource.dataframe import DataFrameDataSource
+from ......dataframe.datasource.read_csv import DataFrameReadCSV
+from ......dataframe.datasource.read_parquet import DataFrameReadParquet
+from ......dataframe.groupby.aggregation import DataFrameGroupByAgg
+from ......dataframe.groupby.core import DataFrameGroupByOperand
+from ......dataframe.indexing.getitem import DataFrameIndex
+from ......dataframe.indexing.setitem import DataFrameSetitem
+from ......dataframe.merge import DataFrameMerge
+from ......optimization.logical.tileable import optimize
+from ......tensor.core import TensorData
+from ......tensor.datasource import ArrayDataSource
+
+
+@pytest.fixture()
+def gen_data1():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "c1": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "c2": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c3": list("aabaaddce"),
+                "c4": list("abaaaddce"),
+            }
+        )
+
+        df2 = pd.DataFrame(
+            {
+                "c1": [3, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c2": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c3": list("aabaaddce"),
+                "c4": list("abaaaddce"),
+            }
+        )
+        file_path = os.path.join(tempdir, "test.csv")
+        file_path2 = os.path.join(tempdir, "test2.csv")
+
+        df.to_csv(file_path, index=False)
+        df2.to_csv(file_path2, index=False)
+        yield file_path, file_path2
+
+
+@pytest.fixture()
+def gen_data2():
+    with tempfile.TemporaryDirectory() as tempdir:
+        df = pd.DataFrame(
+            {
+                "c1": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "c2": [1, 3, 4, 5, 6, 5, 4, 4, 4],
+                "c3": [1, 3, 4, 1, 1, 9, 4, 4, 4],
+                "c4": [3, 0, 5, 3, 5, 4, 1, 2, 10],
+            }
+        )
+
+        df2 = pd.DataFrame(
+            {
+                "cc1": [3, 4, 5, 3, 5, 4, 1, 2, 3],
+                "cc2": [1, 6, 4, 5, 6, 5, 4, 4, 4],
+                "cc3": [1, 3, 4, 1, 1, 9, 4, 8, 4],
+                "cc4": [3, 0, 5, 3, 5, 4, 1, 2, 10],
+            }
+        )
+
+        file_path = os.path.join(tempdir, "test.pq")
+        file_path2 = os.path.join(tempdir, "test2.pq")
+        df.to_parquet(file_path)
+        df2.to_parquet(file_path2)
+        yield file_path, file_path2
+
+
+def test_groupby(setup, gen_data2):
+    # no column pruning
+    file_path, file_path2 = gen_data2
+    df1 = md.read_parquet(file_path)
+    df2 = md.read_parquet(file_path2)
+    m = df1.merge(df2, left_on="c1", right_on="cc1")
+    g = m.groupby(["c1"])
+
+    graph = g.build_graph()
+    optimize(graph)
+
+    assert len(graph.result_tileables) == 1
+    groupby_data = graph.result_tileables[0]
+    assert isinstance(groupby_data, DataFrameGroupByData)
+    assert isinstance(groupby_data.op, DataFrameGroupByOperand)
+    assert len(groupby_data.dtypes) == 8
+
+    assert len(groupby_data.inputs) == 1
+    merge_data = groupby_data.inputs[0]
+    assert isinstance(merge_data, DataFrameData)
+    assert isinstance(merge_data.op, DataFrameMerge)
+    assert len(merge_data.dtypes) == 8
+
+    assert len(merge_data.inputs) == 2
+    left_data = merge_data.inputs[0]
+    right_data = merge_data.inputs[1]
+    assert isinstance(left_data, DataFrameData)
+    assert isinstance(left_data.op, DataFrameReadParquet)
+    assert len(left_data.dtypes) == 4
+    assert isinstance(right_data, DataFrameData)
+    assert isinstance(right_data.op, DataFrameReadParquet)
+    assert len(right_data.dtypes) == 4
+
+
+def test_tensor(setup):
+    t = mt.tensor((1, 2, 3))
+    s = md.DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)}).isin(t)
+
+    graph = s.build_graph()
+    optimize(graph)
+
+    assert len(graph.result_tileables) == 1
+    isin_data = graph.result_tileables[0]
+    assert isinstance(isin_data, DataFrameData)
+    assert isinstance(isin_data.op, DataFrameIsin)
+    assert len(isin_data.dtypes) == 2
+
+    assert len(isin_data.inputs) == 2
+    df_data = isin_data.inputs[0]
+    assert isinstance(df_data, DataFrameData)
+    assert isinstance(df_data.op, DataFrameDataSource)
+    assert len(df_data.dtypes) == 2
+
+    tensor_data = isin_data.inputs[1]
+    assert isinstance(tensor_data, TensorData)
+    assert isinstance(tensor_data.op, ArrayDataSource)
+
+
+def test_groupby_agg(setup, gen_data1):
+    file_path, _ = gen_data1
+
+    df1 = md.read_csv(file_path)
+    c = df1.groupby("c1")["c2"].sum()
+
+    graph = c.build_graph()
+    optimize(graph)
+    groupby_agg_node = graph.result_tileables[0]
+    assert isinstance(groupby_agg_node, SeriesData)
+    assert isinstance(groupby_agg_node.op, DataFrameGroupByAgg)
+    assert type(groupby_agg_node.op) is DataFrameGroupByAgg
+    assert groupby_agg_node.name == "c2"
+
+    groupby_agg_node_preds = graph.predecessors(groupby_agg_node)
+    assert len(groupby_agg_node_preds) == 1
+    read_csv_node = groupby_agg_node_preds[0]
+    assert isinstance(read_csv_node, DataFrameData)
+    assert isinstance(read_csv_node.op, DataFrameReadCSV)
+    assert len(read_csv_node.op.usecols) == 2
+    assert set(read_csv_node.op.usecols) == {"c1", "c2"}
+
+    raw = pd.read_csv(file_path)
+    pd_res = raw.groupby("c1")["c2"].sum()
+    r = c.execute().fetch()
+    pd.testing.assert_series_equal(r, pd_res)
+
+
+def test_merge_and_getitem(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2, names=["c1", "c2", "cc3", "cc4"], header=0)
+    r = df1.merge(df2)["c1"]
+
+    graph = r.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.name == "c1"
+
+    assert len(graph.predecessors(index_node)) == 1
+    merge_node = graph.predecessors(index_node)[0]
+    assert type(merge_node.op) is DataFrameMerge
+
+    read_csv_node_left, read_csv_node_right = graph.predecessors(merge_node)
+    assert type(read_csv_node_left.op) is DataFrameReadCSV
+    assert type(read_csv_node_right.op) is DataFrameReadCSV
+    assert len(read_csv_node_left.op.usecols) == 2
+    assert len(read_csv_node_right.op.usecols) == 2
+    assert set(read_csv_node_left.op.usecols) == {"c1", "c2"}
+    assert set(read_csv_node_right.op.usecols) == {"c1", "c2"}
+
+    r = r.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2, names=["c1", "c2", "cc3", "cc4"], header=0)
+    expected = raw1.merge(raw2)["c1"]
+    pd.testing.assert_series_equal(r, expected)
+
+
+def test_merge_on_one_column(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+    c = df1.merge(df2, left_on="c1", right_on="c1")["c1"]
+
+    graph = c.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert type(index_node.op) is DataFrameIndex
+
+    index_node_preds = graph.predecessors(index_node)
+    assert len(index_node_preds) == 1
+
+    merge_node = index_node_preds[0]
+    assert type(merge_node.op) is DataFrameMerge
+
+    merge_node_preds = graph.predecessors(merge_node)
+    assert len(merge_node_preds) == 2
+
+    read_csv_node = merge_node_preds[0]
+    read_csv_op = read_csv_node.op
+    assert type(read_csv_op) is DataFrameReadCSV
+    assert len(read_csv_op.usecols) == 1
+    assert read_csv_op.usecols == ["c1"]
+
+    r = c.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    expected = raw1.merge(raw2, left_on="c1", right_on="c1")["c1"]
+    pd.testing.assert_series_equal(r, expected)
+
+
+def test_merge_on_two_columns(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+    c = df1.merge(df2, left_on=["c1", "c2"], right_on=["c1", "c2"])[["c1", "c2"]]
+
+    graph = c.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert type(index_node.op) is DataFrameIndex
+    assert len(index_node.op.col_names) == 2
+
+    merge_node = graph.predecessors(index_node)[0]
+    read_csv_node = graph.predecessors(merge_node)[0]
+    assert type(read_csv_node.op) is DataFrameReadCSV
+
+    use_cols = read_csv_node.op.usecols
+    assert len(use_cols) == 2
+    assert set(use_cols) == {"c1", "c2"}
+
+    r = c.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    expected = raw1.merge(raw2, left_on=["c1", "c2"], right_on=["c1", "c2"])[
+        ["c1", "c2"]
+    ]
+    pd.testing.assert_frame_equal(r, expected)
+
+
+def test_groupby_agg_then_merge(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    df1 = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+    r_group_res = df1.groupby(["c1"])[["c2"]].sum()
+    c = df2.merge(r_group_res, left_on=["c2"], right_on=["c2"])[["c1", "c3"]]
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    group_res = raw1.groupby(["c1"])[["c2"]].sum()
+    expected = raw2.merge(group_res, left_on=["c2"], right_on=["c2"])[["c1", "c3"]]
+    pd.testing.assert_frame_equal(r, expected)
+
+    index_node = graph.result_tileables[0]
+    assert type(index_node.op) is DataFrameIndex
+
+    merge_node = graph.predecessors(index_node)[0]
+    merge_node_preds = graph.predecessors(merge_node)
+
+    df2_node = [n for n in merge_node_preds if type(n.op) is DataFrameReadCSV][0]
+    assert set(df2_node.op.usecols) == {"c1", "c2", "c3"}
+
+    df1_node = [
+        n
+        for n in graph._nodes
+        if type(n.op) is DataFrameReadCSV and n.op.path == file_path
+    ][0]
+    assert type(df1_node.op) is DataFrameReadCSV
+    assert set(df1_node.op.usecols) == {"c1", "c2"}
+
+
+def test_merge_then_groupby_apply(setup, gen_data2):
+    file_path, file_path2 = gen_data2
+    df1 = md.read_parquet(file_path)
+    df2 = md.read_parquet(file_path2)
+
+    c = (
+        (
+            ((df1 + 1) * 2).merge(df2, left_on=["c1", "c3"], right_on=["cc2", "cc4"])[
+                ["c1", "cc4"]
+            ]
+            * 2
+        )
+        .groupby(["cc4"])
+        .apply(lambda x: x / x.sum())
+    )
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw1 = pd.read_parquet(file_path)
+    raw2 = pd.read_parquet(file_path2)
+    expected = (
+        (
+            ((raw1 + 1) * 2).merge(raw2, left_on=["c1", "c3"], right_on=["cc2", "cc4"])[
+                ["c1", "cc4"]
+            ]
+            * 2
+        )
+        .groupby(["cc4"])
+        .apply(lambda x: x / x.sum())
+    )
+    pd.testing.assert_frame_equal(r, expected)
+
+    read_parquet_nodes = [n for n in graph._nodes if type(n.op) is DataFrameReadParquet]
+    assert len(read_parquet_nodes) == 2
+
+    for n in read_parquet_nodes:
+        assert len(n.op.get_columns()) == 2
+
+    merge_node = [n for n in graph._nodes if type(n.op) is DataFrameMerge][0]
+    merge_node_preds = graph.predecessors(merge_node)
+    assert len(merge_node_preds) == 2
+
+    inserted_node = [n for n in merge_node_preds if type(n.op) is DataFrameIndex][0]
+    assert len(inserted_node.op.col_names) == 2
+    assert set(inserted_node.op.col_names) == {"c1", "c3"}
+
+    mul_node = graph.predecessors(inserted_node)[0]
+    assert type(mul_node.op) is DataFrameMul
+    assert set(mul_node.dtypes.index.tolist()) == {"c1", "c3"}
+
+
+def test_two_merges(setup, gen_data2):
+    file_path, file_path2 = gen_data2
+    df1 = md.read_parquet(file_path)
+    df2 = md.read_parquet(file_path2)
+    c = (
+        (df1 + 1)
+        .merge((df2 + 2), left_on=["c2", "c3"], right_on=["cc1", "cc4"])[
+            ["c2", "c4", "cc1", "cc2"]
+        ]
+        .merge(df2, left_on=["cc1"], right_on=["cc3"])
+    )
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw1 = pd.read_parquet(file_path)
+    raw2 = pd.read_parquet(file_path2)
+
+    expected = (
+        (raw1 + 1)
+        .merge((raw2 + 2), left_on=["c2", "c3"], right_on=["cc1", "cc4"])[
+            ["c2", "c4", "cc1", "cc2"]
+        ]
+        .merge(raw2, left_on=["cc1"], right_on=["cc3"])
+    )
+    pd.testing.assert_frame_equal(r, expected)
+
+    parquet_nodes = [n for n in graph._nodes if type(n.op) is DataFrameReadParquet]
+    assert len(parquet_nodes) == 2
+
+    # the read_parquet for df1 has its columns pushed down
+    df1_node = [n for n in parquet_nodes if n.op.path == file_path][0]
+    assert set(df1_node.op.get_columns()) == {"c2", "c3", "c4"}
+
+    # the read_parquet for df2 is not pushed down since all of its columns are needed
+    df2_node = [n for n in parquet_nodes if n.op.path == file_path2][0]
+    assert df2_node.op.columns is None
+
+    # verify that the inserted DataFrameIndex nodes take effect
+    inserted_nodes = [n for n in graph._nodes if type(n.op) is DataFrameIndex]
+    assert len(inserted_nodes) == 3
+
+    index_after_merge_node = [
+        n for n in inserted_nodes if type(graph.predecessors(n)[0].op) is DataFrameMerge
+    ][0]
+    assert set(index_after_merge_node.op.col_names) == {"c2", "c4", "cc1", "cc2"}
+
+
+def test_two_groupby_aggs_with_multi_index(setup, gen_data2):
+    file_path, _ = gen_data2
+    df = md.read_parquet(file_path)
+    c = (
+        (df * 2)
+        .groupby(["c2", "c3"])
+        .apply(lambda x: x["c1"].sum() / x["c2"].mean())
+        .reset_index()
+        .groupby("c3")
+        .agg(["min", "max"])
+    )
+    graph = c.build_graph()
+    optimize(graph)
+    r = c.execute().fetch()
+
+    raw = pd.read_parquet(file_path)
+    expected = (
+        (raw * 2)
+        .groupby(["c2", "c3"])
+        .apply(lambda x: x["c1"].sum() / x["c2"].mean())
+        .reset_index()
+        .groupby("c3")
+        .agg(["min", "max"])
+    )
+    pd.testing.assert_frame_equal(r, expected)
+
+    apply_node = [n for n in graph._nodes if type(n.op) is DataFrameGroupByAgg][0]
+    assert set(apply_node.columns.index_value._index_value._data) == {
+        (0, "min"),
+        (0, "max"),
+        ("c2", "max"),
+        ("c2", "min"),
+    }
+
+    # apply cannot be pushed down, so read_parquet keeps all columns
+    read_parquet_node = [
+        n
+        for n in graph._nodes
+        if type(n.op) is DataFrameReadParquet and n.op.path == file_path
+    ][0]
+    assert read_parquet_node.op.get_columns() is None
+
+
+def test_merge_and_get_col_with_suffix(setup, gen_data1):
+    file_path, file_path2 = gen_data1
+    left = md.read_csv(file_path)
+    right = md.read_csv(file_path2)
+    r = left.merge(right, on="c1")[["c3_x"]]
+
+    graph = r.build_graph()
+    optimize(graph)
+
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.op.col_names == ["c3_x"]
+
+    assert len(graph.predecessors(index_node)) == 1
+    merge_node = graph.predecessors(index_node)[0]
+    assert type(merge_node.op) is DataFrameMerge
+
+    read_csv_node_left, read_csv_node_right = graph.predecessors(merge_node)
+    assert type(read_csv_node_left.op) is DataFrameReadCSV
+    assert type(read_csv_node_right.op) is DataFrameReadCSV
+    assert len(read_csv_node_left.op.usecols) == 2
+    assert len(read_csv_node_right.op.usecols) == 2
+    assert set(read_csv_node_left.op.usecols) == {"c1", "c3"}
+    assert set(read_csv_node_right.op.usecols) == {"c1", "c3"}
+
+    r = r.execute().fetch()
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    expected = raw1.merge(raw2, on="c1")[["c3_x"]]
+    pd.testing.assert_frame_equal(r, expected)
+
+
+def test_getitem_with_mask(setup, gen_data1):
+    """
+    Getitem with mask shouldn't prune any column.
+    """
+    file_path, file_path2 = gen_data1
+    df = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+
+    df = df[df2["c1"] > 3]
+    r = df.groupby(by="c1", as_index=False).sum()["c2"]
+
+    graph = r.build_graph()
+    optimize(graph)
+
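+    # Expected optimized graph, checked bottom-up below: one read_csv keeps all
+    # columns (the mask prevents pruning) while a second read_csv keeps only "c1"
+    # to feed the eval mask; the masked getitem is followed by a ["c1", "c2"]
+    # selection, the groupby-sum and the final "c2" getitem.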
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.name == "c2"
+
+    assert len(graph.predecessors(index_node)) == 1
+    gb_node = graph.predecessors(index_node)[0]
+    assert isinstance(gb_node.op, DataFrameGroupByAgg)
+    assert set(gb_node.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(gb_node)) == 1
+    index_node_2 = graph.predecessors(gb_node)[0]
+    assert isinstance(index_node_2.op, DataFrameIndex)
+    assert set(index_node_2.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(index_node_2)) == 1
+    index_node_3 = graph.predecessors(index_node_2)[0]
+    assert isinstance(index_node_3.op, DataFrameIndex)
+    assert set(index_node_3.dtypes.index) == {"c1", "c2", "c3", "c4"}
+
+    assert len(graph.predecessors(index_node_3)) == 2
+    read_csv_node, eval_node = graph.predecessors(index_node_3)
+    assert isinstance(read_csv_node.op, DataFrameReadCSV)
+    assert isinstance(eval_node.op, DataFrameEval)
+    assert read_csv_node.op.usecols is None  # all the columns.
+    assert eval_node.name == "c1"
+
+    assert len(graph.predecessors(eval_node)) == 1
+    read_csv_node_2 = graph.predecessors(eval_node)[0]
+    assert isinstance(read_csv_node_2.op, DataFrameReadCSV)
+    assert read_csv_node_2.op.usecols == ["c1"]
+
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    raw1 = raw1[raw2["c1"] > 3]
+    expected = raw1.groupby(by="c1", as_index=False).sum()["c2"]
+    pd.testing.assert_series_equal(
+        r.execute(extra_config={"check_series_name": False}).fetch(), expected
+    )
+
+
+def test_setitem(setup, gen_data1):
+    """
+    The output of DataFrameSetitem should preserve the column being set so that tile can work
+    correctly.
+    """
+    file_path, file_path2 = gen_data1
+    df = md.read_csv(file_path)
+    df2 = md.read_csv(file_path2)
+
+    df["c5"] = df2["c1"]
+    r = df.groupby(by="c1", as_index=False).sum()["c2"]
+
+    graph = r.build_graph()
+    optimize(graph)
+
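+    # Expected optimized graph, checked bottom-up below: the read_csv feeding the
+    # setitem keeps {"c1", "c2"} while a second read_csv keeps only "c1" for the
+    # assigned column; the setitem output still carries "c5" so that tiling works,
+    # followed by the ["c1", "c2"] selection, the groupby-sum and the "c2" getitem.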
+    index_node = graph.result_tileables[0]
+    assert isinstance(index_node.op, DataFrameIndex)
+    assert index_node.name == "c2"
+
+    assert len(graph.predecessors(index_node)) == 1
+    gb_node = graph.predecessors(index_node)[0]
+    assert isinstance(gb_node.op, DataFrameGroupByAgg)
+    assert set(gb_node.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(gb_node)) == 1
+    index_node_2 = graph.predecessors(gb_node)[0]
+    assert isinstance(index_node_2.op, DataFrameIndex)
+    assert set(index_node_2.dtypes.index) == {"c1", "c2"}
+
+    assert len(graph.predecessors(index_node_2)) == 1
+    setitem_node = graph.predecessors(index_node_2)[0]
+    assert isinstance(setitem_node.op, DataFrameSetitem)
+    assert set(setitem_node.dtypes.index) == {"c1", "c2", "c5"}
+
+    assert len(graph.predecessors(setitem_node)) == 2
+    read_csv_node, index_node_3 = graph.predecessors(setitem_node)
+    assert isinstance(read_csv_node.op, DataFrameReadCSV)
+    assert isinstance(index_node_3.op, DataFrameIndex)
+    assert set(read_csv_node.op.usecols) == {"c1", "c2"}
+    assert index_node_3.name == "c1"
+
+    assert len(graph.predecessors(index_node_3)) == 1
+    read_csv_node_2 = graph.predecessors(index_node_3)[0]
+    assert isinstance(read_csv_node_2.op, DataFrameReadCSV)
+    assert read_csv_node_2.op.usecols == ["c1"]
+
+    raw1 = pd.read_csv(file_path)
+    raw2 = pd.read_csv(file_path2)
+    raw1["c5"] = raw2["c1"]
+    expected = raw1.groupby(by="c1", as_index=False).sum()["c2"]
+    pd.testing.assert_series_equal(r.execute().fetch(), expected)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py
new file mode 100644
index 000000000..5938f358a
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py
@@ -0,0 +1,301 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Any, Set, List, Union
+
+import pytest
+
+from ..input_column_selector import InputColumnSelector
+from ......core import TileableData, ENTITY_TYPE
+from ......core.operand import Operand
+from ......dataframe import DataFrame, Series
+from ......tensor import tensor
+
+
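+# A stand-in operand/entity pair used to exercise InputColumnSelector.register and
+# InputColumnSelector.unregister: the operand's only input is a two-column
+# ("foo", "bar") DataFrame, so the default selector falls back to requiring both.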
+class MockOperand(Operand):
+    _mock_input: TileableData = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)}).data
+
+    @property
+    def inputs(self) -> List[Union[ENTITY_TYPE]]:
+        return [self._mock_input]
+
+    @classmethod
+    def get_mock_input(cls) -> TileableData:
+        return cls._mock_input
+
+
+class MockEntityData(TileableData):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._op = MockOperand()
+
+
+def test_register():
+    def _select_input_columns(
+        tileable_data: TileableData, required_cols: Set[Any]
+    ) -> Dict[TileableData, Set[Any]]:
+        return {}
+
+    InputColumnSelector.register(MockOperand, _select_input_columns)
+    mock_data = MockEntityData()
+    assert InputColumnSelector.select(mock_data, {"foo"}) == {}
+
+    # unregister
+    InputColumnSelector.unregister(MockOperand)
+    assert InputColumnSelector.select(mock_data, {"foo"}) == {
+        MockOperand.get_mock_input(): {"foo", "bar"}
+    }
+
+
+def test_df_groupby_agg():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+
+    s = df.groupby(by="foo")["baz"].sum()
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "baz"}
+
+    s = df.groupby(by=["foo", "bar"]).sum()
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "bar", "baz"}
+
+    s = df.groupby(by="foo").agg(["sum", "max"])
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "baz"}
+
+    s = df.groupby(by="foo")["bar", "baz"].agg(["sum", "max"])
+    input_columns = InputColumnSelector.select(s.data, {"baz"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "bar", "baz"}
+
+    s = df.groupby(by="foo").agg(new_bar=("bar", "sum"), new_baz=("baz", "sum"))
+    input_columns = InputColumnSelector.select(s.data, {"new_bar"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"foo", "bar", "baz"}
+
+
+@pytest.mark.skip(reason="group by index is not supported yet")
+def test_df_groupby_index_agg():
+    df: DataFrame = DataFrame({"foo": (1, 1, 3), "bar": (4, 5, 6)})
+    df = df.set_index("foo")
+    s = df.groupby(by="foo").sum()
+    input_columns = InputColumnSelector.select(s.data, {"bar"})
+    assert len(input_columns) == 1
+    assert df.data in input_columns
+    assert input_columns[df.data] == {"bar"}
+
+
+def test_df_merge():
+    left: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6), 1: (7, 8, 9)})
+    right = DataFrame({"foo": (1, 2), "bar": (4, 5), "baz": (5, 8), 1: (7, 8)})
+
+    joined = left.merge(right, on=["foo"])
+
+    input_columns = InputColumnSelector.select(joined.data, {"foo"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo"}
+
+    input_columns = InputColumnSelector.select(joined.data, {"foo", "baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", "baz"}
+
+    input_columns = InputColumnSelector.select(joined.data, {"foo", "1_x"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo", 1}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", 1}
+
+    joined = left.merge(right, on=["foo", "bar"])
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo", "bar"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", "bar", "baz"}
+
+    joined = left.merge(right, on=["foo", "bar"])
+    input_columns = InputColumnSelector.select(joined.data, {"1_x", "1_y"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"foo", "bar", 1}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"foo", "bar", 1}
+
+
+def test_df_merge_on_index():
+    left: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6), 1: (7, 8, 9)})
+    left = left.set_index("foo")
+    right = DataFrame({"foo": (1, 2), "bar": (4, 5), "baz": (5, 8), 1: (7, 8)})
+    right = right.set_index("foo")
+
+    # join on index
+    joined = left.merge(right, on="foo")
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == set()
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"baz"}
+
+    # left_on is an index and right_on is a column
+    joined = left.merge(right, left_on="foo", right_on="bar")
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == set()
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"bar", "baz"}
+
+    # left_on is a column and right_on is an index
+    joined = left.merge(right, left_on="bar", right_on="foo")
+    input_columns = InputColumnSelector.select(joined.data, {"baz"})
+    assert left.data in input_columns
+    assert input_columns[left.data] == {"bar"}
+    assert right.data in input_columns
+    assert input_columns[right.data] == {"baz"}
+
+
+def test_df_arithmetic_ops():
+    def add(x, y):
+        return x + y
+
+    def sub(x, y):
+        return x - y
+
+    def mul(x, y):
+        return x * y
+
+    def div(x, y):
+        return x / y
+
+    ops = (add, sub, mul, div)
+    df1: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)})
+    df2: DataFrame = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6)})
+
+    for op in ops:
+        res: DataFrame = op(df1, 1)
+        input_columns = InputColumnSelector.select(res.data, {"foo"})
+        assert len(input_columns) == 1
+        assert res.data.inputs[0] in input_columns
+        assert input_columns[res.data.inputs[0]] == {"foo"}
+
+    for op in ops:
+        res: DataFrame = op(df1, df2)
+        input_columns = InputColumnSelector.select(res.data, {"foo"})
+        assert len(input_columns) == 2
+        assert res.data.inputs[0] in input_columns
+        assert input_columns[res.data.inputs[0]] == {"foo"}
+        assert res.data.inputs[1] in input_columns
+        assert input_columns[res.data.inputs[1]] == {"foo"}
+
+
+def test_df_setitem():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+
+    # scalar
+    df[4] = 13
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 1
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+
+    # scalar tensor
+    df[5] = tensor()
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 1
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+
+    # tensor
+    df[6] = tensor([13, 14, 15, 16])
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 2
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+    assert df.data.inputs[1] in input_columns
+    assert input_columns[df.data.inputs[1]] == {None}
+
+    # series
+    df[7] = Series([13, 14, 15, 16])
+    input_columns = InputColumnSelector.select(df.data, {"foo"})
+    assert len(input_columns) == 2
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == {"foo"}
+    assert df.data.inputs[1] in input_columns
+    assert input_columns[df.data.inputs[1]] == {None}
+
+    # dataframe
+    df[[8, 9]] = df[["foo", "bar"]]
+    input_columns = InputColumnSelector.select(df.data, {8})
+    assert len(input_columns) == 2
+    assert df.data.inputs[0] in input_columns
+    assert input_columns[df.data.inputs[0]] == set()
+    assert df.data.inputs[1] in input_columns
+    assert input_columns[df.data.inputs[1]] == {"foo", "bar"}
+
+
+def test_select_all():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+    head = df.head()
+    input_columns = InputColumnSelector.select(head.data, {"foo"})
+    assert len(input_columns) == 1
+    assert head.data.inputs[0] in input_columns
+    assert input_columns[head.data.inputs[0]] == {"foo", "bar", "baz", "qux"}
+
+
+def test_getitem():
+    df: DataFrame = DataFrame(
+        {
+            "foo": (1, 1, 2, 2),
+            "bar": (3, 4, 3, 4),
+            "baz": (5, 6, 7, 8),
+            "qux": (9, 10, 11, 12),
+        }
+    )
+
+    getitem = df[df["foo"] == 1]
+    input_columns = InputColumnSelector.select(getitem.data, {"foo"})
+    assert input_columns[getitem.data.inputs[0]] == {"foo", "bar", "baz", "qux"}
+
+    getitem = df["foo"]
+    input_columns = InputColumnSelector.select(getitem.data, {"foo"})
+    assert input_columns[getitem.data.inputs[0]] == {"foo"}
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_self_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_self_column_selector.py
new file mode 100644
index 000000000..7655e8aa0
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_self_column_selector.py
@@ -0,0 +1,97 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ......dataframe import DataFrame
+from ..self_column_selector import SelfColumnSelector
+
+
+def test_df_setitem():
+    df = DataFrame({"foo": (1, 1, 3)})
+
+    df["bar"] = [1, 2, 3]
+    required_columns = SelfColumnSelector.select(df.data)
+    assert required_columns == {"bar"}
+
+    df[["baz", "qux"]] = 1, 2
+    required_columns = SelfColumnSelector.select(df.data)
+    assert required_columns == {"baz", "qux"}
+
+
+def test_df_getitem():
+    df = DataFrame({"foo": (1, 1, 3), "bar": (4, 5, 6)})
+
+    getitem = df["foo"]
+    required_columns = SelfColumnSelector.select(getitem.data)
+    assert required_columns == {"foo"}
+
+    getitem = df[["foo", "bar"]]
+    required_columns = SelfColumnSelector.select(getitem.data)
+    assert required_columns == {"foo", "bar"}
+
+
+def test_df_groupby_agg():
+    df = DataFrame({"foo": (1, 1, 3), "bar": (4, 5, 6)})
+
+    a = df.groupby(by="foo", as_index=False).sum()
+    required_columns = SelfColumnSelector.select(a.data)
+    assert required_columns == {"foo"}
+
+    a = df.groupby(by="foo").sum()
+    required_columns = SelfColumnSelector.select(a.data)
+    assert required_columns == set()
+
+
+def test_df_merge():
+    left = DataFrame({"foo": (1, 2, 3), "bar": (4, 5, 6), 1: (7, 8, 9)})
+    right = DataFrame({"foo": (1, 2), "bar": (4, 5), "baz": (5, 8), 1: (7, 8)})
+
+    joined = left.merge(right, on="foo")
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, on=["foo", "bar"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo", "bar"}
+
+    joined = left.merge(right, left_on=["foo", "bar"], right_on=["foo", "bar"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo", "bar"}
+
+    joined = left.merge(right)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo", "bar", 1}
+
+    joined = left.merge(right, left_on=["foo"], right_on=["bar"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo_x", "bar_y"}
+
+    joined = left.merge(right, left_index=True, right_index=True)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == set()
+
+    joined = left.merge(right, left_index=True, right_on="foo")
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, left_index=True, right_on=["foo"])
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, left_on="foo", right_index=True)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
+
+    joined = left.merge(right, left_on=["foo"], right_index=True)
+    required_columns = SelfColumnSelector.select(joined.data)
+    assert required_columns == {"foo"}
diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/utils.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/utils.py
new file mode 100644
index 000000000..fdb188fc2
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/utils.py
@@ -0,0 +1,31 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Set
+
+from .....dataframe.core import BaseDataFrameData
+
+
+def get_cols_exclude_index(inp: BaseDataFrameData, cols: Any) -> Set[Any]:
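+    """
+    Keep only the labels in ``cols`` that are real columns of ``inp`` (i.e. appear
+    in ``inp.dtypes.index``); labels that refer to index levels are dropped.
+    """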
+    ret = set()
+    if isinstance(cols, (list, tuple)):
+        for col in cols:
+            if col in inp.dtypes.index:
+                # exclude index
+                ret.add(col)
+    else:
+        if cols in inp.dtypes.index:
+            # exclude index
+            ret.add(cols)
+    return ret
diff --git a/python/xorbits/_mars/optimization/logical/tileable/core.py b/python/xorbits/_mars/optimization/logical/tileable/core.py
new file mode 100644
index 000000000..b1ca31165
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/core.py
@@ -0,0 +1,49 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Type
+
+from ....core import OperandType, TileableGraph
+from ..core import (
+    OperandBasedOptimizationRule,
+    OptimizationRecords,
+    OptimizationRule,
+    Optimizer,
+)
+
+
+class TileableOptimizer(Optimizer):
+    """
+    Optimizer that applies registered logical optimization rules to a tileable graph.
+    """
+
+
+def register_optimization_rule():
+    def wrap(rule_type: Type[OptimizationRule]):
+        TileableOptimizer.register_rule(rule_type)
+        return rule_type
+
+    return wrap
+
+
+def register_operand_based_optimization_rule(op_types: List[Type[OperandType]]):
+    def wrap(rule_type: Type[OperandBasedOptimizationRule]):
+        for op_type in op_types:
+            rule_type.register_operand(op_type)
+        TileableOptimizer.register_rule(rule_type)
+        return rule_type
+
+    return wrap
+
+
+def optimize(tileable_graph: TileableGraph) -> OptimizationRecords:
+    return TileableOptimizer.optimize(tileable_graph)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/head.py b/python/xorbits/_mars/optimization/logical/tileable/head.py
new file mode 100644
index 000000000..3183a2249
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/head.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
+from ..common.head import HeadPushDown
+from .core import register_operand_based_optimization_rule
+
+
+@register_operand_based_optimization_rule([DataFrameIlocGetItem, SeriesIlocGetItem])
+class TileableHeadPushDown(HeadPushDown):
+    """
+    Head push down.
+    """
diff --git a/python/xorbits/_mars/optimization/logical/tileable/tests/__init__.py b/python/xorbits/_mars/optimization/logical/tileable/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/logical/tileable/tests/test_arithmetic_query.py b/python/xorbits/_mars/optimization/logical/tileable/tests/test_arithmetic_query.py
new file mode 100644
index 000000000..87c73d9c9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/tests/test_arithmetic_query.py
@@ -0,0 +1,184 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+import numpy as np
+import pandas as pd
+
+from ..... import dataframe as md
+from ..... import execute, fetch
+from .....core import TileableGraph, TileableGraphBuilder, enter_mode
+from .....dataframe.base.eval import DataFrameEval
+from .. import optimize
+
+_var_pattern = re.compile(r"@__eval_scalar_var\d+")
+
+
+def _norm_vars(var_str):
+    return _var_pattern.sub("@scalar", var_str)
+
+
+@enter_mode(build=True)
+def test_arithmetic_query(setup):
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+    raw2 = pd.DataFrame(np.random.rand(100, 5), columns=list("ABCDE"))
+
+    # does not support heterogeneous sources
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = md.DataFrame(raw2, chunk_size=10)
+    df3 = -(df1["A"] + df2["B"])
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df3.data) is None
+
+    # does not support customized args in arithmetic
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df3 = (-df1["A"]).add(df1["B"], fill_value=0.0)
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df3.data) is None
+
+    # does not support GPU
+    df1 = md.DataFrame(raw, chunk_size=10, gpu=True)
+    df4 = (-df1["A"]).add(df1["B"])
+    graph = TileableGraph([df4.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df4.data) is None
+
+    # does not support non-string headers
+    df1 = md.DataFrame(np.random.rand(100, 5))
+    df2 = df1[0] + df1[1]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df2.data) is None
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = -df1["A"] + df1["B"] * 5
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.expr == "(-(`A`)) + ((`B`) * (5))"
+
+    pd.testing.assert_series_equal(df2.execute().fetch(), -raw["A"] + raw["B"] * 5)
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = -df1["A"] + df1["B"] * 5 + 3 * df1["C"]
+    graph = TileableGraph([df1["A"].data, df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.expr == "((-(`A`)) + ((`B`) * (5))) + ((3) * (`C`))"
+
+    r_df2, _r_col_a = fetch(execute(df2, df1["A"]))
+    pd.testing.assert_series_equal(r_df2, -raw["A"] + raw["B"] * 5 + 3 * raw["C"])
+
+
+@enter_mode(build=True)
+def test_bool_eval_to_query(setup):
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+
+    # does not support non-eval inputs
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[(df1["A"] * 5).astype(bool)]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df2.data) is None
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[(df1["A"] > 0.5) & (df1["C"] < 0.5)]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert isinstance(opt_df2.op, DataFrameEval)
+    assert opt_df2.op.is_query
+    assert _norm_vars(opt_df2.op.expr) == "((`A`) > (@scalar)) & ((`C`) < (@scalar))"
+
+    pd.testing.assert_frame_equal(
+        df2.execute().fetch(), raw[(raw["A"] > 0.5) & (raw["C"] < 0.5)]
+    )
+
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[(df1["A"] > 0.5) & (df1["C"] < 0.5)] + 1
+    assert isinstance(opt_df2.op, DataFrameEval)
+    assert opt_df2.op.is_query
+
+    r_df2, _r_col_a = fetch(execute(df2, df1["A"]))
+    pd.testing.assert_frame_equal(r_df2, raw[(raw["A"] > 0.5) & (raw["C"] < 0.5)] + 1)
+
+    raw = pd.DataFrame(
+        {
+            "a": np.arange(100),
+            "b": [pd.Timestamp("2022-1-1") + pd.Timedelta(days=i) for i in range(100)],
+        }
+    )
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = df1[df1.b < pd.Timestamp("2022-3-20")]
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert _norm_vars(opt_df2.op.expr) == "(`b`) < (@scalar)"
+
+    r_df2 = fetch(execute(df2))
+    pd.testing.assert_frame_equal(r_df2, raw[raw.b < pd.Timestamp("2022-3-20")])
+
+
+@enter_mode(build=True)
+def test_eval_setitem_to_eval(setup):
+    raw = pd.DataFrame(np.random.rand(100, 10), columns=list("ABCDEFGHIJ"))
+    raw2 = pd.DataFrame(np.random.rand(100, 5), columns=list("ABCDE"))
+
+    # does not support non-eval value setting
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df1["K"] = 345
+    graph = TileableGraph([df1.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+
+    df1 = md.DataFrame(raw, chunk_size=10)
+    df2 = md.DataFrame(raw2, chunk_size=10)
+    df3 = df1.merge(df2, on="A", suffixes=("", "_"))
+    df3["K"] = df3["A"] * (1 - df3["B"])
+    df3["L"] = df3["K"] - df3["A"]
+    df3["M"] = df3["K"] + df3["L"]
+
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df3 = records.get_optimization_result(df3.data)
+    assert opt_df3.op.expr == "\n".join(
+        [
+            "`K` = (`A`) * ((1) - (`B`))",
+            "`L` = (`K`) - (`A`)",
+            "`M` = (`K`) + (`L`)",
+        ]
+    )
+    assert len(graph) == 4
+    assert len([n for n in graph if isinstance(n.op, DataFrameEval)]) == 1
+
+    r_df3 = raw.merge(raw2, on="A", suffixes=("", "_"))
+    r_df3["K"] = r_df3["A"] * (1 - r_df3["B"])
+    r_df3["L"] = r_df3["K"] - r_df3["A"]
+    r_df3["M"] = r_df3["K"] + r_df3["L"]
+    pd.testing.assert_frame_equal(df3.execute().fetch(), r_df3)
diff --git a/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py b/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py
new file mode 100644
index 000000000..b95b8adcd
--- /dev/null
+++ b/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py
@@ -0,0 +1,225 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ..... import dataframe as md
+from .....core import TileableGraph, TileableGraphBuilder, enter_mode
+from .....dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
+from .. import optimize
+
+
+@pytest.fixture(scope="module")
+def prepare_data():
+    rs = np.random.RandomState(0)
+    df = pd.DataFrame(
+        {
+            "a": rs.randint(10, size=100),
+            "b": rs.rand(100),
+            "c": rs.choice(list("abc"), size=100),
+        }
+    )
+
+    with tempfile.TemporaryDirectory() as tempdir:
+        yield tempdir, df
+
+
+def _execute_iloc(*_):  # pragma: no cover
+    raise ValueError("cannot run iloc")
+
+
+_iloc_operand_executors = {
+    DataFrameIlocGetItem: _execute_iloc,
+    SeriesIlocGetItem: _execute_iloc,
+}
+
+
+@enter_mode(build=True)
+def test_read_csv_head(prepare_data, setup):
+    tempdir, pdf = prepare_data
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path, index=False)
+
+    size = os.stat(file_path).st_size / 2
+    df1 = md.read_csv(file_path, chunk_bytes=size)
+    df2 = df1.head(5)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 5
+    assert len(graph) == 1
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf.head(5)
+    pd.testing.assert_frame_equal(result, expected)
+
+    # test multiple head
+    df3 = df1.head(10)
+    graph = TileableGraph([df2.data, df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    opt_df1 = records.get_optimization_result(df1.data)
+    assert opt_df1 is not None
+    assert opt_df1.op.nrows == 10
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2 is not None
+    assert graph.predecessors(opt_df2)[0] is opt_df1
+    assert opt_df2.inputs[0] is opt_df1
+    opt_df3 = records.get_optimization_result(df3.data)
+    assert opt_df3 is not None
+    assert graph.predecessors(opt_df3)[0] is opt_df1
+    assert opt_df3.inputs[0] is opt_df1
+
+    # test head with successor
+    df1 = md.read_csv(file_path, chunk_bytes=size)
+    df2 = df1.head(5)
+    df3 = df2 + 1
+    graph = TileableGraph([df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 5
+    assert len(graph) == 2
+
+
+@enter_mode(build=True)
+def test_read_parquet_head(prepare_data, setup):
+    tempdir, pdf = prepare_data
+    dirname = os.path.join(tempdir, "test_parquet")
+    os.makedirs(dirname)
+    for i in range(3):
+        file_path = os.path.join(dirname, f"test{i}.parquet")
+        pdf[i * 40 : (i + 1) * 40].to_parquet(file_path, index=False)
+
+    df1 = md.read_parquet(dirname)
+    df2 = df1.head(5)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 5
+    assert len(graph) == 1
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf.head(5)
+    pd.testing.assert_frame_equal(result, expected)
+
+
+@enter_mode(build=True)
+def test_sort_head(prepare_data, setup):
+    _, pdf = prepare_data
+
+    df1 = md.DataFrame(pdf, chunk_size=20)
+    df1 = df1.sort_values(by="b")
+    df2 = df1.head(10)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 10
+    assert len(graph) == 2
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf.sort_values(by="b").head(10)
+    pd.testing.assert_frame_equal(result, expected)
+
+    pdf2 = pdf.copy()
+    pdf2.set_index("b", inplace=True)
+    df1 = md.DataFrame(pdf2, chunk_size=20)
+    df1 = df1.sort_index()
+    df2 = df1.head(10)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 10
+    assert len(graph) == 2
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf2.sort_index().head(10)
+    pd.testing.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("chunk_size", [5, 10])
+@enter_mode(build=True)
+def test_value_counts_head(prepare_data, setup, chunk_size):
+    _, pdf = prepare_data
+    df = md.DataFrame(pdf, chunk_size=chunk_size)
+
+    df1 = df["a"].value_counts(method="tree")
+    df2 = df1.head(3)
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    opt_df2 = records.get_optimization_result(df2.data)
+    assert opt_df2.op.nrows == 3
+    assert len(graph) == 3
+    assert opt_df2 in graph.results
+
+    result = df2.execute(
+        extra_config={"operand_executors": _iloc_operand_executors}
+    ).fetch()
+    expected = pdf["a"].value_counts().head(3)
+    pd.testing.assert_series_equal(result, expected)
+
+
+@enter_mode(build=True)
+def test_no_head(prepare_data):
+    tempdir, pdf = prepare_data
+    file_path = os.path.join(tempdir, "test.csv")
+    pdf.to_csv(file_path, index=False)
+
+    size = os.stat(file_path).st_size / 2
+    df1 = md.read_csv(file_path, chunk_bytes=size)
+    df2 = df1.iloc[1:10]
+
+    graph = TileableGraph([df2.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    assert records.get_optimization_result(df2.data) is None
+
+    df2 = df1.head(3)
+    df3 = df1 + 1
+
+    graph = TileableGraph([df2.data, df3.data])
+    next(TileableGraphBuilder(graph).build())
+    records = optimize(graph)
+    assert records.get_optimization_result(df1.data) is None
+    assert records.get_optimization_result(df2.data) is None
+    assert records.get_optimization_result(df3.data) is None
diff --git a/python/xorbits/_mars/optimization/physical/__init__.py b/python/xorbits/_mars/optimization/physical/__init__.py
new file mode 100644
index 000000000..8f9a8f9ad
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import optimize
+from .cupy import CupyRuntimeOptimizer
+from .numexpr import NumexprRuntimeOptimizer
diff --git a/python/xorbits/_mars/optimization/physical/core.py b/python/xorbits/_mars/optimization/physical/core.py
new file mode 100644
index 000000000..a3b66afcd
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/core.py
@@ -0,0 +1,97 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Tuple, Type
+
+from ...core import ChunkGraph, ChunkType, OperandType
+from ...utils import build_fuse_chunk
+
+
+class RuntimeOptimizer(ABC):
+    engine = None
+
+    def __init__(self, graph: ChunkGraph):
+        self._graph = graph
+
+    @classmethod
+    @abstractmethod
+    def is_available(cls) -> bool:
+        """
+        Check whether this optimizer is available.
+
+        Returns
+        -------
+        is_available : bool
+            Available.
+        """
+
+    @abstractmethod
+    def optimize(self):
+        """
+        Optimize chunk graph.
+        """
+
+    def _fuse_nodes(
+        self, fuses: List[List[ChunkType]], fuse_cls: OperandType
+    ) -> Tuple[List[List[ChunkType]], List[ChunkType]]:
+        graph = self._graph
+        fused_nodes = []
+
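+        # For each chain of chunks, build a single fused chunk, wire it to the
+        # predecessors of the chain head and the successors of the chain tail,
+        # then drop the original chunks from the graph.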
+        for fuse in fuses:
+            head_node = fuse[0]
+            tail_node = fuse[-1]
+
+            fused_chunk = build_fuse_chunk(
+                fuse, fuse_cls, op_kw={"dtype": tail_node.dtype}
+            ).data
+            graph.add_node(fused_chunk)
+            for node in graph.iter_successors(tail_node):
+                graph.add_edge(fused_chunk, node)
+            for node in graph.iter_predecessors(head_node):
+                graph.add_edge(node, fused_chunk)
+            for node in fuse:
+                graph.remove_node(node)
+            fused_nodes.append(fused_chunk)
+
+            try:
+                # if the tail node is a graph result, replace it with the fused chunk
+                i = graph.results.index(tail_node)
+                graph.results[i] = fused_chunk
+            except ValueError:
+                pass
+
+        return fuses, fused_nodes
+
+
+_engine_to_optimizers: Dict[str, Type[RuntimeOptimizer]] = dict()
+
+
+def register_optimizer(optimizer_cls: Type[RuntimeOptimizer]):
+    _engine_to_optimizers[optimizer_cls.engine] = optimizer_cls
+    return optimizer_cls
+
+
+def optimize(graph: ChunkGraph, engines: List[str] = None) -> ChunkGraph:
+    if engines is None:
+        engines = ["numexpr", "cupy"]
+
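+    # every available optimizer rewrites the chunk graph in place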
+    for engine in engines:
+        optimizer_cls = _engine_to_optimizers[engine]
+        optimizer = optimizer_cls(graph)
+        if not optimizer.is_available():
+            continue
+        optimizer.optimize()
+
+    return graph
diff --git a/python/xorbits/_mars/optimization/physical/cupy.py b/python/xorbits/_mars/optimization/physical/cupy.py
new file mode 100644
index 000000000..16dd9278e
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/cupy.py
@@ -0,0 +1,70 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...tensor import arithmetic
+from ...tensor.fuse import TensorCpFuseChunk
+from ...utils import lazy_import
+from .core import RuntimeOptimizer, register_optimizer
+
+cp = lazy_import("cupy", rename="cp")
+CP_INSTALLED = cp is not None
+
+CP_ELEMENTWISE_OP = {
+    arithmetic.TensorSubtract,
+    arithmetic.TensorMultiply,
+    arithmetic.TensorTrueDiv,
+    arithmetic.TensorSqrt,
+}
+CP_OP = CP_ELEMENTWISE_OP
+
+
+@register_optimizer
+class CupyRuntimeOptimizer(RuntimeOptimizer):
+    engine = "cupy"
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return CP_INSTALLED
+
+    def optimize(self):
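+        # Greedily collect chains of element-wise cupy-supported operands and fuse
+        # each chain into a single TensorCpFuseChunk.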
+        fuses = []
+        explored = set()
+
+        graph = self._graph
+        for node in graph.topological_iter():
+            if type(node.op) not in CP_OP:
+                continue
+            if node in explored:
+                continue
+            if graph.count_predecessors(node) != 1:
+                continue
+            if node in graph.results:
+                continue
+
+            selected = [node]
+            # add successors
+            cur_node = graph.successors(node)[0]
+            while (
+                graph.count_predecessors(cur_node) == 1 and type(cur_node.op) in CP_OP
+            ):
+                selected.append(cur_node)
+                if graph.count_successors(cur_node) != 1 or cur_node in graph.results:
+                    break
+                else:
+                    cur_node = graph.successors(cur_node)[0]
+            if len(selected) > 1:
+                explored.update(selected)
+                fuses.append(list(selected))
+
+        return self._fuse_nodes(fuses, TensorCpFuseChunk)
diff --git a/python/xorbits/_mars/optimization/physical/numexpr.py b/python/xorbits/_mars/optimization/physical/numexpr.py
new file mode 100644
index 000000000..7f5d8c475
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/numexpr.py
@@ -0,0 +1,252 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import functools
+import logging
+from typing import List, Set
+
+import numpy as np
+
+from ...core import ChunkGraph, ChunkType
+from ...tensor import arithmetic, reduction
+from ...tensor.fuse import TensorNeFuseChunk
+from ...tensor.fuse.numexpr import NUMEXPR_INSTALLED
+from .core import RuntimeOptimizer, register_optimizer
+
+logger = logging.getLogger(__name__)
+
+
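+# Sentinel returned by _can_fuse for reduction operands: a reduction may only
+# appear as the tail of a fused sub-graph.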
+REDUCTION = object()
+REDUCTION_OP = {
+    reduction.TensorSum,
+    reduction.TensorProd,
+    reduction.TensorMax,
+    reduction.TensorMin,
+}
+SUPPORT_OP = {
+    arithmetic.TensorAdd,
+    arithmetic.TensorSubtract,
+    arithmetic.TensorMultiply,
+    arithmetic.TensorDivide,
+    arithmetic.TensorPower,
+    arithmetic.TensorMod,
+    arithmetic.TensorNegative,
+    arithmetic.TensorAbs,
+    arithmetic.TensorConj,
+    arithmetic.TensorExp,
+    arithmetic.TensorLog,
+    arithmetic.TensorLog10,
+    arithmetic.TensorExpm1,
+    arithmetic.TensorLog1p,
+    arithmetic.TensorSqrt,
+    arithmetic.TensorEqual,
+    arithmetic.TensorNotEqual,
+    arithmetic.TensorLessThan,
+    arithmetic.TensorLessEqual,
+    arithmetic.TensorGreaterThan,
+    arithmetic.TensorGreaterEqual,
+    arithmetic.TensorSin,
+    arithmetic.TensorCos,
+    arithmetic.TensorTan,
+    arithmetic.TensorArcsin,
+    arithmetic.TensorArccos,
+    arithmetic.TensorArctan,
+    arithmetic.TensorSinh,
+    arithmetic.TensorCosh,
+    arithmetic.TensorTanh,
+    arithmetic.TensorArcsinh,
+    arithmetic.TensorArccosh,
+    arithmetic.TensorArctanh,
+    arithmetic.TensorLshift,
+    arithmetic.TensorRshift,
+    arithmetic.TensorTreeAdd,
+    arithmetic.TensorTreeMultiply,
+    arithmetic.TensorFloor,
+    arithmetic.TensorCeil,
+    arithmetic.TensorAnd,
+    arithmetic.TensorOr,
+    arithmetic.TensorNot,
+    reduction.TensorSum,
+    reduction.TensorProd,
+    reduction.TensorMax,
+    reduction.TensorMin,
+}
+
+
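+# A fusible sub-graph together with its entry chunks (heads) and exit chunks (tails).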
+@dataclasses.dataclass
+class _Fuse:
+    graph: ChunkGraph
+    heads: List[ChunkType]
+    tails: List[ChunkType]
+
+
+def _can_fuse(node: ChunkType):
+    op = node.op
+    op_type = type(op)
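+    # a reduction is fusible only when it collapses a single axis or all axes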
+    if op_type in REDUCTION_OP:
+        if len(op.axis) == 1 or len(op.axis) == node.ndim:
+            return REDUCTION
+        else:
+            return False
+    if op_type not in SUPPORT_OP:
+        return False
+    if op_type in (arithmetic.TensorOr, arithmetic.TensorAnd):
+        # numexpr only support logical and or:
+        # https://numexpr.readthedocs.io/projects/NumExpr3/en/latest/user_guide.html#supported-operators
+        if np.isscalar(op.lhs) or np.isscalar(op.rhs):
+            return False
+    return True
+
+
+def _collect_fuse(
+    graph: ChunkGraph,
+    node: ChunkType,
+    graph_results: Set[ChunkType],
+    cached_can_fuse,
+):
+    fuse_graph = ChunkGraph()
+    fuse_graph.add_node(node)
+    fuse_heads = []
+    fuse_tails = []
+    tail_reduction_node = None
+
+    stack = [node]
+    # Do a full search of the subgraph even if there is more than one fuse tail
+    while len(stack) != 0:
+        node = stack.pop()
+        is_head = graph.count_predecessors(node) == 0
+        for n in graph.iter_predecessors(node):
+            can_fuse = cached_can_fuse(n)
+            if can_fuse is False or can_fuse is REDUCTION:
+                is_head = True
+            elif not fuse_graph.contains(n):
+                stack.append(n)
+                fuse_graph.add_node(n)
+            else:
+                fuse_graph.add_edge(n, node)
+        if is_head:
+            fuse_heads.append(node)
+        # Skip the successors of tail reduction node.
+        if node is tail_reduction_node:
+            continue
+        is_tail = graph.count_successors(node) == 0 or node in graph_results
+        for n in graph.iter_successors(node):
+            can_fuse = cached_can_fuse(n)
+            if can_fuse is False:
+                is_tail = True
+            elif can_fuse is REDUCTION:
+                if tail_reduction_node is None:
+                    tail_reduction_node = n
+                    fuse_tails.append(n)
+                    stack.append(n)
+                    fuse_graph.add_node(n)
+                elif n is tail_reduction_node:
+                    fuse_graph.add_edge(node, n)
+                else:
+                    is_tail = True
+            elif not fuse_graph.contains(n):
+                stack.append(n)
+                fuse_graph.add_node(n)
+            else:
+                fuse_graph.add_edge(node, n)
+        if is_tail:
+            fuse_tails.append(node)
+
+    return _Fuse(fuse_graph, fuse_heads, fuse_tails)
+
+
+@register_optimizer
+class NumexprRuntimeOptimizer(RuntimeOptimizer):
+    engine = "numexpr"
+
+    @classmethod
+    def is_available(cls) -> bool:
+        return NUMEXPR_INSTALLED
+
+    def optimize(self):
+        fuses = []
+        explored = set()
+        cached_can_fuse = functools.lru_cache(maxsize=None)(_can_fuse)
+
+        graph = self._graph
+        graph_results = set(graph.results)
+        for node in graph.topological_iter():
+            if node.op.gpu or node.op.sparse:
+                # numexpr cannot handle GPU or sparse chunks; skip optimization entirely
+                return [], []
+            if node in explored or node in graph_results:
+                continue
+            can_fuse = cached_can_fuse(node)
+            if can_fuse is True:
+                fuse = _collect_fuse(graph, node, graph_results, cached_can_fuse)
+                if len(fuse.graph) > 1:
+                    explored.update(fuse.graph)
+                    if len(fuse.tails) == 1:
+                        fuses.append(fuse)
+                    else:
+                        logger.info(
+                            "Refused fusing for numexpr because the tail node count > 1."
+                        )
+
+        return self._fuse_nodes(fuses, TensorNeFuseChunk)
+
+    def _fuse_nodes(self, fuses: List[_Fuse], fuse_cls):
+        graph = self._graph
+        fused_nodes = []
+
+        for fuse in fuses:
+            fuse_graph = fuse.graph
+            tail_nodes = fuse.tails
+            head_nodes = fuse.heads
+            inputs = [
+                inp for n in head_nodes for inp in n.inputs if inp not in fuse_graph
+            ]
+
+            tail_chunk = tail_nodes[0]
+            tail_chunk_op = tail_chunk.op
+            fuse_op = fuse_cls(
+                sparse=tail_chunk_op.sparse,
+                gpu=tail_chunk_op.gpu,
+                _key=tail_chunk_op.key,
+                fuse_graph=fuse_graph,
+                dtype=tail_chunk.dtype,
+            )
+            fused_chunk = fuse_op.new_chunk(
+                inputs,
+                kws=[tail_chunk.params],
+                _key=tail_chunk.key,
+                _chunk=tail_chunk,
+            ).data
+
+            graph.add_node(fused_chunk)
+            for node in graph.iter_successors(tail_chunk):
+                graph.add_edge(fused_chunk, node)
+            for head_chunk in head_nodes:
+                for node in graph.iter_predecessors(head_chunk):
+                    if not fuse_graph.contains(node):
+                        graph.add_edge(node, fused_chunk)
+            for node in fuse_graph:
+                graph.remove_node(node)
+            fused_nodes.append(fused_chunk)
+
+            try:
+                # if the tail chunk is one of the graph results, replace it with the fused chunk
+                i = graph.results.index(tail_chunk)
+                graph.results[i] = fused_chunk
+            except ValueError:
+                pass
+
+        return fuses, fused_nodes
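For intuition, each fused subgraph collected above maps to a single numexpr expression: element-wise operators chain freely, while a reduction may appear only once, as the outermost call. A minimal sketch of the corresponding numexpr behaviour, independent of Mars and assuming the numexpr package is installed:

import numexpr as ne
import numpy as np

a = np.random.rand(100, 50)
b = np.random.rand(100, 50)

# an element-wise chain can be evaluated as one fused expression
c = ne.evaluate("(a - b) / sqrt(b * (1 - b))")

# a reduction is only allowed as the outermost operation, which is why
# _collect_fuse keeps at most one tail reduction node per fused subgraph
s = ne.evaluate("sum(a * b, axis=1)")

print(c.shape, s.shape)  # (100, 50) (100,)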
diff --git a/python/xorbits/_mars/optimization/physical/tests/__init__.py b/python/xorbits/_mars/optimization/physical/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/optimization/physical/tests/test_cupy.py b/python/xorbits/_mars/optimization/physical/tests/test_cupy.py
new file mode 100644
index 000000000..0a3194d0e
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/tests/test_cupy.py
@@ -0,0 +1,41 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import tensor as mt
+from ....core import (
+    ChunkGraphBuilder,
+    TileableGraph,
+    TileableGraphBuilder,
+    TileContext,
+    enter_mode,
+)
+from ..cupy import CupyRuntimeOptimizer
+
+
+@enter_mode(build=True)
+def test_cupy():
+    t1 = mt.ones((100, 50), chunk_size=50, gpu=True)
+    t2 = mt.ones(50, chunk_size=50, gpu=True)
+    t = (t1 - t2) / mt.sqrt(t2 * (1 - t2) * len(t2))
+
+    graph = TileableGraph([t.data])
+    next(TileableGraphBuilder(graph).build())
+    context = TileContext()
+    chunk_graph_builder = ChunkGraphBuilder(
+        graph, fuse_enabled=False, tile_context=context
+    )
+    chunk_graph = next(chunk_graph_builder.build())
+
+    CupyRuntimeOptimizer(chunk_graph).optimize()
+    assert any(n.op.__class__.__name__ == "TensorCpFuseChunk" for n in chunk_graph)
diff --git a/python/xorbits/_mars/optimization/physical/tests/test_numexpr.py b/python/xorbits/_mars/optimization/physical/tests/test_numexpr.py
new file mode 100644
index 000000000..74d3a4b6e
--- /dev/null
+++ b/python/xorbits/_mars/optimization/physical/tests/test_numexpr.py
@@ -0,0 +1,383 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import operator
+
+from ....core import ChunkGraph
+from ....tensor.arithmetic import TensorTreeAdd
+from ....tensor.indexing import TensorSlice
+from ....tensor.reduction import TensorSum
+from ..numexpr import NumexprRuntimeOptimizer
+
+
+def test_numexpr():
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @                   @                          @
+          \               /                          /
+            @ --> @ --> S      ========>     # --> S
+          /               \                          \
+        @                   @                          @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(6)
+    ]
+    chunk_slice = TensorSlice().new_chunk([None], None).data
+    chunk_reduction = TensorSum(axis=(1,)).new_chunk([None], None).data
+    graph = ChunkGraph([chunks[4], chunks[5]])
+    list(map(graph.add_node, chunks[:6]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunks[3], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[4])
+    graph.add_edge(chunk_slice, chunks[5])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert fused_nodes[0].composed == chunks[:4]
+    assert len(graph) == 4
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @                   @
+          \               /
+            @ --> @ --> @      ========>   Tail node count > 1, can't be fused.
+          /               \
+        @                   @
+
+        no fusion happens, because a fused subgraph must end in exactly one tail node
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(7)
+    ]
+    graph = ChunkGraph([chunks[5], chunks[6]])
+    list(map(graph.add_node, chunks[:7]))
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunks[3], chunks[4])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[4], chunks[6])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 0
+    assert len(graph) == 7
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @           S       S
+          \        /       /
+            @ --> @ --> @      ========>   Tail node count > 1, can't be fused.
+          /               \
+        @                   @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(6)
+    ]
+    chunk_slices = [
+        TensorSlice(_key=str(n)).new_chunk([None], None).data for n in range(2)
+    ]
+    graph = ChunkGraph([chunks[5], chunk_slices[0], chunk_slices[1]])
+    list(map(graph.add_node, chunks[:6]))
+    list(map(graph.add_node, chunk_slices[:2]))
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunks[3], chunk_slices[0])
+    graph.add_edge(chunks[3], chunks[4])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[4], chunk_slices[1])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 0
+    assert len(graph) == 8
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @
+          \
+            @
+          /   \
+        @      \
+                 @   ========>   #
+        @      /
+          \   /
+            @
+          /
+        @
+
+        all nodes can be fused into a single node
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(7)
+    ]
+    graph = ChunkGraph([chunks[6]])
+    list(map(graph.add_node, chunks[:7]))
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[3], chunks[5])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[2], chunks[6])
+    graph.add_edge(chunks[5], chunks[6])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    sorted_composed = sorted(fused_nodes[0].composed, key=operator.attrgetter("key"))
+    assert sorted_composed == chunks
+    assert len(graph) == 1
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @
+          \
+            @
+          /   \                            #
+        @      \                              \
+                 S --> @ --> @  ========>       S --> #
+        @      /                              /
+          \   /                            #
+            @
+          /
+        @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(8)
+    ]
+    graph = ChunkGraph([chunks[7]])
+    list(map(graph.add_node, chunks[:8]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[2])
+    graph.add_edge(chunks[1], chunks[2])
+    graph.add_edge(chunks[3], chunks[5])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunks[2], chunk_slice)
+    graph.add_edge(chunks[5], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[6])
+    graph.add_edge(chunks[6], chunks[7])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 3
+    assert sorted(len(n.composed) for n in fused_nodes) == [2, 3, 3]
+    assert len(graph) == 4
+    assert graph.contains(chunk_slice)
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        S
+          \
+            @
+          /   \                         S
+        @      \                           \
+                 @ --- @   ========>    S --  #
+        @      /     /                     /
+          \   /     S                   S
+            @
+          /
+        S
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(6)
+    ]
+    chunk_slices = [
+        TensorSlice(_key=str(n)).new_chunk([None], None).data for n in range(3)
+    ]
+    graph = ChunkGraph([chunks[5]])
+    list(map(graph.add_node, chunks[:6]))
+    list(map(graph.add_node, chunk_slices[:3]))
+    graph.add_edge(chunk_slices[0], chunks[1])
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[2], chunks[3])
+    graph.add_edge(chunk_slices[1], chunks[3])
+    graph.add_edge(chunks[1], chunks[4])
+    graph.add_edge(chunks[3], chunks[4])
+    graph.add_edge(chunks[4], chunks[5])
+    graph.add_edge(chunk_slices[2], chunks[5])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 1
+    sorted_composed = sorted(fused_nodes[0].composed, key=operator.attrgetter("key"))
+    assert sorted_composed == chunks
+    assert len(graph) == 4
+    assert graph.count_predecessors(fused_nodes[0]) == 3
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @ --> @ --> S --> @  ========>  # --> S --> @
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    graph = ChunkGraph([chunks[2]])
+    list(map(graph.add_node, chunks[:3]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[2])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert fused_nodes[0].composed == chunks[:2]
+    assert len(fused_nodes) == 1
+
+    r"""
+        graph(@: node, S: Slice Chunk, #: fused_node):
+
+        @ --> @ --> S --> @ --> @   ========>  # --> S --> #
+
+        fusion stops at S, because numexpr doesn't support the Slice op
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    graph = ChunkGraph([chunks[3]])
+    list(map(graph.add_node, chunks[:4]))
+    graph.add_node(chunk_slice)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_slice)
+    graph.add_edge(chunk_slice, chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert fused_nodes[0].composed == chunks[:2]
+    assert fused_nodes[1].composed == chunks[2:4]
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        @ --> @ --> R --> @ --> @   ========>  # --> #
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    graph = ChunkGraph([chunks[3]])
+    list(map(graph.add_node, chunks[:4]))
+    graph.add_node(chunk_reduction)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_reduction)
+    graph.add_edge(chunk_reduction, chunks[2])
+    graph.add_edge(chunks[2], chunks[3])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 2
+    assert fused_nodes[0].composed == chunks[:2] + [chunk_reduction]
+    assert fused_nodes[1].composed == chunks[2:4]
+    assert len(graph) == 2
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        R --> @ --> @   ========>  R --> #
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(2)
+    ]
+    graph = ChunkGraph([chunks[1]])
+    list(map(graph.add_node, chunks[:2]))
+    graph.add_node(chunk_reduction)
+    graph.add_edge(chunk_reduction, chunks[0])
+    graph.add_edge(chunks[0], chunks[1])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 1
+    assert fused_nodes[0].composed == chunks[:2]
+    assert len(graph) == 2
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        @ --> @ --> R   ========>  #
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(2)
+    ]
+    graph = ChunkGraph([chunk_reduction])
+    list(map(graph.add_node, chunks[:2]))
+    graph.add_node(chunk_reduction)
+    graph.add_edge(chunks[0], chunks[1])
+    graph.add_edge(chunks[1], chunk_reduction)
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 1
+    assert fused_nodes[0].composed == chunks[:2] + [chunk_reduction]
+    assert len(graph) == 1
+
+    r"""
+        graph(@: node, R: Reduction Chunk, #: fused_node):
+
+        @
+          \                                        R
+            R     R                               /
+          /  \   /         =============>  # --> #     R
+        @      R     R                            \   /
+             /   \  /                               @ --> R
+            @     @ --> R
+
+        fusion stops at R, because a reduction must be the last op in a numexpr fusion.
+        """
+    chunks = [
+        TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(4)
+    ]
+    chunk_reductions = [
+        TensorSum(axis=(1,), _key=str(n)).new_chunk([None], None).data for n in range(5)
+    ]
+    graph = ChunkGraph([chunk_reductions[2], chunk_reductions[3], chunk_reductions[4]])
+    list(map(graph.add_node, chunks[:4]))
+    list(map(graph.add_node, chunk_reductions[:5]))
+    graph.add_edge(chunks[0], chunk_reductions[0])
+    graph.add_edge(chunks[1], chunk_reductions[0])
+    graph.add_edge(chunks[2], chunk_reductions[1])
+    graph.add_edge(chunk_reductions[0], chunk_reductions[1])
+    graph.add_edge(chunk_reductions[1], chunk_reductions[2])
+    graph.add_edge(chunk_reductions[1], chunks[3])
+    graph.add_edge(chunks[3], chunk_reductions[3])
+    graph.add_edge(chunks[3], chunk_reductions[4])
+
+    optimizer = NumexprRuntimeOptimizer(graph)
+    _, fused_nodes = optimizer.optimize()
+    assert len(fused_nodes) == 2
+    assert fused_nodes[0].composed == [chunks[2], chunk_reductions[1]]
+    assert set(fused_nodes[1].composed) == {chunks[0], chunks[1], chunk_reductions[0]}
+    assert len(graph) == 6
diff --git a/python/xorbits/_mars/oscar/__init__.py b/python/xorbits/_mars/oscar/__init__.py
new file mode 100644
index 000000000..48bafea48
--- /dev/null
+++ b/python/xorbits/_mars/oscar/__init__.py
@@ -0,0 +1,57 @@
+# isort: skip_file
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TypeVar, Union
+
+# import aio to ensure the patch is enabled for Python 3.6
+from ..lib import aio
+
+del aio
+
+from . import debug
+from .api import (
+    actor_ref,
+    create_actor,
+    has_actor,
+    destroy_actor,
+    kill_actor,
+    Actor,
+    StatelessActor,
+    create_actor_pool,
+    setup_cluster,
+    wait_actor_pool_recovered,
+    get_pool_config,
+)
+from .backends import allocate_strategy
+from .backends.pool import MainActorPoolType
+from .batch import extensible
+from .core import ActorRef
+from .debug import set_debug_options, get_debug_options, DebugOptions
+from .errors import (
+    ActorNotExist,
+    ActorAlreadyExist,
+    ServerClosed,
+    SendMessageFailed,
+    Return,
+)
+from .utils import create_actor_ref
+
+# make sure methods are registered
+from .backends import mars, ray, test
+
+del mars, ray, test
+
+_T = TypeVar("_T")
+ActorRefType = Union[ActorRef, _T]
diff --git a/python/xorbits/_mars/oscar/api.py b/python/xorbits/_mars/oscar/api.py
new file mode 100644
index 000000000..4382d4839
--- /dev/null
+++ b/python/xorbits/_mars/oscar/api.py
@@ -0,0 +1,142 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from numbers import Number
+from typing import Any, Dict, Tuple, Type
+from urllib.parse import urlparse
+
+from .backend import get_backend
+from .context import get_context
+from .core import ActorRef, _Actor, _StatelessActor
+
+
+async def create_actor(actor_cls, *args, uid=None, address=None, **kwargs) -> ActorRef:
+    ctx = get_context()
+    return await ctx.create_actor(actor_cls, *args, uid=uid, address=address, **kwargs)
+
+
+async def has_actor(actor_ref: ActorRef) -> bool:
+    ctx = get_context()
+    return await ctx.has_actor(actor_ref)
+
+
+async def destroy_actor(actor_ref: ActorRef):
+    ctx = get_context()
+    return await ctx.destroy_actor(actor_ref)
+
+
+async def actor_ref(*args, **kwargs) -> ActorRef:
+    ctx = get_context()
+    return await ctx.actor_ref(*args, **kwargs)
+
+
+async def kill_actor(actor_ref):
+    ctx = get_context()
+    return await ctx.kill_actor(actor_ref)
+
+
+async def create_actor_pool(address: str, n_process: int = None, **kwargs):
+    if address is None:
+        raise ValueError("address has to be provided")
+    if "://" not in address:
+        scheme = None
+    else:
+        scheme = urlparse(address).scheme or None
+
+    return await get_backend(scheme).create_actor_pool(
+        address, n_process=n_process, **kwargs
+    )
+
+
+async def wait_actor_pool_recovered(address: str, main_pool_address: str = None):
+    ctx = get_context()
+    return await ctx.wait_actor_pool_recovered(address, main_pool_address)
+
+
+async def get_pool_config(address: str):
+    ctx = get_context()
+    return await ctx.get_pool_config(address)
+
+
+def setup_cluster(address_to_resources: Dict[str, Dict[str, Number]]):
+    scheme_to_address_resources = defaultdict(dict)
+    for address, resources in address_to_resources.items():
+        if address is None:
+            raise ValueError("address has to be provided")
+        if "://" not in address:
+            scheme = None
+        else:
+            scheme = urlparse(address).scheme or None
+
+        scheme_to_address_resources[scheme][address] = resources
+    for scheme, address_resources in scheme_to_address_resources.items():
+        get_backend(scheme).get_driver_cls().setup_cluster(address_resources)
+
+
+class AsyncActorMixin:
+    @classmethod
+    def default_uid(cls):
+        return cls.__name__
+
+    def __new__(cls, *args, **kwargs):
+        try:
+            return _actor_implementation[cls](*args, **kwargs)
+        except KeyError:
+            return super().__new__(cls, *args, **kwargs)
+
+    async def __post_create__(self):
+        """
+        Method called after actor creation
+        """
+        return await super().__post_create__()
+
+    async def __pre_destroy__(self):
+        """
+        Method called before actor destruction
+        """
+        return await super().__pre_destroy__()
+
+    async def __on_receive__(self, message: Tuple[Any]):
+        """
+        Handle message from other actors and dispatch them to user methods
+
+        Parameters
+        ----------
+        message : tuple
+            Message shall be (method_name,) + args + (kwargs,)
+        """
+        return await super().__on_receive__(message)
+
+
+class Actor(AsyncActorMixin, _Actor):
+    pass
+
+
+class StatelessActor(AsyncActorMixin, _StatelessActor):
+    pass
+
+
+_actor_implementation: Dict[Type[Actor], Type[Actor]] = dict()
+
+
+def register_actor_implementation(actor_cls: Type[Actor], impl_cls: Type[Actor]):
+    _actor_implementation[actor_cls] = impl_cls
+
+
+def unregister_actor_implementation(actor_cls: Type[Actor]):
+    try:
+        del _actor_implementation[actor_cls]
+    except KeyError:
+        pass
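As a rough usage sketch (not part of this diff): an actor class subclasses Actor, and its methods are invoked through the ActorRef returned by create_actor. The address is assumed here to be the external address of an actor pool that is already running.

from xorbits._mars import oscar as mo


class Counter(mo.Actor):
    def __init__(self):
        self._value = 0

    def inc(self, n: int = 1):
        self._value += n
        return self._value


async def demo(address: str):
    # `address` is assumed to belong to a running actor pool
    ref = await mo.create_actor(Counter, uid=Counter.default_uid(), address=address)
    assert await mo.has_actor(ref)
    print(await ref.inc(3))  # method calls go through the actor ref
    await mo.destroy_actor(ref)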
diff --git a/python/xorbits/_mars/oscar/backend.py b/python/xorbits/_mars/oscar/backend.py
new file mode 100644
index 000000000..23c0a1716
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backend.py
@@ -0,0 +1,62 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Dict, Type
+
+from .context import register_backend_context
+from .driver import register_backend_driver
+
+__all__ = ["BaseActorBackend", "register_backend", "get_backend"]
+
+
+class BaseActorBackend(ABC):
+    @staticmethod
+    @abstractmethod
+    def name():
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_context_cls():
+        pass
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_driver_cls():
+        pass
+
+
+_scheme_to_backend_cls: Dict[str, Type[BaseActorBackend]] = dict()
+
+
+def register_backend(backend_cls: Type[BaseActorBackend]):
+    name = backend_cls.name()
+    if isinstance(name, (list, tuple)):
+        names = name
+    else:
+        names = [name]
+    for name in names:
+        _scheme_to_backend_cls[name] = backend_cls
+        register_backend_context(name, backend_cls.get_context_cls())
+        register_backend_driver(name, backend_cls.get_driver_cls())
+    return backend_cls
+
+
+def get_backend(name):
+    return _scheme_to_backend_cls[name]
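A hypothetical registration, sketched only to show how the scheme-to-backend mapping above gets populated; InProcBackend and its placeholder context/driver classes are illustrative and not part of this diff.

from xorbits._mars.oscar.backend import BaseActorBackend, get_backend, register_backend


class _PlaceholderContext:  # real backends return a concrete context class
    pass


class _PlaceholderDriver:  # real backends return a concrete driver class
    pass


@register_backend
class InProcBackend(BaseActorBackend):
    @staticmethod
    def name():
        # a backend may answer for one scheme or for a list of schemes
        return "inproc"

    @staticmethod
    def get_context_cls():
        return _PlaceholderContext

    @staticmethod
    def get_driver_cls():
        return _PlaceholderDriver


assert get_backend("inproc") is InProcBackend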
diff --git a/python/xorbits/_mars/oscar/backends/__init__.py b/python/xorbits/_mars/oscar/backends/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/allocate_strategy.py b/python/xorbits/_mars/oscar/backends/allocate_strategy.py
new file mode 100644
index 000000000..f9fcc3ffe
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/allocate_strategy.py
@@ -0,0 +1,159 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from random import choice
+from typing import Dict, Optional, Tuple
+
+from ...utils import implements
+from ..core import ActorRef
+from ..errors import NoIdleSlot
+from .config import ActorPoolConfig
+from .message import _MessageBase
+
+allocated_value = Tuple["AllocateStrategy", Optional[_MessageBase]]
+allocated_values = Dict[Optional[ActorRef], allocated_value]
+allocated_type = Dict[str, allocated_values]
+
+
+class AllocateStrategy(ABC):
+    __slots__ = ()
+
+    @abstractmethod
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        """
+        Get the external address that the actor is allocated to.
+
+        Parameters
+        ----------
+        config: ActorPoolConfig
+            Actor pool config.
+        allocated:
+            Actors already allocated, together with their allocation strategies.
+
+        Returns
+        -------
+        allocated_address: str
+            External address to allocate.
+        """
+
+
+class AddressSpecified(AllocateStrategy):
+    __slots__ = ("address",)
+
+    def __init__(self, address):
+        self.address = address
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return self.address
+
+
+class MainPool(AllocateStrategy):
+    __slots__ = ()
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        # allocate to main process
+        main_process_index = config.get_process_indexes()[0]
+        return config.get_external_address(main_process_index)
+
+
+class Random(AllocateStrategy):
+    __slots__ = ()
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return choice(config.get_external_addresses())
+
+
+class RandomSubPool(AllocateStrategy):
+    __slots__ = ()
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return choice(config.get_external_addresses()[1:])
+
+
+class ProcessIndex(AllocateStrategy):
+    __slots__ = ("process_index",)
+
+    def __init__(self, process_index: int):
+        self.process_index = process_index
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        actual_process_index = config.get_process_indexes()[self.process_index]
+        return config.get_pool_config(actual_process_index)["external_address"][0]
+
+
+class RandomLabel(AllocateStrategy):
+    __slots__ = ("label",)
+
+    def __init__(self, label):
+        self.label = label
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        return choice(config.get_external_addresses(label=self.label))
+
+
+class IdleLabel(AllocateStrategy):
+    __slots__ = "label", "mark"
+
+    def __init__(self, label, mark):
+        self.label = label
+        self.mark = mark
+
+    def __hash__(self):
+        return hash((type(self), self.label, self.mark))
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, IdleLabel)
+            and self.label == other.label
+            and self.mark == other.mark
+        )
+
+    @implements(AllocateStrategy.get_allocated_address)
+    def get_allocated_address(
+        self, config: ActorPoolConfig, allocated: allocated_type
+    ) -> str:
+        addresses = config.get_external_addresses(label=self.label)
+        for addr in addresses:
+            occupied = False
+            for strategy, _ in allocated.get(addr, dict()).values():
+                if strategy == self:
+                    occupied = True
+                    break
+            if not occupied:
+                return addr
+        raise NoIdleSlot(
+            f"No idle slot for creating actor "
+            f"with label {self.label}, mark {self.mark}"
+        )
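A brief sketch of how these strategies are typically consumed: they are passed along when creating an actor so the pool can decide which process hosts it. The allocate_strategy keyword forwarded through create_actor is an assumption here, not something defined in this file.

from xorbits._mars import oscar as mo
from xorbits._mars.oscar.backends.allocate_strategy import (
    IdleLabel,
    ProcessIndex,
    RandomSubPool,
)


async def place_actors(address: str, actor_cls):
    # pin to the second worker process of the pool
    ref1 = await mo.create_actor(
        actor_cls, address=address, allocate_strategy=ProcessIndex(1)
    )
    # any sub pool, i.e. anything except the main process
    ref2 = await mo.create_actor(
        actor_cls, address=address, allocate_strategy=RandomSubPool()
    )
    # one actor per idle slot labelled "io"; raises NoIdleSlot when exhausted
    ref3 = await mo.create_actor(
        actor_cls, address=address, allocate_strategy=IdleLabel("io", "reader-1")
    )
    return ref1, ref2, ref3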
diff --git a/python/xorbits/_mars/oscar/backends/communication/__init__.py b/python/xorbits/_mars/oscar/backends/communication/__init__.py
new file mode 100644
index 000000000..02b720e06
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import Channel, Client, Server
+from .core import gen_local_address, get_client_type, get_server_type
+from .dummy import DummyChannel, DummyClient, DummyServer
+from .socket import (
+    SocketChannel,
+    SocketClient,
+    SocketServer,
+    UnixSocketClient,
+    UnixSocketServer,
+)
+from .ucx import (  # noqa: F401 # pylint: disable=unused-import
+    UCXChannel,
+    UCXClient,
+    UCXServer,
+)
diff --git a/python/xorbits/_mars/oscar/backends/communication/base.py b/python/xorbits/_mars/oscar/backends/communication/base.py
new file mode 100644
index 000000000..513136609
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/base.py
@@ -0,0 +1,305 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any, Callable, Coroutine, Dict, Type
+
+from ....utils import classproperty, implements
+
+
+class ChannelType(Enum):
+    local = 0  # for local communication
+    ipc = 1  # inproc
+    remote = 2  # remote
+    ray = 3  # for ray actors communication
+
+
+class Channel(ABC):
+    """
+    Channel is used to exchange data between server and client.
+    """
+
+    __slots__ = "local_address", "dest_address", "compression"
+
+    name = None
+
+    def __init__(
+        self, local_address: str = None, dest_address: str = None, compression=None
+    ):
+        self.local_address = local_address
+        self.dest_address = dest_address
+        self.compression = compression
+
+    @abstractmethod
+    async def send(self, message: Any):
+        """
+        Send data to the destination. Each send should be matched by exactly one recv,
+        otherwise received messages may overlap.
+
+        Parameters
+        ----------
+        message:
+            data to send to the destination.
+        """
+
+    @abstractmethod
+    async def recv(self):
+        """
+        Receive data sent from the destination.
+        """
+
+    @abstractmethod
+    async def close(self):
+        """
+        Close channel.
+        """
+
+    @property
+    @abstractmethod
+    def closed(self) -> bool:
+        """
+        Whether this channel is closed.
+
+        Returns
+        -------
+        closed:
+            If the channel is closed.
+        """
+
+    @property
+    @abstractmethod
+    def type(self) -> ChannelType:
+        """
+        What this channel is used for; can be dummy, ipc or remote.
+
+        Returns
+        -------
+        channel_type: ChannelType
+            type that can be dummy, ipc or remote.
+        """
+
+    @property
+    def info(self) -> Dict:
+        return {
+            "name": self.name,
+            "compression": self.compression,
+            "type": self.type,
+            "local_address": self.local_address,
+            "dest_address": self.dest_address,
+        }
+
+
+class Server(ABC):
+    __slots__ = "address", "channel_handler"
+
+    scheme = None
+
+    def __init__(
+        self, address: str, channel_handler: Callable[[Channel], Coroutine] = None
+    ):
+        self.address = address
+        self.channel_handler = channel_handler
+
+    @classproperty
+    @abstractmethod
+    def client_type(self) -> Type["Client"]:
+        """
+        Return the corresponding client type.
+
+        Returns
+        -------
+        client_type: type
+            client type.
+        """
+
+    @property
+    @abstractmethod
+    def channel_type(self) -> ChannelType:
+        """
+        Channel type, can be dummy, ipc or remote.
+
+        Returns
+        -------
+        channel_type: ChannelType
+            type that can be dummy, ipc or remote.
+        """
+
+    @staticmethod
+    @abstractmethod
+    async def create(config: Dict) -> "Server":
+        """
+        Create a server instance according to configuration.
+
+        Parameters
+        ----------
+        config: dict
+            configuration for creating the server.
+
+        Returns
+        -------
+        server: Server
+            a server waiting for connections from clients.
+        """
+
+    @abstractmethod
+    async def start(self):
+        """
+        Start listening on a port or perform similar setup.
+        """
+
+    @abstractmethod
+    async def join(self, timeout=None):
+        """
+        Wait until the server stops, or until the timeout elapses.
+        """
+
+    @abstractmethod
+    async def on_connected(self, *args, **kwargs):
+        """
+        Return a channel when a new client connects.
+
+        Returns
+        -------
+        channel: Channel
+            channel for communication
+        """
+
+    @abstractmethod
+    async def stop(self):
+        """
+        Stop the server.
+        """
+
+    @property
+    @abstractmethod
+    def stopped(self) -> bool:
+        """
+        Whether this server is stopped.
+
+        Returns
+        -------
+        if_stopped: bool
+            Whether this server is stopped.
+        """
+
+    @property
+    def info(self) -> Dict:
+        return {
+            "name": self.scheme,
+            "address": self.address,
+            "channel_type": self.channel_type,
+        }
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        # skip parsing config by default
+        return dict()
+
+    async def __aenter__(self):
+        await self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.stop()
+
+
+class Client(ABC):
+    __slots__ = "local_address", "dest_address", "channel"
+
+    scheme = None
+
+    def __init__(self, local_address: str, dest_address: str, channel: Channel):
+        self.local_address = local_address
+        self.dest_address = dest_address
+        self.channel = channel
+
+    @property
+    def channel_type(self) -> ChannelType:
+        """
+        Channel type, can be dummy, ipc or remote.
+
+        Returns
+        -------
+        channel_type: ChannelType
+            type that can be dummy, ipc or remote.
+        """
+        return self.channel.type
+
+    @staticmethod
+    @abstractmethod
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        """
+        Create a client that is able to connect to some server.
+
+        Parameters
+        ----------
+        dest_address: str
+            Destination server address to connect to.
+        local_address: str
+            local address.
+
+        Returns
+        -------
+        client: Client
+            Client that holds a channel to communicate.
+        """
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        # skip parsing config by default
+        return dict()
+
+    @implements(Channel.send)
+    async def send(self, message):
+        return await self.channel.send(message)
+
+    @implements(Channel.recv)
+    async def recv(self):
+        return await self.channel.recv()
+
+    async def close(self):
+        """
+        Close connection.
+        """
+        await self.channel.close()
+
+    @property
+    def closed(self) -> bool:
+        """
+        Whether this client is closed.
+
+        Returns
+        -------
+        closed: bool
+            If the client is closed.
+        """
+        return self.channel.closed
+
+    @property
+    def info(self) -> Dict:
+        return {
+            "local_address": self.local_address,
+            "dest_address": self.dest_address,
+            "channel_name": self.channel.name,
+            "channel_type": self.channel_type,
+        }
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()
diff --git a/python/xorbits/_mars/oscar/backends/communication/core.py b/python/xorbits/_mars/oscar/backends/communication/core.py
new file mode 100644
index 000000000..d716fb372
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/core.py
@@ -0,0 +1,66 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Type
+from urllib.parse import urlparse
+
+from .base import Client, Server
+
+_scheme_to_client_types: Dict[str, Type[Client]] = dict()
+_scheme_to_server_types: Dict[str, Type[Server]] = dict()
+
+
+def register_client(client_type: Type[Client]):
+    _scheme_to_client_types[client_type.scheme] = client_type
+    return client_type
+
+
+def register_server(server_type: Type[Server]):
+    _scheme_to_server_types[server_type.scheme] = server_type
+    return server_type
+
+
+def _check_scheme(scheme: str, types: Dict):
+    if scheme == "":
+        scheme = None
+    if scheme not in types:  # pragma: no cover
+        raise ValueError(
+            f"address illegal, address scheme "
+            f"should be one of "
+            f'{", ".join(types)}, '
+            f"got {scheme}"
+        )
+    return scheme
+
+
+def get_scheme(address: str) -> str:
+    if "://" not in address:
+        scheme = None
+    else:
+        scheme = urlparse(address).scheme
+    return scheme
+
+
+def get_client_type(address: str) -> Type[Client]:
+    scheme = _check_scheme(get_scheme(address), _scheme_to_client_types)
+    return _scheme_to_client_types[scheme]
+
+
+def get_server_type(address: str) -> Type[Server]:
+    scheme = _check_scheme(get_scheme(address), _scheme_to_server_types)
+    return _scheme_to_server_types[scheme]
+
+
+def gen_local_address(process_index: int) -> str:
+    return f"dummy://{process_index}"
diff --git a/python/xorbits/_mars/oscar/backends/communication/dummy.py b/python/xorbits/_mars/oscar/backends/communication/dummy.py
new file mode 100644
index 000000000..08c0ba8e5
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/dummy.py
@@ -0,0 +1,230 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import weakref
+from typing import Any, Callable, Coroutine, Dict, Type
+from urllib.parse import urlparse
+
+from ....utils import abc_type_require_weakref_slot, classproperty, implements
+from ...errors import ServerClosed
+from .base import Channel, ChannelType, Client, Server
+from .core import register_client, register_server
+from .errors import ChannelClosed
+
+DEFAULT_DUMMY_ADDRESS = "dummy://0"
+
+
+class DummyChannel(Channel):
+    """
+    Channel for communication within the same process.
+    """
+
+    __slots__ = "_in_queue", "_out_queue", "_closed"
+
+    name = "dummy"
+
+    def __init__(
+        self,
+        in_queue: asyncio.Queue,
+        out_queue: asyncio.Queue,
+        closed: asyncio.Event,
+        local_address: str = None,
+        dest_address: str = None,
+        compression=None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self._in_queue = in_queue
+        self._out_queue = out_queue
+        self._closed = closed
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return ChannelType.local
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot send message")
+        # put message directly into queue
+        self._out_queue.put_nowait(message)
+
+    @implements(Channel.recv)
+    async def recv(self):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot write message")
+        try:
+            return await self._in_queue.get()
+        except RuntimeError:
+            if self._closed.is_set():
+                pass
+
+    @implements(Channel.close)
+    async def close(self):
+        self._closed.set()
+
+    @property
+    @implements(Channel.closed)
+    def closed(self) -> bool:
+        return self._closed.is_set()
+
+
+@register_server
+class DummyServer(Server):
+    __slots__ = ("_closed", "_channels", "_tasks") + (
+        ("__weakref__",) if abc_type_require_weakref_slot else tuple()
+    )
+
+    _address_to_instances: Dict[str, "DummyServer"] = weakref.WeakValueDictionary()
+    scheme = "dummy"
+
+    def __init__(
+        self, address: str, channel_handler: Callable[[Channel], Coroutine] = None
+    ):
+        super().__init__(address, channel_handler)
+        self._closed = asyncio.Event()
+        self._channels = []
+        self._tasks = []
+
+    @classmethod
+    def get_instance(cls, address: str):
+        return cls._address_to_instances[address]
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return DummyClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.local
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "DummyServer":
+        config = config.copy()
+        address = config.pop("address", DEFAULT_DUMMY_ADDRESS)
+        handle_channel = config.pop("handle_channel")
+        if urlparse(address).scheme != DummyServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f"Address for DummyServer "
+                f'should start with "dummy://", '
+                f"got {address}"
+            )
+        if config:  # pragma: no cover
+            raise TypeError(
+                f"Creating DummyServer got unexpected " f'arguments: {",".join(config)}'
+            )
+        try:
+            server = DummyServer.get_instance(address)
+            if server.stopped:
+                raise KeyError("server closed")
+        except KeyError:
+            server = DummyServer(address, handle_channel)
+            DummyServer._address_to_instances[address] = server
+        return server
+
+    @implements(Server.start)
+    async def start(self):
+        # nothing needs to do for dummy server
+        pass
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        wait_coro = self._closed.wait()
+        try:
+            await asyncio.wait_for(wait_coro, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):
+            pass
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        if self._closed.is_set():  # pragma: no cover
+            raise ServerClosed("Dummy server already closed")
+
+        channel = args[0]
+        assert isinstance(channel, DummyChannel)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        self._channels.append(channel)
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._closed.set()
+        _ = [t.cancel() for t in self._tasks]
+        await asyncio.gather(*(channel.close() for channel in self._channels))
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return self._closed.is_set()
+
+
+@register_client
+class DummyClient(Client):
+    __slots__ = ("_task",)
+
+    scheme = DummyServer.scheme
+
+    def __init__(self, local_address: str, dest_address: str, channel: Channel):
+        super().__init__(local_address, dest_address, channel)
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        if urlparse(dest_address).scheme != DummyServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f'Destination address should start with "dummy://" '
+                f"for DummyClient, got {dest_address}"
+            )
+        server = DummyServer.get_instance(dest_address)
+        if server is None:  # pragma: no cover
+            raise RuntimeError(
+                f"DummyServer {dest_address} needs to be created first before DummyClient"
+            )
+        if server.stopped:  # pragma: no cover
+            raise ConnectionError(f"Dummy server {dest_address} closed")
+
+        q1, q2 = asyncio.Queue(), asyncio.Queue()
+        closed = asyncio.Event()
+        client_channel = DummyChannel(q1, q2, closed, local_address=local_address)
+        server_channel = DummyChannel(q2, q1, closed, dest_address=local_address)
+
+        conn_coro = server.on_connected(server_channel)
+        task = asyncio.create_task(conn_coro)
+        client = DummyClient(local_address, dest_address, client_channel)
+        client._task = task
+        server._tasks.append(task)
+        return client
+
+    @implements(Client.close)
+    async def close(self):
+        await super().close()
+        self._task.cancel()
+        self._task = None
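A compact sketch of the dummy transport in action within one process, using only the classes defined above; the handler simply echoes one message back.

import asyncio

from xorbits._mars.oscar.backends.communication.dummy import DummyClient, DummyServer


async def main():
    async def echo_once(channel):
        # server-side handler: receive one message and send it back
        msg = await channel.recv()
        await channel.send(msg)

    server = await DummyServer.create(
        {"address": "dummy://0", "handle_channel": echo_once}
    )
    await server.start()

    client = await DummyClient.connect("dummy://0")
    await client.send("ping")
    print(await client.recv())  # -> "ping"

    await client.close()
    await server.stop()


asyncio.run(main())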
diff --git a/python/xorbits/_mars/oscar/backends/communication/errors.py b/python/xorbits/_mars/oscar/backends/communication/errors.py
new file mode 100644
index 000000000..c8cc79b6c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/errors.py
@@ -0,0 +1,19 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....core.base import MarsError
+
+
+class ChannelClosed(MarsError):
+    pass
diff --git a/python/xorbits/_mars/oscar/backends/communication/socket.py b/python/xorbits/_mars/oscar/backends/communication/socket.py
new file mode 100644
index 000000000..d2adffe53
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/socket.py
@@ -0,0 +1,363 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import os
+import socket
+import sys
+import tempfile
+from abc import ABCMeta
+from asyncio import AbstractServer, StreamReader, StreamWriter
+from functools import lru_cache
+from hashlib import md5
+from typing import Any, Callable, Coroutine, Dict, Type
+from urllib.parse import urlparse
+
+from ....serialization import AioDeserializer, AioSerializer, deserialize
+from ....utils import classproperty, implements, to_binary
+from .base import Channel, ChannelType, Client, Server
+from .core import register_client, register_server
+from .utils import read_buffers, write_buffers
+
+_is_windows: bool = sys.platform.startswith("win")
+
+
+class SocketChannel(Channel):
+    __slots__ = "reader", "writer", "_channel_type", "_send_lock", "_recv_lock"
+
+    name = "socket"
+
+    def __init__(
+        self,
+        reader: StreamReader,
+        writer: StreamWriter,
+        local_address: str = None,
+        dest_address: str = None,
+        compression: int = None,
+        channel_type: ChannelType = None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self.reader = reader
+        self.writer = writer
+        self._channel_type = channel_type
+
+        self._send_lock = asyncio.Lock()
+        self._recv_lock = asyncio.Lock()
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return self._channel_type
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        # get buffers
+        compress = self.compression or 0
+        serializer = AioSerializer(message, compress=compress)
+        buffers = await serializer.run()
+
+        # write buffers
+        write_buffers(self.writer, buffers)
+        async with self._send_lock:
+            # hold the lock while draining; otherwise concurrent sends
+            # may trigger assertion errors inside the stream writer
+            await self.writer.drain()
+
+    @implements(Channel.recv)
+    async def recv(self):
+        deserializer = AioDeserializer(self.reader)
+        async with self._recv_lock:
+            header = await deserializer.get_header()
+            buffers = await read_buffers(header, self.reader)
+        return deserialize(header, buffers)
+
+    @implements(Channel.close)
+    async def close(self):
+        self.writer.close()
+        try:
+            await self.writer.wait_closed()
+        except ConnectionResetError:  # pragma: no cover
+            pass
+
+    @property
+    @implements(Channel.closed)
+    def closed(self):
+        return self.writer.is_closing()
+
+
+class _BaseSocketServer(Server, metaclass=ABCMeta):
+    __slots__ = "_aio_server", "_channels"
+
+    def __init__(
+        self,
+        address: str,
+        aio_server: AbstractServer,
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        super().__init__(address, channel_handler)
+        # asyncio.Server
+        self._aio_server = aio_server
+        self._channels = []
+
+    @implements(Server.start)
+    async def start(self):
+        await self._aio_server.start_serving()
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        if timeout is None:
+            await self._aio_server.serve_forever()
+        else:
+            future = asyncio.create_task(self._aio_server.serve_forever())
+            try:
+                await asyncio.wait_for(future, timeout=timeout)
+            except (futures.TimeoutError, asyncio.TimeoutError):
+                future.cancel()
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        reader, writer = args
+        local_address = kwargs.pop("local_address", None)
+        dest_address = kwargs.pop("dest_address", None)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        channel = SocketChannel(
+            reader,
+            writer,
+            local_address=local_address,
+            dest_address=dest_address,
+            channel_type=self.channel_type,
+        )
+        self._channels.append(channel)
+        # hand the channel over to the registered channel handler
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._aio_server.close()
+        await self._aio_server.wait_closed()
+        # close all channels
+        await asyncio.gather(
+            *(channel.close() for channel in self._channels if not channel.closed)
+        )
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return not self._aio_server.is_serving()
+
+
+@register_server
+class SocketServer(_BaseSocketServer):
+    __slots__ = "host", "port"
+
+    scheme = None
+
+    def __init__(
+        self,
+        host: str,
+        port: int,
+        aio_server: AbstractServer,
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        address = f"{host}:{port}"
+        super().__init__(address, aio_server, channel_handler=channel_handler)
+        self.host = host
+        self.port = port
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return SocketClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.remote
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "Server":
+        config = config.copy()
+        if "address" in config:
+            address = config.pop("address")
+            host, port = address.split(":", 1)
+            port = int(port)
+        else:
+            host = config.pop("host")
+            port = int(config.pop("port"))
+        handle_channel = config.pop("handle_channel")
+        if "start_serving" not in config:
+            config["start_serving"] = False
+
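+        # ``handle_connection`` closes over ``server``, which is assigned after
+        # ``asyncio.start_server`` returns; since ``start_serving`` defaults to
+        # False here, no connection is handled before that assignment happens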
+        async def handle_connection(reader: StreamReader, writer: StreamWriter):
+            # create a channel when client connected
+            return await server.on_connected(
+                reader, writer, local_address=server.address
+            )
+
+        port = port if port != 0 else None
+        aio_server = await asyncio.start_server(
+            handle_connection, host=host, port=port, **config
+        )
+
+        # get port of the socket if not specified
+        if not port:
+            port = aio_server.sockets[0].getsockname()[1]
+
+        if _is_windows:
+            for sock in aio_server.sockets:
+                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, True)
+
+        server = SocketServer(host, port, aio_server, channel_handler=handle_channel)
+        return server
+
+
+@register_client
+class SocketClient(Client):
+    __slots__ = ()
+
+    scheme = SocketServer.scheme
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        host, port = dest_address.split(":", 1)
+        port = int(port)
+        (reader, writer) = await asyncio.open_connection(host=host, port=port, **kwargs)
+        channel = SocketChannel(
+            reader, writer, local_address=local_address, dest_address=dest_address
+        )
+        return SocketClient(local_address, dest_address, channel)
+
+
+TEMPDIR = tempfile.gettempdir()
+
+
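+# default path for a pool's unix socket, e.g. /tmp/mars/<md5 of the process index>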
+@lru_cache(100)
+def _gen_unix_socket_default_path(process_index):
+    return f"{TEMPDIR}/mars/{md5(to_binary(str(process_index))).hexdigest()}"  # nosec
+
+
+@register_server
+class UnixSocketServer(_BaseSocketServer):
+    __slots__ = "process_index", "path"
+
+    scheme = "unixsocket"
+
+    def __init__(
+        self,
+        process_index: int,
+        aio_server: AbstractServer,
+        path: str,
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        address = f"{self.scheme}:///{process_index}"
+        super().__init__(address, aio_server, channel_handler=channel_handler)
+        self.process_index = process_index
+        self.path = path
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return UnixSocketClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.ipc
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "Server":
+        config = config.copy()
+        if "address" in config:
+            process_index = int(urlparse(config.pop("address")).path.lstrip("/"))
+        else:
+            process_index = config.pop("process_index")
+        handle_channel = config.pop("handle_channel")
+        path = config.pop("path", _gen_unix_socket_default_path(process_index))
+
+        dirname = os.path.dirname(path)
+        if not os.path.exists(dirname):
+            os.makedirs(dirname, exist_ok=True)
+
+        if "start_serving" not in config:
+            config["start_serving"] = False
+
+        async def handle_connection(reader, writer):
+            # create a channel when client connected
+            return await server.on_connected(
+                reader, writer, local_address=server.address
+            )
+
+        aio_server = await asyncio.start_unix_server(
+            handle_connection, path=path, **config
+        )
+
+        for sock in aio_server.sockets:
+            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, True)
+
+        server = UnixSocketServer(
+            process_index, aio_server, path, channel_handler=handle_channel
+        )
+        return server
+
+    @implements(Server.stop)
+    async def stop(self):
+        await super().stop()
+        try:
+            os.remove(self.path)
+        except OSError:  # pragma: no cover
+            pass
+
+
+@register_client
+class UnixSocketClient(Client):
+    __slots__ = ()
+
+    scheme = UnixSocketServer.scheme
+
+    @staticmethod
+    @lru_cache(100)
+    def _get_process_index(addr):
+        return int(urlparse(addr).path.lstrip("/"))
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        process_index = UnixSocketClient._get_process_index(dest_address)
+        path = kwargs.pop("path", _gen_unix_socket_default_path(process_index))
+        try:
+            (reader, writer) = await asyncio.open_unix_connection(path, **kwargs)
+        except FileNotFoundError:
+            raise ConnectionRefusedError(
+                "Cannot connect unix socket due to file not exists"
+            )
+        channel = SocketChannel(
+            reader, writer, local_address=local_address, dest_address=dest_address
+        )
+        return UnixSocketClient(local_address, dest_address, channel)
diff --git a/python/xorbits/_mars/oscar/backends/communication/tests/__init__.py b/python/xorbits/_mars/oscar/backends/communication/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/communication/tests/test_comm.py b/python/xorbits/_mars/oscar/backends/communication/tests/test_comm.py
new file mode 100644
index 000000000..4600b662f
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/tests/test_comm.py
@@ -0,0 +1,228 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import multiprocessing
+import sys
+from typing import Dict, List, Tuple, Type, Union
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from .....lib.aio import AioEvent
+from .....tests.core import require_cudf, require_cupy
+from .....utils import get_next_port, lazy_import
+from .. import (
+    Channel,
+    DummyChannel,
+    DummyClient,
+    DummyServer,
+    Server,
+    SocketChannel,
+    SocketClient,
+    SocketServer,
+    UCXServer,
+    UnixSocketClient,
+    UnixSocketServer,
+    get_client_type,
+)
+from ..ucx import UCXInitializer
+
+test_data = np.random.RandomState(0).rand(10, 10)
+port = get_next_port()
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+ucp = lazy_import("ucp")
+
+
+def gen_params():
+    # server_type, config, con
+    params: List[Tuple[Type[Server], Dict, str]] = [
+        (SocketServer, dict(host="127.0.0.1", port=port), f"127.0.0.1:{port}"),
+    ]
+    if sys.platform != "win32":
+        params.append((UnixSocketServer, dict(process_index="0"), "unixsocket:///0"))
+    if ucp is not None:
+        ucp_port = get_next_port()
+        # test ucx
+        params.append(
+            (UCXServer, dict(host="127.0.0.1", port=ucp_port), f"127.0.0.1:{ucp_port}")
+        )
+    return params
+
+
+params = gen_params()
+local_params = gen_params().copy()
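+# dummy:// channels communicate through in-process queues, so DummyServer is
+# only included in the parameters for the in-process test below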
+local_params.append((DummyServer, dict(), "dummy://0"))
+
+
+@pytest.mark.parametrize("server_type, config, con", local_params)
+@pytest.mark.asyncio
+async def test_comm(server_type, config, con):
+    async def check_data(chan: Union[SocketChannel, DummyChannel]):
+        np.testing.assert_array_equal(test_data, await chan.recv())
+        await chan.send("success")
+
+    config = config.copy()
+    config["handle_channel"] = check_data
+
+    # create server
+    server = await server_type.create(config)
+    await server.start()
+    assert isinstance(server.info, dict)
+
+    # create client
+    client = await server_type.client_type.connect(con)
+    assert isinstance(client.info, dict)
+    assert isinstance(client.channel.info, dict)
+    await client.send(test_data)
+
+    assert "success" == await client.recv()
+
+    await client.close()
+    assert client.closed
+
+    # create client2
+    async with await server_type.client_type.connect(con) as client2:
+        assert not client2.closed
+    assert client2.closed
+
+    await server.join(0.001)
+    await server.stop()
+
+    assert server.stopped
+
+    if server_type is UCXServer:
+        UCXInitializer.reset()
+        # skip create server on same port for ucx
+        return
+
+    async with await server_type.create(config) as server2:
+        assert not server2.stopped
+    assert server2.stopped
+
+
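+# helper run in a child process: start a server of the given type and signal
+# readiness through the shared multiprocessing event so the parent can connect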
+def _wrap_test(server_started_event, conf, tp):
+    async def _test():
+        async def check_data(chan: SocketChannel):
+            np.testing.assert_array_equal(test_data, await chan.recv())
+            await chan.send("success")
+
+        nonlocal conf
+        conf = conf.copy()
+        conf["handle_channel"] = check_data
+
+        # create server
+        server = await tp.create(conf)
+        await server.start()
+        server_started_event.set()
+        await server.join()
+
+    asyncio.run(_test())
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("server_type, config, con", params)
+async def test_multiprocess_comm(server_type, config, con):
+    if server_type is UCXServer:
+        UCXInitializer.reset()
+
+    server_started = multiprocessing.Event()
+
+    p = multiprocessing.Process(
+        target=_wrap_test, args=(server_started, config, server_type)
+    )
+    p.daemon = True
+    p.start()
+
+    try:
+        await AioEvent(server_started).wait()
+
+        # create client
+        client = await server_type.client_type.connect(con)
+        await client.channel.send(test_data)
+
+        assert "success" == await client.recv()
+
+        await client.close()
+        assert client.closed
+    finally:
+        p.kill()
+
+
+cupy_data = np.arange(100).reshape((10, 10))
+cudf_data = pd.DataFrame({"col1": np.arange(10), "col2": [f"s{i}" for i in range(10)]})
+
+
+def _wrap_cuda_test(server_started_event, conf, tp):
+    async def _test():
+        async def check_data(chan: Channel):
+            import cupy
+
+            r = await chan.recv()
+
+            if isinstance(r, cupy.ndarray):
+                np.testing.assert_array_equal(cupy.asnumpy(r), cupy_data)
+            else:
+                pd.testing.assert_frame_equal(r.to_pandas(), cudf_data)
+            await chan.send("success")
+
+        conf["handle_channel"] = check_data
+
+        # create server
+        server = await tp.create(conf)
+        await server.start()
+        server_started_event.set()
+        await server.join()
+
+    asyncio.run(_test())
+
+
+@require_cupy
+@require_cudf
+@pytest.mark.parametrize("server_type", [SocketServer, UCXServer])
+@pytest.mark.asyncio
+async def test_multiprocess_cuda_comm(server_type):
+    mp_ctx = multiprocessing.get_context("spawn")
+
+    server_started = mp_ctx.Event()
+    port = get_next_port()
+    p = mp_ctx.Process(
+        target=_wrap_cuda_test,
+        args=(server_started, dict(host="127.0.0.1", port=port), server_type),
+    )
+    p.daemon = True
+    p.start()
+
+    await AioEvent(server_started).wait()
+
+    # create client
+    client = await server_type.client_type.connect(f"127.0.0.1:{port}")
+
+    await client.channel.send(cupy.asarray(cupy_data))
+    assert "success" == await client.recv()
+
+    client = await server_type.client_type.connect(f"127.0.0.1:{port}")
+
+    await client.channel.send(cudf.DataFrame(cudf_data))
+    assert "success" == await client.recv()
+
+    await client.close()
+
+
+def test_get_client_type():
+    assert issubclass(get_client_type("127.0.0.1"), SocketClient)
+    assert issubclass(get_client_type("unixsocket:///1"), UnixSocketClient)
+    assert issubclass(get_client_type("dummy://"), DummyClient)
diff --git a/python/xorbits/_mars/oscar/backends/communication/ucx.py b/python/xorbits/_mars/oscar/backends/communication/ucx.py
new file mode 100644
index 000000000..5026464fd
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/ucx.py
@@ -0,0 +1,481 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import functools
+import logging
+import os
+import weakref
+from typing import Any, Callable, Coroutine, Dict, List, Tuple, Type
+
+import cloudpickle
+import numpy as np
+
+from ....lib.nvutils import get_cuda_context, get_index_and_uuid
+from ....serialization import deserialize
+from ....serialization.aio import AioSerializer, BUFFER_SIZES_NAME, get_header_length
+from ....utils import classproperty, implements, lazy_import
+from .base import Channel, ChannelType, Client, Server
+from .core import register_client, register_server
+from .errors import ChannelClosed
+
+ucp = lazy_import("ucp")
+numba_cuda = lazy_import("numba.cuda")
+rmm = lazy_import("rmm")
+
+_warning_suffix = (
+    "This is often the result of a CUDA-enabled library calling a CUDA runtime function before "
+    "spawning worker processes. Please make sure any such function calls don't happen "
+    "at import time or in the global scope of a program."
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def synchronize_stream(stream: int = 0):
+    ctx = numba_cuda.current_context()
+    cu_stream = numba_cuda.driver.drvapi.cu_stream(stream)
+    stream = numba_cuda.driver.Stream(ctx, cu_stream, None)
+    stream.synchronize()
+
+
+class UCXInitializer:
+    _inited = False
+
+    @staticmethod
+    def _get_options(ucx_config: dict) -> Tuple[dict, dict]:
+        """
+        Get options and envs from ucx options in oscar config
+        """
+        options = dict()
+        envs = dict()
+
+        # if any of these flags are set (and not None), configure basic TLS
+        # settings for UCX; otherwise leave UCX with its default configuration
+        if any(ucx_config.get(name) for name in ["tcp", "nvlink", "infiniband"]):
+            if ucx_config.get("rdmacm"):  # pragma: no cover
+                tls = "tcp"
+                tls_priority = "rdmacm"
+            else:
+                tls = "tcp"
+                tls_priority = "tcp"
+
+            # CUDA COPY can optionally be used with ucx -- we rely on the user
+            # to define when messages will include CUDA objects.  Note:
+            # defining only the Infiniband flag will not enable cuda_copy
+            if any(
+                ucx_config.get(name) for name in ["nvlink", "cuda-copy"]
+            ):  # pragma: no cover
+                tls += ",cuda_copy"
+
+            if ucx_config.get("infiniband"):  # pragma: no cover
+                tls = "rc," + tls
+            if ucx_config.get("nvlink"):  # pragma: no cover
+                tls += ",cuda_ipc"
+
+            options["TLS"] = tls
+            options["SOCKADDR_TLS_PRIORITY"] = tls_priority
+        elif "UCX_TLS" in os.environ:  # pragma: no cover
+            options["TLS"] = os.environ["UCX_TLS"]
+
+        for k, v in ucx_config.get("environment", dict()).items():  # pragma: no cover
+            # {"some-name": value} is translated to {"UCX_SOME_NAME": value}
+            key = f'UCX_{"_".join(s.upper() for s in k.split("-"))}'
+            opt_key = key[4:]
+            if opt_key in options:
+                logger.warning(
+                    f"Ignoring {k}={v} (key={key}) in ucx.environment, "
+                    f"preferring {opt_key}={options[opt_key]} "
+                    "from high level options"
+                )
+            elif key in os.environ:
+                # This is only info because setting UCX configuration via
+                # environment variables is a reasonably common approach
+                logger.info(
+                    f"Ignoring {k}={v} (key={key}) in ucx.environment, "
+                    f"preferring {key}={os.environ[key]} from external environment"
+                )
+            else:
+                envs[key] = v
+
+        return options, envs
+
+    @staticmethod
+    def init(ucx_config: dict):
+        if UCXInitializer._inited:
+            return
+
+        options, envs = UCXInitializer._get_options(ucx_config)
+
+        # We ensure the CUDA context is created before initializing UCX. This can't
+        # be safely handled externally because communications start before
+        # preload scripts run.
+        # Precedence:
+        # 1. external environment
+        # 2. ucx_config (high level settings passed to ucp.init)
+        # 3. ucx_environment (low level settings equivalent to environment variables)
+        ucx_tls = os.environ.get("UCX_TLS", options.get("TLS", envs.get("UCX_TLS", "")))
+        if (
+            ucx_config.get("create-cuda-contex") is True
+            # This is not foolproof, if UCX_TLS=all we might require CUDA
+            # depending on configuration of UCX, but this is better than
+            # nothing
+            or ("cuda" in ucx_tls and "^cuda" not in ucx_tls)
+        ):
+            if numba_cuda is None:  # pragma: no cover
+                raise ImportError(
+                    "CUDA support with UCX requires Numba for context management"
+                )
+
+            pre_existing_cuda_context = get_cuda_context()
+            if pre_existing_cuda_context.has_context:
+                dev = pre_existing_cuda_context.device_info
+                logger.warning(
+                    f"A CUDA context for device {dev.device_index} ({str(dev.uuid)}) "
+                    f"already exists on process ID {os.getpid()}. {_warning_suffix}"
+                )
+
+            numba_cuda.current_context()
+
+            cuda_context_created = get_cuda_context()
+            cuda_visible_device = get_index_and_uuid(
+                os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+            )
+            if (
+                cuda_context_created.has_context
+                and cuda_context_created.device_info.uuid != cuda_visible_device.uuid
+            ):  # pragma: no cover
+                cuda_context_created_dev = cuda_context_created.device_info
+                logger.warning(
+                    f"Worker with process ID {os.getpid()} should have a CUDA context assigned to device "
+                    f"{cuda_visible_device.device_index} ({str(cuda_visible_device.uuid)}), "
+                    f"but instead the CUDA context is on device {cuda_context_created_dev.device_index} "
+                    f"({str(cuda_context_created_dev.uuid)}). {_warning_suffix}"
+                )
+
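+        # temporarily extend the environment with the UCX_* variables computed
+        # above so that ``ucp.init`` sees them, then restore the original
+        # environment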
+        original_environ = os.environ
+        new_environ = os.environ.copy()
+        new_environ.update(envs)
+        os.environ = new_environ
+        try:
+            ucp.init(
+                options=options, env_takes_precedence=True, blocking_progress_mode=False
+            )
+        finally:
+            os.environ = original_environ
+
+        UCXInitializer._inited = True
+
+    @staticmethod
+    def reset():
+        ucp.reset()
+        UCXInitializer._inited = False
+
+
+class UCXChannel(Channel):
+    __slots__ = (
+        "ucp_endpoint",
+        "_closed",
+        "_has_close_callback",
+        "_send_lock",
+        "_recv_lock",
+        "__weakref__",
+    )
+
+    name = "ucx"
+
+    def __init__(
+        self,
+        ucp_endpoint: "ucp.Endpoint",
+        local_address: str = None,
+        dest_address: str = None,
+        compression: int = None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self.ucp_endpoint = ucp_endpoint
+
+        self._send_lock = asyncio.Lock()
+        self._recv_lock = asyncio.Lock()
+
+        # When the UCX endpoint closes or errors the registered callback
+        # is called.
+        if hasattr(self.ucp_endpoint, "set_close_callback"):
+            ref = weakref.ref(self)
+            self.ucp_endpoint.set_close_callback(
+                functools.partial(UCXChannel._close_channel, ref)
+            )
+            self._closed = False
+            self._has_close_callback = True
+        else:  # pragma: no cover
+            # still initialize the flag so that ``recv`` and ``closed`` can
+            # safely read it even without a close callback
+            self._closed = False
+            self._has_close_callback = False
+
+    @staticmethod
+    def _close_channel(channel_ref: weakref.ReferenceType):
+        channel = channel_ref()
+        if channel is not None:
+            channel._closed = True
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return ChannelType.remote
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self.closed:
+            raise ChannelClosed("UCX Endpoint is closed, unable to send message")
+
+        compress = self.compression or 0
+        serializer = AioSerializer(message, compress=compress)
+        buffers = await serializer.run()
+        try:
+            # It is necessary to synchronize the default stream before we start
+            # sending. We synchronize the default stream because UCX is not
+            # stream-ordered, and syncing the default stream waits for other
+            # non-blocking CUDA streams. Note this is only sufficient if the
+            # memory being sent is not currently in use on non-blocking CUDA
+            # streams.
+            if any(hasattr(buf, "__cuda_array_interface__") for buf in buffers):
+                # has GPU buffer
+                synchronize_stream(0)
+
+            async with self._send_lock:
+                for buffer in buffers:
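+                    # zero-length buffers are skipped rather than pushed
+                    # through the endpoint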
+                    if (buffer.nbytes if hasattr(buffer, "nbytes") else len(buffer)) > 0:
+                        await self.ucp_endpoint.send(buffer)
+        except ucp.exceptions.UCXBaseException:  # pragma: no cover
+            self.abort()
+            raise ChannelClosed("While writing, the connection was closed")
+
+    @implements(Channel.recv)
+    async def recv(self):
+        async with self._recv_lock:
+            try:
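+                # wire format produced by ``AioSerializer``: a small fixed-size
+                # preamble (11 bytes) carrying the header length, then the
+                # pickled header, then the raw payload buffers one by one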
+                info_buffer = np.empty(11, dtype="u1").data
+                await self.ucp_endpoint.recv(info_buffer)
+                head_length = get_header_length(info_buffer)
+                header_buffer = np.empty(head_length, dtype="u1").data
+                await self.ucp_endpoint.recv(header_buffer)
+                header = cloudpickle.loads(header_buffer)
+
+                is_cuda_buffers = header[0].get("is_cuda_buffers")
+                buffer_sizes = header[0].pop(BUFFER_SIZES_NAME)
+
+                buffers = []
+                for is_cuda_buffer, buf_size in zip(is_cuda_buffers, buffer_sizes):
+                    if buf_size == 0:  # pragma: no cover
+                        buffers.append(bytes())
+                    elif is_cuda_buffer:
+                        cuda_buffer = rmm.DeviceBuffer(size=buf_size)
+                        await self.ucp_endpoint.recv(cuda_buffer)
+                        buffers.append(cuda_buffer)
+                    else:
+                        buffer = np.empty(buf_size, dtype="u1").data
+                        await self.ucp_endpoint.recv(buffer)
+                        buffers.append(buffer)
+            except BaseException as e:
+                if not self._closed:
+                    # In addition to UCX exceptions, this may be a CancelledError
+                    # or another "low-level" exception. The only safe thing to do
+                    # is to abort.
+                    self.abort()
+                    raise ChannelClosed(
+                        f"Connection closed by writer.\nInner exception: {e!r}"
+                    ) from e
+                else:
+                    raise EOFError("Server closed already")
+        return deserialize(header, buffers)
+
+    def abort(self):
+        self._closed = True
+        if self.ucp_endpoint is not None:
+            self.ucp_endpoint.abort()
+            self.ucp_endpoint = None
+
+    @implements(Channel.close)
+    async def close(self):
+        self._closed = True
+        if self.ucp_endpoint is not None:
+            await self.ucp_endpoint.close()
+            # abort
+            self.ucp_endpoint.abort()
+            self.ucp_endpoint = None
+
+    @property
+    @implements(Channel.closed)
+    def closed(self):
+        if self._has_close_callback:
+            # The self._closed flag is separate from the endpoint's lifetime: even
+            # when the endpoint has closed or errored, there may be messages in its
+            # buffer still to be received, even though sending is no longer possible.
+            return self._closed
+        else:  # pragma: no cover
+            return self.ucp_endpoint is None
+
+
+@register_server
+class UCXServer(Server):
+    __slots__ = "host", "port", "_ucp_listener", "_channels", "_closed"
+
+    scheme = "ucx"
+
+    _ucp_listener: "ucp.Listener"
+    _channels: List[UCXChannel]
+
+    def __init__(
+        self,
+        host: str,
+        port: int,
+        ucp_listener: "ucp.Listener",
+        channel_handler: Callable[[Channel], Coroutine] = None,
+    ):
+        super().__init__(f"{UCXServer.scheme}://{host}:{port}", channel_handler)
+        self.host = host
+        self.port = port
+        self._ucp_listener = ucp_listener
+        self._channels = []
+        self._closed = asyncio.Event()
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return UCXClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.remote
+
+    @staticmethod
+    async def create(config: Dict) -> "Server":
+        config = config.copy()
+        if "address" in config:
+            address = config.pop("address")
+            prefix = f"{UCXServer.scheme}://"
+            if address.startswith(prefix):
+                address = address[len(prefix) :]
+            host, port = address.split(":", 1)
+            port = int(port)
+        else:
+            host = config.pop("host")
+            port = int(config.pop("port"))
+        handle_channel = config.pop("handle_channel")
+
+        # init
+        UCXInitializer.init(config.get("ucx", dict()))
+
+        async def serve_forever(client_ucp_endpoint: "ucp.Endpoint"):
+            try:
+                await server.on_connected(
+                    client_ucp_endpoint, local_address=server.address
+                )
+            except ChannelClosed:  # pragma: no cover
+                logger.debug("Connection closed before handshake completed")
+                return
+
+        ucp_listener = ucp.create_listener(serve_forever, port=port)
+
+        # get port of the ucp listener if not specified
+        if not port:
+            port = ucp_listener.port
+
+        server = UCXServer(host, port, ucp_listener, channel_handler=handle_channel)
+        return server
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        return config
+
+    @implements(Server.start)
+    async def start(self):
+        pass
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        wait_coro = self._closed.wait()
+        try:
+            await asyncio.wait_for(wait_coro, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):
+            pass
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        (ucp_endpoint,) = args
+        local_address = kwargs.pop("local_address", None)
+        dest_address = kwargs.pop("dest_address", None)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        channel = UCXChannel(
+            ucp_endpoint, local_address=local_address, dest_address=dest_address
+        )
+        self._channels.append(channel)
+        # hand the channel over to the registered channel handler
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._ucp_listener.close()
+        # close all channels
+        await asyncio.gather(
+            *(channel.close() for channel in self._channels if not channel.closed)
+        )
+        self._ucp_listener = None
+        self._closed.set()
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return self._ucp_listener is None
+
+
+@register_client
+class UCXClient(Client):
+    __slots__ = ()
+
+    scheme = UCXServer.scheme
+
+    @classmethod
+    def parse_config(cls, config: dict) -> dict:
+        return config
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        prefix = f"{UCXClient.scheme}://"
+        if dest_address.startswith(prefix):
+            dest_address = dest_address[len(prefix) :]
+        host, port = dest_address.split(":", 1)
+        port = int(port)
+        kwargs = kwargs.copy()
+        ucx_config = kwargs.pop("config", dict()).get("ucx", dict())
+        UCXInitializer.init(ucx_config)
+
+        try:
+            ucp_endpoint = await ucp.create_endpoint(host, port)
+        except ucp.exceptions.UCXBaseException:  # pragma: no cover
+            raise ChannelClosed("Connection closed before handshake completed")
+        channel = UCXChannel(
+            ucp_endpoint, local_address=local_address, dest_address=dest_address
+        )
+        return UCXClient(local_address, dest_address, channel)
diff --git a/python/xorbits/_mars/oscar/backends/communication/utils.py b/python/xorbits/_mars/oscar/backends/communication/utils.py
new file mode 100644
index 000000000..7fa11a659
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/communication/utils.py
@@ -0,0 +1,96 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from asyncio import StreamReader, StreamWriter
+from typing import Dict, List, Union
+
+import numpy as np
+
+from ....serialization.aio import BUFFER_SIZES_NAME
+from ....utils import lazy_import
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+rmm = lazy_import("rmm")
+
+CUDA_CHUNK_SIZE = 16 * 1024**2
+
+
+def _convert_to_cupy_ndarray(
+    cuda_buffer: Union["cupy.ndarray", "rmm.DeviceBuffer"]
+) -> "cupy.ndarray":
+    if isinstance(cuda_buffer, cupy.ndarray):
+        return cuda_buffer
+
+    size = cuda_buffer.nbytes
+    data = cuda_buffer.__cuda_array_interface__["data"][0]
+    memory = cupy.cuda.UnownedMemory(data, size, cuda_buffer)
+    ptr = cupy.cuda.MemoryPointer(memory, 0)
+    return cupy.ndarray(shape=size, dtype="u1", memptr=ptr)
+
+
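+# CUDA buffers are staged through host memory in CUDA_CHUNK_SIZE (16 MiB)
+# chunks so that large device buffers can be written to the stream without a
+# single huge device-to-host copy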
+def write_buffers(writer: StreamWriter, buffers: List):
+    def _write_cuda_buffer(cuda_buffer: Union["cupy.ndarray", "rmm.DeviceBuffer"]):
+        # convert cuda buffer to cupy ndarray
+        cuda_buffer = _convert_to_cupy_ndarray(cuda_buffer)
+
+        chunk_size = CUDA_CHUNK_SIZE
+        offset = 0
+        nbytes = cuda_buffer.nbytes
+        while offset < nbytes:
+            size = chunk_size if (offset + chunk_size) < nbytes else nbytes - offset
+            # slice on cupy ndarray
+            chunk_buffer = cuda_buffer[offset : offset + size]
+            # `get` will return numpy ndarray,
+            # write its data which is a memoryview into writer
+            writer.write(chunk_buffer.get().data)
+            offset += size
+
+    for buffer in buffers:
+        if hasattr(buffer, "__cuda_array_interface__"):
+            # GPU buffer
+            _write_cuda_buffer(buffer)
+        else:
+            writer.write(buffer)
+
+
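+# mirror of ``write_buffers``: CUDA buffers are rebuilt by reading host-side
+# chunks and copying them into an ``rmm.DeviceBuffer`` through a cupy view;
+# plain buffers are read back with a single ``readexactly`` call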
+async def read_buffers(header: Dict, reader: StreamReader):
+    is_cuda_buffers = header[0].get("is_cuda_buffers")
+    buffer_sizes = header[0].pop(BUFFER_SIZES_NAME)
+
+    buffers = []
+    for is_cuda_buffer, buf_size in zip(is_cuda_buffers, buffer_sizes):
+        if is_cuda_buffer:  # pragma: no cover
+            if buf_size == 0:
+                content = await reader.readexactly(buf_size)
+                buffers.append(content)
+            else:
+                buffer = rmm.DeviceBuffer(size=buf_size)
+                arr = _convert_to_cupy_ndarray(buffer)
+                offset = 0
+                chunk_size = CUDA_CHUNK_SIZE
+                while offset < buf_size:
+                    read_size = (
+                        chunk_size
+                        if (offset + chunk_size) < buf_size
+                        else buf_size - offset
+                    )
+                    content = await reader.readexactly(read_size)
+                    chunk_arr = np.frombuffer(content, dtype="u1")
+                    arr[offset : offset + len(content)].set(chunk_arr)
+                    offset += read_size
+                buffers.append(buffer)
+        else:
+            buffers.append(await reader.readexactly(buf_size))
+    return buffers
diff --git a/python/xorbits/_mars/oscar/backends/config.py b/python/xorbits/_mars/oscar/backends/config.py
new file mode 100644
index 000000000..a05055f52
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/config.py
@@ -0,0 +1,137 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Union
+
+
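+# ``ActorPoolConfig`` wraps a plain nested dict whose layout is roughly:
+#
+#     {
+#         "pools": {<process index>: {"label": ..., "internal_address": ...,
+#                                     "external_address": [...], ...}},
+#         "mapping": {<external address>: <internal address>},
+#         "metrics": {...},
+#         "comm": {...},
+#     }
+#
+# (the per-pool keys are the ones filled in by ``add_pool_conf`` below)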
+class ActorPoolConfig:
+    __slots__ = ("_conf",)
+
+    def __init__(self, conf: Dict = None):
+        if conf is None:
+            conf = dict()
+        self._conf = conf
+        if "pools" not in self._conf:
+            self._conf["pools"] = dict()
+        if "mapping" not in self._conf:
+            self._conf["mapping"] = dict()
+        if "metrics" not in self._conf:
+            self._conf["metrics"] = dict()
+        if "comm" not in self._conf:
+            self._conf["comm"] = dict()
+
+    @property
+    def n_pool(self):
+        return len(self._conf["pools"])
+
+    def add_pool_conf(
+        self,
+        process_index: int,
+        label: str,
+        internal_address: str,
+        external_address: Union[str, List[str]],
+        env: Dict = None,
+        modules: List[str] = None,
+        suspend_sigint: bool = False,
+        use_uvloop: bool = False,
+        logging_conf: Dict = None,
+        kwargs: Dict = None,
+    ):
+        pools: Dict = self._conf["pools"]
+        if not isinstance(external_address, list):
+            external_address = [external_address]
+        pools[process_index] = {
+            "label": label,
+            "internal_address": internal_address,
+            "external_address": external_address,
+            "env": env,
+            "modules": modules,
+            "suspend_sigint": suspend_sigint,
+            "use_uvloop": use_uvloop,
+            "logging_conf": logging_conf,
+            "kwargs": kwargs or {},
+        }
+
+        mapping: Dict = self._conf["mapping"]
+        for addr in external_address:
+            mapping[addr] = internal_address
+
+    def get_pool_config(self, process_index: int):
+        return self._conf["pools"][process_index]
+
+    def get_external_address(self, process_index: int) -> str:
+        return self._conf["pools"][process_index]["external_address"][0]
+
+    def get_process_indexes(self):
+        return list(self._conf["pools"])
+
+    def get_process_index(self, external_address: str):
+        for process_index, conf in self._conf["pools"].items():
+            if external_address in conf["external_address"]:
+                return process_index
+        raise ValueError(
+            f"Cannot get process_index for {external_address}"
+        )  # pragma: no cover
+
+    def reset_pool_external_address(
+        self,
+        process_index: int,
+        external_address: Union[str, List[str]],
+    ):
+        if not isinstance(external_address, list):
+            external_address = [external_address]
+        cur_pool_config = self._conf["pools"][process_index]
+        internal_address = cur_pool_config["internal_address"]
+
+        mapping: Dict = self._conf["mapping"]
+        for addr in cur_pool_config["external_address"]:
+            if internal_address == addr:
+                # internal address may be the same as external address in Windows
+                internal_address = external_address[0]
+            mapping.pop(addr, None)
+
+        cur_pool_config["external_address"] = external_address
+        for addr in external_address:
+            mapping[addr] = internal_address
+
+    def get_external_addresses(self, label=None) -> List[str]:
+        result = []
+        for c in self._conf["pools"].values():
+            if label is not None:
+                if label == c["label"]:
+                    result.append(c["external_address"][0])
+            else:
+                result.append(c["external_address"][0])
+        return result
+
+    @property
+    def external_to_internal_address_map(self) -> Dict[str, str]:
+        return self._conf["mapping"]
+
+    def as_dict(self):
+        return self._conf
+
+    def add_metric_configs(self, metrics: Dict[str, Any]):
+        if metrics:
+            self._conf["metrics"].update(metrics)
+
+    def get_metric_configs(self):
+        return self._conf["metrics"]
+
+    def add_comm_config(self, comm_config: Dict[str, Any]):
+        if comm_config:
+            self._conf["comm"].update(comm_config)
+
+    def get_comm_config(self) -> dict:
+        return self._conf["comm"]
diff --git a/python/xorbits/_mars/oscar/backends/context.py b/python/xorbits/_mars/oscar/backends/context.py
new file mode 100644
index 000000000..68027af67
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/context.py
@@ -0,0 +1,242 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from dataclasses import dataclass
+from typing import Tuple, Type, Union
+
+from ...utils import dataslots, to_binary
+from ..api import Actor
+from ..context import BaseActorContext
+from ..core import ActorRef, create_local_actor_ref
+from ..debug import debug_async_timeout, detect_cycle_send
+from ..errors import CannotCancelTask
+from ..utils import create_actor_ref
+from .allocate_strategy import AddressSpecified, AllocateStrategy
+from .core import ActorCaller
+from .message import (
+    DEFAULT_PROTOCOL,
+    ActorRefMessage,
+    CancelMessage,
+    ControlMessage,
+    ControlMessageType,
+    CreateActorMessage,
+    DestroyActorMessage,
+    ErrorMessage,
+    HasActorMessage,
+    ResultMessage,
+    SendMessage,
+    _MessageBase,
+    new_message_id,
+)
+from .router import Router
+
+
+@dataslots
+@dataclass
+class ProfilingContext:
+    task_id: str
+
+
+class MarsActorContext(BaseActorContext):
+    __slots__ = ("_caller",)
+
+    support_allocate_strategy = True
+
+    def __init__(self, address: str = None):
+        BaseActorContext.__init__(self, address)
+        self._caller = ActorCaller()
+
+    def __del__(self):
+        self._caller.cancel_tasks()
+
+    async def _call(
+        self, address: str, message: _MessageBase, wait: bool = True
+    ) -> Union[ResultMessage, ErrorMessage, asyncio.Future]:
+        return await self._caller.call(
+            Router.get_instance_or_empty(), address, message, wait=wait
+        )
+
+    @staticmethod
+    def _process_result_message(message: Union[ResultMessage, ErrorMessage]):
+        if isinstance(message, ResultMessage):
+            return message.result
+        else:
+            raise message.as_instanceof_cause()
+
+    async def _wait(self, future: asyncio.Future, address: str, message: _MessageBase):
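+        # shield the pending reply from local cancellation; on cancellation we
+        # ask the remote side to cancel the message instead, and only surface
+        # CancelledError if the remote task can no longer be cancelled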
+        try:
+            await asyncio.shield(future)
+        except asyncio.CancelledError:
+            try:
+                await self.cancel(address, message.message_id)
+            except CannotCancelTask:
+                # cancel failed, already finished
+                raise asyncio.CancelledError
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            pass
+        return await future
+
+    async def create_actor(
+        self, actor_cls: Type[Actor], *args, uid=None, address: str = None, **kwargs
+    ) -> ActorRef:
+        router = Router.get_instance_or_empty()
+        address = address or self._address or router.external_address
+        allocate_strategy = kwargs.get("allocate_strategy", None)
+        if isinstance(allocate_strategy, AllocateStrategy):
+            allocate_strategy = kwargs.pop("allocate_strategy")
+        else:
+            allocate_strategy = AddressSpecified(address)
+        create_actor_message = CreateActorMessage(
+            new_message_id(),
+            actor_cls,
+            to_binary(uid),
+            args,
+            kwargs,
+            allocate_strategy,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        future = await self._call(address, create_actor_message, wait=False)
+        result = await self._wait(future, address, create_actor_message)
+        return self._process_result_message(result)
+
+    async def has_actor(self, actor_ref: ActorRef) -> bool:
+        message = HasActorMessage(
+            new_message_id(), actor_ref, protocol=DEFAULT_PROTOCOL
+        )
+        future = await self._call(actor_ref.address, message, wait=False)
+        result = await self._wait(future, actor_ref.address, message)
+        return self._process_result_message(result)
+
+    async def destroy_actor(self, actor_ref: ActorRef):
+        message = DestroyActorMessage(
+            new_message_id(), actor_ref, protocol=DEFAULT_PROTOCOL
+        )
+        future = await self._call(actor_ref.address, message, wait=False)
+        result = await self._wait(future, actor_ref.address, message)
+        return self._process_result_message(result)
+
+    async def kill_actor(self, actor_ref: ActorRef, force: bool = True):
+        # get main_pool_address
+        control_message = ControlMessage(
+            new_message_id(),
+            actor_ref.address,
+            ControlMessageType.get_config,
+            "main_pool_address",
+            protocol=DEFAULT_PROTOCOL,
+        )
+        main_address = self._process_result_message(
+            await self._call(actor_ref.address, control_message)
+        )
+        real_actor_ref = await self.actor_ref(actor_ref)
+        if real_actor_ref.address == main_address:
+            raise ValueError("Cannot kill actor on main pool")
+        stop_message = ControlMessage(
+            new_message_id(),
+            real_actor_ref.address,
+            ControlMessageType.stop,
+            # default timeout (3 secs) and force
+            (3.0, force),
+            protocol=DEFAULT_PROTOCOL,
+        )
+        # stop server
+        result = await self._call(main_address, stop_message)
+        return self._process_result_message(result)
+
+    async def actor_ref(self, *args, **kwargs):
+        actor_ref = create_actor_ref(*args, **kwargs)
+        local_actor_ref = create_local_actor_ref(actor_ref.address, actor_ref.uid)
+        if local_actor_ref is not None:
+            return local_actor_ref
+        message = ActorRefMessage(
+            new_message_id(), actor_ref, protocol=DEFAULT_PROTOCOL
+        )
+        future = await self._call(actor_ref.address, message, wait=False)
+        result = await self._wait(future, actor_ref.address, message)
+        return self._process_result_message(result)
+
+    async def send(
+        self,
+        actor_ref: ActorRef,
+        message: Tuple,
+        wait_response: bool = True,
+        profiling_context: ProfilingContext = None,
+    ):
+        message = SendMessage(
+            new_message_id(),
+            actor_ref,
+            message,
+            protocol=DEFAULT_PROTOCOL,
+            profiling_context=profiling_context,
+        )
+
+        # use `%.500r` to avoid printing overly long messages
+        with debug_async_timeout(
+            "actor_call_timeout",
+            "Calling %.500r on %s at %s timed out",
+            message.content,
+            actor_ref.uid,
+            actor_ref.address,
+        ):
+            detect_cycle_send(message, wait_response)
+            future = await self._call(actor_ref.address, message, wait=False)
+            if wait_response:
+                result = await self._wait(future, actor_ref.address, message)
+                return self._process_result_message(result)
+            else:
+                return future
+
+    async def cancel(self, address: str, cancel_message_id: bytes):
+        message = CancelMessage(
+            new_message_id(), address, cancel_message_id, protocol=DEFAULT_PROTOCOL
+        )
+        result = await self._call(address, message)
+        return self._process_result_message(result)
+
+    async def wait_actor_pool_recovered(self, address: str, main_address: str = None):
+        if main_address is None:
+            # get main_pool_address
+            control_message = ControlMessage(
+                new_message_id(),
+                address,
+                ControlMessageType.get_config,
+                "main_pool_address",
+                protocol=DEFAULT_PROTOCOL,
+            )
+            main_address = self._process_result_message(
+                await self._call(address, control_message)
+            )
+
+        # if address is main pool, it is never recovered
+        if address == main_address:
+            return
+
+        control_message = ControlMessage(
+            new_message_id(),
+            address,
+            ControlMessageType.wait_pool_recovered,
+            None,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        self._process_result_message(await self._call(main_address, control_message))
+
+    async def get_pool_config(self, address: str):
+        control_message = ControlMessage(
+            new_message_id(),
+            address,
+            ControlMessageType.get_config,
+            None,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        return self._process_result_message(await self._call(address, control_message))
diff --git a/python/xorbits/_mars/oscar/backends/core.py b/python/xorbits/_mars/oscar/backends/core.py
new file mode 100644
index 000000000..157f9abd0
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/core.py
@@ -0,0 +1,140 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import copy
+import logging
+from typing import Dict, Union
+
+from ...oscar.profiling import ProfilingData
+from ...utils import Timer
+from ..errors import ServerClosed
+from .communication import Client
+from .message import DeserializeMessageFailed, ErrorMessage, ResultMessage, _MessageBase
+from .router import Router
+
+ResultMessageType = Union[ResultMessage, ErrorMessage]
+logger = logging.getLogger(__name__)
+
+
+class ActorCaller:
+    __slots__ = "_client_to_message_futures", "_clients"
+
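+    # ``ActorCaller`` multiplexes requests over a single client per destination:
+    # ``call`` registers a per-message future keyed by message id, and the
+    # background ``_listen`` task created in ``get_client`` resolves that future
+    # when the matching reply arrives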
+    def __init__(self):
+        self._client_to_message_futures: Dict[
+            Client, Dict[bytes, asyncio.Future]
+        ] = dict()
+        self._clients: Dict[Client, asyncio.Task] = dict()
+
+    async def get_client(self, router: Router, dest_address: str) -> Client:
+        client = await router.get_client(dest_address, from_who=self)
+        if client not in self._clients:
+            self._clients[client] = asyncio.create_task(self._listen(client))
+            self._client_to_message_futures[client] = dict()
+            client_count = len(self._clients)
+            if client_count >= 100:  # pragma: no cover
+                if (client_count - 100) % 10 == 0:  # pragma: no cover
+                    logger.warning(
+                        "Actor caller has created too many clients (%s >= 100), "
+                        "the global router may not be set.",
+                        client_count,
+                    )
+        return client
+
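+    # Per-client background task: receive responses in a loop, resolve the
+    # pending futures registered in ``call``, and fail all remaining futures
+    # once the remote server closes the connection.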
+    async def _listen(self, client: Client):
+        while not client.closed:
+            try:
+                try:
+                    message: _MessageBase = await client.recv()
+                except (EOFError, ConnectionError, BrokenPipeError):
+                    # remote server closed, close client and raise ServerClosed
+                    try:
+                        await client.close()
+                    except (ConnectionError, BrokenPipeError):
+                        # close failed, ignore it
+                        pass
+                    raise ServerClosed(
+                        f"Remote server {client.dest_address} closed"
+                    ) from None
+                future = self._client_to_message_futures[client].pop(message.message_id)
+                future.set_result(message)
+            except DeserializeMessageFailed as e:
+                message_id = e.message_id
+                future = self._client_to_message_futures[client].pop(message_id)
+                future.set_exception(e.__cause__)
+            except Exception as e:  # noqa: E722  # pylint: disable=bare-except
+                message_futures = self._client_to_message_futures.get(client)
+                self._client_to_message_futures[client] = dict()
+                for future in message_futures.values():
+                    future.set_exception(copy.copy(e))
+            finally:
+                # the message may hold a Ray ObjectRef; delete it early in case
+                # the next loop iteration doesn't run as soon as expected.
+                try:
+                    del message
+                except NameError:
+                    pass
+                try:
+                    del future
+                except NameError:
+                    pass
+                await asyncio.sleep(0)
+
+        message_futures = self._client_to_message_futures.get(client)
+        self._client_to_message_futures[client] = dict()
+        error = ServerClosed(f"Remote server {client.dest_address} closed")
+        for future in message_futures.values():
+            future.set_exception(copy.copy(error))
+
+    async def call(
+        self,
+        router: Router,
+        dest_address: str,
+        message: _MessageBase,
+        wait: bool = True,
+    ) -> Union[ResultMessage, ErrorMessage, asyncio.Future]:
+        client = await self.get_client(router, dest_address)
+        loop = asyncio.get_running_loop()
+        wait_response = loop.create_future()
+        self._client_to_message_futures[client][message.message_id] = wait_response
+
+        with Timer() as timer:
+            try:
+                await client.send(message)
+            except ConnectionError:
+                try:
+                    await client.close()
+                except ConnectionError:
+                    # close failed, ignore it
+                    pass
+                raise ServerClosed(f"Remote server {client.dest_address} closed")
+
+            if not wait:
+                r = wait_response
+            else:
+                r = await wait_response
+
+        ProfilingData.collect_actor_call(message, timer.duration)
+        return r
+
+    async def stop(self):
+        try:
+            await asyncio.gather(*[client.close() for client in self._clients])
+        except (ConnectionError, ServerClosed):
+            pass
+        self.cancel_tasks()
+
+    def cancel_tasks(self):
+        # cancel listening for all clients
+        _ = [task.cancel() for task in self._clients.values()]
diff --git a/python/xorbits/_mars/oscar/backends/mars/__init__.py b/python/xorbits/_mars/oscar/backends/mars/__init__.py
new file mode 100644
index 000000000..006392541
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .backend import MarsActorBackend
diff --git a/python/xorbits/_mars/oscar/backends/mars/backend.py b/python/xorbits/_mars/oscar/backends/mars/backend.py
new file mode 100644
index 000000000..e050380f5
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/backend.py
@@ -0,0 +1,72 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from ...backend import BaseActorBackend, register_backend
+from ..context import MarsActorContext
+from .driver import MarsActorDriver
+from .pool import MainActorPool
+
+__all__ = ["MarsActorBackend"]
+
+
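+# Extend the pool arguments with extra sub-processes dedicated to IO, reusing
+# the main process' address scheme and internal-address setting for each.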
+def build_pool_kwargs(n_process: int, kwargs: Dict):
+    n_io_process = kwargs.pop("n_io_process", 0)
+    if n_io_process:
+        n_process += n_io_process
+
+        labels = kwargs["labels"]
+        envs = kwargs["envs"]
+        external_address_schemes = kwargs["external_address_schemes"]
+        enable_internal_addresses = kwargs["enable_internal_addresses"]
+        # sub-pools for IO (transfer and spill)
+        for _ in range(n_io_process):
+            if envs:  # pragma: no cover
+                envs.append(dict())
+            labels.append("io")
+            if external_address_schemes:
+                # just use the main process' scheme for the IO processes
+                external_address_schemes.append(external_address_schemes[0])
+            if enable_internal_addresses:
+                # just use the main process' setting for the IO processes
+                enable_internal_addresses.append(enable_internal_addresses[0])
+
+    return n_process, kwargs
+
+
+@register_backend
+class MarsActorBackend(BaseActorBackend):
+    @staticmethod
+    def name():
+        # None means Mars is the default scheme;
+        # ucx is recognized as a Mars backend as well
+        return [None, "ucx"]
+
+    @staticmethod
+    def get_context_cls():
+        return MarsActorContext
+
+    @staticmethod
+    def get_driver_cls():
+        return MarsActorDriver
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        from ..pool import create_actor_pool
+
+        n_process, kwargs = build_pool_kwargs(n_process, kwargs)
+        return await create_actor_pool(
+            address, pool_cls=MainActorPool, n_process=n_process, **kwargs
+        )
diff --git a/python/xorbits/_mars/oscar/backends/mars/driver.py b/python/xorbits/_mars/oscar/backends/mars/driver.py
new file mode 100644
index 000000000..67171171c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/driver.py
@@ -0,0 +1,25 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numbers import Number
+from typing import Dict
+
+from ...driver import BaseActorDriver
+
+
+class MarsActorDriver(BaseActorDriver):
+    @classmethod
+    def setup_cluster(cls, address_to_resources: Dict[str, Dict[str, Number]]):
+        # nothing needs to be done in the driver of the Mars backend
+        pass
diff --git a/python/xorbits/_mars/oscar/backends/mars/pool.py b/python/xorbits/_mars/oscar/backends/mars/pool.py
new file mode 100644
index 000000000..04762f088
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/pool.py
@@ -0,0 +1,343 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import atexit
+import concurrent.futures as futures
+import configparser
+import contextlib
+import itertools
+import logging.config
+import multiprocessing
+import os
+import random
+import signal
+import sys
+import threading
+import uuid
+from dataclasses import dataclass
+from types import TracebackType
+from typing import List
+
+from ....utils import (
+    clean_mars_tmp_dir,
+    dataslots,
+    ensure_coverage,
+    reset_id_random_seed,
+)
+from ..config import ActorPoolConfig
+from ..message import CreateActorMessage
+from ..pool import MainActorPoolBase, SubActorPoolBase, _register_message_handler
+
+atexit.register(clean_mars_tmp_dir)
+
+_is_windows: bool = sys.platform.startswith("win")
+
+if sys.version_info[:2] == (3, 9):
+    # fix for Python 3.9, see https://bugs.python.org/issue43517
+    if sys.platform == "win32":
+        from multiprocessing import popen_spawn_win32 as popen_spawn
+
+        popen_forkserver = popen_fork = synchronize = None
+    else:
+        from multiprocessing import popen_fork, popen_forkserver
+        from multiprocessing import popen_spawn_posix as popen_spawn
+        from multiprocessing import synchronize
+    _ = popen_spawn, popen_forkserver, popen_fork, synchronize
+    del _
+elif sys.version_info[:2] == (3, 6):  # pragma: no cover
+    from multiprocessing.process import BaseProcess
+
+    # define kill method for multiprocessing
+    def _mp_kill(self):
+        if not _is_windows:
+            try:
+                os.kill(self.pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+            except OSError:
+                if self.wait(timeout=0.1) is None:
+                    raise
+        else:
+            self.terminate()
+
+    BaseProcess.kill = _mp_kill
+
+logger = logging.getLogger(__name__)
+_init_main_suspended_local = threading.local()
+
+
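+# Patch multiprocessing's spawn preparation data so that, while Mars is
+# starting sub pools, child processes do not try to re-import the user's
+# __main__ module.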
+def _patch_spawn_get_preparation_data():
+    try:
+        from multiprocessing import spawn as mp_spawn
+
+        _raw_get_preparation_data = mp_spawn.get_preparation_data
+
+        def _patched_get_preparation_data(*args, **kw):
+            ret = _raw_get_preparation_data(*args, **kw)
+            if getattr(_init_main_suspended_local, "value", False):
+                # make sure the user's __main__ module is not imported when starting the Mars cluster
+                ret.pop("init_main_from_name", None)
+                ret.pop("init_main_from_path", None)
+            return ret
+
+        _patched_get_preparation_data._mars_patched = True
+        if not getattr(mp_spawn.get_preparation_data, "_mars_patched", False):
+            mp_spawn.get_preparation_data = _patched_get_preparation_data
+    except (ImportError, AttributeError):  # pragma: no cover
+        pass
+
+
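+# While this context is active, the patched get_preparation_data above drops
+# the __main__ information from the data sent to spawned children.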
+@contextlib.contextmanager
+def _suspend_init_main():
+    try:
+        _init_main_suspended_local.value = True
+        yield
+    finally:
+        _init_main_suspended_local.value = False
+
+
+@dataslots
+@dataclass
+class SubpoolStatus:
+    # status: 0 means succeeded, 1 means failed
+    status: int = None
+    external_addresses: List[str] = None
+    error: BaseException = None
+    traceback: TracebackType = None
+
+
+@_register_message_handler
+class MainActorPool(MainActorPoolBase):
+    @classmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        """Get external address for every process"""
+        if ":" in address:
+            host, port = address.split(":", 1)
+            port = int(port)
+            if ports:
+                if len(ports) != n_process:
+                    raise ValueError(
+                        f"`ports` specified, but its count "
+                        f"is not equal to `n_process`, "
+                        f"number of ports: {len(ports)}, "
+                        f"n_process: {n_process}"
+                    )
+                sub_ports = ports
+            else:
+                sub_ports = [0] * n_process
+        else:
+            host = address
+            if ports and len(ports) != n_process + 1:
+                # ports specified, the first of which should be the main port
+                raise ValueError(
+                    f"`ports` specified, but its count "
+                    f"is not equal to `n_process` + 1, "
+                    f"number of ports: {len(ports)}, "
+                    f"n_process + 1: {n_process + 1}"
+                )
+            elif not ports:
+                ports = [0] * (n_process + 1)
+            port = ports[0]
+            sub_ports = ports[1:]
+        if not schemes:
+            prefix_iter = itertools.repeat("")
+        else:
+            prefix_iter = [f"{scheme}://" if scheme else "" for scheme in schemes]
+        return [
+            f"{prefix}{host}:{port}"
+            for port, prefix in zip([port] + sub_ports, prefix_iter)
+        ]
+
+    @classmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
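+        # prefer unix sockets for communication within the node when the
+        # platform supports them; otherwise fall back to the external address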
+        if hasattr(asyncio, "start_unix_server"):
+            return f"unixsocket:///{process_index}"
+        else:
+            return external_address
+
+    @classmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        def start_pool_in_process():
+            ctx = multiprocessing.get_context(method=start_method)
+            status_queue = ctx.Queue()
+
+            with _suspend_init_main():
+                process = ctx.Process(
+                    target=cls._start_sub_pool,
+                    args=(actor_pool_config, process_index, status_queue),
+                    name=f"MarsActorPool{process_index}",
+                )
+                process.daemon = True
+                process.start()
+
+            # wait for sub actor pool to finish starting
+            process_status = status_queue.get()
+            return process, process_status
+
+        _patch_spawn_get_preparation_data()
+        loop = asyncio.get_running_loop()
+        with futures.ThreadPoolExecutor(1) as executor:
+            create_pool_task = loop.run_in_executor(executor, start_pool_in_process)
+            return await create_pool_task
+
+    @classmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        processes = []
+        ext_addresses = []
+        for task in create_pool_tasks:
+            process, status = await task
+            if status.status == 1:
+                # starting the sub pool failed
+                raise status.error.with_traceback(status.traceback)
+            processes.append(process)
+            ext_addresses.append(status.external_addresses)
+        return processes, ext_addresses
+
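+    # Entry point of the child process: seed randomness, configure logging and
+    # the event loop, then run the sub actor pool until it exits.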
+    @classmethod
+    def _start_sub_pool(
+        cls,
+        actor_config: ActorPoolConfig,
+        process_index: int,
+        status_queue: multiprocessing.Queue,
+    ):
+        ensure_coverage()
+
+        # make sure there is enough randomness for every sub pool
+        random.seed(uuid.uuid1().bytes)
+        reset_id_random_seed()
+
+        conf = actor_config.get_pool_config(process_index)
+        suspend_sigint = conf["suspend_sigint"]
+        if suspend_sigint:
+            signal.signal(signal.SIGINT, lambda *_: None)
+
+        logging_conf = conf["logging_conf"] or {}
+        if isinstance(logging_conf, configparser.RawConfigParser):
+            logging.config.fileConfig(logging_conf)
+        elif logging_conf.get("file"):
+            logging.config.fileConfig(logging_conf["file"])
+        elif logging_conf.get("level"):
+            logging.getLogger("__main__").setLevel(logging_conf["level"])
+            logging.getLogger("mars").setLevel(logging_conf["level"])
+            if logging_conf.get("format"):
+                logging.basicConfig(format=logging_conf["format"])
+
+        use_uvloop = conf["use_uvloop"]
+        if use_uvloop:
+            import uvloop
+
+            asyncio.set_event_loop(uvloop.new_event_loop())
+        else:
+            asyncio.set_event_loop(asyncio.new_event_loop())
+
+        coro = cls._create_sub_pool(actor_config, process_index, status_queue)
+        asyncio.run(coro)
+
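+    # Create the sub pool and report its startup status (success or the raised
+    # error plus traceback) back to the parent process via ``status_queue``.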
+    @classmethod
+    async def _create_sub_pool(
+        cls,
+        actor_config: ActorPoolConfig,
+        process_index: int,
+        status_queue: multiprocessing.Queue,
+    ):
+        process_status = None
+        try:
+            cur_pool_config = actor_config.get_pool_config(process_index)
+            env = cur_pool_config["env"]
+            if env:
+                os.environ.update(env)
+            pool = await SubActorPool.create(
+                {"actor_pool_config": actor_config, "process_index": process_index}
+            )
+            external_addresses = cur_pool_config["external_address"]
+            process_status = SubpoolStatus(
+                status=0, external_addresses=external_addresses
+            )
+            await pool.start()
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            _, error, tb = sys.exc_info()
+            process_status = SubpoolStatus(status=1, error=error, traceback=tb)
+            raise
+        finally:
+            status_queue.put(process_status)
+        await pool.join()
+
+    async def kill_sub_pool(
+        self, process: multiprocessing.Process, force: bool = False
+    ):
+        if (
+            "COV_CORE_SOURCE" in os.environ and not force and not _is_windows
+        ):  # pragma: no cover
+            # must shut down gracefully, or coverage info will be lost
+            try:
+                os.kill(process.pid, signal.SIGINT)
+            except OSError:  # pragma: no cover
+                pass
+            process.terminate()
+            wait_pool = futures.ThreadPoolExecutor(1)
+            try:
+                loop = asyncio.get_running_loop()
+                await loop.run_in_executor(wait_pool, process.join, 3)
+            finally:
+                wait_pool.shutdown(False)
+        process.kill()
+        await asyncio.to_thread(process.join, 5)
+
+    async def is_sub_pool_alive(self, process: multiprocessing.Process):
+        try:
+            return await asyncio.to_thread(process.is_alive)
+        except RuntimeError as ex:  # pragma: no cover
+            if "cannot schedule new futures after interpreter shutdown" not in str(ex):
+                # when atexit is triggered, the default thread pool might be
+                # shut down and to_thread will fail
+                raise
+            return process.is_alive()
+
+    async def recover_sub_pool(self, address: str):
+        process_index = self._config.get_process_index(address)
+        # the process is dead, restart it;
+        # remember to always use spawn to recover the sub pool
+        task = asyncio.create_task(
+            self.start_sub_pool(self._config, process_index, "spawn")
+        )
+        self.sub_processes[address] = (await self.wait_sub_pools_ready([task]))[0][0]
+
+        if self._auto_recover == "actor":
+            # need to recover all created actors
+            for _, message in self._allocated_actors[address].values():
+                create_actor_message: CreateActorMessage = message
+                await self.call(address, create_actor_message)
+
+    async def start(self):
+        await super().start()
+        await self.start_monitor()
+
+
+@_register_message_handler
+class SubActorPool(SubActorPoolBase):
+    pass
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/__init__.py b/python/xorbits/_mars/oscar/backends/mars/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test-logging.conf b/python/xorbits/_mars/oscar/backends/mars/tests/test-logging.conf
new file mode 100644
index 000000000..bb545c6b3
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test-logging.conf
@@ -0,0 +1,26 @@
+[loggers]
+keys=root,test_mars_pool
+
+[handlers]
+keys=stream_handler
+
+[formatters]
+keys=formatter
+
+[logger_root]
+level=WARN
+handlers=stream_handler
+
+[logger_test_mars_pool]
+level=DEBUG
+handlers=stream_handler
+qualname=mars.oscar.backends.mars.tests
+propagate=0
+
+[handler_stream_handler]
+class=StreamHandler
+formatter=formatter
+args=(sys.stderr,)
+
+[formatter_formatter]
+format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_allocate_strategy.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_allocate_strategy.py
new file mode 100644
index 000000000..df79c5ffb
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_allocate_strategy.py
@@ -0,0 +1,84 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from .... import create_actor_ref
+from ....errors import NoIdleSlot
+from ...allocate_strategy import (
+    AddressSpecified,
+    IdleLabel,
+    MainPool,
+    Random,
+    RandomLabel,
+    RandomSubPool,
+)
+from ...config import ActorPoolConfig
+
+config = ActorPoolConfig()
+config.add_pool_conf(0, "main", "unixsocket:///0", "127.0.0.1:1111")
+config.add_pool_conf(1, "test", "unixsocket:///1", "127.0.0.1:1112")
+config.add_pool_conf(2, "test2", "unixsocket:///2", "127.0.0.1:1113")
+config.add_pool_conf(3, "test", "unixsocket:///3", "127.0.0.1:1114")
+
+
+def test_address_specified():
+    addr = "127.0.0.1:1112"
+    strategy = AddressSpecified(addr)
+    assert strategy.get_allocated_address(config, dict()) == addr
+
+
+def test_main_pool():
+    strategy = MainPool()
+    assert strategy.get_allocated_address(config, dict()) == "127.0.0.1:1111"
+
+
+def test_random():
+    strategy = Random()
+    addresses = config.get_external_addresses()
+    assert strategy.get_allocated_address(config, dict()) in addresses
+
+
+def test_random_sub_pool():
+    strategy = RandomSubPool()
+    addresses = config.get_external_addresses()[1:]
+    assert strategy.get_allocated_address(config, dict()) in addresses
+
+
+def test_random_label():
+    strategy = RandomLabel("test")
+    addresses = config.get_external_addresses(label="test")
+    assert len(addresses) == 2
+    assert strategy.get_allocated_address(config, dict()) in addresses
+
+
+def test_idle_label():
+    strategy = IdleLabel("test", "my_mark")
+    addresses = config.get_external_addresses(label="test")
+    assert len(addresses) == 2
+    allocated = {
+        addresses[0]: {create_actor_ref(addresses[0], b"id1"): (strategy, None)}
+    }
+    assert strategy.get_allocated_address(config, allocated) == addresses[1]
+
+    strategy2 = IdleLabel("test", "my_mark")
+    allocated = {
+        addresses[0]: {
+            create_actor_ref(addresses[0], b"id1"): (strategy, None),
+            create_actor_ref(addresses[0], b"id2"): (RandomLabel("test"), None),
+        },
+        addresses[1]: {create_actor_ref(addresses[1], b"id3"): (strategy2, None)},
+    }
+    with pytest.raises(NoIdleSlot):
+        strategy2.get_allocated_address(config, allocated)
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_debug.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_debug.py
new file mode 100644
index 000000000..05fb10ee0
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_debug.py
@@ -0,0 +1,182 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+import sys
+from contextlib import contextmanager
+from io import StringIO
+from typing import List
+
+import pytest
+
+from ..... import oscar as mo
+from ....debug import get_debug_options, reload_debug_opts_from_env
+
+
+class DebugActor(mo.Actor):
+    def __init__(self):
+        self._log_file = None
+        self._pos = 0
+
+    @classmethod
+    async def wait(cls, delay: float):
+        await asyncio.sleep(delay)
+
+    @classmethod
+    async def raise_error(cls, exc):
+        raise exc
+
+    @classmethod
+    async def call_chain(
+        cls, chain: List, use_yield: bool = False, use_tell: bool = False
+    ):
+        if not chain:
+            return
+        ref_uid, ref_address = chain[0]
+        new_ref = await mo.actor_ref(ref_uid, address=ref_address)
+
+        if use_tell:
+            call_coro = new_ref.call_chain.tell(chain[1:])
+        else:
+            call_coro = new_ref.call_chain(chain[1:])
+
+        if use_yield:
+            yield call_coro
+        else:
+            await call_coro
+
+    async def call_self_ref(self):
+        await self.ref().wait(1)
+
+
+@pytest.fixture
+async def actor_pool():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await mo.create_actor_pool(
+        "127.0.0.1", n_process=0, subprocess_start_method=start_method
+    )
+    await pool.start()
+    yield pool
+    await pool.stop()
+
+
+@pytest.fixture
+async def debug_logger():
+    log_file = StringIO()
+    logger = logging.getLogger("mars.oscar.debug")
+
+    log_handler = logging.StreamHandler(log_file)
+    log_handler.setLevel(logging.DEBUG)
+    logger.addHandler(log_handler)
+
+    try:
+        mo.set_debug_options(
+            mo.DebugOptions(
+                actor_call_timeout=1,
+                log_unhandled_errors=True,
+                log_cycle_send=True,
+            )
+        )
+        yield log_file
+    finally:
+        mo.set_debug_options(None)
+        logger.removeHandler(log_handler)
+        assert mo.get_debug_options() is None
+
+
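+# Capture only the log output written inside the ``with`` block by remembering
+# the current position in the shared log buffer and copying what follows.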
+@contextmanager
+def cut_file_log(log_file) -> StringIO:
+    dest = StringIO()
+    pos = log_file.tell()
+    try:
+        yield dest
+    finally:
+        log_file.seek(pos, os.SEEK_SET)
+        dest.write(log_file.read())
+
+
+@pytest.mark.asyncio
+async def test_error_logs(actor_pool, debug_logger):
+    debug_ref = await mo.create_actor(
+        DebugActor, uid=DebugActor.default_uid(), address=actor_pool.external_address
+    )
+
+    with cut_file_log(debug_logger) as log_file:
+        await debug_ref.wait(0.2)
+    assert log_file.getvalue() == ""
+
+    with cut_file_log(debug_logger) as log_file:
+        await debug_ref.wait(1.2)
+    assert DebugActor.default_uid() in log_file.getvalue()
+
+    with pytest.raises(ValueError), cut_file_log(debug_logger) as log_file:
+        await debug_ref.raise_error(ValueError)
+    assert "ValueError" in log_file.getvalue()
+
+
+@pytest.mark.asyncio
+async def test_cycle_logs(actor_pool, debug_logger):
+    address = actor_pool.external_address
+    ref1 = await mo.create_actor(DebugActor, uid="debug_ref1", address=address)
+    ref2 = await mo.create_actor(DebugActor, uid="debug_ref2", address=address)
+
+    chain = [(ref2.uid, ref2.address)]
+
+    with cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain))
+        await asyncio.wait_for(task, 1)
+    assert log_file.getvalue() == ""
+
+    chain = [(ref2.uid, ref2.address), (ref1.uid, ref1.address)]
+
+    # test cycle detection with chain
+    with pytest.raises(asyncio.TimeoutError), cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain))
+        await asyncio.wait_for(task, 1)
+    assert "cycle" in log_file.getvalue()
+
+    # test yield call (should not produce loops)
+    with cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain, use_yield=True))
+        await asyncio.wait_for(task, 1)
+    assert log_file.getvalue() == ""
+
+    # test tell (should not produce loops)
+    with cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_chain(chain, use_tell=True))
+        await asyncio.wait_for(task, 1)
+    assert log_file.getvalue() == ""
+
+    # test calling actor inside itself
+    with pytest.raises(asyncio.TimeoutError), cut_file_log(debug_logger) as log_file:
+        task = asyncio.create_task(ref1.call_self_ref())
+        await asyncio.wait_for(task, 1)
+    assert "cycle" in log_file.getvalue()
+
+
+def test_environ():
+    os.environ["DEBUG_OSCAR"] = "1"
+    try:
+        reload_debug_opts_from_env()
+        assert get_debug_options() is not None
+    finally:
+        os.environ.pop("DEBUG_OSCAR")
+        reload_debug_opts_from_env()
+        assert get_debug_options() is None
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_mars_actor_context.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_mars_actor_context.py
new file mode 100644
index 000000000..88947a39e
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_mars_actor_context.py
@@ -0,0 +1,625 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+import sys
+import time
+import traceback
+from collections import deque
+
+import pandas as pd
+import pytest
+
+from ..... import oscar as mo
+from .....oscar.core import ActorRef, LocalActorRef
+from ....backends.allocate_strategy import RandomSubPool
+from ....debug import DebugOptions, get_debug_options, set_debug_options
+from ...router import Router
+
+logger = logging.getLogger(__name__)
+
+
+class DummyActor(mo.Actor):
+    def __init__(self, value):
+        super().__init__()
+
+        if value < 0:
+            raise ValueError("value < 0")
+        self.value = value
+
+    @mo.extensible
+    async def add(self, value):
+        if not isinstance(value, int):
+            raise TypeError("add number must be int")
+        self.value += value
+        return self.value
+
+    @add.batch
+    async def add(self, args_list, _kwargs_list):
+        self.value += sum(v[0] for v in args_list)
+        return self.value
+
+    @mo.extensible
+    async def add_ret(self, value):
+        return self.value + value
+
+    @add_ret.batch
+    async def add_ret(self, args_list, _kwargs_list):
+        sum_val = sum(v[0] for v in args_list)
+        return [self.value + sum_val for _ in args_list]
+
+    async def create(self, actor_cls, *args, **kw):
+        kw["address"] = self.address
+        return await mo.create_actor(actor_cls, *args, **kw)
+
+    async def create_ignore(self, actor_cls, *args, **kw):
+        try:
+            return await mo.create_actor(actor_cls, *args, **kw)
+        except ValueError:
+            pass
+
+    async def create_send(self, actor_cls, *args, **kw):
+        method = kw.pop("method")
+        method_args = kw.pop("method_args")
+        ref = await mo.create_actor(actor_cls, *args, **kw)
+        return await getattr(ref, method)(*method_args)
+
+    async def delete(self, value):
+        return await mo.destroy_actor(value)
+
+    async def has(self, value):
+        return await mo.has_actor(value)
+
+    async def send(self, uid, method, *args):
+        actor_ref = await mo.actor_ref(uid, address=self.address)
+        tp = (
+            LocalActorRef
+            if actor_ref.address == self.address and get_debug_options() is None
+            else ActorRef
+        )
+        assert (
+            type(actor_ref) is tp
+        ), f"Expect type of actor ref is {tp}, but got {actor_ref} instead."
+        return await getattr(actor_ref, method)(*args)
+
+    async def tell(self, uid, method, *args):
+        actor_ref = await mo.actor_ref(uid, address=self.address)
+        await getattr(actor_ref, method).tell(*args)
+
+    async def tell_delay(self, uid, method, *args, delay=None):
+        actor_ref = await mo.actor_ref(uid)
+        getattr(actor_ref, method).tell_delay(*args, delay=delay)
+
+    async def send_unpickled(self, value):
+        actor_ref = await mo.actor_ref(value)
+        return await actor_ref.send(lambda x: x)
+
+    async def create_unpickled(self):
+        return await mo.create_actor(DummyActor, lambda x: x, uid="admin-5")
+
+    async def destroy(self):
+        await self.ref().destroy()
+
+    def get_value(self):
+        return self.value
+
+    def get_ref(self):
+        ref = self.ref()
+        tp = LocalActorRef if get_debug_options() is None else ActorRef
+        assert (
+            type(ref) is tp
+        ), f"Expect type of actor ref is {tp}, but got {ref} instead."
+        return ref
+
+
+class RecordActor(mo.Actor):
+    def __init__(self):
+        self._records = []
+
+    def add_record(self, rec):
+        self._records.append(rec)
+
+    def get_records(self):
+        return self._records
+
+
+class CreateDestroyActor(mo.Actor):
+    def __init__(self):
+        self._record_ref = None
+
+    async def __post_create__(self):
+        self._record_ref = await mo.actor_ref(
+            RecordActor.default_uid(), address=self.address
+        )
+        await self._record_ref.add_record(f"create {self.uid}")
+        assert "sth" == await self.ref().echo("sth")
+
+    async def __pre_destroy__(self):
+        await self._record_ref.add_record(f"destroy {self.uid}")
+        assert "sth2" == await self.ref().echo("sth2")
+
+    def echo(self, message):
+        return message
+
+
+class ResourceLockActor(mo.StatelessActor):
+    def __init__(self, count=1):
+        self._sem = asyncio.Semaphore(count)
+        self._requests = deque()
+
+    async def apply(self, val=None):
+        await self._sem.acquire()
+        return val + 1 if val is not None else None
+
+    def release(self):
+        self._sem.release()
+
+
+class PromiseTestActor(mo.Actor):
+    def __init__(self, res_lock_ref):
+        self.res_lock_ref = res_lock_ref
+        self.call_log = []
+
+    async def _apply_step(self, idx, delay):
+        res = None
+        try:
+            self.call_log.append(("A", idx, time.time()))
+            res = yield self.res_lock_ref.apply(idx)
+            assert res == idx + 1
+
+            self.call_log.append(("B", idx, time.time()))
+            yield asyncio.sleep(delay)
+            self.call_log.append(("C", idx, time.time()))
+        finally:
+            yield self.res_lock_ref.release()
+            raise mo.Return(res)
+
+    async def test_promise_call(self, idx, delay=0.1):
+        return self._apply_step(idx, delay)
+
+    async def test_yield_tuple(self, delay=0.1):
+        tp = yield tuple(self._apply_step(idx, delay) for idx in range(4)) + (
+            asyncio.sleep(delay),
+            "PlainString",
+        )
+        raise mo.Return(tp)
+
+    async def async_raiser_func(self):
+        yield asyncio.sleep(0.1)
+        raise ValueError
+
+    async def test_yield_exceptions(self):
+        task = asyncio.create_task(self.ref().async_raiser_func())
+        return task
+
+    async def test_exceptions(self):
+        async def async_raiser():
+            yield asyncio.sleep(0.1)
+            raise SystemError
+
+        try:
+            yield async_raiser(),
+        except SystemError:
+            raise ValueError
+        raise KeyError
+
+    async def test_cancel(self, delay):
+        async def intermediate_error():
+            raise ValueError
+
+        async def task_fun():
+            try:
+                yield intermediate_error()
+            except ValueError:
+                pass
+            try:
+                yield asyncio.sleep(delay)
+            except asyncio.CancelledError:
+                self.call_log.append((time.time(), "CANCELLED"))
+                raise
+
+        self.call_log.append((time.time(), "START"))
+        return task_fun()
+
+    def get_call_log(self):
+        log = self.call_log
+        self.call_log = []
+        return log
+
+
+@pytest.fixture(params=[False, True])
+async def actor_pool(request):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await mo.create_actor_pool(
+        "127.0.0.1", n_process=2, subprocess_start_method=start_method
+    )
+
+    try:
+        if request.param:
+            set_debug_options(DebugOptions())
+        else:
+            set_debug_options(None)
+
+        await pool.start()
+        yield pool
+        await pool.stop()
+    finally:
+        set_debug_options(None)
+
+
+@pytest.mark.asyncio
+async def test_simple_local_actor_pool(actor_pool):
+    actor_ref = await mo.create_actor(
+        DummyActor, 100, address=actor_pool.external_address
+    )
+    assert await actor_ref.add(1) == 101
+    await actor_ref.add(1)
+
+    res = await actor_ref.get_value()
+    assert res == 102
+
+    ref2 = await actor_ref.get_ref()
+    assert actor_ref.address == ref2.address
+    assert actor_ref.uid == ref2.uid
+
+    ref = await mo.actor_ref(uid=actor_ref.uid, address=actor_pool.external_address)
+    assert await ref.add(2) == 104
+
+
+@pytest.mark.asyncio
+async def test_mars_post_create_pre_destroy(actor_pool):
+    rec_ref = await mo.create_actor(
+        RecordActor, uid=RecordActor.default_uid(), address=actor_pool.external_address
+    )
+    actor_ref = await mo.create_actor(
+        CreateDestroyActor, address=actor_pool.external_address
+    )
+    await actor_ref.destroy()
+
+    records = await rec_ref.get_records()
+    assert len(records) == 2
+    assert records[0].startswith("create")
+    assert records[1].startswith("destroy")
+
+
+@pytest.mark.asyncio
+async def test_mars_create_actor(actor_pool):
+    actor_ref = await mo.create_actor(
+        DummyActor, 1, address=actor_pool.external_address
+    )
+    # create actor inside on_receive
+    r = await actor_ref.create(DummyActor, 5, address=actor_pool.external_address)
+    ref = await mo.actor_ref(r, address=actor_pool.external_address)
+    assert await ref.add(10) == 15
+    # create actor inside on_receive and send message
+    r = await actor_ref.create_send(
+        DummyActor,
+        5,
+        method="add",
+        method_args=(1,),
+        address=actor_pool.external_address,
+    )
+    assert r == 6
+
+
+@pytest.mark.asyncio
+async def test_mars_create_actor_error(actor_pool):
+    ref1 = await mo.create_actor(
+        DummyActor, 1, uid="dummy1", address=actor_pool.external_address
+    )
+    with pytest.raises(mo.ActorAlreadyExist):
+        await mo.create_actor(
+            DummyActor, 1, uid="dummy1", address=actor_pool.external_address
+        )
+    await mo.destroy_actor(ref1)
+
+    with pytest.raises(ValueError):
+        await mo.create_actor(DummyActor, -1, address=actor_pool.external_address)
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    with pytest.raises(ValueError):
+        await ref1.create(DummyActor, -2, address=actor_pool.external_address)
+
+
+@pytest.mark.asyncio
+async def test_mars_send(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await mo.actor_ref(
+        await ref1.create(DummyActor, 2, address=actor_pool.external_address)
+    )
+    assert await ref1.send(ref2, "add", 3) == 5
+
+    ref3 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref4 = await mo.create_actor(
+        DummyActor,
+        2,
+        address=actor_pool.external_address,
+        allocate_strategy=RandomSubPool(),
+    )
+    assert await ref4.send(ref3, "add", 3) == 4
+
+
+@pytest.mark.asyncio
+async def test_mars_send_error(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    with pytest.raises(TypeError):
+        await ref1.add(1.0)
+    ref2 = await mo.create_actor(DummyActor, 2, address=actor_pool.external_address)
+    with pytest.raises(TypeError):
+        await ref1.send(ref2, "add", 1.0)
+    with pytest.raises(mo.ActorNotExist):
+        await (await mo.actor_ref("fake_uid", address=actor_pool.external_address)).add(
+            1
+        )
+
+
+@pytest.mark.asyncio
+async def test_mars_tell(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await mo.actor_ref(await ref1.create(DummyActor, 2))
+    await ref1.tell(ref2, "add", 3)
+    assert await ref2.get_value() == 5
+
+    await ref1.tell_delay(ref2, "add", 4, delay=0.5)  # delay 0.5 secs
+    assert await ref2.get_value() == 5
+    await asyncio.sleep(0.45)
+    assert await ref2.get_value() == 5
+    await asyncio.sleep(0.2)
+    assert await ref2.get_value() == 9
+
+    # an error is expected when illegal uids are passed
+    with pytest.raises(ValueError):
+        await ref1.tell(await mo.actor_ref(set()), "add", 3)
+
+
+@pytest.mark.asyncio
+async def test_mars_batch_method(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    batch_result = await ref1.add_ret.batch(
+        ref1.add_ret.delay(1), ref1.add_ret.delay(2), ref1.add_ret.delay(3)
+    )
+    assert len(batch_result) == 3
+    assert all(r == 7 for r in batch_result)
+
+    await ref1.add.batch(
+        ref1.add.delay(1), ref1.add.delay(2), ref1.add.delay(3), send=False
+    )
+    assert await ref1.get_value() == 7
+
+    with pytest.raises(ValueError):
+        await ref1.add_ret.batch(ref1.add_ret.delay(1), ref1.add.delay(2))
+
+
+@pytest.mark.asyncio
+async def test_gather_exception(actor_pool):
+    try:
+        Router.get_instance_or_empty()._cache.clear()
+        ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+        router = Router.get_instance_or_empty()
+        client = next(iter(router._cache.values()))
+
+        future = asyncio.Future()
+        client_channel = client.channel
+
+        class FakeChannel(type(client_channel)):
+            def __init__(self):
+                pass
+
+            def __getattr__(self, item):
+                return getattr(client_channel, item)
+
+            async def recv(self):
+                return await future
+
+        client.channel = FakeChannel()
+
+        class MyException(Exception):
+            pass
+
+        await ref1.add(1)
+        tasks = [ref1.add(i) for i in range(200)]
+        future.set_exception(MyException("Test recv exception!!"))
+        with pytest.raises(MyException) as ex:
+            await asyncio.gather(*tasks)
+        s = traceback.format_tb(ex.tb)
+        assert 10 > "\n".join(s).count("send") > 0
+    finally:
+        Router.get_instance_or_empty()._cache.clear()
+
+
+@pytest.mark.asyncio
+async def test_mars_destroy_has_actor(actor_pool):
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await mo.actor_ref(ref1)
+    ref2_add_method = ref2.add
+    assert isinstance(ref1, ActorRef)
+    assert await mo.has_actor(ref2)
+    await mo.destroy_actor(ref2)
+    assert not await mo.has_actor(ref1)
+    assert not await mo.has_actor(ref2)
+
+    if isinstance(ref2, LocalActorRef):
+        assert "weakref" in str(ref2)
+        assert "dead" in str(ref2)
+
+    # an error is expected when illegal uids are passed
+    with pytest.raises(ValueError):
+        await mo.has_actor(await mo.actor_ref(set()))
+
+    with pytest.raises(mo.ActorNotExist):
+        await ref2.add(1)
+
+    with pytest.raises(mo.ActorNotExist):
+        await ref2_add_method(1)
+
+    ref1 = await mo.create_actor(
+        DummyActor, 1, uid=ref1.uid, address=actor_pool.external_address
+    )
+
+    # ref2 should still work after the actor is recreated.
+    assert await ref2.add(1) == 2
+    # the ref2 method should still work after the actor is recreated.
+    assert await ref2_add_method(1) == 3
+
+    assert isinstance(ref2, ActorRef)
+    assert await mo.has_actor(ref1)
+    await mo.destroy_actor(ref1)
+    assert not await mo.has_actor(ref1)
+    assert not await mo.has_actor(ref2)
+
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    ref2 = await ref1.create(DummyActor, 2, address=actor_pool.external_address)
+
+    assert await mo.has_actor(ref2)
+
+    await ref1.delete(ref2)
+    assert not await ref1.has(ref2)
+
+    with pytest.raises(mo.ActorNotExist):
+        await mo.destroy_actor(
+            await mo.actor_ref("fake_uid", address=actor_pool.external_address)
+        )
+
+    ref1 = await mo.create_actor(DummyActor, 1, address=actor_pool.external_address)
+    with pytest.raises(mo.ActorNotExist):
+        await ref1.delete(
+            await mo.actor_ref("fake_uid", address=actor_pool.external_address)
+        )
+
+    # test self destroy
+    ref1 = await mo.create_actor(DummyActor, 2, address=actor_pool.external_address)
+    await ref1.destroy()
+    assert not await mo.has_actor(ref1)
+
+
+@pytest.mark.asyncio
+async def test_mars_resource_lock(actor_pool):
+    ref = await mo.create_actor(ResourceLockActor, address=actor_pool.external_address)
+    event_list = []
+
+    async def test_task(idx):
+        await ref.apply()
+        event_list.append(("A", idx, time.time()))
+        await asyncio.sleep(0.1)
+        event_list.append(("B", idx, time.time()))
+        await ref.release()
+
+    tasks = [asyncio.create_task(test_task(idx)) for idx in range(4)]
+    await asyncio.wait(tasks)
+
+    for idx in range(0, len(event_list), 2):
+        event_pair = event_list[idx : idx + 2]
+        assert (event_pair[0][0], event_pair[1][0]) == ("A", "B")
+        assert event_pair[0][1] == event_pair[1][1]
+
+
+@pytest.mark.asyncio
+async def test_promise_chain(actor_pool):
+    lock_ref = await mo.create_actor(
+        ResourceLockActor, 2, address=actor_pool.external_address
+    )
+    promise_test_ref = await mo.create_actor(
+        PromiseTestActor, lock_ref, address=actor_pool.external_address
+    )
+
+    delay_val = 1.0
+
+    start_time = time.time()
+    tasks = [
+        asyncio.create_task(promise_test_ref.test_promise_call(idx, delay=delay_val))
+        for idx in range(4)
+    ]
+    await asyncio.gather(*tasks)
+
+    logs = pd.DataFrame(
+        await promise_test_ref.get_call_log(), columns=["group", "idx", "time"]
+    )
+    logs.time -= start_time
+    assert logs.query('group == "A"').time.max() < delay_val / 2
+    max_apply_time = (
+        logs.query('group == "A" | group == "B"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_apply_time > delay_val / 2
+    max_delay_time = (
+        logs.query('group == "B" | group == "C"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_delay_time > delay_val / 2
+
+    start_time = time.time()
+    ret = await promise_test_ref.test_yield_tuple(delay=delay_val)
+    assert set(ret) == {1, 2, 3, 4, None, "PlainString"}
+
+    logs = pd.DataFrame(
+        await promise_test_ref.get_call_log(), columns=["group", "idx", "time"]
+    )
+    logs.time -= start_time
+    assert logs.query('group == "A"').time.max() < delay_val / 2
+    max_apply_time = (
+        logs.query('group == "A" | group == "B"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_apply_time > delay_val / 2
+    max_delay_time = (
+        logs.query('group == "B" | group == "C"')
+        .groupby("idx")
+        .apply(lambda s: s.time.max() - s.time.min())
+        .max()
+    )
+    assert max_delay_time > delay_val / 2
+
+    with pytest.raises(ValueError):
+        await promise_test_ref.test_exceptions()
+    with pytest.raises(ValueError):
+        await promise_test_ref.test_yield_exceptions()
+
+    with pytest.raises(asyncio.CancelledError):
+        task = asyncio.create_task(promise_test_ref.test_cancel(5))
+        await asyncio.sleep(0.1)
+        task.cancel()
+        await task
+    call_log = await promise_test_ref.get_call_log()
+    assert len(call_log) == 2
+    assert call_log[1][0] - call_log[0][0] < 1
+
+
+class ActorCannotDestroy(mo.Actor):
+    async def __pre_destroy__(self):
+        raise ValueError("Cannot destroy")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("in_sub_pool", [True, False])
+async def test_error_in_pre_destroy(actor_pool, in_sub_pool):
+    pool = actor_pool
+
+    strategy = None if not in_sub_pool else RandomSubPool()
+    a = await mo.create_actor(
+        ActorCannotDestroy, address=pool.external_address, strategy=strategy
+    )
+    with pytest.raises(ValueError, match="Cannot destroy"):
+        await mo.destroy_actor(a)
diff --git a/python/xorbits/_mars/oscar/backends/mars/tests/test_pool.py b/python/xorbits/_mars/oscar/backends/mars/tests/test_pool.py
new file mode 100644
index 000000000..0e0281ce3
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/mars/tests/test_pool.py
@@ -0,0 +1,972 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+import re
+import sys
+import time
+
+import pytest
+
+from .....tests.core import mock, require_ucx
+from .....utils import get_next_port
+from .... import Actor, create_actor_ref, kill_actor
+from ....context import get_context
+from ....errors import ActorNotExist, NoIdleSlot, SendMessageFailed, ServerClosed
+from ...allocate_strategy import (
+    AddressSpecified,
+    IdleLabel,
+    MainPool,
+    ProcessIndex,
+    RandomSubPool,
+)
+from ...config import ActorPoolConfig
+from ...message import (
+    ActorRefMessage,
+    CancelMessage,
+    ControlMessage,
+    ControlMessageType,
+    CreateActorMessage,
+    DestroyActorMessage,
+    ErrorMessage,
+    HasActorMessage,
+    MessageType,
+    SendMessage,
+    TellMessage,
+    new_message_id,
+)
+from ...pool import create_actor_pool
+from ...router import Router
+from ..pool import MainActorPool, SubActorPool
+
+
+class _CannotBePickled:
+    def __getstate__(self):
+        raise RuntimeError("cannot pickle")
+
+
+class _CannotBeUnpickled:
+    def __getstate__(self):
+        return ()
+
+    def __setstate__(self, state):
+        raise RuntimeError("cannot unpickle")
+
+
+class TestActor(Actor):
+    __test__ = False
+
+    def __init__(self):
+        self.value = 0
+
+    def add(self, val):
+        self.value += val
+        return self.value
+
+    async def add_other(self, ref, val):
+        self.value += await ref.add(val)
+        return self.value
+
+    async def sleep(self, second):
+        try:
+            await asyncio.sleep(second)
+            return self.value
+        except asyncio.CancelledError:
+            return self.value + 1
+
+    def return_cannot_unpickle(self):
+        return _CannotBeUnpickled()
+
+    def raise_cannot_pickle(self):
+        raise ValueError(_CannotBePickled())
+
+
+def _add_pool_conf(
+    config: ActorPoolConfig,
+    process_index: int,
+    label: str,
+    internal_address: str,
+    external_address: str,
+    env: dict = None,
+):
+    if sys.platform.startswith("win"):
+        config.add_pool_conf(
+            process_index, label, external_address, external_address, env=env
+        )
+    else:
+        config.add_pool_conf(
+            process_index, label, internal_address, external_address, env=env
+        )
+
+
+def _raise_if_error(message):
+    if message.message_type == MessageType.error:
+        raise message.error.with_traceback(message.traceback)
+
+
+@pytest.fixture(autouse=True)
+def clear_routers():
+    yield
+    Router.set_instance(None)
+
+
+@pytest.mark.asyncio
+@mock.patch("mars.oscar.backends.mars.pool.SubActorPool.notify_main_pool_to_create")
+@mock.patch("mars.oscar.backends.mars.pool.SubActorPool.notify_main_pool_to_destroy")
+async def test_sub_actor_pool(notify_main_pool_to_create, notify_main_pool_to_destroy):
+    notify_main_pool_to_create.return_value = None
+    notify_main_pool_to_destroy.return_value = None
+    config = ActorPoolConfig()
+
+    ext_address0 = f"127.0.0.1:{get_next_port()}"
+    ext_address1 = f"127.0.0.1:{get_next_port()}"
+    _add_pool_conf(config, 0, "main", "unixsocket:///0", ext_address0)
+    _add_pool_conf(config, 1, "sub", "unixsocket:///1", ext_address1)
+
+    pool = await SubActorPool.create({"actor_pool_config": config, "process_index": 1})
+    await pool.start()
+
+    try:
+        create_actor_message = CreateActorMessage(
+            new_message_id(),
+            TestActor,
+            b"test",
+            tuple(),
+            dict(),
+            AddressSpecified(pool.external_address),
+        )
+        message = await pool.create_actor(create_actor_message)
+        assert message.message_type == MessageType.result
+        actor_ref = message.result
+        assert actor_ref.address == pool.external_address
+        assert actor_ref.uid == b"test"
+
+        has_actor_message = HasActorMessage(new_message_id(), actor_ref)
+        assert (await pool.has_actor(has_actor_message)).result is True
+
+        actor_ref_message = ActorRefMessage(new_message_id(), actor_ref)
+        assert (await pool.actor_ref(actor_ref_message)).result == actor_ref
+
+        tell_message = TellMessage(
+            new_message_id(), actor_ref, ("add", 0, (1,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert message.result is None
+
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("add", 0, (3,), dict())
+        )
+        message = await pool.send(send_message)
+        assert message.result == 4
+
+        # test error message
+        # type mismatch
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("add", 0, ("3",), dict())
+        )
+        result = await pool.send(send_message)
+        assert result.message_type == MessageType.error
+        assert isinstance(result.error, TypeError)
+
+        send_message = SendMessage(
+            new_message_id(),
+            create_actor_ref(actor_ref.address, "non_exist"),
+            ("add", 0, (3,), dict()),
+        )
+        result = await pool.send(send_message)
+        assert isinstance(result.error, ActorNotExist)
+
+        # test send message and cancel it
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("sleep", 0, (20,), dict())
+        )
+        result_task = asyncio.create_task(pool.send(send_message))
+        await asyncio.sleep(0)
+        start = time.time()
+        cancel_message = CancelMessage(
+            new_message_id(), actor_ref.address, send_message.message_id
+        )
+        cancel_task = asyncio.create_task(pool.cancel(cancel_message))
+        result = await asyncio.wait_for(cancel_task, 3)
+        assert result.message_type == MessageType.result
+        assert result.result is True
+        result = await result_task
+        # test time
+        assert time.time() - start < 3
+        assert result.message_type == MessageType.result
+        assert result.result == 5
+
+        # test processing messages in the background
+        async with await pool.router.get_client(pool.external_address) as client:
+            send_message = SendMessage(
+                new_message_id(), actor_ref, ("add", 0, (5,), dict())
+            )
+            await client.send(send_message)
+            result = await client.recv()
+            _raise_if_error(result)
+            assert result.result == 9
+
+            send_message = SendMessage(
+                new_message_id(), actor_ref, ("add", 0, ("5",), dict())
+            )
+            await client.send(send_message)
+            result = await client.recv()
+            assert isinstance(result.error, TypeError)
+
+        destroy_actor_message = DestroyActorMessage(new_message_id(), actor_ref)
+        message = await pool.destroy_actor(destroy_actor_message)
+        assert message.result == actor_ref.uid
+
+        # destroying the same actor again should fail
+        message = await pool.destroy_actor(destroy_actor_message)
+        assert isinstance(message.error, ActorNotExist)
+
+        message = await pool.has_actor(has_actor_message)
+        assert not message.result
+
+        # test sync config
+        _add_pool_conf(
+            config, 1, "sub", "unixsocket:///1", f"127.0.0.1:{get_next_port()}"
+        )
+        sync_config_message = ControlMessage(
+            new_message_id(), "", ControlMessageType.sync_config, config
+        )
+        message = await pool.handle_control_command(sync_config_message)
+        assert message.result is True
+
+        # test get config
+        get_config_message = ControlMessage(
+            new_message_id(), "", ControlMessageType.get_config, None
+        )
+        message = await pool.handle_control_command(get_config_message)
+        config2 = message.result
+        assert config.as_dict() == config2.as_dict()
+
+        assert pool.router._mapping == Router.get_instance()._mapping
+        assert (
+            pool.router._curr_external_addresses
+            == Router.get_instance()._curr_external_addresses
+        )
+
+        stop_message = ControlMessage(
+            new_message_id(), "", ControlMessageType.stop, None
+        )
+        message = await pool.handle_control_command(stop_message)
+        assert message.result is True
+
+        await pool.join(0.05)
+        assert pool.stopped
+    finally:
+        await pool.stop()
+
+
+@pytest.mark.asyncio
+async def test_fail_when_create_subpool():
+    config = ActorPoolConfig()
+    my_label = "computation"
+    main_address = f"127.0.0.1:{get_next_port()}"
+    port = get_next_port()
+    _add_pool_conf(config, 0, "main", "unixsocket:///0", main_address)
+
+    # use the same port for both sub pools, which will raise `OSError` with "address already in use"
+    _add_pool_conf(
+        config, 1, my_label, "unixsocket:///1", f"127.0.0.1:{port}", env={"my_env": "1"}
+    )
+    _add_pool_conf(config, 2, my_label, "unixsocket:///2", f"127.0.0.1:{port}")
+
+    with pytest.raises(OSError):
+        await MainActorPool.create({"actor_pool_config": config})
+
+
+@pytest.mark.asyncio
+async def test_main_actor_pool():
+    config = ActorPoolConfig()
+    my_label = "computation"
+    main_address = f"127.0.0.1:{get_next_port()}"
+    _add_pool_conf(config, 0, "main", "unixsocket:///0", main_address)
+    _add_pool_conf(
+        config,
+        1,
+        my_label,
+        "unixsocket:///1",
+        f"127.0.0.1:{get_next_port()}",
+        env={"my_env": "1"},
+    )
+    _add_pool_conf(
+        config, 2, my_label, "unixsocket:///2", f"127.0.0.1:{get_next_port()}"
+    )
+
+    strategy = IdleLabel(my_label, "my_test")
+
+    async with await MainActorPool.create({"actor_pool_config": config}) as pool:
+        create_actor_message = CreateActorMessage(
+            new_message_id(), TestActor, b"test", tuple(), dict(), MainPool()
+        )
+        message = await pool.create_actor(create_actor_message)
+        actor_ref = message.result
+        assert actor_ref.address == main_address
+
+        create_actor_message1 = CreateActorMessage(
+            new_message_id(), TestActor, b"test1", tuple(), dict(), strategy
+        )
+        message1 = await pool.create_actor(create_actor_message1)
+        actor_ref1 = message1.result
+        assert actor_ref1.address in config.get_external_addresses(my_label)
+
+        create_actor_message2 = CreateActorMessage(
+            new_message_id(), TestActor, b"test2", tuple(), dict(), strategy
+        )
+        message2 = await pool.create_actor(create_actor_message2)
+        actor_ref2 = message2.result
+        assert actor_ref2.address in config.get_external_addresses(my_label)
+        assert actor_ref2.address != actor_ref1.address
+
+        create_actor_message3 = CreateActorMessage(
+            new_message_id(), TestActor, b"test3", tuple(), dict(), strategy
+        )
+        message3 = await pool.create_actor(create_actor_message3)
+        # no idle slot left to allocate for the same label
+        assert isinstance(message3.error, NoIdleSlot)
+
+        has_actor_message = HasActorMessage(
+            new_message_id(), create_actor_ref(main_address, b"test2")
+        )
+        assert (await pool.has_actor(has_actor_message)).result is True
+
+        actor_ref_message = ActorRefMessage(
+            new_message_id(), create_actor_ref(main_address, b"test2")
+        )
+        assert (await pool.actor_ref(actor_ref_message)).result == actor_ref2
+
+        # tell
+        tell_message = TellMessage(
+            new_message_id(), actor_ref1, ("add", 0, (2,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert message.result is None
+
+        # send
+        send_message = SendMessage(
+            new_message_id(), actor_ref1, ("add", 0, (4,), dict())
+        )
+        assert (await pool.send(send_message)).result == 6
+
+        # test error message
+        # type mismatch
+        send_message = SendMessage(
+            new_message_id(), actor_ref1, ("add", 0, ("3",), dict())
+        )
+        result = await pool.send(send_message)
+        assert isinstance(result.error, TypeError)
+
+        # send and tell to main process
+        tell_message = TellMessage(
+            new_message_id(), actor_ref, ("add", 0, (2,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert message.result is None
+        send_message = SendMessage(
+            new_message_id(), actor_ref, ("add", 0, (4,), dict())
+        )
+        assert (await pool.send(send_message)).result == 6
+
+        # send and cancel
+        send_message = SendMessage(
+            new_message_id(), actor_ref1, ("sleep", 0, (20,), dict())
+        )
+        result_task = asyncio.create_task(pool.send(send_message))
+        start = time.time()
+        cancel_message = CancelMessage(
+            new_message_id(), actor_ref1.address, send_message.message_id
+        )
+        cancel_task = asyncio.create_task(pool.cancel(cancel_message))
+        result = await asyncio.wait_for(cancel_task, 3)
+        assert result.message_type == MessageType.result
+        assert result.result is True
+        result = await result_task
+        assert time.time() - start < 3
+        assert result.message_type == MessageType.result
+        assert result.result == 7
+
+        # destroy
+        destroy_actor_message = DestroyActorMessage(new_message_id(), actor_ref1)
+        message = await pool.destroy_actor(destroy_actor_message)
+        assert message.result == actor_ref1.uid
+
+        tell_message = TellMessage(
+            new_message_id(), actor_ref1, ("add", 0, (2,), dict())
+        )
+        message = await pool.tell(tell_message)
+        assert isinstance(message, ErrorMessage)
+
+        # destroy by connecting to the sub pool directly
+        async with await pool.router.get_client(
+            config.get_external_addresses()[-1]
+        ) as client:
+            destroy_actor_message = DestroyActorMessage(new_message_id(), actor_ref2)
+            await client.send(destroy_actor_message)
+            result = await client.recv()
+            _raise_if_error(result)
+            assert result.result == actor_ref2.uid
+
+        # test sync config
+        config.add_pool_conf(
+            3, "sub", "unixsocket:///3", f"127.0.0.1:{get_next_port()}"
+        )
+        sync_config_message = ControlMessage(
+            new_message_id(),
+            pool.external_address,
+            ControlMessageType.sync_config,
+            config,
+        )
+        message = await pool.handle_control_command(sync_config_message)
+        assert message.result is True
+
+        # test get config
+        get_config_message = ControlMessage(
+            new_message_id(),
+            config.get_external_addresses()[1],
+            ControlMessageType.get_config,
+            None,
+        )
+        message = await pool.handle_control_command(get_config_message)
+        config2 = message.result
+        assert config.as_dict() == config2.as_dict()
+
+    assert pool.stopped
+
+
+@pytest.mark.asyncio
+async def test_create_actor_pool():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+    )
+
+    async with pool:
+        # test global router
+        global_router = Router.get_instance()
+        # the global router should not be identical to the pool's router
+        assert global_router is not pool.router
+        assert pool.external_address in global_router._curr_external_addresses
+        assert pool.external_address in global_router._mapping
+
+        ctx = get_context()
+
+        # actor on main pool
+        actor_ref = await ctx.create_actor(
+            TestActor, uid="test-1", address=pool.external_address
+        )
+        assert await actor_ref.add(3) == 3
+        assert await actor_ref.add(1) == 4
+        assert (await ctx.has_actor(actor_ref)) is True
+        assert (await ctx.actor_ref(actor_ref)) == actor_ref
+        # test cancel
+        task = asyncio.create_task(actor_ref.sleep(20))
+        await asyncio.sleep(0)
+        task.cancel()
+        assert await task == 5
+        await ctx.destroy_actor(actor_ref)
+        assert (await ctx.has_actor(actor_ref)) is False
+        for f in actor_ref.add, ctx.actor_ref, ctx.destroy_actor:
+            with pytest.raises(ActorNotExist):
+                await f(actor_ref)
+
+        # actor on sub pool
+        actor_ref1 = await ctx.create_actor(
+            TestActor, uid="test-main", address=pool.external_address
+        )
+        actor_ref2 = await ctx.create_actor(
+            TestActor,
+            uid="test-2",
+            address=pool.external_address,
+            allocate_strategy=RandomSubPool(),
+        )
+        assert (
+            await ctx.actor_ref(uid="test-2", address=actor_ref2.address)
+        ) == actor_ref2
+        main_ref = await ctx.actor_ref(uid="test-main", address=actor_ref2.address)
+        assert main_ref.address == pool.external_address
+        main_ref = await ctx.actor_ref(actor_ref1)
+        assert main_ref.address == pool.external_address
+        assert actor_ref2.address != actor_ref.address
+        assert await actor_ref2.add(3) == 3
+        assert await actor_ref2.add(1) == 4
+        with pytest.raises(RuntimeError):
+            await actor_ref2.return_cannot_unpickle()
+        with pytest.raises(SendMessageFailed):
+            await actor_ref2.raise_cannot_pickle()
+        assert (await ctx.has_actor(actor_ref2)) is True
+        assert (await ctx.actor_ref(actor_ref2)) == actor_ref2
+        # test cancel
+        task = asyncio.create_task(actor_ref2.sleep(20))
+        start = time.time()
+        await asyncio.sleep(0)
+        task.cancel()
+        assert await task == 5
+        assert time.time() - start < 3
+        await ctx.destroy_actor(actor_ref2)
+        assert (await ctx.has_actor(actor_ref2)) is False
+
+    assert pool.stopped
+    # after pool shutdown, the global router must have been cleaned up
+    global_router = Router.get_instance()
+    assert len(global_router._curr_external_addresses) == 0
+    assert len(global_router._mapping) == 0
+
+
+@pytest.mark.asyncio
+async def test_errors():
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            "127.0.0.1", pool_cls=MainActorPool, n_process=1, labels=["a"]
+        )
+
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            f"127.0.0.1:{get_next_port()}",
+            pool_cls=MainActorPool,
+            n_process=1,
+            ports=[get_next_port(), get_next_port()],
+        )
+
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            "127.0.0.1", pool_cls=MainActorPool, n_process=1, ports=[get_next_port()]
+        )
+
+    with pytest.raises(ValueError):
+        _ = await create_actor_pool(
+            "127.0.0.1", pool_cls=MainActorPool, n_process=1, auto_recover="illegal"
+        )
+
+    with pytest.raises(ValueError, match="external_address_schemes"):
+        _ = await create_actor_pool(
+            "127.0.0.1",
+            pool_cls=MainActorPool,
+            n_process=1,
+            external_address_schemes=["ucx"],
+        )
+
+    with pytest.raises(ValueError, match="enable_internal_addresses"):
+        _ = await create_actor_pool(
+            "127.0.0.1",
+            pool_cls=MainActorPool,
+            n_process=1,
+            enable_internal_addresses=[True],
+        )
+
+
+@pytest.mark.asyncio
+async def test_server_closed():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        auto_recover=False,
+    )
+
+    ctx = get_context()
+
+    async with pool:
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=ProcessIndex(1)
+        )
+
+        # check that the error is raised properly when the subprocess is killed
+        task = asyncio.create_task(actor_ref.sleep(10))
+        await asyncio.sleep(0)
+
+        # kill subprocess 1
+        process = list(pool._sub_processes.values())[0]
+        process.kill()
+        process.join()
+
+        with pytest.raises(ServerClosed):
+            # the process has already been killed,
+            # so ServerClosed will be raised
+            await task
+
+        assert not process.is_alive()
+
+    with pytest.raises(RuntimeError):
+        await pool.start()
+
+    # test server unreachable
+    with pytest.raises(ConnectionError):
+        await ctx.has_actor(actor_ref)
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.platform.startswith("win"), reason="skip under Windows")
+@pytest.mark.parametrize("auto_recover", [False, True, "actor", "process"])
+async def test_auto_recover(auto_recover):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    recovered = asyncio.Event()
+
+    def on_process_recover(*_):
+        recovered.set()
+
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        auto_recover=auto_recover,
+        on_process_recover=on_process_recover,
+    )
+
+    async with pool:
+        ctx = get_context()
+
+        # waiting for recovery of the main pool always returns immediately
+        await ctx.wait_actor_pool_recovered(
+            pool.external_address, pool.external_address
+        )
+
+        # create actor on main
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=MainPool()
+        )
+
+        with pytest.raises(ValueError):
+            # cannot kill actors on main pool
+            await kill_actor(actor_ref)
+
+        # create actor
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=ProcessIndex(1)
+        )
+        # kill_actor will kill the corresponding process
+        await ctx.kill_actor(actor_ref)
+
+        if auto_recover:
+            # process must have been killed
+            await ctx.wait_actor_pool_recovered(
+                actor_ref.address, pool.external_address
+            )
+            assert recovered.is_set()
+
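+            # "actor" and True also recover the actors themselves; "process"
+            # only restarts the process, so the actor no longer exists afterwards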
+            expect_has_actor = auto_recover in ["actor", True]
+            assert await ctx.has_actor(actor_ref) is expect_has_actor
+        else:
+            with pytest.raises((ServerClosed, ConnectionError)):
+                await ctx.has_actor(actor_ref)
+
+
+@pytest.mark.parametrize(
+    "exception_config",
+    [
+        (Exception("recover exception"), False),
+        (asyncio.CancelledError("cancel monitor"), True),
+    ],
+)
+@pytest.mark.asyncio
+async def test_monitor_sub_pool_exception(exception_config):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    recovered = asyncio.Event()
+    exception, done = exception_config
+
+    def on_process_recover(*_):
+        recovered.set()
+        raise exception
+
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        on_process_recover=on_process_recover,
+    )
+
+    async with pool:
+        ctx = get_context()
+        task = await pool.start_monitor()
+
+        # create actor
+        actor_ref = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=ProcessIndex(1)
+        )
+        # kill_actor will kill the corresponding process
+        await ctx.kill_actor(actor_ref)
+
+        await recovered.wait()
+        assert task.done() is done
+
+
+@pytest.mark.asyncio
+async def test_two_pools():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+
+    ctx = get_context()
+
+    pool1 = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+    )
+    pool2 = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+    )
+
+    def is_interprocess_address(addr):
+        if sys.platform.startswith("win"):
+            return re.match(r"127\.0\.0\.1:\d+", addr)
+        else:
+            return addr.startswith("unixsocket://")
+
+    try:
+        actor_ref1 = await ctx.create_actor(
+            TestActor, address=pool1.external_address, allocate_strategy=MainPool()
+        )
+        assert actor_ref1.address == pool1.external_address
+        assert await actor_ref1.add(1) == 1
+        assert (
+            Router.get_instance()
+            .get_internal_address(actor_ref1.address)
+            .startswith("dummy://")
+        )
+
+        actor_ref2 = await ctx.create_actor(
+            TestActor, address=pool1.external_address, allocate_strategy=RandomSubPool()
+        )
+        assert actor_ref2.address in pool1._config.get_external_addresses()[1:]
+        assert await actor_ref2.add(3) == 3
+        assert is_interprocess_address(
+            Router.get_instance().get_internal_address(actor_ref2.address)
+        )
+
+        actor_ref3 = await ctx.create_actor(
+            TestActor, address=pool2.external_address, allocate_strategy=MainPool()
+        )
+        assert actor_ref3.address == pool2.external_address
+        assert await actor_ref3.add(5) == 5
+        assert (
+            Router.get_instance()
+            .get_internal_address(actor_ref3.address)
+            .startswith("dummy://")
+        )
+
+        actor_ref4 = await ctx.create_actor(
+            TestActor, address=pool2.external_address, allocate_strategy=RandomSubPool()
+        )
+        assert actor_ref4.address in pool2._config.get_external_addresses()[1:]
+        assert await actor_ref4.add(7) == 7
+        assert is_interprocess_address(
+            Router.get_instance().get_internal_address(actor_ref4.address)
+        )
+
+        assert await actor_ref2.add_other(actor_ref4, 3) == 13
+    finally:
+        await pool1.stop()
+        await pool2.stop()
+
+
+@pytest.mark.asyncio
+async def test_parallel_allocate_idle_label():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        labels=[None, "my_label", "my_label"],
+    )
+
+    class _Actor(Actor):
+        def get_pid(self):
+            return os.getpid()
+
+    async with pool:
+        ctx = get_context()
+        strategy = IdleLabel("my_label", "tests")
+        tasks = [
+            ctx.create_actor(
+                _Actor, allocate_strategy=strategy, address=pool.external_address
+            ),
+            ctx.create_actor(
+                _Actor, allocate_strategy=strategy, address=pool.external_address
+            ),
+        ]
+        refs = await asyncio.gather(*tasks)
+        # the two actors must be allocated to different processes, so their pids should differ
+        assert len({await ref.get_pid() for ref in refs}) == 2
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "logging_conf",
+    [
+        {
+            "file": os.path.join(
+                os.path.dirname(os.path.abspath(__file__)), "test-logging.conf"
+            )
+        },
+        {"level": logging.DEBUG},
+        {"level": logging.DEBUG, "format": "%(asctime)s %(message)s"},
+    ],
+)
+async def test_logging_config(logging_conf):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=1,
+        subprocess_start_method=start_method,
+        labels=[None, "my_label"],
+        logging_conf=logging_conf,
+    )
+
+    class _Actor(Actor):
+        def get_logger_level(self):
+            logger = logging.getLogger(__name__)
+            return logger.getEffectiveLevel()
+
+    async with pool:
+        ctx = get_context()
+        strategy = IdleLabel("my_label", "tests")
+        ref = await ctx.create_actor(
+            _Actor, allocate_strategy=strategy, address=pool.external_address
+        )
+        assert await ref.get_logger_level() == logging.DEBUG
+
+
+@pytest.mark.asyncio
+async def test_ref_sub_pool_actor():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=1,
+        subprocess_start_method=start_method,
+    )
+
+    async with pool:
+        ctx = get_context()
+        ref1 = await ctx.create_actor(
+            TestActor, address=pool.external_address, allocate_strategy=RandomSubPool()
+        )
+        sub_address = ref1.address
+        ref2 = await ctx.create_actor(TestActor, address=sub_address)
+        ref2_main = await ctx.actor_ref(ref2.uid, address=pool.external_address)
+        assert ref2_main.address == sub_address
+
+        await ctx.destroy_actor(create_actor_ref(pool.external_address, ref2.uid))
+        assert not await ctx.has_actor(
+            create_actor_ref(pool.external_address, ref2.uid)
+        )
+        assert not await ctx.has_actor(create_actor_ref(sub_address, ref2.uid))
+
+
+class TestUCXActor(Actor):
+    __test__ = False
+
+    def __init__(self, init_val: int):
+        self._init_val = init_val
+
+    def verify(self, enabled_internal_addr: bool):
+        router = Router.get_instance()
+        assert router.external_address.startswith("ucx")
+        assert len(router._mapping) > 0
+        if not enabled_internal_addr:
+            # no internal address
+            assert all(v is None for v in router._mapping.values())
+        else:
+            assert all(v is not None for v in router._mapping.values())
+
+    def add(self, n: int):
+        return self._init_val + n
+
+    async def foo(self, ref, n: int):
+        assert self.address != ref.address
+        return self._init_val + await ref.add(n)
+
+
+@require_ucx
+@pytest.mark.asyncio
+@pytest.mark.parametrize("enable_internal_addr", [False, True])
+async def test_ucx(enable_internal_addr: bool):
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await create_actor_pool(
+        "127.0.0.1",
+        pool_cls=MainActorPool,
+        n_process=2,
+        subprocess_start_method=start_method,
+        external_address_schemes=["ucx"] * 3,
+        enable_internal_addresses=[enable_internal_addr] * 3,
+    )
+
+    async with pool:
+        ctx = get_context()
+        ref1 = await ctx.create_actor(
+            TestUCXActor,
+            1,
+            address=pool.external_address,
+            allocate_strategy=ProcessIndex(0),
+        )
+        await ref1.verify(enable_internal_addr)
+        ref2 = await ctx.create_actor(
+            TestUCXActor,
+            2,
+            address=pool.external_address,
+            allocate_strategy=ProcessIndex(1),
+        )
+        assert await ref1.foo(ref2, 3) == 6
diff --git a/python/xorbits/_mars/oscar/backends/message.pyi b/python/xorbits/_mars/oscar/backends/message.pyi
new file mode 100644
index 000000000..dac3aff7b
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/message.pyi
@@ -0,0 +1,214 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from types import TracebackType
+from typing import Any, Type
+
+from ..core import ActorRef
+
+DEFAULT_PROTOCOL: int = 0
+
+class MessageType(Enum):
+    control = 0
+    result = 1
+    error = 2
+    create_actor = 3
+    destroy_actor = 4
+    has_actor = 5
+    actor_ref = 6
+    send = 7
+    tell = 8
+    cancel = 9
+
+class ControlMessageType(Enum):
+    stop = 0
+    restart = 1
+    sync_config = 2
+    get_config = 3
+    wait_pool_recovered = 4
+    add_sub_pool_actor = 5
+
+class _MessageBase:
+    message_type: MessageType
+    protocol: int
+    message_id: bytes
+    message_trace: list
+    profiling_context: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+        profiling_context: Any = None,
+    ): ...
+    def __repr__(self): ...
+
+class ControlMessage(_MessageBase):
+    message_type = MessageType.control
+
+    address: str
+    control_message_type: ControlMessageType
+    content: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        address: str = None,
+        control_message_type: ControlMessageType = None,
+        content: Any = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class ResultMessage(_MessageBase):
+    message_type = MessageType.result
+
+    result: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        result: Any = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+        profiling_context: Any = None,
+    ): ...
+
+class ErrorMessage(_MessageBase):
+    message_type = MessageType.error
+
+    address: str
+    pid: int
+    error_type: Type
+    error: BaseException
+    traceback: TracebackType
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        address: str = None,
+        pid: int = -1,
+        error_type: Type[BaseException] = None,
+        error: BaseException = None,
+        traceback: TracebackType = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+    def as_instanceof_cause(self) -> BaseException: ...
+
+class CreateActorMessage(_MessageBase):
+    message_type = MessageType.create_actor
+
+    actor_cls: Type
+    actor_id: bytes
+    args: tuple
+    kwargs: dict
+    allocate_strategy: Any
+    from_main: bool
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_cls: Type = None,
+        actor_id: bytes = None,
+        args: tuple = None,
+        kwargs: dict = None,
+        allocate_strategy: Any = None,
+        from_main: bool = False,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class DestroyActorMessage(_MessageBase):
+    message_type = MessageType.destroy_actor
+
+    actor_ref: ActorRef
+    from_main: bool
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        from_main: bool = False,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class HasActorMessage(_MessageBase):
+    message_type = MessageType.has_actor
+
+    actor_ref: ActorRef
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class ActorRefMessage(_MessageBase):
+    message_type = MessageType.actor_ref
+
+    actor_ref: ActorRef
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class SendMessage(_MessageBase):
+    message_type = MessageType.send
+
+    actor_ref: ActorRef
+    content: Any
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        actor_ref: ActorRef = None,
+        content: object = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+        profiling_context: Any = None,
+    ): ...
+
+class TellMessage(SendMessage):
+    message_type = MessageType.tell
+
+class CancelMessage(_MessageBase):
+    message_type = MessageType.cancel
+
+    address: str
+    cancel_message_id: bytes
+
+    def __init__(
+        self,
+        message_id: bytes = None,
+        address: str = None,
+        cancel_message_id: bytes = None,
+        protocol: int = DEFAULT_PROTOCOL,
+        message_trace: list = None,
+    ): ...
+
+class DeserializeMessageFailed(RuntimeError):
+    def __init__(self, message_id: bytes): ...
+    def __str__(self): ...
+
+def new_message_id() -> bytes: ...
diff --git a/python/xorbits/_mars/oscar/backends/message.pyx b/python/xorbits/_mars/oscar/backends/message.pyx
new file mode 100644
index 000000000..a6a5bd987
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/message.pyx
@@ -0,0 +1,551 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from types import TracebackType
+from typing import Any, Type
+
+from ...lib.tblib import pickling_support
+
+from ..._utils cimport new_random_id
+from ...serialization.core cimport Serializer
+
+from ...utils import wrap_exception
+
+from ..core cimport ActorRef
+
+# make sure traceback can be pickled
+pickling_support.install()
+
+cdef int _DEFAULT_PROTOCOL = 0
+DEFAULT_PROTOCOL = _DEFAULT_PROTOCOL
+
+
+class MessageType(Enum):
+    control = 0
+    result = 1
+    error = 2
+    create_actor = 3
+    destroy_actor = 4
+    has_actor = 5
+    actor_ref = 6
+    send = 7
+    tell = 8
+    cancel = 9
+
+
+class ControlMessageType(Enum):
+    stop = 0
+    restart = 1
+    sync_config = 2
+    get_config = 3
+    wait_pool_recovered = 4
+    add_sub_pool_actor = 5
+
+
+cdef class _MessageSerialItem:
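+    # `serialized` holds the message's primitive header fields as a tuple,
+    # while `subs` holds member objects that are serialized separately.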
+    cdef:
+        tuple serialized
+        list subs
+
+    def __cinit__(self, tuple serialized, list subs):
+        self.serialized = serialized
+        self.subs = subs
+
+
+cdef class _MessageBase:
+    message_type: MessageType = None
+
+    cdef:
+        public int protocol
+        public bytes message_id
+        public list message_trace
+        public object profiling_context
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+        object profiling_context = None,
+    ):
+        self.message_id = message_id
+        self.protocol = protocol
+        # A message can be sent within the scope of other messages;
+        # this is mainly used for detecting deadlocks.
+        # E.g. actor `A` sends a message (id: 1) to actor `B`; while
+        # processing it, `B` sends a message (id: 2) back to `A`.
+        # A deadlock occurs because `A` is still waiting for the reply from `B`.
+        # In this case, `message_trace` will be [1, 2], and `A` will find
+        # that id 1 already exists in its inbox, thus detecting the deadlock.
+        self.message_trace = message_trace
+        self.profiling_context = profiling_context
+
+    cdef _MessageSerialItem serial(self):
+        return _MessageSerialItem(
+            (
+                self.message_type.value,
+                self.message_id,
+                self.protocol,
+                self.message_trace,
+                self.profiling_context,
+            ),
+            [],
+        )
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        self.message_id = serialized[1]
+        self.protocol = serialized[2]
+        self.message_trace = serialized[3]
+        self.profiling_context = serialized[4]
+
+    def __repr__(self):
+        cdef list attr_reprs = []
+        for attr in dir(self):
+            if attr.startswith("_") or attr == "message_type":
+                continue
+            val = getattr(self, attr)
+            if callable(val):
+                continue
+            attr_reprs.append(f"{attr}={val!r}")
+        values = ", ".join(attr_reprs)
+        return f"{type(self).__name__}({values})"
+
+
+cdef class ControlMessage(_MessageBase):
+    message_type = MessageType.control
+
+    cdef:
+        public str address
+        public object control_message_type
+        public object content
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        str address = None,
+        object control_message_type: ControlMessageType = None,
+        object content: Any = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.address = address
+        self.control_message_type = control_message_type
+        self.content = content
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.address,
+            self.control_message_type,
+        )
+        item.subs = [self.content]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.address = serialized[-2]
+        self.control_message_type = serialized[-1]
+        self.content = subs[0]
+
+
+cdef class ResultMessage(_MessageBase):
+    message_type = MessageType.result
+
+    cdef:
+        public object result
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        object result: Any = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+        object profiling_context = None,
+    ):
+        _MessageBase.__init__(
+            self,
+            message_id,
+            protocol=protocol,
+            message_trace=message_trace,
+            profiling_context=profiling_context,
+        )
+        self.result = result
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.subs = [self.result]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.result = subs[0]
+
+
+class _AsCauseBase:
+    def __str__(self):
+        return f"[address={self.address}, pid={self.pid}] {str(self.__wrapped__)}"
+
+
+cdef class ErrorMessage(_MessageBase):
+    message_type = MessageType.error
+
+    cdef:
+        public str address
+        public long pid
+        public type error_type
+        public object error
+        public object traceback
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        str address: str = None,
+        long pid = -1,
+        type error_type: Type[BaseException] = None,
+        object error: BaseException = None,
+        object traceback: TracebackType = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.address = address
+        self.pid = pid
+        self.error_type = error_type
+        self.error = error
+        self.traceback = traceback
+
+    def as_instanceof_cause(self):
+        # Check that as_instanceof_cause is not applied recursively.
+        #
+        # e.g. SubtaskRunnerActor.run_subtask will reraise the exception raised
+        # from SubtaskProcessorActor.run. But these two actors are in the same
+        # process, so we don't want to append a duplicate address and pid to the
+        # error message.
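+        #
+        # The wrapped exception renders as, for example (illustrative values):
+        #     "[address=127.0.0.1:12345, pid=4321] ValueError('...')"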
+        if issubclass(self.error_type, _AsCauseBase):
+            return self.error.with_traceback(self.traceback)
+
+        return wrap_exception(
+            self.error,
+            (_AsCauseBase,),
+            traceback=self.traceback,
+            attr_dict=dict(address=self.address, pid=self.pid),
+        )
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (self.address, self.pid)
+        item.subs = [self.error_type, self.error, self.traceback]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.address = serialized[-2]
+        self.pid = serialized[-1]
+        self.error_type = subs[0]
+        self.error = subs[1]
+        self.traceback = subs[2]
+
+
+cdef class CreateActorMessage(_MessageBase):
+    message_type = MessageType.create_actor
+
+    cdef:
+        public type actor_cls
+        public bytes actor_id
+        public tuple args
+        public dict kwargs
+        public object allocate_strategy
+        public object from_main
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        type actor_cls = None,
+        bytes actor_id = None,
+        tuple args = None,
+        dict kwargs = None,
+        object allocate_strategy = None,
+        object from_main: bool = False,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_cls = actor_cls
+        self.actor_id = actor_id
+        self.args = args
+        self.kwargs = kwargs
+        self.allocate_strategy = allocate_strategy
+        self.from_main = from_main
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_id, self.allocate_strategy, self.from_main
+        )
+        item.subs = [self.actor_cls, self.args, self.kwargs]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_id = serialized[-3]
+        self.allocate_strategy = serialized[-2]
+        self.from_main = serialized[-1]
+        self.actor_cls = subs[0]
+        self.args = subs[1]
+        self.kwargs = subs[2]
+
+
+cdef class DestroyActorMessage(_MessageBase):
+    message_type = MessageType.destroy_actor
+
+    cdef:
+        public ActorRef actor_ref
+        public object from_main
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        object from_main: bool = False,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_ref = actor_ref
+        self.from_main = from_main
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid, self.from_main
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-3], serialized[-2])
+        self.from_main = serialized[-1]
+
+
+cdef class HasActorMessage(_MessageBase):
+    message_type = MessageType.has_actor
+
+    cdef:
+        public ActorRef actor_ref
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_ref = actor_ref
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-2], serialized[-1])
+
+
+cdef class ActorRefMessage(_MessageBase):
+    message_type = MessageType.actor_ref
+
+    cdef:
+        public ActorRef actor_ref
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.actor_ref = actor_ref
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-2], serialized[-1])
+
+
+cdef class SendMessage(_MessageBase):
+    message_type = MessageType.send
+
+    cdef:
+        public ActorRef actor_ref
+        public object content
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        ActorRef actor_ref = None,
+        object content = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+        object profiling_context = None,
+    ):
+        _MessageBase.__init__(
+            self,
+            message_id,
+            protocol=protocol,
+            message_trace=message_trace,
+            profiling_context=profiling_context,
+        )
+        self.actor_ref = actor_ref
+        self.content = content
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.actor_ref.address, self.actor_ref.uid
+        )
+        item.subs = [self.content]
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.actor_ref = ActorRef(serialized[-2], serialized[-1])
+        self.content = subs[0]
+
+
+cdef class TellMessage(SendMessage):
+    message_type = MessageType.tell
+
+
+cdef class CancelMessage(_MessageBase):
+    message_type = MessageType.cancel
+
+    cdef:
+        public str address
+        public bytes cancel_message_id
+
+    def __init__(
+        self,
+        bytes message_id = None,
+        str address = None,
+        bytes cancel_message_id = None,
+        int protocol = _DEFAULT_PROTOCOL,
+        list message_trace = None,
+    ):
+        _MessageBase.__init__(
+            self, message_id, protocol=protocol, message_trace=message_trace
+        )
+        self.address = address
+        self.cancel_message_id = cancel_message_id
+
+    cdef _MessageSerialItem serial(self):
+        cdef _MessageSerialItem item = _MessageBase.serial(self)
+        item.serialized += (
+            self.address, self.cancel_message_id
+        )
+        return item
+
+    cdef deserial_members(self, tuple serialized, list subs):
+        _MessageBase.deserial_members(self, serialized, subs)
+        self.address = serialized[-2]
+        self.cancel_message_id = serialized[-1]
+
+
+cdef dict _message_type_to_message_cls = {
+    MessageType.control.value: ControlMessage,
+    MessageType.result.value: ResultMessage,
+    MessageType.error.value: ErrorMessage,
+    MessageType.create_actor.value: CreateActorMessage,
+    MessageType.destroy_actor.value: DestroyActorMessage,
+    MessageType.has_actor.value: HasActorMessage,
+    MessageType.actor_ref.value: ActorRefMessage,
+    MessageType.send.value: SendMessage,
+    MessageType.tell.value: TellMessage,
+    MessageType.cancel.value: CancelMessage,
+}
+
+
+class DeserializeMessageFailed(RuntimeError):
+    def __init__(self, message_id):
+        self.message_id = message_id
+
+    def __str__(self):
+        return f"Deserialize {self.message_id} failed"
+
+
+cdef class MessageSerializer(Serializer):
+    serializer_id = 32105
+
+    cpdef serial(self, object obj, dict context):
+        cdef _MessageBase msg = <_MessageBase>obj
+        cdef _MessageSerialItem ser_item
+
+        assert msg.protocol == _DEFAULT_PROTOCOL, "only support protocol 0 for now"
+        ser_item = msg.serial()
+        return ser_item.serialized, ser_item.subs, False
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef _MessageBase msg
+
+        msg_type = serialized[0]
+        msg = _message_type_to_message_cls[msg_type]()
+        msg.deserial_members(serialized, subs)
+        return msg
+
+    cpdef on_deserial_error(
+        self,
+        tuple serialized,
+        dict context,
+        list subs_serialized,
+        int error_index,
+        object exc,
+    ):
+        message_id = serialized[1]  # pos of message_id field
+        try:
+            raise DeserializeMessageFailed(message_id) from exc
+        except BaseException as new_ex:
+            return new_ex
+
+
+# register message serializer
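+# (registering the base class is expected to cover every concrete message
+# type as well, since they all subclass _MessageBase)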
+MessageSerializer.register(_MessageBase)
+
+
+cpdef bytes new_message_id():
+    return new_random_id(32)
diff --git a/python/xorbits/_mars/oscar/backends/pool.py b/python/xorbits/_mars/oscar/backends/pool.py
new file mode 100644
index 000000000..cd3b1d183
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/pool.py
@@ -0,0 +1,1519 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import contextlib
+import itertools
+import logging
+import multiprocessing
+import os
+import threading
+import traceback
+from abc import ABC, ABCMeta, abstractmethod
+from typing import Callable, Coroutine, Dict, List, Optional, Type, TypeVar, Union
+
+from ...core.entrypoints import init_extension_entrypoints
+from ...metrics import init_metrics
+from ...utils import (
+    TypeDispatcher,
+    implements,
+    lazy_import,
+    register_asyncio_task_timeout_detector,
+    to_binary,
+)
+from ..api import Actor
+from ..core import ActorRef, register_local_pool
+from ..debug import debug_async_timeout, record_message_trace
+from ..errors import (
+    ActorAlreadyExist,
+    ActorNotExist,
+    CannotCancelTask,
+    SendMessageFailed,
+    ServerClosed,
+)
+from ..utils import create_actor_ref
+from .allocate_strategy import AddressSpecified, allocated_type
+from .communication import Channel, Server, gen_local_address, get_server_type
+from .communication.errors import ChannelClosed
+from .config import ActorPoolConfig
+from .core import ActorCaller, ResultMessageType
+from .message import (
+    DEFAULT_PROTOCOL,
+    ActorRefMessage,
+    CancelMessage,
+    ControlMessage,
+    ControlMessageType,
+    CreateActorMessage,
+    DestroyActorMessage,
+    ErrorMessage,
+    HasActorMessage,
+    MessageType,
+    ResultMessage,
+    SendMessage,
+    TellMessage,
+    _MessageBase,
+    new_message_id,
+)
+from .router import Router
+
+logger = logging.getLogger(__name__)
+ray = lazy_import("ray")
+
+
+@contextlib.contextmanager
+def _disable_log_temporally():
+    if os.getenv("CUDA_VISIBLE_DEVICES") == "-1":
+        # disable logging when CUDA_VISIBLE_DEVICES == -1,
+        # as the many log messages from ptxcompiler may distract users
+        try:
+            logging.disable(level=logging.ERROR)
+            yield
+        finally:
+            logging.disable(level=logging.NOTSET)
+    else:
+        yield
+
+
+class _ErrorProcessor:
+    def __init__(self, address: str, message_id: bytes, protocol):
+        self._address = address
+        self._message_id = message_id
+        self._protocol = protocol
+        self.result = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.result is None:
+            self.result = ErrorMessage(
+                self._message_id,
+                self._address,
+                os.getpid(),
+                exc_type,
+                exc_val,
+                exc_tb,
+                protocol=self._protocol,
+            )
+            return True
+
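+# A minimal usage sketch (an assumption for illustration, not code from this
+# module): a message handler can wrap its body with _ErrorProcessor so that
+# any exception is turned into an ErrorMessage instead of propagating, e.g.
+#
+#     with _ErrorProcessor(address, message.message_id, message.protocol) as p:
+#         p.result = await handler(message)
+#     return p.result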
+
+def _register_message_handler(pool_type: Type["AbstractActorPool"]):
+    pool_type._message_handler = dict()
+    for message_type, handler in [
+        (MessageType.create_actor, pool_type.create_actor),
+        (MessageType.destroy_actor, pool_type.destroy_actor),
+        (MessageType.has_actor, pool_type.has_actor),
+        (MessageType.actor_ref, pool_type.actor_ref),
+        (MessageType.send, pool_type.send),
+        (MessageType.tell, pool_type.tell),
+        (MessageType.cancel, pool_type.cancel),
+        (MessageType.control, pool_type.handle_control_command),
+    ]:
+        pool_type._message_handler[message_type] = handler
+    return pool_type
+
+
+class AbstractActorPool(ABC):
+    __slots__ = (
+        "process_index",
+        "label",
+        "external_address",
+        "internal_address",
+        "env",
+        "_servers",
+        "_router",
+        "_config",
+        "_stopped",
+        "_actors",
+        "_caller",
+        "_process_messages",
+        "_asyncio_task_timeout_detector_task",
+    )
+
+    def __init__(
+        self,
+        process_index: int,
+        label: str,
+        external_address: str,
+        internal_address: str,
+        env: Dict,
+        router: Router,
+        config: ActorPoolConfig,
+        servers: List[Server],
+    ):
+        # register local pool for local actor lookup.
+        # The pool is weakrefed, so we don't need to unregister it.
+        register_local_pool(external_address, self)
+        self.process_index = process_index
+        self.label = label
+        self.external_address = external_address
+        self.internal_address = internal_address
+        self.env = env
+        self._router = router
+        self._config = config
+        self._servers = servers
+
+        self._stopped = asyncio.Event()
+
+        # states
+        # actor id -> actor
+        self._actors: Dict[bytes, Actor] = dict()
+        # message id -> future
+        self._process_messages: Dict[bytes, asyncio.Future] = dict()
+
+        # manage async actor callers
+        self._caller = ActorCaller()
+        self._asyncio_task_timeout_detector_task = (
+            register_asyncio_task_timeout_detector()
+        )
+        # load third party extensions.
+        init_extension_entrypoints()
+        # init metrics
+        metric_configs = self._config.get_metric_configs()
+        metric_backend = metric_configs.get("backend")
+        init_metrics(metric_backend, config=metric_configs.get(metric_backend))
+
+    @property
+    def router(self):
+        return self._router
+
+    @abstractmethod
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        """
+        Create an actor.
+
+        Parameters
+        ----------
+        message: CreateActorMessage
+            message to create an actor.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def has_actor(self, message: HasActorMessage) -> ResultMessage:
+        """
+        Check if an actor exists or not.
+
+        Parameters
+        ----------
+        message: HasActorMessage
+            message
+
+        Returns
+        -------
+        result_message
+            result message indicating whether the actor exists.
+        """
+
+    @abstractmethod
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        """
+        Destroy an actor.
+
+        Parameters
+        ----------
+        message: DestroyActorMessage
+            message to destroy an actor.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        """
+        Get an actor's ref.
+
+        Parameters
+        ----------
+        message: ActorRefMessage
+            message to get an actor's ref.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def send(self, message: SendMessage) -> ResultMessageType:
+        """
+        Send a message to some actor.
+
+        Parameters
+        ----------
+        message: SendMessage
+            Message to send.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def tell(self, message: TellMessage) -> ResultMessageType:
+        """
+        Tell a message to some actor (fire-and-forget).
+
+        Parameters
+        ----------
+        message: TellMessage
+            Message to tell.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+
+    @abstractmethod
+    async def cancel(self, message: CancelMessage) -> ResultMessageType:
+        """
+        Cancel a message that was sent previously.
+
+        Parameters
+        ----------
+        message: CancelMessage
+            Cancel message.
+
+        Returns
+        -------
+        result_message
+            result or error message
+        """
+
+    def _sync_pool_config(self, actor_pool_config: ActorPoolConfig):
+        self._config = actor_pool_config
+        # remove router from global one
+        global_router = Router.get_instance()
+        global_router.remove_router(self._router)
+        # update router
+        self._router.set_mapping(actor_pool_config.external_to_internal_address_map)
+        # update global router
+        global_router.add_router(self._router)
+
+    async def handle_control_command(
+        self, message: ControlMessage
+    ) -> ResultMessageType:
+        """
+        Handle control command.
+
+        Parameters
+        ----------
+        message: ControlMessage
+            Control message.
+
+        Returns
+        -------
+        result_message
+            result or error message.
+        """
+        with _ErrorProcessor(
+            self.external_address, message.message_id, protocol=message.protocol
+        ) as processor:
+            content = True
+            if message.control_message_type == ControlMessageType.stop:
+                await self.stop()
+            elif message.control_message_type == ControlMessageType.sync_config:
+                self._sync_pool_config(message.content)
+            elif message.control_message_type == ControlMessageType.get_config:
+                if message.content == "main_pool_address":
+                    main_process_index = self._config.get_process_indexes()[0]
+                    content = self._config.get_pool_config(main_process_index)[
+                        "external_address"
+                    ][0]
+                else:
+                    content = self._config
+            else:  # pragma: no cover
+                raise TypeError(
+                    f"Unable to handle control message "
+                    f"with type {message.control_message_type}"
+                )
+            processor.result = ResultMessage(
+                message.message_id, content, protocol=message.protocol
+            )
+
+        return processor.result
+
+    async def _run_coro(self, message_id: bytes, coro: Coroutine):
+        self._process_messages[message_id] = asyncio.tasks.current_task()
+        try:
+            return await coro
+        finally:
+            self._process_messages.pop(message_id, None)
+
+    async def _send_channel(
+        self, result: _MessageBase, channel: Channel, resend_failure: bool = True
+    ):
+        try:
+            await channel.send(result)
+        except (ChannelClosed, ConnectionResetError):
+            if not self._stopped.is_set():
+                raise
+        except Exception as ex:
+            logger.exception(
+                "Error when sending message %s from %s to %s",
+                result.message_id.hex(),
+                channel.local_address,
+                channel.dest_address,
+            )
+            if not resend_failure:  # pragma: no cover
+                raise
+
+            with _ErrorProcessor(
+                self.external_address, result.message_id, result.protocol
+            ) as processor:
+                error_msg = (
+                    f"Error when sending message {result.message_id.hex()}. "
+                    f"Caused by {ex!r}. "
+                )
+                if isinstance(result, ErrorMessage):
+                    format_tb = "\n".join(traceback.format_tb(result.traceback))
+                    error_msg += (
+                        f"\nOriginal error: {result.error!r}"
+                        f"Traceback: \n{format_tb}"
+                    )
+                else:
+                    error_msg += "See server logs for more details"
+                raise SendMessageFailed(error_msg) from None
+            await self._send_channel(processor.result, channel, resend_failure=False)
+
+    async def process_message(self, message: _MessageBase, channel: Channel):
+        handler = self._message_handler[message.message_type]
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            # use `%.500s` to avoid printing overly long messages
+            with debug_async_timeout(
+                "process_message_timeout",
+                "Process message %.500s of channel %s timeout.",
+                message,
+                channel,
+            ):
+                processor.result = await self._run_coro(
+                    message.message_id, handler(self, message)
+                )
+
+        await self._send_channel(processor.result, channel)
+
+    async def call(self, dest_address: str, message: _MessageBase) -> ResultMessageType:
+        return await self._caller.call(self._router, dest_address, message)
+
+    @staticmethod
+    def _parse_config(config: Dict, kw: Dict) -> Dict:
+        actor_pool_config: ActorPoolConfig = config.pop("actor_pool_config")
+        kw["config"] = actor_pool_config
+        kw["process_index"] = process_index = config.pop("process_index")
+        curr_pool_config = actor_pool_config.get_pool_config(process_index)
+        kw["label"] = curr_pool_config["label"]
+        external_addresses = curr_pool_config["external_address"]
+        kw["external_address"] = external_addresses[0]
+        kw["internal_address"] = curr_pool_config["internal_address"]
+        kw["router"] = Router(
+            external_addresses,
+            gen_local_address(process_index),
+            actor_pool_config.external_to_internal_address_map,
+            comm_config=actor_pool_config.get_comm_config(),
+        )
+        kw["env"] = curr_pool_config["env"]
+
+        if config:  # pragma: no cover
+            raise TypeError(
+                f"Creating pool got unexpected " f'arguments: {",".join(config)}'
+            )
+
+        return kw
+
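+    # A rough sketch of the ``config`` dict consumed by ``_parse_config`` and
+    # ``create`` (keys taken from this module, values are illustrative):
+    #
+    #     {
+    #         "actor_pool_config": actor_pool_config,  # an ActorPoolConfig instance
+    #         "process_index": 0,                      # process index of this pool
+    #     }
+    #
+    # ``MainActorPoolBase._parse_config`` additionally pops ``start_method``,
+    # ``auto_recover``, ``on_process_down`` and ``on_process_recover``.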
+    @classmethod
+    @abstractmethod
+    async def create(cls, config: Dict) -> "AbstractActorPool":
+        """
+        Create an actor pool.
+
+        Parameters
+        ----------
+        config: Dict
+            configurations.
+
+        Returns
+        -------
+        actor_pool:
+            Actor pool.
+        """
+
+    async def start(self):
+        if self._stopped.is_set():
+            raise RuntimeError("pool has been stopped, cannot start again")
+        start_servers = [server.start() for server in self._servers]
+        await asyncio.gather(*start_servers)
+
+    async def join(self, timeout: float = None):
+        wait_stopped = asyncio.create_task(self._stopped.wait())
+
+        try:
+            await asyncio.wait_for(wait_stopped, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):  # pragma: no cover
+            wait_stopped.cancel()
+
+    async def stop(self):
+        try:
+            # clean global router
+            router = Router.get_instance()
+            if router is not None:
+                router.remove_router(self._router)
+            stop_tasks = []
+            # stop all servers
+            stop_tasks.extend([server.stop() for server in self._servers])
+            # stop all clients
+            stop_tasks.append(self._caller.stop())
+            await asyncio.gather(*stop_tasks)
+
+            self._servers = []
+            if self._asyncio_task_timeout_detector_task:  # pragma: no cover
+                self._asyncio_task_timeout_detector_task.cancel()
+        finally:
+            self._stopped.set()
+
+    @property
+    def stopped(self) -> bool:
+        return self._stopped.is_set()
+
+    async def on_new_channel(self, channel: Channel):
+        while not self._stopped.is_set():
+            try:
+                message = await channel.recv()
+            except EOFError:
+                # no data to read, check channel
+                try:
+                    await channel.close()
+                except (ConnectionError, EOFError):
+                    # close failed, ignore
+                    pass
+                return
+            asyncio.create_task(self.process_message(message, channel))
+            # delete to release the reference to the message
+            del message
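+            # yield control so the newly created processing task can start running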
+            await asyncio.sleep(0)
+
+    async def __aenter__(self):
+        await self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.stop()
+
+
+class ActorPoolBase(AbstractActorPool, metaclass=ABCMeta):
+    __slots__ = ()
+
+    @implements(AbstractActorPool.create_actor)
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_id
+            if actor_id in self._actors:
+                raise ActorAlreadyExist(
+                    f"Actor {actor_id} already exist, cannot create"
+                )
+
+            actor = message.actor_cls(*message.args, **message.kwargs)
+            actor.uid = actor_id
+            actor.address = address = self.external_address
+            self._actors[actor_id] = actor
+            await self._run_coro(message.message_id, actor.__post_create__())
+
+            result = ActorRef(address, actor_id)
+            # assemble the result message
+            processor.result = ResultMessage(
+                message.message_id, result, protocol=message.protocol
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.has_actor)
+    async def has_actor(self, message: HasActorMessage) -> ResultMessage:
+        result = ResultMessage(
+            message.message_id,
+            message.actor_ref.uid in self._actors,
+            protocol=message.protocol,
+        )
+        return result
+
+    @implements(AbstractActorPool.destroy_actor)
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_ref.uid
+            try:
+                actor = self._actors[actor_id]
+            except KeyError:
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            await self._run_coro(message.message_id, actor.__pre_destroy__())
+            del self._actors[actor_id]
+
+            processor.result = ResultMessage(
+                message.message_id, actor_id, protocol=message.protocol
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.actor_ref)
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_ref.uid
+            if actor_id not in self._actors:
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            result = ResultMessage(
+                message.message_id,
+                ActorRef(self.external_address, actor_id),
+                protocol=message.protocol,
+            )
+            processor.result = result
+        return processor.result
+
+    @implements(AbstractActorPool.send)
+    async def send(self, message: SendMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor, record_message_trace(message):
+            actor_id = message.actor_ref.uid
+            if actor_id not in self._actors:
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            coro = self._actors[actor_id].__on_receive__(message.content)
+            result = await self._run_coro(message.message_id, coro)
+            processor.result = ResultMessage(
+                message.message_id,
+                result,
+                protocol=message.protocol,
+                profiling_context=message.profiling_context,
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.tell)
+    async def tell(self, message: TellMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            actor_id = message.actor_ref.uid
+            if actor_id not in self._actors:  # pragma: no cover
+                raise ActorNotExist(f"Actor {actor_id} does not exist")
+            call = self._actors[actor_id].__on_receive__(message.content)
+            # run asynchronously; tell does not care about the result
+            asyncio.create_task(call)
+            await asyncio.sleep(0)
+            processor.result = ResultMessage(
+                message.message_id,
+                None,
+                protocol=message.protocol,
+                profiling_context=message.profiling_context,
+            )
+        return processor.result
+
+    @implements(AbstractActorPool.cancel)
+    async def cancel(self, message: CancelMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            future = self._process_messages.get(message.cancel_message_id)
+            if future is None or future.done():  # pragma: no cover
+                raise CannotCancelTask(
+                    "Task not exists, maybe it is done or cancelled already"
+                )
+            future.cancel()
+            processor.result = ResultMessage(
+                message.message_id, True, protocol=message.protocol
+            )
+        return processor.result
+
+    @staticmethod
+    def _set_global_router(router: Router):
+        # be cautious about setting the global router:
+        # for instance, multiple main pools may be created in the same process
+
+        # get default router or create an empty one
+        default_router = Router.get_instance_or_empty()
+        Router.set_instance(default_router)
+        # append this router to global
+        default_router.add_router(router)
+
+    @staticmethod
+    def _update_stored_addresses(
+        servers: List[Server],
+        raw_addresses: List[str],
+        actor_pool_config: ActorPoolConfig,
+        kw: Dict,
+    ):
+        process_index = kw["process_index"]
+        curr_pool_config = actor_pool_config.get_pool_config(process_index)
+        external_addresses = curr_pool_config["external_address"]
+        external_address_set = set(external_addresses)
+
+        kw["servers"] = servers
+
+        new_external_addresses = [
+            server.address
+            for server, raw_address in zip(servers, raw_addresses)
+            if raw_address in external_address_set
+        ]
+
+        if external_address_set != set(new_external_addresses):
+            external_addresses = new_external_addresses
+            actor_pool_config.reset_pool_external_address(
+                process_index, external_addresses
+            )
+            external_addresses = curr_pool_config["external_address"]
+
+            logger.debug(
+                "External address of process index %s updated to %s",
+                process_index,
+                external_addresses[0],
+            )
+            if kw["internal_address"] == kw["external_address"]:
+                # the internal address may be the same as the external address on Windows
+                kw["internal_address"] = external_addresses[0]
+            kw["external_address"] = external_addresses[0]
+
+            kw["router"] = Router(
+                external_addresses,
+                gen_local_address(process_index),
+                actor_pool_config.external_to_internal_address_map,
+                comm_config=actor_pool_config.get_comm_config(),
+            )
+
+    @classmethod
+    async def _create_servers(
+        cls, addresses: List[str], channel_handler: Callable, config: dict
+    ):
+        assert len(set(addresses)) == len(addresses)
+        # create servers
+        create_server_tasks = []
+        for addr in addresses:
+            server_type = get_server_type(addr)
+            extra_config = server_type.parse_config(config)
+            server_config = dict(address=addr, handle_channel=channel_handler)
+            server_config.update(extra_config)
+            task = asyncio.create_task(server_type.create(server_config))
+            create_server_tasks.append(task)
+
+        await asyncio.gather(*create_server_tasks)
+        return [f.result() for f in create_server_tasks]
+
+    @classmethod
+    @implements(AbstractActorPool.create)
+    async def create(cls, config: Dict) -> "ActorPoolType":
+        config = config.copy()
+        kw = dict()
+        cls._parse_config(config, kw)
+        process_index: int = kw["process_index"]
+        actor_pool_config = kw["config"]  # type: ActorPoolConfig
+        cur_pool_config = actor_pool_config.get_pool_config(process_index)
+        external_addresses = cur_pool_config["external_address"]
+        internal_address = kw["internal_address"]
+
+        # import predefined modules
+        modules = cur_pool_config["modules"] or []
+        for mod in modules:
+            __import__(mod, globals(), locals(), [])
+        # make sure all lazy imports loaded
+        with _disable_log_temporally():
+            TypeDispatcher.reload_all_lazy_handlers()
+
+        def handle_channel(channel):
+            return pool.on_new_channel(channel)
+
+        # create servers
+        server_addresses = list(external_addresses)
+        if internal_address:
+            server_addresses.append(internal_address)
+        server_addresses.append(gen_local_address(process_index))
+        server_addresses = sorted(set(server_addresses))
+        servers = await cls._create_servers(
+            server_addresses, handle_channel, actor_pool_config.get_comm_config()
+        )
+        cls._update_stored_addresses(servers, server_addresses, actor_pool_config, kw)
+
+        # set default router
+        # actor context would be able to use exact client
+        cls._set_global_router(kw["router"])
+
+        # create pool
+        pool = cls(**kw)
+        return pool
+
+
+ActorPoolType = TypeVar("ActorPoolType", bound=AbstractActorPool)
+MainActorPoolType = TypeVar("MainActorPoolType", bound="MainActorPoolBase")
+SubProcessHandle = Union[multiprocessing.Process, "ray.actor.ActorHandle"]
+
+
+class SubActorPoolBase(ActorPoolBase):
+    __slots__ = ("_main_address",)
+
+    def __init__(
+        self,
+        process_index: int,
+        label: str,
+        external_address: str,
+        internal_address: str,
+        env: Dict,
+        router: Router,
+        config: ActorPoolConfig,
+        servers: List[Server],
+        main_address: str,
+    ):
+        super().__init__(
+            process_index,
+            label,
+            external_address,
+            internal_address,
+            env,
+            router,
+            config,
+            servers,
+        )
+        self._main_address = main_address
+
+    async def notify_main_pool_to_destroy(
+        self, message: DestroyActorMessage
+    ):  # pragma: no cover
+        await self.call(self._main_address, message)
+
+    async def notify_main_pool_to_create(self, message: CreateActorMessage):
+        reg_message = ControlMessage(
+            new_message_id(),
+            self.external_address,
+            ControlMessageType.add_sub_pool_actor,
+            (self.external_address, message.allocate_strategy, message),
+        )
+        await self.call(self._main_address, reg_message)
+
+    @implements(AbstractActorPool.create_actor)
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        result = await super().create_actor(message)
+        if not message.from_main:
+            await self.notify_main_pool_to_create(message)
+        return result
+
+    @implements(AbstractActorPool.actor_ref)
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        result = await super().actor_ref(message)
+        if isinstance(result, ErrorMessage):
+            # need a new message id to call main actor
+            main_message = ActorRefMessage(
+                new_message_id(),
+                create_actor_ref(self._main_address, message.actor_ref.uid),
+            )
+            result = await self.call(self._main_address, main_message)
+            # rewrite the message_id back to that of the original request
+            result.message_id = message.message_id
+        return result
+
+    @implements(AbstractActorPool.destroy_actor)
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        result = await super().destroy_actor(message)
+        if isinstance(result, ResultMessage) and not message.from_main:
+            # sync back to main actor pool
+            await self.notify_main_pool_to_destroy(message)
+        return result
+
+    @implements(AbstractActorPool.handle_control_command)
+    async def handle_control_command(
+        self, message: ControlMessage
+    ) -> ResultMessageType:
+        if message.control_message_type == ControlMessageType.sync_config:
+            self._main_address = message.address
+        return await super().handle_control_command(message)
+
+    @staticmethod
+    def _parse_config(config: Dict, kw: Dict) -> Dict:
+        kw = AbstractActorPool._parse_config(config, kw)
+        config: ActorPoolConfig = kw["config"]
+        main_process_index = config.get_process_indexes()[0]
+        kw["main_address"] = config.get_pool_config(main_process_index)[
+            "external_address"
+        ][0]
+        return kw
+
+
+class MainActorPoolBase(ActorPoolBase):
+    __slots__ = (
+        "_allocated_actors",
+        "sub_actor_pool_manager",
+        "_auto_recover",
+        "_monitor_task",
+        "_on_process_down",
+        "_on_process_recover",
+        "_recover_events",
+    )
+
+    def __init__(
+        self,
+        process_index: int,
+        label: str,
+        external_address: str,
+        internal_address: str,
+        env: Dict,
+        router: Router,
+        config: ActorPoolConfig,
+        servers: List[Server],
+        subprocess_start_method: str = None,
+        auto_recover: Union[str, bool] = "actor",
+        on_process_down: Callable[[MainActorPoolType, str], None] = None,
+        on_process_recover: Callable[[MainActorPoolType, str], None] = None,
+    ):
+        super().__init__(
+            process_index,
+            label,
+            external_address,
+            internal_address,
+            env,
+            router,
+            config,
+            servers,
+        )
+        self._subprocess_start_method = subprocess_start_method
+
+        # auto recovering
+        self._auto_recover = auto_recover
+        self._monitor_task: Optional[asyncio.Task] = None
+        self._on_process_down = on_process_down
+        self._on_process_recover = on_process_recover
+        self._recover_events: Dict[str, asyncio.Event] = dict()
+
+        # states
+        self._allocated_actors: allocated_type = {
+            addr: dict() for addr in self._config.get_external_addresses()
+        }
+        self._allocation_lock = threading.Lock()
+
+        self.sub_processes: Dict[str, SubProcessHandle] = dict()
+
+    _process_index_gen = itertools.count()
+
+    @classmethod
+    def process_index_gen(cls, address):
+        # make sure different processes do not share process indexes
+        pid = os.getpid()
+        for idx in cls._process_index_gen:
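+            # combine the pid with a per-process counter so that indexes
+            # generated by different processes do not collide,
+            # e.g. pid=1234, idx=3 -> 1234 * 2**16 + 3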
+            yield (pid << 16) + idx
+
+    @property
+    def _sub_processes(self):
+        return self.sub_processes
+
+    @implements(AbstractActorPool.create_actor)
+    async def create_actor(self, message: CreateActorMessage) -> ResultMessageType:
+        with _ErrorProcessor(
+            address=self.external_address,
+            message_id=message.message_id,
+            protocol=message.protocol,
+        ) as processor:
+            allocate_strategy = message.allocate_strategy
+            with self._allocation_lock:
+                # get allocated address according to corresponding strategy
+                address = allocate_strategy.get_allocated_address(
+                    self._config, self._allocated_actors
+                )
+                # set placeholder to make sure this label is occupied
+                self._allocated_actors[address][None] = (allocate_strategy, message)
+            if address == self.external_address:
+                # creating actor on main actor pool
+                result = await super().create_actor(message)
+                if isinstance(result, ResultMessage):
+                    self._allocated_actors[self.external_address][result.result] = (
+                        allocate_strategy,
+                        message,
+                    )
+                processor.result = result
+            else:
+                # creating actor on sub actor pool
+                # rewrite allocate strategy to AddressSpecified
+                new_allocate_strategy = AddressSpecified(address)
+                new_create_actor_message = CreateActorMessage(
+                    message.message_id,
+                    message.actor_cls,
+                    message.actor_id,
+                    message.args,
+                    message.kwargs,
+                    allocate_strategy=new_allocate_strategy,
+                    from_main=True,
+                    protocol=message.protocol,
+                    message_trace=message.message_trace,
+                )
+                result = await self.call(address, new_create_actor_message)
+                if isinstance(result, ResultMessage):
+                    self._allocated_actors[address][result.result] = (
+                        allocate_strategy,
+                        new_create_actor_message,
+                    )
+                processor.result = result
+
+            # revert placeholder
+            self._allocated_actors[address].pop(None, None)
+
+        return processor.result
+
+    @implements(AbstractActorPool.has_actor)
+    async def has_actor(self, message: HasActorMessage) -> ResultMessage:
+        actor_ref = message.actor_ref
+        # lookup allocated
+        for address, item in self._allocated_actors.items():
+            ref = create_actor_ref(address, to_binary(actor_ref.uid))
+            if ref in item:
+                return ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+
+        return ResultMessage(message.message_id, False, protocol=message.protocol)
+
+    @implements(AbstractActorPool.destroy_actor)
+    async def destroy_actor(self, message: DestroyActorMessage) -> ResultMessageType:
+        actor_ref_message = ActorRefMessage(
+            message.message_id, message.actor_ref, protocol=message.protocol
+        )
+        result = await self.actor_ref(actor_ref_message)
+        if not isinstance(result, ResultMessage):
+            return result
+        real_actor_ref = result.result
+        if real_actor_ref.address == self.external_address:
+            result = await super().destroy_actor(message)
+            if result.message_type == MessageType.error:
+                return result
+            del self._allocated_actors[self.external_address][real_actor_ref]
+            return ResultMessage(
+                message.message_id, real_actor_ref.uid, protocol=message.protocol
+            )
+        # remove allocated actor ref
+        self._allocated_actors[real_actor_ref.address].pop(real_actor_ref, None)
+        new_destroy_message = DestroyActorMessage(
+            message.message_id,
+            real_actor_ref,
+            from_main=True,
+            protocol=message.protocol,
+        )
+        return await self.call(real_actor_ref.address, new_destroy_message)
+
+    @implements(AbstractActorPool.send)
+    async def send(self, message: SendMessage) -> ResultMessageType:
+        if message.actor_ref.uid in self._actors:
+            return await super().send(message)
+        actor_ref_message = ActorRefMessage(
+            message.message_id, message.actor_ref, protocol=message.protocol
+        )
+        result = await self.actor_ref(actor_ref_message)
+        if not isinstance(result, ResultMessage):
+            return result
+        actor_ref = result.result
+        new_send_message = SendMessage(
+            message.message_id,
+            actor_ref,
+            message.content,
+            protocol=message.protocol,
+            message_trace=message.message_trace,
+        )
+        return await self.call(actor_ref.address, new_send_message)
+
+    @implements(AbstractActorPool.tell)
+    async def tell(self, message: TellMessage) -> ResultMessageType:
+        if message.actor_ref.uid in self._actors:
+            return await super().tell(message)
+        actor_ref_message = ActorRefMessage(
+            message.message_id, message.actor_ref, protocol=message.protocol
+        )
+        result = await self.actor_ref(actor_ref_message)
+        if not isinstance(result, ResultMessage):
+            return result
+        actor_ref = result.result
+        new_tell_message = TellMessage(
+            message.message_id,
+            actor_ref,
+            message.content,
+            protocol=message.protocol,
+            message_trace=message.message_trace,
+        )
+        return await self.call(actor_ref.address, new_tell_message)
+
+    @implements(AbstractActorPool.actor_ref)
+    async def actor_ref(self, message: ActorRefMessage) -> ResultMessageType:
+        actor_ref = message.actor_ref
+        actor_ref.uid = to_binary(actor_ref.uid)
+        if actor_ref.address == self.external_address and actor_ref.uid in self._actors:
+            return ResultMessage(
+                message.message_id, actor_ref, protocol=message.protocol
+            )
+
+        # lookup allocated
+        for address, item in self._allocated_actors.items():
+            ref = create_actor_ref(address, actor_ref.uid)
+            if ref in item:
+                return ResultMessage(message.message_id, ref, protocol=message.protocol)
+
+        with _ErrorProcessor(
+            self.external_address, message.message_id, protocol=message.protocol
+        ) as processor:
+            raise ActorNotExist(
+                f"Actor {actor_ref.uid} does not exist in {actor_ref.address}"
+            )
+
+        return processor.result
+
+    @implements(AbstractActorPool.cancel)
+    async def cancel(self, message: CancelMessage) -> ResultMessageType:
+        if message.address == self.external_address:
+            # local message
+            return await super().cancel(message)
+        # redirect to sub pool
+        return await self.call(message.address, message)
+
+    @implements(AbstractActorPool.handle_control_command)
+    async def handle_control_command(
+        self, message: ControlMessage
+    ) -> ResultMessageType:
+        with _ErrorProcessor(
+            self.external_address, message.message_id, message.protocol
+        ) as processor:
+            if message.address == self.external_address:
+                if message.control_message_type == ControlMessageType.sync_config:
+                    # sync config, need to notify all sub pools
+                    tasks = []
+                    for addr in self.sub_processes:
+                        control_message = ControlMessage(
+                            new_message_id(),
+                            message.address,
+                            message.control_message_type,
+                            message.content,
+                            protocol=message.protocol,
+                            message_trace=message.message_trace,
+                        )
+                        tasks.append(
+                            asyncio.create_task(self.call(addr, control_message))
+                        )
+                    # call super
+                    task = asyncio.create_task(super().handle_control_command(message))
+                    tasks.append(task)
+                    await asyncio.gather(*tasks)
+                    processor.result = await task
+                else:
+                    processor.result = await super().handle_control_command(message)
+            elif message.control_message_type == ControlMessageType.stop:
+                timeout, force = (
+                    message.content if message.content is not None else (None, False)
+                )
+                await self.stop_sub_pool(
+                    message.address,
+                    self.sub_processes[message.address],
+                    timeout=timeout,
+                    force=force,
+                )
+                processor.result = ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+            elif message.control_message_type == ControlMessageType.wait_pool_recovered:
+                if self._auto_recover and message.address not in self._recover_events:
+                    self._recover_events[message.address] = asyncio.Event()
+
+                event = self._recover_events.get(message.address, None)
+                if event is not None:
+                    await event.wait()
+                processor.result = ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+            elif message.control_message_type == ControlMessageType.add_sub_pool_actor:
+                address, allocate_strategy, create_message = message.content
+                create_message.from_main = True
+                ref = create_actor_ref(address, to_binary(create_message.actor_id))
+                self._allocated_actors[address][ref] = (
+                    allocate_strategy,
+                    create_message,
+                )
+                processor.result = ResultMessage(
+                    message.message_id, True, protocol=message.protocol
+                )
+            else:
+                processor.result = await self.call(message.address, message)
+        return processor.result
+
+    @staticmethod
+    def _parse_config(config: Dict, kw: Dict) -> Dict:
+        kw["subprocess_start_method"] = config.pop("start_method", None)
+        kw["auto_recover"] = config.pop("auto_recover", "actor")
+        kw["on_process_down"] = config.pop("on_process_down", None)
+        kw["on_process_recover"] = config.pop("on_process_recover", None)
+        kw = AbstractActorPool._parse_config(config, kw)
+        return kw
+
+    @classmethod
+    @implements(AbstractActorPool.create)
+    async def create(cls, config: Dict) -> MainActorPoolType:
+        config = config.copy()
+        actor_pool_config: ActorPoolConfig = config.get("actor_pool_config")
+        start_method = config.get("start_method", None)
+        if "process_index" not in config:
+            config["process_index"] = actor_pool_config.get_process_indexes()[0]
+        curr_process_index = config.get("process_index")
+        old_config_addresses = set(actor_pool_config.get_external_addresses())
+
+        tasks = []
+        subpool_process_idxes = []
+        # create sub actor pools
+        n_sub_pool = actor_pool_config.n_pool - 1
+        if n_sub_pool > 0:
+            process_indexes = actor_pool_config.get_process_indexes()
+            for process_index in process_indexes:
+                if process_index == curr_process_index:
+                    continue
+                create_pool_task = asyncio.create_task(
+                    cls.start_sub_pool(actor_pool_config, process_index, start_method)
+                )
+                await asyncio.sleep(0)
+                # await create_pool_task
+                tasks.append(create_pool_task)
+                subpool_process_idxes.append(process_index)
+
+        processes, ext_addresses = await cls.wait_sub_pools_ready(tasks)
+        if ext_addresses:
+            for process_index, ext_address in zip(subpool_process_idxes, ext_addresses):
+                actor_pool_config.reset_pool_external_address(
+                    process_index, ext_address
+                )
+
+        # create main actor pool
+        pool: MainActorPoolType = await super().create(config)
+        addresses = actor_pool_config.get_external_addresses()[1:]
+
+        assert len(addresses) == len(
+            processes
+        ), f"addresses {addresses}, processes {processes}"
+        for addr, proc in zip(addresses, processes):
+            pool.attach_sub_process(addr, proc)
+
+        new_config_addresses = set(actor_pool_config.get_external_addresses())
+        if old_config_addresses != new_config_addresses:
+            control_message = ControlMessage(
+                message_id=new_message_id(),
+                address=pool.external_address,
+                control_message_type=ControlMessageType.sync_config,
+                content=actor_pool_config,
+            )
+            await pool.handle_control_command(control_message)
+
+        return pool
+
+    async def start_monitor(self):
+        if self._monitor_task is None:
+            self._monitor_task = asyncio.create_task(self.monitor_sub_pools())
+        return self._monitor_task
+
+    @implements(AbstractActorPool.stop)
+    async def stop(self):
+        global_router = Router.get_instance()
+        if global_router is not None:
+            global_router.remove_router(self._router)
+
+        # turn off auto recover to avoid errors
+        self._auto_recover = False
+        self._stopped.set()
+        if self._monitor_task and not self._monitor_task.done():
+            await self._monitor_task
+            self._monitor_task = None
+        await self.stop_sub_pools()
+        await super().stop()
+
+    @classmethod
+    @abstractmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        """Start a sub actor pool"""
+
+    @classmethod
+    @abstractmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        """Wait all sub pools ready"""
+
+    def attach_sub_process(self, external_address: str, process: SubProcessHandle):
+        self.sub_processes[external_address] = process
+
+    async def stop_sub_pools(self):
+        to_stop_processes: Dict[str, SubProcessHandle] = dict()
+        for address, process in self.sub_processes.items():
+            if not await self.is_sub_pool_alive(process):
+                continue
+            to_stop_processes[address] = process
+
+        tasks = []
+        for address, process in to_stop_processes.items():
+            tasks.append(self.stop_sub_pool(address, process))
+        await asyncio.gather(*tasks)
+
+    async def stop_sub_pool(
+        self,
+        address: str,
+        process: SubProcessHandle,
+        timeout: float = None,
+        force: bool = False,
+    ):
+        if force:
+            await self.kill_sub_pool(process, force=True)
+            return
+
+        stop_message = ControlMessage(
+            new_message_id(),
+            address,
+            ControlMessageType.stop,
+            None,
+            protocol=DEFAULT_PROTOCOL,
+        )
+        try:
+            if timeout is None:
+                message = await self.call(address, stop_message)
+                if isinstance(message, ErrorMessage):  # pragma: no cover
+                    raise message.as_instanceof_cause()
+            else:
+                call = asyncio.create_task(self.call(address, stop_message))
+                try:
+                    await asyncio.wait_for(call, timeout)
+                except (futures.TimeoutError, asyncio.TimeoutError):  # pragma: no cover
+                    # timed out, just let the kill below finish it
+                    force = True
+        except (ConnectionError, ServerClosed):  # pragma: no cover
+            # the process may already be dead, ignore it
+            pass
+        # kill process
+        await self.kill_sub_pool(process, force=force)
+
+    @abstractmethod
+    async def kill_sub_pool(self, process: SubProcessHandle, force: bool = False):
+        """Kill a sub actor pool"""
+
+    @abstractmethod
+    async def is_sub_pool_alive(self, process: SubProcessHandle):
+        """
+        Check whether sub pool process is alive
+        Parameters
+        ----------
+        process : SubProcessHandle
+            sub pool process handle
+        Returns
+        -------
+        bool
+        """
+
+    @abstractmethod
+    def recover_sub_pool(self, address):
+        """Recover a sub actor pool"""
+
+    def process_sub_pool_lost(self, address: str):
+        if self._auto_recover in (False, "process"):
+            # the process is down; when auto_recover is disabled or only
+            # recovers the process, remove all previously created actors
+            self._allocated_actors[address] = dict()
+
+    async def monitor_sub_pools(self):
+        try:
+            while not self._stopped.is_set():
+                for address, process in self.sub_processes.items():
+                    try:
+                        recover_events_discovered = address in self._recover_events
+                        if not await self.is_sub_pool_alive(
+                            process
+                        ):  # pragma: no cover
+                            if self._on_process_down is not None:
+                                self._on_process_down(self, address)
+                            self.process_sub_pool_lost(address)
+                            if self._auto_recover:
+                                await self.recover_sub_pool(address)
+                                if self._on_process_recover is not None:
+                                    self._on_process_recover(self, address)
+                        if recover_events_discovered:
+                            event = self._recover_events.pop(address)
+                            event.set()
+                    except asyncio.CancelledError:
+                        raise
+                    except RuntimeError as ex:  # pragma: no cover
+                        if (
+                            "cannot schedule new futures after interpreter shutdown"
+                            not in str(ex)
+                        ):
+                            # only log errors other than the interpreter-shutdown
+                            # RuntimeError; this keeps the log quiet when the
+                            # process exits.
+                            logger.exception("Monitor sub pool %s failed", address)
+                    except Exception:
+                        # log the exception instead of silently stopping
+                        # the monitoring of the sub pool.
+                        logger.exception("Monitor sub pool %s failed", address)
+
+                # check every half second
+                await asyncio.sleep(0.5)
+        except asyncio.CancelledError:  # pragma: no cover
+            # cancelled
+            return
+
+    @classmethod
+    @abstractmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        """Returns external addresses for n pool processes"""
+
+    @classmethod
+    @abstractmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
+        """Returns internal address for pool of specified process index"""
+
+
+async def create_actor_pool(
+    address: str,
+    *,
+    pool_cls: Type[MainActorPoolType] = None,
+    n_process: int = None,
+    labels: List[str] = None,
+    ports: List[int] = None,
+    envs: List[Dict] = None,
+    external_address_schemes: List[str] = None,
+    enable_internal_addresses: List[bool] = None,
+    subprocess_start_method: str = None,
+    auto_recover: Union[str, bool] = "actor",
+    modules: List[str] = None,
+    suspend_sigint: bool = None,
+    use_uvloop: Union[str, bool] = "auto",
+    logging_conf: Union[Dict, None] = None,
+    on_process_down: Callable[[MainActorPoolType, str], None] = None,
+    on_process_recover: Callable[[MainActorPoolType, str], None] = None,
+    extra_conf: dict = None,
+    **kwargs,
+) -> MainActorPoolType:
+    from ... import dataframe, learn, remote, tensor
+
+    if n_process is None:
+        n_process = multiprocessing.cpu_count()
+    if labels and len(labels) != n_process + 1:
+        raise ValueError(
+            f"`labels` should be of size {n_process + 1}, got {len(labels)}"
+        )
+    if envs and len(envs) != n_process:
+        raise ValueError(f"`envs` should be of size {n_process}, got {len(envs)}")
+    if external_address_schemes and len(external_address_schemes) != n_process + 1:
+        raise ValueError(
+            f"`external_address_schemes` should be of size {n_process + 1}, "
+            f"got {len(external_address_schemes)}"
+        )
+    if enable_internal_addresses and len(enable_internal_addresses) != n_process + 1:
+        raise ValueError(
+            f"`enable_internal_addresses` should be of size {n_process + 1}, "
+            f"got {len(enable_internal_addresses)}"
+        )
+    elif not enable_internal_addresses:
+        enable_internal_addresses = [True] * (n_process + 1)
+    if auto_recover is True:
+        auto_recover = "actor"
+    if auto_recover not in ("actor", "process", False):
+        raise ValueError(
+            f'`auto_recover` should be one of "actor", "process", '
+            f"True or False, got {auto_recover}"
+        )
+    if use_uvloop == "auto":
+        try:
+            import uvloop  # noqa: F401 # pylint: disable=unused-variable
+
+            use_uvloop = True
+        except ImportError:
+            use_uvloop = False
+
+    modules = list(modules or []) + [
+        tensor.__name__,
+        dataframe.__name__,
+        learn.__name__,
+        remote.__name__,
+    ]
+
+    external_addresses = pool_cls.get_external_addresses(
+        address, n_process=n_process, ports=ports, schemes=external_address_schemes
+    )
+    actor_pool_config = ActorPoolConfig()
+    actor_pool_config.add_metric_configs(kwargs.get("metrics", {}))
+    # add main config
+    process_index_gen = pool_cls.process_index_gen(address)
+    main_process_index = next(process_index_gen)
+    main_internal_address = (
+        pool_cls.gen_internal_address(main_process_index, external_addresses[0])
+        if enable_internal_addresses[0]
+        else None
+    )
+    actor_pool_config.add_pool_conf(
+        main_process_index,
+        labels[0] if labels else None,
+        main_internal_address,
+        external_addresses[0],
+        modules=modules,
+        suspend_sigint=suspend_sigint,
+        use_uvloop=use_uvloop,
+        logging_conf=logging_conf,
+        kwargs=kwargs,
+    )
+    # add sub configs
+    for i in range(n_process):
+        sub_process_index = next(process_index_gen)
+        internal_address = (
+            pool_cls.gen_internal_address(sub_process_index, external_addresses[i + 1])
+            if enable_internal_addresses[i + 1]
+            else None
+        )
+        actor_pool_config.add_pool_conf(
+            sub_process_index,
+            labels[i + 1] if labels else None,
+            internal_address,
+            external_addresses[i + 1],
+            env=envs[i] if envs else None,
+            modules=modules,
+            suspend_sigint=suspend_sigint,
+            use_uvloop=use_uvloop,
+            logging_conf=logging_conf,
+            kwargs=kwargs,
+        )
+    actor_pool_config.add_comm_config(extra_conf)
+
+    pool: MainActorPoolType = await pool_cls.create(
+        {
+            "actor_pool_config": actor_pool_config,
+            "process_index": main_process_index,
+            "start_method": subprocess_start_method,
+            "auto_recover": auto_recover,
+            "on_process_down": on_process_down,
+            "on_process_recover": on_process_recover,
+        }
+    )
+    await pool.start()
+    return pool
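+
+
+# A minimal usage sketch, assuming a concrete ``MainActorPoolBase`` subclass
+# (called ``MainActorPool`` here purely for illustration; a backend implementation
+# is expected to provide one):
+#
+#     async def _example():
+#         pool = await create_actor_pool(
+#             "127.0.0.1:11111", pool_cls=MainActorPool, n_process=2
+#         )
+#         try:
+#             ...  # interact with the pool, e.g. create actors via an actor context
+#         finally:
+#             await pool.stop()
+#
+# ``create_actor_pool`` already calls ``pool.start()`` before returning, so the
+# returned pool is ready to serve requests.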
diff --git a/python/xorbits/_mars/oscar/backends/ray/__init__.py b/python/xorbits/_mars/oscar/backends/ray/__init__.py
new file mode 100644
index 000000000..c566a786b
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .backend import RayActorBackend
diff --git a/python/xorbits/_mars/oscar/backends/ray/backend.py b/python/xorbits/_mars/oscar/backends/ray/backend.py
new file mode 100644
index 000000000..2873eb202
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/backend.py
@@ -0,0 +1,110 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Dict
+
+from ....utils import Timer, lazy_import
+from ...backend import BaseActorBackend, register_backend
+from ..context import MarsActorContext
+from .driver import RayActorDriver
+from .pool import RayMainPool
+from .utils import get_placement_group, process_address_to_placement
+
+ray = lazy_import("ray")
+
+__all__ = ["RayActorBackend"]
+
+logger = logging.getLogger(__name__)
+
+
+@register_backend
+class RayActorBackend(BaseActorBackend):
+    @staticmethod
+    def name():
+        return "ray"
+
+    @staticmethod
+    def get_context_cls():
+        return MarsActorContext
+
+    @staticmethod
+    def get_driver_cls():
+        return RayActorDriver
+
+    @classmethod
+    async def _create_ray_pools(cls, address: str, n_process: int = None, **kwargs):
+        # pop `n_io_process` from kwargs as ray doesn't need this
+        kwargs.pop("n_io_process", 0)
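+        # the address is assumed to follow the ray backend convention
+        # "ray://<placement_group_name>/<bundle_index>/<process_index>"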
+        pg_name, bundle_index, _ = process_address_to_placement(address)
+        from .pool import RayMainActorPool
+
+        pool_addresses = RayMainActorPool.get_external_addresses(address, n_process)
+        assert pool_addresses[0] == address
+        pg = get_placement_group(pg_name) if pg_name else None
+        num_cpus = kwargs.get("main_pool_cpus", 0)
+        sub_pools = {
+            sub_pool_address: RayMainActorPool.create_sub_pool(
+                address, sub_pool_address
+            )
+            for sub_pool_address in pool_addresses[1:]
+        }
+        actor_handle = (
+            ray.remote(RayMainPool)
+            .options(
+                num_cpus=num_cpus,
+                name=address,
+                max_concurrency=10000000,  # By default, 1000 tasks can be running concurrently.
+                max_restarts=-1,  # Auto restarts by ray
+                placement_group=pg,
+                placement_group_bundle_index=bundle_index,
+                placement_group_capture_child_tasks=False,
+            )
+            .remote(address, n_process, sub_pools, **kwargs)
+        )
+        pool_handle = RayPoolHandle(actor_handle, sub_pools)
+        return pool_handle
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        with Timer() as timer:
+            pool_handle = await cls._create_ray_pools(address, n_process, **kwargs)
+        logger.info(
+            "Submit create actor pool %s took %s seconds.",
+            pool_handle.main_pool,
+            timer.duration,
+        )
+        with Timer() as timer:
+            await pool_handle.main_pool.start.remote()
+        logger.info(
+            "Start actor pool %s took %s seconds.",
+            pool_handle.main_pool,
+            timer.duration,
+        )
+        return pool_handle
+
+
+class RayPoolHandle:
+    def __init__(
+        self,
+        main_pool: "ray.actor.ActorHandle",
+        sub_pools: Dict[str, "ray.actor.ActorHandle"],
+    ):
+        self.main_pool = main_pool
+        # Hold sub_pool actor handles to avoid gc.
+        self.sub_pools = sub_pools
+
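+    # Any other attribute access is forwarded to the main pool actor handle, so callers
+    # can write e.g. pool_handle.start.remote() instead of pool_handle.main_pool.start.remote().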
+    def __getattr__(self, item):
+        if item in ("main_pool", "sub_pools"):  # pragma: no cover
+            return object.__getattribute__(self, item)
+        return getattr(self.main_pool, item)
diff --git a/python/xorbits/_mars/oscar/backends/ray/communication.py b/python/xorbits/_mars/oscar/backends/ray/communication.py
new file mode 100644
index 000000000..dc8e56a9c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/communication.py
@@ -0,0 +1,552 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import concurrent.futures as futures
+import itertools
+import logging
+import time
+from abc import ABC
+from collections import namedtuple
+from dataclasses import dataclass
+from typing import Any, Callable, Coroutine, Dict, List, Set, Tuple, Type
+from urllib.parse import urlparse
+
+from ....metrics import Metrics
+from ....oscar.profiling import ProfilingData
+from ....serialization import deserialize, serialize
+from ....utils import Timer, classproperty, implements, lazy_import, lazy_import_on_load
+from ...debug import debug_async_timeout
+from ...errors import ServerClosed
+from ..communication.base import Channel, ChannelType, Client, Server
+from ..communication.core import register_client, register_server
+from ..communication.errors import ChannelClosed
+from .utils import report_event
+
+ray = lazy_import("ray")
+logger = logging.getLogger(__name__)
+
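+# A ChannelID uniquely identifies a logical channel: the client's local address, a
+# per-client id, the channel index, and the destination address. It accompanies every
+# __on_ray_recv__ call so the server can route the message to the right channel.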
+ChannelID = namedtuple(
+    "ChannelID", ["local_address", "client_id", "channel_index", "dest_address"]
+)
+
+SERIALIZATION_TIMEOUT_MILLS = 1000
+DESERIALIZATION_TIMEOUT_MILLS = 1000
+
+
+def msg_to_simple_str(msg):  # pragma: no cover
+    """An helper that prints message structure without generate a big str."""
+    from ..message import SendMessage, _MessageBase
+
+    if type(msg) == _ArgWrapper:
+        msg = msg.message
+    if isinstance(msg, SendMessage):
+        return f"{str(type(msg).__name__)}(actor_ref={msg.actor_ref}, content={msg_to_simple_str(msg.content)})"
+    if isinstance(msg, _MessageBase):
+        return str(msg)
+    if isinstance(msg, List):
+        part_str = ", ".join([msg_to_simple_str(item) for item in msg[:5]])
+        return f"List<{part_str}...{len(msg)}>"
+    if isinstance(msg, Set):
+        part_str = ", ".join([msg_to_simple_str(item) for item in list(msg)[:5]])
+        return f"Set<{part_str}...{len(msg)}>"
+    if isinstance(msg, Tuple):
+        part_str = ", ".join([msg_to_simple_str(item) for item in msg[:5]])
+        return f"Tuple<{part_str}...{len(msg)}>"
+    if isinstance(msg, Dict):
+        part_str = []
+        it = iter(msg.items())
+        try:
+            while len(part_str) < 5:
+                entry = next(it)
+                part_str.append(
+                    f"k={msg_to_simple_str(entry[0])}, v={msg_to_simple_str(entry[1])}"
+                )
+        except StopIteration:
+            pass
+        part_str = ", ".join(part_str)
+        return f"Dict<{part_str}...{len(msg)}>"
+    if isinstance(msg, (str, float, int, bool)):
+        return "{!s:.50}".format(msg)
+    return str(type(msg))
+
+
+def _argwrapper_unpickler(serialized_message):
+    return _ArgWrapper(deserialize(*serialized_message))
+
+
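+# Wrapping a message in _ArgWrapper makes ray pickle it via __reduce__, which routes the
+# payload through mars' serialize()/deserialize(); the patched SerializationContext below
+# can then meter message sizes and (de)serialization timings.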
+@dataclass
+class _ArgWrapper:
+    message: Any = None
+
+    def __init__(self, message):
+        self.message = message
+
+    def __reduce__(self):
+        return _argwrapper_unpickler, (
+            serialize(self.message, context={"serializer": "ray"}),
+        )
+
+
+@lazy_import_on_load(ray)
+def _init_ray_serialization_deserialization():
+    _ray_serialize = ray.serialization.SerializationContext.serialize
+    _ray_deserialize_object = ray.serialization.SerializationContext._deserialize_object
+    serialized_bytes_counter = Metrics.counter(
+        "mars.channel_serialized_bytes",
+        "The bytes serialized by mars ray channel.",
+    )
+    deserialized_bytes_counter = Metrics.counter(
+        "mars.channel_deserialized_bytes",
+        "The bytes deserialized by mars ray channel.",
+    )
+    serialization_time_mills = Metrics.counter(
+        "mars.channel_serialization_time_mills",
+        "The time used by mars ray channel serialization.",
+    )
+    deserialization_time_mills = Metrics.counter(
+        "mars.channel_deserialization_time_mills",
+        "The time used by mars ray channel deserialization.",
+    )
+
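+    # Monkey-patch ray's SerializationContext so that _ArgWrapper messages crossing the
+    # channel are counted and timed; oversized or slow messages are logged and reported.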
+    def _serialize(self, value):
+        if type(value) is _ArgWrapper:  # pylint: disable=unidiomatic-typecheck
+            message = value.message
+            with Timer() as timer:
+                serialized_object = _ray_serialize(self, value)
+                bytes_length = serialized_object.total_bytes
+                serialized_bytes_counter.record(bytes_length)
+            serialization_time_mills.record(timer.duration * 1000)
+            if bytes_length > 1 * 1024 * 1024 * 1024:  # pragma: no cover
+                logger.warning(
+                    "Serialize large message (%s bytes > 1GB) through ray channel, message: %s.",
+                    bytes_length,
+                    msg_to_simple_str(message),
+                )
+            if timer.duration * 1000 > SERIALIZATION_TIMEOUT_MILLS:  # pragma: no cover
+                report_event(
+                    "WARNING",
+                    "SERIALIZATION_TIMEOUT",
+                    f"Serialization took {timer.duration} seconds for {bytes_length} sized message {msg_to_simple_str(message)}.",
+                )
+            try:
+                if message.profiling_context is not None:
+                    task_id = message.profiling_context.task_id
+                    ProfilingData[task_id, "serialization"].inc(
+                        "serialize", timer.duration
+                    )
+            except AttributeError:  # pragma: no cover
+                logger.info(
+                    "Profiling serialization got error, the send "
+                    "message %s may not be an instance of message",
+                    type(message),
+                )
+        else:
+            serialized_object = _ray_serialize(self, value)
+        return serialized_object
+
+    def _deserialize_object(self, data, metadata, object_ref):
+        start_time = time.time()
+        bytes_length = 0
+        if data:
+            bytes_length = len(data)
+            deserialized_bytes_counter.record(bytes_length)
+        value = _ray_deserialize_object(self, data, metadata, object_ref)
+        duration = time.time() - start_time
+        deserialization_time_mills.record(duration * 1000)
+        if duration * 1000 > DESERIALIZATION_TIMEOUT_MILLS:  # pragma: no cover
+            report_event(
+                "WARNING",
+                "DESERIALIZATION_TIMEOUT",
+                f"Deserialization took {duration} seconds for "
+                f"{bytes_length} sized msg {msg_to_simple_str(value)}",
+            )
+        if type(value) is _ArgWrapper:  # pylint: disable=unidiomatic-typecheck
+            message = value.message
+            try:
+                if message.profiling_context is not None:
+                    task_id = message.profiling_context.task_id
+                    ProfilingData[task_id, "serialization"].inc(
+                        "deserialize", time.time() - start_time
+                    )
+            except AttributeError:  # pragma: no cover
+                logger.info(
+                    "Profiling serialization got error, the recv "
+                    "message %s may not be an instance of message",
+                    type(message),
+                )
+        return value
+
+    ray.serialization.SerializationContext.serialize = _serialize
+    ray.serialization.SerializationContext._deserialize_object = _deserialize_object
+
+
+class RayChannelException(Exception):
+    def __init__(self, exc_type, exc_value: BaseException, exc_traceback):
+        self.exc_type = exc_type
+        self.exc_value = exc_value
+        self.exc_traceback = exc_traceback
+
+
+class RayChannelBase(Channel, ABC):
+    """
+    Channel for communications between ray processes.
+    """
+
+    __slots__ = "_channel_index", "_channel_id", "_closed"
+
+    name = "ray"
+    _channel_index_gen = itertools.count()
+
+    def __init__(
+        self,
+        local_address: str = None,
+        dest_address: str = None,
+        channel_index: int = None,
+        channel_id: ChannelID = None,
+        compression=None,
+    ):
+        super().__init__(
+            local_address=local_address,
+            dest_address=dest_address,
+            compression=compression,
+        )
+        self._channel_index = channel_index or next(self._channel_index_gen)
+        self._channel_id = channel_id or ChannelID(
+            local_address, _gen_client_id(), self._channel_index, dest_address
+        )
+        self._closed = asyncio.Event()
+
+    @property
+    def channel_id(self) -> ChannelID:
+        return self._channel_id
+
+    @property
+    @implements(Channel.type)
+    def type(self) -> ChannelType:
+        return ChannelType.ray
+
+    @implements(Channel.close)
+    async def close(self):
+        self._closed.set()
+
+    @property
+    @implements(Channel.closed)
+    def closed(self) -> bool:
+        return self._closed.is_set()
+
+
+class RayClientChannel(RayChannelBase):
+    """
+    A channel from a ray driver/actor to a ray actor. The client channel receives via the ray call reply.
+    """
+
+    __slots__ = "_peer_actor", "_done", "_todo"
+
+    def __init__(
+        self,
+        dest_address: str = None,
+        channel_index: int = None,
+        channel_id: ChannelID = None,
+        compression=None,
+    ):
+        super().__init__(None, dest_address, channel_index, channel_id, compression)
+        # ray actor should be created with the address as the name.
+        self._peer_actor: "ray.actor.ActorHandle" = ray.get_actor(dest_address)
+        self._done = asyncio.Queue()
+        self._todo = set()
+
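+    # send() fires a __on_ray_recv__ call on the peer actor; _submit_task keeps the
+    # resulting object ref alive and recv() later pops the first completed reply.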
+    def _submit_task(self, message: Any, object_ref: "ray.ObjectRef"):
+        async def handle_task(message: Any, object_ref: "ray.ObjectRef"):
+            # use `%.500s` to avoid printing overly long messages
+            with debug_async_timeout(
+                "ray_object_retrieval_timeout",
+                "Message that client sent to actor %s is %.500s and object_ref is %s",
+                self.dest_address,
+                message,
+                object_ref,
+            ):
+                try:
+                    result = await object_ref
+                except Exception as e:  # pragma: no cover
+                    # The error ClientObjectRef can't be formatted, so
+                    # we give it a string `ClientObjectRef` instead.
+                    try:
+                        object_ref_str = str(object_ref)
+                    except Exception:
+                        object_ref_str = "ClientObjectRef"
+                    logger.exception(
+                        "Get object %s from %s failed, got exception %s.",
+                        object_ref_str,
+                        self.dest_address,
+                        e,
+                    )
+                    raise
+            if isinstance(result, RayChannelException):
+                raise result.exc_value.with_traceback(result.exc_traceback)
+            return result.message
+
+        def _on_completion(future):
+            self._todo.remove(future)
+            self._done.put_nowait(future)
+
+        future = asyncio.ensure_future(handle_task(message, object_ref))
+        future.add_done_callback(_on_completion)
+        self._todo.add(future)
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot send message")
+        # Put ray object ref to todo queue
+        task = self._peer_actor.__on_ray_recv__.remote(
+            self.channel_id, _ArgWrapper(message)
+        )
+        self._submit_task(message, task)
+        await asyncio.sleep(0)
+
+    @implements(Channel.recv)
+    async def recv(self):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot recv message")
+        try:
+            # Wait first done.
+            future = await self._done.get()
+            return future.result()
+        except ray.exceptions.RayActorError:
+            if not self._closed.is_set():
+                # raise an EOFError as the SocketChannel does
+                raise EOFError("Server may be closed")
+        except (RuntimeError, ServerClosed) as e:  # pragma: no cover
+            if not self._closed.is_set():
+                raise e
+
+
+class RayServerChannel(RayChannelBase):
+    """
+    A channel from a ray actor to a ray driver/actor. Since a ray actor can't call the
+    ray driver, the server channel sends by replying to the pending ray call. Note that
+    a channel can't send multiple messages for one received message, or the extra send
+    would be taken as the next message's reply.
+    """
+
+    __slots__ = "_in_queue", "_out_queue", "_msg_recv_counter", "_msg_sent_counter"
+
+    def __init__(
+        self,
+        local_address: str = None,
+        channel_index: int = None,
+        channel_id: ChannelID = None,
+        compression=None,
+    ):
+        super().__init__(local_address, None, channel_index, channel_id, compression)
+        self._in_queue = asyncio.Queue()
+        self._out_queue = asyncio.Queue()
+        self._msg_recv_counter = 0
+        self._msg_sent_counter = 0
+
+    @implements(Channel.send)
+    async def send(self, message: Any):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot send message")
+        # The current process is a ray actor; we reply to the pending ray call to send
+        # the message to the ray driver/actor. Note that we can only send once per
+        # received message, otherwise the send would be taken as another message's reply.
+        await self._out_queue.put(message)
+        self._msg_sent_counter += 1
+        assert (
+            self._msg_sent_counter <= self._msg_recv_counter
+        ), "RayServerChannel channel doesn't support send multiple replies for one message."
+
+    @implements(Channel.recv)
+    async def recv(self):
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed, cannot write message")
+        try:
+            return await self._in_queue.get()
+        except RuntimeError:  # pragma: no cover
+            if not self._closed.is_set():
+                raise
+
+    async def __on_ray_recv__(self, message_wrapper):
+        """This method will be invoked when current process is a ray actor rather than a ray driver"""
+        self._msg_recv_counter += 1
+        await self._in_queue.put(message_wrapper.message)
+        result_message = await self._out_queue.get()
+        if self._closed.is_set():  # pragma: no cover
+            raise ChannelClosed("Channel already closed")
+        return _ArgWrapper(result_message)
+
+    @implements(Channel.close)
+    async def close(self):
+        await super().close()
+        self._out_queue.put_nowait(None)
+
+
+@register_server
+class RayServer(Server):
+    __slots__ = "_closed", "_channels", "_tasks"
+
+    scheme = "ray"
+    _server_instance = None
+    _ray_actor_started = False
+
+    def __init__(self, address, channel_handler: Callable[[Channel], Coroutine] = None):
+        super().__init__(address, channel_handler)
+        self._closed = asyncio.Event()
+        self._channels: Dict[ChannelID, RayServerChannel] = dict()
+        self._tasks: Dict[ChannelID, asyncio.Task] = dict()
+
+    @classproperty
+    @implements(Server.client_type)
+    def client_type(self) -> Type["Client"]:
+        return RayClient
+
+    @property
+    @implements(Server.channel_type)
+    def channel_type(self) -> ChannelType:
+        return ChannelType.ray
+
+    @classmethod
+    def set_ray_actor_started(cls):
+        cls._ray_actor_started = True
+
+    @classmethod
+    def is_ray_actor_started(cls):
+        return cls._ray_actor_started
+
+    @staticmethod
+    @implements(Server.create)
+    async def create(config: Dict) -> "RayServer":
+        if not RayServer.is_ray_actor_started():
+            logger.warning(
+                "Current process is not a ray actor, the ray server "
+                "will not receive messages from clients."
+            )
+        assert RayServer._server_instance is None
+        config = config.copy()
+        address = config.pop("address")
+        handle_channel = config.pop("handle_channel")
+        if urlparse(address).scheme != RayServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f"Address for RayServer "
+                f'should start with "ray://", '
+                f"got {address}"
+            )
+        if config:  # pragma: no cover
+            raise TypeError(
+                f"Creating RayServer got unexpected " f'arguments: {",".join(config)}'
+            )
+        server = RayServer(address, handle_channel)
+        RayServer._server_instance = server
+        return server
+
+    @classmethod
+    def get_instance(cls):
+        return cls._server_instance
+
+    @classmethod
+    def clear(cls):
+        cls._server_instance = None
+        cls._ray_actor_started = False
+
+    @implements(Server.start)
+    async def start(self):
+        # nothing to do for the ray server
+        pass
+
+    @implements(Server.join)
+    async def join(self, timeout=None):
+        wait_coro = self._closed.wait()
+        try:
+            await asyncio.wait_for(wait_coro, timeout=timeout)
+        except (futures.TimeoutError, asyncio.TimeoutError):  # pragma: no cover
+            pass
+
+    @implements(Server.on_connected)
+    async def on_connected(self, *args, **kwargs):
+        channel = args[0]
+        assert isinstance(channel, RayServerChannel)
+        if kwargs:  # pragma: no cover
+            raise TypeError(
+                f"{type(self).__name__} got unexpected "
+                f'arguments: {",".join(kwargs)}'
+            )
+        await self.channel_handler(channel)
+
+    @implements(Server.stop)
+    async def stop(self):
+        self._closed.set()
+        for task in self._tasks.values():
+            task.cancel()
+        self._tasks = dict()
+        for channel in self._channels.values():
+            await channel.close()
+        self._channels = dict()
+        self.clear()
+
+    @property
+    @implements(Server.stopped)
+    def stopped(self) -> bool:
+        return self._closed.is_set()
+
+    async def __on_ray_recv__(self, channel_id: ChannelID, message):
+        if self.stopped:
+            raise ServerClosed(
+                f"Remote server {self.address} closed, but got message {message} "
+                f"from channel {channel_id}"
+            )
+        channel = self._channels.get(channel_id)
+        if not channel:
+            _, _, peer_channel_index, peer_dest_address = channel_id
+            channel = RayServerChannel(
+                peer_dest_address, peer_channel_index, channel_id
+            )
+            self._channels[channel_id] = channel
+            self._tasks[channel_id] = asyncio.create_task(self.on_connected(channel))
+        return await channel.__on_ray_recv__(message)
+
+
+@register_client
+class RayClient(Client):
+    __slots__ = ()
+
+    scheme = RayServer.scheme
+
+    def __init__(self, local_address: str, dest_address: str, channel: Channel):
+        super().__init__(local_address, dest_address, channel)
+
+    @staticmethod
+    @implements(Client.connect)
+    async def connect(
+        dest_address: str, local_address: str = None, **kwargs
+    ) -> "Client":
+        if urlparse(dest_address).scheme != RayServer.scheme:  # pragma: no cover
+            raise ValueError(
+                f'Destination address should start with "ray://" '
+                f"for RayClient, got {dest_address}"
+            )
+        client_channel = RayClientChannel(dest_address)
+        client = RayClient(local_address, dest_address, client_channel)
+        return client
+
+    @implements(Client.close)
+    async def close(self):
+        await super().close()
+
+
+def _gen_client_id():
+    import uuid
+
+    return uuid.uuid4().hex
diff --git a/python/xorbits/_mars/oscar/backends/ray/driver.py b/python/xorbits/_mars/oscar/backends/ray/driver.py
new file mode 100644
index 000000000..9fe7ddc09
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/driver.py
@@ -0,0 +1,92 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import logging
+import os
+from numbers import Number
+from typing import Dict
+
+from ....utils import lazy_import
+from ...driver import BaseActorDriver
+from .utils import addresses_to_placement_group_info, process_placement_to_address
+
+ray = lazy_import("ray")
+logger = logging.getLogger(__name__)
+
+
+class RayActorDriver(BaseActorDriver):
+    _cluster_info = dict()
+
+    @classmethod
+    def setup_cluster(cls, address_to_resources: Dict[str, Dict[str, Number]]):
+        logger.info("Setup cluster with %s", address_to_resources)
+        # Note: Deep copy the dict to keep the original values, because `bundles`
+        # returned by `addresses_to_placement_group_info()` will be modified
+        # by `ray.util.placement_group()`
+        original_address_to_resources = copy.deepcopy(address_to_resources)
+        pg_name, bundles = addresses_to_placement_group_info(address_to_resources)
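+        # e.g. {"ray://my_pg/0": {"CPU": 2}, "ray://my_pg/1": {"CPU": 4}} maps to the
+        # placement group name "my_pg" with bundles [{"CPU": 2}, {"CPU": 4}],
+        # one bundle per node address ("my_pg" is a hypothetical name).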
+        logger.info("Creating placement group %s with bundles %s.", pg_name, bundles)
+        pg = ray.util.placement_group(name=pg_name, bundles=bundles, strategy="SPREAD")
+        create_pg_timeout = 120
+        done, _ = ray.wait([pg.ready()], timeout=create_pg_timeout)
+        if not done:  # pragma: no cover
+            raise Exception(
+                f"""Can't create placement group {pg.bundle_specs} in {create_pg_timeout} seconds"""
+            )
+        cluster_info = {
+            "original_address_to_resources": original_address_to_resources,
+            "address_to_resources": address_to_resources,
+            "pg_name": pg_name,
+            "pg_group": pg,
+            "main_pool_handles": [],  # Hold actor_handle to avoid actor being freed.
+        }
+        logger.info("Create placement group success.")
+        cls._cluster_info = cluster_info
+
+    @classmethod
+    def stop_cluster(cls):
+        logger.info("Stopping cluster %s.", cls._cluster_info)
+        if not cls._cluster_info:  # pragma: no cover
+            return
+        pg_name = cls._cluster_info["pg_name"]
+        pg = cls._cluster_info["pg_group"]
+        for index, bundle_spec in enumerate(pg.bundle_specs):
+            # The main pool takes one extra process.
+            # If the supervisor is created on the same node as the worker, it takes one more.
+            n_process = int(bundle_spec["CPU"]) + 2
+            for process_index in reversed(range(n_process)):
+                address = process_placement_to_address(
+                    pg_name, index, process_index=process_index
+                )
+                try:
+                    ray_actor = ray.get_actor(address)
+                    if "COV_CORE_SOURCE" in os.environ:  # pragma: no cover
+                        # must clean up first, or coverage info will be lost.
+                        # must save the local reference until this is fixed:
+                        # https://github.com/ray-project/ray/issues/7815
+                        ray.get(ray_actor.cleanup.remote())
+                    ray.kill(ray_actor, no_restart=True)
+                    while True:
+                        try:
+                            ray.get(ray_actor.wait.remote(30))
+                            logger.warning(
+                                "Waiting actor %s to be killed.", ray_actor
+                            )  # pragma: no cover
+                        except ray.exceptions.RayActorError:
+                            break
+                except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+                    pass
+        ray.util.remove_placement_group(pg)
+        cls._cluster_info = dict()
+        logger.info("Stopped cluster %s.", pg_name)
diff --git a/python/xorbits/_mars/oscar/backends/ray/pool.py b/python/xorbits/_mars/oscar/backends/ray/pool.py
new file mode 100644
index 000000000..9d26e4dce
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/pool.py
@@ -0,0 +1,396 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import datetime
+import inspect
+import itertools
+import logging
+import os
+import sys
+import threading
+import time
+import types
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import List, Optional
+
+from ....utils import ensure_coverage, lazy_import, retry_callable
+from ... import ServerClosed
+from ..config import ActorPoolConfig
+from ..message import CreateActorMessage
+from ..pool import (
+    AbstractActorPool,
+    MainActorPoolBase,
+    SubActorPoolBase,
+    _register_message_handler,
+    create_actor_pool,
+)
+from ..router import Router
+from .communication import ChannelID, RayChannelException, RayServer
+from .utils import (
+    get_placement_group,
+    kill_and_wait,
+    process_address_to_placement,
+    process_placement_to_address,
+)
+
+ray = lazy_import("ray")
+logger = logging.getLogger(__name__)
+
+
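+# Lifecycle of a ray pool actor: INIT when constructed, POOL_READY once start() has
+# created the underlying actor pool, SERVICE_READY after mark_service_ready().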
+class RayPoolState(Enum):
+    INIT = 0
+    POOL_READY = 1
+    SERVICE_READY = 2
+
+
+@_register_message_handler
+class RayMainActorPool(MainActorPoolBase):
+    @classmethod
+    def process_index_gen(cls, address):
+        _, __, process_index = process_address_to_placement(address)
+        return itertools.count(process_index)
+
+    @classmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        assert (
+            not ports
+        ), f"ports should be none when actor pool running on ray, but got {ports}"
+        pg_name, bundle_index, process_index = process_address_to_placement(address)
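+        # e.g. for address "ray://my_pg/0/0" and n_process=2 this returns
+        # ["ray://my_pg/0/0", "ray://my_pg/0/1", "ray://my_pg/0/2"]: the main pool
+        # address followed by one address per sub pool ("my_pg" is hypothetical).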
+        return [
+            process_placement_to_address(pg_name, bundle_index, process_index + i)
+            for i in range(n_process + 1)
+        ]
+
+    @classmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
+        return external_address
+
+    @classmethod
+    def create_sub_pool(
+        cls,
+        main_pool_address,
+        sub_pool_address,
+    ):
+        pg_name, bundle_index, process_index = process_address_to_placement(
+            sub_pool_address
+        )
+        pg = get_placement_group(pg_name) if pg_name else None
+        # Hold actor_handle to avoid actor being freed.
+        actor_handle = (
+            ray.remote(RaySubPool)
+            .options(
+                num_cpus=0,
+                name=sub_pool_address,
+                max_concurrency=10000000,  # Ray's default (1000) would throttle concurrent calls.
+                max_restarts=-1,  # Auto restarts by ray
+                placement_group=pg,
+                placement_group_bundle_index=bundle_index,
+                placement_group_capture_child_tasks=False,
+            )
+            .remote(main_pool_address, process_index)
+        )
+        return actor_handle
+
+    @classmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        config = actor_pool_config.get_pool_config(process_index)
+        external_addresses = config["external_address"]
+        assert (
+            len(external_addresses) == 1
+        ), f"Ray pool allows only one external address but got {external_addresses}"
+        external_address = external_addresses[0]
+        pg_name, bundle_index, _process_index = process_address_to_placement(
+            external_address
+        )
+        assert process_index == _process_index, (
+            f"process_index {process_index} is not consistent with index {_process_index} "
+            f"in external_address {external_address}"
+        )
+        actor_handle = config["kwargs"]["sub_pool_handles"][external_address]
+        state = await retry_callable(
+            actor_handle.state.remote, ex_type=ray.exceptions.RayActorError, sync=False
+        )()
+        if state is RayPoolState.SERVICE_READY:  # pragma: no cover
+            logger.info("Ray sub pool %s is alive, kill it first.", external_address)
+            await kill_and_wait(actor_handle, no_restart=False)
+            # Wait sub pool process restarted.
+            await retry_callable(
+                actor_handle.state.remote,
+                ex_type=ray.exceptions.RayActorError,
+                sync=False,
+            )()
+        logger.info("Start to start ray sub pool %s.", external_address)
+        create_sub_pool_timeout = 120
+        try:
+            await asyncio.wait_for(
+                actor_handle.set_actor_pool_config.remote(actor_pool_config),
+                timeout=create_sub_pool_timeout,
+            )
+        except asyncio.TimeoutError:  # pragma: no cover
+            msg = (
+                f"Cannot start ray sub pool {external_address} "
+                f"in {create_sub_pool_timeout} seconds."
+            )
+            logger.error(msg)
+            raise Exception(msg)
+        await actor_handle.start.remote()
+        logger.info("Start ray sub pool %s successfully.", external_address)
+        return actor_handle
+
+    @classmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        return [await t for t in create_pool_tasks], None
+
+    async def recover_sub_pool(self, address: str):
+        process = self.sub_processes[address]
+        # ray call will error when actor is restarting
+        await retry_callable(
+            process.state.remote, ex_type=ray.exceptions.RayActorError, sync=False
+        )()
+        await process.start.remote()
+
+        if self._auto_recover == "actor":
+            # need to recover all created actors
+            for _, message in self._allocated_actors[address].values():
+                create_actor_message: CreateActorMessage = message
+                await self.call(address, create_actor_message)
+            await process.mark_service_ready.remote()
+
+    async def kill_sub_pool(
+        self,
+        process: "ray.actor.ActorHandle",
+        force: bool = False,
+        no_restart: bool = False,
+    ):
+        logger.info("Start to kill ray sub pool %s", process)
+        await kill_and_wait(process, no_restart=no_restart)
+
+    async def is_sub_pool_alive(self, process: "ray.actor.ActorHandle"):
+        try:
+            if self._auto_recover == "process":
+                return await process.state.remote() in [
+                    RayPoolState.POOL_READY,
+                    RayPoolState.SERVICE_READY,
+                ]
+            else:
+                return await process.state.remote() == RayPoolState.SERVICE_READY
+        except Exception:
+            logger.info("Detected RaySubPool %s died", process)
+            return False
+
+
+@_register_message_handler
+class RaySubActorPool(SubActorPoolBase):
+    async def stop(self):
+        try:
+            # clean global router
+            Router.get_instance().remove_router(self._router)
+            await self._caller.stop()
+            self._servers = []
+        finally:
+            self._stopped.set()
+
+
+class RayPoolBase(ABC):
+    __slots__ = "_actor_pool", "_ray_server"
+
+    _actor_pool: Optional["AbstractActorPool"]
+    _state: RayPoolState = RayPoolState.INIT
+
+    def __new__(cls, *args, **kwargs):
+        if threading.current_thread() is threading.main_thread():
+            ensure_coverage()
+        # object.__new__ takes no extra arguments, so drop them here.
+        return super().__new__(cls)
+
+    def __init__(self):
+        self._actor_pool = None
+        self._ray_server = None
+        RayServer.set_ray_actor_started()
+
+    @abstractmethod
+    async def start(self):
+        """Start actor pool in ray actor"""
+
+    def _set_ray_server(self, actor_pool: AbstractActorPool):
+        ray_servers = [
+            server for server in actor_pool._servers if isinstance(server, RayServer)
+        ]
+        assert (
+            len(ray_servers) == 1
+        ), f"Ray only support single server but got {ray_servers}."
+        self._ray_server = ray_servers[0]
+
+    async def __on_ray_recv__(self, channel_id: ChannelID, message):
+        """Method for communication based on ray actors"""
+        try:
+            if self._ray_server is None:
+                raise ServerClosed(f"Remote server {channel_id.dest_address} closed")
+            return await self._ray_server.__on_ray_recv__(channel_id, message)
+        except Exception:  # pragma: no cover
+            return RayChannelException(*sys.exc_info())
+
+    async def actor_pool(self, attribute, *args, **kwargs):
+        attr = getattr(self._actor_pool, attribute)
+        if isinstance(attr, types.MethodType):
+            if inspect.iscoroutinefunction(attr):
+                return await attr(*args, **kwargs)
+            return attr(*args, **kwargs)
+        else:
+            return attr
+
+    def state(self):
+        return self._state
+
+    @staticmethod
+    def getpid():
+        return os.getpid()
+
+    async def wait(self, seconds):
+        await asyncio.sleep(seconds)
+
+    def cleanup(self):
+        logger.info("Cleaning up %s of process %s now", self, os.getpid())
+        try:
+            from pytest_cov.embed import cleanup
+
+            cleanup()
+        except ImportError:  # pragma: no cover
+            pass
+
+
+class RayMainPool(RayPoolBase):
+    _actor_pool: RayMainActorPool
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self._args = args
+        self._kwargs = kwargs
+        self._start_timestamp = time.time_ns()
+
+    async def start(self):
+        # Create the mars pool outside the constructor so that a failure here does not
+        # fail ray actor creation, where ray cannot surface the creation exception.
+        address, n_process, sub_pool_handles = self._args
+        assert (
+            self._state == RayPoolState.INIT
+        ), f"The pool {address} is already started, current state is {self._state}"
+        self._actor_pool = await create_actor_pool(
+            address,
+            n_process=n_process,
+            pool_cls=RayMainActorPool,
+            sub_pool_handles=sub_pool_handles,
+            **self._kwargs,
+        )
+        self._set_ray_server(self._actor_pool)
+        self._state = RayPoolState.POOL_READY
+        logger.info("Started main pool %s with %s processes.", address, n_process)
+
+    async def mark_service_ready(self):
+        results = []
+        for _, sub_pool in self._actor_pool.sub_processes.items():
+            r = sub_pool.mark_service_ready.remote()
+            results.append(r)
+        await asyncio.gather(*results)
+        self._state = RayPoolState.SERVICE_READY
+        await self._actor_pool.start_monitor()
+
+    async def alive(self):
+        await asyncio.sleep(30)
+        return self._start_timestamp
+
+
+class RaySubPool(RayPoolBase):
+    _actor_pool: RaySubActorPool
+
+    def __init__(self, *args):
+        super().__init__()
+        self._args = args
+        self._actor_pool_config = None
+        self._check_alive_task = None
+        self._main_pool_start_timestamp = None
+
+    def set_actor_pool_config(self, actor_pool_config):
+        self._actor_pool_config = actor_pool_config
+
+    async def start(self):
+        # Create the mars pool outside the constructor so that a failure here does not
+        # fail ray actor creation, where ray cannot surface the creation exception.
+        main_pool_address, process_index = self._args
+        logger.info(
+            "Start to init sub pool %s for main pool %s.",
+            process_index,
+            main_pool_address,
+        )
+        main_pool = ray.get_actor(main_pool_address)
+        self._check_alive_task = asyncio.create_task(
+            self.check_main_pool_alive(main_pool)
+        )
+        if self._actor_pool_config is None:
+            self._actor_pool_config = await main_pool.actor_pool.remote("_config")
+        pool_config = self._actor_pool_config.get_pool_config(process_index)
+        sub_pool_address = pool_config["external_address"]
+        assert (
+            self._state == RayPoolState.INIT
+        ), f"The pool {sub_pool_address} is already started, current state is {self._state}"
+        env = pool_config["env"]
+        if env:  # pragma: no cover
+            os.environ.update(env)
+        self._actor_pool = await RaySubActorPool.create(
+            {
+                "actor_pool_config": self._actor_pool_config,
+                "process_index": process_index,
+            }
+        )
+        self._set_ray_server(self._actor_pool)
+        await self._actor_pool.start()
+        asyncio.create_task(self._actor_pool.join())
+        self._state = RayPoolState.POOL_READY
+        logger.info("Started sub pool %s.", sub_pool_address)
+
+    def mark_service_ready(self):
+        self._state = RayPoolState.SERVICE_READY
+
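+    # The main pool's alive() returns its start timestamp after a ~30s sleep; if the
+    # timestamp changes (the main pool restarted) or the call raises (the main pool
+    # died), this sub pool process exits immediately.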
+    async def check_main_pool_alive(self, main_pool):
+        try:
+            main_pool_start_timestamp = await main_pool.alive.remote()
+            if self._main_pool_start_timestamp is None:
+                self._main_pool_start_timestamp = main_pool_start_timestamp
+            if (
+                main_pool_start_timestamp != self._main_pool_start_timestamp
+            ):  # pragma: no cover
+                logger.error(
+                    "Main pool %s has restarted at %s, exiting current sub pool now.",
+                    main_pool,
+                    datetime.datetime.fromtimestamp(main_pool_start_timestamp / 1e9),
+                )
+                os._exit(0)
+        except:  # noqa: E722  # pylint: disable=bare-except  # pragma: no cover
+            logger.exception(
+                "Main pool %s has exited, exit current sub pool now.", main_pool
+            )
+            os._exit(0)
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/__init__.py b/python/xorbits/_mars/oscar/backends/ray/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_communication.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_communication.py
new file mode 100644
index 000000000..a49314710
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_communication.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import inspect
+
+import pytest
+
+from .....tests.core import require_ray
+from .....utils import ensure_coverage, lazy_import
+from ....core import ActorRef
+from ....errors import ServerClosed
+from ...communication.base import ChannelType
+from ...message import SendMessage
+from ..communication import Channel, ChannelID, RayClient, RayServer, msg_to_simple_str
+
+ray = lazy_import("ray")
+
+
+class ServerActor:
+    def __new__(cls, *args, **kwargs):
+        ensure_coverage()
+        # object.__new__ takes no extra arguments, so drop them here.
+        return super().__new__(cls)
+
+    def __init__(self, address):
+        self.address = address
+        self.server = None
+
+    async def start(self):
+        RayServer.set_ray_actor_started()
+        self.server = await RayServer.create(
+            {"address": self.address, "handle_channel": self.on_new_channel}
+        )
+
+    async def on_new_channel(self, channel: Channel):
+        while True:
+            try:
+                message = await channel.recv()
+                await channel.send(message)
+            except EOFError:
+                # no data to read, close the channel
+                await channel.close()
+                return
+            await asyncio.sleep(0.1)
+
+    async def __on_ray_recv__(self, channel_id: ChannelID, message):
+        """Method for communication based on ray actors"""
+        return await self.server.__on_ray_recv__(channel_id, message)
+
+    async def server(self, method_name, *args, **kwargs):
+        result = getattr(self.server, method_name)(*args, **kwargs)
+        if inspect.iscoroutine(result):
+            result = await result
+        return result
+
+
+class ServerCallActor(ServerActor):
+    def __init__(self, address):
+        super().__init__(address)
+
+    async def check(self, dest_address, x):
+        client = await RayClient.connect(dest_address, self.address)
+        await client.send(x)
+        return await client.recv() == x
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_driver_to_actor_channel(ray_start_regular):
+    dest_address = "ray://test_cluster/0/0"
+    server_actor = (
+        ray.remote(ServerActor).options(name=dest_address).remote(dest_address)
+    )
+    await server_actor.start.remote()
+    client = await RayClient.connect(dest_address, None)
+    assert client.channel_type == ChannelType.ray
+    for i in range(10):
+        await client.send(i)
+        assert await client.recv() == i
+    await server_actor.server.remote("stop")
+    with pytest.raises(ServerClosed):
+        await client.send(1)
+        await client.recv()
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_actor_to_actor_channel(ray_start_regular):
+    server1_address, server2_address = (
+        "ray://test_cluster/0/0",
+        "ray://test_cluster/0/1",
+    )
+    server_actor1 = (
+        ray.remote(ServerCallActor)
+        .options(name=server1_address)
+        .remote(server1_address)
+    )
+    server_actor2 = (
+        ray.remote(ServerCallActor)
+        .options(name=server2_address)
+        .remote(server2_address)
+    )
+    await server_actor1.start.remote()
+    await server_actor2.start.remote()
+    for client in [
+        await RayClient.connect(addr, None)
+        for addr in [server1_address, server2_address]
+    ]:
+        for i in range(10):
+            await client.send(i)
+            assert await client.recv() == i
+    for i in range(10):
+        assert await server_actor1.check.remote(server2_address, i)
+        assert await server_actor2.check.remote(server1_address, i)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_msg_to_simple_str(ray_start_regular):
+    assert msg_to_simple_str(1) == "1"
+    assert msg_to_simple_str(True) == "True"
+    assert msg_to_simple_str("a") == "a"
+    assert msg_to_simple_str([1, 2]) == "List<1, 2...2>"
+    assert msg_to_simple_str({1, 2}) == "Set<1, 2...2>"
+    assert msg_to_simple_str((1, 2.0, False)) == "Tuple<1, 2.0, False...3>"
+    assert msg_to_simple_str({"a": [1, 2]}) == "Dict...1>"
+    assert (
+        msg_to_simple_str(
+            SendMessage(
+                message_id=b"abc", actor_ref=ActorRef("addr", b"id"), content="abc"
+            )
+        )
+        == "SendMessage(actor_ref=ActorRef(uid=b'id', address='addr'), content=abc)"
+    )
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_context.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_context.py
new file mode 100644
index 000000000..d2d2d3860
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_context.py
@@ -0,0 +1,145 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import time
+
+import pytest
+
+from .....tests.core import require_ray
+from .....utils import lazy_import
+from ...mars.tests import test_mars_actor_context
+from ...router import Router
+from ..backend import RayActorBackend
+from ..communication import RayServer
+from ..pool import RayMainPool
+from ..utils import process_placement_to_address
+
+ray = lazy_import("ray")
+
+
+@pytest.fixture
+async def actor_pool_context():
+    pg_name, n_process = f"ray_cluster_{time.time_ns()}", 2
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    # Hold actor_handle to avoid actor being freed.
+    pg = ray.util.placement_group(
+        name=pg_name, bundles=[{"CPU": n_process}], strategy="SPREAD"
+    )
+    ray.get(pg.ready())
+    pg, _ = ray.util.get_placement_group(pg_name), 0
+    pool_handle = await RayActorBackend._create_ray_pools(address, n_process)
+    await pool_handle.start.remote()
+
+    class ProxyPool:
+        def __init__(self, ray_pool_actor_handle):
+            self.ray_pool_actor_handle = ray_pool_actor_handle
+
+        def __getattr__(self, item):
+            if hasattr(RayMainPool, item) and inspect.isfunction(
+                getattr(RayMainPool, item)
+            ):
+
+                def call(*args, **kwargs):
+                    ray.get(
+                        self.ray_pool_actor_handle.actor_pool.remote(
+                            item, *args, **kwargs
+                        )
+                    )
+
+                return call
+
+            return ray.get(self.ray_pool_actor_handle.actor_pool.remote(item))
+
+    yield ProxyPool(pool_handle)
+    for addr in [
+        process_placement_to_address(pg_name, 0, process_index=i)
+        for i in range(n_process)
+    ]:
+        try:
+            # kill the main pool first to avoid the main pool monitor task recreating the sub pool
+            ray.kill(ray.get_actor(addr))
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            pass
+    ray.util.remove_placement_group(pg)
+    Router.set_instance(None)
+    RayServer.clear()
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_simple_local_actor_pool(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_simple_local_actor_pool(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_post_create_pre_destroy(
+    ray_start_regular_shared, actor_pool_context
+):
+    await test_mars_actor_context.test_mars_post_create_pre_destroy(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_create_actor(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_create_actor(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_create_actor_error(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_create_actor_error(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_send(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_send(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_send_error(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_send_error(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_tell(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_tell(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_batch_method(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_batch_method(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_destroy_has_actor(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_destroy_has_actor(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_mars_resource_lock(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_mars_resource_lock(actor_pool_context)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_promise_chain(ray_start_regular_shared, actor_pool_context):
+    await test_mars_actor_context.test_promise_chain(actor_pool_context)
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_driver.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_driver.py
new file mode 100644
index 000000000..744d81572
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_actor_driver.py
@@ -0,0 +1,174 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import os
+
+import pytest
+
+from ..... import oscar as mo
+from .....tests.core import require_ray
+from .....utils import lazy_import
+from ..communication import RayServer
+from ..driver import RayActorDriver
+from ..utils import (
+    addresses_to_placement_group_info,
+    get_placement_group,
+    node_address_to_placement,
+    placement_group_info_to_addresses,
+    process_address_to_placement,
+    process_placement_to_address,
+)
+
+ray = lazy_import("ray")
+
+TEST_PLACEMENT_GROUP_NAME = "test_placement_group"
+TEST_PLACEMENT_GROUP_BUNDLES = [{"CPU": 3}, {"CPU": 5}, {"CPU": 7}]
+TEST_ADDRESS_TO_RESOURCES = placement_group_info_to_addresses(
+    TEST_PLACEMENT_GROUP_NAME, TEST_PLACEMENT_GROUP_BUNDLES
+)
+
+
+class DummyActor(mo.Actor):
+    def __init__(self, index):
+        super().__init__()
+        self._index = index
+
+    def getppid(self):
+        return os.getppid()
+
+    def index(self):
+        return self._index
+
+
+@pytest.fixture
+async def mars_cluster():
+    mo.setup_cluster(address_to_resources=TEST_ADDRESS_TO_RESOURCES)
+    main_pool_handles = []  # Hold actor_handle to avoid actor being freed.
+    for index, bundle_spec in enumerate(TEST_PLACEMENT_GROUP_BUNDLES):
+        address = process_placement_to_address(TEST_PLACEMENT_GROUP_NAME, index, 0)
+        actor_handle = await mo.create_actor_pool(address, bundle_spec["CPU"])
+        main_pool_handles.append(actor_handle)
+
+    yield
+
+    RayActorDriver.stop_cluster()
+    RayServer.clear()
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_create_actor_in_placement_group(ray_large_cluster, mars_cluster):
+    actor_refs = []
+    for i, r in enumerate(TEST_PLACEMENT_GROUP_BUNDLES):
+        for _ in range(r["CPU"]):
+            address = process_placement_to_address(TEST_PLACEMENT_GROUP_NAME, i, 0)
+            actor_ref = await mo.create_actor(DummyActor, i, address=address)
+            actor_refs.append(actor_ref)
+    results = []
+    for actor_ref in actor_refs:
+        ppid = await actor_ref.getppid()
+        index = await actor_ref.index()
+        results.append((ppid, index))
+
+    counter = collections.Counter(results)
+    assert len(counter) == len(TEST_PLACEMENT_GROUP_BUNDLES)
+    assert sorted(counter.values()) == sorted(
+        r["CPU"] for r in TEST_PLACEMENT_GROUP_BUNDLES
+    )
+
+
+def test_address_to_pg_bundle():
+    # Missing bundle index.
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://bundle_name")
+    # Extra path is not allowed.
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://bundle_name/0/")
+    # The scheme is not ray
+    with pytest.raises(ValueError):
+        node_address_to_placement("http://bundle_name/0")
+    # The bundle index is not an int string.
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://abc/def")
+    pg_name, bundle_index = node_address_to_placement("ray://bundle_name/0")
+    assert pg_name == "bundle_name"
+    assert bundle_index == 0
+    pg_name, bundle_index = node_address_to_placement("ray://127.0.0.1/1")
+    assert pg_name == "127.0.0.1"
+    assert bundle_index == 1
+    pg_name, bundle_index = node_address_to_placement("ray://127.0.0.1%2F2")
+    assert pg_name == "127.0.0.1"
+    assert bundle_index == 2
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://")
+
+
+def test_addresses_to_placement_group_info():
+    # Missing bundle index 1
+    with pytest.raises(ValueError):
+        addresses_to_placement_group_info(
+            {"ray://127.0.0.1/0": {"CPU": 1}, "ray://127.0.0.1/2": {"CPU": 1}}
+        )
+    # The bundle index does not start from 0
+    with pytest.raises(ValueError):
+        addresses_to_placement_group_info({"ray://127.0.0.1/1": {"CPU": 1}})
+    pg_name, bundles = addresses_to_placement_group_info(
+        {"ray://127.0.0.1/0": {"CPU": 1}}
+    )
+    assert pg_name == "127.0.0.1"
+    assert bundles == [{"CPU": 1}]
+    pg_name, bundles = addresses_to_placement_group_info(
+        {
+            "ray://127.0.0.1/4": {"CPU": 4},
+            "ray://127.0.0.1/2": {"CPU": 2},
+            "ray://127.0.0.1/1": {"CPU": 1},
+            "ray://127.0.0.1/3": {"CPU": 3},
+            "ray://127.0.0.1/0": {"CPU": 0},
+        }
+    )
+    assert pg_name == "127.0.0.1"
+    assert bundles == [{"CPU": 0}, {"CPU": 1}, {"CPU": 2}, {"CPU": 3}, {"CPU": 4}]
+    pg_name, bundles = addresses_to_placement_group_info(TEST_ADDRESS_TO_RESOURCES)
+    assert pg_name == TEST_PLACEMENT_GROUP_NAME
+    assert bundles == TEST_PLACEMENT_GROUP_BUNDLES
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_get_placement_group(ray_large_cluster):
+    pg_name = "test_pg"
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": 1}], strategy="SPREAD")
+    ray.get(pg.ready())
+    pg2 = get_placement_group(pg_name)
+    assert pg2.bundle_specs == pg.bundle_specs
+
+
+def test_address_to_placement():
+    assert process_address_to_placement("ray://test_cluster/0/0") == (
+        "test_cluster",
+        0,
+        0,
+    )
+    with pytest.raises(ValueError):
+        process_address_to_placement("ray://")
+    assert node_address_to_placement("ray://test_cluster/0") == ("test_cluster", 0)
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://")
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://test_cluster")
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://test_cluster/")
+    with pytest.raises(ValueError):
+        node_address_to_placement("ray://test_cluster//")
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_pool.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_pool.py
new file mode 100644
index 000000000..48b4e270c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_ray_pool.py
@@ -0,0 +1,213 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+
+import pytest
+
+from ..... import oscar as mo
+from .....tests.core import mock, require_ray
+from .....utils import lazy_import
+from ....context import get_context
+from ....errors import ServerClosed
+from ...allocate_strategy import MainPool, ProcessIndex
+from ..backend import RayActorBackend
+from ..pool import RayMainActorPool, RayPoolState, create_actor_pool
+from ..utils import kill_and_wait, process_placement_to_address
+
+ray = lazy_import("ray")
+
+
+class TestActor(mo.Actor):
+    __test__ = False
+
+    async def kill(self, address, uid):
+        actor_ref = await mo.actor_ref(address, uid)
+        task = asyncio.create_task(actor_ref.crash())
+        return await task
+
+    async def crash(self):
+        os._exit(0)
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_main_pool(ray_start_regular):
+    pg, pg_name, n_process = None, "ray_cluster", 3
+    if hasattr(ray.util, "get_placement_group"):
+        pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+        ray.get(pg.ready())
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    addresses = RayMainActorPool.get_external_addresses(address, n_process)
+    assert addresses == [address] + [
+        process_placement_to_address(pg_name, 0, process_index=i + 1)
+        for i in range(n_process)
+    ]
+    assert RayMainActorPool.gen_internal_address(0, address) == address
+
+    pool_handle = await RayActorBackend._create_ray_pools(address, n_process)
+    main_actor_pool = await create_actor_pool(
+        address,
+        n_process=n_process,
+        pool_cls=RayMainActorPool,
+        sub_pool_handles=pool_handle.sub_pools,
+    )
+    async with main_actor_pool:
+        sub_processes = list(main_actor_pool.sub_processes.values())
+        assert len(sub_processes) == n_process
+        await main_actor_pool.kill_sub_pool(sub_processes[0], force=True)
+        assert not (await main_actor_pool.is_sub_pool_alive(sub_processes[0]))
+        await main_actor_pool.kill_sub_pool(sub_processes[1], force=False)
+        assert not (await main_actor_pool.is_sub_pool_alive(sub_processes[1]))
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_shutdown_sub_pool(ray_start_regular):
+    import ray
+
+    pg_name, n_process = "ray_cluster", 2
+    if hasattr(ray.util, "get_placement_group"):
+        pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+        ray.get(pg.ready())
+    else:
+        pg = None
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    pool_handle = await RayActorBackend._create_ray_pools(address, n_process)
+    actor_handle = pool_handle.main_pool
+    await actor_handle.start.remote()
+    sub_pool_address1 = process_placement_to_address(pg_name, 0, process_index=1)
+    sub_pool_handle1 = ray.get_actor(sub_pool_address1)
+    sub_pool_address2 = process_placement_to_address(pg_name, 0, process_index=2)
+    sub_pool_handle2 = ray.get_actor(sub_pool_address2)
+    await actor_handle.actor_pool.remote(
+        "stop_sub_pool", sub_pool_address1, sub_pool_handle1, force=True
+    )
+    await actor_handle.actor_pool.remote(
+        "stop_sub_pool", sub_pool_address2, sub_pool_handle2, force=False
+    )
+    assert await sub_pool_handle1.state.remote() == RayPoolState.INIT
+    assert await sub_pool_handle2.state.remote() == RayPoolState.INIT
+
+
+@require_ray
+@pytest.mark.asyncio
+async def test_server_closed(ray_start_regular):
+    pg_name, n_process = "ray_cluster", 1
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+    ray.get(pg.ready())
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    # start the actor pool
+    actor_handle = await mo.create_actor_pool(address, n_process=n_process)
+    await actor_handle.mark_service_ready.remote()
+
+    ctx = get_context()
+    actor_main = await ctx.create_actor(
+        TestActor, address=address, uid="Test-main", allocate_strategy=ProcessIndex(0)
+    )
+
+    actor_sub = await ctx.create_actor(
+        TestActor, address=address, uid="Test-sub", allocate_strategy=ProcessIndex(1)
+    )
+
+    # test calling from ray driver to ray actor
+    task = asyncio.create_task(actor_sub.crash())
+
+    with pytest.raises(ServerClosed):
+        # the process has already died,
+        # so ServerClosed will be raised
+        await task
+
+    # wait for the sub pool to recover
+    await ctx.wait_actor_pool_recovered(actor_sub.address, address)
+
+    # test calling from ray actor to ray actor
+    task = asyncio.create_task(actor_main.kill(actor_sub.address, "Test-sub"))
+
+    with pytest.raises(ServerClosed):
+        await task
+
+
+@require_ray
+@pytest.mark.asyncio
+@pytest.mark.parametrize("auto_recover", [False, True, "actor", "process"])
+async def test_auto_recover(ray_start_regular, auto_recover):
+    pg_name, n_process = "ray_cluster", 1
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+    assert pg.wait(timeout_seconds=20)
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    actor_handle = await mo.create_actor_pool(
+        address, n_process=n_process, auto_recover=auto_recover
+    )
+    await actor_handle.mark_service_ready.remote()
+
+    ctx = get_context()
+
+    # waiting for the main pool to recover always returns immediately
+    await ctx.wait_actor_pool_recovered(address, address)
+
+    # create actor on main
+    actor_ref = await ctx.create_actor(
+        TestActor, address=address, allocate_strategy=MainPool()
+    )
+
+    with pytest.raises(ValueError):
+        # cannot kill actors on main pool
+        await mo.kill_actor(actor_ref)
+
+    # create actor
+    actor_ref = await ctx.create_actor(
+        TestActor, address=address, allocate_strategy=ProcessIndex(1)
+    )
+    # kill_actor will kill the corresponding process
+    await ctx.kill_actor(actor_ref)
+
+    if auto_recover:
+        await ctx.wait_actor_pool_recovered(actor_ref.address, address)
+        sub_pool_address = process_placement_to_address(pg_name, 0, process_index=1)
+        sub_pool_handle = ray.get_actor(sub_pool_address)
+        if auto_recover == "process":
+            assert await sub_pool_handle.state.remote() == RayPoolState.POOL_READY
+        else:
+            assert await sub_pool_handle.state.remote() == RayPoolState.SERVICE_READY
+
+        expect_has_actor = auto_recover in ["actor", True]
+        assert await ctx.has_actor(actor_ref) is expect_has_actor
+    else:
+        with pytest.raises((ServerClosed, ConnectionError)):
+            await ctx.has_actor(actor_ref)
+
+    if "COV_CORE_SOURCE" in os.environ:
+        for addr in [
+            process_placement_to_address(pg_name, 0, process_index=i) for i in range(2)
+        ]:
+            # must save the local reference until this is fixed:
+            # https://github.com/ray-project/ray/issues/7815
+            ray_actor = ray.get_actor(addr)
+            ray.get(ray_actor.cleanup.remote())
+
+
+@require_ray
+@pytest.mark.asyncio
+@mock.patch("ray.kill")
+async def test_kill_and_wait_timeout(fake_ray_kill, ray_start_regular):
+    pg_name, n_process = "ray_cluster", 1
+    pg = ray.util.placement_group(name=pg_name, bundles=[{"CPU": n_process}])
+    ray.get(pg.ready())
+    address = process_placement_to_address(pg_name, 0, process_index=0)
+    # start the actor pool
+    actor_handle = await mo.create_actor_pool(address, n_process=n_process)
+    with pytest.raises(Exception, match="not died"):
+        await kill_and_wait(actor_handle, timeout=1)
diff --git a/python/xorbits/_mars/oscar/backends/ray/tests/test_utils.py b/python/xorbits/_mars/oscar/backends/ray/tests/test_utils.py
new file mode 100644
index 000000000..fa2bfeda4
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/tests/test_utils.py
@@ -0,0 +1,33 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .....tests.core import mock, require_ray
+from .....utils import lazy_import
+from ..utils import report_event
+
+ray = lazy_import("ray")
+
+
+@require_ray
+@mock.patch("ray.report_event")
+def test_report_event(fake_report_event, ray_start_regular):
+    arguments = []
+
+    def _report_event(*args):
+        arguments.extend(args)
+
+    fake_report_event.side_effect = _report_event
+    severity, label, message = "WARNING", "test_label", "test_message"
+    report_event(severity, label, message)
+    assert arguments == [ray.EventSeverity.WARNING, label, message]
diff --git a/python/xorbits/_mars/oscar/backends/ray/utils.py b/python/xorbits/_mars/oscar/backends/ray/utils.py
new file mode 100644
index 000000000..f50c1886c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/ray/utils.py
@@ -0,0 +1,203 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import enum
+import logging
+import os
+import posixpath
+from urllib.parse import unquote, urlparse
+
+from ....utils import lazy_import, lazy_import_on_load
+
+ray = lazy_import("ray")
+
+logger = logging.getLogger(__name__)
+
+
+def get_placement_group(pg_name):  # pragma: no cover
+    return ray.util.get_placement_group(pg_name)
+
+
+def process_address_to_placement(address):
+    """
+    Parameters
+    ----------
+    address: str
+        The address of an actor pool running in a ray actor. It is also the
+        name of the ray actor, e.g. ray://${pg_name}/${bundle_index}/${process_index}
+
+    Returns
+    -------
+    tuple
+        A tuple consisting of placement group name, bundle index, process index.
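+
+    Examples
+    --------
+    A small illustration, mirroring the accompanying tests (the placement
+    group name and indices are arbitrary):
+
+    >>> process_address_to_placement("ray://test_cluster/0/1")
+    ('test_cluster', 0, 1)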
+    """
+    name, parts = _address_to_placement(address)
+    if not parts or len(parts) != 2:
+        raise ValueError(
+            f"Only bundle index and process index path are allowed in ray "
+            f"address {address} but got {parts}."
+        )
+    bundle_index, process_index = parts
+    return name, int(bundle_index), int(process_index)
+
+
+def node_address_to_placement(address):
+    """
+    Parameters
+    ----------
+    address : str
+        The address of a node. ex: ray://${pg_name}/${bundle_index}
+
+    Returns
+    -------
+    tuple
+        A tuple consisting of placement group name, bundle index.
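+
+    Examples
+    --------
+    For instance, mirroring the accompanying tests:
+
+    >>> node_address_to_placement("ray://bundle_name/0")
+    ('bundle_name', 0)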
+    """
+    name, parts = _address_to_placement(address)
+    if not parts or len(parts) != 1:
+        raise ValueError(
+            f"Only bundle index path is allowed in ray address {address} but got {parts}"
+        )
+    bundle_index = parts[0]
+    return name, int(bundle_index)
+
+
+def _address_to_placement(address):
+    """
+
+    Parameters
+    ----------
+    address : str
+        The address of a node or an actor pool which running in a ray actor.
+
+    Returns
+    -------
+    tuple
+        A tuple consisting of placement group name, bundle index, process index.
+    """
+    parsed_url = urlparse(unquote(address))
+    if parsed_url.scheme != "ray":
+        raise ValueError(f"The address scheme is not ray: {address}")
+    # os.path.split will not handle backslashes (\) correctly,
+    # so we use the posixpath.
+    parts = []
+    if parsed_url.netloc:
+        tmp = parsed_url.path
+        while tmp and tmp != "/":
+            tmp2, item = posixpath.split(tmp)
+            parts.append(item)
+            if tmp2 != tmp:
+                tmp = tmp2
+            else:
+                parts.append(tmp2)
+                break
+    parts = list(reversed(parts))
+    return parsed_url.netloc, parts
+
+
+def process_placement_to_address(
+    pg_name: str, bundle_index: int, process_index: int = 0
+):
+    return f"ray://{pg_name}/{bundle_index}/{process_index}"
+
+
+def node_placement_to_address(pg_name, bundle_index):
+    return f"ray://{pg_name}/{bundle_index}"
+
+
+def addresses_to_placement_group_info(address_to_resources):
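+    """
+    Convert a mapping from node addresses to bundle resources into placement
+    group info, i.e. a ``(pg_name, bundles)`` tuple.
+
+    A small sketch of the expected shape (resource values are illustrative):
+
+    >>> addresses_to_placement_group_info(
+    ...     {"ray://pg/0": {"CPU": 1}, "ray://pg/1": {"CPU": 2}}
+    ... )
+    ('pg', [{'CPU': 1}, {'CPU': 2}])
+    """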
+    bundles = {}
+    pg_name = None
+    for address, bundle_resources in address_to_resources.items():
+        name, bundle_index = node_address_to_placement(address)
+        if pg_name is None:
+            pg_name = name
+        else:
+            if name != pg_name:
+                raise ValueError(
+                    "All addresses should have consistent placement group names."
+                )
+        bundles[bundle_index] = bundle_resources
+    sorted_bundle_keys = sorted(bundles.keys())
+    if sorted_bundle_keys != list(range(len(address_to_resources))):
+        raise ValueError("The addresses contains invalid bundle.")
+    bundles = [bundles[k] for k in sorted_bundle_keys]
+    if not pg_name:
+        raise ValueError("Can't find a valid placement group name.")
+    return pg_name, bundles
+
+
+def placement_group_info_to_addresses(pg_name, bundles):
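+    """
+    Inverse of ``addresses_to_placement_group_info``: expand a placement group
+    name and its bundles into a mapping from node addresses to resources.
+
+    A small sketch (resource values are illustrative):
+
+    >>> placement_group_info_to_addresses("pg", [{"CPU": 1}, {"CPU": 2}])
+    {'ray://pg/0': {'CPU': 1}, 'ray://pg/1': {'CPU': 2}}
+    """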
+    addresses = {}
+    for bundle_index, bundle_resources in enumerate(bundles):
+        address = node_placement_to_address(pg_name, bundle_index)
+        addresses[address] = bundle_resources
+    return addresses
+
+
+async def kill_and_wait(
+    actor_handle: "ray.actor.ActorHandle", no_restart=False, timeout: float = 30
+):
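+    """
+    Kill a ray actor and wait until it has actually terminated.
+
+    An exception is raised when the actor is still alive ``timeout`` seconds
+    after ``ray.kill``, e.g. ``await kill_and_wait(actor_handle, timeout=30)``.
+    """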
+    if "COV_CORE_SOURCE" in os.environ:  # pragma: no cover
+        try:
+            # must clean up first, or coverage info lost
+            await actor_handle.cleanup.remote()
+        except:  # noqa: E722  # nosec  # pylint: disable=bare-except
+            pass
+    r = actor_handle.wait.remote(timeout)
+    ray.kill(actor_handle, no_restart=no_restart)
+    ready, _ = await asyncio.wait([r], timeout=timeout)
+    if ready:
+        try:
+            await r
+        except ray.exceptions.RayActorError:
+            return  # We expect a RayActorError; it indicates that the actor has died.
+    raise Exception(
+        f"The actor {actor_handle} is not died after ray.kill {timeout} seconds."
+    )
+
+
+@lazy_import_on_load(ray)
+def _patch_event_security():
+    global ray
+
+    if ray and not hasattr(ray, "report_event"):  # pragma: no cover
+        # lower version of ray doesn't support event
+
+        class EventSeverity(enum.Enum):
+            INFO = 0
+            WARNING = 1
+            ERROR = 2
+            FATAL = 3
+
+        def _report_event(severity, label, message):
+            logger.warning(
+                "severity: %s, label: %s, message: %s.", severity, label, message
+            )
+
+        import ray
+
+        ray.EventSeverity = EventSeverity
+        ray.report_event = _report_event
+
+
+def report_event(severity, label, message):
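+    """
+    Report an event to ray when ray is initialized; a severity given as a string
+    such as ``"WARNING"`` is resolved to the matching ``ray.EventSeverity`` member.
+    """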
+    if ray and ray.is_initialized():
+        severity = (
+            getattr(ray.EventSeverity, severity)
+            if isinstance(severity, str)
+            else severity
+        )
+        ray.report_event(severity, label, message)
diff --git a/python/xorbits/_mars/oscar/backends/router.py b/python/xorbits/_mars/oscar/backends/router.py
new file mode 100644
index 000000000..8b5ca749d
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/router.py
@@ -0,0 +1,134 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import threading
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+from .communication import Client, get_client_type
+
+
+class Router:
+    """
+    Router provides mapping from external address to internal address.
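+
+    A minimal sketch of the mapping behaviour (all addresses are illustrative):
+
+    >>> router = Router(
+    ...     ["127.0.0.1:1111"], "dummy://0", mapping={"127.0.0.1:2222": "dummy://1"}
+    ... )
+    >>> router.get_internal_address("127.0.0.1:1111")
+    'dummy://0'
+    >>> router.get_internal_address("127.0.0.1:2222")
+    'dummy://1'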
+    """
+
+    __slots__ = (
+        "_curr_external_addresses",
+        "_local_mapping",
+        "_mapping",
+        "_comm_config",
+        "_cache_local",
+    )
+
+    _instance: "Router" = None
+
+    @staticmethod
+    def set_instance(router: Optional["Router"]):
+        # Default router is set when an actor pool started
+        Router._instance = router
+
+    @staticmethod
+    def get_instance() -> "Router":
+        return Router._instance
+
+    @staticmethod
+    def get_instance_or_empty() -> "Router":
+        return Router._instance or Router(list(), None)
+
+    def __init__(
+        self,
+        external_addresses: List[str],
+        local_address: Optional[str],
+        mapping: Dict[str, str] = None,
+        comm_config: dict = None,
+    ):
+        self._curr_external_addresses = external_addresses
+        self._local_mapping = dict()
+        for addr in self._curr_external_addresses:
+            self._local_mapping[addr] = local_address
+        if mapping is None:
+            mapping = dict()
+        self._mapping = mapping
+        self._comm_config = comm_config or dict()
+        self._cache_local = threading.local()
+
+    @property
+    def _cache(self) -> Dict[Tuple[str, Any], Client]:
+        try:
+            return self._cache_local.cache
+        except AttributeError:
+            cache = self._cache_local.cache = dict()
+            return cache
+
+    def set_mapping(self, mapping: Dict[str, str]):
+        self._mapping = mapping
+        self._cache_local = threading.local()
+
+    def add_router(self, router: "Router"):
+        self._curr_external_addresses.extend(router._curr_external_addresses)
+        self._local_mapping.update(router._local_mapping)
+        self._mapping.update(router._mapping)
+        self._comm_config.update(router._comm_config)
+        self._cache_local = threading.local()
+
+    def remove_router(self, router: "Router"):
+        for external_address in router._curr_external_addresses:
+            try:
+                self._curr_external_addresses.remove(external_address)
+            except ValueError:
+                pass
+        for addr in router._local_mapping:
+            self._local_mapping.pop(addr, None)
+        for addr in router._mapping:
+            self._mapping.pop(addr, None)
+        self._cache_local = threading.local()
+
+    @property
+    def external_address(self):
+        if self._curr_external_addresses:
+            return self._curr_external_addresses[0]
+
+    def get_internal_address(self, external_address: str) -> str:
+        if external_address in self._curr_external_addresses:
+            # local address, use dummy address
+            return self._local_mapping.get(external_address)
+        # try to lookup inner address from address mapping
+        return self._mapping.get(external_address)
+
+    async def get_client(
+        self, external_address: str, from_who: Any = None, cached: bool = True, **kw
+    ) -> Client:
+        if cached and (external_address, from_who) in self._cache:
+            cached_client = self._cache[external_address, from_who]
+            if cached_client.closed:
+                # closed before, ignore it
+                del self._cache[external_address, from_who]
+            else:
+                return cached_client
+
+        address = self.get_internal_address(external_address)
+        if address is None:
+            # no inner address, just use external address
+            address = external_address
+        client_type: Type[Client] = get_client_type(address)
+        local_address = (
+            self._curr_external_addresses[0] if self._curr_external_addresses else None
+        )
+        config = client_type.parse_config(self._comm_config)
+        if config:
+            kw["config"] = config
+        client = await client_type.connect(address, local_address=local_address, **kw)
+        if cached:
+            self._cache[external_address, from_who] = client
+        return client
diff --git a/python/xorbits/_mars/oscar/backends/test/__init__.py b/python/xorbits/_mars/oscar/backends/test/__init__.py
new file mode 100644
index 000000000..f27d44076
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .backend import TestActorBackend
diff --git a/python/xorbits/_mars/oscar/backends/test/backend.py b/python/xorbits/_mars/oscar/backends/test/backend.py
new file mode 100644
index 000000000..6c13fb02a
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/backend.py
@@ -0,0 +1,33 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...backend import register_backend
+from ..mars.backend import MarsActorBackend, build_pool_kwargs
+from .pool import TestMainActorPool
+
+
+@register_backend
+class TestActorBackend(MarsActorBackend):
+    @staticmethod
+    def name():
+        return "test"
+
+    @classmethod
+    async def create_actor_pool(cls, address: str, n_process: int = None, **kwargs):
+        from ..pool import create_actor_pool
+
+        n_process, kwargs = build_pool_kwargs(n_process, kwargs)
+        return await create_actor_pool(
+            address, pool_cls=TestMainActorPool, n_process=n_process, **kwargs
+        )
diff --git a/python/xorbits/_mars/oscar/backends/test/pool.py b/python/xorbits/_mars/oscar/backends/test/pool.py
new file mode 100644
index 000000000..e73b3fcf4
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/pool.py
@@ -0,0 +1,135 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import multiprocessing
+from typing import Dict, List
+
+from ..communication import DummyServer, gen_local_address
+from ..config import ActorPoolConfig
+from ..mars.pool import MainActorPool, SubActorPool, SubpoolStatus
+from ..pool import ActorPoolType
+
+
+class TestMainActorPool(MainActorPool):
+    @classmethod
+    def get_external_addresses(
+        cls,
+        address: str,
+        n_process: int = None,
+        ports: List[int] = None,
+        schemes: List[str] = None,
+    ):
+        if "://" in address:
+            address = address.split("://", 1)[1]
+        return super().get_external_addresses(address, n_process=n_process, ports=ports)
+
+    @classmethod
+    def gen_internal_address(
+        cls, process_index: int, external_address: str = None
+    ) -> str:
+        return f"dummy://{process_index}"
+
+    @classmethod
+    async def start_sub_pool(
+        cls,
+        actor_pool_config: ActorPoolConfig,
+        process_index: int,
+        start_method: str = None,
+    ):
+        status_queue = multiprocessing.Queue()
+        return (
+            asyncio.create_task(
+                cls._create_sub_pool(actor_pool_config, process_index, status_queue)
+            ),
+            status_queue,
+        )
+
+    @classmethod
+    async def wait_sub_pools_ready(cls, create_pool_tasks: List[asyncio.Task]):
+        addresses = []
+        tasks = []
+        for t in create_pool_tasks:
+            pool_task, queue = await t
+            tasks.append(pool_task)
+            status = await asyncio.to_thread(queue.get)
+            addresses.append(status.external_addresses)
+        return tasks, addresses
+
+    @classmethod
+    async def _create_sub_pool(
+        cls,
+        actor_config: ActorPoolConfig,
+        process_index: int,
+        status_queue: multiprocessing.Queue,
+    ):
+        pool = await TestSubActorPool.create(
+            {"actor_pool_config": actor_config, "process_index": process_index}
+        )
+        await pool.start()
+        status_queue.put(
+            SubpoolStatus(status=0, external_addresses=[pool.external_address])
+        )
+        actor_config.reset_pool_external_address(process_index, [pool.external_address])
+        await pool.join()
+
+    def _sync_pool_config(self, actor_pool_config: ActorPoolConfig):
+        # test pool does not create routers, thus can skip this step
+        pass
+
+    async def kill_sub_pool(
+        self, process: multiprocessing.Process, force: bool = False
+    ):
+        process.cancel()
+
+    async def is_sub_pool_alive(self, process: multiprocessing.Process):
+        return not process.cancelled()
+
+
+class TestSubActorPool(SubActorPool):
+    def _sync_pool_config(self, actor_pool_config: ActorPoolConfig):
+        # test pool does not create routers, thus can skip this step
+        pass
+
+    @classmethod
+    async def create(cls, config: Dict) -> ActorPoolType:
+        kw = dict()
+        cls._parse_config(config, kw)
+        process_index: int = kw["process_index"]
+        actor_pool_config = kw["config"]  # type: ActorPoolConfig
+        external_addresses = actor_pool_config.get_pool_config(process_index)[
+            "external_address"
+        ]
+
+        def handle_channel(channel):
+            return pool.on_new_channel(channel)
+
+        # create servers
+        server_addresses = external_addresses + [gen_local_address(process_index)]
+        server_addresses = sorted(set(server_addresses))
+        servers = await cls._create_servers(
+            server_addresses, handle_channel, actor_pool_config.get_comm_config()
+        )
+        cls._update_stored_addresses(servers, server_addresses, actor_pool_config, kw)
+
+        # create pool
+        pool = cls(**kw)
+        return pool
+
+    async def stop(self):
+        # do not close dummy server
+        self._servers = [
+            s for s in self._servers[:-1] if not isinstance(s, DummyServer)
+        ]
+        await super().stop()
diff --git a/python/xorbits/_mars/oscar/backends/test/tests/__init__.py b/python/xorbits/_mars/oscar/backends/test/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/backends/test/tests/test_actor_context.py b/python/xorbits/_mars/oscar/backends/test/tests/test_actor_context.py
new file mode 100644
index 000000000..f42e49b51
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/tests/test_actor_context.py
@@ -0,0 +1,61 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+import pytest
+
+from ..... import oscar as mo
+
+
+class DummyActor(mo.Actor):
+    def __init__(self, value):
+        super().__init__()
+
+        if value < 0:
+            raise ValueError("value < 0")
+        self.value = value
+
+    async def add(self, value):
+        if not isinstance(value, int):
+            raise TypeError("add number must be int")
+        self.value += value
+        return self.value
+
+
+@pytest.fixture
+async def actor_pool_context():
+    start_method = (
+        os.environ.get("POOL_START_METHOD", "forkserver")
+        if sys.platform != "win32"
+        else None
+    )
+    pool = await mo.create_actor_pool(
+        "test://127.0.0.1", n_process=2, subprocess_start_method=start_method
+    )
+    async with pool:
+        yield pool
+
+
+@pytest.mark.asyncio
+async def test_simple(actor_pool_context):
+    pool = actor_pool_context
+    actor_ref = await mo.create_actor(
+        DummyActor,
+        100,
+        address=pool.external_address,
+        allocate_strategy=mo.allocate_strategy.RandomSubPool(),
+    )
+    assert await actor_ref.add(1) == 101
diff --git a/python/xorbits/_mars/oscar/backends/test/tests/test_message.py b/python/xorbits/_mars/oscar/backends/test/tests/test_message.py
new file mode 100644
index 000000000..3ec18751c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/backends/test/tests/test_message.py
@@ -0,0 +1,74 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cloudpickle as pickle
+
+from ...message import ErrorMessage
+
+
+def test_as_instanceof_cause():
+    fake_address = "Fake address"
+    fake_pid = 123
+    value = 3
+
+    class CustomException(Exception):
+        def __init__(self, i):
+            self.i = i
+
+        def __str__(self):
+            return "Custom Exception."
+
+    try:
+        raise CustomException(value)
+    except Exception as e:
+        em = ErrorMessage(
+            b"Fake message id", fake_address, fake_pid, type(e), e, e.__traceback__
+        )
+        assert "Fake message id" in repr(em)
+        try:
+            cause = em.as_instanceof_cause()
+            # Test serialization.
+            cause1 = pickle.loads(pickle.dumps(cause))
+            assert type(cause) is type(cause1)
+            raise cause
+        except Exception as e1:
+            e1 = pickle.loads(pickle.dumps(e1))
+            # Check cause exception.
+            assert isinstance(e1, CustomException)
+            assert e1.i == value
+            assert e1.address == fake_address
+            assert e1.pid == fake_pid
+            assert fake_address in str(e1)
+            assert "Custom Exception" in str(e1)
+            assert str(fake_pid) in str(e1)
+            em1 = ErrorMessage(
+                b"Fake message id",
+                fake_address,
+                fake_pid,
+                type(e1),
+                e1,
+                e1.__traceback__,
+            )
+            try:
+                raise em1.as_instanceof_cause()
+            except Exception as e2:
+                e2 = pickle.loads(pickle.dumps(e2))
+                # Check recursive cause exception.
+                assert isinstance(e2, CustomException)
+                assert e2.i == value
+                assert e2.address == fake_address
+                assert e2.pid == fake_pid
+                assert str(e2).count("Custom Exception") == 1
+                assert str(e2).count(fake_address) == 1
+                assert str(e2).count(str(fake_pid)) == 1
diff --git a/python/xorbits/_mars/oscar/batch.py b/python/xorbits/_mars/oscar/batch.py
new file mode 100644
index 000000000..37026e2a4
--- /dev/null
+++ b/python/xorbits/_mars/oscar/batch.py
@@ -0,0 +1,244 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import inspect
+import textwrap
+from collections import namedtuple
+from dataclasses import dataclass
+from typing import Callable, Dict, Optional, Tuple
+
+
+def build_args_binder(func, remove_self: bool = True) -> Optional[Callable]:
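+    """
+    Build a callable that captures the call arguments of ``func`` into a
+    namedtuple, optionally dropping the leading ``self`` argument from the
+    captured fields.
+
+    A rough sketch of the behaviour (``add`` is an illustrative function):
+
+    >>> def add(self, a, b=1):
+    ...     return a + b
+    >>> binder = build_args_binder(add)
+    >>> binder(None, 2, 3)  # called like the method; ``self`` is not captured
+    _Args_add(a=2, b=3)
+    """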
+    try:
+        spec = inspect.getfullargspec(func)
+    except TypeError:  # pragma: no cover
+        return None
+
+    sig_list = list(spec.args)
+    args_list = list(spec.args)
+    if remove_self:
+        args_list = args_list[1:]
+
+    if spec.varargs:
+        sig_list.append(f"*{spec.varargs}")
+        args_list.append(spec.varargs)
+    elif spec.kwonlyargs:
+        sig_list.append("*")
+
+    sig_list.extend(spec.kwonlyargs)
+    args_list.extend(spec.kwonlyargs)
+
+    if spec.varkw:
+        sig_list.append(f"**{spec.varkw}")
+        args_list.append(spec.varkw)
+
+    if getattr(func, "__name__", "").isidentifier():
+        ret_func_name = f"{func.__name__}_binder"
+        ret_type_name = f"_Args_{func.__name__}"
+    else:
+        ret_func_name = f"anon_{id(func)}_binder"
+        ret_type_name = f"_ArgsAnon_{id(func)}"
+
+    func_str = textwrap.dedent(
+        f"""
+    def {ret_func_name}({', '.join(sig_list)}):
+        return {ret_type_name}({', '.join(args_list)})
+    """
+    )
+
+    glob_vars = globals().copy()
+    glob_vars[ret_type_name] = namedtuple(ret_type_name, args_list)
+    loc_vars = dict()
+    exec(func_str, glob_vars, loc_vars)
+    ext_func = loc_vars[ret_func_name]
+    ext_func.__defaults__ = spec.defaults
+    ext_func.__kwdefaults__ = spec.kwonlydefaults
+
+    return ext_func
+
+
+@dataclass
+class _DelayedArgument:
+    args: Tuple
+    kwargs: Dict
+
+
+class _ExtensibleCallable:
+    func: Callable
+    batch_func: Optional[Callable]
+    is_async: bool
+    has_single_func: bool
+
+    def __call__(self, *args, **kwargs):
+        if self.is_async:
+            return self._async_call(*args, **kwargs)
+        else:
+            return self._sync_call(*args, **kwargs)
+
+    async def _async_call(self, *args, **kwargs):
+        try:
+            if self.has_single_func:
+                return await self.func(*args, **kwargs)
+        except NotImplementedError:
+            self.has_single_func = False
+
+        if self.batch_func is not None:
+            ret = await self.batch_func([args], [kwargs])
+            return None if ret is None else ret[0]
+        raise NotImplementedError
+
+    def _sync_call(self, *args, **kwargs):
+        try:
+            if self.has_single_func:
+                return self.func(*args, **kwargs)
+        except NotImplementedError:
+            self.has_single_func = False
+
+        if self.batch_func is not None:
+            return self.batch_func([args], [kwargs])[0]
+        raise NotImplementedError
+
+
+class _ExtensibleWrapper(_ExtensibleCallable):
+    def __init__(
+        self,
+        func: Callable,
+        batch_func: Optional[Callable] = None,
+        bind_func: Optional[Callable] = None,
+        is_async: bool = False,
+    ):
+        self.func = func
+        self.batch_func = batch_func
+        self.bind_func = bind_func
+        self.is_async = is_async
+        self.has_single_func = True
+
+    @staticmethod
+    def delay(*args, **kwargs):
+        return _DelayedArgument(args=args, kwargs=kwargs)
+
+    @staticmethod
+    def _gen_args_kwargs_list(delays):
+        args_list = [delay.args for delay in delays]
+        kwargs_list = [delay.kwargs for delay in delays]
+        return args_list, kwargs_list
+
+    async def _async_batch(self, args_list, kwargs_list):
+        # when there is only one call in the batch, calling the
+        # one-pass method is more efficient
+        if len(args_list) == 0:
+            return []
+        elif len(args_list) == 1:
+            return [await self._async_call(*args_list[0], **kwargs_list[0])]
+        elif self.batch_func:
+            return await self.batch_func(args_list, kwargs_list)
+        else:
+            # this function has no batch implementation
+            # call it separately
+            tasks = [
+                asyncio.create_task(self.func(*args, **kwargs))
+                for args, kwargs in zip(args_list, kwargs_list)
+            ]
+            try:
+                return await asyncio.gather(*tasks)
+            except asyncio.CancelledError:
+                _ = [task.cancel() for task in tasks]
+                return await asyncio.gather(*tasks)
+
+    def _sync_batch(self, args_list, kwargs_list):
+        if len(args_list) == 0:
+            return []
+        elif self.batch_func:
+            return self.batch_func(args_list, kwargs_list)
+        else:
+            # this function has no batch implementation
+            # call it separately
+            return [
+                self.func(*args, **kwargs)
+                for args, kwargs in zip(args_list, kwargs_list)
+            ]
+
+    def batch(self, *delays):
+        args_list, kwargs_list = self._gen_args_kwargs_list(delays)
+        return self.call_with_lists(args_list, kwargs_list)
+
+    def call_with_lists(self, args_list, kwargs_list):
+        if self.is_async:
+            return self._async_batch(args_list, kwargs_list)
+        else:
+            return self._sync_batch(args_list, kwargs_list)
+
+    def bind(self, *args, **kwargs):
+        if self.bind_func is None:
+            raise TypeError(f"bind function not exist for method {self.func.__name__}")
+        return self.bind_func(*args, **kwargs)
+
+
+class _ExtensibleAccessor(_ExtensibleCallable):
+    func: Callable
+    batch_func: Optional[Callable]
+
+    def __init__(self, func: Callable):
+        self.func = func
+        self.batch_func = None
+        self.bind_func = build_args_binder(func, remove_self=True)
+        self.is_async = asyncio.iscoroutinefunction(self.func)
+        self.has_single_func = True
+
+    def batch(self, func: Callable):
+        self.batch_func = func
+        return self
+
+    def __get__(self, instance, owner):
+        if instance is None:
+            # calling from class
+            return self.func
+
+        func = self.func.__get__(instance, owner)
+        batch_func = (
+            self.batch_func.__get__(instance, owner)
+            if self.batch_func is not None
+            else None
+        )
+        bind_func = (
+            self.bind_func.__get__(instance, owner)
+            if self.bind_func is not None
+            else None
+        )
+
+        return _ExtensibleWrapper(
+            func, batch_func=batch_func, bind_func=bind_func, is_async=self.is_async
+        )
+
+
+def extensible(func: Callable):
+    """
+    `extensible` means that the functionality of this function can be extended,
+    especially with batch operations.
+
+    Consider remote function calls: each call may involve operations such as
+    opening and closing a file. Batching the calls helps reduce that cost,
+    especially for remote invocations.
+
+    Parameters
+    ----------
+    func : callable
+        Function
+
+    Returns
+    -------
+    func
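+
+    Examples
+    --------
+    A hedged sketch of the intended usage; the actor class and method names
+    are illustrative only:
+
+    .. code-block:: python
+
+        class ReaderActor(mo.Actor):
+            @extensible
+            async def read(self, path):
+                ...  # one-pass implementation
+
+            @read.batch
+            async def batch_read(self, args_list, kwargs_list):
+                ...  # open once, answer all delayed calls in one pass
+
+    Callers may then combine delayed invocations into a single batched call,
+    e.g. ``await ref.read.batch(ref.read.delay("a"), ref.read.delay("b"))``.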
+    """
+    return _ExtensibleAccessor(func)
diff --git a/python/xorbits/_mars/oscar/context.pxd b/python/xorbits/_mars/oscar/context.pxd
new file mode 100644
index 000000000..d348efcc2
--- /dev/null
+++ b/python/xorbits/_mars/oscar/context.pxd
@@ -0,0 +1,20 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+cdef class BaseActorContext:
+    cdef public str _address
+
+
+cpdef get_context()
diff --git a/python/xorbits/_mars/oscar/context.pyx b/python/xorbits/_mars/oscar/context.pyx
new file mode 100644
index 000000000..9fff1228c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/context.pyx
@@ -0,0 +1,275 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from urllib.parse import urlparse
+
+from .core cimport ActorRef
+from .utils cimport new_actor_id
+
+from .utils import create_actor_ref
+
+
+cdef dict _backend_context_cls = dict()
+
+cdef object _context = None
+
+
+cdef class BaseActorContext:
+    """
+    Base class for actor contexts. Every backend needs to implement
+    its own actor context.
+    """
+
+    # allocate strategy is for Mars backend only
+    support_allocate_strategy = False
+
+    def __init__(self, address: str = None):
+        self._address = address
+
+    async def create_actor(
+        self,
+        object actor_cls,
+        *args,
+        object uid=None,
+        object address=None,
+        **kwargs,
+    ):
+        """
+        Stub method for creating an actor in the current context.
+
+        Parameters
+        ----------
+        actor_cls : Actor
+            Actor class
+        args : tuple
+            args to be passed into actor_cls.__init__
+        uid : identifier
+            Actor identifier
+        address : str
+            Address to locate the actor
+        kwargs : dict
+            kwargs to be passed into actor_cls.__init__
+
+        Returns
+        -------
+        ActorRef
+
+        """
+        raise NotImplementedError
+
+    async def has_actor(self, ActorRef actor_ref):
+        """
+        Check if actor exists in current context
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+
+        Returns
+        -------
+        bool
+        """
+        raise NotImplementedError
+
+    async def destroy_actor(self, ActorRef actor_ref):
+        """
+        Destroy an actor by its reference
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+
+        Returns
+        -------
+        bool
+        """
+        raise NotImplementedError
+
+    async def kill_actor(self, ActorRef actor_ref):
+        """
+        Forcibly kill an actor. Take care: this is a dangerous operation,
+        as it may cause other actors to be killed as well. Hence, unless
+        you know what you are doing and know how to recover the possibly
+        affected actors, DO NOT USE this method!
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+
+        Returns
+        -------
+        bool
+        """
+
+    async def send(
+        self,
+        ActorRef actor_ref,
+        object message,
+        bint wait_response=True,
+        object profiling_context=None,
+    ):
+        """
+        Send a message to given actor by its reference
+
+        Parameters
+        ----------
+        actor_ref : ActorRef
+            Reference to an actor
+        message : object
+            Message to send to an actor, need to comply to Actor.__on_receive__
+        wait_response : bool
+            Whether to wait for responses from the actor.
+        profiling_context: ProfilingContext
+            The profiling context.
+
+        Returns
+        -------
+        object
+        """
+        raise NotImplementedError
+
+    async def actor_ref(self, *args, **kwargs):
+        """
+        Create a reference to an actor
+
+        Returns
+        -------
+        ActorRef
+        """
+        raise NotImplementedError
+
+    async def wait_actor_pool_recovered(self, str address, str main_address = None):
+        """
+        Wait until an actor pool is recovered
+
+        Parameters
+        ----------
+        address
+            address of the actor pool
+        main_address
+            address of the main pool
+        """
+        raise NotImplementedError
+
+    async def get_pool_config(self, str address):
+        """
+        Get config of actor pool with given address
+
+        Parameters
+        ----------
+        address
+            address of the actor pool
+
+        Returns
+        -------
+        The config of the actor pool at the given address.
+        """
+        raise NotImplementedError
+
+
+cdef class ClientActorContext(BaseActorContext):
+    """
+    Default actor context. This context will keep references to other contexts
+    given their protocol scheme (e.g., `ray://xxx`).
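+
+    A minimal sketch of driver-side usage; ``MyActor`` and the address are
+    illustrative:
+
+    .. code-block:: python
+
+        ctx = get_context()
+        ref = await ctx.create_actor(MyActor, address="ray://pg/0/0", uid="my-actor")
+        assert await ctx.has_actor(ref)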
+    """
+    cdef dict _backend_contexts
+
+    def __init__(self, address: str = None):
+        BaseActorContext.__init__(self, address)
+        self._backend_contexts = dict()
+
+    cdef inline object _get_backend_context(self, object address):
+        if address is None:
+            raise ValueError('address has to be provided')
+        if '://' not in address:
+            scheme = None
+        else:
+            scheme = urlparse(address).scheme or None
+        try:
+            return self._backend_contexts[scheme]
+        except KeyError:
+            context = self._backend_contexts[scheme] = \
+                _backend_context_cls[scheme](address)
+            return context
+
+    def create_actor(
+        self,
+        object actor_cls,
+        *args,
+        object uid=None,
+        object address=None,
+        **kwargs,
+    ):
+        context = self._get_backend_context(address)
+        uid = uid or new_actor_id()
+        return context.create_actor(actor_cls, *args, uid=uid, address=address, **kwargs)
+
+    def has_actor(self, ActorRef actor_ref):
+        context = self._get_backend_context(actor_ref.address)
+        return context.has_actor(actor_ref)
+
+    def destroy_actor(self, ActorRef actor_ref):
+        context = self._get_backend_context(actor_ref.address)
+        return context.destroy_actor(actor_ref)
+
+    def kill_actor(self, ActorRef actor_ref):
+        context = self._get_backend_context(actor_ref.address)
+        return context.kill_actor(actor_ref)
+
+    def actor_ref(self, *args, **kwargs):
+        actor_ref = create_actor_ref(*args, **kwargs)
+        context = self._get_backend_context(actor_ref.address)
+        return context.actor_ref(actor_ref)
+
+    def send(
+        self,
+        ActorRef actor_ref,
+        object message,
+        bint wait_response=True,
+        object profiling_context=None
+    ):
+        context = self._get_backend_context(actor_ref.address)
+        return context.send(
+            actor_ref,
+            message,
+            wait_response=wait_response,
+            profiling_context=profiling_context,
+        )
+
+    def wait_actor_pool_recovered(self, str address, str main_address = None):
+        context = self._get_backend_context(address)
+        return context.wait_actor_pool_recovered(address, main_address)
+
+    def get_pool_config(self, str address):
+        context = self._get_backend_context(address)
+        return context.get_pool_config(address)
+
+
+def register_backend_context(scheme, cls):
+    assert issubclass(cls, BaseActorContext)
+    _backend_context_cls[scheme] = cls
+
+
+cpdef get_context():
+    """
+    Get an actor context. If not in an actor environment,
+    ClientActorContext will be used
+    """
+    global _context
+    if _context is None:
+        _context = ClientActorContext()
+    return _context
diff --git a/python/xorbits/_mars/oscar/core.pxd b/python/xorbits/_mars/oscar/core.pxd
new file mode 100644
index 000000000..02f6dab9c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/core.pxd
@@ -0,0 +1,39 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+cdef class ActorRef:
+    cdef object __weakref__
+    cdef public str address
+    cdef public object uid
+    cdef dict _methods
+
+
+cdef class LocalActorRef(ActorRef):
+    cdef object _actor_weakref
+    cdef _weakref_local_actor(self)
+
+
+cdef class _BaseActor:
+    cdef object __weakref__
+    cdef str _address
+    cdef object _lock
+    cdef object _uid
+
+    cpdef ActorRef ref(self)
+
+
+cdef class ActorEnvironment:
+    cdef public dict actor_locks
+    cdef public object address
diff --git a/python/xorbits/_mars/oscar/core.pyx b/python/xorbits/_mars/oscar/core.pyx
new file mode 100644
index 000000000..d07bf181c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/core.pyx
@@ -0,0 +1,549 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import inspect
+import logging
+import sys
+import weakref
+from typing import AsyncGenerator
+
+cimport cython
+
+from .context cimport get_context
+
+from .errors import ActorNotExist, Return
+
+from .utils cimport is_async_generator
+
+CALL_METHOD_DEFAULT = 0
+CALL_METHOD_BATCH = 1
+
+logger = logging.getLogger(__name__)
+
+cdef:
+    bint _log_unhandled_errors = False
+    bint _log_cycle_send = False
+    dict _local_pool_map = dict()
+    object _actor_method_wrapper
+
+
+def set_debug_options(options):
+    global _log_unhandled_errors, _log_cycle_send
+    if options is None:
+        _log_unhandled_errors = _log_cycle_send = False
+    else:
+        _log_unhandled_errors = options.log_unhandled_errors
+        _log_cycle_send = options.log_cycle_send
+
+
+cdef _get_local_actor(address, uid):
+    # Do not expose this method to Python to avoid actor being
+    # referenced everywhere.
+    #
+    # The cycle send detection relies on message sending, so we
+    # disable the local actor proxy when the debug option is on.
+    if _log_cycle_send:
+        return None
+    pool_ref = _local_pool_map.get(address)
+    pool = None if pool_ref is None else pool_ref()
+    if pool is not None:
+        actor = pool._actors.get(uid)
+        if actor is not None:
+            return actor
+    return None
+
+
+def register_local_pool(address, pool):
+    """
+    Register local actor pool for local actor lookup.
+    """
+    _local_pool_map[address] = weakref.ref(
+        pool, lambda _: _local_pool_map.pop(address, None)
+    )
+
+
+cpdef create_local_actor_ref(address, uid):
+    """
+    Create a reference to a local actor.
+
+    Returns
+    -------
+    LocalActorRef or None
+    """
+    actor = _get_local_actor(address, uid)
+    if actor is not None:
+        return LocalActorRef(actor)
+    return None
+
+
+cpdef create_actor_ref(address, uid):
+    """
+    Create an actor reference.
+    TODO(fyrestone): Remove the create_actor_ref in utils.pyx
+
+    Returns
+    -------
+    ActorRef or LocalActorRef
+    """
+    actor = _get_local_actor(address, uid)
+    return ActorRef(address, uid) if actor is None else LocalActorRef(actor)
+
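+# A minimal illustrative sketch (not part of the upstream module) of how the
+# helpers above fit together; the address, uid and `pool` below are hypothetical.
+#
+#     # An actor pool registers itself for its own address ...
+#     register_local_pool("127.0.0.1:12345", pool)
+#     # ... so that refs to actors it hosts become LocalActorRef instances that
+#     # skip message serialization; for any other address a plain ActorRef is
+#     # returned and calls go through the context's send().
+#     ref = create_actor_ref("127.0.0.1:12345", b"my_actor")
+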
+
+cdef class ActorRef:
+    """
+    Reference to an Actor on the user side
+    """
+    def __init__(self, str address, object uid):
+        if isinstance(uid, str):
+            uid = uid.encode()
+        self.uid = uid
+        self.address = address
+        self._methods = dict()
+
+    def destroy(self, object callback=None):
+        ctx = get_context()
+        return ctx.destroy_actor(self)
+
+    def __reduce__(self):
+        return create_actor_ref, (self.address, self.uid)
+
+    def __getattr__(self, item):
+        if item.startswith('_'):
+            return object.__getattribute__(self, item)
+
+        try:
+            return self._methods[item]
+        except KeyError:
+            method = self._methods[item] = ActorRefMethod(self, item)
+            return method
+
+    def __hash__(self):
+        return hash((self.address, self.uid))
+
+    def __eq__(self, other):
+        other_type = type(other)
+        if other_type is ActorRef or other_type is LocalActorRef:
+            return self.address == other.address and self.uid == other.uid
+        return False
+
+    def __repr__(self):
+        return 'ActorRef(uid={!r}, address={!r})'.format(self.uid, self.address)
+
+
+cdef class _DelayedArgument:
+    cdef readonly tuple arguments
+
+    def __init__(self, tuple arguments):
+        self.arguments = arguments
+
+
+cdef class ActorRefMethod:
+    """
+    Wrapper for an Actor method on the client side
+    """
+    cdef ActorRef ref
+    cdef object method_name
+    cdef object _options
+
+    def __init__(self, ref, method_name, options=None):
+        self.ref = ref
+        self.method_name = method_name
+        self._options = options or {}
+
+    def __call__(self, *args, **kwargs):
+        return self.send(*args, **kwargs)
+
+    def options(self, **options):
+        return ActorRefMethod(self.ref, self.method_name, options)
+
+    def send(self, *args, **kwargs):
+        arg_tuple = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+        return get_context().send(self.ref, arg_tuple, **self._options)
+
+    def tell(self, *args, **kwargs):
+        arg_tuple = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+        return get_context().send(self.ref, arg_tuple, wait_response=False, **self._options)
+
+    def delay(self, *args, **kwargs):
+        arg_tuple = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+        return _DelayedArgument(arg_tuple)
+
+    def batch(self, *delays, send=True):
+        cdef:
+            long n_delays = len(delays)
+            bint has_kw = False
+            list args_list
+            list kwargs_list
+            _DelayedArgument delay
+
+        args_list = [None] * n_delays
+        kwargs_list = [None] * n_delays
+
+        last_method = None
+        for idx in range(n_delays):
+            delay = delays[idx]
+            method, _call_method, args, kwargs = delay.arguments
+            if last_method is not None and method != last_method:
+                raise ValueError('Does not support calling multiple methods in batch')
+            last_method = method
+
+            args_list[idx] = args
+            kwargs_list[idx] = kwargs
+            if kwargs:
+                has_kw = True
+
+        if not has_kw:
+            kwargs_list = None
+        if last_method is None:
+            last_method = self.method_name
+
+        message = (last_method, CALL_METHOD_BATCH, (args_list, kwargs_list), None)
+        return get_context().send(self.ref, message, wait_response=send, **self._options)
+
+    def tell_delay(self, *args, delay=None, ignore_conn_fail=True, **kwargs):
+        async def delay_fun():
+            try:
+                await asyncio.sleep(delay)
+                message = (self.method_name, CALL_METHOD_DEFAULT, args, kwargs)
+                await get_context().send(self.ref, message, wait_response=False, **self._options)
+            except Exception as ex:
+                if ignore_conn_fail and isinstance(ex, ConnectionRefusedError):
+                    return
+
+                logger.error(f'Error {type(ex)} occurred when calling {self.method_name} '
+                             f'on {self.ref.uid} at {self.ref.address} with tell_delay')
+                raise
+
+        return asyncio.create_task(delay_fun())
+
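+# A minimal illustrative sketch (assumed actor method name `add`, not part of
+# the upstream module) of the call styles ActorRefMethod exposes on a ref:
+#
+#     result = await ref.add(1)            # send(): wait for the result
+#     await ref.add.tell(2)                # tell(): fire-and-forget
+#     ref.add.tell_delay(3, delay=1.5)     # tell after 1.5 seconds
+#     # delay()/batch() pack several calls of the *same* method into one message
+#     results = await ref.add.batch(
+#         ref.add.delay(4),
+#         ref.add.delay(5),
+#     )
+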
+
+cdef class LocalActorRef(ActorRef):
+    def __init__(self, _BaseActor actor):
+        # Make sure the input actor is an instance of _BaseActor.
+        super().__init__(actor._address, actor._uid)
+        self._actor_weakref = weakref.ref(actor, lambda _: self._methods.clear())
+
+    cdef _weakref_local_actor(self):
+        actor = _get_local_actor(self.address, self.uid)
+        # Make sure the input actor is an instance of _BaseActor.
+        if actor is not None and isinstance(actor, _BaseActor):
+            self._actor_weakref = weakref.ref(actor, lambda _: self._methods.clear())
+            return actor
+        return None
+
+    def __getattr__(self, item):
+        try:
+            return self._methods[item]
+        except KeyError:
+            actor = self._actor_weakref() or self._weakref_local_actor()
+            if actor is None:
+                raise ActorNotExist(f"Actor {self.uid} does not exist") from None
+            # Raise AttributeError early if the actor has no such attribute.
+            getattr(actor, item)
+            method = self._methods[item] = LocalActorRefMethod(self, item)
+            return method
+
+    def __repr__(self):
+        return 'LocalActorRef(uid={!r}, address={!r}, actor_weakref={!r})'.format(
+            self.uid, self.address, self._actor_weakref)
+
+
+async def __pyx_actor_method_wrapper(method, result_handler, lock, args, kwargs):
+    async with lock:
+        result = method(*args, **kwargs)
+        if asyncio.iscoroutine(result):
+            result = await result
+    return await result_handler(result)
+
+# Avoid global lookup.
+_actor_method_wrapper = __pyx_actor_method_wrapper
+
+
+cdef class LocalActorRefMethod:
+    cdef LocalActorRef _local_actor_ref
+    cdef object _method_name
+
+    def __init__(self, LocalActorRef local_actor_ref, method_name):
+        self._local_actor_ref = local_actor_ref
+        self._method_name = method_name
+
+    cdef tuple _get_referent(self):
+        actor = self._local_actor_ref._actor_weakref() or self._local_actor_ref._weakref_local_actor()
+        if actor is None:
+            raise ActorNotExist(f"Actor {self._local_actor_ref.uid} does not exist.")
+        method = getattr(actor, self._method_name)
+        return actor, method
+
+    def __call__(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        return _actor_method_wrapper(
+            method, actor._handle_actor_result, (<_BaseActor>actor)._lock, args, kwargs)
+
+    def options(self, **options):
+        return self
+
+    def send(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        return _actor_method_wrapper(
+            method, actor._handle_actor_result, (<_BaseActor>actor)._lock, args, kwargs)
+
+    def tell(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        coro = _actor_method_wrapper(
+            method, actor._handle_actor_result, (<_BaseActor>actor)._lock, args, kwargs)
+        asyncio.create_task(coro)
+        return asyncio.sleep(0)
+
+    def delay(self, *args, **kwargs):
+        actor, method = self._get_referent()
+        return method.delay(*args, **kwargs)
+
+    def batch(self, *delays, send=True):
+        actor, method = self._get_referent()
+        coro = _actor_method_wrapper(
+            method.batch, actor._handle_actor_result, (<_BaseActor>actor)._lock, delays, dict())
+        if send:
+            return coro
+        else:
+            asyncio.create_task(coro)
+            return asyncio.sleep(0)
+
+    def tell_delay(self, *args, delay=None, ignore_conn_fail=True, **kwargs):
+        async def delay_fun():
+            await asyncio.sleep(delay)
+            await self.tell(*args, **kwargs)
+
+        return asyncio.create_task(delay_fun())
+
+
+cdef class _BaseActor:
+    """
+    Base class for Mars actors; user-defined methods become remotely callable actor methods
+    """
+    def __cinit__(self, *args, **kwargs):
+        self._lock = self._create_lock()
+
+    def _create_lock(self):
+        raise NotImplementedError
+
+    @property
+    def uid(self):
+        return self._uid
+
+    @uid.setter
+    def uid(self, uid):
+        self._uid = uid
+
+    def _set_uid(self, uid):
+        self._uid = uid
+
+    @property
+    def address(self):
+        return self._address
+
+    @address.setter
+    def address(self, addr):
+        self._address = addr
+
+    def _set_address(self, addr):
+        self._address = addr
+
+    cpdef ActorRef ref(self):
+        return create_actor_ref(self._address, self._uid)
+
+    async def _handle_actor_result(self, result):
+        cdef int idx
+        cdef tuple res_tuple
+        cdef list tasks, coros, coro_poses, values
+        cdef object coro
+        cdef bint extract_tuple = False
+        cdef bint cancelled = False
+        cdef set dones, pending
+
+        if inspect.isawaitable(result):
+            result = await result
+        elif is_async_generator(result):
+            result = (result,)
+            extract_tuple = True
+
+        if type(result) is tuple:
+            res_tuple = result
+            coros = []
+            coro_poses = []
+            values = []
+            for idx, res_item in enumerate(res_tuple):
+                if is_async_generator(res_item):
+                    value = self._run_actor_async_generator(res_item)
+                    coros.append(value)
+                    coro_poses.append(idx)
+                elif inspect.isawaitable(res_item):
+                    value = res_item
+                    coros.append(value)
+                    coro_poses.append(idx)
+                else:
+                    value = res_item
+                values.append(value)
+
+            # when there is only one coroutine, skip asyncio.wait,
+            # which would introduce considerable overhead
+            if len(coros) == 1:
+                task_result = await coros[0]
+                if extract_tuple:
+                    result = task_result
+                else:
+                    result = tuple(task_result if t is coros[0] else t for t in values)
+            elif len(coros) > 0:
+                tasks = [asyncio.create_task(t) for t in coros]
+                try:
+                    dones, pending = await asyncio.wait(tasks)
+                except asyncio.CancelledError:
+                    cancelled = True
+                    for task in tasks:
+                        task.cancel()
+                    # wait till all tasks finish after cancellation
+                    dones, pending = await asyncio.wait(tasks)
+
+                if extract_tuple:
+                    result = list(dones)[0].result()
+                else:
+                    for pos in coro_poses:
+                        task = tasks[pos]
+                        values[pos] = task.result()
+                    result = tuple(values)
+
+                if cancelled:
+                    # re-raise in case no CancelledError was raised
+                    raise asyncio.CancelledError
+
+        return result
+
+    async def _run_actor_async_generator(self, gen: AsyncGenerator):
+        """
+        Run an async generator under Actor lock
+        """
+        cdef tuple res_tuple
+        cdef bint is_exception = False
+        cdef object res
+        cdef object message_trace = None, pop_message_trace = None, set_message_trace = None
+
+        from .debug import debug_async_timeout, pop_message_trace, set_message_trace
+        try:
+            res = None
+            while True:
+                async with self._lock:
+                    with debug_async_timeout('actor_lock_timeout',
+                                             'async_generator %r hold lock timeout', gen):
+                        if not is_exception:
+                            res = await gen.asend(res)
+                        else:
+                            res = await gen.athrow(*res)
+                try:
+                    if _log_cycle_send:
+                        message_trace = pop_message_trace()
+
+                    res = await self._handle_actor_result(res)
+                    is_exception = False
+                except:
+                    res = sys.exc_info()
+                    is_exception = True
+                finally:
+                    if _log_cycle_send:
+                        set_message_trace(message_trace)
+        except Return as ex:
+            return ex.value
+        except StopAsyncIteration as ex:
+            return
+
+    async def __post_create__(self):
+        """
+        Method called after actor creation
+        """
+        pass
+
+    async def __pre_destroy__(self):
+        """
+        Method called before the actor is destroyed
+        """
+        pass
+
+    async def __on_receive__(self, tuple message):
+        """
+        Handle messages from other actors and dispatch them to user methods
+
+        Parameters
+        ----------
+        message : tuple
+            Message shall be a tuple of (method_name, call_method, args, kwargs)
+        """
+        from .debug import debug_async_timeout
+        try:
+            method, call_method, args, kwargs = message
+            if call_method == CALL_METHOD_DEFAULT:
+                func = getattr(self, method)
+                async with self._lock:
+                    with debug_async_timeout('actor_lock_timeout',
+                                             "Method %s of actor %s hold lock timeout.",
+                                             method, self.uid):
+                        result = func(*args, **kwargs)
+                        if asyncio.iscoroutine(result):
+                            result = await result
+            elif call_method == CALL_METHOD_BATCH:
+                func = getattr(self, method)
+                async with self._lock:
+                    with debug_async_timeout('actor_lock_timeout',
+                                             "Batch method %s of actor %s hold lock timeout, batch size %s.",
+                                             method, self.uid, len(args)):
+                        args_list, kwargs_list = args
+                        if kwargs_list is None:
+                            kwargs_list = [{}] * len(args_list)
+                        result = func.call_with_lists(args_list, kwargs_list)
+                        if asyncio.iscoroutine(result):
+                            result = await result
+            else:  # pragma: no cover
+                raise ValueError(f'call_method {call_method} not valid')
+
+            return await self._handle_actor_result(result)
+        except Exception as ex:
+            if _log_unhandled_errors:
+                from .debug import logger as debug_logger
+
+                # use `%.500` to avoid printing overly long messages
+                debug_logger.exception('Got unhandled error when handling message %.500r '
+                                       'in actor %s at %s', message, self.uid, self.address)
+            raise ex
+
+
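+# A minimal illustrative sketch (hypothetical actor; `mo` stands for this oscar
+# package) of the result shapes _handle_actor_result() accepts from user methods:
+#
+#     class MyActor(mo.Actor):
+#         async def plain(self):
+#             return 1                              # plain values pass through
+#
+#         async def mixed(self, other_ref):
+#             return other_ref.compute(), 42        # awaitables inside a tuple are
+#                                                   # awaited and substituted in place
+#
+#         async def chained(self, other_ref):
+#             result = yield other_ref.compute()    # async generator: the yielded
+#             raise mo.Return(result + 1)           # awaitable is resolved outside the
+#                                                   # actor lock and its value sent back;
+#                                                   # Return carries the final result
+
+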
+# @cython.binding(True) is needed so that Ray can retrieve class members.
+# The value defaults to True in Cython >= 3.0.0.
+@cython.binding(True)
+cdef class _Actor(_BaseActor):
+    def _create_lock(self):
+        return asyncio.locks.Lock()
+
+
+cdef class _FakeLock:
+    async def __aenter__(self):
+        pass
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+# @cython.binding(True) is needed so that Ray can retrieve class members.
+# The value defaults to True in Cython >= 3.0.0.
+@cython.binding(True)
+cdef class _StatelessActor(_BaseActor):
+    def _create_lock(self):
+        return _FakeLock()
diff --git a/python/xorbits/_mars/oscar/debug.py b/python/xorbits/_mars/oscar/debug.py
new file mode 100644
index 000000000..fc9e893e3
--- /dev/null
+++ b/python/xorbits/_mars/oscar/debug.py
@@ -0,0 +1,182 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio.tasks
+import contextvars
+import json
+import logging
+import os
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import List, Optional  # noqa: F401
+
+from ..utils import dataslots
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+@dataslots
+@dataclass
+class MessageTraceItem:
+    uid: str
+    address: str
+    method: str
+
+
+@dataslots
+@dataclass
+class DebugOptions:
+    actor_call_timeout: int = 10
+    process_message_timeout: int = 30
+    actor_lock_timeout: int = 30
+    ray_object_retrieval_timeout: int = 10
+    log_unhandled_errors: bool = True
+    log_cycle_send: bool = True
+
+
+_debug_opts: Optional[DebugOptions] = None
+
+
+def get_debug_options() -> Optional[DebugOptions]:
+    return _debug_opts
+
+
+def set_debug_options(options: Optional[DebugOptions]):
+    global _debug_opts
+    _debug_opts = options
+
+    # deliver debug config to native codes for optimization
+    from .core import set_debug_options as core_set_debug_options
+
+    core_set_debug_options(options)
+
+
+def reload_debug_opts_from_env():
+    config_str = os.environ.get("DEBUG_OSCAR", "0")
+    if config_str == "0":
+        set_debug_options(None)
+        return
+    config_json = {} if config_str == "1" else json.loads(config_str)
+    set_debug_options(DebugOptions(**config_json))
+
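+# An illustrative sketch of how the DEBUG_OSCAR environment variable read by
+# reload_debug_opts_from_env() above is interpreted (values are examples only):
+#
+#     DEBUG_OSCAR=0    -> debugging disabled (options set to None)
+#     DEBUG_OSCAR=1    -> DebugOptions() with the defaults declared above
+#     DEBUG_OSCAR='{"actor_lock_timeout": 5, "log_cycle_send": false}'
+#                      -> DebugOptions(actor_lock_timeout=5, log_cycle_send=False)
+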
+
+async def _log_timeout(timeout, msg, *args, **kwargs):
+    start_time, rnd = time.time(), 1
+    while True:
+        await asyncio.sleep(timeout * rnd)
+        rnd += 1
+        logger.warning(
+            msg + " (timeout for %.4f seconds).",
+            *args,
+            time.time() - start_time,
+            **kwargs,
+        )
+
+
+@contextmanager
+def debug_async_timeout(option_name: str, msg, *args, **kwargs):
+    if _debug_opts is None:
+        yield
+    else:
+        timeout_val = getattr(_debug_opts, option_name, -1)
+        timeout_task = None
+        if timeout_val and timeout_val > 0:
+            timeout_task = asyncio.create_task(
+                _log_timeout(timeout_val, msg, *args, **kwargs)
+            )
+
+        try:
+            yield
+        finally:
+            if timeout_task is not None:
+                timeout_task.cancel()
+
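+# An illustrative sketch of using debug_async_timeout() above: while the guarded
+# block is still running, a background task logs the given warning with an
+# increasing back-off until the block exits (the coroutine below is hypothetical).
+#
+#     with debug_async_timeout("actor_lock_timeout",
+#                              "actor %s held the lock for too long", uid):
+#         await do_something_slow()
+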
+
+_message_trace_var = contextvars.ContextVar("_message_trace_var")
+
+
+@contextmanager
+def record_message_trace(message):
+    if _debug_opts is None or not _debug_opts.log_cycle_send:
+        yield
+    else:
+        msg_trace = list(message.message_trace or [])
+        msg_trace.append(
+            MessageTraceItem(
+                uid=message.actor_ref.uid,
+                address=message.actor_ref.address,
+                method=message.content[0],
+            )
+        )
+        _message_trace_var.set(msg_trace)
+        try:
+            yield
+        finally:
+            _message_trace_var.set(None)
+
+
+def detect_cycle_send(message, wait_response: bool = True):
+    if _debug_opts is None or not _debug_opts.log_cycle_send or not wait_response:
+        return
+
+    cur_trace = _message_trace_var.get(None) or []  # type: List[MessageTraceItem]
+    message.message_trace = cur_trace
+
+    ref_key = (message.actor_ref.uid, message.actor_ref.address)
+    traced_ref_keys = set((item.uid, item.address) for item in cur_trace)
+    if ref_key in traced_ref_keys:
+        looped_trace = cur_trace + [
+            MessageTraceItem(
+                uid=message.actor_ref.uid,
+                address=message.actor_ref.address,
+                method=message.content[0],
+            )
+        ]
+
+        formatted_trace = "\n    ".join(
+            f"Calling {t.method!r} in actor {t.uid} at {t.address}"
+            for t in looped_trace
+        )
+        logger.warning(
+            "Call cycle detected when sending to actor %s at %s, the trace is\n"
+            "    %s",
+            message.actor_ref.uid,
+            message.actor_ref.address,
+            formatted_trace,
+        )
+
+
+@contextmanager
+def no_message_trace():
+    if _debug_opts is None or not _debug_opts.log_cycle_send:
+        yield
+    else:
+        trace = pop_message_trace()
+        yield
+        set_message_trace(trace)
+
+
+def pop_message_trace():
+    trace = _message_trace_var.get(None)
+    _message_trace_var.set(None)
+    return trace
+
+
+def set_message_trace(message_trace):
+    _message_trace_var.set(message_trace)
+
+
+reload_debug_opts_from_env()
diff --git a/python/xorbits/_mars/oscar/driver.py b/python/xorbits/_mars/oscar/driver.py
new file mode 100644
index 000000000..baadfedee
--- /dev/null
+++ b/python/xorbits/_mars/oscar/driver.py
@@ -0,0 +1,41 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from numbers import Number
+from typing import Dict, Type
+
+
+class BaseActorDriver(ABC):
+    @classmethod
+    @abstractmethod
+    def setup_cluster(cls, address_to_resources: Dict[str, Dict[str, Number]]):
+        """
+        Set up a cluster according to the given resources.
+        Each node's resources are given as a dict, e.g. {'CPU': 3, 'GPU': 1}.
+
+        Parameters
+        ----------
+        address_to_resources: dict
+            Resources required for each node.
+        """
+        pass
+
+
+_backend_driver_cls: Dict[str, Type[BaseActorDriver]] = dict()
+
+
+def register_backend_driver(scheme: str, cls: Type[BaseActorDriver]):
+    assert issubclass(cls, BaseActorDriver)
+    _backend_driver_cls[scheme] = cls
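+
+
+# A minimal illustrative sketch (hypothetical driver class and scheme) of how a
+# backend registers its driver so setup_cluster() can be dispatched by address scheme:
+#
+#     class MyActorDriver(BaseActorDriver):
+#         @classmethod
+#         def setup_cluster(cls, address_to_resources):
+#             ...  # allocate nodes according to the requested resources
+#
+#     register_backend_driver("my_scheme", MyActorDriver)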
diff --git a/python/xorbits/_mars/oscar/errors.py b/python/xorbits/_mars/oscar/errors.py
new file mode 100644
index 000000000..5af99b8e2
--- /dev/null
+++ b/python/xorbits/_mars/oscar/errors.py
@@ -0,0 +1,60 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core.base import MarsError
+
+
+class ReconstructWorkerError(MarsError):
+    pass
+
+
+class ActorPoolNotStarted(MarsError):
+    pass
+
+
+class ActorNotExist(MarsError):
+    pass
+
+
+class ActorAlreadyExist(MarsError):
+    pass
+
+
+class NoIdleSlot(MarsError):
+    pass
+
+
+class NoFreeSlot(MarsError):
+    pass
+
+
+class SlotStateError(MarsError):
+    pass
+
+
+class ServerClosed(MarsError):
+    pass
+
+
+class CannotCancelTask(MarsError):
+    pass
+
+
+class SendMessageFailed(MarsError):
+    pass
+
+
+class Return(MarsError):
+    def __init__(self, value):
+        self.value = value
diff --git a/python/xorbits/_mars/oscar/profiling.py b/python/xorbits/_mars/oscar/profiling.py
new file mode 100644
index 000000000..c9a0eb880
--- /dev/null
+++ b/python/xorbits/_mars/oscar/profiling.py
@@ -0,0 +1,293 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import copy
+import heapq
+import json
+import logging
+import operator
+import os
+from collections import Counter
+from collections.abc import Mapping
+
+from ..typing import BandType
+from .backends.message import SendMessage, TellMessage
+
+logger = logging.getLogger(__name__)
+
+MARS_ENABLE_PROFILING = int(os.environ.get("MARS_ENABLE_PROFILING", 0))
+
+
+class _ProfilingOptionDescriptor:
+    def __init__(self, _type, default):
+        self._name = None
+        self._type = _type
+        self._default = default
+
+    def __get__(self, obj, cls):
+        if obj is None:
+            return self
+        v = obj._options.get(self._name)
+        if v is None:
+            v = os.environ.get(f"MARS_PROFILING_{self._name.upper()}", self._default)
+        if v is not None:
+            v = self._type(v)
+        # Cache the value.
+        obj.__dict__[self._name] = v
+        return v
+
+    def set_name(self, name: str):
+        self._name = name
+
+
+class _ProfilingOptionsMeta(type):
+    def __init__(cls, name, bases, classdict):
+        super(_ProfilingOptionsMeta, cls).__init__(name, bases, classdict)
+        for k, v in classdict.items():
+            if isinstance(v, _ProfilingOptionDescriptor):
+                v.set_name(k)
+
+
+class _ProfilingOptions(metaclass=_ProfilingOptionsMeta):
+    debug_interval_seconds = _ProfilingOptionDescriptor(float, default=None)
+    slow_calls_duration_threshold = _ProfilingOptionDescriptor(int, default=1)
+    slow_subtasks_duration_threshold = _ProfilingOptionDescriptor(int, default=10)
+
+    def __init__(self, options):
+        if isinstance(options, Mapping):
+            invalid_keys = options.keys() - type(self).__dict__.keys()
+            if invalid_keys:
+                raise ValueError(f"Invalid profiling options: {invalid_keys}")
+            self._options = options
+        elif options in (True, False, None):
+            self._options = {}
+        else:
+            raise ValueError(f"Invalid profiling options: {options}")
+
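+# An illustrative sketch of how the descriptor above resolves an option value:
+# explicit options win over MARS_PROFILING_* environment variables, which win
+# over the declared default, and the resolved value is cached on the instance.
+#
+#     os.environ["MARS_PROFILING_DEBUG_INTERVAL_SECONDS"] = "2"
+#     _ProfilingOptions(True).debug_interval_seconds                             # -> 2.0
+#     _ProfilingOptions({"debug_interval_seconds": 1.0}).debug_interval_seconds  # -> 1.0
+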
+
+class DummyOperator:
+    @staticmethod
+    def set(key, value):
+        pass
+
+    @staticmethod
+    def inc(key, value):
+        pass
+
+    @staticmethod
+    def nest(key):
+        return DummyOperator
+
+    @staticmethod
+    def values():
+        return []
+
+    @staticmethod
+    def empty():
+        return True
+
+
+class ProfilingDataOperator:
+    __slots__ = ("_target",)
+
+    def __init__(self, target):
+        self._target = target
+
+    def set(self, key, value):
+        self._target[key] = value
+
+    def inc(self, key, value):
+        old = self._target.get(key, 0)
+        self._target[key] = old + value
+
+    def nest(self, key):
+        v = self._target.setdefault(key, {})
+        if not isinstance(v, dict):
+            raise TypeError(
+                f"The value type of key {key} is {type(v)}, but a dict is expected."
+            )
+        return ProfilingDataOperator(v)
+
+    def values(self):
+        return self._target.values()
+
+    def empty(self):
+        return len(self._target) == 0
+
+
+class _CallStats:
+    def __init__(self, options: _ProfilingOptions):
+        self._options = options
+        self._call_counter = Counter()
+        self._slow_calls = []
+
+    def collect(self, message, duration: float):
+        key = (message.actor_ref.uid, message.content[0])
+        self._call_counter[key] += 1
+        if duration < self._options.slow_calls_duration_threshold:
+            return
+        key = (
+            duration,
+            message.actor_ref.uid,
+            message.actor_ref.address,
+            message.content,
+        )
+        try:
+            if len(self._slow_calls) < 10:
+                heapq.heappush(self._slow_calls, key)
+            else:
+                heapq.heapreplace(self._slow_calls, key)
+        except TypeError:
+            pass
+
+    def to_dict(self) -> dict:
+        most_calls = {}
+        for name_tuple, count in self._call_counter.most_common(10):
+            uid, method_name = name_tuple
+            most_calls[f"{uid.decode('utf-8')}.{method_name}"] = count
+        slow_calls = {}
+        for duration, uid, address, content in sorted(
+            self._slow_calls, key=operator.itemgetter(0), reverse=True
+        ):
+            method_name, _batch, args, kwargs = content
+            slow_calls[
+                f"[{address}]{uid.decode('utf-8')}.{method_name}(args={args}, kwargs={kwargs})"
+            ] = duration
+        return {"most_calls": most_calls, "slow_calls": slow_calls}
+
+
+class _SubtaskStats:
+    def __init__(self, options: _ProfilingOptions):
+        self._options = options
+        self._band_counter = Counter()
+        self._slow_subtasks = []
+
+    def collect(self, subtask, band: BandType, duration: float):
+        band_address = band[0]
+        self._band_counter[band_address] += 1
+        if duration < self._options.slow_subtasks_duration_threshold:
+            return
+        key = (duration, band_address, subtask)
+        try:
+            if len(self._slow_subtasks) < 10:
+                heapq.heappush(self._slow_subtasks, key)
+            else:
+                heapq.heapreplace(self._slow_subtasks, key)
+        except TypeError:
+            pass
+
+    def to_dict(self) -> dict:
+        band_subtasks = {}
+        key = operator.itemgetter(1)
+        if len(self._band_counter) > 10:
+            items = self._band_counter.items()
+            band_subtasks.update(heapq.nlargest(5, items, key=key))
+            band_subtasks.update(reversed(heapq.nsmallest(5, items, key=key)))
+        else:
+            band_subtasks.update(
+                sorted(self._band_counter.items(), key=key, reverse=True)
+            )
+        slow_subtasks = {}
+        for duration, band, subtask in sorted(
+            self._slow_subtasks, key=operator.itemgetter(0), reverse=True
+        ):
+            slow_subtasks[f"[{band}]{subtask}"] = duration
+        return {"band_subtasks": band_subtasks, "slow_subtasks": slow_subtasks}
+
+
+class _ProfilingData:
+    def __init__(self):
+        self._data = {}
+        self._call_stats = {}
+        self._subtask_stats = {}
+        self._debug_task = {}
+
+    def init(self, task_id: str, options=None):
+        options = _ProfilingOptions(options)
+        logger.info(
+            "Init profiling data for task %s with debug interval seconds %s.",
+            task_id,
+            options.debug_interval_seconds,
+        )
+        self._data[task_id] = {
+            "general": {},
+            "serialization": {},
+            "most_calls": {},
+            "slow_calls": {},
+            "band_subtasks": {},
+            "slow_subtasks": {},
+        }
+        self._call_stats[task_id] = _CallStats(options)
+        self._subtask_stats[task_id] = _SubtaskStats(options)
+
+        async def _debug_profiling_log():
+            while True:
+                try:
+                    r = self._data.get(task_id, None)
+                    if r is None:
+                        logger.info("Profiling debug log break.")
+                        break
+                    r = copy.copy(r)  # shallow copy is enough.
+                    r.update(self._call_stats.get(task_id).to_dict())
+                    r.update(self._subtask_stats.get(task_id).to_dict())
+                    logger.warning("Profiling debug:\n%s", json.dumps(r, indent=4))
+                except Exception:
+                    logger.exception("Profiling debug log failed.")
+                await asyncio.sleep(options.debug_interval_seconds)
+
+        if options.debug_interval_seconds is not None:
+            self._debug_task[task_id] = task = asyncio.create_task(
+                _debug_profiling_log()
+            )
+            task.add_done_callback(lambda _: self._debug_task.pop(task_id, None))
+
+    def pop(self, task_id: str):
+        logger.info("Pop profiling data of task %s.", task_id)
+        debug_task = self._debug_task.pop(task_id, None)
+        if debug_task is not None:
+            debug_task.cancel()
+        r = self._data.pop(task_id, None)
+        if r is not None:
+            r.update(self._call_stats.pop(task_id).to_dict())
+            r.update(self._subtask_stats.pop(task_id).to_dict())
+        return r
+
+    def collect_actor_call(self, message, duration: float):
+        if self._call_stats:
+            message_type = type(message)
+            if message_type is SendMessage or message_type is TellMessage:
+                for stats in self._call_stats.values():
+                    stats.collect(message, duration)
+
+    def collect_subtask(self, subtask, band: BandType, duration: float):
+        if self._subtask_stats:
+            stats = self._subtask_stats.get(subtask.task_id)
+            if stats is not None:
+                stats.collect(subtask, band, duration)
+
+    def __getitem__(self, item):
+        key = item if isinstance(item, tuple) else (item,)
+        v = None
+        d = self._data
+        for k in key:
+            v = d.get(k, None)
+            if v is None:
+                break
+            else:
+                d = v
+        return DummyOperator if v is None else ProfilingDataOperator(v)
+
+
+ProfilingData = _ProfilingData()
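+
+
+# An illustrative sketch of typical use of the module-level ProfilingData
+# singleton (the task id is hypothetical):
+#
+#     ProfilingData.init("task-1", {"debug_interval_seconds": 1.0})
+#     ProfilingData["task-1", "serialization"].inc("total_seconds", 0.2)
+#     stats = ProfilingData.pop("task-1")   # dict merged with call/subtask stats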
diff --git a/python/xorbits/_mars/oscar/tests/__init__.py b/python/xorbits/_mars/oscar/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/oscar/tests/test_actorcaller.py b/python/xorbits/_mars/oscar/tests/test_actorcaller.py
new file mode 100644
index 000000000..1e271f7ca
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/test_actorcaller.py
@@ -0,0 +1,84 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+
+import pytest
+
+from ...tests.core import mock
+from ..backends.core import ActorCaller
+from ..backends.router import Router
+from ..errors import ServerClosed
+
+
+@pytest.mark.asyncio
+@mock.patch.object(Router, "get_client")
+async def test_send_when_close(fake_get_client):
+    class FakeClient:
+        def __init__(self):
+            self.closed = False
+            self.send_num = 0
+            self._messages = asyncio.Queue()
+            self.dest_address = "test"
+
+        async def send(self, message):
+            await self._messages.put(message)
+            self.send_num += 1
+            if self.send_num >= 3:
+                raise ConnectionError("test")
+
+        async def recv(self, *args, **kwargs):
+            await asyncio.sleep(3)
+            res = await self._messages.get()
+            return res
+
+        async def close(self):
+            self.closed = True
+
+    fake_client = FakeClient()
+    fake_get_client.side_effect = lambda *args, **kwargs: fake_client
+
+    class FakeMessage:
+        def __init__(self, id_num):
+            self.message_id = id_num
+
+    caller = ActorCaller()
+
+    router = Router(
+        external_addresses=["test1"],
+        local_address="test2",
+    )
+    futures = []
+    for index in range(2):
+        futures.append(
+            await caller.call(
+                router=router,
+                dest_address="test1",
+                message=FakeMessage(index),
+                wait=False,
+            )
+        )
+
+    with pytest.raises(ServerClosed):
+        # Just wait for the _listen task to run.
+        await asyncio.sleep(1)
+        await caller.call(
+            router=router, dest_address="test1", message=FakeMessage(2), wait=False
+        )
+
+    res0 = await futures[0]
+    assert res0.message_id == 0
+
+    with pytest.raises(ServerClosed):
+        await futures[1]
diff --git a/python/xorbits/_mars/oscar/tests/test_batch.py b/python/xorbits/_mars/oscar/tests/test_batch.py
new file mode 100644
index 000000000..bf81ad6b1
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/test_batch.py
@@ -0,0 +1,216 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import sys
+
+import pytest
+
+from ..batch import build_args_binder, extensible
+
+
+def _wrap_async(use_async):
+    def wrapper(func):
+        async def _wrapped(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        _wrapped.__name__ = func.__name__
+        return _wrapped if use_async else func
+
+    return wrapper
+
+
+def test_args_binder():
+    anon_binder = build_args_binder(lambda x, y=10: None, remove_self=False)
+    assert (20, 10) == anon_binder(20)
+
+    def fun1(a, b=10):
+        pass
+
+    binder1 = build_args_binder(fun1, remove_self=False)
+    assert (20, 10) == binder1(20)
+
+    async def fun2(*, kw_only=10, **kw):
+        pass
+
+    binder2 = build_args_binder(fun2, remove_self=False)
+    assert (20, {"ext_arg": 5}) == binder2(kw_only=20, ext_arg=5)
+
+    async def fun3(x, *args, kw_only=10, **kw):
+        pass
+
+    binder3 = build_args_binder(fun3, remove_self=False)
+    assert 10 == binder3(20, 36, ext_arg=5).kw_only
+    assert (20, (36,), 10, {"ext_arg": 5}) == binder3(20, 36, ext_arg=5)
+
+
+def test_extensible_bind():
+    class TestClass:
+        def __init__(self):
+            self.a_list = []
+            self.b_list = []
+
+        @extensible
+        def method(self, a, b=10):
+            pass
+
+        @method.batch
+        def method(self, args_list, kwargs_list):
+            for args, kwargs in zip(args_list, kwargs_list):
+                a, b = self.method.bind(*args, **kwargs)
+                self.a_list.append(a)
+                self.b_list.append(b)
+
+    test_inst = TestClass()
+    test_inst.method.batch(
+        test_inst.method.delay(20),
+        test_inst.method.delay(30, 5),
+    )
+    assert test_inst.a_list == [20, 30]
+    assert test_inst.b_list == [10, 5]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("use_async", [False, True])
+@pytest.mark.skipif(
+    sys.version_info[:2] < (3, 7), reason="only run with Python 3.7 or greater"
+)
+async def test_extensible_no_batch(use_async):
+    class TestClass:
+        def __init__(self):
+            self.arg_list = []
+            self.kwarg_list = []
+
+        @extensible
+        @_wrap_async(use_async)
+        def method(self, *args, **kwargs):
+            self.arg_list.append(tuple(a - 1 for a in args))
+            self.kwarg_list.append({k: v - 1 for k, v in kwargs.items()})
+            return len(self.kwarg_list)
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch(
+        test_inst.method.delay(12, kwarg=34), test_inst.method.delay(10, kwarg=33)
+    )
+    ret = await ret if use_async else ret
+    assert ret == [1, 2]
+    assert test_inst.arg_list == [(11,), (9,)]
+    assert test_inst.kwarg_list == [{"kwarg": 33}, {"kwarg": 32}]
+
+    if use_async:
+        test_inst = TestClass()
+        ret = await test_inst.method.batch(
+            test_inst.method.delay(12, kwarg=34), test_inst.method.delay(10, kwarg=33)
+        )
+        assert ret == [1, 2]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("use_async", [False, True])
+async def test_extensible_batch_only(use_async):
+    class TestClass:
+        def __init__(self):
+            self.arg_list = []
+            self.kwarg_list = []
+
+        @extensible
+        @_wrap_async(use_async)
+        def not_implemented_method(self, *args, **kw):
+            raise NotImplementedError
+
+        @extensible
+        @_wrap_async(use_async)
+        def method(self, *args, **kwargs):
+            raise NotImplementedError
+
+        @method.batch
+        @_wrap_async(use_async)
+        def method(self, args_list, kwargs_list):
+            self.arg_list.extend(args_list)
+            self.kwarg_list.extend(kwargs_list)
+            return [len(self.kwarg_list)] * len(args_list)
+
+    if use_async:
+        assert asyncio.iscoroutinefunction(TestClass.method)
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch()
+    ret = await ret if use_async else ret
+    assert ret == []
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch(test_inst.method.delay(12))
+    ret = await ret if use_async else ret
+    assert ret == [1]
+
+    test_inst = TestClass()
+    ret = test_inst.method.batch(test_inst.method.delay(12), test_inst.method.delay(10))
+    ret = await ret if use_async else ret
+    assert ret == [2, 2]
+    assert test_inst.arg_list == [(12,), (10,)]
+    assert test_inst.kwarg_list == [{}, {}]
+
+    test_inst = TestClass()
+    for _ in range(2):
+        with pytest.raises(NotImplementedError):
+            ret = test_inst.not_implemented_method()
+            await ret if use_async else ret
+    ret = test_inst.method(12, kwarg=34)
+    ret = await ret if use_async else ret
+    assert ret == 1
+    assert test_inst.arg_list == [(12,)]
+    assert test_inst.kwarg_list == [{"kwarg": 34}]
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(
+    sys.version_info[:2] < (3, 7), reason="only run with Python 3.7 or greater"
+)
+@pytest.mark.parametrize("use_async", [False, True])
+async def test_extensible_single_with_batch(use_async):
+    class TestClass:
+        def __init__(self):
+            self.arg_list = []
+            self.kwarg_list = []
+
+        @extensible
+        @_wrap_async(use_async)
+        def method(self, *args, **kwargs):
+            self.arg_list.append(tuple(a * 2 for a in args))
+            self.kwarg_list.append({k: v * 2 for k, v in kwargs.items()})
+            return len(self.kwarg_list)
+
+        @method.batch
+        @_wrap_async(use_async)
+        def method(self, args_list, kwargs_list):
+            self.arg_list.extend([tuple(a * 2 + 1 for a in args) for args in args_list])
+            self.kwarg_list.extend(
+                [{k: v * 2 + 1 for k, v in kwargs.items()} for kwargs in kwargs_list]
+            )
+            return [len(self.kwarg_list)] * len(args_list)
+
+    if use_async:
+        assert asyncio.iscoroutinefunction(TestClass.method)
+
+    test_inst = TestClass()
+    ret = test_inst.method(15, kwarg=56)
+    ret = await ret if use_async else ret
+    assert ret == 1
+    ret = test_inst.method.batch(
+        test_inst.method.delay(16, kwarg=57), test_inst.method.delay(17, kwarg=58)
+    )
+    ret = await ret if use_async else ret
+    assert ret == [3, 3]
+    assert test_inst.arg_list == [(30,), (33,), (35,)]
+    assert test_inst.kwarg_list == [{"kwarg": 112}, {"kwarg": 115}, {"kwarg": 117}]
diff --git a/python/xorbits/_mars/oscar/tests/test_profiling.py b/python/xorbits/_mars/oscar/tests/test_profiling.py
new file mode 100644
index 000000000..75802de9c
--- /dev/null
+++ b/python/xorbits/_mars/oscar/tests/test_profiling.py
@@ -0,0 +1,177 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import dataclasses
+import os
+
+import pytest
+
+from ...tests.core import check_dict_structure_same, mock
+from ..backends.message import SendMessage
+from ..profiling import (
+    DummyOperator,
+    ProfilingData,
+    ProfilingDataOperator,
+    _CallStats,
+    _ProfilingOptions,
+    _SubtaskStats,
+)
+
+
+def test_profiling_data():
+    ProfilingData.init("abc")
+    try:
+        for n in ["general", "serialization"]:
+            assert isinstance(ProfilingData["abc", n], ProfilingDataOperator)
+        assert ProfilingData["def"] is DummyOperator
+        assert ProfilingData["abc", "def"] is DummyOperator
+        assert ProfilingData["abc", "def", 1] is DummyOperator
+        ProfilingData["def"].set("a", 1)
+        ProfilingData["def"].inc("b", 1)
+        assert ProfilingData["def"].empty()
+        assert sum(ProfilingData["def"].nest("a").values()) == 0
+        ProfilingData["abc", "serialization"].set("a", 1)
+        ProfilingData["abc", "serialization"].inc("b", 1)
+        with pytest.raises(TypeError):
+            assert ProfilingData["abc", "serialization"].nest("a")
+        assert sum(ProfilingData["abc", "serialization"].nest("c").values()) == 0
+        assert not ProfilingData["abc", "serialization"].empty()
+    finally:
+        v = ProfilingData.pop("abc")
+        check_dict_structure_same(
+            v,
+            {
+                "general": {},
+                "serialization": {"a": 1, "b": 1, "c": {}},
+                "most_calls": {},
+                "slow_calls": {},
+                "band_subtasks": {},
+                "slow_subtasks": {},
+            },
+        )
+
+
+@pytest.mark.asyncio
+@mock.patch("mars.oscar.profiling.logger.warning")
+async def test_profiling_debug(fake_warning):
+    ProfilingData.init("abc", {"debug_interval_seconds": 0.1})
+    assert len(ProfilingData._debug_task) == 1
+    assert not ProfilingData._debug_task["abc"].done()
+    await asyncio.sleep(0.5)
+    assert fake_warning.call_count > 1
+    ProfilingData.pop("abc")
+    call_count = fake_warning.call_count
+    assert len(ProfilingData._debug_task) == 0
+    await asyncio.sleep(0.5)
+    assert fake_warning.call_count == call_count
+
+    ProfilingData.init("abc", {"debug_interval_seconds": 0.1})
+    assert len(ProfilingData._debug_task) == 1
+    await asyncio.sleep(0.5)
+    assert fake_warning.call_count > call_count
+    ProfilingData._data.clear()
+    call_count = fake_warning.call_count
+    await asyncio.sleep(0.5)
+    assert len(ProfilingData._debug_task) == 0
+    assert fake_warning.call_count == call_count
+
+
+@pytest.mark.asyncio
+async def test_profiling_options():
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", 1.2)
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", ["invalid"])
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", {"invalid": True})
+    with pytest.raises(ValueError):
+        ProfilingData.init("abc", {"debug_interval_seconds": "abc"})
+
+    # Test the priority, options first, then env var.
+    env_key = "MARS_PROFILING_DEBUG_INTERVAL_SECONDS"
+    try:
+        os.environ[env_key] = "2"
+        options = _ProfilingOptions(True)
+        assert options.debug_interval_seconds == 2.0
+        options = _ProfilingOptions({"debug_interval_seconds": 1.0})
+        assert options.debug_interval_seconds == 1.0
+    finally:
+        os.environ.pop(env_key)
+
+    # Test option value cache.
+    d = {"debug_interval_seconds": 1.0}
+    options = _ProfilingOptions(d)
+    assert options.debug_interval_seconds == 1.0
+    d["debug_interval_seconds"] = 2.0
+    assert options.debug_interval_seconds == 1.0
+    try:
+        os.environ[env_key] = "2"
+        assert options.debug_interval_seconds == 1.0
+    finally:
+        os.environ.pop(env_key)
+
+
+def test_collect():
+    options = _ProfilingOptions(
+        {"slow_calls_duration_threshold": 0, "slow_subtasks_duration_threshold": 0}
+    )
+
+    # Test collect message with incomparable arguments.
+    from ..core import ActorRef
+
+    fake_actor_ref = ActorRef("def", b"uid")
+    fake_message1 = SendMessage(b"abc", fake_actor_ref, ["name", {}])
+    fake_message2 = SendMessage(b"abc", fake_actor_ref, ["name", 1])
+
+    cs = _CallStats(options)
+    cs.collect(fake_message1, 1.0)
+    cs.collect(fake_message2, 1.0)
+
+    @dataclasses.dataclass
+    class _FakeSubtask:
+        extra_config: dict
+
+    # Test collect subtask with incomparable arguments.
+    band = ("1.2.3.4", "numa-0")
+    subtask1 = _FakeSubtask({})
+    subtask2 = _FakeSubtask(None)
+    ss = _SubtaskStats(options)
+    ss.collect(subtask1, band, 1.0)
+    ss.collect(subtask2, band, 1.0)
+
+    # Test call stats order.
+    cs = _CallStats(options)
+    for i in range(20):
+        fake_message = SendMessage(
+            f"{i}".encode(), fake_actor_ref, ["name", True, (i,), {}]
+        )
+        cs.collect(fake_message, i)
+    d = cs.to_dict()
+    assert list(d["most_calls"].values())[0] == 20
+    assert list(d["slow_calls"].values()) == list(reversed(range(10, 20)))
+
+    # Test subtask stats order.
+    ss = _SubtaskStats(options)
+    counter = 0
+    for i in range(20):
+        for j in range(i):
+            fake_message = _FakeSubtask(counter)
+            ss.collect(fake_message, (str(j), "numa-0"), counter)
+            counter += 1
+    d = ss.to_dict()
+    assert list(d["band_subtasks"].values()) == [19, 18, 17, 16, 15, 5, 4, 3, 2, 1]
+    assert list(d["slow_subtasks"].values()) == list(
+        reversed(range(counter - 10, counter))
+    )
diff --git a/python/xorbits/_mars/oscar/utils.pxd b/python/xorbits/_mars/oscar/utils.pxd
new file mode 100644
index 000000000..7acd646c1
--- /dev/null
+++ b/python/xorbits/_mars/oscar/utils.pxd
@@ -0,0 +1,16 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cpdef bytes new_actor_id()
+cdef bint is_async_generator(obj)
diff --git a/python/xorbits/_mars/oscar/utils.pyx b/python/xorbits/_mars/oscar/utils.pyx
new file mode 100644
index 000000000..d08e55998
--- /dev/null
+++ b/python/xorbits/_mars/oscar/utils.pyx
@@ -0,0 +1,77 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import AsyncGenerator
+
+from .._utils cimport new_random_id, to_str
+from .core cimport ActorRef, LocalActorRef
+
+
+cpdef bytes new_actor_id():
+    return new_random_id(32)
+
+
+def create_actor_ref(*args, **kwargs):
+    """
+    Create an actor reference.
+
+    Returns
+    -------
+    ActorRef
+    """
+
+    cdef str address
+    cdef object uid
+    cdef ActorRef existing_ref
+
+    address = to_str(kwargs.pop('address', None))
+    uid = kwargs.pop('uid', None)
+
+    if kwargs:
+        raise ValueError('Only `address` or `uid` keywords are supported')
+
+    if len(args) == 2:
+        if address:
+            raise ValueError('address has been specified')
+        address = to_str(args[0])
+        uid = args[1]
+    elif len(args) == 1:
+        tp0 = type(args[0])
+        if tp0 is ActorRef or tp0 is LocalActorRef:
+            existing_ref = <ActorRef>args[0]
+            uid = existing_ref.uid
+            address = to_str(address or existing_ref.address)
+        else:
+            uid = args[0]
+
+    if uid is None:
+        raise ValueError('Actor uid should be provided')
+
+    return ActorRef(address, uid)
+
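+# An illustrative sketch of the calling conventions accepted by
+# create_actor_ref() above (addresses and uids are hypothetical):
+#
+#     create_actor_ref("127.0.0.1:12345", b"my_actor")            # address, uid
+#     create_actor_ref(address="127.0.0.1:12345", uid=b"my_actor")
+#     create_actor_ref(existing_ref)                               # copy an existing ref
+#     create_actor_ref(existing_ref, address="127.0.0.1:23456")    # rebind to another address
+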
+
+cdef set _is_async_generator_typecache = set()
+
+
+cdef bint is_async_generator(obj):
+    cdef type tp = type(obj)
+    if tp in _is_async_generator_typecache:
+        return True
+
+    if isinstance(obj, AsyncGenerator):
+        if len(_is_async_generator_typecache) < 100:
+            _is_async_generator_typecache.add(tp)
+        return True
+    else:
+        return False
diff --git a/python/xorbits/_mars/remote/__init__.py b/python/xorbits/_mars/remote/__init__.py
new file mode 100644
index 000000000..3aecffae3
--- /dev/null
+++ b/python/xorbits/_mars/remote/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# noinspection PyUnresolvedReferences
+from ..core import ExecutableTuple
+from .core import spawn
+from .run_script import run_script
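+
+
+# An illustrative sketch of the `spawn` API re-exported above (session setup
+# omitted; the function and values below are hypothetical):
+#
+#     import xorbits._mars.remote as mr
+#
+#     def add(x, y):
+#         return x + y
+#
+#     a = mr.spawn(add, args=(1, 2))
+#     b = mr.spawn(add, args=(a, 10))   # spawned results can feed later spawns
+#     b.execute()
+#     b.fetch()                         # -> 13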
diff --git a/python/xorbits/_mars/remote/core.py b/python/xorbits/_mars/remote/core.py
new file mode 100644
index 000000000..c5c5a443b
--- /dev/null
+++ b/python/xorbits/_mars/remote/core.py
@@ -0,0 +1,343 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import UserDict
+from collections.abc import Iterable
+from functools import partial
+
+import numpy as np
+
+from .. import opcodes
+from ..core import ENTITY_TYPE, ChunkData, OutputType, Tileable
+from ..core.custom_log import redirect_custom_log
+from ..core.operand import Operand
+from ..dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
+from ..serialization.serializables import (
+    BoolField,
+    DictField,
+    FunctionField,
+    Int32Field,
+    ListField,
+)
+from ..tensor.core import TENSOR_TYPE
+from ..utils import (
+    build_fetch_tileable,
+    enter_current_session,
+    find_objects,
+    merge_chunks,
+    merged_chunk_as_tileable_type,
+    replace_objects,
+)
+from .operands import RemoteOperandMixin
+
+
+class RemoteFunction(RemoteOperandMixin, Operand):
+    _op_type_ = opcodes.REMOTE_FUNCATION
+    _op_module_ = "remote"
+
+    function = FunctionField("function")
+    function_args = ListField("function_args")
+    function_kwargs = DictField("function_kwargs")
+    retry_when_fail = BoolField("retry_when_fail")
+    resolve_tileable_input = BoolField("resolve_tileable_input", default=False)
+    n_output = Int32Field("n_output", default=None)
+
+    def __init__(self, output_types=None, **kwargs):
+        super().__init__(_output_types=output_types, **kwargs)
+
+    @property
+    def output_limit(self):
+        return self.n_output or 1
+
+    @property
+    def retryable(self) -> bool:
+        return self.retry_when_fail
+
+    @classmethod
+    def _no_prepare(cls, tileable):
+        return isinstance(
+            tileable, (TENSOR_TYPE, DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE)
+        )
+
+    def _set_inputs(self, inputs):
+        raw_inputs = getattr(self, "_inputs", None)
+        super()._set_inputs(inputs)
+
+        function_inputs = iter(inp for inp in self._inputs)
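+        # map each original input to its freshly-set counterpart so that references
+        # inside function_args / function_kwargs can be rewritten below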
+        mapping = {inp: new_inp for inp, new_inp in zip(inputs, self._inputs)}
+        if raw_inputs is not None:
+            for raw_inp in raw_inputs:
+                if self._no_prepare(raw_inp):
+                    if not isinstance(self._inputs[0], ChunkData):
+                        # not in tile, set_inputs from tileable
+                        mapping[raw_inp] = next(function_inputs)
+                    else:
+                        # in tile, set_inputs from chunk
+                        mapping[raw_inp] = build_fetch_tileable(raw_inp)
+                else:
+                    mapping[raw_inp] = next(function_inputs)
+        self.function_args = replace_objects(self.function_args, mapping)
+        self.function_kwargs = replace_objects(self.function_kwargs, mapping)
+
+    def __call__(self):
+        find_inputs = partial(find_objects, types=ENTITY_TYPE)
+        inputs = find_inputs(self.function_args) + find_inputs(self.function_kwargs)
+        if self.n_output is None:
+            return self.new_tileable(inputs)
+        else:
+            return self.new_tileables(
+                inputs, kws=[dict(i=i) for i in range(self.n_output)]
+            )
+
+    @classmethod
+    def tile(cls, op):
+        outs = op.outputs
+        chunk_op = op.copy().reset_key()
+
+        chunk_inputs = []
+        pure_depends = []
+        executed = False
+        for inp in op.inputs:
+            if cls._no_prepare(inp):  # pragma: no cover
+                if not executed:
+                    # trigger execution only once
+                    executed = True
+                    yield
+                # if input is a tensor, DataFrame etc.,
+                # do not prepare data, because the data may be too huge,
+                # and users can choose to fetch a slice of the data themselves
+                pure_depends.extend([not op.resolve_tileable_input] * len(inp.chunks))
+            else:
+                pure_depends.extend([False] * len(inp.chunks))
+            chunk_inputs.extend(inp.chunks)
+        chunk_op._pure_depends = pure_depends
+        # record tileable op key for chunk op
+        chunk_op.tileable_op_key = op.key
+
+        out_chunks = [list() for _ in range(len(outs))]
+        chunk_kws = []
+        for i, (out, out_type) in enumerate(zip(outs, op.output_types)):
+            chunk_params = out.params.copy()
+            chunk_params["i"] = i
+            chunk_kws.append(chunk_params)
+            if out_type == OutputType.dataframe:
+                chunk_params["index"] = (0, 0)
+                chunk_params["shape"] = (np.nan, np.nan)
+            elif out_type == OutputType.series:
+                chunk_params["index"] = (0,)
+                chunk_params["shape"] = (np.nan,)
+            elif out_type == OutputType.df_or_series:
+                chunk_params["index"] = (0, 0)
+                chunk_params["shape"] = (np.nan, np.nan)
+                chunk_params["collapse_axis"] = 1
+            else:
+                chunk_params["index"] = ()
+                chunk_params["shape"] = ()
+        chunks = chunk_op.new_chunks(chunk_inputs, kws=chunk_kws)
+        for i, c in enumerate(chunks):
+            out_chunks[i].append(c)
+
+        kws = []
+        for i, (out, out_type) in enumerate(zip(outs, op.output_types)):
+            params = out.params.copy()
+            params["chunks"] = out_chunks[i]
+            if out_type == OutputType.dataframe:
+                params["nsplits"] = ((np.nan,), (np.nan,))
+            elif out_type == OutputType.series:
+                params["nsplits"] = ((np.nan,),)
+            else:
+                params["nsplits"] = ()
+            kws.append(params)
+        new_op = op.copy()
+        return new_op.new_tileables(op.inputs, kws=kws)
+
+    @classmethod
+    @redirect_custom_log
+    @enter_current_session
+    def execute(cls, ctx, op: "RemoteFunction"):
+        class MapperWrapper(UserDict):
+            def __getitem__(self, item):
+                if op.resolve_tileable_input and isinstance(item, Tileable):
+                    index_chunks = [(c.index, ctx[c.key]) for c in item.chunks]
+                    merged = merge_chunks(index_chunks)
+                    return merged_chunk_as_tileable_type(merged, item)
+                return super().__getitem__(item)
+
+        mapping = MapperWrapper(
+            {
+                inp: ctx[inp.key]
+                for inp, is_pure_dep in zip(op.inputs, op.pure_depends)
+                if not is_pure_dep
+            }
+        )
+
+        function = op.function
+        function_args = replace_objects(op.function_args, mapping)
+        function_kwargs = replace_objects(op.function_kwargs, mapping)
+
+        result = function(*function_args, **function_kwargs)
+
+        if op.n_output is None:
+            ctx[op.outputs[0].key] = result
+        else:
+            if not isinstance(result, Iterable):
+                raise TypeError(
+                    f"Specifying n_output={op.n_output}, "
+                    f"but result is not iterable, got {result}"
+                )
+            result = list(result)
+            if len(result) != op.n_output:
+                raise ValueError(
+                    f"Length of return value should be {op.n_output}, "
+                    f"got {len(result)}"
+                )
+            for out, r in zip(op.outputs, result):
+                ctx[out.key] = r
+
+
+def spawn(
+    func,
+    args=(),
+    kwargs=None,
+    retry_when_fail=False,
+    resolve_tileable_input=False,
+    n_output=None,
+    output_types=None,
+    **kw,
+):
+    """
+    Spawn a function and return a Mars Object which can be executed later.
+
+    Parameters
+    ----------
+    func : function
+        Function to spawn.
+    args: tuple
+        Args to pass to the function.
+    kwargs: dict
+        Kwargs to pass to the function.
+    retry_when_fail: bool, default False
+        If True, retry when the function fails.
+    resolve_tileable_input: bool, default False
+        If True, resolve tileable inputs as values.
+    n_output: int
+        Count of outputs of the function.
+    output_types: str or list, default "object"
+        Specify the type of returned objects.
+
+    Returns
+    -------
+    Object
+        Mars Object.
+
+    Examples
+    --------
+    >>> import mars.remote as mr
+    >>> def inc(x):
+    >>>     return x + 1
+    >>>
+    >>> result = mr.spawn(inc, args=(0,))
+    >>> result
+    Object 
+    >>> result.execute().fetch()
+    1
+
+    A list of spawned functions can be converted to :class:`mars.remote.ExecutableTuple`,
+    and `.execute()` can be called to run them together.
+
+    >>> results = [mr.spawn(inc, args=(i,)) for i in range(10)]
+    >>> mr.ExecutableTuple(results).execute().fetch()
+    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    Mars Objects returned by :meth:`mars.remote.spawn` can be passed
+    as arguments to other spawned functions.
+
+    >>> results = [mr.spawn(inc, args=(i,)) for i in range(10)]   # list of spawned functions
+    >>> def sum_all(xs):
+    >>>     return sum(xs)
+    >>> mr.spawn(sum_all, args=(results,)).execute().fetch()
+    55
+
+    Inside a spawned function, new functions can be spawned.
+
+    >>> def driver():
+    >>>     results = [mr.spawn(inc, args=(i,)) for i in range(10)]
+    >>>     return mr.ExecutableTuple(results).execute().fetch()
+    >>>
+    >>> mr.spawn(driver).execute().fetch()
+    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    Mars tensor, DataFrame and so forth are available in spawned functions as well.
+
+    >>> import mars.tensor as mt
+    >>> def driver2():
+    >>>     t = mt.random.rand(10, 10)
+    >>>     return t.sum().to_numpy()
+    >>>
+    >>> mr.spawn(driver2).execute().fetch()
+    52.47844223908132
+
+    The `n_output` argument indicates that the spawned function will return multiple outputs.
+    This is important when some of the outputs may be passed to different functions.
+
+    >>> def triage(alist):
+    >>>     ret = [], []
+    >>>     for i in alist:
+    >>>         if i < 0.5:
+    >>>             ret[0].append(i)
+    >>>         else:
+    >>>             ret[1].append(i)
+    >>>     return ret
+    >>>
+    >>> def sum_all(xs):
+    >>>     return sum(xs)
+    >>>
+    >>> l = [0.4, 0.7, 0.2, 0.8]
+    >>> la, lb = mr.spawn(triage, args=(l,), n_output=2)
+    >>>
+    >>> sa = mr.spawn(sum_all, args=(la,))
+    >>> sb = mr.spawn(sum_all, args=(lb,))
+    >>> mr.ExecutableTuple([sa, sb]).execute().fetch()
+    [0.6000000000000001, 1.5]
+    """
+    if not isinstance(args, tuple):
+        args = [args]
+    else:
+        args = list(args)
+    if kwargs is None:
+        kwargs = dict()
+    if not isinstance(output_types, (list, tuple)):
+        if output_types is None:
+            output_types = OutputType.object
+        elif isinstance(output_types, str):
+            output_types = getattr(OutputType, output_types)
+        output_types = [output_types] if n_output is None else [output_types] * n_output
+
+    if not isinstance(kwargs, dict):
+        raise TypeError("kwargs has to be a dict")
+
+    op = RemoteFunction(
+        function=func,
+        function_args=args,
+        function_kwargs=kwargs,
+        retry_when_fail=retry_when_fail,
+        resolve_tileable_input=resolve_tileable_input,
+        n_output=n_output,
+        output_types=output_types,
+        **kw,
+    )
+    if op.extra_params:
+        raise ValueError(f"Unexpected kw: {list(op.extra_params)[0]}")
+    return op()
diff --git a/python/xorbits/_mars/remote/operands.py b/python/xorbits/_mars/remote/operands.py
new file mode 100644
index 000000000..b01fc2588
--- /dev/null
+++ b/python/xorbits/_mars/remote/operands.py
@@ -0,0 +1,34 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core import FuseChunk, FuseChunkData
+from ..core.operand import Fuse, FuseChunkMixin, ObjectOperandMixin
+
+
+class RemoteFuseChunkMixin(ObjectOperandMixin, FuseChunkMixin):
+    __slots__ = ()
+
+    def _create_chunk(self, output_idx, index, **kw):
+        data = FuseChunkData(_index=index, _op=self, **kw)
+
+        return FuseChunk(data)
+
+
+class RemoteFuseChunk(RemoteFuseChunkMixin, Fuse):
+    pass
+
+
+class RemoteOperandMixin(ObjectOperandMixin):
+    def get_fuse_op_cls(self, _):
+        return RemoteFuseChunk
diff --git a/python/xorbits/_mars/remote/run_script.py b/python/xorbits/_mars/remote/run_script.py
new file mode 100644
index 000000000..3284ff0a6
--- /dev/null
+++ b/python/xorbits/_mars/remote/run_script.py
@@ -0,0 +1,248 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from typing import Any, BinaryIO, Dict, List, TextIO, Union
+
+import numpy as np
+
+from .. import opcodes
+from ..core import TILEABLE_TYPE, OutputType
+from ..core.context import Context
+from ..core.operand import MergeDictOperand
+from ..serialization.serializables import (
+    BoolField,
+    BytesField,
+    DictField,
+    Int32Field,
+    ListField,
+)
+from ..typing import SessionType, TileableType
+from ..utils import build_fetch_tileable, to_binary
+
+
+class RunScript(MergeDictOperand):
+    _op_type_ = opcodes.RUN_SCRIPT
+
+    _code: bytes = BytesField("code")
+    _data: Dict[str, TileableType] = DictField("data")
+    _retry_when_fail: bool = BoolField("retry_when_fail")
+    _command_args: List[str] = ListField("command_args")
+    _world_size: int = Int32Field("world_size")
+    _rank: int = Int32Field("rank")
+
+    def __init__(
+        self,
+        code=None,
+        data=None,
+        world_size=None,
+        rank=None,
+        retry_when_fail=None,
+        command_args=None,
+        **kw
+    ):
+        super().__init__(
+            _code=code,
+            _data=data,
+            _world_size=world_size,
+            _rank=rank,
+            _retry_when_fail=retry_when_fail,
+            _command_args=command_args,
+            **kw
+        )
+        if self.output_types is None:
+            self.output_types = [OutputType.object]
+
+    @property
+    def code(self):
+        return self._code
+
+    @property
+    def data(self):
+        return self._data
+
+    @property
+    def world_size(self):
+        return self._world_size
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def command_args(self):
+        return self._command_args or []
+
+    @property
+    def retryable(self):
+        return self._retry_when_fail
+
+    def __call__(self, inputs):
+        return self.new_tileable(inputs)
+
+    @classmethod
+    def _get_chunk_data(cls, op: "RunScript"):
+        new_data = None
+        input_chunks = []
+        inputs_iter = iter(op.inputs)
+        if op.data:
+            new_data = dict()
+            for k, v in op.data.items():
+                if isinstance(v, TILEABLE_TYPE):
+                    v = next(inputs_iter)
+                    new_data[k] = build_fetch_tileable(v)
+                    input_chunks.extend(v.chunks)
+                else:
+                    new_data[k] = v
+        return new_data, input_chunks
+
+    @classmethod
+    def tile(cls, op: "RunScript"):
+        if len(op.inputs) > 0:
+            # trigger inputs to execute
+            yield
+
+        new_data, input_chunks = cls._get_chunk_data(op)
+
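+        # one chunk per rank: every chunk executes the same script,
+        # with the RANK environment variable set to its index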
+        out_chunks = []
+        for i in range(op.world_size):
+            chunk_op = op.copy().reset_key()
+            chunk_op._data = new_data
+            chunk_op._rank = i
+            out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,)))
+
+        new_op = op.copy()
+        return new_op.new_tileables(
+            op.inputs,
+            chunks=out_chunks,
+            nsplits=(tuple(np.nan for _ in range(len(out_chunks))),),
+        )
+
+    @classmethod
+    def _build_envs(cls, ctx, op):
+        # set mars envs
+        envs = dict()
+        envs["RANK"] = str(op.rank)
+        envs["WORLD_SIZE"] = str(op.world_size)
+        return envs
+
+    @classmethod
+    def _build_locals(cls, ctx: Union[Context, dict], op: "RunScript"):
+        sess = ctx.get_current_session().as_default()
+        local = {"session": sess, "__name__": "__main__"}
+        if op.data is not None:
+            local.update(op.data)
+        return local
+
+    @classmethod
+    def execute(cls, ctx, op):
+        if op.merge:
+            return super().execute(ctx, op)
+
+        old_env = os.environ.copy()
+        envs = cls._build_envs(ctx, op)
+        old_argv = sys.argv.copy()
+
+        # since a new session will be created and set as default, the current default session
+        # needs to be restored after the execution of the script.
+        from ..deploy.oscar.session import get_default_session
+
+        old_default_session = get_default_session()
+
+        try:
+            os.environ.update(envs)
+            sys.argv = ["script"]
+            sys.argv.extend(op.command_args)
+
+            exec(op.code, cls._build_locals(ctx, op))
+
+            if op.rank == 0:
+                ctx[op.outputs[0].key] = {"status": "ok"}
+            else:
+                ctx[op.outputs[0].key] = {}
+        finally:
+            os.environ = old_env
+            sys.argv = old_argv
+            if old_default_session is not None:
+                old_default_session.as_default()
+            sys.stdout.flush()
+
+
+def _extract_inputs(data: Dict[str, TileableType] = None) -> List[TileableType]:
+    if data is not None and not isinstance(data, dict):
+        raise TypeError(
+            "`data` must be a dict whose key is variable name and value is data"
+        )
+
+    inputs = []
+    if data is not None:
+        for v in data.values():
+            if isinstance(v, TILEABLE_TYPE):
+                inputs.append(v)
+
+    return inputs
+
+
+def run_script(
+    script: Union[bytes, str, BinaryIO, TextIO],
+    data: Dict[str, TileableType] = None,
+    n_workers: int = 1,
+    command_argv: List[str] = None,
+    session: SessionType = None,
+    retry_when_fail: bool = False,
+    run_kwargs: Dict[str, Any] = None,
+):
+    """
+    Run script in Mars cluster.
+
+    Parameters
+    ----------
+    script: str or file-like object
+        Script to run.
+    data: dict
+        Variable name to data.
+    n_workers: int
+        Number of workers to run the script.
+    command_argv: list
+        Extra command args for the script.
+    session: Mars session
+        If not provided, the default session will be used.
+    retry_when_fail: bool, default False
+        If True, retry when the function fails.
+    run_kwargs: dict
+        Extra kwargs for session.run.
+
+    Returns
+    -------
+    Object
+        Mars Object.
+
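+
+    Examples
+    --------
+    An illustrative sketch mirroring the tests in this module
+    (assumes a default local session or cluster is available):
+
+    >>> from io import BytesIO
+    >>> import mars.remote as mr
+    >>> script = BytesIO(b"assert session is not None")
+    >>> mr.run_script(script, n_workers=2).fetch()["status"]
+    'ok'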
+    """
+
+    if hasattr(script, "read"):
+        code = script.read()
+    else:
+        with open(os.path.abspath(script), "rb") as f:
+            code = f.read()
+
+    inputs = _extract_inputs(data)
+    op = RunScript(
+        data=data,
+        code=to_binary(code),
+        world_size=n_workers,
+        retry_when_fail=retry_when_fail,
+        command_args=command_argv,
+    )
+    return op(inputs).execute(session=session, **(run_kwargs or {}))
diff --git a/python/xorbits/_mars/remote/tests/__init__.py b/python/xorbits/_mars/remote/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/remote/tests/sample_script.py b/python/xorbits/_mars/remote/tests/sample_script.py
new file mode 100644
index 000000000..19e65a9e3
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/sample_script.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+assert os.environ["WORLD_SIZE"] == "2"
diff --git a/python/xorbits/_mars/remote/tests/test_remote_function.py b/python/xorbits/_mars/remote/tests/test_remote_function.py
new file mode 100644
index 000000000..9abee0520
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/test_remote_function.py
@@ -0,0 +1,331 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ... import dataframe as md
+from ... import get_context
+from ... import oscar as mo
+from ... import tensor as mt
+from ...core import tile
+from ...dataframe.core import DATAFRAME_OR_SERIES_TYPE, DATAFRAME_TYPE, SERIES_TYPE
+from ...deploy.oscar.session import get_default_session
+from ...learn.utils import shuffle
+from ...lib.mmh3 import hash as mmh3_hash
+from ...tensor.core import TENSOR_TYPE
+from .. import ExecutableTuple, spawn
+
+
+def test_params():
+    def f(x):
+        return x + 1
+
+    r = spawn(f, args=(1,))
+    c = tile(r).chunks[0]
+    assert isinstance(c.params, dict)
+    c.params = c.get_params_from_data(2)
+    assert isinstance(c.params, dict)
+
+    params = c.params
+    params.pop("index", None)
+    r.params = params
+    r.refresh_params()
+
+
+def test_remote_function(setup):
+    session = setup
+
+    def f1(x):
+        return x + 1
+
+    def f2(x, y, z=None):
+        return x * y * (z[0] + z[1])
+
+    rs = np.random.RandomState(0)
+    raw1 = rs.rand(10, 10)
+    raw2 = rs.rand(10, 10)
+
+    r1 = spawn(f1, raw1)
+    r2 = spawn(f1, raw2)
+    r3 = spawn(f2, (r1, r2), {"z": [r1, r2]})
+
+    result = r3.execute().fetch()
+    expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
+    np.testing.assert_almost_equal(result, expected)
+
+    with pytest.raises(TypeError):
+        spawn(f2, (r1, r2), kwargs=())
+
+    with pytest.raises(ValueError, match="Unexpected kw: k"):
+        spawn(f2, (r1, r2), k=1)
+
+    session_id = session.session_id
+
+    def f():
+        assert get_default_session().session_id == session_id
+        return mt.ones((2, 3)).sum().to_numpy()
+
+    assert spawn(f).execute().fetch() == 6
+
+
+def test_specific_output_types(setup):
+    pd_df = pd.DataFrame(np.ones((10, 3)), columns=["a", "b", "c"])
+
+    def f1():
+        return pd_df
+
+    r = spawn(f1, output_types="dataframe").execute()
+
+    assert isinstance(r, DATAFRAME_TYPE)
+    assert r.index_value is not None
+    pd.testing.assert_frame_equal(r.fetch(), pd_df)
+    pd.testing.assert_index_equal(r.columns.to_pandas(), pd_df.columns)
+
+    pd_series = pd.Series(np.ones((10,)), name="a")
+
+    def f2():
+        return pd_series
+
+    r = spawn(f2, output_types="series").execute()
+
+    assert isinstance(r, SERIES_TYPE)
+    assert r.index_value is not None
+    assert r.name == pd_series.name
+    pd.testing.assert_series_equal(r.fetch(), pd_series)
+
+    def f3(v):
+        if v > 0:
+            return pd_series
+        else:
+            return pd_df
+
+    r = spawn(f3, args=(1,), output_types="df_or_series").execute()
+
+    assert isinstance(r, DATAFRAME_OR_SERIES_TYPE)
+    assert r.index_value is not None
+    assert r.name == pd_series.name
+    assert r.shape == pd_series.shape
+    assert getattr(r, "dtypes", None) is None
+    s = r.ensure_data()
+    assert isinstance(s, SERIES_TYPE)
+    pd.testing.assert_series_equal(s.fetch(), pd_series)
+
+    r = spawn(f3, args=(0,), output_types="df_or_series").execute()
+
+    assert isinstance(r, DATAFRAME_OR_SERIES_TYPE)
+    assert r.index_value is not None
+    pd.testing.assert_series_equal(r.dtypes, pd_df.dtypes)
+    assert r.shape == pd_df.shape
+    assert getattr(r, "dtype", None) is None
+    s = r.ensure_data()
+    assert isinstance(s, DATAFRAME_TYPE)
+    pd.testing.assert_frame_equal(s.fetch(), pd_df)
+
+    np_array = np.random.rand(10, 10)
+
+    def f2():
+        return np_array
+
+    r = spawn(f2, output_types="tensor").execute()
+
+    assert isinstance(r, TENSOR_TYPE)
+    assert r.dtype == np_array.dtype
+    np.testing.assert_array_equal(r.fetch(), np_array)
+
+
+def test_context(setup_cluster):
+    def get_workers():
+        ctx = get_context()
+        return ctx.get_worker_addresses()
+
+    def f1(worker: str):
+        ctx = get_context()
+        assert worker == ctx.worker_address
+        return np.random.rand(3, 3)
+
+    def f2(data_key: str, worker: str):
+        ctx = get_context()
+        assert worker == ctx.worker_address
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta) == 1
+        ctx.get_chunks_result([data_key], fetch_only=True)
+        # fetched, two workers have the data
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta["bands"]) == 2
+
+    workers = spawn(get_workers).execute().fetch()
+    assert len(workers) == len(set(workers)) > 1
+
+    r1 = spawn(f1, args=(workers[0],), expect_worker=workers[0]).execute()
+    data_key = r1._fetch_infos(fields=["data_key"])["data_key"][0]
+    r2 = spawn(f2, args=(data_key, workers[1]), expect_worker=workers[1])
+    r2.execute()
+
+    def get_bands():
+        ctx = get_context()
+        return [b for b in ctx.get_worker_bands() if b[1].startswith("numa-")]
+
+    def f3(band: tuple):
+        ctx = get_context()
+        assert band == ctx.band
+        return np.random.rand(3, 3)
+
+    def f4(data_key: str, band: tuple):
+        ctx = get_context()
+        assert band == ctx.band
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta) == 1
+        ctx.get_chunks_result([data_key], fetch_only=True)
+        # fetched, two bands have the data
+        meta = ctx.get_chunks_meta([data_key], fields=["bands"])[0]
+        assert len(meta["bands"]) == 2
+
+    bands = spawn(get_bands).execute().fetch()
+    assert len(bands) == len(set(bands)) > 1
+
+    r3 = spawn(f3, args=(bands[0],), expect_band=bands[0]).execute()
+    data_key = r3._fetch_infos(fields=["data_key"])["data_key"][0]
+    r4 = spawn(f4, args=(data_key, bands[1]), expect_band=bands[1])
+    r4.execute()
+
+
+def test_multi_output(setup):
+    sentences = ["word1 word2", "word2 word3", "word3 word2 word1"]
+
+    def mapper(s):
+        word_to_count = defaultdict(lambda: 0)
+        for word in s.split():
+            word_to_count[word] += 1
+
+        downsides = [defaultdict(lambda: 0), defaultdict(lambda: 0)]
+        for word, count in word_to_count.items():
+            downsides[mmh3_hash(word) % 2][word] += count
+
+        return downsides
+
+    def reducer(word_to_count_list):
+        d = defaultdict(lambda: 0)
+        for word_to_count in word_to_count_list:
+            for word, count in word_to_count.items():
+                d[word] += count
+
+        return dict(d)
+
+    outs = [], []
+    for sentence in sentences:
+        out1, out2 = spawn(mapper, sentence, n_output=2)
+        outs[0].append(out1)
+        outs[1].append(out2)
+
+    rs = []
+    for out in outs:
+        r = spawn(reducer, out)
+        rs.append(r)
+
+    result = dict()
+    for wc in ExecutableTuple(rs).to_object():
+        result.update(wc)
+
+    assert result == {"word1": 2, "word2": 3, "word3": 2}
+
+
+def test_chained_remote(setup):
+    def f(x):
+        return x + 1
+
+    def g(x):
+        return x * 2
+
+    s = spawn(g, spawn(f, 2))
+
+    result = s.execute().fetch()
+    assert result == 6
+
+
+def test_input_tileable(setup):
+    def f(t, x):
+        return (t * x).sum().to_numpy()
+
+    rs = np.random.RandomState(0)
+    raw = rs.rand(5, 4)
+
+    t1 = mt.tensor(raw, chunk_size=3)
+    t2 = t1.sum(axis=0)
+    s = spawn(f, args=(t2, 3))
+
+    result = s.execute().fetch()
+    expected = (raw.sum(axis=0) * 3).sum()
+    assert pytest.approx(result) == expected
+
+    df1 = md.DataFrame(raw, chunk_size=3)
+    df1.execute()
+    df2 = shuffle(df1)
+    df2.execute()
+
+    def f2(input_df):
+        bonus = input_df.iloc[:, 0].fetch().sum()
+        return input_df.sum().to_pandas() + bonus
+
+    for df in [df1, df2]:
+        s = spawn(f2, args=(df,))
+
+        result = s.execute().fetch()
+        expected = pd.DataFrame(raw).sum() + raw[:, 0].sum()
+        pd.testing.assert_series_equal(result, expected)
+
+
+def test_unknown_shape_inputs(setup):
+    def f(t, x):
+        assert all(not np.isnan(s) for s in t.shape)
+        return (t * x).sum().to_numpy(extra_config={"check_nsplits": False})
+
+    rs = np.random.RandomState(0)
+    raw = rs.rand(5, 4)
+
+    t1 = mt.tensor(raw, chunk_size=3)
+    t2 = t1[t1 > 0]
+    s = spawn(f, args=(t2, 3))
+
+    result = s.execute().fetch()
+    expected = (raw[raw > 0] * 3).sum()
+    assert pytest.approx(result) == expected
+
+
+def test_none_outputs(setup):
+    def f(*_args):
+        pass
+
+    r1 = spawn(f, args=(0,))
+    r2 = spawn(f, args=(r1, 1))
+    r3 = spawn(f, args=(r1, 2))
+    r4 = spawn(f, args=(r2, r3))
+
+    assert r4.execute().fetch() is None
+
+
+def test_remote_with_unpickable(setup_cluster):
+    def f(*_):
+        class Unpickleable:
+            def __reduce__(self):
+                raise ValueError
+
+        raise KeyError(Unpickleable())
+
+    with pytest.raises(mo.SendMessageFailed):
+        d = spawn(f, retry_when_fail=False)
+        d.execute()
diff --git a/python/xorbits/_mars/remote/tests/test_run_script.py b/python/xorbits/_mars/remote/tests/test_run_script.py
new file mode 100644
index 000000000..6473b07cc
--- /dev/null
+++ b/python/xorbits/_mars/remote/tests/test_run_script.py
@@ -0,0 +1,65 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from io import BytesIO
+
+import pytest
+
+from ... import dataframe as md
+from ... import tensor as mt
+from .. import run_script
+
+script1 = b"""
+import os
+assert os.environ['WORLD_SIZE'] == '2'
+"""
+
+script2 = b"""
+assert session is not None
+"""
+
+script3 = b"""
+from mars.core.operand import Fetch
+from mars.deploy.oscar.session import AbstractSession
+
+assert AbstractSession.default is not None
+assert isinstance(tensor.op, Fetch)
+assert len(tensor.chunks) > 0
+assert isinstance(tensor.chunks[0].op, Fetch)
+assert tensor.fetch().sum() == df.fetch()['s'].sum()
+"""
+
+
+def test_local_run_script(setup_cluster):
+    s = BytesIO(script1)
+    assert run_script(s, n_workers=2).fetch()["status"] == "ok"
+
+
+def test_local_run_script_with_exec(setup_cluster):
+    s = BytesIO(script2)
+    assert run_script(s, n_workers=2).fetch()["status"] == "ok"
+
+
+def test_local_run_script_with_data(setup_cluster):
+    s = BytesIO(script3)
+    data = {"tensor": mt.arange(10), "df": md.DataFrame({"s": mt.arange(9, 0, -1)})}
+    assert run_script(s, data=data, n_workers=1).fetch()["status"] == "ok"
+
+    pytest.raises(TypeError, run_script, s, data=[])
+
+
+def test_run_with_file(setup_cluster):
+    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "sample_script.py")
+    assert run_script(path, n_workers=2).fetch()["status"] == "ok"
diff --git a/python/xorbits/_mars/resource.py b/python/xorbits/_mars/resource.py
new file mode 100644
index 000000000..65eb56f6b
--- /dev/null
+++ b/python/xorbits/_mars/resource.py
@@ -0,0 +1,413 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import math
+import os
+import subprocess  # nosec
+import sys
+import time
+from collections import namedtuple
+from typing import List, Optional
+
+import psutil
+
+from ._resource import Resource, ZeroResource
+from .lib import nvutils
+from .utils import get_bool_environ
+
+Resource = Resource
+ZeroResource = ZeroResource
+
+logger = logging.getLogger(__name__)
+
+CGROUP_V1_CPU_ACCT_FILE = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
+CGROUP_V1_MEM_STAT_FILE = "/sys/fs/cgroup/memory/memory.stat"
+CGROUP_V2_CPU_STAT_FILE = "/sys/fs/cgroup/cpu.stat"
+CGROUP_V2_MEM_CURRENT_FILE = "/sys/fs/cgroup/memory.current"
+CGROUP_V2_MEM_MAX_FILE = "/sys/fs/cgroup/memory.max"
+
+_is_cgroup_v2 = os.path.exists(CGROUP_V2_CPU_STAT_FILE)
+
+_proc = psutil.Process()
+_timer = getattr(time, "monotonic", time.time)
+
+_use_process_stat = get_bool_environ("MARS_USE_PROCESS_STAT")
+_use_cgroup_stat = get_bool_environ("MARS_USE_CGROUP_STAT")
+_cpu_use_process_stat = get_bool_environ("MARS_CPU_USE_PROCESS_STAT")
+_cpu_use_cgroup_stat = get_bool_environ("MARS_CPU_USE_CGROUP_STAT")
+_mem_use_process_stat = get_bool_environ("MARS_MEM_USE_PROCESS_STAT")
+_mem_use_cgroup_stat = get_bool_environ("MARS_MEM_USE_CGROUP_STAT")
+
+# if general config exists, overwrite individual ones
+if _use_process_stat is not None:
+    _cpu_use_process_stat = _mem_use_process_stat = _use_process_stat
+if _use_cgroup_stat is not None:
+    _cpu_use_cgroup_stat = _mem_use_cgroup_stat = _use_cgroup_stat
+
+if "MARS_CPU_TOTAL" in os.environ:
+    _cpu_total = int(math.ceil(float(os.environ["MARS_CPU_TOTAL"])))
+else:
+    _cpu_total = psutil.cpu_count(logical=True)
+
+if "MARS_MEMORY_TOTAL" in os.environ:
+    _mem_total = int(os.environ["MARS_MEMORY_TOTAL"])
+else:
+    _mem_total = None
+
+_virt_memory_stat = namedtuple("virtual_memory", "total available percent used free")
+
+_shm_path = [
+    pt.mountpoint
+    for pt in psutil.disk_partitions(all=True)
+    if pt.mountpoint in ("/tmp", "/dev/shm") and pt.fstype == "tmpfs"
+]
+if not _shm_path:
+    _shm_path = None
+else:
+    _shm_path = _shm_path[0]
+
+
+def _read_cgroup_stat_file(file_name: str):
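+    # cgroup stat files contain "<key> <value>" lines, e.g. "usage_usec 1234567"
+    # in cgroup v2 cpu.stat; lines that do not match this pattern are ignored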
+    with open(file_name, "r") as cg_file:
+        contents = cg_file.read()
+    kvs = dict()
+    for line in contents.splitlines():
+        parts = line.split(" ")
+        if len(parts) == 2:
+            kvs[parts[0]] = int(parts[1])
+    return kvs
+
+
+_root_pid = None
+
+
+def virtual_memory() -> _virt_memory_stat:
+    global _root_pid
+
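+    # three strategies, selected via the MARS_*_USE_*_STAT environment variables:
+    # cgroup accounting, whole-system psutil stats, or summing RSS over this process tree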
+    sys_mem = psutil.virtual_memory()
+    if _mem_use_cgroup_stat:
+        max_mem = min(_mem_total or sys_mem.total, sys_mem.total)
+        if _is_cgroup_v2:
+            # see Memory section in https://www.kernel.org/doc/Documentation/cgroup-v2.txt
+            with open(CGROUP_V2_MEM_MAX_FILE, "r") as mem_max_file:
+                max_str = mem_max_file.read().strip()
+                total = max_mem if max_str == "max" else int(max_str)
+            with open(CGROUP_V2_MEM_CURRENT_FILE, "r") as mem_current_file:
+                used = int(mem_current_file.read().strip())
+        else:
+            # see section 5.5 in https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
+            cgroup_mem_info = _read_cgroup_stat_file(CGROUP_V1_MEM_STAT_FILE)
+            total = cgroup_mem_info["hierarchical_memory_limit"]
+            total = min(max_mem, total)
+            used = cgroup_mem_info["rss"] + cgroup_mem_info.get("swap", 0)
+
+        if _shm_path:
+            shm_stats = psutil.disk_usage(_shm_path)
+            used += shm_stats.used
+        available = free = total - used
+        percent = 100.0 * (total - available) / total
+        return _virt_memory_stat(total, available, percent, used, free)
+    elif not _mem_use_process_stat:
+        total = min(_mem_total or sys_mem.total, sys_mem.total)
+        used = sys_mem.used + getattr(sys_mem, "shared", 0)
+        available = sys_mem.available
+        free = sys_mem.free
+        percent = 100.0 * (total - available) / total
+        return _virt_memory_stat(total, available, percent, used, free)
+    else:
+        used = 0
+        if _root_pid is None:
+            cur_proc = psutil.Process()
+            while True:
+                par_proc = cur_proc.parent()
+                if par_proc is None:
+                    break
+                try:
+                    cmd = par_proc.cmdline()
+                    if "python" not in " ".join(cmd).lower():
+                        break
+                    cur_proc = par_proc
+                except:  # noqa: E722  # nosec  # pylint: disable=bare-except  # pragma: no cover
+                    break
+            _root_pid = cur_proc.pid
+
+        root_proc = psutil.Process(_root_pid)
+        for p in root_proc.children(True):
+            try:
+                used += p.memory_info().rss
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                pass
+
+        if _shm_path:
+            shm_stats = psutil.disk_usage(_shm_path)
+            used += shm_stats.used
+
+        total = min(_mem_total or sys_mem.total, sys_mem.total)
+        # TODO sys_mem.available does not work in container
+        # available = min(sys_mem.available, total - used)
+        available = total - used
+        free = min(sys_mem.free, total - used)
+        percent = 100.0 * (total - available) / total
+        return _virt_memory_stat(total, available, percent, used, free)
+
+
+def cpu_count():
+    return _cpu_total
+
+
+def mem_total():
+    return virtual_memory().total
+
+
+_last_cgroup_cpu_measure = None
+_last_proc_cpu_measure = None
+_last_psutil_measure = None
+_last_cpu_percent = None
+_cpu_percent_interval = 0.1
+
+
+def _take_process_cpu_snapshot():
+    pts = dict()
+    sts = dict()
+    for p in psutil.process_iter():
+        try:
+            pts[p.pid] = p.cpu_times()
+            sts[p.pid] = _timer()
+        except (psutil.NoSuchProcess, psutil.AccessDenied):
+            pass
+    return pts, sts
+
+
+def cpu_percent():
+    global _last_cgroup_cpu_measure, _last_proc_cpu_measure, _last_cpu_percent, _last_psutil_measure
+    if _cpu_use_cgroup_stat:
+        if _is_cgroup_v2:
+            # see CPU section in https://www.kernel.org/doc/Documentation/cgroup-v2.txt
+            cpu_content = _read_cgroup_stat_file(CGROUP_V2_CPU_STAT_FILE)
+            cpu_acct = cpu_content["usage_usec"] * 1000
+        else:
+            # see https://www.kernel.org/doc/Documentation/cgroup-v1/cpuacct.txt
+            with open(CGROUP_V1_CPU_ACCT_FILE, "r") as cgroup_file:
+                cpu_acct = int(cgroup_file.read())
+        sample_time = _timer()
+
+        if _last_cgroup_cpu_measure is None:
+            _last_cgroup_cpu_measure = (cpu_acct, sample_time)
+            return None
+
+        last_cpu_acct, last_sample_time = _last_cgroup_cpu_measure
+        time_delta = sample_time - last_sample_time
+        if time_delta < _cpu_percent_interval:
+            return _last_cpu_percent or 0
+
+        _last_cgroup_cpu_measure = (cpu_acct, sample_time)
+        # cpu_acct is in nanoseconds; converting ns/s to a percentage means dividing by 1e7.
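+        # e.g. 2e9 ns of CPU time over a 1.0 s window: 2e9 / 1.0 / 1e7 = 200.0,
+        # i.e. roughly two fully busy cores (illustrative numbers only)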
+        _last_cpu_percent = round(
+            (cpu_acct - last_cpu_acct) / (sample_time - last_sample_time) / 1e7, 1
+        )
+        return _last_cpu_percent or 0
+    elif _cpu_use_process_stat:
+        pts, sts = _take_process_cpu_snapshot()
+
+        if _last_proc_cpu_measure is None:
+            _last_proc_cpu_measure = (pts, sts)
+            return None
+
+        old_pts, old_sts = _last_proc_cpu_measure
+
+        percents = []
+        for pid in pts:
+            if pid not in old_pts:
+                continue
+            pt1 = old_pts[pid]
+            pt2 = pts[pid]
+            delta_proc = (pt2.user - pt1.user) + (pt2.system - pt1.system)
+            time_delta = sts[pid] - old_sts[pid]
+
+            if time_delta < _cpu_percent_interval:
+                return _last_cpu_percent or 0
+            percents.append((delta_proc / time_delta) * 100)
+        _last_proc_cpu_measure = (pts, sts)
+        _last_cpu_percent = round(sum(percents), 1)
+        return _last_cpu_percent or 0
+    else:
+        measure_time = time.time()
+        if (
+            _last_psutil_measure is not None
+            and measure_time - _last_psutil_measure < _cpu_percent_interval
+        ):
+            return _last_cpu_percent or 0
+        _last_psutil_measure = measure_time
+        _last_cpu_percent = psutil.cpu_percent() * _cpu_total
+        return _last_cpu_percent or 0
+
+
+def disk_usage(d):
+    return psutil.disk_usage(d)
+
+
+def iowait():
+    cpu_percent = psutil.cpu_times_percent()
+    try:
+        return cpu_percent.iowait
+    except AttributeError:
+        return None
+
+
+_last_disk_io_metas = dict()
+_path_to_device = dict()
+_win_diskperf_called = False
+
+
+def get_path_device(path: str):
+    for part in psutil.disk_partitions(all=True):
+        if path.startswith(part.mountpoint):
+            return part.device
+    return None
+
+
+def _get_path_device(path: str):
+    if path in _path_to_device:
+        return _path_to_device[path]
+
+    for part in psutil.disk_partitions(all=True):
+        if path.startswith(part.mountpoint):
+            dev_name = _path_to_device[path] = part.device.replace("/dev/", "")
+            return dev_name
+    _path_to_device[path] = None
+    return None
+
+
+_disk_io_usage_type = namedtuple("_disk_io_usage_type", "reads writes")
+
+
+def disk_io_usage(path=None) -> Optional[_disk_io_usage_type]:
+    global _win_diskperf_called
+
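+    # read/write speeds are byte deltas divided by the elapsed seconds since the previous
+    # call for the same path; the first call only primes the cache and returns None
+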
+    # Needed by psutil.disk_io_counters() under newer versions of Windows:
+    # `diskperf -y` needs to be called, or no disk information can be found.
+    if sys.platform == "win32" and not _win_diskperf_called:
+        CREATE_NO_WINDOW = 0x08000000
+        try:
+            proc = subprocess.Popen(
+                ["diskperf", "-y"], shell=False, creationflags=CREATE_NO_WINDOW
+            )  # nosec
+            proc.wait()
+        except (subprocess.CalledProcessError, OSError):  # pragma: no cover
+            pass
+        _win_diskperf_called = True
+
+    if path is None:
+        disk_counters = psutil.disk_io_counters()
+    else:
+        dev_to_counters = psutil.disk_io_counters(perdisk=True)
+        disk_counters = dev_to_counters.get(_get_path_device(path))
+        if disk_counters is None:
+            return None
+    tst = time.time()
+
+    read_bytes = disk_counters.read_bytes
+    write_bytes = disk_counters.write_bytes
+    if path not in _last_disk_io_metas:
+        _last_disk_io_metas[path] = (read_bytes, write_bytes, tst)
+        return None
+
+    last_read_bytes, last_write_bytes, last_time = _last_disk_io_metas[path]
+    delta_time = tst - last_time
+    if delta_time == 0:
+        return None
+
+    read_speed = (read_bytes - last_read_bytes) / delta_time
+    write_speed = (write_bytes - last_write_bytes) / delta_time
+
+    _last_disk_io_metas[path] = (read_bytes, write_bytes, tst)
+    return _disk_io_usage_type(read_speed, write_speed)
+
+
+_last_net_io_meta = None
+
+
+def net_io_usage():
+    global _last_net_io_meta
+
+    net_counters = psutil.net_io_counters()
+    tst = time.time()
+
+    send_bytes = net_counters.bytes_sent
+    recv_bytes = net_counters.bytes_recv
+    if _last_net_io_meta is None:
+        _last_net_io_meta = (send_bytes, recv_bytes, tst)
+        return None
+
+    last_send_bytes, last_recv_bytes, last_time = _last_net_io_meta
+    delta_time = tst - last_time
+    if delta_time == 0:
+        return None
+
+    recv_speed = (recv_bytes - last_recv_bytes) / delta_time
+    send_speed = (send_bytes - last_send_bytes) / delta_time
+
+    _last_net_io_meta = (send_bytes, recv_bytes, tst)
+    return recv_speed, send_speed
+
+
+_cuda_info = namedtuple("cuda_info", "driver_version cuda_version products gpu_count")
+_cuda_card_stat = namedtuple(
+    "cuda_card_stat", "index product_name gpu_usage temperature fb_mem_info"
+)
+
+
+def cuda_info():  # pragma: no cover
+    driver_info = nvutils.get_driver_info()
+    if not driver_info:
+        return
+    gpu_count = nvutils.get_device_count()
+    return _cuda_info(
+        driver_version=driver_info.driver_version,
+        cuda_version=driver_info.cuda_version,
+        products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
+        gpu_count=gpu_count,
+    )
+
+
+def cuda_count():
+    return nvutils.get_device_count() or 0
+
+
+def cuda_card_stats() -> List[_cuda_card_stat]:  # pragma: no cover
+    infos = []
+    device_count = nvutils.get_device_count()
+    if not device_count:
+        return infos
+    for device_idx in range(device_count):
+        device_info = nvutils.get_device_info(device_idx)
+        device_status = nvutils.get_device_status(device_idx)
+
+        infos.append(
+            _cuda_card_stat(
+                index=device_info.index,
+                product_name=device_info.name,
+                gpu_usage=device_status.gpu_util,
+                temperature=device_status.temperature,
+                fb_mem_info=_virt_memory_stat(
+                    total=device_status.fb_total_mem,
+                    used=device_status.fb_used_mem,
+                    free=device_status.fb_free_mem,
+                    available=device_status.fb_free_mem,
+                    percent=device_status.mem_util,
+                ),
+            )
+        )
+    return infos
diff --git a/python/xorbits/_mars/serialization/__init__.py b/python/xorbits/_mars/serialization/__init__.py
new file mode 100644
index 000000000..9dd84d48d
--- /dev/null
+++ b/python/xorbits/_mars/serialization/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import arrow, cuda, exception, mars_objects, numpy, ray, scipy
+from .aio import AioDeserializer, AioSerializer
+from .core import Serializer, deserialize, serialize, serialize_with_spawn
+
+del arrow, cuda, numpy, scipy, mars_objects, ray, exception
diff --git a/python/xorbits/_mars/serialization/aio.py b/python/xorbits/_mars/serialization/aio.py
new file mode 100644
index 000000000..4030da1ee
--- /dev/null
+++ b/python/xorbits/_mars/serialization/aio.py
@@ -0,0 +1,140 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import struct
+from io import BytesIO
+from typing import Any, BinaryIO, Union
+
+import cloudpickle
+import numpy as np
+
+from ..utils import lazy_import
+from .core import deserialize, serialize_with_spawn
+
+rmm = lazy_import("rmm")
+
+DEFAULT_SERIALIZATION_VERSION = 1
+DEFAULT_SPAWN_THRESHOLD = 100
+BUFFER_SIZES_NAME = "buf_sizes"
+
+
+class AioSerializer:
+    def __init__(self, obj: Any, compress=0):
+        self._obj = obj
+        self._compress = compress
+
+    async def _get_buffers(self):
+        headers, buffers = await serialize_with_spawn(
+            self._obj, spawn_threshold=DEFAULT_SPAWN_THRESHOLD
+        )
+
+        def _is_cuda_buffer(buf: Union["rmm.DeviceBuffer", BinaryIO]):
+            return hasattr(buf, "__cuda_array_interface__")
+
+        is_cuda_buffers = [_is_cuda_buffer(buf) for buf in buffers]
+        headers[0]["is_cuda_buffers"] = np.array(is_cuda_buffers)
+
+        # add buffer lengths into headers
+        headers[0][BUFFER_SIZES_NAME] = [
+            buf.nbytes if hasattr(buf, "nbytes") else len(buf) for buf in buffers
+        ]
+        header = cloudpickle.dumps(headers)
+
+        # gen header buffer
+        header_bio = BytesIO()
+        # write version first
+        header_bio.write(struct.pack("B", DEFAULT_SERIALIZATION_VERSION))
+        # write header length
+        header_bio.write(struct.pack("<Q", len(header)))
diff --git a/python/xorbits/_mars/serialization/core.pyi b/python/xorbits/_mars/serialization/core.pyi
new file mode 100644
--- /dev/null
+++ b/python/xorbits/_mars/serialization/core.pyi
+from concurrent.futures import Executor
+from typing import Any, Callable, Dict, List, Tuple
+
+def buffered(func: Callable) -> Callable: ...
+def fast_id(obj: Any) -> int: ...
+
+class Serializer:
+    serializer_id: int
+    def serial(self, obj: Any, context: Dict): ...
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]): ...
+    def on_deserial_error(
+        self,
+        serialized: Tuple,
+        context: Dict,
+        subs_serialized: List,
+        error_index: int,
+        exc: BaseException,
+    ): ...
+    @classmethod
+    def register(cls, obj_type): ...
+    @classmethod
+    def unregister(cls, obj_type): ...
+
+class Placeholder:
+    id: int
+    callbacks: List[Callable]
+    def __init__(self, id_: int): ...
+    def __hash__(self): ...
+    def __eq__(self, other): ...
+
+def serialize(obj: Any, context: Dict = None): ...
+async def serialize_with_spawn(
+    obj: Any,
+    context: Dict = None,
+    spawn_threshold: int = 100,
+    executor: Executor = None,
+): ...
+def deserialize(headers: List, buffers: List, context: Dict = None): ...
diff --git a/python/xorbits/_mars/serialization/core.pyx b/python/xorbits/_mars/serialization/core.pyx
new file mode 100644
index 000000000..e11656b4e
--- /dev/null
+++ b/python/xorbits/_mars/serialization/core.pyx
@@ -0,0 +1,934 @@
+# distutils: language = c++
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import datetime
+import enum
+import hashlib
+import inspect
+import sys
+from functools import partial, wraps
+from typing import Any, Dict, List
+
+import numpy as np
+import pandas as pd
+
+from cpython cimport PyObject
+from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t, uintptr_t
+from libcpp.unordered_map cimport unordered_map
+
+from .._utils import NamedType
+
+from .._utils cimport TypeDispatcher
+
+import cloudpickle
+
+if sys.version_info[:2] < (3, 8):  # pragma: no cover
+    try:
+        import pickle5 as pickle  # nosec  # pylint: disable=import_pickle
+    except ImportError:
+        import pickle  # nosec  # pylint: disable=import_pickle
+else:
+    import pickle  # nosec  # pylint: disable=import_pickle
+
+# resolve pandas pickle compatibility between <1.2 and >=1.3
+try:
+    from pandas.core.internals import blocks as pd_blocks
+    if not hasattr(pd_blocks, "new_block") and hasattr(pd_blocks, "make_block"):
+        # register the missing function to avoid unpickling errors
+        pd_blocks.new_block = pd_blocks.make_block
+except (ImportError, AttributeError):
+    pass
+
+BUFFER_PICKLE_PROTOCOL = max(pickle.DEFAULT_PROTOCOL, 5)
+cdef bint HAS_PICKLE_BUFFER = pickle.HIGHEST_PROTOCOL >= 5
+cdef bint _PANDAS_HAS_MGR = hasattr(pd.Series([0]), "_mgr")
+
+
+cdef TypeDispatcher _serial_dispatcher = TypeDispatcher()
+cdef dict _deserializers = dict()
+
+cdef uint32_t _MAX_STR_PRIMITIVE_LEN = 1024
+# prime modulus for serializer ids
+# use the largest prime number smaller than 32767
+cdef int32_t _SERIALIZER_ID_PRIME = 32749
+
+
+cdef class Serializer:
+    serializer_id = None
+
+    def __cinit__(self):
+        # make the value referenceable from C code
+        self._serializer_id = self.serializer_id
+
+    cpdef serial(self, object obj, dict context):
+        """
+        Returns the intermediate serialization result of a certain object.
+        The returned value can be a Placeholder or a tuple comprising
+        three parts: a header, a group of subcomponents and
+        a finalizing flag.
+
+        * Header is a pickle-serializable tuple
+        * Subcomponents are parts or buffers for iterative
+          serialization.
+        * Flag is a boolean value. If true, subcomponents should be
+          buffers (for instance, bytes, memory views, GPU buffers,
+          etc.) that can be read and written directly. If false,
+          subcomponents will be serialized iteratively.
+
+        Parameters
+        ----------
+        obj: Any
+            Object to serialize
+        context: Dict
+            Serialization context that helps create Placeholder objects
+            to reduce duplicated serialization
+
+        Returns
+        -------
+        result: Placeholder | Tuple[Tuple, List, bool]
+            Intermediate result of serialization
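+
+        As a rough illustration (not the output of any specific built-in
+        serializer), a serializer for a two-element list ``[a, b]`` could
+        return ``((), [a, b], False)``: an empty header, the two elements
+        as subcomponents to be serialized iteratively, and a False flag.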
+        """
+        raise NotImplementedError
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        """
+        Returns deserialized object given serialized headers and
+        deserialized subcomponents.
+
+        Parameters
+        ----------
+        serialized: Tuple
+            Serialized object header as a tuple
+        context
+            Serialization context for instantiation of Placeholder
+            objects
+        subs: List
+            Deserialized subcomponents
+
+        Returns
+        -------
+        result: Any
+            Deserialized object
+        """
+        raise NotImplementedError
+
+    cpdef on_deserial_error(
+        self,
+        tuple serialized,
+        dict context,
+        list subs_serialized,
+        int error_index,
+        object exc,
+    ):
+        """
+        Returns rewritten exception when subcomponent deserialization fails
+
+        Parameters
+        ----------
+        serialized: Tuple
+            Serialized object header as a tuple
+        context
+            Serialization context for instantiation of Placeholder
+            objects
+        subs_serialized: List
+            Serialized subcomponents
+        error_index: int
+            Index of subcomponent causing error
+        exc: BaseException
+            Exception raised
+
+        Returns
+        -------
+        exc: BaseException | None
+            Rewritten exception. If None, original exception is kept.
+        """
+        return None
+
+    @classmethod
+    def calc_default_serializer_id(cls):
+        s = f"{cls.__module__}.{cls.__qualname__}"
+        h = hashlib.md5(s.encode())
+        return int(h.hexdigest(), 16) % _SERIALIZER_ID_PRIME
+
+    @classmethod
+    def register(cls, obj_type, name=None):
+        if (
+            cls.serializer_id is None
+            or cls.serializer_id == getattr(super(cls, cls), "serializer_id", None)
+        ):
+            # a class should have its own serializer_id;
+            # an inherited serializer_id is not acceptable
+            cls.serializer_id = cls.calc_default_serializer_id()
+
+        inst = cls()
+        if name is not None:
+            obj_type = NamedType(name, obj_type)
+        _serial_dispatcher.register(obj_type, inst)
+        if _deserializers.get(cls.serializer_id) is not None:
+            assert type(_deserializers[cls.serializer_id]) is cls
+        else:
+            _deserializers[cls.serializer_id] = inst
+
+    @classmethod
+    def unregister(cls, obj_type, name=None):
+        if name is not None:
+            obj_type = NamedType(name, obj_type)
+        _serial_dispatcher.unregister(obj_type)
+        _deserializers.pop(cls.serializer_id, None)
+
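For readers skimming the contract above, a minimal custom serializer might look like the sketch below (illustrative only: the Point type and its id are made up, and the import path assumes the compiled module is reachable as xorbits._mars.serialization.core):

    from xorbits._mars.serialization.core import Serializer  # path assumed

    class Point:
        def __init__(self, x, y):
            self.x, self.y = x, y

    class PointSerializer(Serializer):
        # any unique id works; leaving it unset makes register() derive one
        # from calc_default_serializer_id()
        serializer_id = 32001

        def serial(self, obj, context):
            # everything fits into the picklable header, no raw buffers: final=True
            return (obj.x, obj.y), [], True

        def deserial(self, serialized, context, subs):
            return Point(*serialized)

    PointSerializer.register(Point)

Once registered, the serialize()/deserialize() helpers defined later in this module route Point instances through PointSerializer.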
+
+cdef inline uint64_t _fast_id(object obj) nogil:
+    return <uintptr_t><PyObject*>obj
+
+
+def fast_id(obj):
+    """C version of id() used for serialization"""
+    return _fast_id(obj)
+
+
+def buffered(func):
+    """
+    Wrapper for serial() method to reduce duplicated serialization
+    """
+    @wraps(func)
+    def wrapped(self, obj: Any, dict context):
+        cdef uint64_t obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(_fast_id(obj))
+        else:
+            context[obj_id] = obj
+            return func(self, obj, context)
+
+    return wrapped
+
+
+def pickle_buffers(obj):
+    cdef list buffers = [None]
+
+    if HAS_PICKLE_BUFFER:
+
+        def buffer_cb(x):
+            x = x.raw()
+            if x.ndim > 1:
+                # ravel n-d memoryview
+                x = x.cast(x.format)
+            buffers.append(memoryview(x))
+
+        buffers[0] = cloudpickle.dumps(
+            obj,
+            buffer_callback=buffer_cb,
+            protocol=BUFFER_PICKLE_PROTOCOL,
+        )
+    else:  # pragma: no cover
+        buffers[0] = cloudpickle.dumps(obj)
+    return buffers
+
+
+def unpickle_buffers(buffers):
+    result = cloudpickle.loads(buffers[0], buffers=buffers[1:])
+
+    # pandas prior to 1.1.0 uses _data instead of _mgr to hold the BlockManager,
+    # so deserializing objects pickled by newer versions may produce malfunctioning
+    # pandas objects; thus this patch is needed
+    if _PANDAS_HAS_MGR:
+        return result
+    else:  # pragma: no cover
+        if hasattr(result, "_mgr") and isinstance(result, (pd.DataFrame, pd.Series)):
+            result._data = getattr(result, "_mgr")
+            delattr(result, "_mgr")
+        return result
+
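A small sketch of what these helpers produce for a NumPy array (illustrative; the import path is an assumption and out-of-band buffers depend on pickle protocol 5 support in the running NumPy):

    import numpy as np

    from xorbits._mars.serialization.core import pickle_buffers, unpickle_buffers

    arr = np.arange(8, dtype="int64")
    bufs = pickle_buffers(arr)  # [pickled metadata, raw data buffer(s)]
    if len(bufs) > 1:
        # the payload stays out-of-band instead of being copied into the pickle
        assert bytes(bufs[1]) == arr.tobytes()
    assert (unpickle_buffers(bufs) == arr).all()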
+
+cdef class PickleSerializer(Serializer):
+    serializer_id = 0
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return (), pickle_buffers(obj), True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        return unpickle_buffers(subs)
+
+
+cdef set _primitive_types = {
+    type(None),
+    bool,
+    int,
+    float,
+    complex,
+    datetime.datetime,
+    datetime.date,
+    datetime.timedelta,
+    enum.Enum,
+    type(max),  # builtin functions
+    np.dtype,
+    np.number,
+}
+
+
+class PrimitiveSerializer(Serializer):
+    serializer_id = 1
+
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        return (obj,), [], True
+
+    def deserial(self, tuple obj, context: Dict, subs: List[Any]):
+        return obj[0]
+
+
+cdef class BytesSerializer(Serializer):
+    serializer_id = 2
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return (), [obj], True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        return subs[0]
+
+
+cdef class StrSerializer(Serializer):
+    serializer_id = 3
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return (), [(<str>obj).encode()], True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        buffer = subs[0]
+        if type(buffer) is memoryview:
+            buffer = buffer.tobytes()
+        return buffer.decode()
+
+
+cdef class CollectionSerializer(Serializer):
+    obj_type = None
+
+    cdef object _obj_type
+
+    def __cinit__(self):
+        # make the value referenceable from C code
+        self._obj_type = self.obj_type
+
+    cdef tuple _serial_iterable(self, obj: Any):
+        cdef list idx_to_propagate = []
+        cdef list obj_to_propagate = []
+        cdef list obj_list = obj if type(obj) is list else list(obj)
+        cdef int64_t idx
+        cdef object item
+
+        for idx in range(len(obj_list)):
+            item = obj_list[idx]
+
+            if type(item) is bytes and len(item) < _MAX_STR_PRIMITIVE_LEN:
+                # treat short bytes objects as primitives
+                continue
+            elif type(item) is str and len(item) < _MAX_STR_PRIMITIVE_LEN:
+                # treat short strings as primitives
+                continue
+            elif type(item) in _primitive_types:
+                continue
+
+            if obj is obj_list:
+                obj_list = list(obj)
+
+            obj_list[idx] = None
+            idx_to_propagate.append(idx)
+            obj_to_propagate.append(item)
+
+        if self._obj_type is not None and type(obj) is not self._obj_type:
+            obj_type = type(obj)
+        else:
+            obj_type = None
+        return (obj_list, idx_to_propagate, obj_type), obj_to_propagate, False
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        return self._serial_iterable(obj)
+
+    cdef list _deserial_iterable(self, tuple serialized, list subs):
+        cdef list res_list, idx_to_propagate
+        cdef int64_t i
+
+        res_list, idx_to_propagate, _ = serialized
+
+        for i in range(len(idx_to_propagate)):
+            res_list[idx_to_propagate[i]] = subs[i]
+        return res_list
+
+
+cdef class TupleSerializer(CollectionSerializer):
+    serializer_id = 4
+    obj_type = tuple
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef list res = self._deserial_iterable(serialized, subs)
+        for v in res:
+            assert type(v) is not Placeholder
+
+        obj_type = serialized[-1] or tuple
+        if hasattr(obj_type, "_fields"):
+            # namedtuple
+            return obj_type(*res)
+        else:
+            return obj_type(res)
+
+
+cdef class ListSerializer(CollectionSerializer):
+    serializer_id = 5
+    obj_type = list
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef int64_t idx
+        cdef list res = self._deserial_iterable(serialized, subs)
+
+        obj_type = serialized[-1]
+        if obj_type is None:
+            result = res
+        else:
+            result = obj_type(res)
+
+        for idx, v in enumerate(res):
+            if type(v) is Placeholder:
+                cb = partial(result.__setitem__, idx)
+                (<Placeholder>v).callbacks.append(cb)
+        return result
+
+
+def _dict_key_replacer(ret, key, real_key):
+    ret[real_key] = ret.pop(key)
+
+
+def _dict_value_replacer(context, ret, key, real_value):
+    if type(key) is Placeholder:
+        key = context[(<Placeholder>key).id]
+    ret[key] = real_value
+
+
+cdef class DictSerializer(CollectionSerializer):
+    serializer_id = 6
+    cdef set _inspected_inherits
+
+    def __cinit__(self):
+        self._inspected_inherits = set()
+
+    cpdef serial(self, obj: Any, dict context):
+        cdef uint64_t obj_id
+        cdef tuple key_obj, value_obj
+        cdef list key_bufs, value_bufs
+
+        if type(obj) is dict and len(obj) == 0:
+            return (), [], True
+
+        obj_id = _fast_id(obj)
+        if obj_id in context:
+            return Placeholder(obj_id)
+        context[obj_id] = obj
+
+        obj_type = type(obj)
+
+        if obj_type is not dict and obj_type not in self._inspected_inherits:
+            inspect_init = inspect.getfullargspec(obj_type.__init__)
+            if (
+                inspect_init.args == ["self"]
+                and not inspect_init.varargs
+                and not inspect_init.varkw
+            ):
+                # inherited dicts may not have proper initializers
+                # for deserialization;
+                # remove the context entry to generate a real serialized result
+                context.pop(obj_id)
+                return (obj,), [], True
+            else:
+                self._inspected_inherits.add(obj_type)
+
+        key_obj, key_bufs, _ = self._serial_iterable(obj.keys())
+        value_obj, value_bufs, _ = self._serial_iterable(obj.values())
+        if obj_type is dict:
+            obj_type = None
+        ser_obj = (key_obj[:-1], value_obj[:-1], len(key_bufs), obj_type)
+        return ser_obj, key_bufs + value_bufs, False
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        cdef int64_t i, num_key_bufs
+        cdef list key_subs, value_subs, keys, values
+
+        if not serialized:
+            return {}
+        if len(serialized) == 1:
+            # serialized directly
+            return serialized[0]
+
+        key_serialized, value_serialized, num_key_bufs, obj_type = serialized
+        key_subs = subs[:num_key_bufs]
+        value_subs = subs[num_key_bufs:]
+
+        keys = self._deserial_iterable(key_serialized + (None,), key_subs)
+        values = self._deserial_iterable(value_serialized + (None,), value_subs)
+
+        if obj_type is None:
+            ret = dict(zip(keys, values))
+        else:
+            try:
+                ret = obj_type(zip(keys, values))
+            except TypeError:
+                # first arg of defaultdict is a callable
+                ret = obj_type()
+                ret.update(zip(keys, values))
+
+        for i in range(len(keys)):
+            k, v = keys[i], values[i]
+            if type(k) is Placeholder:
+                (<Placeholder>k).callbacks.append(
+                    partial(_dict_key_replacer, ret, k)
+                )
+            if type(v) is Placeholder:
+                (<Placeholder>v).callbacks.append(
+                    partial(_dict_value_replacer, context, ret, k)
+                )
+        return ret
+
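A short sketch of dict round-tripping, including a Python-level dict subclass whose type is preserved (illustrative; it assumes the package re-exports serialize and deserialize from this module, as upstream Mars does):

    from xorbits._mars.serialization import deserialize, serialize

    class AttrDict(dict):
        # toy subclass used only for this illustration
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

    header, buffers = serialize({"a": 1, "b": [2, 3]})
    assert deserialize(header, buffers) == {"a": 1, "b": [2, 3]}

    restored = deserialize(*serialize(AttrDict(a=1)))
    assert type(restored) is AttrDict and restored == {"a": 1}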
+
+cdef class Placeholder:
+    """
+    Placeholder object to reduce duplicated serialization
+
+    The object records the original object's identifier and keeps callbacks
+    to replace itself in parent objects.
+    """
+    cdef public uint64_t id
+    cdef public list callbacks
+
+    def __init__(self, uint64_t id_):
+        self.id = id_
+        self.callbacks = []
+
+    def __hash__(self):
+        return self.id
+
+    def __eq__(self, other):  # pragma: no cover
+        if type(other) is not Placeholder:
+            return False
+        return self.id == other.id
+
+    def __repr__(self):
+        return (
+            f"Placeholder(id={self.id}, "
+            f"callbacks=[list of {len(self.callbacks)}])"
+        )
+
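The effect of Placeholder on duplicated references can be seen in a sketch like the following (illustrative; serialize and deserialize are the module-level helpers defined further down, and the import path is assumed):

    import numpy as np

    from xorbits._mars.serialization import deserialize, serialize

    arr = np.arange(4)
    header, buffers = serialize([arr, arr, {"again": arr}])
    restored = deserialize(header, buffers)
    # the array is serialized once; the other occurrences become Placeholders
    # that are resolved back to the same deserialized object
    assert restored[0] is restored[1] is restored[2]["again"]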
+
+cdef class PlaceholderSerializer(Serializer):
+    serializer_id = 7
+
+    cpdef serial(self, obj: Any, dict context):
+        return (), [], True
+
+    cpdef deserial(self, tuple serialized, dict context, list subs):
+        return Placeholder(0)
+
+
+PickleSerializer.register(object)
+for _primitive in _primitive_types:
+    PrimitiveSerializer.register(_primitive)
+BytesSerializer.register(bytes)
+BytesSerializer.register(memoryview)
+StrSerializer.register(str)
+ListSerializer.register(list)
+TupleSerializer.register(tuple)
+DictSerializer.register(dict)
+PlaceholderSerializer.register(Placeholder)
+
+
+cdef class _SerialStackItem:
+    cdef public tuple serialized
+    cdef public list subs
+    cdef public list subs_serialized
+
+    def __cinit__(self, tuple serialized, list subs):
+        self.serialized = serialized
+        self.subs = subs
+        self.subs_serialized = []
+
+
+cdef class _IdContextHolder:
+    cdef unordered_map[uint64_t, uint64_t] d
+
+
+cdef int _COMMON_HEADER_LEN = 4
+
+
+cdef tuple _serial_single(
+    obj, dict context, _IdContextHolder id_context_holder
+):
+    """Serialize single object and return serialized tuples"""
+    cdef uint64_t obj_id, ordered_id
+    cdef Serializer serializer
+    cdef tuple common_header, serialized
+
+    while True:
+        name = context.get("serializer")
+        obj_type = type(obj) if name is None else NamedType(name, type(obj))
+        serializer = _serial_dispatcher.get_handler(obj_type)
+        ret_serial = serializer.serial(obj, context)
+        if type(ret_serial) is tuple:
+            # object is serialized, form a common header and return
+            serialized, subs, final = ret_serial
+
+            if type(obj) is Placeholder:
+                obj_id = (<Placeholder>obj).id
+                ordered_id = id_context_holder.d[obj_id]
+            else:
+                obj_id = _fast_id(obj)
+                ordered_id = id_context_holder.d.size()
+                id_context_holder.d[obj_id] = ordered_id
+
+            # REMEMBER to change _COMMON_HEADER_LEN when the content of
+            # this header changes
+            common_header = (
+                serializer._serializer_id, ordered_id, len(subs), final
+            )
+            break
+        else:
+            # object is converted into another (usually a Placeholder)
+            obj = ret_serial
+    return common_header + serialized, subs, final
+
+
+class _SerializeObjectOverflow(Exception):
+    def __init__(self, tuple cur_serialized, int num_total_serialized):
+        super(_SerializeObjectOverflow, self).__init__(cur_serialized)
+        self.cur_serialized = cur_serialized
+        self.num_total_serialized = num_total_serialized
+
+
+cpdef object _serialize_with_stack(
+    list serial_stack,
+    tuple serialized,
+    dict context,
+    _IdContextHolder id_context_holder,
+    list result_bufs_list,
+    int64_t num_overflow = 0,
+    int64_t num_total_serialized = 0,
+):
+    cdef _SerialStackItem stack_item
+    cdef list subs
+    cdef bint final
+    cdef int64_t num_sub_serialized
+    cdef bint is_resume = num_total_serialized > 0
+
+    while serial_stack:
+        stack_item = serial_stack[-1]
+        if serialized is not None:
+            # have previously-serialized results, record first
+            stack_item.subs_serialized.append(serialized)
+
+        num_sub_serialized = len(stack_item.subs_serialized)
+        if len(stack_item.subs) == num_sub_serialized:
+            # all subcomponents serialized, serialization of current is done
+            # and we can move to the parent object
+            serialized = stack_item.serialized + tuple(stack_item.subs_serialized)
+            num_total_serialized += 1
+            serial_stack.pop()
+        else:
+            # serialize next subcomponent at stack top
+            serialized, subs, final = _serial_single(
+                stack_item.subs[num_sub_serialized], context, id_context_holder
+            )
+            num_total_serialized += 1
+            if final or not subs:
+                # the subcomponent is a leaf
+                if subs:
+                    result_bufs_list.extend(subs)
+            else:
+                # the subcomponent has its own subcomponents, so we push it
+                # onto the stack and process its children
+                stack_item = _SerialStackItem(serialized, subs)
+                serial_stack.append(stack_item)
+                # note that the serialized header should not be recorded
+                # as we are now processing the subcomponent itself
+                serialized = None
+        if 0 < num_overflow < num_total_serialized:
+            raise _SerializeObjectOverflow(serialized, num_total_serialized)
+
+    # we keep an empty dict for extra metadata required by other modules
+    if is_resume:
+        # record the number of serialized objects when resumed
+        extra_meta = {"_N": num_total_serialized}
+    else:
+        # otherwise do not record the number, to reduce the result size
+        extra_meta = {}
+    return (extra_meta, serialized), result_bufs_list
+
+
+def serialize(obj, dict context = None):
+    """
+    Serialize an object and return a header and buffers.
+    Buffers are intended for zero-copy data manipulation.
+
+    Parameters
+    ----------
+    obj: Any
+        Object to serialize
+    context:
+        Serialization context for instantiation of Placeholder
+        objects
+
+    Returns
+    -------
+    result: Tuple[Tuple, List]
+        Picklable header and buffers
+    """
+    cdef list serial_stack = []
+    cdef list result_bufs_list = []
+    cdef tuple serialized
+    cdef list subs
+    cdef bint final
+    cdef _IdContextHolder id_context_holder = _IdContextHolder()
+
+    context = context if context is not None else dict()
+    serialized, subs, final = _serial_single(obj, context, id_context_holder)
+    if final or not subs:
+        # marked as a leaf node, return directly
+        return ({}, serialized), subs
+
+    serial_stack.append(_SerialStackItem(serialized, subs))
+    return _serialize_with_stack(
+        serial_stack, None, context, id_context_holder, result_bufs_list
+    )
+
+
+async def serialize_with_spawn(
+    obj, dict context = None, int spawn_threshold = 100, object executor = None
+):
+    """
+    Serialize an object and return a header and buffers.
+    Buffers are intended for zero-copy data manipulation.
+
+    Parameters
+    ----------
+    obj: Any
+        Object to serialize
+    context: Dict
+        Serialization context for instantiation of Placeholder
+        objects
+    spawn_threshold: int
+        Number of objects to serialize inline before spawning the rest
+        into a ThreadPoolExecutor
+    executor: ThreadPoolExecutor
+        ThreadPoolExecutor to spawn the rest of the serialization into
+
+    Returns
+    -------
+    result: Tuple[Tuple, List]
+        Picklable header and buffers
+    """
+    cdef list serial_stack = []
+    cdef list result_bufs_list = []
+    cdef tuple serialized
+    cdef list subs
+    cdef bint final
+    cdef _IdContextHolder id_context_holder = _IdContextHolder()
+
+    context = context if context is not None else dict()
+    serialized, subs, final = _serial_single(obj, context, id_context_holder)
+    if final or not subs:
+        # marked as a leaf node, return directly
+        return ({}, serialized), subs
+
+    serial_stack.append(_SerialStackItem(serialized, subs))
+
+    try:
+        result = _serialize_with_stack(
+            serial_stack, None, context, id_context_holder, result_bufs_list, spawn_threshold
+        )
+    except _SerializeObjectOverflow as ex:
+        result = await asyncio.get_running_loop().run_in_executor(
+            executor,
+            _serialize_with_stack,
+            serial_stack,
+            ex.cur_serialized,
+            context,
+            id_context_holder,
+            result_bufs_list,
+            0,
+            ex.num_total_serialized,
+        )
+    return result
+
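A usage sketch for the spawning path (illustrative; the object, threshold, and import path are assumptions):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    from xorbits._mars.serialization.core import serialize_with_spawn  # path assumed

    async def main():
        big = [list(range(100)) for _ in range(10_000)]
        with ThreadPoolExecutor(max_workers=1) as pool:
            # the first spawn_threshold objects are serialized on the event loop,
            # the remainder inside the executor
            header, buffers = await serialize_with_spawn(
                big, spawn_threshold=100, executor=pool
            )
        return header, buffers

    asyncio.run(main())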
+
+cdef class _DeserialStackItem:
+    cdef public tuple serialized
+    cdef public tuple subs
+    cdef public list subs_deserialized
+
+    def __cinit__(self, tuple serialized, tuple subs):
+        self.serialized = serialized
+        self.subs = subs
+        self.subs_deserialized = []
+
+
+cdef _deserial_single(tuple serialized, dict context, list subs):
+    """Deserialize a single object"""
+    cdef Serializer serializer
+    cdef int64_t num_subs
+
+    serializer_id, obj_id, num_subs, final = serialized[:_COMMON_HEADER_LEN]
+    serializer = _deserializers[serializer_id]
+    res = serializer.deserial(serialized[_COMMON_HEADER_LEN:], context, subs)
+
+    if type(res) is Placeholder:
+        try:
+            res = context[obj_id]
+        except KeyError:
+            (<Placeholder>res).id = obj_id
+
+    # get previously-recorded context values
+    context_val, context[obj_id] = context.get(obj_id), res
+    # if previously recorded object is a Placeholder,
+    # replace it with callbacks
+    if type(context_val) is Placeholder:
+        for cb in (<Placeholder>context_val).callbacks:
+            cb(res)
+    return res
+
+
+def deserialize(tuple serialized, list buffers, dict context = None):
+    """
+    Deserialize an object with serialized headers and buffers
+
+    Parameters
+    ----------
+    serialized: Tuple
+        Serialized object header
+    buffers: List
+        List of buffers extracted from serialize() calls
+    context: Dict
+        Serialization context for replacing Placeholder
+        objects
+
+    Returns
+    -------
+    result: Any
+        Deserialized object
+    """
+    cdef list deserial_stack = []
+    cdef _DeserialStackItem stack_item
+    cdef int64_t num_subs, num_deserialized, buf_pos = 0
+    cdef bint final
+    cdef Serializer serializer
+    cdef object deserialized = None, exc_value = None
+    cdef bint has_deserialized = False
+
+    context = context if context is not None else dict()
+    # drop extra meta field
+    serialized = serialized[-1]
+    serializer_id, obj_id, num_subs, final = serialized[:4]
+    if final or num_subs == 0:
+        # marked as a leaf node, return directly
+        return _deserial_single(serialized, context, buffers)
+
+    deserial_stack.append(
+        _DeserialStackItem(
+            serialized[:-num_subs], serialized[-num_subs:]
+        )
+    )
+
+    while deserial_stack:
+        stack_item = deserial_stack[-1]
+        # the deserialized result can be None, hence we cannot
+        # simply rely on the deserialized value itself
+        if has_deserialized:
+            # have previously-deserialized results, record first
+            stack_item.subs_deserialized.append(deserialized)
+        elif exc_value is not None:
+            # an exception occurred in a successor component; try to rewrite it
+            # and pass it to predecessors
+            serializer_id = stack_item.serialized[0]
+            serializer = _deserializers[serializer_id]
+            new_exc_value = serializer.on_deserial_error(
+                stack_item.serialized[_COMMON_HEADER_LEN:],
+                context,
+                list(stack_item.subs),
+                len(stack_item.subs_deserialized),
+                exc_value,
+            )
+            exc_value = new_exc_value if new_exc_value is not None else exc_value
+            deserial_stack.pop()
+            continue
+
+        num_deserialized = len(stack_item.subs_deserialized)
+        if len(stack_item.subs) == num_deserialized:
+            try:
+                # all subcomponents deserialized, we can deserialize the object itself
+                deserialized = _deserial_single(
+                    stack_item.serialized, context, stack_item.subs_deserialized
+                )
+                has_deserialized = True
+                deserial_stack.pop()
+            except BaseException as ex:
+                has_deserialized = False
+                exc_value = ex
+                deserial_stack.pop()
+        else:
+            # select next subcomponent to process
+            serialized = stack_item.subs[num_deserialized]
+            serializer_id, obj_id, num_subs, final = serialized[:4]
+            if final or num_subs == 0:
+                try:
+                    # next subcomponent is a leaf, just deserialize
+                    deserialized = _deserial_single(
+                        serialized, context, buffers[buf_pos : buf_pos + num_subs]
+                    )
+                    has_deserialized = True
+                    buf_pos += num_subs
+                except BaseException as ex:
+                    has_deserialized = False
+                    exc_value = ex
+            else:
+                # next subcomponent has its own subcomponents, so we push it
+                # onto the stack and start handling its children
+                stack_item = _DeserialStackItem(
+                    serialized[:-num_subs], serialized[-num_subs:]
+                )
+                deserial_stack.append(stack_item)
+                # note that the deserialized state should be cleared
+                # as we are just starting to handle the subcomponent itself
+                has_deserialized = False
+
+    if exc_value is not None:
+        raise exc_value
+    return deserialized
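Putting serialize() and deserialize() together (illustrative; the import path is an assumption):

    import numpy as np
    import pandas as pd

    from xorbits._mars.serialization import deserialize, serialize

    obj = {"arr": np.arange(10), "df": pd.DataFrame({"a": [1, 2, 3]})}
    header, buffers = serialize(obj)
    # header is picklable; buffers carry the raw payloads and can travel
    # out-of-band (sockets, shared memory) without extra copies
    restored = deserialize(header, buffers)
    np.testing.assert_array_equal(restored["arr"], obj["arr"])
    pd.testing.assert_frame_equal(restored["df"], obj["df"])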
diff --git a/python/xorbits/_mars/serialization/cuda.py b/python/xorbits/_mars/serialization/cuda.py
new file mode 100644
index 000000000..f5794bcbe
--- /dev/null
+++ b/python/xorbits/_mars/serialization/cuda.py
@@ -0,0 +1,110 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import pandas as pd
+
+from ..utils import lazy_import
+from .core import Serializer, buffered
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+
+
+class CupySerializer(Serializer):
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        if not (obj.flags["C_CONTIGUOUS"] or obj.flags["F_CONTIGUOUS"]):
+            obj = cupy.array(obj, copy=True)
+
+        header = obj.__cuda_array_interface__.copy()
+        header["strides"] = tuple(obj.strides)
+        header["lengths"] = [obj.nbytes]
+        buffer = cupy.ndarray(
+            shape=(obj.nbytes,), dtype=cupy.dtype("u1"), memptr=obj.data, strides=(1,)
+        )
+        return (header,), [buffer], True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List):
+        (header,) = serialized
+        return cupy.ndarray(
+            shape=header["shape"],
+            dtype=header["typestr"],
+            memptr=cupy.asarray(subs[0]).data,
+            strides=header["strides"],
+        )
+
+
+class CudfSerializer(Serializer):
+    @staticmethod
+    def _get_ext_index_type(index_obj):
+        import cudf
+
+        multi_index_type = None
+        if isinstance(index_obj, pd.MultiIndex):
+            multi_index_type = "pandas"
+        elif isinstance(index_obj, cudf.MultiIndex):
+            multi_index_type = "cudf"
+
+        if multi_index_type is None:
+            return None
+        return {
+            "index_type": multi_index_type,
+            "names": list(index_obj.names),
+        }
+
+    @staticmethod
+    def _apply_index_type(obj, attr, header):
+        import cudf
+
+        multi_index_cls = (
+            pd.MultiIndex if header["index_type"] == "pandas" else cudf.MultiIndex
+        )
+        original_index = getattr(obj, attr)
+        if isinstance(original_index, (pd.MultiIndex, cudf.MultiIndex)):
+            return
+        new_index = multi_index_cls.from_tuples(original_index, names=header["names"])
+        setattr(obj, attr, new_index)
+
+    def serial(self, obj: Any, context: Dict):
+        header, buffers = obj.device_serialize()
+        if hasattr(obj, "columns"):
+            header["_ext_columns"] = self._get_ext_index_type(obj.columns)
+        if hasattr(obj, "index"):
+            header["_ext_index"] = self._get_ext_index_type(obj.index)
+        return (header,), buffers, True
+
+    def deserial(self, serialized: Tuple, context: Dict, buffers: List):
+        from cudf.core.abc import Serializable
+
+        (header,) = serialized
+        col_header = header.pop("_ext_columns", None)
+        index_header = header.pop("_ext_index", None)
+
+        result = Serializable.device_deserialize(header, buffers)
+
+        if col_header is not None:
+            self._apply_index_type(result, "columns", col_header)
+        if index_header is not None:
+            self._apply_index_type(result, "index", index_header)
+        return result
+
+
+if cupy is not None:
+    CupySerializer.register("cupy.ndarray")
+if cudf is not None:
+    CudfSerializer.register("cudf.DataFrame")
+    CudfSerializer.register("cudf.Series")
+    CudfSerializer.register("cudf.Index")
diff --git a/python/xorbits/_mars/serialization/exception.py b/python/xorbits/_mars/serialization/exception.py
new file mode 100644
index 000000000..6fea0d193
--- /dev/null
+++ b/python/xorbits/_mars/serialization/exception.py
@@ -0,0 +1,46 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle  # nosec  # pylint: disable=import_pickle
+from typing import Dict, List, Union
+
+from .core import Serializer, buffered, pickle_buffers, unpickle_buffers
+
+
+class UnpickleableError(Exception):
+    def __init__(self, raw_error: Union[str, Exception]):
+        if isinstance(raw_error, str):
+            super().__init__(raw_error)
+        else:
+            super().__init__(
+                f"Error cannot be pickled, "
+                f"error type: {type(raw_error)}, "
+                f"raw error:\n{raw_error}"
+            )
+
+
+class ExceptionSerializer(Serializer):
+    @buffered
+    def serial(self, obj: Exception, context: Dict):
+        try:
+            buffers = pickle_buffers(obj)
+        except (TypeError, pickle.PicklingError):
+            buffers = pickle_buffers(UnpickleableError(obj))
+        return (), buffers, True
+
+    def deserial(self, serialized: Dict, context: Dict, subs: List):
+        return unpickle_buffers(subs)
+
+
+ExceptionSerializer.register(Exception)
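A sketch of the fallback behaviour for exceptions that cannot be pickled (illustrative; BadError is made up and the import paths are assumptions):

    import threading

    from xorbits._mars.serialization import deserialize, serialize
    from xorbits._mars.serialization.exception import UnpickleableError

    class BadError(Exception):
        def __init__(self):
            super().__init__("boom")
            self.lock = threading.Lock()  # not picklable, even by cloudpickle

    restored = deserialize(*serialize(BadError()))
    # the original error is replaced by a picklable stand-in that keeps its text
    assert isinstance(restored, UnpickleableError)
    assert "BadError" in str(restored)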
diff --git a/python/xorbits/_mars/serialization/mars_objects.py b/python/xorbits/_mars/serialization/mars_objects.py
new file mode 100644
index 000000000..f4f77be46
--- /dev/null
+++ b/python/xorbits/_mars/serialization/mars_objects.py
@@ -0,0 +1,39 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+try:
+    import scipy.sparse as sps
+except ImportError:  # pragma: no cover
+    sps = None
+
+from ..lib.sparse import SparseNDArray
+from .core import Serializer, buffered, deserialize, serialize
+
+
+class SparseNDArraySerializer(Serializer):
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        raw_header, raw_buffers = serialize(obj.raw, context)
+        return (raw_header, obj.shape), raw_buffers, True
+
+    def deserial(self, serialized: Dict, context: Dict, subs: List):
+        raw_header, obj_shape = serialized
+        raw_csr = deserialize(raw_header, subs)
+        return SparseNDArray(raw_csr, shape=tuple(obj_shape))
+
+
+if sps:  # pragma: no branch
+    SparseNDArraySerializer.register(SparseNDArray)
diff --git a/python/xorbits/_mars/serialization/numpy.py b/python/xorbits/_mars/serialization/numpy.py
new file mode 100644
index 000000000..02eeeb54c
--- /dev/null
+++ b/python/xorbits/_mars/serialization/numpy.py
@@ -0,0 +1,81 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from .core import Serializer, buffered, pickle_buffers, unpickle_buffers
+
+
+class NDArraySerializer(Serializer):
+    @buffered
+    def serial(self, obj: np.ndarray, context: Dict):
+        header = {}
+        if obj.dtype.hasobject:
+            header["pickle"] = True
+            buffers = pickle_buffers(obj)
+            return (header,), buffers, True
+
+        order = "C"
+        if obj.flags.f_contiguous:
+            order = "F"
+        elif not obj.flags.c_contiguous:
+            obj = np.ascontiguousarray(obj)
+        try:
+            desc = np.lib.format.dtype_to_descr(obj.dtype)
+            dtype_new_order = None
+        except ValueError:
+            # for structured dtype, array[[field2, field1]] will create a view,
+            # and dtype_to_descr will fail due to the field order
+            fields = obj.dtype.fields
+            new_fields = sorted(fields, key=lambda k: fields[k][1])
+            desc = np.lib.format.dtype_to_descr(obj.dtype[new_fields])
+            dtype_new_order = list(fields)
+        header.update(
+            dict(
+                pickle=False,
+                descr=desc,
+                dtype_new_order=dtype_new_order,
+                shape=list(obj.shape),
+                strides=list(obj.strides),
+                order=order,
+            )
+        )
+        return (header,), [memoryview(obj.ravel(order=order).view("uint8").data)], True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]):
+        header = serialized[0]
+        if header["pickle"]:
+            return unpickle_buffers(subs)
+
+        try:
+            dtype = np.lib.format.descr_to_dtype(header["descr"])
+        except AttributeError:  # pragma: no cover
+            # for older numpy versions, descr_to_dtype is not implemented
+            dtype = np.dtype(header["descr"])
+
+        dtype_new_order = header["dtype_new_order"]
+        if dtype_new_order:
+            dtype = dtype[dtype_new_order]
+        return np.ndarray(
+            shape=tuple(header["shape"]),
+            dtype=dtype,
+            buffer=subs[0],
+            strides=tuple(header["strides"]),
+            order=header["order"],
+        )
+
+
+NDArraySerializer.register(np.ndarray)
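A sketch of both paths handled above: numeric arrays travel as raw memory, while object-dtype arrays fall back to cloudpickle (illustrative; the import path is an assumption):

    import numpy as np

    from xorbits._mars.serialization import deserialize, serialize

    dense = np.arange(12, dtype="float64").reshape(3, 4)
    np.testing.assert_array_equal(deserialize(*serialize(dense)), dense)

    obj_arr = np.array([{"a": 1}, None], dtype=object)
    assert deserialize(*serialize(obj_arr))[0] == {"a": 1}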
diff --git a/python/xorbits/_mars/serialization/ray.py b/python/xorbits/_mars/serialization/ray.py
new file mode 100644
index 000000000..a8aeabf8e
--- /dev/null
+++ b/python/xorbits/_mars/serialization/ray.py
@@ -0,0 +1,38 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+from ..utils import lazy_import
+from .core import Serializer, buffered
+
+ray = lazy_import("ray")
+
+
+class RaySerializer(Serializer):
+    """Return raw object to let ray do serialization."""
+
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        return (obj,), [], True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]):
+        assert not subs
+        return serialized[0]
+
+
+if ray is not None:
+    RaySerializer.register(object, "ray")
+    RaySerializer.register("ray.ObjectRef", "ray")
+    RaySerializer.register("ray.actor.ActorHandle", "ray")
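These registrations are keyed by the name "ray", so they only take effect when a caller opts in through the serialization context; a sketch (illustrative: it requires ray to be installed for the registrations above to run, and the import path is an assumption):

    from xorbits._mars.serialization import deserialize, serialize

    obj = {"payload": list(range(10))}
    header, buffers = serialize(obj, context={"serializer": "ray"})
    assert buffers == []  # the object is kept as-is for Ray to serialize
    assert deserialize(header, buffers) == obj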
diff --git a/python/xorbits/_mars/serialization/scipy.py b/python/xorbits/_mars/serialization/scipy.py
new file mode 100644
index 000000000..fddcecd2c
--- /dev/null
+++ b/python/xorbits/_mars/serialization/scipy.py
@@ -0,0 +1,71 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+try:
+    import scipy.sparse as sps
+except ImportError:  # pragma: no cover
+    sps = None
+
+from .core import Serializer, buffered, deserialize, serialize
+
+
+class CsrMatrixSerializer(Serializer):
+    @buffered
+    def serial(self, obj: Any, context: Dict):
+        data_header, data_buffers = serialize(obj.data)
+        idx_header, idx_buffers = serialize(obj.indices)
+        indptr_header, indptr_buffers = serialize(obj.indptr)
+        header = (
+            data_header,  # data_header
+            len(data_buffers),  # data_buf_num
+            idx_header,  # idx_header
+            len(idx_buffers),  # idx_buf_num
+            indptr_header,  # indptr_header
+            obj.shape,  # shape
+        )
+        return header, data_buffers + idx_buffers + indptr_buffers, True
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List):
+        (
+            data_header,
+            data_buf_num,
+            idx_header,
+            idx_buf_num,
+            indptr_header,
+            shape,
+        ) = serialized
+        data_buffers = subs[:data_buf_num]
+        idx_buffers = subs[data_buf_num : data_buf_num + idx_buf_num]
+        indptr_buffers = subs[data_buf_num + idx_buf_num :]
+
+        data = deserialize(data_header, data_buffers)
+        indices = deserialize(idx_header, idx_buffers)
+        indptr = deserialize(indptr_header, indptr_buffers)
+        shape = tuple(shape)
+
+        empty_arr = np.zeros(0, dtype=data.dtype)
+
+        target_csr = sps.coo_matrix(
+            (empty_arr, (empty_arr,) * 2), dtype=data.dtype, shape=shape
+        ).tocsr()
+        target_csr.data, target_csr.indices, target_csr.indptr = data, indices, indptr
+        return target_csr
+
+
+if sps:  # pragma: no branch
+    CsrMatrixSerializer.register(sps.csr_matrix)
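A round-trip sketch for CSR matrices (illustrative; the import path is an assumption):

    import numpy as np
    import scipy.sparse as sps

    from xorbits._mars.serialization import deserialize, serialize

    mat = sps.random(50, 40, density=0.1, format="csr", dtype=np.float64)
    restored = deserialize(*serialize(mat))
    # reassembled from data/indices/indptr without ever densifying
    assert (restored != mat).nnz == 0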
diff --git a/python/xorbits/_mars/serialization/serializables/__init__.py b/python/xorbits/_mars/serialization/serializables/__init__.py
new file mode 100644
index 000000000..ffc7769b7
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/__init__.py
@@ -0,0 +1,55 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .core import Serializable, SerializableMeta
+from .field import (
+    AnyField,
+    BoolField,
+    BytesField,
+    Complex64Field,
+    Complex128Field,
+    DataFrameField,
+    DataTypeField,
+    Datetime64Field,
+    DictField,
+    Float16Field,
+    Float32Field,
+    Float64Field,
+    FunctionField,
+    IdentityField,
+    IndexField,
+    Int8Field,
+    Int16Field,
+    Int32Field,
+    Int64Field,
+    IntervalArrayField,
+    KeyField,
+    ListField,
+    NamedTupleField,
+    NDArrayField,
+    OneOfField,
+    ReferenceField,
+    SeriesField,
+    SliceField,
+    StringField,
+    Timedelta64Field,
+    TupleField,
+    TZInfoField,
+    UInt8Field,
+    UInt16Field,
+    UInt32Field,
+    UInt64Field,
+)
+from .field_type import FieldTypes
diff --git a/python/xorbits/_mars/serialization/serializables/core.py b/python/xorbits/_mars/serialization/serializables/core.py
new file mode 100644
index 000000000..73a206667
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/core.py
@@ -0,0 +1,245 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import operator
+import weakref
+from typing import Dict, List, Tuple, Type
+
+import cloudpickle
+
+from ..core import Placeholder, Serializer, buffered
+from .field import Field
+from .field_type import (
+    DatetimeType,
+    DictType,
+    DtypeType,
+    ListType,
+    PrimitiveFieldType,
+    TimedeltaType,
+    TupleType,
+    TZInfoType,
+)
+
+_primitive_field_types = (
+    PrimitiveFieldType,
+    DtypeType,
+    DatetimeType,
+    TimedeltaType,
+    TZInfoType,
+)
+
+
+def _is_field_primitive_compound(field: Field):
+    if field.on_serialize is not None or field.on_deserialize is not None:
+        return False
+
+    def check_type(field_type):
+        if isinstance(field_type, _primitive_field_types):
+            return True
+        if isinstance(field_type, (ListType, TupleType)):
+            if all(
+                check_type(element_type) or element_type is Ellipsis
+                for element_type in field_type._field_types
+            ):
+                return True
+        if isinstance(field_type, DictType):
+            if all(
+                isinstance(element_type, _primitive_field_types)
+                or element_type is Ellipsis
+                for element_type in (field_type.key_type, field_type.value_type)
+            ):
+                return True
+        return False
+
+    return check_type(field.field_type)
+
+
+class SerializableMeta(type):
+    def __new__(mcs, name: str, bases: Tuple[Type], properties: Dict):
+        # All the fields including base fields.
+        all_fields = dict()
+
+        for base in bases:
+            if hasattr(base, "_FIELDS"):
+                all_fields.update(base._FIELDS)
+
+        properties_without_fields = {}
+        properties_field_slot_names = []
+        for k, v in properties.items():
+            if not isinstance(v, Field):
+                properties_without_fields[k] = v
+                continue
+
+            field = all_fields.get(k)
+            if field is None:
+                properties_field_slot_names.append(k)
+            else:
+                v.name = field.name
+                v.get = field.get
+                v.set = field.set
+                v.__delete__ = field.__delete__
+            all_fields[k] = v
+
+        # Make field order deterministic to serialize it as list instead of dict.
+        all_fields = dict(sorted(all_fields.items(), key=operator.itemgetter(0)))
+        pickle_fields = []
+        non_pickle_fields = []
+        for v in all_fields.values():
+            if _is_field_primitive_compound(v):
+                pickle_fields.append(v)
+            else:
+                non_pickle_fields.append(v)
+
+        slots = set(properties.pop("__slots__", set()))
+        slots.update(properties_field_slot_names)
+
+        properties = properties_without_fields
+        properties["_FIELDS"] = all_fields
+        properties["_PRIMITIVE_FIELDS"] = pickle_fields
+        properties["_NON_PRIMITIVE_FIELDS"] = non_pickle_fields
+        properties["__slots__"] = tuple(slots)
+
+        clz = type.__new__(mcs, name, bases, properties)
+        # Bind slot member_descriptor with field.
+        for name in properties_field_slot_names:
+            member_descriptor = getattr(clz, name)
+            field = all_fields[name]
+            field.name = member_descriptor.__name__
+            field.get = member_descriptor.__get__
+            field.set = member_descriptor.__set__
+            field.__delete__ = member_descriptor.__delete__
+            setattr(clz, name, field)
+
+        return clz
+
+
+class Serializable(metaclass=SerializableMeta):
+    __slots__ = ("__weakref__",)
+
+    _cache_primitive_serial = False
+
+    _FIELDS: Dict[str, Field]
+    _PRIMITIVE_FIELDS: List[Field]
+    _NON_PRIMITIVE_FIELDS: List[Field]
+
+    def __init__(self, *args, **kwargs):
+        fields = self._FIELDS
+        if args:  # pragma: no cover
+            values = dict(zip(fields, args))
+            values.update(kwargs)
+        else:
+            values = kwargs
+        for k, v in values.items():
+            fields[k].set(self, v)
+
+    def __on_deserialize__(self):
+        pass
+
+    def __repr__(self):
+        values = ", ".join(
+            [
+                "{}={!r}".format(slot, getattr(self, slot, None))
+                for slot in self.__slots__
+            ]
+        )
+        return "{}({})".format(self.__class__.__name__, values)
+
+    def copy(self) -> "Serializable":
+        copied = type(self)()
+        copied_fields = copied._FIELDS
+        for k, field in self._FIELDS.items():
+            try:
+                # Slightly faster than getattr.
+                value = field.get(self, k)
+                copied_fields[k].set(copied, value)
+            except AttributeError:
+                continue
+        return copied
+
+
+_primitive_serial_cache = weakref.WeakKeyDictionary()
+
+
+class _NoFieldValue:
+    pass
+
+
+class SerializableSerializer(Serializer):
+    """
+    Serialize a Serializable by its field values: primitive fields are carried
+    in the header, while compound field values are serialized iteratively.
+    """
+
+    @classmethod
+    def _get_field_values(cls, obj: Serializable, fields):
+        values = []
+        for field in fields:
+            try:
+                value = field.get(obj)
+                if field.on_serialize is not None:
+                    value = field.on_serialize(value)
+            except AttributeError:
+                # Most field values are not None; serializing them as a list is
+                # more efficient than as a dict.
+                value = _NoFieldValue
+            values.append(value)
+        return values
+
+    @buffered
+    def serial(self, obj: Serializable, context: Dict):
+        if obj._cache_primitive_serial and obj in _primitive_serial_cache:
+            primitive_vals = _primitive_serial_cache[obj]
+        else:
+            primitive_vals = self._get_field_values(obj, obj._PRIMITIVE_FIELDS)
+            if obj._cache_primitive_serial:
+                primitive_vals = cloudpickle.dumps(primitive_vals)
+                _primitive_serial_cache[obj] = primitive_vals
+
+        compound_vals = self._get_field_values(obj, obj._NON_PRIMITIVE_FIELDS)
+        return (type(obj), primitive_vals), [compound_vals], False
+
+    @staticmethod
+    def _set_field_value(obj: Serializable, field: Field, value):
+        if value is _NoFieldValue:
+            return
+        if type(value) is Placeholder:
+            if field.on_deserialize is not None:
+                value.callbacks.append(
+                    lambda v: field.set(obj, field.on_deserialize(v))
+                )
+            else:
+                value.callbacks.append(lambda v: field.set(obj, v))
+        else:
+            if field.on_deserialize is not None:
+                field.set(obj, field.on_deserialize(value))
+            else:
+                field.set(obj, value)
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List) -> Serializable:
+        obj_class, primitives = serialized
+
+        if type(primitives) is not list:
+            primitives = cloudpickle.loads(primitives)
+
+        obj = obj_class.__new__(obj_class)
+
+        if primitives:
+            for field, value in zip(obj_class._PRIMITIVE_FIELDS, primitives):
+                self._set_field_value(obj, field, value)
+
+        if obj_class._NON_PRIMITIVE_FIELDS:
+            for field, value in zip(obj_class._NON_PRIMITIVE_FIELDS, subs[0]):
+                self._set_field_value(obj, field, value)
+        obj.__on_deserialize__()
+        return obj
+
+
+SerializableSerializer.register(Serializable)
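A sketch of how a Serializable subclass is defined and round-tripped through this serializer (illustrative; TaskInfo is made up and the import paths are assumptions):

    from xorbits._mars.serialization import deserialize, serialize
    from xorbits._mars.serialization.serializables import (
        Int64Field,
        ListField,
        Serializable,
        StringField,
    )

    class TaskInfo(Serializable):
        name = StringField("name")
        retries = Int64Field("retries", default=0)
        payload = ListField("payload", default_factory=list)

    info = TaskInfo(name="demo", payload=[1, 2, 3])
    restored = deserialize(*serialize(info))
    assert (restored.name, restored.retries, restored.payload) == ("demo", 0, [1, 2, 3])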
diff --git a/python/xorbits/_mars/serialization/serializables/field.py b/python/xorbits/_mars/serialization/serializables/field.py
new file mode 100644
index 000000000..072cb6173
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/field.py
@@ -0,0 +1,579 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import itertools
+from abc import ABC, ABCMeta, abstractmethod
+from typing import Any, Callable, Optional, Type, Union
+
+from ...utils import _is_ci, no_default
+from .field_type import (
+    AbstractFieldType,
+    DictType,
+    FieldTypes,
+    ListType,
+    ReferenceType,
+    TupleType,
+)
+
+
+class Field(ABC):
+    __slots__ = (
+        "_tag",
+        "_default_value",
+        "_default_factory",
+        "_on_serialize",
+        "_on_deserialize",
+        "name",  # The __name__ of member_descriptor
+        "get",  # The __get__ of member_descriptor
+        "set",  # The __set__ of member_descriptor
+        "__delete__",  # The __delete__ of member_descriptor
+    )
+
+    _tag: str
+    _default_value: Any
+    _default_factory: Optional[Callable]
+
+    def __init__(
+        self,
+        tag: str,
+        default: Any = no_default,
+        default_factory: Optional[Callable] = None,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        if (
+            default is not no_default and default_factory is not None
+        ):  # pragma: no cover
+            raise ValueError("default and default_factory can not be specified both")
+
+        self._tag = tag
+        self._default_value = default
+        self._default_factory = default_factory
+        self._on_serialize = on_serialize
+        self._on_deserialize = on_deserialize
+
+    @property
+    def tag(self):
+        return self._tag
+
+    @property
+    def on_serialize(self):
+        return self._on_serialize
+
+    @property
+    def on_deserialize(self):
+        return self._on_deserialize
+
+    @property
+    @abstractmethod
+    def field_type(self) -> AbstractFieldType:
+        """
+        Field type.
+
+        Returns
+        -------
+        field_type : AbstractFieldType
+             Field type.
+        """
+
+    def __get__(self, instance, owner=None):
+        try:
+            return self.get(instance, owner)
+        except AttributeError:
+            if self._default_value is not no_default:
+                val = self._default_value
+                self.set(instance, val)
+                return val
+            elif self._default_factory is not None:
+                val = self._default_factory()
+                self.set(instance, val)
+                return val
+            else:
+                raise
+
+    def __set__(self, instance, value) -> None:
+        if _is_ci:  # pragma: no branch
+            from ...core import is_kernel_mode
+
+            if not is_kernel_mode():
+                field_type = self.field_type
+                try:
+                    to_check_value = value
+                    if to_check_value is not None and self._on_serialize:
+                        to_check_value = self._on_serialize(to_check_value)
+                    field_type.validate(to_check_value)
+                except (TypeError, ValueError) as e:
+                    raise type(e)(
+                        f"Failed to set `{self.name}` for {type(instance).__name__} "
+                        f"when environ CI=true is set: {str(e)}"
+                    )
+        self.set(instance, value)
+
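The difference between default and default_factory in __get__ above can be illustrated with a tiny sketch (Config is made up and the import path is an assumption):

    from xorbits._mars.serialization.serializables import (
        Int64Field,
        ListField,
        Serializable,
    )

    class Config(Serializable):
        limit = Int64Field("limit", default=10)
        tags = ListField("tags", default_factory=list)

    c = Config()
    assert c.limit == 10   # the default is materialized on first access
    c.tags.append("x")     # each instance gets its own list from the factory
    assert Config().tags == []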
+
+class AnyField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.any
+
+
+class IdentityField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.string
+
+
+class BoolField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.bool
+
+
+class Int8Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int8
+
+
+class Int16Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int16
+
+
+class Int32Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int32
+
+
+class Int64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.int64
+
+
+class UInt8Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint8
+
+
+class UInt16Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint16
+
+
+class UInt32Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint32
+
+
+class UInt64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.uint64
+
+
+class Float16Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.float16
+
+
+class Float32Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.float32
+
+
+class Float64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.float64
+
+
+class Complex64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.complex64
+
+
+class Complex128Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.complex128
+
+
+class StringField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.string
+
+
+class BytesField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.bytes
+
+
+class KeyField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.key
+
+
+class NDArrayField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.ndarray
+
+
+class Datetime64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.datetime
+
+
+class Timedelta64Field(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.timedelta
+
+
+class DataTypeField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.dtype
+
+
+class IndexField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.index
+
+
+class SeriesField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.series
+
+
+class DataFrameField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.dataframe
+
+
+class SliceField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.slice
+
+
+class FunctionField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.function
+
+
+class NamedTupleField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.namedtuple
+
+
+class TZInfoField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.tzinfo
+
+
+class IntervalArrayField(Field):
+    __slots__ = ()
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return FieldTypes.interval_array
+
+
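+# Shared base for ListField/TupleField: a bare element type passed as
+# `field_type` is wrapped into the matching homogeneous collection type,
+# e.g. FieldTypes.int64 becomes ListType(FieldTypes.int64, ...).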
+class _CollectionField(Field, metaclass=ABCMeta):
+    __slots__ = ("_field_type",)
+
+    def __init__(
+        self,
+        tag: str,
+        field_type: AbstractFieldType = None,
+        default: Any = no_default,
+        default_factory: Optional[Callable] = None,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            default_factory=default_factory,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        if field_type is None:
+            field_type = FieldTypes.any
+        collection_type = self._collection_type()
+        if not isinstance(field_type, collection_type):
+            field_type = collection_type(field_type, ...)
+        self._field_type = field_type
+
+    @classmethod
+    @abstractmethod
+    def _collection_type(cls) -> Type[AbstractFieldType]:
+        """
+        Collection type.
+
+        Returns
+        -------
+        collection_type
+        """
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return self._field_type
+
+
+class ListField(_CollectionField):
+    __slots__ = ()
+
+    @classmethod
+    def _collection_type(cls) -> Type[AbstractFieldType]:
+        return ListType
+
+
+class TupleField(_CollectionField):
+    __slots__ = ()
+
+    @classmethod
+    def _collection_type(cls) -> Type[AbstractFieldType]:
+        return TupleType
+
+
+class DictField(Field):
+    __slots__ = ("_field_type",)
+
+    def __init__(
+        self,
+        tag: str,
+        key_type: AbstractFieldType = None,
+        value_type: AbstractFieldType = None,
+        default: Any = no_default,
+        default_factory: Optional[Callable] = None,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            default_factory=default_factory,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        self._field_type = DictType(key_type, value_type)
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return self._field_type
+
+
+class ReferenceField(Field):
+    __slots__ = "_reference_type", "_field_type"
+
+    def __init__(
+        self,
+        tag: str,
+        reference_type: Union[str, Type] = None,
+        default: Any = no_default,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        self._reference_type = reference_type
+
+        if not isinstance(reference_type, str):
+            self._field_type = ReferenceType(reference_type)
+        else:
+            # need to bind dynamically
+            self._field_type = None
+
+    @property
+    def field_type(self) -> AbstractFieldType:
+        return self._field_type
+
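+    # A string reference is resolved lazily: "self" binds to the type of the
+    # owning instance, a dotted path is imported, and a bare name is looked
+    # up in the module that defines the instance.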
+    def get_field_type(self, instance):
+        if self._field_type is None:
+            # bind dynamically
+            if self._reference_type == "self":
+                reference_type = type(instance)
+            elif isinstance(self._reference_type, str) and "." in self._reference_type:
+                module, name = self._reference_type.rsplit(".", 1)
+                reference_type = getattr(importlib.import_module(module), name)
+            else:
+                module = inspect.getmodule(instance)
+                reference_type = getattr(module, self._reference_type)
+            self._field_type = ReferenceType(reference_type)
+        return self._field_type
+
+    def __set__(self, instance, value):
+        if _is_ci:
+            from ...core import is_kernel_mode
+
+            if not is_kernel_mode():
+                field_type = self.get_field_type(instance)
+                try:
+                    to_check_value = value
+                    if to_check_value is not None and self._on_serialize:
+                        to_check_value = self._on_serialize(to_check_value)
+                    field_type.validate(to_check_value)
+                except (TypeError, ValueError) as e:
+                    raise type(e)(
+                        f"Failed to set `{self.name}` for {type(instance).__name__} "
+                        f"when environ CI=true is set: {e}"
+                    )
+        self.set(instance, value)
+
+
+class OneOfField(Field):
+    __slots__ = "_reference_fields"
+
+    def __init__(
+        self,
+        tag: str,
+        default: Any = no_default,
+        on_serialize: Callable[[Any], Any] = None,
+        on_deserialize: Callable[[Any], Any] = None,
+        **tag_to_reference_types,
+    ):
+        super().__init__(
+            tag,
+            default=default,
+            on_serialize=on_serialize,
+            on_deserialize=on_deserialize,
+        )
+        self._reference_fields = [
+            ReferenceField(t, ref_type)
+            for t, ref_type in tag_to_reference_types.items()
+        ]
+
+    @property
+    def reference_fields(self):
+        return self._reference_fields
+
+    @property
+    def field_type(self) -> AbstractFieldType:  # pragma: no cover
+        # has no effect here; just return FieldTypes.any,
+        # the actual check is done in __set__ instead
+        return FieldTypes.any
+
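+    # Each candidate reference field is tried in declaration order; the first
+    # one whose validation succeeds wins, otherwise a TypeError listing all
+    # accepted types is raised.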
+    def __set__(self, instance, value):
+        if not _is_ci:  # pragma: no cover
+            return self.set(instance, value)
+
+        for reference_field in self._reference_fields:
+            try:
+                to_check_value = value
+                if to_check_value is not None and self._on_serialize:
+                    to_check_value = self._on_serialize(to_check_value)
+                reference_field.get_field_type(instance).validate(to_check_value)
+                self.set(instance, value)
+                return
+            except TypeError:
+                continue
+        valid_types = list(
+            itertools.chain(
+                *[
+                    r.get_field_type(instance).valid_types
+                    for r in self._reference_fields
+                ]
+            )
+        )
+        raise TypeError(
+            f"Failed to set `{self.name}` for {type(instance).__name__} "
+            f"when environ CI=true is set: type of instance cannot match any "
+            f"of {valid_types}, got {type(value).__name__}"
+        )
diff --git a/python/xorbits/_mars/serialization/serializables/field_type.py b/python/xorbits/_mars/serialization/serializables/field_type.py
new file mode 100644
index 000000000..24e01308f
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/field_type.py
@@ -0,0 +1,559 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, ABCMeta, abstractmethod
+from datetime import datetime, timedelta, tzinfo
+from enum import Enum
+from typing import Tuple, Type
+
+import numpy as np
+import pandas as pd
+
+from ...utils import lazy_import
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+
+
+class PrimitiveType(Enum):
+    bool = 1
+    int8 = 2
+    int16 = 3
+    int32 = 4
+    int64 = 5
+    uint8 = 6
+    uint16 = 7
+    uint32 = 8
+    uint64 = 9
+    float16 = 10
+    float32 = 11
+    float64 = 12
+    bytes = 13
+    string = 14
+    complex64 = 24
+    complex128 = 25
+
+
+_primitive_type_to_valid_types = {
+    PrimitiveType.bool: (bool, np.bool_),
+    PrimitiveType.int8: (int, np.int8),
+    PrimitiveType.int16: (int, np.int16),
+    PrimitiveType.int32: (int, np.int32),
+    PrimitiveType.int64: (int, np.int64),
+    PrimitiveType.uint8: (int, np.uint8),
+    PrimitiveType.uint16: (int, np.uint16),
+    PrimitiveType.uint32: (int, np.uint32),
+    PrimitiveType.uint64: (int, np.uint64),
+    PrimitiveType.float16: (float, np.float16),
+    PrimitiveType.float32: (float, np.float32),
+    PrimitiveType.float64: (float, np.float64),
+    PrimitiveType.bytes: (bytes, np.bytes_),
+    PrimitiveType.string: (str, np.unicode_),
+    PrimitiveType.complex64: (complex, np.complex64),
+    PrimitiveType.complex128: (complex, np.complex128),
+}
+
+
+class AbstractFieldType(ABC):
+    __slots__ = ()
+
+    @property
+    @abstractmethod
+    def type_name(self) -> str:
+        """
+        Type name.
+
+        Returns
+        -------
+        type_name : str
+        """
+
+    @property
+    def name(self) -> str:
+        """
+        Name of field type instance.
+
+        Returns
+        -------
+        name : str
+        """
+        return self.type_name.capitalize()
+
+    @property
+    @abstractmethod
+    def valid_types(self) -> Tuple[Type, ...]:
+        """
+        Valid types.
+
+        Returns
+        -------
+        valid_types: tuple
+            Valid types.
+        """
+
+    def validate(self, value):
+        if value is not None and not isinstance(value, self.valid_types):
+            raise TypeError(
+                f"value needs to be instance "
+                f"of {self.valid_types}, got {type(value)}"
+            )
+
+    def __call__(self, *args, **kwargs):
+        return type(self)(*args, **kwargs)
+
+
+class SingletonFieldType(AbstractFieldType, metaclass=ABCMeta):
+    __slots__ = ()
+
+    _instance = None
+
+    def __new__(cls, *args, **kw):
+        if cls._instance is None:
+            inst = super().__new__(cls, *args, **kw)
+            cls._instance = inst
+        return cls._instance
+
+
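+# PrimitiveFieldType caches one instance per PrimitiveType member, so e.g.
+# every PrimitiveFieldType(PrimitiveType.int64) is the same object.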
+class PrimitiveFieldType(AbstractFieldType):
+    __slots__ = ("type",)
+
+    _type_to_instances = dict()
+
+    def __new__(cls, *args, **kwargs):
+        primitive_type = args[0]
+        try:
+            return cls._type_to_instances[primitive_type]
+        except KeyError:
+            inst = cls._type_to_instances[primitive_type] = super().__new__(cls)
+            return inst
+
+    def __init__(self, primitive_type: PrimitiveType):
+        self.type = primitive_type
+
+    @property
+    def type_name(self) -> str:
+        return self.type.name
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return _primitive_type_to_valid_types[self.type]
+
+
+class SliceType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "slice"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (slice,)
+
+
+class NDArrayType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "ndarray"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cupy is None:
+            return (np.ndarray,)
+        else:
+            return np.ndarray, cupy.ndarray
+
+
+class DtypeType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "dtype"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return np.dtype, pd.api.extensions.ExtensionDtype
+
+
+class KeyType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "dtype"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        from ...core.entity import ENTITY_TYPE
+
+        return ENTITY_TYPE
+
+
+class DatetimeType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "datetime"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return datetime, pd.Timestamp
+
+
+class TimedeltaType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "timedelta"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return timedelta, pd.Timedelta
+
+
+class IndexType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "index"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cudf is None:
+            return (pd.Index,)
+        else:
+            return pd.Index, cudf.Index
+
+
+class SeriesType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "series"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cudf is None:
+            return (pd.Series,)
+        else:
+            return pd.Series, cudf.Series
+
+
+class DataFrameType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "dataframe"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        if cudf is None:
+            return (pd.DataFrame,)
+        else:
+            return pd.DataFrame, cudf.DataFrame
+
+
+class FunctionType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "function"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:  # pragma: no cover
+        return ()
+
+    def validate(self, value):
+        if value is not None and not callable(value):
+            raise TypeError(f"value should be a function, got {type(value)}")
+
+
+class NamedtupleType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "namedtuple"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (tuple,)
+
+    def validate(self, value):
+        if not (isinstance(value, self.valid_types) and hasattr(value, "_fields")):
+            raise TypeError(
+                f"value should be instance of namedtuple, got {type(value)}"
+            )
+
+
+class TZInfoType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "tzinfo"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (tzinfo,)
+
+
+class IntervalArrayType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "interval_array"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (pd.arrays.IntervalArray,)
+
+
+class AnyType(SingletonFieldType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "any"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:  # pragma: no cover
+        return ()
+
+    def validate(self, value):
+        # any type is valid
+        return
+
+
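+# A collection type declared as (T, ...) is homogeneous: every item is
+# validated against T. Otherwise each position is validated against the
+# field type declared for it, and the lengths must match.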
+class _CollectionType(AbstractFieldType, metaclass=ABCMeta):
+    __slots__ = ("_field_types",)
+
+    def __init__(self, *field_types):
+        self._field_types = field_types
+        if len(field_types) == 0:
+            self._field_types = (AnyType(), Ellipsis)
+
+    @property
+    def name(self) -> str:
+        base_name = super().name
+        if self.is_homogeneous():
+            if isinstance(self._field_types[0], AnyType):
+                return base_name
+            else:
+                return f"{base_name}[{self._field_types[0].name}, ...]"
+        else:
+            field_type_names = ", ".join([ft.name for ft in self._field_types])
+            return f"{base_name}[{field_type_names}]"
+
+    def is_homogeneous(self):
+        return len(self._field_types) == 1 or (
+            len(self._field_types) == 2 and self._field_types[1] is Ellipsis
+        )
+
+    def validate(self, value):
+        if value is None:
+            return
+        if not isinstance(value, self.valid_types):
+            raise TypeError(
+                f"value should be instance of {self.valid_types}, got {type(value)}"
+            )
+        if self.is_homogeneous():
+            field_type: AbstractFieldType = self._field_types[0]
+            if not isinstance(field_type, AnyType):
+                for item in value:
+                    try:
+                        field_type.validate(item)
+                    except TypeError:
+                        raise TypeError(
+                            f"item should be instance of "
+                            f"{field_type.valid_types}, "
+                            f"got {type(item)}"
+                        )
+        else:
+            if len(value) != len(self._field_types):
+                raise ValueError(
+                    f"value should own {len(self._field_types)} items, "
+                    f"got {len(value)} items"
+                )
+            for expect_field_type, item in zip(self._field_types, value):
+                try:
+                    expect_field_type.validate(item)
+                except TypeError:
+                    raise TypeError(
+                        f"item should be instance of "
+                        f"{expect_field_type.valid_types}, "
+                        f"got {type(item)}"
+                    )
+
+
+class ListType(_CollectionType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "list"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (list,)
+
+
+class TupleType(_CollectionType):
+    __slots__ = ()
+
+    @property
+    def type_name(self) -> str:
+        return "tuple"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (tuple,)
+
+
+class DictType(AbstractFieldType):
+    __slots__ = "key_type", "value_type"
+
+    key_type: AbstractFieldType
+    value_type: AbstractFieldType
+
+    def __init__(
+        self, key_type: AbstractFieldType = None, value_type: AbstractFieldType = None
+    ):
+        if key_type is None:
+            key_type = AnyType()
+        if value_type is None:
+            value_type = AnyType()
+        self.key_type = key_type
+        self.value_type = value_type
+
+    @property
+    def type_name(self) -> str:
+        return "dict"
+
+    @property
+    def name(self) -> str:
+        if isinstance(self.key_type, AnyType) and isinstance(self.value_type, AnyType):
+            return "Dict"
+        else:
+            return f"Dict[{self.key_type.name}, {self.value_type.name}]"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (dict,)
+
+    def validate(self, value):
+        super().validate(value)
+        if value is None:
+            return
+        for k, v in value.items():
+            try:
+                self.key_type.validate(k)
+            except TypeError:
+                raise TypeError(
+                    f"key should be instance of "
+                    f"{self.key_type.valid_types}, got {type(k)}"
+                )
+            try:
+                self.value_type.validate(v)
+            except TypeError:
+                raise TypeError(
+                    f"value should be instance of "
+                    f"{self.value_type.valid_types}, got {type(v)}"
+                )
+
+
+class ReferenceType(AbstractFieldType):
+    __slots__ = ("reference_type",)
+
+    reference_type: Type
+
+    def __init__(self, reference_type: Type = None):
+        if reference_type is None:
+            reference_type = object
+        self.reference_type = reference_type
+
+    @property
+    def type_name(self) -> str:
+        return "reference"
+
+    @property
+    def valid_types(self) -> Tuple[Type, ...]:
+        return (self.reference_type,)
+
+
+class FieldTypes:
+    # primitive type
+    bool = PrimitiveFieldType(PrimitiveType.bool)
+    int8 = PrimitiveFieldType(PrimitiveType.int8)
+    int16 = PrimitiveFieldType(PrimitiveType.int16)
+    int32 = PrimitiveFieldType(PrimitiveType.int32)
+    int64 = PrimitiveFieldType(PrimitiveType.int64)
+    uint8 = PrimitiveFieldType(PrimitiveType.uint8)
+    uint16 = PrimitiveFieldType(PrimitiveType.uint16)
+    uint32 = PrimitiveFieldType(PrimitiveType.uint32)
+    uint64 = PrimitiveFieldType(PrimitiveType.uint64)
+    float16 = PrimitiveFieldType(PrimitiveType.float16)
+    float32 = PrimitiveFieldType(PrimitiveType.float32)
+    float64 = PrimitiveFieldType(PrimitiveType.float64)
+    complex64 = PrimitiveFieldType(PrimitiveType.complex64)
+    complex128 = PrimitiveFieldType(PrimitiveType.complex128)
+    bytes = PrimitiveFieldType(PrimitiveType.bytes)
+    string = PrimitiveFieldType(PrimitiveType.string)
+
+    key = KeyType()
+
+    # Python types
+    slice = SliceType()
+    datetime = DatetimeType()
+    # alias of datetime
+    datatime64 = DatetimeType()
+    timedelta = TimedeltaType()
+    # alias of timedelta
+    timedelta64 = TimedeltaType()
+    tzinfo = TZInfoType()
+    function = FunctionType()
+    namedtuple = NamedtupleType()
+    reference = ReferenceType()
+    any = AnyType()
+    # equivalent to any
+    pickled = AnyType()
+
+    # collection
+    list = ListType()
+    tuple = TupleType()
+    dict = DictType()
+
+    # numpy
+    ndarray = NDArrayType()
+    # alias of ndarray
+    arr = NDArrayType()
+    dtype = DtypeType()
+
+    # pandas
+    index = IndexType()
+    series = SeriesType()
+    dataframe = DataFrameType()
+    interval_array = IntervalArrayType()
+    # alias of interval_array
+    interval_arr = IntervalArrayType()
diff --git a/python/xorbits/_mars/serialization/serializables/tests/__init__.py b/python/xorbits/_mars/serialization/serializables/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/serialization/serializables/tests/test_field_type.py b/python/xorbits/_mars/serialization/serializables/tests/test_field_type.py
new file mode 100644
index 000000000..3dc8275ea
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/tests/test_field_type.py
@@ -0,0 +1,121 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import namedtuple
+from datetime import datetime, timedelta, timezone
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ....core import EntityData
+from .. import FieldTypes
+
+
+class MyClass(EntityData):
+    __slots__ = ()
+
+    @staticmethod
+    def my_func():
+        """
+        Test function
+        """
+
+
+my_named_tuple = namedtuple("my_named_tuple", "a b")
+
+
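+# Parametrized table consumed by test_field_type below: validate() must
+# accept every valid value and raise TypeError for every invalid one.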
+fields_values = [
+    # field_type, valid values, invalid values
+    [FieldTypes.bool, [True, np.bool_(False)], [1]],
+    [FieldTypes.int8, [8, np.int8(8)], [8.0]],
+    [FieldTypes.int16, [16, np.int16(16)], [16.0]],
+    [FieldTypes.int32, [32, np.int32(32)], [64.0]],
+    [FieldTypes.uint8, [8, np.uint8(8)], [8.0]],
+    [FieldTypes.uint16, [16, np.uint16(16)], [16.0]],
+    [FieldTypes.uint32, [32, np.uint32(32)], [32.0]],
+    [FieldTypes.uint64, [64, np.uint64(64)], [64.0]],
+    [FieldTypes.float16, [16.0, np.float16(16)], [16]],
+    [FieldTypes.float32, [32.0, np.float32(32)], [32]],
+    [FieldTypes.float64, [64.0, np.float64(64)], [64]],
+    [FieldTypes.complex64, [1 + 2j, np.complex64(1 + 2j)], [64]],
+    [FieldTypes.complex128, [1 + 2j, np.complex128(1 + 2j)], [128]],
+    [FieldTypes.bytes, [b"abc", np.bytes_("abc")], ["abc"]],
+    [FieldTypes.string, ["abc", np.str_("abc")], [b"abc"]],
+    [FieldTypes.ndarray, [np.array([1, 2, 3])], [object()]],
+    [FieldTypes.dtype, [np.dtype(np.int32), pd.StringDtype()], [object()]],
+    [FieldTypes.key, [MyClass()], [object()]],
+    [FieldTypes.slice, [slice(1, 10), slice("a", "b")], [object()]],
+    [FieldTypes.datetime, [datetime.now(), pd.Timestamp(0)], [object()]],
+    [FieldTypes.timedelta, [timedelta(days=1), pd.Timedelta(days=1)], [object()]],
+    [FieldTypes.tzinfo, [timezone.utc], [object()]],
+    [FieldTypes.index, [pd.RangeIndex(10), pd.Index([1, 2])], [object()]],
+    [FieldTypes.series, [pd.Series([1, 2, 3])], [object()]],
+    [FieldTypes.dataframe, [pd.DataFrame({"a": [1, 2]})], [object()]],
+    [FieldTypes.interval_array, [pd.arrays.IntervalArray([])], [object()]],
+    [FieldTypes.function, [MyClass.my_func], [object()]],
+    [FieldTypes.namedtuple, [my_named_tuple(a=1, b=2)], [tuple()]],
+    [FieldTypes.reference(MyClass), [MyClass()], [object()]],
+    [
+        FieldTypes.tuple(FieldTypes.int64, ...),
+        [tuple(), tuple([1, 2])],
+        [list(), tuple([1, 2.0])],
+    ],
+    [
+        FieldTypes.list(FieldTypes.int64, FieldTypes.float64),
+        [[1, 1.0]],
+        [tuple(), [1, 1]],
+    ],
+    [
+        FieldTypes.dict(FieldTypes.string, FieldTypes.int64),
+        [{"a": 1}],
+        [{1: "a"}, {"a": 1.0}],
+    ],
+    [FieldTypes.any, [object()], []],
+]
+
+
+@pytest.mark.parametrize("field_type, valid_values, invalid_values", fields_values)
+def test_field_type(field_type, valid_values, invalid_values):
+    assert isinstance(field_type.type_name, str)
+    assert isinstance(field_type.name, str)
+
+    for valid_value in valid_values:
+        field_type.validate(valid_value)
+
+    for invalid_value in invalid_values:
+        with pytest.raises(TypeError):
+            field_type.validate(invalid_value)
+
+
+def test_collection_field_error():
+    with pytest.raises(ValueError):
+        FieldTypes.tuple(FieldTypes.int64, FieldTypes.float32).validate(
+            tuple([1, 3.0, 3.0])
+        )
+
+
+def test_field_name():
+    assert FieldTypes.list().name == "List"
+    assert (
+        FieldTypes.list(FieldTypes.int64, FieldTypes.float32).name
+        == "List[Int64, Float32]"
+    )
+    assert FieldTypes.tuple(FieldTypes.int8, ...).name == "Tuple[Int8, ...]"
+    assert FieldTypes.tuple(FieldTypes.int8).name == "Tuple[Int8, ...]"
+    assert FieldTypes.dict().name == "Dict"
+    assert (
+        FieldTypes.dict(FieldTypes.int8, FieldTypes.float64).name
+        == "Dict[Int8, Float64]"
+    )
diff --git a/python/xorbits/_mars/serialization/serializables/tests/test_serializable.py b/python/xorbits/_mars/serialization/serializables/tests/test_serializable.py
new file mode 100644
index 000000000..a60a0349a
--- /dev/null
+++ b/python/xorbits/_mars/serialization/serializables/tests/test_serializable.py
@@ -0,0 +1,261 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import os
+from collections import namedtuple
+from datetime import timezone
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from ....core import EntityData
+from ....utils import no_default
+from ... import deserialize, serialize
+from .. import (
+    AnyField,
+    BoolField,
+    BytesField,
+    Complex64Field,
+    Complex128Field,
+    DataFrameField,
+    DataTypeField,
+    Datetime64Field,
+    DictField,
+    FieldTypes,
+    Float16Field,
+    Float32Field,
+    Float64Field,
+    FunctionField,
+    IdentityField,
+    IndexField,
+    Int8Field,
+    Int16Field,
+    Int32Field,
+    Int64Field,
+    IntervalArrayField,
+    KeyField,
+    ListField,
+    NamedTupleField,
+    NDArrayField,
+    OneOfField,
+    ReferenceField,
+    Serializable,
+    SeriesField,
+    SliceField,
+    StringField,
+    Timedelta64Field,
+    TupleField,
+    TZInfoField,
+    UInt8Field,
+    UInt16Field,
+    UInt32Field,
+    UInt64Field,
+)
+
+my_namedtuple = namedtuple("my_namedtuple", "a, b")
+
+
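+# Field validation is switched on by the CI environment variable at import
+# time, so the fixture sets CI and reloads the core/field modules to make
+# each parametrized run pick up the new value.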
+@pytest.fixture(autouse=True)
+def set_environ(request):
+    from .. import core, field
+
+    exist_env = os.environ.get("CI", no_default)
+    env_to_set = getattr(request, "param", None) or "true"
+
+    try:
+        os.environ["CI"] = env_to_set
+        core.SerializableSerializer.unregister(core.Serializable)
+        importlib.reload(core)
+        importlib.reload(field)
+        yield
+    finally:
+        if exist_env is no_default:
+            os.environ.pop("CI", None)
+        else:
+            os.environ["CI"] = exist_env
+        core.SerializableSerializer.unregister(core.Serializable)
+        importlib.reload(core)
+        importlib.reload(field)
+
+
+class MyHasKey(EntityData):
+    def __init__(self, key=None, **kw):
+        super().__init__(_key=key, **kw)
+        self._id = "1"
+
+    def __eq__(self, other):
+        return isinstance(other, MyHasKey) and other._key == self._key
+
+
+class MySimpleSerializable(Serializable):
+    _id = IdentityField("id")
+    _int_val = Int64Field("int_val", default=1000)
+    _list_val = ListField("list_val", default_factory=list)
+    _ref_val = ReferenceField("ref_val", "MySimpleSerializable")
+
+
+class MySerializable(Serializable):
+    _id = IdentityField("id")
+    _any_val = AnyField("any_val")
+    _bool_val = BoolField("bool_val")
+    _int8_val = Int8Field("int8_val")
+    _int16_val = Int16Field("int16_val")
+    _int32_val = Int32Field("int32_val")
+    _int64_val = Int64Field("int64_val")
+    _uint8_val = UInt8Field("uint8_val")
+    _uint16_val = UInt16Field("uint16_val")
+    _uint32_val = UInt32Field("uint32_val")
+    _uint64_val = UInt64Field("uint64_val")
+    _float16_val = Float16Field("float16_val")
+    _float32_val = Float32Field(
+        "float32_val", on_serialize=lambda x: x + 1, on_deserialize=lambda x: x - 1
+    )
+    _float64_val = Float64Field("float64_val")
+    _complex64_val = Complex64Field("complex64_val")
+    _complex128_val = Complex128Field("complex128_val")
+    _string_val = StringField("string_val")
+    _bytes_val = BytesField("bytes_val")
+    _key_val = KeyField("key_val")
+    _ndarray_val = NDArrayField("ndarray_val")
+    _datetime64_val = Datetime64Field("datetime64_val")
+    _timedelta64_val = Timedelta64Field("timedelta64_val")
+    _datatype_val = DataTypeField("datatype_val")
+    _index_val = IndexField("index_val")
+    _series_val = SeriesField("series_val")
+    _dataframe_val = DataFrameField("dataframe_val")
+    _interval_array_val = IntervalArrayField("interval_array_val")
+    _slice_val = SliceField("slice_val")
+    _function_val = FunctionField("function_val")
+    _named_tuple_val = NamedTupleField("named_tuple_val")
+    _tzinfo_val = TZInfoField("tzinfo_val")
+    _list_val = ListField("list_val", FieldTypes.int64)
+    _tuple_val = TupleField("tuple_val", FieldTypes.string)
+    _dict_val = DictField("dict_val", FieldTypes.string, FieldTypes.bytes)
+    _ref_val = ReferenceField("ref_val", "self")
+    _ref_val2 = ReferenceField("ref_val2", MySimpleSerializable)
+    _oneof_val = OneOfField(
+        "ref_val",
+        oneof1_val=f"{__name__}.MySerializable",
+        oneof2_val=MySimpleSerializable,
+    )
+
+
+@pytest.mark.parametrize("set_environ", ["false", "true"], indirect=True)
+def test_serializable(set_environ):
+    my_serializable = MySerializable(
+        _id="1",
+        _any_val="any_value",
+        _bool_val=True,
+        _int8_val=-8,
+        _int16_val=np.int16(-16),
+        _int32_val=-32,
+        _int64_val=-64,
+        _uint8_val=8,
+        _uint16_val=16,
+        _uint32_val=np.uint32(32),
+        _uint64_val=64,
+        _float16_val=1.0,
+        _float32_val=np.float32(2.0),
+        _float64_val=2.0,
+        _complex64_val=np.complex64(1 + 2j),
+        _complex128_val=1 + 2j,
+        _string_val="string_value",
+        _bytes_val=b"bytes_value",
+        _key_val=MyHasKey("aaa"),
+        _ndarray_val=np.random.rand(4, 3),
+        _datetime64_val=pd.Timestamp(123),
+        _timedelta64_val=pd.Timedelta(days=1),
+        _datatype_val=np.dtype(np.int32),
+        _index_val=pd.Index([1, 2]),
+        _series_val=pd.Series(["a", "b"]),
+        _dataframe_val=pd.DataFrame({"a": [1, 2, 3]}),
+        _interval_array_val=pd.arrays.IntervalArray([]),
+        _slice_val=slice(1, 10, 2),
+        _function_val=lambda x: x + 1,
+        _named_tuple_val=my_namedtuple(a=1, b=2),
+        _tzinfo_val=timezone.utc,
+        _list_val=[1, 2],
+        _tuple_val=("a", "b"),
+        _dict_val={"a": b"bytes_value"},
+        _ref_val=MySerializable(),
+        _oneof_val=MySerializable(_id="2"),
+    )
+
+    header, buffers = serialize(my_serializable)
+    my_serializable2 = deserialize(header, buffers)
+    _assert_serializable_eq(my_serializable, my_serializable2)
+
+
+def _assert_serializable_eq(my_serializable, my_serializable2):
+    for field_name, field in my_serializable._FIELDS.items():
+        if not hasattr(my_serializable, field.tag):
+            continue
+        expect_value = getattr(my_serializable, field_name)
+        actual_value = getattr(my_serializable2, field_name)
+        if isinstance(expect_value, np.ndarray):
+            np.testing.assert_array_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.DataFrame):
+            pd.testing.assert_frame_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.Series):
+            pd.testing.assert_series_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.Index):
+            pd.testing.assert_index_equal(expect_value, actual_value)
+        elif isinstance(expect_value, pd.api.extensions.ExtensionArray):
+            pd.testing.assert_extension_array_equal(expect_value, actual_value)
+        elif isinstance(expect_value, (MySimpleSerializable, MySerializable)):
+            _assert_serializable_eq(expect_value, actual_value)
+        elif callable(expect_value):
+            assert expect_value(1) == actual_value(1)
+        else:
+            assert expect_value == actual_value
+
+
+def test_fields_errors():
+    my_simple = MySimpleSerializable(_id="1", _ref_val=MySimpleSerializable(_id="2"))
+    my_serializable = MySerializable(_oneof_val=my_simple)
+
+    with pytest.raises(TypeError) as exc_info:
+        my_simple._int_val = "10"
+    assert "_int_val" in str(exc_info.value)
+
+    del my_simple._ref_val
+    with pytest.raises(AttributeError):
+        _ = my_simple._ref_val
+
+    del my_simple._id
+    with pytest.raises(AttributeError):
+        _ = my_simple._id
+
+    assert my_simple._int_val == 1000
+    assert my_simple._list_val == []
+
+    del my_serializable._oneof_val
+    with pytest.raises(AttributeError):
+        _ = my_serializable._oneof_val
+
+    my_serializable._ref_val2 = MySimpleSerializable(_id="3")
+    del my_serializable._ref_val2
+    with pytest.raises(AttributeError):
+        _ = my_serializable._ref_val2
+
+    with pytest.raises(TypeError):
+        my_serializable._ref_val = my_simple
+
+    with pytest.raises(TypeError):
+        my_serializable._oneof_val = 1
+
+    with pytest.raises(AttributeError):
+        del my_serializable._oneof_val
diff --git a/python/xorbits/_mars/serialization/tests/__init__.py b/python/xorbits/_mars/serialization/tests/__init__.py
new file mode 100644
index 000000000..37f6558d9
--- /dev/null
+++ b/python/xorbits/_mars/serialization/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/xorbits/_mars/serialization/tests/test_serial.py b/python/xorbits/_mars/serialization/tests/test_serial.py
new file mode 100644
index 000000000..02bc475a5
--- /dev/null
+++ b/python/xorbits/_mars/serialization/tests/test_serial.py
@@ -0,0 +1,322 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import threading
+from collections import OrderedDict, defaultdict
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+import pandas as pd
+import pytest
+
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+try:
+    import scipy.sparse as sps
+except ImportError:
+    sps = None
+
+from ...lib.sparse import SparseMatrix
+from ...tests.core import require_cudf, require_cupy
+from ...utils import lazy_import
+from .. import deserialize, serialize, serialize_with_spawn
+from ..core import ListSerializer, Placeholder
+
+cupy = lazy_import("cupy")
+cudf = lazy_import("cudf")
+
+
+class CustomList(list):
+    pass
+
+
+@pytest.mark.parametrize(
+    "val",
+    [
+        None,
+        False,
+        123,
+        3.567,
+        3.5 + 4.3j,
+        b"abcd",
+        "abcd",
+        ["uvw", ("mno", "sdaf"), 4, 6.7],
+        CustomList([3, 4, CustomList([5, 6])]),
+        {"abc": 5.6, "def": [3.4], "gh": None, "ijk": {}},
+        OrderedDict([("abcd", 5.6)]),
+        defaultdict(lambda: 0, [("abcd", 0)]),
+    ],
+)
+def test_core(val):
+    deserialized = deserialize(*serialize(val))
+    assert type(val) == type(deserialized)
+    assert val == deserialized
+
+
+def test_strings():
+    str_obj = "abcd" * 1024
+    obj = [str_obj, str_obj]
+    header, bufs = serialize(obj)
+    assert len(header) < len(str_obj) * 2
+    bufs = [memoryview(buf) for buf in bufs]
+    assert obj == deserialize(header, bufs)
+
+
+def test_placeholder_obj():
+    assert Placeholder(1024) == Placeholder(1024)
+    assert hash(Placeholder(1024)) == hash(Placeholder(1024))
+    assert Placeholder(1024) != Placeholder(1023)
+    assert hash(Placeholder(1024)) != hash(Placeholder(1023))
+    assert Placeholder(1024) != 1024
+    assert "1024" in repr(Placeholder(1024))
+
+
+def test_nested_list():
+    val = [b"a" * 1200] * 10
+    val[0] = val
+    deserialized = deserialize(*serialize(val))
+    assert deserialized[0] is deserialized
+    assert val[1:] == deserialized[1:]
+
+
+class KeyedDict(dict):
+    def _skeys(self):
+        return set(k for k in self.keys() if isinstance(k, str))
+
+    def __hash__(self):
+        return hash(frozenset(self._skeys()))
+
+    def __eq__(self, other: "KeyedDict"):
+        return self._skeys() == other._skeys()
+
+
+def test_nested_dict():
+    val = {i: "b" * 100 for i in range(10)}
+    val[0] = val
+    deserialized = deserialize(*serialize(val))
+    assert deserialized[0] is deserialized
+
+    val = KeyedDict(abcd="efgh")
+    val[val] = val
+    deserialized = deserialize(*serialize(val))
+    assert deserialized[val] is deserialized
+
+
+class DictWithoutInitArgs(dict):
+    # dict inheritance without args in __init__
+    def __init__(self):
+        super().__init__()
+
+
+def test_dict_without_init_args():
+    val = DictWithoutInitArgs()
+    val["a"] = "b"
+    deserialized = deserialize(*serialize(val))
+    assert deserialized == val
+
+
+@pytest.mark.parametrize(
+    "val",
+    [
+        np.array(np.random.rand(100, 100)),
+        np.array(np.random.rand(100, 100).T),
+        np.array(["a", "bcd", None]),
+    ],
+)
+def test_numpy(val):
+    deserialized = deserialize(*serialize(val))
+    assert type(val) == type(deserialized)
+    np.testing.assert_equal(val, deserialized)
+    if val.flags.f_contiguous:
+        assert deserialized.flags.f_contiguous
+
+
+def test_pandas():
+    val = pd.Series([1, 2, 3, 4])
+    pd.testing.assert_series_equal(val, deserialize(*serialize(val)))
+
+    val = pd.DataFrame(
+        {
+            "a": np.random.rand(1000),
+            "b": np.random.choice(list("abcd"), size=(1000,)),
+            "c": np.random.randint(0, 100, size=(1000,)),
+        }
+    )
+    pd.testing.assert_frame_equal(val, deserialize(*serialize(val)))
+
+
+@pytest.mark.skipif(pa is None, reason="need pyarrow to run the cases")
+def test_arrow():
+    test_df = pd.DataFrame(
+        {
+            "a": np.random.rand(1000),
+            "b": np.random.choice(list("abcd"), size=(1000,)),
+            "c": np.random.randint(0, 100, size=(1000,)),
+        }
+    )
+    test_vals = [
+        pa.RecordBatch.from_pandas(test_df),
+        pa.Table.from_pandas(test_df),
+    ]
+    for val in test_vals:
+        deserialized = deserialize(*serialize(val))
+        assert type(val) is type(deserialized)
+        np.testing.assert_equal(val, deserialized)
+
+
+@pytest.mark.parametrize(
+    "np_val",
+    [np.random.rand(100, 100), np.random.rand(100, 100).T],
+)
+@require_cupy
+def test_cupy(np_val):
+    val = cupy.array(np_val)
+    deserialized = deserialize(*serialize(val))
+    assert type(val) is type(deserialized)
+    cupy.testing.assert_array_equal(val, deserialized)
+
+
+@require_cudf
+def test_cudf():
+    raw_df = pd.DataFrame(
+        {
+            "a": np.random.rand(1000),
+            "b": np.random.choice(list("abcd"), size=(1000,)),
+            "c": np.random.randint(0, 100, size=(1000,)),
+        }
+    )
+    test_df = cudf.DataFrame(raw_df)
+    cudf.testing.assert_frame_equal(test_df, deserialize(*serialize(test_df)))
+
+    raw_df.columns = pd.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("b", "c")])
+    test_df = cudf.DataFrame(raw_df)
+    cudf.testing.assert_frame_equal(test_df, deserialize(*serialize(test_df)))
+
+
+@pytest.mark.skipif(sps is None, reason="need scipy to run the test")
+def test_scipy_sparse():
+    val = sps.random(100, 100, 0.1, format="csr")
+    deserial = deserialize(*serialize(val))
+    assert (val != deserial).nnz == 0
+
+
+@pytest.mark.skipif(sps is None, reason="need scipy to run the test")
+def test_mars_sparse():
+    val = SparseMatrix(sps.random(100, 100, 0.1, format="csr"))
+    deserial = deserialize(*serialize(val))
+    assert (val.spmatrix != deserial.spmatrix).nnz == 0
+
+
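+# A serializer that deliberately fails during deserialization so the tests
+# can exercise on_deserial_error, which may rewrite a child's error into a
+# new exception chained to the original one.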
+class MockSerializerForErrors(ListSerializer):
+    serializer_id = 25951
+    raises = False
+
+    def on_deserial_error(
+        self,
+        serialized: Tuple,
+        context: Dict,
+        subs_serialized: List,
+        error_index: int,
+        exc: BaseException,
+    ):
+        assert serialized[2] is CustomList  # obj_type field of ListSerializer
+        assert error_index == 1
+        assert subs_serialized[error_index]
+        try:
+            raise SystemError from exc
+        except BaseException as ex:
+            return ex
+
+    def deserial(self, serialized: Tuple, context: Dict, subs: List[Any]):
+        if len(subs) == 2 and self.raises:
+            raise TypeError
+        return super().deserial(serialized, context, subs)
+
+
+class UnpickleWithError:
+    def __getstate__(self):
+        return (None,)
+
+    def __setstate__(self, state):
+        raise ValueError
+
+
+def test_deserial_errors():
+    try:
+        MockSerializerForErrors.raises = False
+        MockSerializerForErrors.register(CustomList)
+        ListSerializer.register(CustomList, name="test_name")
+
+        # error of leaf object is raised
+        obj = [1, [[3, UnpickleWithError()]]]
+        with pytest.raises(ValueError):
+            deserialize(*serialize(obj))
+
+        # error of leaf object is rewritten in parent object
+        obj = CustomList([[1], [[3, UnpickleWithError()]]])
+        with pytest.raises(SystemError) as exc_info:
+            deserialize(*serialize(obj))
+        assert isinstance(exc_info.value.__cause__, ValueError)
+
+        MockSerializerForErrors.raises = True
+
+        # error of non-leaf object is raised
+        obj = [CustomList([[1], [[2]]])]
+        with pytest.raises(TypeError):
+            deserialize(*serialize(obj))
+        deserialize(*serialize(obj, {"serializer": "test_name"}))
+
+        # error of non-leaf CustomList is rewritten in parent object
+        obj = CustomList([[1], CustomList([[1], [[2]]]), [2]])
+        with pytest.raises(SystemError) as exc_info:
+            deserialize(*serialize(obj))
+        assert isinstance(exc_info.value.__cause__, TypeError)
+        deserialize(*serialize(obj, {"serializer": "test_name"}))
+    finally:
+        MockSerializerForErrors.unregister(CustomList)
+        ListSerializer.unregister(CustomList, name="test_name")
+        # Above unregister will remove the ListSerializer from deserializers,
+        # so we need to register ListSerializer again to make the
+        # deserializers correct.
+        ListSerializer.register(list)
+
+
+class MockSerializerForSpawn(ListSerializer):
+    thread_calls = defaultdict(lambda: 0)
+
+    def serial(self, obj: Any, context: Dict):
+        self.thread_calls[threading.current_thread().ident] += 1
+        return super().serial(obj, context)
+
+
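+# The test expects serialize_with_spawn to serialize the first
+# `spawn_threshold` nested objects on the calling thread and the remaining
+# ones on other threads, which the per-thread call counts verify.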
+@pytest.mark.asyncio
+async def test_spawn_threshold():
+    try:
+        assert 0 == deserialize(*(await serialize_with_spawn(0)))
+
+        MockSerializerForSpawn.register(CustomList)
+        obj = [CustomList([i]) for i in range(200)]
+        serialized = await serialize_with_spawn(obj, spawn_threshold=100)
+        assert serialized[0][0]["_N"] == 201
+        deserialized = deserialize(*serialized)
+        for s, d in zip(obj, deserialized):
+            assert s[0] == d[0]
+
+        calls = MockSerializerForSpawn.thread_calls
+        assert sum(calls.values()) == 200
+        assert calls[threading.current_thread().ident] == 101
+    finally:
+        MockSerializerForSpawn.unregister(CustomList)
diff --git a/python/xorbits/_mars/services/__init__.py b/python/xorbits/_mars/services/__init__.py
new file mode 100644
index 000000000..c01180bb2
--- /dev/null
+++ b/python/xorbits/_mars/services/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import (
+    NodeRole,
+    create_service_session,
+    destroy_service_session,
+    start_services,
+    stop_services,
+)
diff --git a/python/xorbits/_mars/services/cluster/__init__.py b/python/xorbits/_mars/services/cluster/__init__.py
new file mode 100644
index 000000000..1c7ee3a71
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .api import AbstractClusterAPI, ClusterAPI, MockClusterAPI, WebClusterAPI
+from .backends import AbstractClusterBackend
+from .core import (  # noqa: F401
+    DiskInfo,
+    NodeInfo,
+    NodeRole,
+    QuotaInfo,
+    StorageInfo,
+    WorkerSlotInfo,
+)
diff --git a/python/xorbits/_mars/services/cluster/api/__init__.py b/python/xorbits/_mars/services/cluster/api/__init__.py
new file mode 100644
index 000000000..1812045f1
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import AbstractClusterAPI
+from .oscar import ClusterAPI, MockClusterAPI
+from .web import WebClusterAPI
diff --git a/python/xorbits/_mars/services/cluster/api/core.py b/python/xorbits/_mars/services/cluster/api/core.py
new file mode 100644
index 000000000..dcd80c613
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/core.py
@@ -0,0 +1,190 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from typing import Dict, List, Set
+
+from ....resource import Resource
+from ....typing import BandType
+from ...core import NodeRole
+from ..core import NodeStatus
+
+
+class AbstractClusterAPI:
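+    # Status filtering precedence: an explicit `statuses` set wins, otherwise
+    # everything outside `exclude_statuses`, and by default only READY nodes.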
+    @staticmethod
+    def _calc_statuses(
+        statuses: Set[NodeStatus] = None, exclude_statuses: Set[NodeStatus] = None
+    ) -> Set[NodeStatus]:
+        if statuses:
+            return statuses
+        elif exclude_statuses is not None:
+            return set(NodeStatus.__members__.values()).difference(exclude_statuses)
+        else:
+            return {NodeStatus.READY}
+
+    @abstractmethod
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        """
+        Get supervisor addresses
+
+        Returns
+        -------
+        out
+            list of supervisors
+        """
+
+    @abstractmethod
+    async def watch_supervisors(self):
+        """
+        Watch supervisor addresses
+
+        Returns
+        -------
+        out
+            generator of list of supervisors
+        """
+
+    @abstractmethod
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> List[Dict[str, Dict]]:
+        """
+        Watch changes of workers
+
+        Returns
+        -------
+        out: List[Dict[str, Dict]]
+            dict of worker resources by addresses and bands
+        """
+
+    @abstractmethod
+    async def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        """
+        Get worker info
+
+        Parameters
+        ----------
+        nodes
+            address of nodes
+        role
+            roles of nodes
+        env
+            receive env info
+        resource
+            receive resource info
+        detail
+            receive detail info
+
+        Returns
+        -------
+        out: Dict
+            info of worker
+        """
+
+    @abstractmethod
+    async def get_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[BandType, Resource]:
+        """
+        Get all bands that can be used for computation.
+
+        Returns
+        -------
+        band_to_resource : dict
+            Band to resource.
+        """
+
+    @abstractmethod
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        """
+        Watch all bands that can be used for computation.
+
+        Returns
+        -------
+        band_to_resource : dict
+            Band to resource.
+        """
+
+    @abstractmethod
+    async def get_mars_versions(self) -> List[str]:
+        """
+        Get versions used in current Mars cluster
+
+        Returns
+        -------
+        version_list : list
+            List of versions
+        """
+
+    @abstractmethod
+    async def get_node_pool_configs(self, address: str) -> List[Dict]:
+        """
+        Get pool configs of a Mars node
+
+        Returns
+        -------
+        config_list : List[Dict]
+            List of configs for all pool processes
+        """
+
+    async def get_node_thread_stacks(self, address: str) -> List[Dict[int, List[str]]]:
+        """
+        Get current thread pool stacks of a Mars node
+
+        Parameters
+        ----------
+        address
+
+        Returns
+        -------
+
+        """
+
+    async def fetch_node_log(self, size: int, address: str, offset: int) -> str:
+        """
+        Get current log content of a Mars node
+
+        Parameters
+        ----------
+        size
+        address
+        offset
+
+        Returns
+        -------
+
+        """
diff --git a/python/xorbits/_mars/services/cluster/api/oscar.py b/python/xorbits/_mars/services/cluster/api/oscar.py
new file mode 100644
index 000000000..c66877f7e
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/oscar.py
@@ -0,0 +1,412 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+from typing import Dict, List, Optional, Set, Type, TypeVar
+
+from .... import oscar as mo
+from ....lib.aio import alru_cache
+from ....resource import Resource
+from ....typing import BandType
+from ...core import NodeRole
+from ..core import (
+    DiskInfo,
+    NodeStatus,
+    QuotaInfo,
+    StorageInfo,
+    WorkerSlotInfo,
+    watch_method,
+)
+from .core import AbstractClusterAPI
+
+APIType = TypeVar("APIType", bound="ClusterAPI")
+logger = logging.getLogger(__name__)
+
+
+class ClusterAPI(AbstractClusterAPI):
+    def __init__(self, address: str):
+        self._address = address
+        self._locator_ref = None
+        self._uploader_ref = None
+
+    async def _init(self):
+        from ..locator import SupervisorLocatorActor
+        from ..uploader import NodeInfoUploaderActor
+
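+        # resolve references to the locator and uploader actors living on this node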
+        self._locator_ref = await mo.actor_ref(
+            SupervisorLocatorActor.default_uid(), address=self._address
+        )
+        self._uploader_ref = await mo.actor_ref(
+            NodeInfoUploaderActor.default_uid(), address=self._address
+        )
+
+    @classmethod
+    @alru_cache(cache_exceptions=False)
+    async def create(cls: Type[APIType], address: str) -> APIType:
+        api_obj = cls(address)
+        await api_obj._init()
+        return api_obj
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_node_info_ref(self):
+        from ..supervisor.node_info import NodeInfoCollectorActor
+
+        [node_info_ref] = await self.get_supervisor_refs(
+            [NodeInfoCollectorActor.default_uid()]
+        )
+        return node_info_ref
+
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        return await self._locator_ref.get_supervisors(filter_ready=filter_ready)
+
+    @watch_method
+    async def watch_supervisors(self, version: Optional[int] = None):
+        return await self._locator_ref.watch_supervisors(version=version)
+
+    async def get_supervisors_by_keys(self, keys: List[str]) -> List[str]:
+        """
+        Get supervisor addresses hosting the specified keys
+
+        Parameters
+        ----------
+        keys
+            keys to be mapped to supervisor addresses
+
+        Returns
+        -------
+        out
+            addresses of the supervisors
+        """
+        get_supervisor = self._locator_ref.get_supervisor
+        return await get_supervisor.batch(*(get_supervisor.delay(k) for k in keys))
+
+    @watch_method
+    async def watch_supervisors_by_keys(
+        self, keys: List[str], version: Optional[int] = None
+    ):
+        return await self._locator_ref.watch_supervisors_by_keys(keys, version=version)
+
+    async def get_supervisor_refs(self, uids: List[str]) -> List[mo.ActorRef]:
+        """
+        Get references of the supervisor actors with the specified uids
+
+        Parameters
+        ----------
+        uids
+            uids of the actors to look up
+
+        Returns
+        -------
+        out : List[mo.ActorRef]
+            references of the actors
+        """
+        addrs = await self.get_supervisors_by_keys(uids)
+        if any(addr is None for addr in addrs):
+            none_uid = next(uid for addr, uid in zip(addrs, uids) if addr is None)
+            raise mo.ActorNotExist(f"Actor {none_uid} not exist as no supervisors")
+
+        return await asyncio.gather(
+            *[mo.actor_ref(uid, address=addr) for addr, uid in zip(addrs, uids)]
+        )
+
+    async def watch_supervisor_refs(self, uids: List[str]):
+        async for addrs in self.watch_supervisors_by_keys(uids):
+            yield await asyncio.gather(
+                *[mo.actor_ref(uid, address=addr) for addr, uid in zip(addrs, uids)]
+            )
+
+    @watch_method
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        version: Optional[int] = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> List[Dict[str, Dict]]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.watch_nodes(
+            role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            statuses=statuses,
+            version=version,
+        )
+
+    async def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[str, Dict]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.get_nodes_info(
+            nodes=nodes,
+            role=role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            statuses=statuses,
+        )
+
+    async def set_node_status(self, node: str, role: NodeRole, status: NodeStatus):
+        """
+        Set status of node
+
+        Parameters
+        ----------
+        node : str
+            address of node
+        role: NodeRole
+            role of node
+        status : NodeStatus
+            status of node
+        """
+        node_info_ref = await self._get_node_info_ref()
+        await node_info_ref.update_node_info(node, role, status=status)
+
+    async def get_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[BandType, Resource]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.get_all_bands(role, statuses=statuses)
+
+    @watch_method
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        version: Optional[int] = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.watch_all_bands(
+            role, statuses=statuses, version=version
+        )
+
+    async def get_mars_versions(self) -> List[str]:
+        node_info_ref = await self._get_node_info_ref()
+        return await node_info_ref.get_mars_versions()
+
+    async def get_bands(self) -> Dict:
+        """
+        Get bands that can be used for computation on current node.
+
+        Returns
+        -------
+        band_to_resource : dict
+            Band to resource.
+        """
+        return await self._uploader_ref.get_bands()
+
+    async def mark_node_ready(self):
+        """
+        Mark the current node as ready for workloads
+        """
+        await self._uploader_ref.mark_node_ready()
+
+    async def wait_node_ready(self):
+        """
+        Wait until the current node is ready
+        """
+        await self._uploader_ref.wait_node_ready()
+
+    async def wait_all_supervisors_ready(self):
+        """
+        Wait until all expected supervisors are ready
+        """
+        await self._locator_ref.wait_all_supervisors_ready()
+
+    async def set_band_slot_infos(
+        self, band_name: str, slot_infos: List[WorkerSlotInfo]
+    ):
+        await self._uploader_ref.set_band_slot_infos.tell(band_name, slot_infos)
+
+    async def set_band_quota_info(self, band_name: str, quota_info: QuotaInfo):
+        await self._uploader_ref.set_band_quota_info.tell(band_name, quota_info)
+
+    async def set_node_disk_info(self, disk_info: List[DiskInfo]):
+        await self._uploader_ref.set_node_disk_info(disk_info)
+
+    @mo.extensible
+    async def set_band_storage_info(self, band_name: str, storage_info: StorageInfo):
+        await self._uploader_ref.set_band_storage_info(band_name, storage_info)
+
+    async def request_worker(
+        self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None
+    ) -> str:
+        node_allocator_ref = await self._get_node_allocator_ref()
+        address = await node_allocator_ref.request_worker(
+            worker_cpu, worker_mem, timeout
+        )
+        return address
+
+    async def release_worker(self, address: str):
+        node_allocator_ref = await self._get_node_allocator_ref()
+        await node_allocator_ref.release_worker(address)
+        node_info_ref = await self._get_node_info_ref()
+        await node_info_ref.update_node_info(
+            address, NodeRole.WORKER, status=NodeStatus.STOPPED
+        )
+
+    async def reconstruct_worker(self, address: str):
+        node_allocator_ref = await self._get_node_allocator_ref()
+        await node_allocator_ref.reconstruct_worker(address)
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_node_allocator_ref(self):
+        from ..supervisor.node_allocator import NodeAllocatorActor
+
+        [node_allocator_ref] = await self.get_supervisor_refs(
+            [NodeAllocatorActor.default_uid()]
+        )
+        return node_allocator_ref
+
+    async def _get_process_info_manager_ref(self, address: str = None):
+        from ..procinfo import ProcessInfoManagerActor
+
+        return await mo.actor_ref(
+            ProcessInfoManagerActor.default_uid(), address=address or self._address
+        )
+
+    async def get_node_pool_configs(self, address: str = None) -> List[Dict]:
+        ref = await self._get_process_info_manager_ref(address)
+        return await ref.get_pool_configs()
+
+    async def get_node_thread_stacks(
+        self, address: str = None
+    ) -> List[Dict[int, List[str]]]:
+        ref = await self._get_process_info_manager_ref(address)
+        return await ref.get_thread_stacks()
+
+    async def _get_log_ref(self, address: str = None):
+        from ..file_logger import FileLoggerActor
+
+        return await mo.actor_ref(
+            FileLoggerActor.default_uid(), address=address or self._address
+        )
+
+    async def fetch_node_log(
+        self, size: int, address: str = None, offset: int = 0
+    ) -> str:
+        ref = await self._get_log_ref(address)
+        return await ref.fetch_logs(size, offset)
+
+
+class MockClusterAPI(ClusterAPI):
+    @classmethod
+    async def create(cls: Type[APIType], address: str, **kw) -> APIType:
+        from ..file_logger import FileLoggerActor
+        from ..procinfo import ProcessInfoManagerActor
+        from ..supervisor.locator import SupervisorPeerLocatorActor
+        from ..supervisor.node_allocator import NodeAllocatorActor
+        from ..supervisor.node_info import NodeInfoCollectorActor
+        from ..uploader import NodeInfoUploaderActor
+
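+        # spin up the actors a real cluster would provide; duplicates are tolerated below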
+        create_actor_coros = [
+            mo.create_actor(
+                SupervisorPeerLocatorActor,
+                "fixed",
+                address,
+                uid=SupervisorPeerLocatorActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                NodeInfoCollectorActor,
+                uid=NodeInfoCollectorActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                NodeAllocatorActor,
+                "fixed",
+                address,
+                uid=NodeAllocatorActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                NodeInfoUploaderActor,
+                NodeRole.WORKER,
+                interval=kw.get("upload_interval"),
+                band_to_resource=kw.get("band_to_resource"),
+                use_gpu=kw.get("use_gpu", False),
+                uid=NodeInfoUploaderActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                ProcessInfoManagerActor,
+                uid=ProcessInfoManagerActor.default_uid(),
+                address=address,
+            ),
+            mo.create_actor(
+                FileLoggerActor, uid=FileLoggerActor.default_uid(), address=address
+            ),
+        ]
+        dones, _ = await asyncio.wait(
+            [asyncio.ensure_future(coro) for coro in create_actor_coros]
+        )
+
+        for task in dones:
+            try:
+                task.result()
+            except mo.ActorAlreadyExist:  # pragma: no cover
+                pass
+
+        api = await super().create(address=address)
+        await api.mark_node_ready()
+        return api
+
+    @classmethod
+    async def cleanup(cls, address: str):
+        from ..file_logger import FileLoggerActor
+        from ..supervisor.locator import SupervisorPeerLocatorActor
+        from ..supervisor.node_info import NodeInfoCollectorActor
+        from ..uploader import NodeInfoUploaderActor
+
+        await asyncio.gather(
+            mo.destroy_actor(
+                mo.create_actor_ref(
+                    uid=SupervisorPeerLocatorActor.default_uid(), address=address
+                )
+            ),
+            mo.destroy_actor(
+                mo.create_actor_ref(
+                    uid=NodeInfoCollectorActor.default_uid(), address=address
+                )
+            ),
+            mo.destroy_actor(
+                mo.create_actor_ref(
+                    uid=NodeInfoUploaderActor.default_uid(), address=address
+                )
+            ),
+            mo.destroy_actor(
+                mo.create_actor_ref(uid=FileLoggerActor.default_uid(), address=address)
+            ),
+        )
diff --git a/python/xorbits/_mars/services/cluster/api/web.py b/python/xorbits/_mars/services/cluster/api/web.py
new file mode 100644
index 000000000..9755b9293
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/api/web.py
@@ -0,0 +1,378 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import time
+from typing import Callable, Dict, List, Optional, Set
+
+from ....lib.aio import alru_cache
+from ....resource import Resource
+from ....typing import BandType
+from ....utils import deserialize_serializable, serialize_serializable
+from ...core import NodeRole
+from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api
+from ..core import NodeStatus, watch_method
+from .core import AbstractClusterAPI
+
+
+class ClusterWebAPIHandler(MarsServiceWebAPIHandler):
+    _root_pattern = "/api/cluster"
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_cluster_api(self):
+        from ...cluster import ClusterAPI
+
+        return await ClusterAPI.create(self._supervisor_addr)
+
+    @staticmethod
+    def _convert_node_dict(node_info_list: Dict[str, Dict]):
+        res = {}
+        for node_addr, node in node_info_list.items():
+            res_dict = node.copy()
+            res_dict["status"] = res_dict["status"].value
+            res[node_addr] = res_dict
+        return res
+
+    @web_api("nodes", method=["get", "post"], cache_blocking=True)
+    async def get_nodes_info(self):
+        watch = bool(int(self.get_argument("watch", "0")))
+        env = bool(int(self.get_argument("env", "0")))
+        resource = bool(int(self.get_argument("resource", "0")))
+        detail = bool(int(self.get_argument("detail", "0")))
+
+        nodes_arg = self.get_argument("nodes", None)
+        nodes = nodes_arg.split(",") if nodes_arg is not None else None
+
+        role_arg = self.get_argument("role", None)
+        role = NodeRole(int(role_arg)) if role_arg is not None else None
+
+        statuses_arg = self.get_argument("statuses", None)
+        statuses = (
+            set(NodeStatus(int(v)) for v in statuses_arg.split(","))
+            if statuses_arg
+            else None
+        )
+
+        exclude_statuses_arg = self.get_argument("exclude_statuses", None)
+        exclude_statuses = (
+            set(NodeStatus(int(v)) for v in exclude_statuses_arg.split(","))
+            if exclude_statuses_arg
+            else None
+        )
+
+        statuses = WebClusterAPI._calc_statuses(statuses, exclude_statuses)
+
+        cluster_api = await self._get_cluster_api()
+        result = {}
+        if watch:
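+            # long-polling: wait for the first change notification and return it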
+            assert nodes is None
+            version = self.get_argument("version", "") or None
+            if version:
+                version = int(version)
+
+            async for version, node_infos in cluster_api.watch_nodes(
+                role,
+                env=env,
+                resource=resource,
+                detail=detail,
+                statuses=statuses,
+                version=version,
+            ):
+                result["version"] = version
+                result["nodes"] = self._convert_node_dict(node_infos)
+                break
+        else:
+            nodes = await cluster_api.get_nodes_info(
+                nodes=nodes,
+                role=role,
+                env=env,
+                resource=resource,
+                statuses=statuses,
+                detail=detail,
+            )
+            result["nodes"] = self._convert_node_dict(nodes)
+        self.write(json.dumps(result))
+
+    @web_api("bands", method="get", cache_blocking=True)
+    async def get_all_bands(self):
+        role_arg = self.get_argument("role", None)
+        role = NodeRole(int(role_arg)) if role_arg is not None else None
+        watch = bool(int(self.get_argument("watch", "0")))
+
+        statuses_arg = self.get_argument("statuses", None)
+        statuses = (
+            set(NodeStatus(int(v)) for v in statuses_arg.split(","))
+            if statuses_arg
+            else None
+        )
+
+        cluster_api = await self._get_cluster_api()
+        if watch:
+            version = self.get_argument("version", "") or None
+            if version:
+                version = int(version)
+
+            async for version, bands in cluster_api.watch_all_bands(
+                role, statuses=statuses, version=version
+            ):
+                self.write(serialize_serializable((version, bands)))
+                break
+        else:
+            self.write(
+                serialize_serializable(
+                    await cluster_api.get_all_bands(role, statuses=statuses)
+                )
+            )
+
+    @web_api("versions", method="get", cache_blocking=True)
+    async def get_mars_versions(self):
+        cluster_api = await self._get_cluster_api()
+        self.write(json.dumps(list(await cluster_api.get_mars_versions())))
+
+    @web_api("pools", method="get", cache_blocking=True)
+    async def get_node_pool_configs(self):
+        cluster_api = await self._get_cluster_api()
+        address = self.get_argument("address", "") or None
+        pools = list(await cluster_api.get_node_pool_configs(address))
+        # The logging_conf field cannot be serialized to JSON and is not
+        # used by the front end, so it is removed here.
+        for pool in pools:
+            pool.pop("logging_conf", None)
+        self.write(json.dumps({"pools": pools}))
+
+    @web_api("stacks", method="get", cache_blocking=True)
+    async def get_node_thread_stacks(self):
+        cluster_api = await self._get_cluster_api()
+        address = self.get_argument("address", "") or None
+        stacks = list(await cluster_api.get_node_thread_stacks(address))
+        self.write(
+            json.dumps(
+                {
+                    "generate_time": time.time(),
+                    "stacks": stacks,
+                }
+            )
+        )
+
+    @web_api("logs", method="get", cache_blocking=True)
+    async def fetch_node_log(self):
+        cluster_api = await self._get_cluster_api()
+        address = self.get_argument("address", "") or None
+        # 10MB by default
+        size = int(self.get_argument("size", str(10 * 1024 * 1024)))
+        offset = 0
+        content = await cluster_api.fetch_node_log(size, address=address, offset=offset)
+        if size != -1:
+            self.write(json.dumps({"content": content}))
+        # size == -1 means downloading the whole current log file
+        else:
+            self.set_header("Content-Type", "application/octet-stream")
+            self.set_header("Content-Disposition", "attachment")
+
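+            # stream the log in chunks until an empty read signals end of file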
+            while True:
+                if len(content) == 0:  # reached end of file
+                    await self.finish()
+                    break
+                else:
+                    self.write(content)
+                    await self.flush()
+                offset = offset + len(content)
+                content = await cluster_api.fetch_node_log(
+                    size, address=address, offset=offset
+                )
+
+
+web_handlers = {ClusterWebAPIHandler.get_root_pattern(): ClusterWebAPIHandler}
+
+
+class WebClusterAPI(AbstractClusterAPI, MarsWebAPIClientMixin):
+    def __init__(self, address: str, request_rewriter: Callable = None):
+        self._address = address.rstrip("/")
+        self.request_rewriter = request_rewriter
+
+    @staticmethod
+    def _convert_node_dict(node_info_list: Dict[str, Dict]):
+        res = {}
+        for node_addr, node in node_info_list.items():
+            res_dict = node.copy()
+            res_dict["status"] = NodeStatus(res_dict["status"])
+            res[node_addr] = res_dict
+        return res
+
+    async def _get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        watch: bool = False,
+        statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        statuses_str = (
+            ",".join(str(status.value) for status in statuses) if statuses else ""
+        )
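+        # assemble query arguments; entries with value None are dropped below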
+        args = [
+            ("nodes", ",".join(nodes) if nodes else None),
+            ("role", role.value if role is not None else None),
+            ("env", 1 if env else 0),
+            ("resource", 1 if resource else 0),
+            ("detail", 1 if detail else 0),
+            ("watch", 1 if watch else 0),
+            ("statuses", statuses_str),
+            ("version", str(version or "")),
+        ]
+        args_str = "&".join(f"{key}={val}" for key, val in args if val is not None)
+
+        path = f"{self._address}/api/cluster/nodes"
+        res = await self._request_url(
+            path=path,
+            method="POST",
+            data=args_str,
+            headers={"Content-Type": "application/x-www-form-urlencoded"},
+        )
+        result = json.loads(res.body)
+        if watch:
+            return result["version"], self._convert_node_dict(result["nodes"])
+        else:
+            return self._convert_node_dict(result["nodes"])
+
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        statuses = (
+            {NodeStatus.READY}
+            if filter_ready
+            else {NodeStatus.STARTING, NodeStatus.READY}
+        )
+        res = await self._get_nodes_info(role=NodeRole.SUPERVISOR, statuses=statuses)
+        return list(res.keys())
+
+    @watch_method
+    async def watch_supervisors(self, version: Optional[int] = None):
+        version, res = await self._get_nodes_info(
+            role=NodeRole.SUPERVISOR, watch=True, version=version
+        )
+        return version, list(res.keys())
+
+    async def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ):
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        return await self._get_nodes_info(
+            nodes,
+            role=role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            watch=False,
+            statuses=statuses,
+        )
+
+    @watch_method
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ) -> List[Dict[str, Dict]]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        return await self._get_nodes_info(
+            role=role,
+            env=env,
+            resource=resource,
+            detail=detail,
+            watch=True,
+            statuses=statuses,
+            version=version,
+        )
+
+    async def get_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+    ) -> Dict[BandType, Resource]:
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        statuses_str = (
+            ",".join(str(status.value) for status in statuses) if statuses else ""
+        )
+        params = {}
+        if role is not None:  # pragma: no cover
+            params["role"] = role.value
+        if statuses_str:
+            params["statuses"] = statuses_str
+
+        path = f"{self._address}/api/cluster/bands"
+        res = await self._request_url("GET", path, params=params)
+        return deserialize_serializable(res.body)
+
+    @watch_method
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        exclude_statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        statuses = self._calc_statuses(statuses, exclude_statuses)
+        statuses_str = (
+            ",".join(str(status.value) for status in statuses) if statuses else ""
+        )
+        params = dict(watch=1, version=str(version or ""))
+        if role is not None:  # pragma: no cover
+            params["role"] = role.value
+        if statuses_str:
+            params["statuses"] = statuses_str
+
+        path = f"{self._address}/api/cluster/bands"
+        res = await self._request_url("GET", path, params=params)
+        return deserialize_serializable(res.body)
+
+    async def get_mars_versions(self) -> List[str]:
+        path = f"{self._address}/api/cluster/versions"
+        res = await self._request_url("GET", path)
+        return list(json.loads(res.body))
+
+    async def get_node_pool_configs(self, address: str) -> List[Dict]:
+        path = f"{self._address}/api/cluster/pools?address={address}"
+        res = await self._request_url("GET", path)
+        return list(json.loads(res.body)["pools"])
+
+    async def get_node_thread_stacks(self, address: str) -> List[Dict]:
+        path = f"{self._address}/api/cluster/stacks?address={address}"
+        res = await self._request_url("GET", path)
+        return list(json.loads(res.body)["stacks"])
+
+    async def fetch_node_log(
+        self, size: int = None, address: str = None, offset: int = 0
+    ) -> str:
+        path = f"{self._address}/api/cluster/logs?address={address}"
+        if size is not None:
+            path += f"&&size={size}"
+        res = await self._request_url("GET", path)
+        if size == -1:
+            return res.body.decode(encoding="utf8")
+        else:
+            return str(json.loads(res.body)["content"])
diff --git a/python/xorbits/_mars/services/cluster/backends/__init__.py b/python/xorbits/_mars/services/cluster/backends/__init__.py
new file mode 100644
index 000000000..4696550f1
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/backends/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import AbstractClusterBackend, get_cluster_backend, register_cluster_backend
+from .fixed import FixedClusterBackend
diff --git a/python/xorbits/_mars/services/cluster/backends/base.py b/python/xorbits/_mars/services/cluster/backends/base.py
new file mode 100644
index 000000000..f44811c1a
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/backends/base.py
@@ -0,0 +1,103 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Dict, List, Optional, Type
+
+from ..core import NodeRole
+
+
+class AbstractClusterBackend(ABC):
+    name = None
+
+    @classmethod
+    @abstractmethod
+    async def create(
+        cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str
+    ) -> "AbstractClusterBackend":
+        """
+
+        Parameters
+        ----------
+        node_role
+        lookup_address
+        pool_address
+
+        Returns
+        -------
+
+        """
+
+    @abstractmethod
+    async def watch_supervisors(self) -> AsyncGenerator[List[str], None]:
+        """
+        Watch changes of supervisors
+
+        Returns
+        -------
+        out : AsyncGenerator[List[str]]
+            Generator of lists of supervisors
+        """
+
+    @abstractmethod
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        """
+        Get list of supervisors
+
+        Parameters
+        ----------
+        filter_ready : bool
+            if True, return ready nodes only; otherwise return both starting and ready nodes
+
+        Returns
+        -------
+        out : List[str]
+            List of supervisors
+        """
+
+    @abstractmethod
+    async def request_worker(
+        self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None
+    ) -> str:
+        """
+        Create a new worker
+
+        Returns
+        -------
+        Address of the newly created worker
+        """
+
+    @abstractmethod
+    async def release_worker(self, address: str):
+        """
+        Release a worker back to the cluster
+        """
+
+    @abstractmethod
+    async def reconstruct_worker(self, address: str):
+        """
+        Reconstruct a worker
+        """
+
+
+_cluster_backend_types: Dict[str, Type[AbstractClusterBackend]] = dict()
+
+
+def register_cluster_backend(backend: Type[AbstractClusterBackend]):
+    _cluster_backend_types[backend.name] = backend
+    return backend
+
+
+def get_cluster_backend(backend_name: str) -> Type[AbstractClusterBackend]:
+    return _cluster_backend_types[backend_name]
diff --git a/python/xorbits/_mars/services/cluster/backends/fixed.py b/python/xorbits/_mars/services/cluster/backends/fixed.py
new file mode 100644
index 000000000..c5085a6b3
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/backends/fixed.py
@@ -0,0 +1,51 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import AsyncGenerator, List, Optional, Union
+
+from ..core import NodeRole
+from .base import AbstractClusterBackend, register_cluster_backend
+
+
+@register_cluster_backend
+class FixedClusterBackend(AbstractClusterBackend):
+    name = "fixed"
+
+    def __init__(self, lookup_address: Union[List[str], str]):
+        if isinstance(lookup_address, str):
+            lookup_address = lookup_address.split(",")
+        self._supervisors = [n.strip() for n in lookup_address]
+
+    @classmethod
+    async def create(
+        cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str
+    ):
+        return cls(lookup_address)
+
+    async def watch_supervisors(self) -> AsyncGenerator[List[str], None]:
+        yield self._supervisors
+
+    async def get_supervisors(self, filter_ready: bool = True) -> List[str]:
+        return self._supervisors
+
+    async def request_worker(
+        self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None
+    ) -> str:
+        raise NotImplementedError
+
+    async def release_worker(self, address: str):
+        raise NotImplementedError
+
+    async def reconstruct_worker(self, address: str):
+        raise NotImplementedError
diff --git a/python/xorbits/_mars/services/cluster/core.py b/python/xorbits/_mars/services/cluster/core.py
new file mode 100644
index 000000000..26ef3e4a4
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/core.py
@@ -0,0 +1,126 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import enum
+import functools
+import time
+from dataclasses import dataclass, field
+from typing import (
+    AsyncGenerator,
+    Awaitable,
+    Callable,
+    Dict,
+    Optional,
+    Set,
+    Tuple,
+    TypeVar,
+)
+
+from ...serialization.serializables import (
+    Float64Field,
+    Int32Field,
+    Int64Field,
+    Serializable,
+    StringField,
+)
+from ...storage import StorageLevel
+from ..core import NodeRole
+
+
+class NodeStatus(enum.Enum):
+    STARTING = 0
+    READY = 1
+    DEGENERATED = 2
+    STOPPING = 3
+    STOPPED = -1
+
+
+@dataclass
+class NodeInfo:
+    role: NodeRole
+    status: NodeStatus = NodeStatus.READY
+    update_time: float = field(default_factory=time.time)
+    env: Dict = field(default_factory=dict)
+    resource: Dict = field(default_factory=dict)
+    detail: Dict = field(default_factory=dict)
+
+
+class WatchNotifier:
+    _events: Set[asyncio.Event]
+
+    def __init__(self):
+        self._event = asyncio.Event()
+        self._lock = asyncio.Lock()
+        self._version = 0
+
+    async def watch(self, version: Optional[int] = None):
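+        # if the caller's version is stale, return the current one immediately;
+        # otherwise block until the next notify() call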
+        if version != self._version:
+            return self._version
+        await self._event.wait()
+        return self._version
+
+    async def notify(self):
+        async with self._lock:
+            self._version += 1
+            self._event.set()
+            self._event = asyncio.Event()
+
+
+RetType = TypeVar("RetType")
+
+
+def watch_method(
+    func: Callable[..., Awaitable[Tuple[int, RetType]]]
+) -> Callable[..., AsyncGenerator[RetType, None]]:
+    @functools.wraps(func)
+    async def wrapped(*args, **kwargs):
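+        # a caller supplying an explicit version gets a single watch round;
+        # otherwise loop forever, feeding each returned version into the next call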
+        if "version" in kwargs:
+            yield await func(*args, **kwargs)
+            return
+
+        kwargs["version"] = None
+        while True:
+            version, val = await func(*args, **kwargs)
+            kwargs["version"] = version
+            yield val
+
+    return wrapped
+
+
+class WorkerSlotInfo(Serializable):
+    slot_id: int = Int32Field("slot_id")
+    session_id: str = StringField("session_id")
+    subtask_id: str = StringField("subtask_id")
+    processor_usage: float = Float64Field("processor_usage")
+
+
+class QuotaInfo(Serializable):
+    quota_size: int = Int64Field("quota_size")
+    allocated_size: int = Int64Field("allocated_size")
+    hold_size: int = Int64Field("hold_size")
+
+
+class StorageInfo(Serializable):
+    storage_level: StorageLevel = Int32Field(
+        "storage_level", on_serialize=lambda x: x.value, on_deserialize=StorageLevel
+    )
+    total_size: int = Int64Field("total_size")
+    used_size: int = Int64Field("used_size")
+    pinned_size: int = Int64Field("pinned_size", default=None)
+
+
+class DiskInfo(Serializable):
+    path: str = StringField("path")
+    limit_size: int = Int64Field("limit_size", default=None)
diff --git a/python/xorbits/_mars/services/cluster/file_logger.py b/python/xorbits/_mars/services/cluster/file_logger.py
new file mode 100644
index 000000000..9e1685e4d
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/file_logger.py
@@ -0,0 +1,91 @@
+# Copyright 2022 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+
+from ... import oscar as mo
+from ...constants import MARS_LOG_PATH_KEY
+
+logger = logging.getLogger(__name__)
+
+
+class FileLoggerActor(mo.Actor):
+    """
+    Read the log file path from an environment variable (sourced from the YAML config)
+    on each node, supervisors and workers alike, and expose an interface for the web
+    frontend to fetch log content.
+    """
+
+    def __init__(self):
+        file_path = os.environ.get(MARS_LOG_PATH_KEY)
+        self._log_filename = file_path
+
+    def fetch_logs(self, size: int, offset: int) -> str:
+        """
+        Externally exposed interface.
+
+        Parameters
+        ----------
+        size
+        offset
+
+        Returns
+        -------
+
+        """
+        if size != -1:
+            content = self._get_n_bytes_tail_file(size)
+        else:
+            content = self._get_n_bytes_from_pos(10 * 1024 * 1024, offset)
+        return content
+
+    def _get_n_bytes_tail_file(self, bytes_num: int) -> str:
+        """
+        Read last n bytes of file.
+
+        Parameters
+        ----------
+        bytes_num: the bytes to read. -1 means read the whole file.
+
+        Returns
+        -------
+
+        """
+        f_size = os.stat(self._log_filename).st_size
+        target = f_size - bytes_num if f_size > bytes_num else 0
+        with open(self._log_filename) as f:
+            f.seek(target)
+            if target == 0:
+                res = f.read()
+            else:
+                f.readline()
+                res = f.read()
+
+        return res
+
+    def _get_n_bytes_from_pos(self, size: int, offset: int) -> str:
+        """
+        Read n bytes from a position.
+        Parameters
+        ----------
+        size
+        offset
+
+        Returns
+        -------
+
+        """
+        with open(self._log_filename) as f:
+            f.seek(offset)
+            res = f.read(size)
+        return res
diff --git a/python/xorbits/_mars/services/cluster/gather.py b/python/xorbits/_mars/services/cluster/gather.py
new file mode 100644
index 000000000..7feeb0bb8
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/gather.py
@@ -0,0 +1,265 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import platform
+import socket
+import sys
+from typing import Dict, List
+
+import numpy as np
+import pandas as pd
+
+from ...resource import Resource, ZeroResource
+
+try:
+    import scipy
+except ImportError:  # pragma: no cover
+    scipy = None
+
+from ... import resource as mars_resource
+from ...config import options
+from ...storage import StorageLevel
+from ...utils import git_info, lazy_import
+from .core import DiskInfo, QuotaInfo, StorageInfo, WorkerSlotInfo
+
+cp = lazy_import("cupy", rename="cp")
+cudf = lazy_import("cudf")
+
+logger = logging.getLogger(__name__)
+
+_is_initial = True
+
+
+def gather_node_env():
+    from ... import __version__ as mars_version
+    from ...lib.mkl_interface import mkl_get_version
+    from ...lib.nvutils import NVError
+
+    global _is_initial
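+    # prime CPU usage sampling on the first call; the initial reading is typically meaningless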
+    if _is_initial:
+        _is_initial = False
+        mars_resource.cpu_percent()
+
+    mem_stats = mars_resource.virtual_memory()
+
+    node_info = {
+        "command_line": sys.argv,
+        "platform": platform.platform(),
+        "host_name": socket.gethostname(),
+        "python_version": sys.version,
+        "mars_version": mars_version,
+        "cpu_total": mars_resource.cpu_count(),
+        "memory_total": mem_stats.total,
+        "options": options.to_dict(),
+    }
+
+    if "MARS_K8S_POD_NAME" in os.environ:
+        node_info["k8s_pod_name"] = os.environ["MARS_K8S_POD_NAME"]
+    if "CONTAINER_ID" in os.environ:
+        node_info["yarn_container_id"] = os.environ["CONTAINER_ID"]
+
+    try:
+        cuda_info = mars_resource.cuda_info()
+    except NVError:  # pragma: no cover
+        logger.exception("NVError encountered, cannot gather CUDA devices.")
+        cuda_info = None
+
+    if cuda_info:
+        node_info["cuda_info"] = {
+            "driver": cuda_info.driver_version,
+            "cuda": cuda_info.cuda_version,
+            "products": list(cuda_info.products),
+        }
+
+    package_vers = {
+        "numpy": np.__version__,
+        "pandas": pd.__version__,
+    }
+    if hasattr(np, "__mkl_version__") and mkl_get_version:
+        mkl_version = mkl_get_version()
+        package_vers[
+            "mkl"
+        ] = f"{mkl_version.major}.{mkl_version.minor}.{mkl_version.update}"
+
+    if scipy is not None:
+        package_vers["scipy"] = scipy.__version__
+    if cp is not None:
+        package_vers["cupy"] = cp.__version__
+    if cudf is not None:
+        package_vers["cudf"] = cudf.__version__
+
+    node_info["package_versions"] = package_vers
+
+    git = git_info()
+    if git:
+        node_info["git_info"] = {
+            "hash": git.commit_hash,
+            "ref": git.commit_ref,
+        }
+
+    bands = node_info["bands"] = dict()
+
+    cpu_band = {
+        "resources": {
+            "cpu": mars_resource.cpu_count(),
+            "memory": mars_resource.virtual_memory().total,
+        }
+    }
+    # TODO: NUMA can be supported by adding more bands
+    bands["numa-0"] = cpu_band
+
+    for idx, gpu_card_stat in enumerate(
+        mars_resource.cuda_card_stats()
+    ):  # pragma: no cover
+        bands[f"gpu-{idx}"] = {
+            "resources": {
+                "gpu": 1,
+                "memory": gpu_card_stat.fb_mem_info.total,
+            }
+        }
+    return node_info
+
+
+def gather_node_resource(band_to_resource: Dict[str, Resource] = None, use_gpu=True):
+    # TODO: NUMA can be supported by adding more bands
+    res = dict()
+    mem_info = mars_resource.virtual_memory()
+    num_cpu = (
+        mars_resource.cpu_count()
+        if band_to_resource is None
+        else band_to_resource.get("numa-0", ZeroResource).num_cpus
+    )
+    mem_bytes = (
+        mem_info.total
+        if band_to_resource is None
+        else band_to_resource.get("numa-0", ZeroResource).mem_bytes
+    )
+    if num_cpu:  # pragma: no branch
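+        # cpu_avail estimates free cores: total count minus current usage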
+        res["numa-0"] = {
+            "cpu_avail": mars_resource.cpu_count()
+            - mars_resource.cpu_percent() / 100.0,
+            "cpu_total": num_cpu,
+            "memory_avail": mem_info.available,
+            "memory_total": min(mem_info.total, mem_bytes),
+        }
+
+    if use_gpu:
+        for idx, gpu_card_stat in enumerate(
+            mars_resource.cuda_card_stats()
+        ):  # pragma: no cover
+            num_gpu = (
+                1
+                if band_to_resource is None
+                else band_to_resource.get(f"gpu-{idx}", ZeroResource).num_gpus
+            )
+            if not num_gpu:
+                continue
+            res[f"gpu-{idx}"] = {
+                "gpu_avail": 1 - gpu_card_stat.gpu_usage,
+                "gpu_total": num_gpu,
+                "memory_avail": gpu_card_stat.fb_mem_info.available,
+                "memory_total": gpu_card_stat.fb_mem_info.total,
+            }
+    return res
+
+
+def gather_node_details(
+    band_slot_infos: Dict[str, List[WorkerSlotInfo]] = None,
+    band_quota_infos: Dict[str, QuotaInfo] = None,
+    disk_infos: List[DiskInfo] = None,
+    band_storage_infos: Dict[str, Dict[StorageLevel, StorageInfo]] = None,
+):
+    disk_io_usage = mars_resource.disk_io_usage()
+    net_io_usage = mars_resource.net_io_usage()
+    res = {
+        "disk": dict(zip(("reads", "writes"), disk_io_usage))
+        if disk_io_usage
+        else dict(),
+        "network": dict(zip(("receives", "sends"), net_io_usage))
+        if net_io_usage
+        else dict(),
+        "iowait": mars_resource.iowait(),
+    }
+
+    if disk_infos:
+        part_dict = dict()
+        for info in disk_infos:
+            part_dev = mars_resource.get_path_device(info.path)
+            if part_dev in part_dict:
+                continue
+
+            disk_usage_result = mars_resource.disk_usage(info.path)
+            io_usage_result = mars_resource.disk_io_usage(info.path)
+            part_dict[part_dev] = disk_info = {
+                "size_limit": info.limit_size,
+                "size_used": disk_usage_result.used,
+                "size_total": disk_usage_result.total,
+            }
+            if io_usage_result is not None:
+                disk_info.update(
+                    {
+                        "reads": io_usage_result.reads if io_usage_result else None,
+                        "writes": io_usage_result.writes if io_usage_result else None,
+                    }
+                )
+            if not sys.platform.startswith("win"):
+                in_usage_result = os.statvfs(info.path)
+                disk_info.update(
+                    {
+                        "inode_used": in_usage_result.f_files
+                        - in_usage_result.f_favail,
+                        "inode_total": in_usage_result.f_files,
+                    }
+                )
+        res["disk"]["partitions"] = part_dict
+
+    band_slot_infos = band_slot_infos or dict()
+    res["slot"] = {
+        band: [
+            {
+                "slot_id": slot_info.slot_id,
+                "session_id": slot_info.session_id,
+                "subtask_id": slot_info.subtask_id,
+                "processor_usage": slot_info.processor_usage,
+            }
+            for slot_info in slot_infos
+        ]
+        for band, slot_infos in band_slot_infos.items()
+    }
+
+    band_quota_infos = band_quota_infos or dict()
+    res["quota"] = {
+        band: {
+            "quota_size": quota_info.quota_size,
+            "allocated_size": quota_info.allocated_size,
+            "hold_size": quota_info.hold_size,
+        }
+        for band, quota_info in band_quota_infos.items()
+    }
+
+    band_storage_infos = band_storage_infos or dict()
+    res["storage"] = {
+        band: {
+            level.name.lower(): {
+                "size_used": storage_info.used_size,
+                "size_total": storage_info.total_size,
+                "size_pinned": storage_info.pinned_size,
+            }
+            for level, storage_info in storage_infos.items()
+        }
+        for band, storage_infos in band_storage_infos.items()
+    }
+    return res
diff --git a/python/xorbits/_mars/services/cluster/locator.py b/python/xorbits/_mars/services/cluster/locator.py
new file mode 100644
index 000000000..28c34d281
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/locator.py
@@ -0,0 +1,114 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+from typing import List, Optional, Set
+
+from ... import oscar as mo
+from ...lib.uhashring import HashRing
+from .backends import AbstractClusterBackend, get_cluster_backend
+from .core import NodeRole, WatchNotifier
+
+logger = logging.getLogger(__name__)
+
+
+class SupervisorLocatorActor(mo.Actor):
+    _backend: Optional[AbstractClusterBackend]
+    _node_role: NodeRole = None
+
+    def __init__(self, backend_name: str, lookup_address: str):
+        self._backend_name = backend_name
+        self._lookup_address = lookup_address
+        self._backend = None
+        self._supervisors = None
+        self._hash_ring = None
+
+        self._watch_notifier = WatchNotifier()
+        self._watch_task = None
+
+    async def __post_create__(self):
+        backend_cls = get_cluster_backend(self._backend_name)
+        self._backend = await backend_cls.create(
+            self._node_role, self._lookup_address, self.address
+        )
+        await self._set_supervisors(await self._get_supervisors_from_backend())
+
+        self._watch_task = asyncio.create_task(self._watch_supervisor_changes())
+
+    async def __pre_destroy__(self):
+        self._watch_task.cancel()
+
+    async def _set_supervisors(self, supervisors: List[str]):
+        self._supervisors = supervisors
+        self._hash_ring = HashRing(nodes=supervisors, hash_fn="ketama")
+        await self._watch_notifier.notify()
+
+    async def _get_supervisors_from_backend(self, filter_ready: bool = True):
+        raise NotImplementedError
+
+    def _watch_supervisors_from_backend(self):
+        raise NotImplementedError
+
+    def _if_set_supervisors(
+        self, current_supervisors: Set[str], last_supervisors: Set[str]
+    ):
+        return current_supervisors != last_supervisors
+
+    async def _watch_supervisor_changes(self):
+        last_supervisors = set()
+        try:
+            async for sv_list in self._watch_supervisors_from_backend():
+                if self._if_set_supervisors(set(sv_list), last_supervisors):
+                    await self._set_supervisors(sv_list)
+                    last_supervisors = set(sv_list)
+        except asyncio.CancelledError:
+            return
+
+    async def get_supervisors(self, filter_ready: bool = True):
+        if filter_ready:
+            return self._supervisors
+        else:
+            return await self._get_supervisors_from_backend(filter_ready=filter_ready)
+
+    @mo.extensible
+    def get_supervisor(self, key: str, size=1):
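+        # consistent hashing keeps the key-to-supervisor mapping stable across membership changes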
+        if not self._supervisors:
+            return None
+        elif size == 1:
+            return self._hash_ring.get_node(key)
+        else:
+            return tuple(it["nodename"] for it in self._hash_ring.range(key, size=size))
+
+    async def watch_supervisors(self, version: Optional[int] = None):
+        version = yield self._watch_notifier.watch(version)
+        raise mo.Return((version, self._supervisors))
+
+    async def watch_supervisors_by_keys(
+        self, keys: List[str], version: Optional[int] = None
+    ):
+        version = yield self._watch_notifier.watch(version)
+        raise mo.Return((version, [self.get_supervisor(k) for k in keys]))
+
+    async def wait_all_supervisors_ready(self):
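+        # loop until the READY supervisors match the full list reported by the backend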
+        version = None
+        while True:
+            expected_supervisors = await self._get_supervisors_from_backend(
+                filter_ready=False
+            )
+            if self._supervisors and set(self._supervisors) == set(
+                expected_supervisors
+            ):
+                break
+            version = yield self._watch_notifier.watch(version)
diff --git a/python/xorbits/_mars/services/cluster/procinfo.py b/python/xorbits/_mars/services/cluster/procinfo.py
new file mode 100644
index 000000000..2837dcac3
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/procinfo.py
@@ -0,0 +1,95 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import sys
+import threading
+import traceback
+from typing import Dict, List
+
+from ... import oscar as mo
+from ...oscar.backends.allocate_strategy import ProcessIndex
+
+
+class ProcessInfoManagerActor(mo.StatelessActor):
+    _process_refs: List[mo.ActorRef]
+
+    def __init__(self):
+        self._process_refs = []
+        self._pool_configs = []
+
+    async def __post_create__(self):
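+        # probe process indices one by one; actor creation raises IndexError once the
+        # index exceeds the number of processes in the pool, which ends discovery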
+        index = 0
+        while True:
+            try:
+                ref = await mo.create_actor(
+                    ProcessInfoActor,
+                    process_index=index,
+                    uid=ProcessInfoActor.gen_uid(index),
+                    address=self.address,
+                    allocate_strategy=ProcessIndex(index),
+                )
+            except IndexError:
+                break
+
+            index += 1
+            self._process_refs.append(ref)
+
+        self._pool_configs = await asyncio.gather(
+            *[ref.get_pool_config() for ref in self._process_refs]
+        )
+
+    async def get_pool_configs(self) -> List[Dict]:
+        return self._pool_configs
+
+    async def get_thread_stacks(self) -> List[Dict[str, List[str]]]:
+        stack_tasks = [
+            asyncio.create_task(ref.get_thread_stacks()) for ref in self._process_refs
+        ]
+        await asyncio.wait(stack_tasks, return_when=asyncio.ALL_COMPLETED)
+
+        results = []
+        for fut in stack_tasks:
+            try:
+                results.append(fut.result())
+            except (mo.ActorNotExist, mo.ServerClosed):
+                results.append(None)
+        return results
+
+
+class ProcessInfoActor(mo.StatelessActor):
+    def __init__(self, process_index: int = 0):
+        self._process_index = process_index
+        self._pool_config = None
+
+    async def __post_create__(self):
+        self._pool_config = await mo.get_pool_config(self.address)
+
+    @classmethod
+    def gen_uid(cls, process_index: int):
+        return f"process_info_{process_index}"
+
+    def get_pool_config(self) -> dict:
+        idx = self._pool_config.get_process_index(self.address)
+        return self._pool_config.get_pool_config(idx)
+
+    @classmethod
+    def get_thread_stacks(cls) -> Dict[str, List[str]]:
+        frames = sys._current_frames()
+        stacks = dict()
+        for th in threading.enumerate():
+            tid = getattr(th, "native_id", th.ident)
+            stack_key = f"{tid}:{th.name}"
+            stacks[stack_key] = traceback.format_stack(frames[th.ident])
+        return stacks
diff --git a/python/xorbits/_mars/services/cluster/supervisor/__init__.py b/python/xorbits/_mars/services/cluster/supervisor/__init__.py
new file mode 100644
index 000000000..a258409aa
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .service import ClusterSupervisorService
diff --git a/python/xorbits/_mars/services/cluster/supervisor/locator.py b/python/xorbits/_mars/services/cluster/supervisor/locator.py
new file mode 100644
index 000000000..9e1691648
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/locator.py
@@ -0,0 +1,51 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import oscar as mo
+from ....lib.aio import alru_cache
+from ..core import NodeRole
+from ..locator import SupervisorLocatorActor
+
+
+class SupervisorPeerLocatorActor(SupervisorLocatorActor):
+    _node_role = NodeRole.SUPERVISOR
+
+    @classmethod
+    def default_uid(cls):
+        return SupervisorLocatorActor.__name__
+
+    async def __post_create__(self):
+        await super().__post_create__()
+
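+        # register all configured supervisors as STARTING so peers appear in node
+        # info before their first heartbeat arrives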
+        supervisors = await self._backend.get_supervisors(filter_ready=False)
+        try:
+            node_info_ref = await self._get_node_info_ref()
+            await node_info_ref.put_starting_nodes(supervisors, NodeRole.SUPERVISOR)
+        except mo.ActorNotExist:  # pragma: no cover
+            pass
+
+    @alru_cache(cache_exceptions=False)
+    async def _get_node_info_ref(self):
+        from .node_info import NodeInfoCollectorActor
+
+        return await mo.actor_ref(
+            uid=NodeInfoCollectorActor.default_uid(), address=self.address
+        )
+
+    async def _get_supervisors_from_backend(self, filter_ready: bool = True):
+        return await self._backend.get_supervisors(filter_ready=filter_ready)
+
+    async def _watch_supervisors_from_backend(self):
+        async for supervisors in self._backend.watch_supervisors():
+            yield supervisors
diff --git a/python/xorbits/_mars/services/cluster/supervisor/node_allocator.py b/python/xorbits/_mars/services/cluster/supervisor/node_allocator.py
new file mode 100644
index 000000000..4a97a273b
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/node_allocator.py
@@ -0,0 +1,45 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from .... import oscar as mo
+from ...core import NodeRole
+from ..backends import AbstractClusterBackend, get_cluster_backend
+
+
+class NodeAllocatorActor(mo.StatelessActor):
+    def __init__(self, backend_name: str, lookup_address: str):
+        self._backend_name = backend_name
+        self._lookup_address = lookup_address
+        self._backend: Optional[AbstractClusterBackend] = None
+
+    async def __post_create__(self):
+        backend_cls = get_cluster_backend(self._backend_name)
+        self._backend = await backend_cls.create(
+            NodeRole.WORKER, self._lookup_address, self.address
+        )
+
+    async def request_worker(
+        self, worker_cpu: int, worker_mem: int, timeout: int = None
+    ) -> str:
+        return await self._backend.request_worker(
+            worker_cpu, worker_mem, timeout=timeout
+        )
+
+    async def release_worker(self, address: str):
+        await self._backend.release_worker(address)
+
+    async def reconstruct_worker(self, address: str):
+        await self._backend.reconstruct_worker(address)
diff --git a/python/xorbits/_mars/services/cluster/supervisor/node_info.py b/python/xorbits/_mars/services/cluster/supervisor/node_info.py
new file mode 100644
index 000000000..c36ed92fb
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/node_info.py
@@ -0,0 +1,215 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from collections import defaultdict
+from typing import Dict, List, Optional, Set
+
+from .... import oscar as mo
+from ....resource import Resource
+from ....typing import BandType
+from ...core import NodeRole
+from ..core import NodeInfo, NodeStatus, WatchNotifier
+
+DEFAULT_NODE_DEAD_TIMEOUT = 120
+DEFAULT_NODE_CHECK_INTERVAL = 1
+
+
+class NodeInfoCollectorActor(mo.Actor):
+    _node_infos: Dict[str, NodeInfo]
+
+    def __init__(self, timeout=None, check_interval=None):
+        self._role_to_nodes = defaultdict(set)
+        self._role_to_notifier = defaultdict(WatchNotifier)
+
+        self._node_infos = dict()
+
+        self._node_timeout = timeout or DEFAULT_NODE_DEAD_TIMEOUT
+        self._check_interval = check_interval or DEFAULT_NODE_CHECK_INTERVAL
+        self._check_task = None
+
+    async def __post_create__(self):
+        self._check_task = self.ref().check_dead_nodes.tell_delay(
+            delay=self._check_interval
+        )
+
+    async def __pre_destroy__(self):
+        self._check_task.cancel()
+
+    async def check_dead_nodes(self):
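+        # mark READY nodes as STOPPED when no update arrived within the node timeout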
+        affect_roles = set()
+        for address, info in self._node_infos.items():
+            if (
+                info.status == NodeStatus.READY
+                and time.time() - info.update_time > self._node_timeout
+            ):
+                info.status = NodeStatus.STOPPED
+                node_role = info.role
+                affect_roles.add(node_role)
+
+        if affect_roles:
+            await self._notify_roles(affect_roles)
+
+        self._check_task = self.ref().check_dead_nodes.tell_delay(
+            delay=self._check_interval
+        )
+
+    async def _notify_roles(self, roles):
+        for role in roles:
+            await self._role_to_notifier[role].notify()
+
+    async def update_node_info(
+        self,
+        address: str,
+        role: NodeRole,
+        env: Dict = None,
+        resource: Dict = None,
+        detail: Dict = None,
+        status: NodeStatus = None,
+    ):
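+        # the first report from an address, or a status change, triggers watcher notification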
+        need_notify = False
+        if address not in self._node_infos:
+            need_notify = True
+            info = self._node_infos[address] = NodeInfo(role=role, status=status)
+        else:
+            info = self._node_infos[address]
+
+        info.update_time = time.time()
+        if env is not None:
+            info.env.update(env)
+        if resource is not None:
+            info.resource.update(resource)
+        if detail is not None:
+            info.detail.update(detail)
+        if status is not None:
+            need_notify = need_notify or (info.status != status)
+            info.status = status
+
+        if need_notify:
+            self._role_to_nodes[role].add(address)
+            await self._notify_roles([role])
+
+    def get_nodes_info(
+        self,
+        nodes: List[str] = None,
+        role: NodeRole = None,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+    ):
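+        # default to READY nodes; when no explicit node list is given, restrict by role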
+        statuses = statuses or {NodeStatus.READY}
+        if nodes is None:
+            nodes = (
+                self._role_to_nodes.get(role)
+                if role is not None
+                else self._node_infos.keys()
+            )
+            nodes = nodes or []
+        ret_infos = dict()
+        for node in nodes:
+            if node not in self._node_infos:
+                continue
+            info = self._node_infos[node]
+            if info.status not in statuses:
+                continue
+
+            ret_infos[node] = dict(
+                status=info.status,
+                update_time=info.update_time,
+                env=info.env if env else None,
+                resource=info.resource if resource else None,
+                detail=info.detail if detail else None,
+            )
+        return ret_infos
+
+    def get_all_bands(
+        self, role: NodeRole = None, statuses: Set[NodeStatus] = None
+    ) -> Dict[BandType, Resource]:
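+        # "numa-*" bands expose CPU and memory resources, "gpu-*" bands expose GPU counts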
+        statuses = statuses or {NodeStatus.READY}
+        role = role or NodeRole.WORKER
+        nodes = self._role_to_nodes.get(role, [])
+        band_resource = dict()
+        for node in nodes:
+            if self._node_infos[node].status not in statuses:
+                continue
+            node_resource = self._node_infos[node].resource
+            for resource_type, info in node_resource.items():
+                if resource_type.startswith("numa"):
+                    # cpu
+                    band_resource[(node, resource_type)] = Resource(
+                        num_cpus=info["cpu_total"], mem_bytes=info["memory_total"]
+                    )
+                else:  # pragma: no cover
+                    assert resource_type.startswith("gpu")
+                    band_resource[(node, resource_type)] = Resource(
+                        num_gpus=info["gpu_total"]
+                    )
+        return band_resource
+
+    def get_mars_versions(self) -> List[str]:
+        versions = set(info.env["mars_version"] for info in self._node_infos.values())
+        return list(sorted(versions))
+
+    async def watch_nodes(
+        self,
+        role: NodeRole,
+        env: bool = False,
+        resource: bool = False,
+        detail: bool = False,
+        statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        version = yield self._role_to_notifier[role].watch(version=version)
+        raise mo.Return(
+            (
+                version,
+                self.get_nodes_info(
+                    role=role,
+                    env=env,
+                    resource=resource,
+                    detail=detail,
+                    statuses=statuses,
+                ),
+            )
+        )
+
+    async def watch_all_bands(
+        self,
+        role: NodeRole = None,
+        statuses: Set[NodeStatus] = None,
+        version: Optional[int] = None,
+    ):
+        role = role or NodeRole.WORKER
+        version = yield self._role_to_notifier[role].watch(version=version)
+        raise mo.Return((version, self.get_all_bands(role=role, statuses=statuses)))
+
+    async def put_starting_nodes(self, nodes: List[str], role: NodeRole):
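+        # record expected nodes as STARTING without overwriting entries that are already live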
+        for node_ep in nodes:
+            if node_ep in self._node_infos and self._node_infos[node_ep].status not in {
+                NodeStatus.STARTING,
+                NodeStatus.STOPPED,
+            }:
+                continue
+            self._node_infos[node_ep] = NodeInfo(
+                role, NodeStatus.STARTING, update_time=time.time()
+            )
+            self._role_to_nodes[role].add(node_ep)
+
+        nodes_set = set(nodes)
+        for node, info in self._node_infos.items():
+            if info.status == NodeStatus.STARTING and node not in nodes_set:
+                info.status = NodeStatus.STOPPED
+
+        await self._role_to_notifier[role].notify()
diff --git a/python/xorbits/_mars/services/cluster/supervisor/service.py b/python/xorbits/_mars/services/cluster/supervisor/service.py
new file mode 100644
index 000000000..4344de160
--- /dev/null
+++ b/python/xorbits/_mars/services/cluster/supervisor/service.py
@@ -0,0 +1,109 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import oscar as mo
+from ...core import AbstractService, NodeRole
+from ..file_logger import FileLoggerActor
+from ..procinfo import ProcessInfoManagerActor
+from ..uploader import NodeInfoUploaderActor
+from .locator import SupervisorPeerLocatorActor
+from .node_allocator import NodeAllocatorActor
+from .node_info import NodeInfoCollectorActor
+
+
+class ClusterSupervisorService(AbstractService):
+    """
+    Cluster service on supervisor
+
+    Service Configuration
+    ---------------------
+    {
+        "cluster": {
+            "backend": "",
+            "lookup_address": "
", + "node_timeout": timeout seconds of nodes, + "node_check_interval": check interval seconds for nodes + } + } + """ + + async def start(self): + svc_config = self._config["cluster"] + address = self._address + + backend = svc_config.get("backend", "fixed") + lookup_address = svc_config.get( + "lookup_address", address if backend == "fixed" else None + ) + await mo.create_actor( + NodeInfoCollectorActor, + timeout=svc_config.get("node_timeout"), + check_interval=svc_config.get("node_check_interval"), + uid=NodeInfoCollectorActor.default_uid(), + address=address, + ) + await mo.create_actor( + SupervisorPeerLocatorActor, + backend_name=backend, + lookup_address=lookup_address, + uid=SupervisorPeerLocatorActor.default_uid(), + address=address, + ) + await mo.create_actor( + NodeInfoUploaderActor, + role=NodeRole.SUPERVISOR, + interval=svc_config.get("node_check_interval"), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ) + await mo.create_actor( + NodeAllocatorActor, + backend_name=backend, + lookup_address=lookup_address, + uid=NodeAllocatorActor.default_uid(), + address=address, + ) + await mo.create_actor( + ProcessInfoManagerActor, + uid=ProcessInfoManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + FileLoggerActor, uid=FileLoggerActor.default_uid(), address=address + ) + + async def stop(self): + address = self._address + + await mo.destroy_actor( + mo.create_actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=SupervisorPeerLocatorActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=NodeInfoUploaderActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref(uid=NodeAllocatorActor.default_uid(), address=address) + ) + await mo.destroy_actor( + mo.create_actor_ref(uid=FileLoggerActor.default_uid(), address=address) + ) diff --git a/python/xorbits/_mars/services/cluster/tests/__init__.py b/python/xorbits/_mars/services/cluster/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/cluster/tests/backend.py b/python/xorbits/_mars/services/cluster/tests/backend.py new file mode 100644 index 000000000..e7be76fb1 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/backend.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +from typing import AsyncGenerator, List, Optional + +from ... import NodeRole +from ...cluster.backends import AbstractClusterBackend, register_cluster_backend + +logger = logging.getLogger(__name__) + + +@register_cluster_backend +class TestClusterBackend(AbstractClusterBackend): + name = "test" + + def __init__(self, file_path: str): + self._file_path = file_path + self._modify_date = os.path.getmtime(file_path) + + @classmethod + async def create( + cls, node_role: NodeRole, lookup_address: Optional[str], pool_address: str + ) -> "AbstractClusterBackend": + return TestClusterBackend(lookup_address) + + async def get_supervisors(self, filter_ready: bool = True) -> List[str]: + with open(self._file_path, "r") as inp_file: + result = [] + for line in inp_file.read().strip().splitlines(False): + line_parts = line.rsplit(",", 1) + if len(line_parts) == 1 or (filter_ready and int(line_parts[1])): + result.append(line_parts[0]) + return result + + async def watch_supervisors(self) -> AsyncGenerator[List[str], None]: + while True: + mtime = os.path.getmtime(self._file_path) + if mtime != self._modify_date: + self._modify_date = mtime + yield await self.get_supervisors() + await asyncio.sleep(0.1) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + raise NotImplementedError + + async def release_worker(self, address: str): + raise NotImplementedError + + async def reconstruct_worker(self, address: str): + raise NotImplementedError diff --git a/python/xorbits/_mars/services/cluster/tests/test_api.py b/python/xorbits/_mars/services/cluster/tests/test_api.py new file mode 100644 index 000000000..9dc77a61d --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_api.py @@ -0,0 +1,201 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import os +import tempfile + +import pytest + +from .... import oscar as mo +from ....constants import MARS_LOG_PATH_KEY, MARS_LOG_PREFIX, MARS_TMP_DIR_PREFIX +from ....utils import clean_mars_tmp_dir, get_next_port +from ... 
import NodeRole +from ...web.supervisor import WebSupervisorService +from ..api import ClusterAPI, MockClusterAPI, WebClusterAPI +from ..api.web import web_handlers +from ..core import NodeStatus + + +@pytest.fixture +async def actor_pool(): + # prepare + mars_tmp_dir = tempfile.mkdtemp(prefix=MARS_TMP_DIR_PREFIX) + _, file_path = tempfile.mkstemp(prefix=MARS_LOG_PREFIX, dir=mars_tmp_dir) + os.environ[MARS_LOG_PATH_KEY] = file_path + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + yield pool + + # clean + clean_mars_tmp_dir() + + +class TestActor(mo.Actor): + __test__ = False + + +async def wait_async_gen(async_gen): + async for _ in async_gen: + pass + + +@pytest.mark.asyncio +async def test_api(actor_pool): + pool_addr = actor_pool.external_address + api = await MockClusterAPI.create(pool_addr, upload_interval=0.1) + + assert await api.get_supervisors() == [pool_addr] + + assert pool_addr in await api.get_supervisors_by_keys(["test_mock"]) + + await mo.create_actor(TestActor, uid=TestActor.default_uid(), address=pool_addr) + assert (await api.get_supervisor_refs([TestActor.default_uid()]))[ + 0 + ].address == pool_addr + + bands = await api.get_all_bands() + assert (pool_addr, "numa-0") in bands + + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(wait_async_gen(api.watch_supervisors()), timeout=0.1) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen(api.watch_supervisor_refs([TestActor.default_uid()])), + timeout=0.1, + ) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen( + api.watch_nodes(NodeRole.WORKER, statuses={NodeStatus.READY}) + ), + timeout=0.1, + ) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen(api.watch_all_bands(statuses={NodeStatus.READY})), + timeout=0.1, + ) + with pytest.raises(NotImplementedError): + await api.request_worker(timeout=1) + with pytest.raises(NotImplementedError): + await api.release_worker("127.0.0.1:1234") + + await api.set_node_status(pool_addr, NodeRole.WORKER, NodeStatus.STOPPING) + assert {} == await api.get_all_bands() + assert {} == await api.get_nodes_info(role=NodeRole.WORKER) + bands = await api.get_all_bands(exclude_statuses={NodeStatus.STOPPED}) + assert (pool_addr, "numa-0") in bands + assert pool_addr in await api.get_nodes_info( + role=NodeRole.WORKER, exclude_statuses={NodeStatus.STOPPED} + ) + + log_ref = await api._get_log_ref() + assert log_ref is not None + + content = await api.fetch_node_log(size=10, address=pool_addr) + assert "" == content + content = await api.fetch_node_log(size=-1, address=pool_addr) + assert type(content) is str + assert "" == content + + await MockClusterAPI.cleanup(pool_addr) + + +@pytest.mark.asyncio +async def test_web_api(actor_pool): + pool_addr = actor_pool.external_address + await MockClusterAPI.create(pool_addr, upload_interval=0.1) + + web_config = { + "web": { + "host": "127.0.0.1", + "port": get_next_port(), + "web_handlers": web_handlers, + } + } + web_service = WebSupervisorService(web_config, pool_addr) + await web_service.start() + + web_api = WebClusterAPI(f'http://127.0.0.1:{web_config["web"]["port"]}') + assert await web_api.get_supervisors() == [pool_addr] + + assert len(await web_api.get_all_bands(statuses={NodeStatus.READY})) > 0 + nodes = await web_api.get_nodes_info( + role=NodeRole.WORKER, statuses={NodeStatus.READY} + ) + assert len(nodes) > 0 + + from .... 
import __version__ as mars_version + + assert await web_api.get_mars_versions() == [mars_version] + + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(wait_async_gen(web_api.watch_supervisors()), timeout=0.1) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + wait_async_gen(web_api.watch_nodes(NodeRole.WORKER)), timeout=0.1 + ) + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for(wait_async_gen(web_api.watch_all_bands()), timeout=0.1) + + proc_info = await web_api.get_node_pool_configs(pool_addr) + assert len(proc_info) > 0 + stacks = await web_api.get_node_thread_stacks(pool_addr) + assert len(stacks) > 0 + + log_content = await web_api.fetch_node_log(size=None, address=pool_addr) + assert len(log_content) == 0 + + log_content = await web_api.fetch_node_log(size=5, address=pool_addr) + assert len(log_content) == 0 + + log_content = await web_api.fetch_node_log(size=-1, address=pool_addr) + assert type(log_content) is str + assert len(log_content) == 0 + + log_file = os.environ[MARS_LOG_PATH_KEY] + with open(log_file, "w") as f: + f.write("foo bar baz") + log_content = await web_api.fetch_node_log(size=-1, address=pool_addr) + assert len(log_content) == 11 + + await MockClusterAPI.cleanup(pool_addr) + + +@pytest.mark.asyncio +async def test_no_supervisor(actor_pool): + pool_addr = actor_pool.external_address + + from ..supervisor.locator import SupervisorPeerLocatorActor + from ..uploader import NodeInfoUploaderActor + + await mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + [], + uid=SupervisorPeerLocatorActor.default_uid(), + address=pool_addr, + ) + await mo.create_actor( + NodeInfoUploaderActor, + NodeRole.WORKER, + interval=1, + band_to_resource=None, + use_gpu=False, + uid=NodeInfoUploaderActor.default_uid(), + address=pool_addr, + ) + api = await ClusterAPI.create(address=pool_addr) + with pytest.raises(mo.ActorNotExist): + await api.get_supervisor_refs(["KEY"]) diff --git a/python/xorbits/_mars/services/cluster/tests/test_file_logger.py b/python/xorbits/_mars/services/cluster/tests/test_file_logger.py new file mode 100644 index 000000000..0d9ff091a --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_file_logger.py @@ -0,0 +1,93 @@ +# Copyright 2022 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import tempfile + +import pytest + +from .... 
import oscar as mo +from ....constants import MARS_LOG_PATH_KEY, MARS_LOG_PREFIX, MARS_TMP_DIR_PREFIX +from ....utils import clean_mars_tmp_dir +from ..file_logger import FileLoggerActor + +full_content = "qwert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + + +@pytest.fixture +async def actor_pool(): + # prepare + mars_tmp_dir = tempfile.mkdtemp(prefix=MARS_TMP_DIR_PREFIX) + _, file_path = tempfile.mkstemp(prefix=MARS_LOG_PREFIX, dir=mars_tmp_dir) + os.environ[MARS_LOG_PATH_KEY] = file_path + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + yield pool + + # clean + clean_mars_tmp_dir() + + +@pytest.mark.asyncio +async def test_file_logger(actor_pool): + pool_addr = actor_pool.external_address + logger_ref = await mo.create_actor( + FileLoggerActor, + uid=FileLoggerActor.default_uid(), + address=pool_addr, + ) + + filename = os.environ.get(MARS_LOG_PATH_KEY) + with open(filename, "w", newline="\n") as f: + f.write(full_content) + + byte_num = 5 + expected_data = "" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 6 + expected_data = "nm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 11 + expected_data = "nm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 12 + expected_data = "hjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = 50 + expected_data = "qwert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = -1 + expected_data = "qwert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, 0) + assert content == expected_data + + byte_num = -1 + offset = 1 + expected_data = "wert\nasdfg\nzxcvb\nyuiop\nhjkl;\nnm,./" + content = await logger_ref.fetch_logs(byte_num, offset) + assert content == expected_data + + offset = 35 + expected_data = "" + content = await logger_ref.fetch_logs(byte_num, offset) + assert content == expected_data diff --git a/python/xorbits/_mars/services/cluster/tests/test_gather.py b/python/xorbits/_mars/services/cluster/tests/test_gather.py new file mode 100644 index 000000000..5c5c10b8d --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_gather.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +from .. 
import DiskInfo +from ..gather import gather_node_details, gather_node_env, gather_node_resource + + +def test_gather_node_env(): + node_env = gather_node_env() + band_data = node_env["bands"]["numa-0"] + assert band_data["resources"]["cpu"] > 0 + assert band_data["resources"]["memory"] > 0 + + +def test_gather_node_resource(): + node_res = gather_node_resource() + band_res = node_res["numa-0"] + assert band_res["cpu_total"] >= band_res["cpu_avail"] + assert band_res["memory_total"] >= band_res["memory_avail"] + + +def test_gather_node_details(): + gather_node_details() + time.sleep(0.1) + node_details = gather_node_details() + assert not node_details["disk"].get("partitions") + + curdir = os.path.dirname(os.path.abspath(__file__)) + gather_node_details(disk_infos=[DiskInfo(path=curdir)]) + time.sleep(0.1) + node_details = gather_node_details(disk_infos=[DiskInfo(path=curdir)]) + assert node_details["disk"].get("partitions") diff --git a/python/xorbits/_mars/services/cluster/tests/test_locator.py b/python/xorbits/_mars/services/cluster/tests/test_locator.py new file mode 100644 index 000000000..c0b1b9920 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_locator.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import tempfile +from typing import List + +import pytest + +from .... 
import oscar as mo +from ....tests.core import flaky +from ....utils import Timer +from ..core import NodeRole, NodeStatus +from ..supervisor.locator import SupervisorPeerLocatorActor +from ..supervisor.node_info import NodeInfoCollectorActor +from ..tests import backend +from ..worker.locator import WorkerSupervisorLocatorActor + +del backend + + +class MockNodeInfoCollectorActor(mo.Actor): + def __init__(self): + self._node_infos = dict() + self._version = 0 + + def set_all_node_infos(self, node_infos): + self._node_infos = node_infos + + def get_nodes_info(self, *args, **kwargs): + return self._node_infos + + async def watch_nodes(self, *args, version=None, **kwargs): + await asyncio.sleep(0.5) + self._version += 1 + return self._version, self._node_infos + + def put_starting_nodes(self, nodes: List[str], role: NodeRole): + for node in nodes: + self._node_infos[node] = NodeStatus.STARTING + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + await mo.create_actor( + MockNodeInfoCollectorActor, + uid=NodeInfoCollectorActor.default_uid(), + address=pool.external_address, + ) + yield pool + + +@pytest.mark.asyncio +async def test_fixed_locator(actor_pool): + addresses = ["1.2.3.4:1234", "1.2.3.4:1235", "1.2.3.4:1236", "1.2.3.4:1237"] + locator_ref = await mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + ",".join(addresses), + address=actor_pool.external_address, + ) + + assert await locator_ref.get_supervisor("mock_name") in addresses + + dbl_addrs = await locator_ref.get_supervisor("mock_name", 2) + assert len(dbl_addrs) == 2 + assert all(addr in addresses for addr in dbl_addrs) + + with Timer() as timer: + await locator_ref.wait_all_supervisors_ready() + assert timer.duration < 0.1 + + await mo.destroy_actor(locator_ref) + + +@pytest.fixture +def temp_address_file(): + with tempfile.TemporaryDirectory(prefix="mars-test") as dir_name: + yield os.path.join(dir_name, "addresses") + + +@flaky(max_runs=3) +@pytest.mark.asyncio +async def test_supervisor_peer_locator(actor_pool, temp_address_file): + addresses = ["1.2.3.4:1234", "1.2.3.4:1235", "1.2.3.4:1236", "1.2.3.4:1237"] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + locator_ref = await mo.create_actor( + SupervisorPeerLocatorActor, + "test", + temp_address_file, + uid=SupervisorPeerLocatorActor.default_uid(), + address=actor_pool.external_address, + ) + + # test starting nodes filled + info_ref = await mo.actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=actor_pool.external_address + ) + assert set(await info_ref.get_nodes_info()) == set(addresses) + + # test watch nodes changes + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"]), + timeout=30, + ) + assert result[0] in addresses + + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses[2:])) + + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"], version=version), + timeout=30, + ) + assert result[0] in addresses[2:] + + # test wait all supervisors ready + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(f"{a},{idx % 2}" for idx, a in enumerate(addresses))) + + async def delay_read_fun(): + await asyncio.sleep(0.2) + with open(temp_address_file, "w") as file_obj: + file_obj.write( + "\n".join(f"{a},{(idx + 1) % 2}" for idx, a in enumerate(addresses)) + ) + await asyncio.sleep(0.5) + with 
open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + asyncio.create_task(delay_read_fun()) + + with Timer() as timer: + await asyncio.wait_for(locator_ref.wait_all_supervisors_ready(), timeout=30) + assert timer.duration > 0.4 + + await mo.destroy_actor(locator_ref) + + +@flaky(max_runs=3) +@pytest.mark.asyncio +async def test_worker_supervisor_locator(actor_pool, temp_address_file): + addresses = [actor_pool.external_address] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + locator_ref = await mo.create_actor( + WorkerSupervisorLocatorActor, + "test", + temp_address_file, + uid=WorkerSupervisorLocatorActor.default_uid(), + address=actor_pool.external_address, + ) + + info_ref = await mo.actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=actor_pool.external_address + ) + await info_ref.set_all_node_infos({actor_pool.external_address: NodeStatus.READY}) + + # test watch nodes changes + supervisors = await locator_ref.get_supervisors(filter_ready=False) + assert supervisors == addresses + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"]), + timeout=30, + ) + assert result[0] in addresses + + # test watch without NodeInfoCollectorActor + await info_ref.destroy() + + addresses = ["localhost:1234", "localhost:1235"] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"], version=version), + timeout=30, + ) + assert result[0] in addresses + + # test watch when NodeInfoCollectorActor is created again + info_ref = await mo.create_actor( + MockNodeInfoCollectorActor, + uid=NodeInfoCollectorActor.default_uid(), + address=actor_pool.external_address, + ) + await info_ref.set_all_node_infos({actor_pool.external_address: NodeStatus.READY}) + + addresses = [actor_pool.external_address] + with open(temp_address_file, "w") as file_obj: + file_obj.write("\n".join(addresses)) + + version, result = await asyncio.wait_for( + locator_ref.watch_supervisors_by_keys(["mock_name"], version=version), + timeout=30, + ) + assert result[0] in addresses diff --git a/python/xorbits/_mars/services/cluster/tests/test_procinfo.py b/python/xorbits/_mars/services/cluster/tests/test_procinfo.py new file mode 100644 index 000000000..697ed0246 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_procinfo.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .... 
import oscar as mo +from ..procinfo import ProcessInfoManagerActor + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool( + "127.0.0.1", n_process=2, labels=["main", "numa-0", "gpu-0"] + ) + async with pool: + yield pool + + +@pytest.mark.asyncio +async def test_proc_info(actor_pool): + address = actor_pool.external_address + manager_ref = await mo.create_actor( + ProcessInfoManagerActor, + uid=ProcessInfoManagerActor.default_uid(), + address=address, + ) # type: ProcessInfoManagerActor | mo.ActorRef + pool_cfgs = await manager_ref.get_pool_configs() + for cfg, expect_label in zip(pool_cfgs, ["main", "numa-0", "gpu-0"]): + assert cfg["label"] == expect_label + stacks = await manager_ref.get_thread_stacks() + assert len(stacks) == len(pool_cfgs) diff --git a/python/xorbits/_mars/services/cluster/tests/test_service.py b/python/xorbits/_mars/services/cluster/tests/test_service.py new file mode 100644 index 000000000..3e1e0160b --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_service.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os + +import pytest + +from .... import oscar as mo +from ....storage import StorageLevel +from ... import NodeRole, start_services, stop_services +from .. 
import ClusterAPI, DiskInfo, QuotaInfo, StorageInfo, WorkerSlotInfo + + +@pytest.fixture +async def actor_pools(): + async def start_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + await pool.start() + return pool + + sv_pool, worker_pool = await asyncio.gather(start_pool(), start_pool()) + try: + yield sv_pool, worker_pool + finally: + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +@pytest.mark.asyncio +async def test_cluster_service(actor_pools): + sv_pool, worker_pool = actor_pools + + config = { + "services": ["cluster"], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + }, + } + await start_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + sv_api = await ClusterAPI.create(sv_pool.external_address) + worker_api = await ClusterAPI.create(worker_pool.external_address) + + await worker_api.set_band_quota_info( + "numa-0", QuotaInfo(quota_size=1024, allocated_size=100, hold_size=100) + ) + await worker_api.set_band_slot_infos( + "numa-0", + [ + WorkerSlotInfo( + slot_id=0, + session_id="test_session", + subtask_id="test_subtask", + processor_usage=1.0, + ) + ], + ) + await worker_api.set_band_storage_info( + "numa-0", + StorageInfo(storage_level=StorageLevel.MEMORY, total_size=1024, used_size=512), + ) + curdir = os.path.dirname(os.path.abspath(__file__)) + await worker_api.set_node_disk_info([DiskInfo(path=curdir)]) + await asyncio.sleep(1.5) + + assert ( + next(iter(await sv_api.get_nodes_info(role=NodeRole.SUPERVISOR))) + == sv_pool.external_address + ) + worker_infos = await sv_api.get_nodes_info(role=NodeRole.WORKER, detail=True) + assert worker_pool.external_address in worker_infos + + info_details = worker_infos[worker_pool.external_address]["detail"] + assert len(info_details["disk"]["partitions"]) > 0 + assert len(info_details["slot"]) > 0 + assert len(info_details["quota"]) > 0 + assert len(info_details["storage"]) > 0 + + await stop_services(NodeRole.WORKER, config, address=worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) diff --git a/python/xorbits/_mars/services/cluster/tests/test_uploader.py b/python/xorbits/_mars/services/cluster/tests/test_uploader.py new file mode 100644 index 000000000..947bbd383 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/tests/test_uploader.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +import pytest + +from .... import oscar as mo +from ... 
import NodeRole +from ..supervisor.locator import SupervisorPeerLocatorActor +from ..supervisor.node_info import NodeInfoCollectorActor +from ..uploader import NodeInfoUploaderActor + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + async with pool: + yield pool + + +@pytest.mark.asyncio +async def test_uploader(actor_pool): + pool_addr = actor_pool.external_address + await mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + pool_addr, + uid=SupervisorPeerLocatorActor.default_uid(), + address=pool_addr, + ) + node_info_ref = await mo.create_actor( + NodeInfoCollectorActor, + timeout=0.5, + check_interval=0.1, + uid=NodeInfoCollectorActor.default_uid(), + address=pool_addr, + ) + uploader_ref = await mo.create_actor( + NodeInfoUploaderActor, + role=NodeRole.WORKER, + interval=0.1, + uid=NodeInfoUploaderActor.default_uid(), + address=pool_addr, + ) + wait_ready_task = asyncio.create_task(uploader_ref.wait_node_ready()) + await uploader_ref.mark_node_ready() + await asyncio.wait_for(wait_ready_task, timeout=0.1) + + # test empty result + result = await node_info_ref.get_nodes_info(role=NodeRole.WORKER) + assert pool_addr in result + assert all(result[pool_addr].get(k) is None for k in ("env", "resource", "detail")) + + result = await node_info_ref.get_nodes_info( + role=NodeRole.WORKER, env=True, resource=True, detail=True + ) + assert pool_addr in result + assert all( + result[pool_addr].get(k) is not None for k in ("env", "resource", "detail") + ) + + async def watcher(): + version = None + while True: + version, infos = await node_info_ref.watch_nodes( + NodeRole.WORKER, version=version + ) + if not infos: + break + + watch_task = asyncio.create_task(watcher()) + + await uploader_ref.destroy() + assert not await asyncio.wait_for(watch_task, timeout=5) + + await node_info_ref.destroy() diff --git a/python/xorbits/_mars/services/cluster/uploader.py b/python/xorbits/_mars/services/cluster/uploader.py new file mode 100644 index 000000000..86b2b2655 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/uploader.py @@ -0,0 +1,196 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from typing import Dict, List + +from ... 
import oscar as mo +from ...lib.aio import alru_cache +from ...resource import Resource +from ...storage import StorageLevel +from ...typing import BandType +from .core import DiskInfo, NodeInfo, NodeStatus, QuotaInfo, StorageInfo, WorkerSlotInfo +from .gather import gather_node_details, gather_node_env, gather_node_resource + +logger = logging.getLogger(__name__) + +DEFAULT_INFO_UPLOAD_INTERVAL = 1 + + +class NodeInfoUploaderActor(mo.Actor): + _band_slot_infos: Dict[str, List[WorkerSlotInfo]] + _band_quota_infos: Dict[str, QuotaInfo] + _disk_infos: List[DiskInfo] + _band_storage_infos: Dict[str, Dict[StorageLevel, StorageInfo]] + + def __init__(self, role=None, interval=None, band_to_resource=None, use_gpu=True): + self._info = NodeInfo(role=role) + + self._env_uploaded = False + self._band_to_resource = band_to_resource + + self._interval = interval or DEFAULT_INFO_UPLOAD_INTERVAL + self._upload_task = None + self._upload_enabled = False + self._uploaded_future = asyncio.Future() + self._node_ready_event = asyncio.Event() + + self._use_gpu = use_gpu + + self._band_slot_infos = dict() + self._band_quota_infos = dict() + self._band_storage_infos = defaultdict(dict) + self._disk_infos = [] + + async def __post_create__(self): + self._upload_task = asyncio.create_task(self._periodical_upload_node_info()) + await self._uploaded_future + + async def __pre_destroy__(self): + self._upload_task.cancel() + + @alru_cache(cache_exceptions=False) + async def _get_node_info_ref(self): + from .locator import SupervisorLocatorActor + from .supervisor.node_info import NodeInfoCollectorActor + + locator_ref = await mo.actor_ref( + SupervisorLocatorActor.default_uid(), address=self.address + ) + supervisor_addr = await locator_ref.get_supervisor( + NodeInfoCollectorActor.default_uid() + ) + if supervisor_addr is None: + raise ValueError + + return await mo.actor_ref( + NodeInfoCollectorActor.default_uid(), address=supervisor_addr + ) + + async def _periodical_upload_node_info(self): + while True: + try: + await self.upload_node_info() + if not self._uploaded_future.done(): + self._uploaded_future.set_result(None) + except asyncio.CancelledError: # pragma: no cover + break + except ( + Exception + ) as ex: # pragma: no cover # noqa: E722 # nosec # pylint: disable=bare-except + logger.error(f"Failed to upload node info: {ex}") + if not self._uploaded_future.done(): + self._uploaded_future.set_exception(ex) + try: + await asyncio.sleep(self._interval) + except asyncio.CancelledError: # pragma: no cover + break + + async def mark_node_ready(self): + self._upload_enabled = True + + while True: + try: + # upload info in time to reduce latency + await self.upload_node_info(status=NodeStatus.READY) + break + except (mo.ActorNotExist, ConnectionError): # pragma: no cover + await asyncio.sleep(1) + + self._node_ready_event.set() + + def is_node_ready(self): + return self._node_ready_event.is_set() + + async def wait_node_ready(self): + return self._node_ready_event.wait() + + async def upload_node_info(self, status: NodeStatus = None): + try: + if not self._info.env: + self._info.env = await asyncio.to_thread(gather_node_env) + self._info.detail.update( + await asyncio.to_thread( + gather_node_details, + disk_infos=self._disk_infos, + band_storage_infos=self._band_storage_infos, + band_slot_infos=self._band_slot_infos, + band_quota_infos=self._band_quota_infos, + ) + ) + + band_resources = await asyncio.to_thread( + gather_node_resource, self._band_to_resource, use_gpu=self._use_gpu + ) + + for band, res in 
band_resources.items(): + try: + res_dict = self._info.resource[band] + except KeyError: + res_dict = self._info.resource[band] = dict() + res_dict.update(res) + + if self._upload_enabled: + try: + node_info_ref = await self._get_node_info_ref() + if not self._env_uploaded: + status = status or NodeStatus.READY + await node_info_ref.update_node_info( + address=self.address, + role=self._info.role, + env=self._info.env if not self._env_uploaded else None, + resource=self._info.resource, + detail=self._info.detail, + status=status, + ) + self._env_uploaded = True + except ValueError: + pass + except RuntimeError as ex: # pragma: no cover + if "cannot schedule new futures after interpreter shutdown" not in str(ex): + # when atexit is triggered, the default pool might be shutdown + # and to_thread will fail + raise + except: # noqa: E722 # nosec # pylint: disable=bare-except # pragma: no cover + logger.exception(f"Failed to upload node info") + raise + + def get_bands(self) -> Dict[BandType, int]: + band_resource = dict() + for resource_type, info in self._info.resource.items(): + if resource_type.startswith("numa"): + # cpu + band_resource[(self.address, resource_type)] = Resource( + num_cpus=info["cpu_total"], mem_bytes=info["memory_total"] + ) + else: # pragma: no cover + assert resource_type.startswith("gpu") + band_resource[(self.address, resource_type)] = Resource( + num_gpus=info["gpu_total"] + ) + return band_resource + + def set_node_disk_info(self, node_disk_info: List[DiskInfo]): + self._disk_infos = node_disk_info + + def set_band_storage_info(self, band_name: str, storage_info: StorageInfo): + self._band_storage_infos[band_name][storage_info.storage_level] = storage_info + + def set_band_slot_infos(self, band_name, slot_infos: List[WorkerSlotInfo]): + self._band_slot_infos[band_name] = slot_infos + + def set_band_quota_info(self, band_name, quota_info: QuotaInfo): + self._band_quota_infos[band_name] = quota_info diff --git a/python/xorbits/_mars/services/cluster/worker/__init__.py b/python/xorbits/_mars/services/cluster/worker/__init__.py new file mode 100644 index 000000000..321fd0623 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import ClusterWorkerService diff --git a/python/xorbits/_mars/services/cluster/worker/locator.py b/python/xorbits/_mars/services/cluster/worker/locator.py new file mode 100644 index 000000000..37b161ecc --- /dev/null +++ b/python/xorbits/_mars/services/cluster/worker/locator.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import List, Set + +from .... import oscar as mo +from ..core import NodeRole, NodeStatus +from ..locator import SupervisorLocatorActor + +logger = logging.getLogger(__name__) + + +class WorkerSupervisorLocatorActor(SupervisorLocatorActor): + _node_role = NodeRole.WORKER + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._node_info_ref = None + + @classmethod + def default_uid(cls): + return SupervisorLocatorActor.__name__ + + def _if_set_supervisors( + self, current_supervisors: Set[str], last_supervisors: Set[str] + ): + return current_supervisors != last_supervisors or self._node_info_ref is None + + async def _set_supervisors(self, supervisors: List[str]): + await super()._set_supervisors(supervisors) + if supervisors and self._node_info_ref is None: + from ..supervisor.node_info import NodeInfoCollectorActor + + supervisor_addr = self.get_supervisor(NodeInfoCollectorActor.default_uid()) + try: + self._node_info_ref = await mo.actor_ref( + uid=NodeInfoCollectorActor.default_uid(), address=supervisor_addr + ) + except (OSError, mo.ServerClosed, mo.ActorNotExist): + self._node_info_ref = None + + async def _get_supervisors_from_backend(self, filter_ready: bool = True): + try: + assert self._node_info_ref is not None + statuses = ( + {NodeStatus.READY} + if filter_ready + else {NodeStatus.READY, NodeStatus.STARTING} + ) + infos = await self._node_info_ref.get_nodes_info( + role=NodeRole.SUPERVISOR, statuses=statuses + ) + return list(infos) + except (AssertionError, OSError, mo.ServerClosed, mo.ActorNotExist): + self._node_info_ref = None + return await self._backend.get_supervisors(filter_ready=filter_ready) + + async def _watch_supervisor_from_node_info(self): + assert self._node_info_ref is not None + version = None + while True: + version, infos = await self._node_info_ref.watch_nodes( + role=NodeRole.SUPERVISOR, version=version + ) + yield list(infos) + + async def _watch_supervisors_from_backend(self): + while True: + try: + async for supervisors in self._watch_supervisor_from_node_info(): + yield supervisors + except (AssertionError, OSError, mo.ServerClosed, mo.ActorNotExist): + self._node_info_ref = None + + async for supervisors in self._backend.watch_supervisors(): + yield supervisors + if self._node_info_ref is not None: + break diff --git a/python/xorbits/_mars/services/cluster/worker/service.py b/python/xorbits/_mars/services/cluster/worker/service.py new file mode 100644 index 000000000..9222a2130 --- /dev/null +++ b/python/xorbits/_mars/services/cluster/worker/service.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService, NodeRole +from ..file_logger import FileLoggerActor +from ..procinfo import ProcessInfoManagerActor +from ..uploader import NodeInfoUploaderActor +from .locator import WorkerSupervisorLocatorActor + + +class ClusterWorkerService(AbstractService): + """ + Cluster service on worker. + + Service Configuration + --------------------- + { + "disk_dirs": ["List of disk directories"], + "cluster": { + "backend": "<cluster backend name>", + "lookup_address": "<supervisor lookup address>
", + "node_check_interval": check interval seconds for nodes, + "resource": { + "numa-0": Resource(num_cpus=8, mem_bytes=1073741824), + "gpu-0": Resource(num_gpus=1) + } + } + } + """ + + async def start(self): + svc_config = self._config["cluster"] + address = self._address + + backend = svc_config.get("backend", "fixed") + lookup_address = svc_config.get( + "lookup_address", address if backend == "fixed" else None + ) + await mo.create_actor( + WorkerSupervisorLocatorActor, + backend_name=backend, + lookup_address=lookup_address, + uid=WorkerSupervisorLocatorActor.default_uid(), + address=address, + ) + await mo.create_actor( + NodeInfoUploaderActor, + role=NodeRole.WORKER, + interval=svc_config.get("node_check_interval"), + band_to_resource=svc_config.get("resource"), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ) + await mo.create_actor( + ProcessInfoManagerActor, + uid=ProcessInfoManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + FileLoggerActor, uid=FileLoggerActor.default_uid(), address=address + ) + + async def stop(self): + address = self._address + + await mo.destroy_actor( + mo.create_actor_ref( + uid=NodeInfoUploaderActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerSupervisorLocatorActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref(uid=FileLoggerActor.default_uid(), address=address) + ) diff --git a/python/xorbits/_mars/services/context.py b/python/xorbits/_mars/services/context.py new file mode 100644 index 000000000..b646e8764 --- /dev/null +++ b/python/xorbits/_mars/services/context.py @@ -0,0 +1,301 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from functools import lru_cache +from typing import Dict, List + +from .. 
import oscar as mo +from ..core.context import Context +from ..lib.aio import new_isolation +from ..storage.base import StorageLevel +from ..typing import BandType, SessionType +from ..utils import implements, is_ray_address +from .cluster import ClusterAPI, NodeRole +from .meta import MetaAPI, WorkerMetaAPI +from .session import SessionAPI +from .storage import StorageAPI +from .subtask import SubtaskAPI + +logger = logging.getLogger(__name__) + + +class ThreadedServiceContext(Context): + _cluster_api: ClusterAPI + _session_api: SessionAPI + _meta_api: MetaAPI + _subtask_api: SubtaskAPI + + def __init__( + self, + session_id: str, + supervisor_address: str, + worker_address: str, + local_address: str, + loop: asyncio.AbstractEventLoop, + band: BandType = None, + ): + super().__init__( + session_id=session_id, + supervisor_address=supervisor_address, + worker_address=worker_address, + local_address=local_address, + band=band, + ) + self._loop = loop + # new isolation with current loop, + # so that session created in tile and execute + # can get the right isolation + new_isolation(loop=self._loop, threaded=False) + + self._running_session_id = None + self._running_op_key = None + + # APIs + self._cluster_api = None + self._session_api = None + self._meta_api = None + self._subtask_api = None + + async def init(self): + self._cluster_api = await ClusterAPI.create(self.supervisor_address) + self._session_api = await SessionAPI.create(self.supervisor_address) + self._meta_api = await MetaAPI.create(self.session_id, self.supervisor_address) + try: + self._subtask_api = await SubtaskAPI.create(self.local_address) + except mo.ActorNotExist: + pass + + def _call(self, coro): + fut = asyncio.run_coroutine_threadsafe(coro, self._loop) + return fut.result() + + @implements(Context.get_current_session) + def get_current_session(self) -> SessionType: + from ..deploy.oscar.session import new_session + + return new_session( + self.supervisor_address, self.session_id, new=False, default=False + ) + + @implements(Context.get_local_host_ip) + def get_local_host_ip(self) -> str: + local_address = self.local_address + if is_ray_address(local_address): + import ray + + return ray.util.get_node_ip_address() + else: + return local_address.split(":", 1)[0] + + @implements(Context.get_supervisor_addresses) + def get_supervisor_addresses(self) -> List[str]: + return self._call(self._cluster_api.get_supervisors()) + + @implements(Context.get_worker_addresses) + def get_worker_addresses(self) -> List[str]: + return list(self._call(self._cluster_api.get_nodes_info(role=NodeRole.WORKER))) + + @implements(Context.get_worker_bands) + def get_worker_bands(self) -> List[BandType]: + return list(self._call(self._cluster_api.get_all_bands(NodeRole.WORKER))) + + @implements(Context.get_total_n_cpu) + def get_total_n_cpu(self) -> int: + all_bands = self._call(self._cluster_api.get_all_bands()) + n_cpu = 0 + for band, resource in all_bands.items(): + _, band_name = band + if band_name.startswith("numa-"): + n_cpu += resource.num_cpus + return n_cpu + + @implements(Context.get_slots) + def get_slots(self) -> int: + worker_bands = self._call(self._get_worker_bands()) + resource = worker_bands[self.band] + return int(resource.num_cpus or resource.num_gpus) + + async def _get_worker_bands(self): + worker_cluster_api = await ClusterAPI.create(self.worker_address) + return await worker_cluster_api.get_bands() + + async def _get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error: str = "raise" + ) -> 
List[Dict]: + # get chunks meta + get_metas = [] + for data_key in data_keys: + meta = self._meta_api.get_chunk_meta.delay( + data_key, fields=["bands"], error=error + ) + get_metas.append(meta) + supervisor_metas = await self._meta_api.get_chunk_meta.batch(*get_metas) + key_to_supervisor_metas = dict(zip(data_keys, supervisor_metas)) + api_to_keys_calls = defaultdict(lambda: (list(), list())) + for data_key, meta in zip(data_keys, supervisor_metas): + addr = meta["bands"][0][0] + worker_meta_api = await WorkerMetaAPI.create(self.session_id, addr) + keys, calls = api_to_keys_calls[worker_meta_api] + keys.append(data_key) + calls.append( + worker_meta_api.get_chunk_meta.delay( + data_key, fields=fields, error=error + ) + ) + coros = [] + for api, (keys, calls) in api_to_keys_calls.items(): + coros.append(api.get_chunk_meta.batch(*calls)) + all_metas = await asyncio.gather(*coros) + key_to_meta = dict() + for (keys, _), metas in zip(api_to_keys_calls.values(), all_metas): + for k, meta in zip(keys, metas): + meta["bands"] = key_to_supervisor_metas[k]["bands"] + key_to_meta[k] = meta + return [key_to_meta[k] for k in data_keys] + + async def _get_chunks_result(self, data_keys: List[str]) -> List: + metas = await self._get_chunks_meta(data_keys, fields=["bands"]) + addresses = [meta["bands"][0][0] for meta in metas] + + storage_api_to_gets = defaultdict(lambda: (list(), list())) + for data_key, address in zip(data_keys, addresses): + storage_api = await StorageAPI.create(self.session_id, address) + storage_api_to_gets[storage_api][0].append(data_key) + storage_api_to_gets[storage_api][1].append(storage_api.get.delay(data_key)) + results = dict() + for storage_api, (keys, gets) in storage_api_to_gets.items(): + chunks_data = await storage_api.get.batch(*gets) + for chunk_key, chunk_data in zip(keys, chunks_data): + results[chunk_key] = chunk_data + return [results[key] for key in data_keys] + + async def _fetch_chunks(self, data_keys: List[str]): + metas = await self._get_chunks_meta(data_keys, fields=["bands"]) + bands = [meta["bands"][0] for meta in metas] + + storage_api = await StorageAPI.create(self.session_id, self.local_address) + fetches = [] + for data_key, (address, band_name) in zip(data_keys, bands): + fetches.append( + storage_api.fetch.delay( + data_key, remote_address=address, band_name=band_name + ) + ) + await storage_api.fetch.batch(*fetches) + + @implements(Context.get_chunks_result) + def get_chunks_result(self, data_keys: List[str], fetch_only: bool = False) -> List: + if not fetch_only: + return self._call(self._get_chunks_result(data_keys)) + else: + return self._call(self._fetch_chunks(data_keys)) + + @implements(Context.get_chunks_meta) + def get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + return self._call(self._get_chunks_meta(data_keys, fields=fields, error=error)) + + async def _get_backend_info( + self, address: str = None, level: StorageLevel = StorageLevel.MEMORY + ) -> dict: + if address is None: + address = self.worker_address + storage_api = await StorageAPI.create(self.session_id, address) + return await storage_api.get_storage_info(level) + + @implements(Context.get_storage_info) + def get_storage_info( + self, address: str = None, level: StorageLevel = StorageLevel.MEMORY + ): + return self._call(self._get_backend_info(address, level)) + + @implements(Context.create_remote_object) + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + ref = self._call( + 
self._session_api.create_remote_object( + self.session_id, name, object_cls, *args, **kwargs + ) + ) + return _RemoteObjectWrapper(ref, self._loop) + + @implements(Context.get_remote_object) + def get_remote_object(self, name: str): + ref = self._call(self._session_api.get_remote_object(self.session_id, name)) + return _RemoteObjectWrapper(ref, self._loop) + + @implements(Context.destroy_remote_object) + def destroy_remote_object(self, name: str): + return self._call( + self._session_api.destroy_remote_object(self.session_id, name) + ) + + @implements(Context.register_custom_log_path) + def register_custom_log_path( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + return self._call( + self._session_api.register_custom_log_path( + session_id, tileable_op_key, chunk_op_key, worker_address, log_path + ) + ) + + @implements(Context.new_custom_log_dir) + @lru_cache(50) + def new_custom_log_dir(self) -> str: + return self._call( + self._session_api.new_custom_log_dir(self.local_address, self.session_id) + ) + + def set_running_operand_key(self, session_id: str, op_key: str): + self._running_session_id = session_id + self._running_op_key = op_key + + def set_progress(self, progress: float): + if ( + self._running_op_key is None or self._subtask_api is None + ): # pragma: no cover + return + return self._call( + self._subtask_api.set_running_operand_progress( + session_id=self._running_session_id, + op_key=self._running_op_key, + slot_address=self.local_address, + progress=progress, + ) + ) + + +class _RemoteObjectWrapper: + def __init__(self, ref: mo.ActorRef, loop: asyncio.AbstractEventLoop): + self._ref = ref + self._loop = loop + + def __getattr__(self, attr): + func = getattr(self._ref, attr) + + def wrap(*args, **kwargs): + coro = func(*args, **kwargs) + fut = asyncio.run_coroutine_threadsafe(coro, loop=self._loop) + return fut.result() + + return wrap diff --git a/python/xorbits/_mars/services/core.py b/python/xorbits/_mars/services/core.py new file mode 100644 index 000000000..8a25bf24f --- /dev/null +++ b/python/xorbits/_mars/services/core.py @@ -0,0 +1,201 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
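+# Rough sketch of how services are discovered and started (the service list and
+# address below are illustrative, not defaults): ``start_services`` walks
+# ``config["services"]``, imports ``<module>.<service>.supervisor`` or
+# ``<module>.<service>.worker`` depending on the node role, instantiates every
+# concrete ``AbstractService`` subclass found there and awaits ``start()`` on each.
+#
+#     config = {
+#         "services": ["cluster", "session", "meta", "lifecycle", "web"],
+#         "cluster": {"backend": "fixed", "lookup_address": "127.0.0.1:7777"},
+#     }
+#     await start_services(NodeRole.SUPERVISOR, config, address="127.0.0.1:7777")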
+ +import abc +import asyncio +import enum +import importlib +import inspect +import warnings +from typing import Dict, Iterable, List, Union + +_ModulesType = Union[List, str, None] + + +class NodeRole(enum.Enum): + SUPERVISOR = 0 + WORKER = 1 + + +class AbstractService(abc.ABC): + _instances = dict() + + def __init__(self, config: Dict, address: str): + self._config = config + self._address = address + + @classmethod + def get_instance(cls, address: str, config: Dict = None): + type_addr = (cls, address) + if type_addr not in cls._instances: + inst = cls._instances[type_addr] = cls(config, address) + else: + inst = cls._instances[type_addr] + return inst + + @classmethod + def clear(cls): + cls._instances = dict() + + @abc.abstractmethod + async def start(self): + raise NotImplementedError + + @abc.abstractmethod + async def stop(self): + raise NotImplementedError + + async def create_session(self, session_id: str): + pass + + async def destroy_session(self, session_id: str): + pass + + +class EmptyService(AbstractService): + async def start(self): + pass + + async def stop(self): + pass + + +def _find_service_entries(node_role: NodeRole, services: List, modules: List): + svc_entries_list = [] + + web_handlers = {} + for svc_names in services: + if isinstance(svc_names, str): + svc_names = [svc_names] + svc_entries = [] + for svc_name in svc_names: + svc_mod = None + for mod_name in modules: + try: + full_mod_name = f"{mod_name}.{svc_name}.{node_role.name.lower()}" + svc_mod = importlib.import_module(full_mod_name) + + abstract_derivatives = [] + valid_derivatives = [] + for attr_name in dir(svc_mod): + obj = getattr(svc_mod, attr_name) + if ( + obj is not AbstractService + and isinstance(obj, type) + and issubclass(obj, AbstractService) + ): + if inspect.isabstract(obj): + abstract_derivatives.append(obj) + else: + valid_derivatives.append(obj) + + svc_entries.extend(valid_derivatives) + if not valid_derivatives and abstract_derivatives: + warnings.warn( + f"Module {full_mod_name} does not have non-abstract " + f"service classes, but abstract classes " + f"{abstract_derivatives} found.", + RuntimeWarning, + ) + + try: + web_mod = importlib.import_module( + mod_name + "." 
+ svc_name + ".api.web" + ) + web_handlers.update(getattr(web_mod, "web_handlers", {})) + except ImportError: + pass + except ImportError: + pass + if svc_mod is None: + raise ImportError(f"Cannot discover {node_role} for service {svc_name}") + svc_entries_list.append(svc_entries) + + return svc_entries_list, web_handlers + + +def _normalize_modules(modules: _ModulesType): + if modules is None: + modules = [] + elif isinstance(modules, str): + modules = [modules] + else: + modules = list(modules) + modules = [__name__.rsplit(".", 1)[0]] + modules + return modules + + +def _iter_service_instances( + node_role: NodeRole, config: Dict, address: str = None, reverse: bool = False +) -> Iterable[List[AbstractService]]: + modules = _normalize_modules(config.get("modules")) + service_names = config["services"] + if reverse: + service_names = service_names[::-1] + + svc_entries_list, _ = _find_service_entries(node_role, service_names, modules) + for entries in svc_entries_list: + yield [svc_entry.get_instance(address, config) for svc_entry in entries] + + +async def start_services( + node_role: NodeRole, config: Dict, address: str = None, mark_ready: bool = True +): + modules = _normalize_modules(config.get("modules")) + + # discover services + service_names = config["services"] + + svc_entries_list, web_handlers = _find_service_entries( + node_role, service_names, modules + ) + + if "web" in service_names: + try: + web_config = config["web"] + except KeyError: + web_config = config["web"] = dict() + + web_config["web_handlers"] = web_handlers + + for entries in svc_entries_list: + instances = [svc_entry.get_instance(address, config) for svc_entry in entries] + await asyncio.gather(*[inst.start() for inst in instances]) + + if mark_ready and "cluster" in service_names: + from .cluster import ClusterAPI + + cluster_api = await ClusterAPI.create(address) + await cluster_api.mark_node_ready() + + +async def stop_services(node_role: NodeRole, config: Dict, address: str = None): + for instances in _iter_service_instances(node_role, config, address, reverse=True): + await asyncio.gather(*[inst.stop() for inst in instances]) + + AbstractService.clear() + + +async def create_service_session( + node_role: NodeRole, config: Dict, session_id: str = None, address: str = None +): + for instances in _iter_service_instances(node_role, config, address): + await asyncio.gather(*[inst.create_session(session_id) for inst in instances]) + + +async def destroy_service_session( + node_role: NodeRole, config: Dict, session_id: str = None, address: str = None +): + for instances in _iter_service_instances(node_role, config, address, reverse=True): + await asyncio.gather(*[inst.destroy_session(session_id) for inst in instances]) diff --git a/python/xorbits/_mars/services/lifecycle/__init__.py b/python/xorbits/_mars/services/lifecycle/__init__.py new file mode 100644 index 000000000..bd908f4fa --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractLifecycleAPI, LifecycleAPI, MockLifecycleAPI, WebLifecycleAPI +from .errors import TileableNotTracked diff --git a/python/xorbits/_mars/services/lifecycle/api/__init__.py b/python/xorbits/_mars/services/lifecycle/api/__init__.py new file mode 100644 index 000000000..579637d58 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractLifecycleAPI +from .oscar import LifecycleAPI, MockLifecycleAPI +from .web import WebLifecycleAPI diff --git a/python/xorbits/_mars/services/lifecycle/api/core.py b/python/xorbits/_mars/services/lifecycle/api/core.py new file mode 100644 index 000000000..fadcaf26e --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/core.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List + + +class AbstractLifecycleAPI(ABC): + @abstractmethod + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + """ + Decref tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + counts: list + List of ref count. + """ + + @abstractmethod + async def get_all_chunk_ref_counts(self) -> Dict[str, int]: + """ + Get all chunk keys' ref counts. + + Returns + ------- + key_to_ref_counts: dict + """ diff --git a/python/xorbits/_mars/services/lifecycle/api/oscar.py b/python/xorbits/_mars/services/lifecycle/api/oscar.py new file mode 100644 index 000000000..02f85bd1a --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/oscar.py @@ -0,0 +1,188 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ..supervisor.tracker import LifecycleTrackerActor +from .core import AbstractLifecycleAPI + + +class LifecycleAPI(AbstractLifecycleAPI): + def __init__( + self, + session_id: str, + lifecycle_tracker_ref: mo.ActorRefType[LifecycleTrackerActor], + ): + self._session_id = session_id + self._lifecycle_tracker_ref = lifecycle_tracker_ref + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "LifecycleAPI": + """ + Create Lifecycle API. + + Parameters + ---------- + session_id : str + Session ID. + address : str + Supervisor address. + + Returns + ------- + lifecycle_api + Lifecycle API. + """ + lifecycle_tracker_ref = await mo.actor_ref( + address, LifecycleTrackerActor.gen_uid(session_id) + ) + return LifecycleAPI(session_id, lifecycle_tracker_ref) + + @mo.extensible + async def track(self, tileable_key: str, chunk_keys: List[str]): + """ + Track tileable. + + Parameters + ---------- + tileable_key : str + Tileable key. + chunk_keys : list + List of chunk keys. + """ + return await self._lifecycle_tracker_ref.track(tileable_key, chunk_keys) + + @track.batch + async def batch_track(self, args_list, kwargs_list): + tracks = [] + for args, kwargs in zip(args_list, kwargs_list): + tracks.append(self._lifecycle_tracker_ref.track.delay(*args, **kwargs)) + return await self._lifecycle_tracker_ref.track.batch(*tracks) + + async def incref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + """ + Incref tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.incref_tileables( + tileable_keys, counts=counts + ) + + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + """ + Decref tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.decref_tileables(tileable_keys) + + async def get_tileable_ref_counts(self, tileable_keys: List[str]) -> List[int]: + """ + Get ref counts of tileables. + + Parameters + ---------- + tileable_keys : list + List of tileable keys. + + Returns + ------- + ref_counts : list + List of ref counts. + """ + return await self._lifecycle_tracker_ref.get_tileable_ref_counts(tileable_keys) + + async def incref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + """ + Incref chunks. + + Parameters + ---------- + chunk_keys : list + List of chunk keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.incref_chunks( + chunk_keys, counts=counts + ) + + async def decref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + """ + Decref chunks + + Parameters + ---------- + chunk_keys : list + List of chunk keys. + counts: list + List of ref count. + """ + return await self._lifecycle_tracker_ref.decref_chunks( + chunk_keys, counts=counts + ) + + async def get_chunk_ref_counts(self, chunk_keys: List[str]) -> List[int]: + """ + Get ref counts of chunks. + + Parameters + ---------- + chunk_keys : list + List of chunk keys. + + Returns + ------- + ref_counts : list + List of ref counts. + """ + return await self._lifecycle_tracker_ref.get_chunk_ref_counts(chunk_keys) + + async def get_all_chunk_ref_counts(self) -> Dict[str, int]: + """ + Get all chunk keys' ref counts. 
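+ +        Only chunks whose current ref count is greater than zero are included.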
+ + Returns + ------- + key_to_ref_counts: dict + """ + return await self._lifecycle_tracker_ref.get_all_chunk_ref_counts() + + +class MockLifecycleAPI(LifecycleAPI): + @classmethod + async def create(cls, session_id: str, address: str) -> "LifecycleAPI": + from ..supervisor.service import LifecycleSupervisorService + + service = LifecycleSupervisorService({}, address) + await service.create_session(session_id) + return await super().create(session_id=session_id, address=address) diff --git a/python/xorbits/_mars/services/lifecycle/api/web.py b/python/xorbits/_mars/services/lifecycle/api/web.py new file mode 100644 index 000000000..870c0e6e5 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/api/web.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List + +from ....utils import deserialize_serializable, serialize_serializable +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from .core import AbstractLifecycleAPI + + +class LifecycleWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P<session_id>[^/]+)/lifecycle" + + async def _get_oscar_lifecycle_api(self, session_id: str): + from .oscar import LifecycleAPI + + return await self._get_api_by_key(LifecycleAPI, session_id) + + @web_api("", method="post", arg_filter={"action": "decref_tileables"}) + async def decref_tileables(self, session_id: str): + tileable_keys = self.get_argument("tileable_keys").split(",") + counts = self.get_argument("counts", None) + if counts: + counts = [int(c) for c in counts.split(",")] + + oscar_api = await self._get_oscar_lifecycle_api(session_id) + await oscar_api.decref_tileables(tileable_keys, counts=counts) + + @web_api("", method="get", arg_filter={"action": "get_all_chunk_ref_counts"}) + async def get_all_chunk_ref_counts(self, session_id: str): + oscar_api = await self._get_oscar_lifecycle_api(session_id) + res = await oscar_api.get_all_chunk_ref_counts() + self.write(serialize_serializable(res)) + + +web_handlers = {LifecycleWebAPIHandler.get_root_pattern(): LifecycleWebAPIHandler} + + +class WebLifecycleAPI(AbstractLifecycleAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + path = f"{self._address}/api/session/{self._session_id}/lifecycle" + params = dict(action="decref_tileables") + counts = ( + f"&counts={','.join(str(c) for c in counts)}" if counts is not None else "" + ) + await self._request_url( + path=path, + method="POST", + params=params, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + data="tileable_keys=" + ",".join(tileable_keys) + counts, + ) + + async def get_all_chunk_ref_counts(self) -> Dict[str, int]: + params = dict(action="get_all_chunk_ref_counts") +
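# issue a GET with an ``action`` query parameter; the handler above writes back a serialized dict which is deserialized below +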
path = f"{self._address}/api/session/{self._session_id}/lifecycle" + res = await self._request_url("GET", path, params=params) + return deserialize_serializable(res.body) diff --git a/python/xorbits/_mars/services/lifecycle/errors.py b/python/xorbits/_mars/services/lifecycle/errors.py new file mode 100644 index 000000000..4b020c556 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/errors.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError + + +class TileableNotTracked(MarsError): + pass diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/__init__.py b/python/xorbits/_mars/services/lifecycle/supervisor/__init__.py new file mode 100644 index 000000000..3491eec41 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import LifecycleSupervisorService diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/service.py b/python/xorbits/_mars/services/lifecycle/supervisor/service.py new file mode 100644 index 000000000..bc4985bea --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/service.py @@ -0,0 +1,40 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... 
import oscar as mo +from ...core import AbstractService +from .tracker import LifecycleTrackerActor + + +class LifecycleSupervisorService(AbstractService): + async def start(self): + pass + + async def stop(self): + pass + + async def create_session(self, session_id: str): + await mo.create_actor( + LifecycleTrackerActor, + session_id, + address=self._address, + uid=LifecycleTrackerActor.gen_uid(session_id), + ) + + async def destroy_session(self, session_id: str): + await mo.destroy_actor( + mo.create_actor_ref( + uid=LifecycleTrackerActor.gen_uid(session_id), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/tests/__init__.py b/python/xorbits/_mars/services/lifecycle/supervisor/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/tests/test_tracker.py b/python/xorbits/_mars/services/lifecycle/supervisor/tests/test_tracker.py new file mode 100644 index 000000000..c4bc49c25 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/tests/test_tracker.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ..... import oscar as mo +from ..... import tensor as mt +from .....core import tile +from ....cluster import MockClusterAPI +from ....meta import MockMetaAPI +from ....session import MockSessionAPI +from ....storage import DataNotExist, MockStorageAPI +from ....task.supervisor.manager import TaskManagerActor +from ... 
import TileableNotTracked +from ...supervisor.tracker import LifecycleTrackerActor + + +class FakeTaskManager(TaskManagerActor): + def __init__(self, session_id: str): + super().__init__(session_id) + self._remove_tileables = [] + + async def __post_create__(self): + pass + + def remove_tileables(self, tileable_keys): + self._remove_tileables.extend(tileable_keys) + + def get_removed_tileables(self): + return self._remove_tileables + + +@pytest.mark.asyncio +async def test_tracker(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + addr = pool.external_address + session_id = "test_session" + await MockClusterAPI.create(addr) + await MockSessionAPI.create(addr, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, addr) + storage_api = await MockStorageAPI.create(session_id, addr) + + try: + task_manager = await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=pool.external_address, + ) + + tracker = await mo.create_actor( + LifecycleTrackerActor, + session_id, + uid=LifecycleTrackerActor.gen_uid(session_id), + address=pool.external_address, + ) + + t = mt.random.rand(15, 5, chunk_size=5) + t = tile(t) + + tileable_key = t.key + chunk_keys = [] + for c in t.chunks: + chunk_keys.append(c.key) + await meta_api.set_chunk_meta(c, bands=[(addr, "numa-0")]) + await storage_api.put(c.key, np.random.rand(5, 5)) + + await tracker.track(tileable_key, chunk_keys) + await tracker.incref_tileables([tileable_key]) + await tracker.incref_tileables([tileable_key], [2]) + await tracker.incref_chunks(chunk_keys[:2]) + await tracker.incref_chunks(chunk_keys[:2], [3, 3]) + await tracker.decref_chunks(chunk_keys[:2]) + await tracker.decref_chunks(chunk_keys[:2], [3, 3]) + await tracker.decref_tileables([tileable_key]) + await tracker.decref_tileables([tileable_key], [2]) + assert len(await tracker.get_all_chunk_ref_counts()) == 0 + assert await task_manager.get_removed_tileables() == [tileable_key] + + with pytest.raises(ValueError): + await tracker.incref_tileables([tileable_key], [2, 3]) + + for chunk_key in chunk_keys: + with pytest.raises(KeyError): + await meta_api.get_chunk_meta(chunk_key) + for chunk_key in chunk_keys: + with pytest.raises(DataNotExist): + await storage_api.get(chunk_key) + + with pytest.raises(TileableNotTracked): + await tracker.incref_tileables(["not_tracked"]) + with pytest.raises(TileableNotTracked): + await tracker.decref_tileables(["not_tracked"]) + finally: + await MockStorageAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) diff --git a/python/xorbits/_mars/services/lifecycle/supervisor/tracker.py b/python/xorbits/_mars/services/lifecycle/supervisor/tracker.py new file mode 100644 index 000000000..00a8d6c7a --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/supervisor/tracker.py @@ -0,0 +1,258 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
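+# Reference-counting usage sketch (illustrative; the tracker is normally created
+# per session by LifecycleSupervisorService, and the key names below are made up):
+#
+#     tracker = await mo.create_actor(
+#         LifecycleTrackerActor, session_id,
+#         uid=LifecycleTrackerActor.gen_uid(session_id), address=supervisor_address,
+#     )
+#     await tracker.track(tileable_key, chunk_keys)   # register the tileable's chunks
+#     await tracker.incref_tileables([tileable_key])  # also increfs its chunks
+#     await tracker.decref_tileables([tileable_key])  # chunks hitting zero are removed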
+ +import asyncio +import itertools +import logging +from collections import defaultdict +from typing import Dict, List, Optional + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ...meta.api import MetaAPI +from ...storage.api import StorageAPI +from ..errors import TileableNotTracked + +logger = logging.getLogger(__name__) + + +class LifecycleTrackerActor(mo.Actor): + _meta_api: MetaAPI + + def __init__(self, session_id: str): + self._session_id = session_id + self._tileable_key_to_chunk_keys = dict() + self._tileable_ref_counts = defaultdict(lambda: 0) + self._chunk_ref_counts = defaultdict(lambda: 0) + + self._meta_api: Optional[MetaAPI] = None + + async def __post_create__(self): + self._meta_api = await MetaAPI.create(self._session_id, self.address) + + async def __pre_destroy__(self): + chunk_keys = [ + chunk_key + for chunk_key, ref_count in self._chunk_ref_counts.items() + if ref_count > 0 + ] + # remove all chunks + await self._remove_chunks(chunk_keys) + + @alru_cache + async def _get_task_api(self): + from ...task.api import TaskAPI + + return await TaskAPI.create(self._session_id, self.address) + + @staticmethod + def gen_uid(session_id): + return f"{session_id}_lifecycle_tracker" + + def _track(self, tileable_key: str, chunk_keys: List[str]): + if tileable_key not in self._tileable_key_to_chunk_keys: + self._tileable_key_to_chunk_keys[tileable_key] = [] + chunk_keys_set = set(self._tileable_key_to_chunk_keys[tileable_key]) + incref_chunk_keys = [] + tileable_ref_count = self._tileable_ref_counts.get(tileable_key, 0) + for chunk_key in chunk_keys: + if chunk_key in chunk_keys_set: + continue + if tileable_ref_count > 0: + incref_chunk_keys.extend([chunk_key] * tileable_ref_count) + self._tileable_key_to_chunk_keys[tileable_key].append(chunk_key) + if incref_chunk_keys: + self._incref_chunks(incref_chunk_keys) + + @mo.extensible + async def track(self, tileable_key: str, chunk_keys: List[str]): + return await asyncio.to_thread(self._track, tileable_key, chunk_keys) + + @classmethod + def _check_ref_counts(cls, keys: List[str], ref_counts: List[int]): + if ref_counts is not None and len(keys) != len(ref_counts): + raise ValueError( + f"`ref_counts` should have same size as `keys`, expect {len(keys)}, got {len(ref_counts)}" + ) + + def _incref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + counts = counts if counts is not None else itertools.repeat(1) + for chunk_key, count in zip(chunk_keys, counts): + self._chunk_ref_counts[chunk_key] += count + + async def incref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + self._check_ref_counts(chunk_keys, counts) + return await asyncio.to_thread(self._incref_chunks, chunk_keys, counts=counts) + + def _get_remove_chunk_keys(self, chunk_keys: List[str], counts: List[int] = None): + to_remove_chunk_keys = [] + counts = counts if counts is not None else itertools.repeat(1) + for chunk_key, count in zip(chunk_keys, counts): + ref_count = self._chunk_ref_counts[chunk_key] + ref_count -= count + assert ref_count >= 0, f"chunk key {chunk_key} will have negative ref count" + self._chunk_ref_counts[chunk_key] = ref_count + if ref_count == 0: + # remove + to_remove_chunk_keys.append(chunk_key) + return to_remove_chunk_keys + + async def decref_chunks(self, chunk_keys: List[str], counts: List[int] = None): + self._check_ref_counts(chunk_keys, counts) + to_remove_chunk_keys = await asyncio.to_thread( + self._get_remove_chunk_keys, chunk_keys, counts=counts + ) + # make _remove_chunks release 
actor lock so that multiple `decref_chunks` can run concurrently. + yield self._remove_chunks(to_remove_chunk_keys) + + async def _remove_chunks(self, to_remove_chunk_keys: List[str]): + # get meta + logger.debug( + "Remove chunks %.500s with a refcount of zero", to_remove_chunk_keys + ) + get_metas = [] + for to_remove_chunk_key in to_remove_chunk_keys: + get_metas.append( + self._meta_api.get_chunk_meta.delay( + to_remove_chunk_key, fields=["bands"], error="ignore" + ) + ) + metas = await self._meta_api.get_chunk_meta.batch(*get_metas) + + # filter chunks that not exist + new_to_remove_chunk_keys = [] + new_metas = [] + for to_remove_chunk_key, meta in zip(to_remove_chunk_keys, metas): + if meta is not None: + new_to_remove_chunk_keys.append(to_remove_chunk_key) + new_metas.append(meta) + to_remove_chunk_keys = new_to_remove_chunk_keys + metas = new_metas + + all_bands = [meta["bands"] for meta in metas] + key_to_addresses = dict() + for to_remove_chunk_key, bands in zip(to_remove_chunk_keys, all_bands): + key_to_addresses[to_remove_chunk_key] = bands + + # remove data via storage API + storage_api_to_deletes = defaultdict(list) + for key, bands in key_to_addresses.items(): + for band in bands: + # storage API is cached for same arguments + storage_api = await StorageAPI.create( + self._session_id, band[0], band[1] + ) + storage_api_to_deletes[storage_api].append( + storage_api.delete.delay(key, error="ignore") + ) + await asyncio.gather( + *[ + storage_api.delete.batch(*deletes) + for storage_api, deletes in storage_api_to_deletes.items() + ] + ) + + # delete meta + delete_metas = [] + for to_remove_chunk_key in to_remove_chunk_keys: + delete_metas.append( + self._meta_api.del_chunk_meta.delay(to_remove_chunk_key) + ) + await self._meta_api.del_chunk_meta.batch(*delete_metas) + + def get_chunk_ref_counts(self, chunk_keys: List[str]) -> List[int]: + return [self._chunk_ref_counts[chunk_key] for chunk_key in chunk_keys] + + def get_all_chunk_ref_counts(self) -> Dict[str, int]: + result = dict() + for chunk_key, ref_count in self._chunk_ref_counts.items(): + if ref_count > 0: + result[chunk_key] = ref_count + return result + + def _incref_tileables(self, tileable_keys: List[str], counts: List[int] = None): + counts = counts if counts is not None else itertools.repeat(1) + for tileable_key, count in zip(tileable_keys, counts): + if tileable_key not in self._tileable_key_to_chunk_keys: + raise TileableNotTracked(f"tileable {tileable_key} not tracked before") + self._tileable_ref_counts[tileable_key] += count + incref_chunk_keys = self._tileable_key_to_chunk_keys[tileable_key] + # incref chunks for this tileable + logger.debug( + "Incref chunks %.500s while increfing tileable %s", + incref_chunk_keys, + tileable_key, + ) + chunk_counts = None if count == 1 else [count] * len(incref_chunk_keys) + self._incref_chunks(incref_chunk_keys, counts=chunk_counts) + + async def incref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + self._check_ref_counts(tileable_keys, counts) + return await asyncio.to_thread( + self._incref_tileables, tileable_keys, counts=counts + ) + + def _get_decref_chunk_keys( + self, tileable_keys: List[str], counts: List[int] = None + ) -> Dict[str, int]: + decref_chunk_keys = dict() + counts = counts if counts is not None else itertools.repeat(1) + for tileable_key, count in zip(tileable_keys, counts): + if tileable_key not in self._tileable_key_to_chunk_keys: + raise TileableNotTracked(f"tileable {tileable_key} not tracked before") + 
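# lower the tileable-level refcount first, then fan the decref out to each chunk key registered for this tileable +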
self._tileable_ref_counts[tileable_key] -= count + + for chunk_key in self._tileable_key_to_chunk_keys[tileable_key]: + if chunk_key not in decref_chunk_keys: + decref_chunk_keys[chunk_key] = count + else: + decref_chunk_keys[chunk_key] += count + logger.debug( + "Decref chunks %.500s while decrefing tileables %s", + decref_chunk_keys, + tileable_keys, + ) + return decref_chunk_keys + + async def decref_tileables( + self, tileable_keys: List[str], counts: List[int] = None + ): + self._check_ref_counts(tileable_keys, counts) + decref_chunk_key_to_counts = await asyncio.to_thread( + self._get_decref_chunk_keys, tileable_keys, counts=counts + ) + to_remove_chunk_keys = await asyncio.to_thread( + self._get_remove_chunk_keys, + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + to_remove_tileable_keys = await asyncio.to_thread( + list, (key for key in tileable_keys if self._tileable_ref_counts[key] <= 0) + ) + coros = [] + if to_remove_chunk_keys: + coros.append(self._remove_chunks(to_remove_chunk_keys)) + if to_remove_tileable_keys: + task_api = await self._get_task_api() + coros.append(task_api.remove_tileables(to_remove_tileable_keys)) + if coros: + # release actor lock + yield asyncio.gather(*coros) + + def get_tileable_ref_counts(self, tileable_keys: List[str]) -> List[int]: + return [ + self._tileable_ref_counts[tileable_key] for tileable_key in tileable_keys + ] diff --git a/python/xorbits/_mars/services/lifecycle/worker/__init__.py b/python/xorbits/_mars/services/lifecycle/worker/__init__.py new file mode 100644 index 000000000..55b7ebca7 --- /dev/null +++ b/python/xorbits/_mars/services/lifecycle/worker/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import EmptyService + + +class TaskWorkerService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/meta/__init__.py b/python/xorbits/_mars/services/meta/__init__.py new file mode 100644 index 000000000..5d962a663 --- /dev/null +++ b/python/xorbits/_mars/services/meta/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .api import ( + AbstractMetaAPI, + MetaAPI, + MockMetaAPI, + MockWorkerMetaAPI, + WebMetaAPI, + WorkerMetaAPI, +) diff --git a/python/xorbits/_mars/services/meta/api/__init__.py b/python/xorbits/_mars/services/meta/api/__init__.py new file mode 100644 index 000000000..74aaea7fd --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractMetaAPI +from .oscar import MetaAPI, MockMetaAPI, MockWorkerMetaAPI, WorkerMetaAPI +from .web import WebMetaAPI diff --git a/python/xorbits/_mars/services/meta/api/core.py b/python/xorbits/_mars/services/meta/api/core.py new file mode 100644 index 000000000..f54daba31 --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/core.py @@ -0,0 +1,38 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + + +class AbstractMetaAPI(ABC): + @abstractmethod + async def get_chunk_meta( + self, object_id: str, fields: List[str] = None, error: str = "raise" + ) -> Optional[Dict]: + """ + Get chunk meta + + Parameters + ---------- + object_id + Object ID + fields + Fields to obtain + error + Way to handle errors, 'raise' by default + Returns + ------- + Dict with fields as keys + """ diff --git a/python/xorbits/_mars/services/meta/api/oscar.py b/python/xorbits/_mars/services/meta/api/oscar.py new file mode 100644 index 000000000..ac35ab33d --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/oscar.py @@ -0,0 +1,335 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Any, Dict, List + +from .... 
import oscar as mo +from ....core import ChunkType +from ....core.operand import Fuse +from ....lib.aio import alru_cache +from ....typing import BandType +from ....utils import get_chunk_params +from ..core import get_meta_type +from ..store import AbstractMetaStore +from ..supervisor.core import MetaStoreActor, MetaStoreManagerActor +from ..worker.core import WorkerMetaStoreManagerActor +from .core import AbstractMetaAPI + + +class BaseMetaAPI(AbstractMetaAPI): + def __init__(self, session_id: str, meta_store: mo.ActorRefType[AbstractMetaStore]): + # make sure all meta types registered + from .. import metas + + del metas + + self._session_id = session_id + self._meta_store = meta_store + + @mo.extensible + async def set_tileable_meta( + self, tileable, memory_size: int = None, store_size: int = None, **extra + ): + from ....dataframe.core import ( + DATAFRAME_GROUPBY_TYPE, + DATAFRAME_TYPE, + SERIES_GROUPBY_TYPE, + ) + + params = tileable.params.copy() + if isinstance( + tileable, (DATAFRAME_TYPE, DATAFRAME_GROUPBY_TYPE, SERIES_GROUPBY_TYPE) + ): + # dataframe needs some special process for now + del params["columns_value"] + del params["dtypes"] + params.pop("key_dtypes", None) + params["dtypes_value"] = tileable.dtypes_value + params["nsplits"] = tileable.nsplits + params.update(extra) + meta = get_meta_type(type(tileable))( + object_id=tileable.key, + **params, + memory_size=memory_size, + store_size=store_size + ) + return await self._meta_store.set_meta(tileable.key, meta) + + @mo.extensible + async def get_tileable_meta( + self, object_id: str, fields: List[str] = None + ) -> Dict[str, Any]: + return await self._meta_store.get_meta(object_id, fields=fields) + + @mo.extensible + async def del_tileable_meta(self, object_id: str): + return await self._meta_store.del_meta(object_id) + + @classmethod + def _extract_chunk_meta( + cls, + chunk: ChunkType, + memory_size: int = None, + store_size: int = None, + bands: List[BandType] = None, + fields: List[str] = None, + exclude_fields: List[str] = None, + **extra + ): + if isinstance(chunk.op, Fuse): + # fuse op + chunk = chunk.chunk + params = get_chunk_params(chunk) + chunk_key = extra.pop("chunk_key", chunk.key) + object_ref = extra.pop("object_ref", None) + params.update(extra) + + if object_ref: + object_refs = ( + [object_ref] if not isinstance(object_ref, list) else object_ref + ) + else: + object_refs = [] + + if fields is not None: + fields = set(fields) + params = {k: v for k, v in params.items() if k in fields} + elif exclude_fields is not None: + exclude_fields = set(exclude_fields) + params = {k: v for k, v in params.items() if k not in exclude_fields} + + return get_meta_type(type(chunk))( + object_id=chunk_key, + **params, + bands=bands, + memory_size=memory_size, + store_size=store_size, + object_refs=object_refs + ) + + @mo.extensible + async def set_chunk_meta( + self, + chunk: ChunkType, + memory_size: int = None, + store_size: int = None, + bands: List[BandType] = None, + fields: List[str] = None, + exclude_fields: List[str] = None, + **extra + ): + """ + Parameters + ---------- + chunk: ChunkType + chunk to set meta + memory_size: int + memory size for chunk data + store_size: int + serialized size for chunk data + bands: + chunk data bands + fields: list + fields to include in meta + exclude_fields: list + fields to exclude in meta + extra + + Returns + ------- + + """ + meta = self._extract_chunk_meta( + chunk, + memory_size=memory_size, + store_size=store_size, + bands=bands, + fields=fields, + 
exclude_fields=exclude_fields, + **extra + ) + return await self._meta_store.set_meta(meta.object_id, meta) + + @set_chunk_meta.batch + async def batch_set_chunk_meta(self, args_list, kwargs_list): + set_chunk_metas = [] + for args, kwargs in zip(args_list, kwargs_list): + meta = self._extract_chunk_meta(*args, **kwargs) + set_chunk_metas.append( + self._meta_store.set_meta.delay(meta.object_id, meta) + ) + return await self._meta_store.set_meta.batch(*set_chunk_metas) + + @mo.extensible + async def get_chunk_meta( + self, object_id: str, fields: List[str] = None, error="raise" + ): + return await self._meta_store.get_meta(object_id, fields=fields, error=error) + + @get_chunk_meta.batch + async def batch_get_chunk_meta(self, args_list, kwargs_list): + get_chunk_metas = [] + for args, kwargs in zip(args_list, kwargs_list): + get_chunk_metas.append(self._meta_store.get_meta.delay(*args, **kwargs)) + return await self._meta_store.get_meta.batch(*get_chunk_metas) + + @mo.extensible + async def del_chunk_meta(self, object_id: str): + """ + Parameters + ---------- + object_id: str + chunk id + """ + return await self._meta_store.del_meta(object_id) + + @del_chunk_meta.batch + async def batch_del_chunk_meta(self, args_list, kwargs_list): + del_chunk_metas = [] + for args, kwargs in zip(args_list, kwargs_list): + del_chunk_metas.append(self._meta_store.del_meta.delay(*args, **kwargs)) + return await self._meta_store.del_meta.batch(*del_chunk_metas) + + @mo.extensible + async def add_chunk_bands(self, object_id: str, bands: List[BandType]): + return await self._meta_store.add_chunk_bands(object_id, bands) + + @add_chunk_bands.batch + async def batch_add_chunk_bands(self, args_list, kwargs_list): + add_chunk_bands_tasks = [] + for args, kwargs in zip(args_list, kwargs_list): + add_chunk_bands_tasks.append( + self._meta_store.add_chunk_bands.delay(*args, **kwargs) + ) + return await self._meta_store.add_chunk_bands.batch(*add_chunk_bands_tasks) + + @mo.extensible + async def remove_chunk_bands(self, object_id: str, bands: List[BandType]): + return await self._meta_store.remove_chunk_bands(object_id, bands) + + @remove_chunk_bands.batch + async def batch_remove_chunk_bands(self, args_list, kwargs_list): + remove_chunk_bands_tasks = [] + for args, kwargs in zip(args_list, kwargs_list): + remove_chunk_bands_tasks.append( + self._meta_store.remove_chunk_bands.delay(*args, **kwargs) + ) + return await self._meta_store.remove_chunk_bands.batch( + *remove_chunk_bands_tasks + ) + + @mo.extensible + async def get_band_chunks(self, band: BandType) -> List[str]: + return await self._meta_store.get_band_chunks(band) + + +class MetaAPI(BaseMetaAPI): + @classmethod + @alru_cache(maxsize=1024, cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "MetaAPI": + """ + Create Meta API. + + Parameters + ---------- + session_id : str + Session ID. + address : str + Supervisor address. + + Returns + ------- + meta_api + Meta api. 
+ """ + meta_store_ref = await mo.actor_ref(address, MetaStoreActor.gen_uid(session_id)) + + return MetaAPI(session_id, meta_store_ref) + + +class MockMetaAPI(MetaAPI): + @classmethod + async def create(cls, session_id: str, address: str) -> "MetaAPI": + # create an Actor for mock + try: + meta_store_manager_ref = await mo.create_actor( + MetaStoreManagerActor, + "dict", + dict(), + address=address, + uid=MetaStoreManagerActor.default_uid(), + ) + except mo.ActorAlreadyExist: + # ignore if actor exists + meta_store_manager_ref = await mo.actor_ref( + MetaStoreManagerActor, + address=address, + uid=MetaStoreManagerActor.default_uid(), + ) + try: + await meta_store_manager_ref.new_session_meta_store(session_id) + except mo.ActorAlreadyExist: + pass + return await super().create(session_id=session_id, address=address) + + +class WorkerMetaAPI(BaseMetaAPI): + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "WorkerMetaAPI": + """ + Create worker meta API. + + Parameters + ---------- + session_id : str + Session ID. + address : str + Worker address. + + Returns + ------- + meta_api + Worker meta api. + """ + worker_meta_store_manager_ref = await mo.actor_ref( + uid=WorkerMetaStoreManagerActor.default_uid(), address=address + ) + worker_meta_store_ref = ( + await worker_meta_store_manager_ref.new_session_meta_store(session_id) + ) + return WorkerMetaAPI(session_id, worker_meta_store_ref) + + +class MockWorkerMetaAPI(WorkerMetaAPI): + @classmethod + async def create(cls, session_id: str, address: str) -> "WorkerMetaAPI": + # create an Actor for mock + try: + await mo.create_actor( + WorkerMetaStoreManagerActor, + "dict", + dict(), + address=address, + uid=WorkerMetaStoreManagerActor.default_uid(), + ) + except mo.ActorAlreadyExist: + # ignore if actor exists + await mo.actor_ref( + WorkerMetaStoreManagerActor, + address=address, + uid=WorkerMetaStoreManagerActor.default_uid(), + ) + return await super().create(session_id, address) diff --git a/python/xorbits/_mars/services/meta/api/web.py b/python/xorbits/_mars/services/meta/api/web.py new file mode 100644 index 000000000..193206cf1 --- /dev/null +++ b/python/xorbits/_mars/services/meta/api/web.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List, Optional + +from .... 
import oscar as mo
+from ....utils import deserialize_serializable, serialize_serializable
+from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api
+from .core import AbstractMetaAPI
+
+
+class MetaWebAPIHandler(MarsServiceWebAPIHandler):
+    _root_pattern = "/api/session/(?P<session_id>[^/]+)/meta"
+
+    async def _get_oscar_meta_api(self, session_id: str):
+        from .oscar import MetaAPI
+
+        return await self._get_api_by_key(MetaAPI, session_id)
+
+    @web_api("(?P<data_key>[^/]+)", method="get")
+    async def get_chunk_meta(self, session_id: str, data_key: str):
+        fields_str = self.get_argument("fields", None)
+        error = self.get_argument("error", "raise")
+        fields = fields_str.split(",") if fields_str else None
+
+        oscar_api = await self._get_oscar_meta_api(session_id)
+        result = await oscar_api.get_chunk_meta(data_key, fields=fields, error=error)
+        self.write(serialize_serializable(result))
+
+    @web_api("", method="post")
+    async def get_chunks_meta(self, session_id: str):
+        body_args = deserialize_serializable(self.request.body)
+        oscar_api = await self._get_oscar_meta_api(session_id)
+        get_metas = []
+        for data_key, fields, error in body_args:
+            get_metas.append(oscar_api.get_chunk_meta.delay(data_key, fields, error))
+        results = await oscar_api.get_chunk_meta.batch(*get_metas)
+        self.write(serialize_serializable(results))
+
+
+web_handlers = {MetaWebAPIHandler.get_root_pattern(): MetaWebAPIHandler}
+
+
+class WebMetaAPI(AbstractMetaAPI, MarsWebAPIClientMixin):
+    def __init__(
+        self, session_id: str, address: str, request_rewriter: Callable = None
+    ):
+        # make sure all meta types registered
+        from .. import metas
+
+        del metas
+
+        self._session_id = session_id
+        self._address = address.rstrip("/")
+        self.request_rewriter = request_rewriter
+
+    @mo.extensible
+    async def get_chunk_meta(
+        self, object_id: str, fields: List[str] = None, error: str = "raise"
+    ) -> Optional[Dict]:
+        params = dict(error=error)
+        req_addr = f"{self._address}/api/session/{self._session_id}/meta/{object_id}"
+        if fields:
+            params["fields"] = ",".join(fields)
+        res = await self._request_url("GET", req_addr, params=params)
+        return deserialize_serializable(res.body)
+
+    @get_chunk_meta.batch
+    async def get_chunks_meta(self, args_list, kwargs_list):
+        get_chunk_metas = []
+        for args, kwargs in zip(args_list, kwargs_list):
+            object_id, fields, error = self.get_chunk_meta.bind(*args, **kwargs)
+            get_chunk_metas.append([object_id, fields, error])
+
+        req_addr = f"{self._address}/api/session/{self._session_id}/meta"
+        res = await self._request_url(
+            "POST", req_addr, data=serialize_serializable(get_chunk_metas)
+        )
+        return deserialize_serializable(res.body)
diff --git a/python/xorbits/_mars/services/meta/core.py b/python/xorbits/_mars/services/meta/core.py new file mode 100644 index 000000000..9fb16b578 --- /dev/null +++ b/python/xorbits/_mars/services/meta/core.py @@ -0,0 +1,77 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
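For illustration only, and not part of the patch itself: a rough sketch of how the chunk-meta web endpoints defined in web.py above might be exercised from the client side through WebMetaAPI. The supervisor web address, session id and chunk keys below are hypothetical, and a running web service holding meta for those chunks is assumed; the delay/batch calls mirror the extensible pattern already used inside web.py.

import asyncio

from xorbits._mars.services.meta import WebMetaAPI


async def main():
    # Talks to MetaWebAPIHandler over HTTP; the address is a placeholder.
    web_meta_api = WebMetaAPI("my_session", "http://127.0.0.1:7103")

    # Single lookup, translated into
    # GET /api/session/my_session/meta/<chunk_key>?fields=shape,bands
    meta = await web_meta_api.get_chunk_meta("chunk_key", fields=["shape", "bands"])

    # Batched lookups are folded into a single POST request by the handler above.
    metas = await web_meta_api.get_chunk_meta.batch(
        web_meta_api.get_chunk_meta.delay("chunk_key", ["shape"], "ignore"),
        web_meta_api.get_chunk_meta.delay("other_chunk_key", ["bands"], "ignore"),
    )
    return meta, metas


asyncio.run(main())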
+ +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple, Type, Union + +import numpy as np +import pandas as pd + +from ...typing import BandType +from ...utils import TypeDispatcher, dataslots + +PandasDtypeType = Union[np.dtype, pd.api.extensions.ExtensionDtype] + +_meta_class_dispatcher = TypeDispatcher() + + +def register_meta_type(object_types: Tuple): + def _call(meta_type: Type["_CommonMeta"]): + _meta_class_dispatcher.register(object_types, meta_type) + return meta_type + + return _call + + +def get_meta_type(object_type: Type) -> Type["_CommonMeta"]: + return _meta_class_dispatcher.get_handler(object_type) + + +@dataslots +@dataclass +class _CommonMeta: + """ + Class for common meta, for both tileable and chunk, or DataFrame, tensor etc. + """ + + object_id: str + name: Any = None + memory_size: int = None # size in memory + store_size: int = None # size that stored in storage + extra: Dict = None + + def merge_from(self, value: "_CommonMeta"): + return self + + +@dataslots +@dataclass +class _TileableMeta(_CommonMeta): + nsplits: Tuple[Tuple[int]] = None + + +@dataslots +@dataclass +class _ChunkMeta(_CommonMeta): + index: Tuple[int] = None + bands: List[BandType] = None + # needed by ray ownership to keep object alive when worker died. + object_refs: List[Any] = None + + def merge_from(self, value: "_ChunkMeta"): + if value.bands: + self.bands = list(set(self.bands) | set(value.bands)) + if value.object_refs: + self.object_refs = list(set(self.object_refs) | set(value.object_refs)) + return self diff --git a/python/xorbits/_mars/services/meta/metas.py b/python/xorbits/_mars/services/meta/metas.py new file mode 100644 index 000000000..d166c4fe3 --- /dev/null +++ b/python/xorbits/_mars/services/meta/metas.py @@ -0,0 +1,207 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
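A small sketch, illustrative only and not part of the patch, of how the dispatcher defined in core.py above ties object types to meta classes; metas.py below does exactly this for the real tensor and DataFrame types. FakeTensor and FakeTensorMeta are made-up names, and the private base classes are imported here just as metas.py imports them.

from dataclasses import dataclass

from xorbits._mars.services.meta.core import (
    _ChunkMeta,
    _TileableMeta,
    get_meta_type,
    register_meta_type,
)


class FakeTensor:  # stand-in for a real tileable type such as TENSOR_TYPE
    pass


@register_meta_type((FakeTensor,))
@dataclass
class FakeTensorMeta(_TileableMeta):
    shape: tuple = None


# get_meta_type() resolves the registered meta class for an object type ...
meta_cls = get_meta_type(FakeTensor)
assert meta_cls is FakeTensorMeta

# ... which the meta APIs then instantiate and hand to a meta store.
meta = meta_cls(object_id="t1", shape=(10, 10), nsplits=((10,), (10,)))

# merge_from() unions band info when meta for the same chunk is set twice.
old = _ChunkMeta(object_id="c1", bands=[("worker-1:1234", "numa-0")])
new = _ChunkMeta(object_id="c1", bands=[("worker-2:1234", "numa-0")])
assert len(new.merge_from(old).bands) == 2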
+ +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple + +import numpy as np + +from ...core import OBJECT_CHUNK_TYPE, OBJECT_TYPE +from ...dataframe.core import ( + CATEGORICAL_CHUNK_TYPE, + CATEGORICAL_TYPE, + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_CHUNK_TYPE, + DATAFRAME_GROUPBY_TYPE, + DATAFRAME_OR_SERIES_CHUNK_TYPE, + DATAFRAME_OR_SERIES_TYPE, + DATAFRAME_TYPE, + INDEX_CHUNK_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_TYPE, + SERIES_TYPE, + DtypesValue, + IndexValue, +) +from ...tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ...utils import dataslots +from .core import PandasDtypeType, _ChunkMeta, _TileableMeta, register_meta_type + +""" +Create a separate module for metas to avoid direct +dependency on mars.dataframe +""" + + +@register_meta_type(TENSOR_TYPE) +@dataslots +@dataclass +class TensorMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: np.dtype = None + order: TensorOrder = None + + +@register_meta_type(DATAFRAME_TYPE) +@dataslots +@dataclass +class DataFrameMeta(_TileableMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + + +@register_meta_type(SERIES_TYPE) +@dataslots +@dataclass +class SeriesMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(INDEX_TYPE) +@dataslots +@dataclass +class IndexMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(DATAFRAME_GROUPBY_TYPE) +@dataslots +@dataclass +class DataFrameGroupByMeta(_TileableMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + selection: List = None + + +@register_meta_type(SERIES_GROUPBY_TYPE) +@dataslots +@dataclass +class SeriesGroupByMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + selection: List = None + + +@register_meta_type(CATEGORICAL_TYPE) +@dataslots +@dataclass +class CategoricalMeta(_TileableMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + categories_value: IndexValue = None + + +@register_meta_type(OBJECT_TYPE) +@dataslots +@dataclass +class ObjectMeta(_TileableMeta): + pass + + +@register_meta_type(TENSOR_CHUNK_TYPE) +@dataslots +@dataclass +class TensorChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: np.dtype = None + order: TensorOrder = None + + +@register_meta_type(DATAFRAME_CHUNK_TYPE) +@dataslots +@dataclass +class DataFrameChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + + +@register_meta_type(SERIES_CHUNK_TYPE) +@dataslots +@dataclass +class SeriesChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(INDEX_CHUNK_TYPE) +@dataslots +@dataclass +class IndexChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + index_value: IndexValue = None + + +@register_meta_type(DATAFRAME_GROUPBY_CHUNK_TYPE) +@dataslots +@dataclass +class DataFrameGroupByChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtypes_value: DtypesValue = None + index_value: IndexValue = None + selection: List = None + + +@register_meta_type(SERIES_GROUPBY_CHUNK_TYPE) +@dataslots +@dataclass +class SeriesGroupByChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + 
index_value: IndexValue = None + selection: List = None + + +@register_meta_type(CATEGORICAL_CHUNK_TYPE) +@dataslots +@dataclass +class CategoricalChunkMeta(_ChunkMeta): + shape: Tuple[int] = None + dtype: PandasDtypeType = None + categories_value: IndexValue = None + + +@register_meta_type(OBJECT_CHUNK_TYPE) +@dataslots +@dataclass +class ObjectChunkMeta(_ChunkMeta): + pass + + +@register_meta_type(DATAFRAME_OR_SERIES_TYPE) +@dataslots +@dataclass +class DataFrameOrSeriesMeta(_TileableMeta): + data_type: str = None + data_params: Dict[str, Any] = None + + +@register_meta_type(DATAFRAME_OR_SERIES_CHUNK_TYPE) +@dataslots +@dataclass +class DataFrameOrSeriesChunkMeta(_ChunkMeta): + collapse_axis: int = None + data_type: str = None + data_params: Dict[str, Any] = None diff --git a/python/xorbits/_mars/services/meta/store/__init__.py b/python/xorbits/_mars/services/meta/store/__init__.py new file mode 100644 index 000000000..e238d3b3c --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import AbstractMetaStore, get_meta_store +from .dictionary import DictMetaStore diff --git a/python/xorbits/_mars/services/meta/store/base.py b/python/xorbits/_mars/services/meta/store/base.py new file mode 100644 index 000000000..24c05e1f0 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/base.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List, Type + +from ....typing import BandType +from ..core import _CommonMeta + + +class AbstractMetaStore(ABC): + name = None + + def __init__(self, session_id: str, **kw): + # make sure all meta types registered + from .. import metas + + del metas + + self._session_id = session_id + + @classmethod + @abstractmethod + async def create(cls, config) -> Dict: + """ + Create a meta store. Do some initialization work. + For instance, for database backend, + db files including tables may be created first. + This should be done when service starting. + + Parameters + ---------- + config : dict + config. + + Returns + ------- + kwargs : dict + kwargs to create a meta store. + """ + + @abstractmethod + async def set_meta(self, object_id: str, meta: _CommonMeta): + """ + Set meta. + + Parameters + ---------- + object_id : str + Object ID. + meta : _CommonMeta + Meta. 
+ """ + + @abstractmethod + async def get_meta( + self, object_id: str, fields: List[str] = None, error="raise" + ) -> Dict: + """ + Get meta. + + Parameters + ---------- + object_id : str + Object ID. + fields : list + Fields to filter, if not provided, get all fields. + error : str + 'raise' or 'ignore' + + Returns + ------- + meta: dict + Meta. + """ + + @abstractmethod + async def del_meta(self, object_id: str): + """ + Delete meta. + + Parameters + ---------- + object_id : str + Object ID. + """ + + @abstractmethod + async def add_chunk_bands(self, object_id: str, bands: List[BandType]): + """ + Add band to chunk. + + Parameters + ---------- + object_id : str + Object ID. + bands : List[BandType] + Band of chunk to add, shall be tuple of (worker, band). + """ + + @abstractmethod + async def remove_chunk_bands(self, object_id: str, bands: List[BandType]): + """ + Remove bands from chunk. + + Parameters + ---------- + object_id : str + Object ID. + bands : List[BandType] + Bands of chunk to remove, shall be tuple of (worker, band). + """ + + @abstractmethod + async def get_band_chunks(self, band: BandType) -> List[str]: + """Get chunks key of band""" + + +_meta_store_types: Dict[str, Type[AbstractMetaStore]] = dict() + + +def register_meta_store(meta_store: Type[AbstractMetaStore]): + _meta_store_types[meta_store.name] = meta_store + return meta_store + + +def get_meta_store(meta_store_name: str) -> Type[AbstractMetaStore]: + return _meta_store_types[meta_store_name] diff --git a/python/xorbits/_mars/services/meta/store/dictionary.py b/python/xorbits/_mars/services/meta/store/dictionary.py new file mode 100644 index 000000000..fbdbdc5d0 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/dictionary.py @@ -0,0 +1,159 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +from collections import defaultdict +from dataclasses import fields as dataclass_fields +from typing import Dict, List + +from .... import oscar as mo +from ....lib.ordered_set import OrderedSet +from ....typing import BandType +from ....utils import implements +from ..core import _ChunkMeta, _CommonMeta +from .base import AbstractMetaStore, register_meta_store + + +@functools.lru_cache(100) +def _get_meta_fields(meta_cls): + return [f.name for f in dataclass_fields(meta_cls)] + + +@register_meta_store +class DictMetaStore(AbstractMetaStore): + name = "dict" + + def __init__(self, session_id: str, **kw): + super().__init__(session_id) + self._store: Dict[str, _CommonMeta] = dict() + # For shuffle data, we use main key to record them, here uses + # OrderedSet to make sure that the first band in set stores complete + # data, other bands may only have part data, so when reducers fetch data, + # we always choose the first band to avoid unexpected absence. 
+ self._band_chunks: Dict[BandType, OrderedSet] = defaultdict(OrderedSet) + if kw: # pragma: no cover + raise TypeError(f"Keyword arguments {kw!r} cannot be recognized.") + + @classmethod + @implements(AbstractMetaStore.create) + async def create(cls, config) -> Dict: + # Nothing needs to do for dict-based meta store. + # no extra kwargs. + return dict() + + def _set_meta(self, object_id: str, meta: _CommonMeta): + if isinstance(meta, _ChunkMeta): + for band in meta.bands: + self._band_chunks[band].add(object_id) + prev_meta = self._store.get(object_id) + if prev_meta: + meta = meta.merge_from(prev_meta) + self._store[object_id] = meta + + @implements(AbstractMetaStore.set_meta) + @mo.extensible + async def set_meta(self, object_id: str, meta: _CommonMeta): + self._set_meta(object_id, meta) + + @set_meta.batch + async def batch_set_meta(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._set_meta(*args, **kwargs) + + def _get_meta( + self, object_id: str, fields: List[str] = None, error: str = "raise" + ) -> Dict: + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + try: + meta = self._store[object_id] + if fields is None: + fields = _get_meta_fields(type(meta)) + return {k: getattr(meta, k) for k in fields} + except KeyError: + if error == "raise": + raise + else: + return + + @implements(AbstractMetaStore.get_meta) + @mo.extensible + async def get_meta( + self, object_id: str, fields: List[str] = None, error: str = "raise" + ) -> Dict: + return self._get_meta(object_id, fields=fields, error=error) + + @get_meta.batch + async def batch_get_meta(self, args_list, kwargs_list): + metas = [] + for args, kwargs in zip(args_list, kwargs_list): + metas.append(self._get_meta(*args, **kwargs)) + return metas + + def _del_meta(self, object_id: str): + meta = self._store[object_id] + if isinstance(meta, _ChunkMeta): + for band in meta.bands: + chunks = self._band_chunks[band] + chunks.remove(object_id) + if len(chunks) == 0: + del self._band_chunks[band] + del self._store[object_id] + + @implements(AbstractMetaStore.del_meta) + @mo.extensible + async def del_meta(self, object_id: str): + self._del_meta(object_id) + + @del_meta.batch + async def batch_del_meta(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._del_meta(*args, **kwargs) + + def _add_chunk_bands(self, object_id: str, bands: List[BandType]): + meta = self._store[object_id] + assert isinstance(meta, _ChunkMeta) + meta.bands = list(OrderedSet(meta.bands) | OrderedSet(bands)) + for band in bands: + self._band_chunks[band].add(object_id) + + @implements(AbstractMetaStore.add_chunk_bands) + @mo.extensible + async def add_chunk_bands(self, object_id: str, bands: List[BandType]): + self._add_chunk_bands(object_id, bands) + + @add_chunk_bands.batch + async def batch_add_chunk_bands(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._add_chunk_bands(*args, **kwargs) + + def _remove_chunk_bands(self, object_id: str, bands: List[BandType]): + meta = self._store[object_id] + assert isinstance(meta, _ChunkMeta) + meta.bands = list(OrderedSet(meta.bands) - OrderedSet(bands)) + for band in bands: + self._band_chunks[band].remove(object_id) + + @implements(AbstractMetaStore.remove_chunk_bands) + @mo.extensible + async def remove_chunk_bands(self, object_id: str, bands: List[BandType]): + self._remove_chunk_bands(object_id, bands) + + @remove_chunk_bands.batch + async def 
batch_remove_chunk_bands(self, args_list, kwargs_list): + for args, kwargs in zip(args_list, kwargs_list): + self._remove_chunk_bands(*args, **kwargs) + + async def get_band_chunks(self, band: BandType) -> List[str]: + return list(self._band_chunks[band]) diff --git a/python/xorbits/_mars/services/meta/store/tests/__init__.py b/python/xorbits/_mars/services/meta/store/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/meta/store/tests/test_meta_store.py b/python/xorbits/_mars/services/meta/store/tests/test_meta_store.py new file mode 100644 index 000000000..8ccfa96b9 --- /dev/null +++ b/python/xorbits/_mars/services/meta/store/tests/test_meta_store.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import tensor as mt +from .....core import tile +from ...metas import TensorMeta +from ...store import get_meta_store + + +@pytest.mark.asyncio +async def test_mock_meta_store(): + meta_store = get_meta_store("dict")("mock_session_id") + + t = mt.random.rand(10, 10) + t = tile(t) + + await meta_store.set_meta( + t.key, + TensorMeta( + object_id=t.key, + shape=t.shape, + dtype=t.dtype, + order=t.order, + nsplits=t.nsplits, + ), + ) + + meta = await meta_store.get_meta(t.key) + assert meta["shape"] == t.shape + assert meta["order"] == t.order + assert meta["dtype"] == t.dtype + + meta = await meta_store.get_meta(t.key, fields=["shape", "order"]) + assert meta["shape"] == t.shape + assert meta["order"] == t.order + assert "dtype" not in meta + + await meta_store.del_meta(t.key) + + with pytest.raises(KeyError): + await meta_store.get_meta(t.key) diff --git a/python/xorbits/_mars/services/meta/supervisor/__init__.py b/python/xorbits/_mars/services/meta/supervisor/__init__.py new file mode 100644 index 000000000..5f21cf2a3 --- /dev/null +++ b/python/xorbits/_mars/services/meta/supervisor/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import MetaSupervisorService diff --git a/python/xorbits/_mars/services/meta/supervisor/core.py b/python/xorbits/_mars/services/meta/supervisor/core.py new file mode 100644 index 000000000..9def8e47b --- /dev/null +++ b/python/xorbits/_mars/services/meta/supervisor/core.py @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from typing import Dict + +from .... import oscar as mo +from ...session import SessionAPI +from ..store import get_meta_store + + +class MetaStoreManagerActor(mo.Actor): + def __init__(self, meta_store_name: str, config: Dict): + self._meta_store_name = meta_store_name + self._meta_store_type = get_meta_store(meta_store_name) + self._config = config + self._meta_init_kwargs = None + + # API + self._session_api = None + + async def __post_create__(self): + self._meta_init_kwargs = await self._meta_store_type.create(self._config) + self._session_api = await SessionAPI.create(self.address) + + async def new_session_meta_store(self, session_id: str) -> mo.ActorRef: + session_address = await self._session_api.get_session_address(session_id) + allocate_strategy = mo.allocate_strategy.AddressSpecified(session_address) + return await mo.create_actor( + MetaStoreActor, + self._meta_store_name, + session_id, + address=self.address, + uid=MetaStoreActor.gen_uid(session_id), + allocate_strategy=allocate_strategy, + **self._meta_init_kwargs, + ) + + +class MetaStoreActor(mo.Actor): + def __init__(self, meta_store_name: str, session_id: str, **meta_store_kwargs): + meta_store_type = get_meta_store(meta_store_name) + self._store = meta_store_type(session_id, **meta_store_kwargs) + self._worker_meta_store_refs = [] + + def add_worker_meta_store(self, ref: mo.ActorRef): + self._worker_meta_store_refs.append(ref) + + async def __pre_destroy__(self): + await asyncio.gather( + *[ + mo.destroy_actor(mo.create_actor_ref(ref)) + for ref in self._worker_meta_store_refs + ] + ) + + @staticmethod + def gen_uid(session_id: str): + return f"{session_id}_meta" + + def __getattr__(self, attr): + return getattr(self._store, attr) diff --git a/python/xorbits/_mars/services/meta/supervisor/service.py b/python/xorbits/_mars/services/meta/supervisor/service.py new file mode 100644 index 000000000..82f043b30 --- /dev/null +++ b/python/xorbits/_mars/services/meta/supervisor/service.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import MetaStoreActor, MetaStoreManagerActor + + +class MetaSupervisorService(AbstractService): + """ + Meta service on supervisor. + + Service Configuration + --------------------- + { + "meta" : { + "store": "", + # other config related to each store + } + } + """ + + async def start(self): + service_config = self._config["meta"] + meta_store_name = service_config.get("meta", "dict") + extra_config = service_config.copy() + extra_config.pop("meta", None) + await mo.create_actor( + MetaStoreManagerActor, + meta_store_name, + extra_config, + uid=MetaStoreManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=MetaStoreManagerActor.default_uid(), address=self._address + ) + ) + + async def create_session(self, session_id: str): + # get MetaStoreManagerActor ref. + meta_store_manager_ref = await mo.actor_ref( + self._address, MetaStoreManagerActor.default_uid() + ) + await meta_store_manager_ref.new_session_meta_store(session_id) + + async def destroy_session(self, session_id: str): + meta_store_ref = await mo.actor_ref( + self._address, MetaStoreActor.gen_uid(session_id) + ) + await mo.destroy_actor(meta_store_ref) diff --git a/python/xorbits/_mars/services/meta/tests/__init__.py b/python/xorbits/_mars/services/meta/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/meta/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/meta/tests/test_api.py b/python/xorbits/_mars/services/meta/tests/test_api.py new file mode 100644 index 000000000..6adb32a9a --- /dev/null +++ b/python/xorbits/_mars/services/meta/tests/test_api.py @@ -0,0 +1,171 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import pytest + +from .... import dataframe as md +from .... import oscar as mo +from .... import remote as mr +from .... 
import tensor as mt +from ....core import tile +from ....utils import get_next_port +from ... import NodeRole, start_services, stop_services +from ...cluster import MockClusterAPI +from ...session import MockSessionAPI, SessionAPI +from .. import MetaAPI, MockMetaAPI, WebMetaAPI, WorkerMetaAPI + +t = mt.random.rand(10, 10) +df = md.DataFrame(t) +series = df[0] +index = df.index +obj = mr.spawn(lambda: 3) +t, df, series, index, obj = tile(t, df, series, index, obj) + +test_objects = [t, df, series, index, obj] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("obj", test_objects) +async def test_meta_mock_api(obj): + start_method = "fork" if sys.platform != "win32" else None + pool = await mo.create_actor_pool( + "127.0.0.1", 2, subprocess_start_method=start_method + ) + async with pool: + session_id = "mock_session_id" + + await MockClusterAPI.create(pool.external_address) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create( + session_id=session_id, address=pool.external_address + ) + + await meta_api.set_tileable_meta(obj) + meta = await meta_api.get_tileable_meta(obj.key, fields=["nsplits"]) + assert meta["nsplits"] == obj.nsplits + await meta_api.del_tileable_meta(obj.key) + with pytest.raises(KeyError): + await meta_api.get_tileable_meta(obj.key) + + chunk = obj.chunks[0] + + await meta_api.set_chunk_meta(chunk, bands=[(pool.external_address, "numa-0")]) + meta = await meta_api.get_chunk_meta(chunk.key, fields=["index", "bands"]) + assert meta["index"] == chunk.index + assert meta["bands"] == [(pool.external_address, "numa-0")] + + for i in range(2): + band = (f"1.2.3.{i}:1234", "numa-0") + await meta_api.add_chunk_bands(chunk.key, [band]) + meta = await meta_api.get_chunk_meta(chunk.key, fields=["bands"]) + assert band in meta["bands"] + meta = await meta_api.get_chunk_meta(chunk.key, fields=["bands"]) + band = meta["bands"][0] + chunks = await meta_api.get_band_chunks(band) + assert chunk.key in chunks + await meta_api.remove_chunk_bands(chunk.key, [band]) + meta = await meta_api.get_chunk_meta(chunk.key, fields=["bands"]) + assert band not in meta["bands"] + + await meta_api.del_chunk_meta(chunk.key) + with pytest.raises(KeyError): + await meta_api.get_chunk_meta(chunk.key) + + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_worker_meta_api(): + supervisor_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + worker_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with supervisor_pool, worker_pool: + config = { + "services": ["cluster", "session", "meta", "web"], + "cluster": { + "backend": "fixed", + "lookup_address": supervisor_pool.external_address, + }, + "meta": {"store": "dict"}, + } + await start_services( + NodeRole.SUPERVISOR, config, address=supervisor_pool.external_address + ) + await start_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + + session_id = "test_session" + session_api = await SessionAPI.create(supervisor_pool.external_address) + await session_api.create_session(session_id) + + worker_meta_api = await WorkerMetaAPI.create( + session_id=session_id, address=worker_pool.external_address + ) + await worker_meta_api.set_tileable_meta(t) + meta = await worker_meta_api.get_tileable_meta(t.key, fields=["nsplits"]) + assert meta["nsplits"] == t.nsplits + await worker_meta_api.del_tileable_meta(t.key) + with pytest.raises(KeyError): + await worker_meta_api.get_tileable_meta(t.key) + + await 
stop_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + await stop_services( + NodeRole.SUPERVISOR, config, address=supervisor_pool.external_address + ) + + +@pytest.mark.asyncio +async def test_meta_web_api(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + web_port = get_next_port() + + async with pool: + config = { + "services": ["cluster", "session", "meta", "web"], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + "web": { + "port": web_port, + }, + } + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(pool.external_address) + await session_api.create_session(session_id) + + t = mt.random.rand(10, 10) + t = tile(t) + + meta_api = await MetaAPI.create(session_id, pool.external_address) + web_api = WebMetaAPI(session_id, f"http://localhost:{web_port}") + + await meta_api.set_chunk_meta( + t.chunks[0], bands=[(pool.external_address, "numa-0")] + ) + meta = await web_api.get_chunk_meta(t.chunks[0].key, fields=["shape", "bands"]) + assert set(meta.keys()) == {"shape", "bands"} + + with pytest.raises(KeyError): + await web_api.get_chunk_meta("non-exist-key") + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) diff --git a/python/xorbits/_mars/services/meta/tests/test_service.py b/python/xorbits/_mars/services/meta/tests/test_service.py new file mode 100644 index 000000000..6df7acbaf --- /dev/null +++ b/python/xorbits/_mars/services/meta/tests/test_service.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .... import oscar as mo +from ... 
import NodeRole, start_services, stop_services +from ...session.api import SessionAPI +from ..api import MetaAPI, WorkerMetaAPI +from ..supervisor import MetaSupervisorService + + +@pytest.mark.asyncio +async def test_meta_service(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + worker_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool, worker_pool: + config = { + "services": ["cluster", "session", "meta"], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + } + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + await start_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + + session_id = "test_session" + session_api = await SessionAPI.create(pool.external_address) + await session_api.create_session(session_id) + # get session store + meta_api = await MetaAPI.create(session_id, pool.external_address) + # get worker meta store + worker_meta_api = await WorkerMetaAPI.create( + session_id, worker_pool.external_address + ) + + # destroy session + service = MetaSupervisorService({}, pool.external_address) + await service.destroy_session(session_id) + with pytest.raises(mo.ActorNotExist): + await service.destroy_session(session_id) + with pytest.raises(mo.ActorNotExist): + # actor already destroyed + await worker_meta_api.get_tileable_meta("any_id") + + # test alru_cache + assert await MetaAPI.create(session_id, pool.external_address) is meta_api + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) diff --git a/python/xorbits/_mars/services/meta/worker/__init__.py b/python/xorbits/_mars/services/meta/worker/__init__.py new file mode 100644 index 000000000..8ea51a17f --- /dev/null +++ b/python/xorbits/_mars/services/meta/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import MetaWorkerService diff --git a/python/xorbits/_mars/services/meta/worker/core.py b/python/xorbits/_mars/services/meta/worker/core.py new file mode 100644 index 000000000..d7660333e --- /dev/null +++ b/python/xorbits/_mars/services/meta/worker/core.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ...cluster import ClusterAPI +from ..store import get_meta_store + + +class WorkerMetaStoreManagerActor(mo.Actor): + def __init__(self, meta_store_name: str, config: Dict): + self._meta_store_name = meta_store_name + self._meta_store_type = get_meta_store(meta_store_name) + self._config = config + self._meta_init_kwargs = None + + self._cluster_api = None + + async def __post_create__(self): + self._meta_init_kwargs = await self._meta_store_type.create(self._config) + self._cluster_api = await ClusterAPI.create(self.address) + + @alru_cache(cache_exceptions=False) + async def _get_supervisor_address(self, session_id: str): + [address] = await self._cluster_api.get_supervisors_by_keys([session_id]) + return address + + async def new_session_meta_store(self, session_id: str) -> mo.ActorRef: + from ..supervisor.core import MetaStoreActor + + try: + ref = await mo.create_actor( + WorkerMetaStoreActor, + self._meta_store_name, + session_id, + uid=WorkerMetaStoreActor.gen_uid(session_id), + address=self.address, + **self._meta_init_kwargs, + ) + supervisor_address = await self._get_supervisor_address(session_id) + supervisor_meta_store_ref = await mo.actor_ref( + uid=MetaStoreActor.gen_uid(session_id), address=supervisor_address + ) + # register worker meta store, + # when session destroyed, this worker meta store actor will be removed + await supervisor_meta_store_ref.add_worker_meta_store(ref) + except mo.ActorAlreadyExist: + ref = await mo.actor_ref( + uid=WorkerMetaStoreActor.gen_uid(session_id), address=self.address + ) + return ref + + +class WorkerMetaStoreActor(mo.Actor): + def __init__(self, meta_store_name: str, session_id: str, **meta_store_kwargs): + meta_store_type = get_meta_store(meta_store_name) + self._store = meta_store_type(session_id, **meta_store_kwargs) + + @staticmethod + def gen_uid(session_id: str): + return f"{session_id}_worker_meta" + + def __getattr__(self, attr): + return getattr(self._store, attr) diff --git a/python/xorbits/_mars/services/meta/worker/service.py b/python/xorbits/_mars/services/meta/worker/service.py new file mode 100644 index 000000000..8b412c6e0 --- /dev/null +++ b/python/xorbits/_mars/services/meta/worker/service.py @@ -0,0 +1,52 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import WorkerMetaStoreManagerActor + + +class MetaWorkerService(AbstractService): + """ + Meta service on worker. 
+ + Service Configuration + --------------------- + { + "meta" : { + "store": "", + # other config related to each store + } + } + """ + + async def start(self): + service_config = self._config["meta"] + meta_store_name = service_config.get("meta", "dict") + extra_config = service_config.copy() + extra_config.pop("meta", None) + await mo.create_actor( + WorkerMetaStoreManagerActor, + meta_store_name, + extra_config, + uid=WorkerMetaStoreManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerMetaStoreManagerActor.default_uid(), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/mutable/__init__.py b/python/xorbits/_mars/services/mutable/__init__.py new file mode 100644 index 000000000..10c5a5cb6 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractMutableAPI, MockMutableAPI, MutableAPI, WebMutableAPI +from .core import MutableTensor +from .supervisor import MutableObjectManagerActor, MutableTensorActor diff --git a/python/xorbits/_mars/services/mutable/api/__init__.py b/python/xorbits/_mars/services/mutable/api/__init__.py new file mode 100644 index 000000000..3ba73c389 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractMutableAPI +from .oscar import MockMutableAPI, MutableAPI +from .web import WebMutableAPI diff --git a/python/xorbits/_mars/services/mutable/api/core.py b/python/xorbits/_mars/services/mutable/api/core.py new file mode 100644 index 000000000..1e9cfa9fa --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/core.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
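For illustration, not part of the patch: the configuration shape that the meta service classes above (MetaSupervisorService and MetaWorkerService) read under the "meta" key, mirroring the dict used by the tests in this diff; the addresses are placeholders.

from xorbits._mars.services import NodeRole, start_services

config = {
    "services": ["cluster", "session", "meta"],
    "cluster": {
        "backend": "fixed",
        "lookup_address": "127.0.0.1:11111",  # supervisor address, placeholder
    },
    # consumed by MetaSupervisorService / MetaWorkerService at service start
    "meta": {"store": "dict"},
}

# inside a running actor pool (see tests/test_service.py):
#   await start_services(NodeRole.SUPERVISOR, config, address="127.0.0.1:11111")
#   await start_services(NodeRole.WORKER, config, address="127.0.0.1:22222")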
+ +from abc import ABC, abstractmethod +from typing import Tuple, Union + +import numpy as np + +from ..core import MutableTensorInfo + + +class AbstractMutableAPI(ABC): + @abstractmethod + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ) -> MutableTensorInfo: + """ + Create a mutable tensor. + + Parameters + ---------- + shape: tuple + Shape of the mutable tensor. + + dtype: np.dtype or str + Data type of the mutable tensor. + + chunk_size: int or tuple + Chunk size of the mutable tensor. + + name: str, optional + Name of the mutable tensor, a random name will be used if not specified. + + default_value: optional + Default value of the mutable tensor. Default is 0. + + Returns + ------- + MutableTensorInfo + """ + + @abstractmethod + async def get_mutable_tensor(self, name: str) -> MutableTensorInfo: + """ + Get the mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to get. + + Returns + ------- + MutableTensorInfo + """ + + @abstractmethod + async def seal_mutable_tensor(self, name: str, timestamp=None): + """ + Seal the mutable tensor by name. + + Parameters + ---------- + name: str + Name of the mutable tensor to seal. + + timestamp: optional + Operations that happened before timestamp will be sealed, and later ones will be discard. + + Returns + ------- + object + """ + + @abstractmethod + async def read(self, name: str, index: object, timestamp=None): + """ + Read value from mutable tensor. + + Parameters + ---------- + name: str + Name of mutable tensor to read. + + index: + Index to read from the tensor. + + timestamp: optional + Timestamp to read value that happened before then. + """ + + @abstractmethod + async def write(self, name: str, index: object, value: object, timestamp=None): + """ + Write value to mutable tensor. + + Parameters + ---------- + name: str + Name of the mutable tensor to write. + + index: + Index to write to the tensor. + + value: + The value that will be filled into the mutable tensor according to `index`. + + timestamp: optional + Timestamp to associated with the newly touched value. + """ diff --git a/python/xorbits/_mars/services/mutable/api/oscar.py b/python/xorbits/_mars/services/mutable/api/oscar.py new file mode 100644 index 000000000..1c5283d21 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/oscar.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple, Type, TypeVar, Union + +import numpy as np + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ..core import MutableTensorInfo +from ..supervisor import MutableObjectManagerActor, MutableTensorActor +from .core import AbstractMutableAPI + +APIType = TypeVar("APIType", bound="MutableAPI") + + +class MutableAPI(AbstractMutableAPI): + def __init__( + self, + address: str, + mutable_mananger: mo.ActorRefType[MutableObjectManagerActor], + ): + self._address = address + self._mutable_manager_ref = mutable_mananger + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "MutableAPI": + mutable_manager = await mo.actor_ref( + address, MutableObjectManagerActor.gen_uid(session_id) + ) + return MutableAPI(address, mutable_manager) + + @alru_cache(cache_exceptions=False) + async def _get_mutable_tensor_ref( + self, name: str + ) -> mo.ActorRefType[MutableTensorActor]: + return await self._mutable_manager_ref.get_mutable_tensor(name) + + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ) -> MutableTensorInfo: + actor_ref = await self._mutable_manager_ref.create_mutable_tensor( + name=name, + shape=shape, + dtype=dtype, + chunk_size=chunk_size, + default_value=default_value, + ) + return await actor_ref.info() + + @alru_cache(cache_exceptions=False) + async def get_mutable_tensor(self, name: str): + actor_ref = await self._mutable_manager_ref.get_mutable_tensor(name) + return await actor_ref.info() + + async def seal_mutable_tensor(self, name: str, timestamp=None): + # invalidate the `get_mutable_tensor` cache first. + self.get_mutable_tensor.invalidate() + return await self._mutable_manager_ref.seal_mutable_tensor( + name, timestamp=timestamp + ) + + async def read(self, name: str, index, timestamp=None): + tensor_ref = await self._get_mutable_tensor_ref(name) + return await tensor_ref.read(index, timestamp) + + async def write(self, name: str, index, value, timestamp=None): + tensor_ref = await self._get_mutable_tensor_ref(name) + return await tensor_ref.write(index, value, timestamp) + + +class MockMutableAPI(MutableAPI): + @classmethod + async def create(cls: Type[APIType], session_id: str, address: str) -> "MutableAPI": + mutable_managger = await mo.create_actor( + MutableObjectManagerActor, + session_id, + address=address, + uid=MutableObjectManagerActor.gen_uid(session_id), + ) + return MockMutableAPI(address, mutable_managger) + + @classmethod + async def cleanup(cls: Type[APIType], session_id: str, address: str): + await mo.destroy_actor( + await mo.actor_ref(address, MutableObjectManagerActor.gen_uid(session_id)) + ) diff --git a/python/xorbits/_mars/services/mutable/api/web.py b/python/xorbits/_mars/services/mutable/api/web.py new file mode 100644 index 000000000..abcb000a9 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/api/web.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Union + +import numpy as np + +from ....lib.aio import alru_cache +from ....utils import deserialize_serializable, serialize_serializable +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from .core import AbstractMutableAPI + + +class MutableWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P[^/]+)/mutable" + + @alru_cache(cache_exceptions=False) + async def _get_cluster_api(self): + from ...cluster import ClusterAPI + + return await ClusterAPI.create(self._supervisor_addr) + + @alru_cache(cache_exceptions=False) + async def _get_oscar_mutable_api(self, session_id: str): + from .oscar import MutableAPI + + cluster_api = await self._get_cluster_api() + [address] = await cluster_api.get_supervisors_by_keys([session_id]) + return await MutableAPI.create(session_id, address) + + @web_api("", method="post") + async def create_mutable_tensor(self, session_id: str): + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + shape = body_args.get("shape") + dtype = body_args.get("dtype") + name = body_args.get("name") + default_value = body_args.get("default_value") + chunk_size = body_args.get("chunk_size") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.create_mutable_tensor( + shape, dtype, name, default_value, chunk_size + ) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)", method="get") + async def get_mutable_tensor(self, session_id: str, name: str): + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.get_mutable_tensor(name) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)/seal", method="post") + async def seal_mutable_tensor(self, session_id: str, name: str): # pragma: no cover + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + timestamp = body_args.get("timestamp") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.seal_mutable_tensor(name, timestamp) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)/read", method="post") + async def read_mutable(self, session_id: str, name: str): # pragma: no cover + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + index = body_args.get("index") + timestamp = body_args.get("timestamp") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.read(name, index, timestamp) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)/write", method="post") + async def write_mutable(self, session_id: str, name: str): # pragma: no cover + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + index = body_args.get("index") + value = body_args.get("value") + timestamp = body_args.get("timestamp") + + oscar_api = await self._get_oscar_mutable_api(session_id) + res = await oscar_api.write(name, index, value, timestamp) + self.write(serialize_serializable(res)) + + +web_handlers = { + MutableWebAPIHandler.get_root_pattern(): MutableWebAPIHandler, +} + + +class WebMutableAPI(AbstractMutableAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = 
request_rewriter + + async def create_mutable_tensor( + self, + shape: tuple, + dtype: Union[np.dtype, str], + name: str = None, + default_value: Union[int, float] = 0, + chunk_size: Union[tuple, int] = None, + ): + path = f"{self._address}/api/session/{self._session_id}/mutable" + params = dict( + shape=shape, + dtype=dtype, + name=name, + default_value=default_value, + chunk_size=chunk_size, + ) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + ) + return deserialize_serializable(res.body) + + async def get_mutable_tensor(self, name: str): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}" + res = await self._request_url( + path=path, + method="GET", + headers={"Content-Type": "application/octet-stream"}, + ) + return deserialize_serializable(res.body) + + async def seal_mutable_tensor(self, name: str, timestamp=None): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}/seal" + params = dict(timestamp=timestamp) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + request_timeout=3600, + ) + return deserialize_serializable(res.body) + + async def read(self, name: str, index, timestamp=None): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}/read" + params = dict(index=index, timestamp=timestamp) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + request_timeout=3600, + ) + return deserialize_serializable(res.body) + + async def write(self, name: str, index, value, timestamp=None): + path = f"{self._address}/api/session/{self._session_id}/mutable/{name}/write" + params = dict(index=index, value=value, timestamp=timestamp) + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + data=body, + headers={"Content-Type": "application/octet-stream"}, + request_timeout=3600, + ) + return deserialize_serializable(res.body) diff --git a/python/xorbits/_mars/services/mutable/core.py b/python/xorbits/_mars/services/mutable/core.py new file mode 100644 index 000000000..43213332f --- /dev/null +++ b/python/xorbits/_mars/services/mutable/core.py @@ -0,0 +1,191 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + + +class MutableTensorInfo: + """ + Why `MutableTensorInfo`? + + We need a cluster to transfer meta information of mutable tensor, between + server and client, as over the HTTP web session. + + Thus we design an internal-only `MutableTensorInfo` type as a container + for those information. 
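+
+    On the web session this info object is exactly what crosses the wire: the
+    handlers in `api/web.py` serialize it with `serialize_serializable`, and the
+    client deserializes it before building a `MutableTensor` around it.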
+
+    A `MutableTensor` can be initialized from
+
+    - an info, which contains the metadata
+    - a `mutable_api`, which will be used to request the backend API
+    - a `loop`, which will be used to execute `__setitem__` (and `__getitem__`)
+      synchronously to make the API more user-friendly.
+    """
+
+    def __init__(self, shape, dtype, name, default_value):
+        self._shape = shape
+        self._dtype = dtype
+        self._name = name
+        self._default_value = default_value
+
+    @property
+    def shape(self):
+        return self._shape
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def default_value(self):
+        return self._default_value
+
+
+class MutableTensor:
+    def __init__(self, info, mutable_api, loop):
+        self._info = info
+        self._mutable_api = mutable_api
+        self._loop = loop
+
+    @classmethod
+    def create(
+        cls,
+        info: "MutableTensorInfo",
+        mutable_api,  # no type signature, to avoid cycle imports
+        loop: asyncio.AbstractEventLoop,
+    ) -> "MutableTensor":
+        return MutableTensor(info, mutable_api, loop)
+
+    @property
+    def shape(self):
+        """
+        Get the shape of the mutable tensor.
+
+        Returns
+        -------
+        Tuple
+        """
+        return self._info.shape
+
+    @property
+    def dtype(self):
+        """
+        Get the dtype of the mutable tensor.
+
+        Returns
+        -------
+        np.dtype or str
+        """
+        return self._info.dtype
+
+    @property
+    def name(self):
+        """
+        Get the name of the mutable tensor.
+
+        Returns
+        -------
+        str
+        """
+        return self._info.name
+
+    @property
+    def default_value(self):
+        """
+        Get the default value of the mutable tensor.
+
+        Returns
+        -------
+        int or float
+        """
+        return self._info.default_value
+
+    async def read(self, index, timestamp=None):
+        """
+        Read value from mutable tensor.
+
+        Parameters
+        ----------
+        index:
+            Index to read from the tensor.
+
+        timestamp: optional
+            Timestamp to read values written at or before it.
+        """
+        return await self._mutable_api.read(self.name, index, timestamp)
+
+    async def write(self, index, value, timestamp=None):
+        """
+        Write value to mutable tensor.
+
+        Parameters
+        ----------
+        index:
+            Index to write to the tensor.
+
+        value:
+            The value that will be filled into the mutable tensor according to `index`.
+
+        timestamp: optional
+            Timestamp to be associated with the newly written value.
+        """
+        return await self._mutable_api.write(self.name, index, value, timestamp)
+
+    def __getitem__(self, index):
+        """
+        Read value from mutable tensor with a synchronous API.
+
+        Parameters
+        ----------
+        index:
+            Index to read from the tensor.
+        """
+        coro = self.read(index)
+        fut = asyncio.run_coroutine_threadsafe(coro, self._loop)
+        return fut.result()
+
+    def __setitem__(self, index, value):
+        """
+        Write value to mutable tensor with a synchronous API.
+
+        Parameters
+        ----------
+        index:
+            Index to write to the tensor.
+
+        value:
+            The value that will be filled into the mutable tensor according to `index`.
+        """
+        coro = self.write(index, value)
+        fut = asyncio.run_coroutine_threadsafe(coro, self._loop)
+        return fut.result()
+
+    async def seal(self, timestamp=None):
+        """
+        Seal the mutable tensor.
+
+        Parameters
+        ----------
+        timestamp: optional
+            Operations that happened before timestamp will be sealed, and later ones will be discarded.
+ + Returns + ------- + object + """ + return await self._mutable_api.seal_mutable_tensor(self.name, timestamp) diff --git a/python/xorbits/_mars/services/mutable/supervisor/__init__.py b/python/xorbits/_mars/services/mutable/supervisor/__init__.py new file mode 100644 index 000000000..af3be8fff --- /dev/null +++ b/python/xorbits/_mars/services/mutable/supervisor/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import MutableObjectManagerActor, MutableTensorActor +from .service import MutableObjectManagerSupervisorService diff --git a/python/xorbits/_mars/services/mutable/supervisor/core.py b/python/xorbits/_mars/services/mutable/supervisor/core.py new file mode 100644 index 000000000..0cedbdeb3 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/supervisor/core.py @@ -0,0 +1,245 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np + +from .... 
import oscar as mo +from ....core import tile +from ....utils import build_fetch +from ...cluster import ClusterAPI +from ...core import NodeRole +from ...meta import MetaAPI +from ..core import MutableTensorInfo +from ..utils import ( + getitem_to_records, + normalize_name, + normalize_timestamp, + setitem_to_records, +) +from ..worker import MutableTensorChunkActor + + +class MutableObjectManagerActor(mo.Actor): + def __init__(self, session_id: str): + self._session_id = session_id + self._cluster_api: Optional[ClusterAPI] = None + + self._mutable_objects = dict() + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._mutable_objects.values()] + ) + + @classmethod + def gen_uid(cls, session_id: str): + return f"mutable-object-manager-{session_id}" + + async def create_mutable_tensor(self, *args, name: Optional[str] = None, **kwargs): + name = normalize_name(name) + if name in self._mutable_objects: + raise ValueError(f"Mutable tensor {name} already exists!") + + workers: List[str] = list( + await self._cluster_api.get_nodes_info(role=NodeRole.WORKER) + ) + + tensor_ref = await mo.create_actor( + MutableTensorActor, + self._session_id, + name, + workers, + *args, + **kwargs, + address=self.address, + uid=MutableTensorActor.gen_uid(self._session_id, name), + ) + self._mutable_objects[name] = tensor_ref + return tensor_ref + + async def get_mutable_tensor(self, name: str): + tensor_ref = self._mutable_objects.get(name, None) + if tensor_ref is None: + raise ValueError(f"Mutable tensor {name} doesn't exist!") + return tensor_ref + + async def seal_mutable_tensor(self, name: str, timestamp=None): + tensor_ref = self._mutable_objects.get(name, None) + if tensor_ref is None: + raise ValueError(f"Mutable tensor {name} doesn't exist!") + tensor = await tensor_ref.seal(timestamp) + await mo.destroy_actor(tensor_ref) + self._mutable_objects.pop(name) + return tensor + + +class MutableTensorActor(mo.Actor): + def __init__( + self, + session_id: str, + name: str, + workers: List[str], + shape: Tuple, + dtype: Union[np.dtype, str], + default_value: Union[int, float] = 0, + chunk_size: Union[int, Tuple] = None, + ): + self._session_id = session_id + self._name = name + self._workers = workers + self._shape = shape + self._dtype = dtype + self._default_value = default_value + self._chunk_size = chunk_size + + self._sealed = False + + self._fetch = None + self._chunk_actors = [] + # chunk to actor: {chunk index -> actor uid} + self._chunk_to_actor: Dict[ + Tuple, mo.ActorRefType[MutableTensorChunkActor] + ] = dict() + + async def __post_create__(self): + self._meta_api = await MetaAPI.create(self._session_id, self.address) + + # tiling a random tensor to generate keys, but we doesn't actually execute + # the random generator + from ....tensor.random import rand + + self._fetch = build_fetch( + tile(rand(*self._shape, dtype=self._dtype, chunk_size=self._chunk_size)) + ) + + chunk_groups = np.array_split(self._fetch.chunks, len(self._workers)) + for idx, (worker, chunks) in enumerate(zip(self._workers, chunk_groups)): + if len(chunks) == 0: + break + chunk_actor_ref = await mo.create_actor( + MutableTensorChunkActor, + self._session_id, + self.address, + list(chunks), + dtype=self._dtype, + default_value=self._default_value, + address=worker, + uid=MutableTensorChunkActor.gen_uid(self._session_id, self._name, idx), + ) + self._chunk_actors.append(chunk_actor_ref) 
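+            # Record which chunk actor owns each chunk index, so that later
+            # reads and writes can be routed directly to the worker holding
+            # that chunk.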
+ for chunk in chunks: + self._chunk_to_actor[chunk.index] = chunk_actor_ref + + async def __pre_destroy__(self): + await asyncio.gather(*[mo.destroy_actor(ref) for ref in self._chunk_actors]) + + @classmethod + def gen_uid(cls, session_id, name): + return f"mutable-tensor-{session_id}-{name}" + + async def info(self) -> "MutableTensorInfo": + return MutableTensorInfo( + self._shape, self._dtype, self._name, self._default_value + ) + + @mo.extensible + async def _read_chunk( + self, chunk_actor_ref, chunk_index, records, chunk_value_shape, timestamp + ): + return await chunk_actor_ref.read( + chunk_index, records, chunk_value_shape, timestamp + ) + + async def read(self, index, timestamp=None): + """ + Read value from mutable tensor. + + Parameters + ---------- + index: + Index to read from the tensor. + + timestamp: optional + Timestamp to read value that happened before then. + """ + timestamp = normalize_timestamp(timestamp) + records, output_shape = getitem_to_records(self._fetch, index) + + read_tasks, chunk_indices = [], [] + for chunk_index, (records, chunk_value_shape, indices) in records.items(): + chunk_actor_ref = self._chunk_to_actor[chunk_index] + read_tasks.append( + self._read_chunk.delay( + chunk_actor_ref, chunk_index, records, chunk_value_shape, timestamp + ) + ) + chunk_indices.append(indices) + chunks = await self._read_chunk.batch(*read_tasks) + result = np.full(output_shape, fill_value=self._default_value) + for chunk, indices in zip(chunks, chunk_indices): + result[indices] = chunk + return result + + @mo.extensible + async def _write_chunk(self, chunk_actor_ref, chunk_index, records): + await chunk_actor_ref.write(chunk_index, records) + + async def write(self, index, value, timestamp=None): + """ + Write value to mutable tensor. + + Parameters + ---------- + index: + Index to write to the tensor. + + value: + The value that will be filled into the mutable tensor according to `index`. + + timestamp: optional + Timestamp to associated with the newly touched value. + """ + timestamp = normalize_timestamp(timestamp) + records = setitem_to_records(self._fetch, index, value, timestamp) + + write_tasks = [] + for chunk_index, records in records.items(): + chunk_actor_ref = self._chunk_to_actor[chunk_index] + write_tasks.append( + self._write_chunk.delay(chunk_actor_ref, chunk_index, records) + ) + await self._write_chunk.batch(*write_tasks) + + @mo.extensible + async def _seal_chunk(self, chunk_actor_ref, timestamp): + await chunk_actor_ref.seal(timestamp) + + async def seal(self, timestamp=None): + if self._sealed: + return self._fetch + + timestamp = normalize_timestamp(timestamp) + self._sealed = True + seal_tasks = [] + for chunk_actor_ref in self._chunk_actors: + seal_tasks.append(self._seal_chunk.delay(chunk_actor_ref, timestamp)) + await self._seal_chunk.batch(*seal_tasks) + self._chunk_actors = [] + return self._fetch diff --git a/python/xorbits/_mars/services/mutable/supervisor/service.py b/python/xorbits/_mars/services/mutable/supervisor/service.py new file mode 100644 index 000000000..9e0302347 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/supervisor/service.py @@ -0,0 +1,40 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import MutableObjectManagerActor + + +class MutableObjectManagerSupervisorService(AbstractService): + async def start(self): + pass + + async def stop(self): + pass + + async def create_session(self, session_id: str): + await mo.create_actor( + MutableObjectManagerActor, + session_id, + address=self._address, + uid=MutableObjectManagerActor.gen_uid(session_id), + ) + + async def destroy_session(self, session_id: str): + await mo.destroy_actor( + mo.create_actor_ref( + uid=MutableObjectManagerActor.gen_uid(session_id), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/mutable/tests/__init__.py b/python/xorbits/_mars/services/mutable/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/mutable/tests/test_mutable.py b/python/xorbits/_mars/services/mutable/tests/test_mutable.py new file mode 100644 index 000000000..87fc8d510 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/tests/test_mutable.py @@ -0,0 +1,247 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
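+
+# An illustrative sketch of the flow exercised below (shapes and chunk sizes
+# are made up for brevity; inside an async test function):
+#
+#     tensor = session.create_mutable_tensor(
+#         shape=(4, 4), dtype=np.int64, default_value=0, chunk_size=2
+#     )
+#     tensor[0, 0] = 1                 # synchronous write via __setitem__
+#     block = tensor[:]                # synchronous read via __getitem__
+#     sealed = await tensor.seal()     # freeze into an ordinary tensor
+#     session.execute(sealed)
+#     value = session.fetch(sealed)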
+ +import asyncio +import sys +import uuid + +import numpy as np +import pytest + +from ....deploy.oscar.local import new_cluster +from ....deploy.oscar.session import AsyncSession, SyncSession +from ..core import MutableTensor +from ..utils import normalize_timestamp + +_is_windows = sys.platform.lower().startswith("win") + + +@pytest.fixture +async def create_cluster(): + client = await new_cluster(n_worker=2, n_cpu=2, web=True) + async with client: + yield client + + +@pytest.mark.skipif(_is_windows, reason="FIXME") +@pytest.mark.parametrize( + "session_type", + ["async_session", "async_web_session", "sync_session", "sync_web_session"], +) +@pytest.mark.asyncio +async def test_mutable_tensor(create_cluster, session_type): + is_web = "web" in session_type + is_async = "async" in session_type + + if is_web: + session_id = str(uuid.uuid4()) + session = await AsyncSession.init(create_cluster.web_address, session_id) + else: + session = create_cluster.session + if not is_async: + session = SyncSession.from_isolated_session(session) + + tensor_useless: MutableTensor = session.create_mutable_tensor( # noqa: F841 + shape=(10, 30, 50), dtype=np.int64, default_value=100, chunk_size=(20, 20, 20) + ) + if is_async: + tensor_useless = await tensor_useless + + tensor: MutableTensor = session.create_mutable_tensor( + shape=(10, 30, 50), + dtype=np.int64, + name="mytensor", + default_value=100, + chunk_size=(20, 20, 20), + ) + if is_async: + tensor = await tensor + + assert tensor.shape == (10, 30, 50) + assert tensor.dtype == np.int64 + assert tensor.name == "mytensor" + assert tensor.default_value == 100 + + assert tensor_useless.name != tensor.name + + # non exists + with pytest.raises(ValueError): + tensor1 = session.get_mutable_tensor("notensor") + if is_async: + tensor1 = await tensor1 + + # create with duplicate name + with pytest.raises(ValueError): + tensor2 = session.create_mutable_tensor( + shape=(10, 30, 50), + dtype=np.int64, + name="mytensor", + default_value=100, + chunk_size=(20, 20, 20), + ) + if is_async: + tensor2 = await tensor2 + + tensor3: MutableTensor = session.get_mutable_tensor("mytensor") + if is_async: + tensor3 = await tensor3 + assert tensor3.shape == (10, 30, 50) + assert tensor3.dtype == np.int64 + assert tensor3.name == "mytensor" + assert tensor3.default_value == 100 + + # test using read/write + + expected = np.full((10, 30, 50), fill_value=100) + xs = await tensor3.read(slice(None, None, None)) + np.testing.assert_array_equal(expected, xs) + + await tensor.write(slice(None, None, None), 1) + expected[:] = 1 + xs = await tensor3.read(slice(None, None, None)) + np.testing.assert_array_equal(expected, xs) + + await tensor.write((9, 2, 3), 2) + expected[9, 2, 3] = 2 + xs = await tensor3.read((9, 2, 3)) + assert expected[9, 2, 3] == xs + + await tensor.write((slice(2, 9, 3), slice(5, 15, None), slice(8, 50, 9)), 4) + expected[2:9:3, 5:15, 8:50:9] = 4 + xs = await tensor3.read(slice(None, None, None)) + np.testing.assert_array_equal(expected, xs) + + # test using __getitem__/__setitem__ + + # reset + tensor[:] = 100 + + expected = np.full((10, 30, 50), fill_value=100) + xs = tensor3[:] + np.testing.assert_array_equal(expected, xs) + + tensor[:] = 1 + expected[:] = 1 + xs = tensor3[:] + np.testing.assert_array_equal(expected, xs) + + tensor[9, 2, 3] = 2 + expected[9, 2, 3] = 2 + xs = tensor3[9, 2, 3] + assert expected[9, 2, 3] == xs + + tensor[2:19:3, 5:15, 8:50:9] = 4 + expected[2:19:3, 5:15, 8:50:9] = 4 + xs = tensor3[:] + np.testing.assert_array_equal(expected, xs) + + # 
seal + + if is_async: + sealed = await tensor.seal() + info = await session.execute(sealed) + await info + value = await session.fetch(sealed) + else: + sealed = await tensor.seal() + session.execute(sealed) + value = session.fetch(sealed) + np.testing.assert_array_equal(expected, value) + + # non exists after sealed + with pytest.raises(ValueError): + await tensor.seal() + with pytest.raises(ValueError): + await tensor3.seal() + + # TODO: real fancy index not supported yet, as `TensorConcatenate` involved + # + # await tensor.write(([11, 2, 3, 50], [14, 5, 6, 50], [17, 8, 9, 50]), 3) + # expected[[11, 2, 3, 50], [14, 5, 6, 50], [17, 8, 9, 50]] = 3 + # xs = await tensor1[:] + # np.testing.assert_array_equal(expected, xs) + + +@pytest.mark.skipif(_is_windows, reason="FIXME") +@pytest.mark.parametrize( + "session_type", + ["async_session", "async_web_session", "sync_session", "sync_web_session"], +) +@pytest.mark.asyncio +async def test_mutable_tensor_timestamp(create_cluster, session_type): + is_web = "web" in session_type + is_async = "async" in session_type + + if is_web: + session_id = str(uuid.uuid4()) + session = await AsyncSession.init(create_cluster.web_address, session_id) + else: + session = create_cluster.session + if not is_async: + session = SyncSession.from_isolated_session(session) + + tensor: MutableTensor = session.create_mutable_tensor( + shape=(2, 4), dtype=np.int64, default_value=0, chunk_size=(1, 3) + ) + if is_async: + tensor = await tensor + + assert tensor.shape == (2, 4) + assert tensor.dtype == np.int64 + assert tensor.default_value == 0 + + t0 = normalize_timestamp() + await asyncio.sleep(5) + t1 = normalize_timestamp() + + # write with earlier timestamp + await tensor.write((slice(0, 2, 1), slice(0, 2, 1)), 1, timestamp=t1) + + # read staled value + actual = await tensor.read(slice(None, None, None), t0) + expected = np.array([[0, 0, 0, 0], [0, 0, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # read current value + actual = await tensor.read(slice(None, None, None), t1) + expected = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # read new value + t2 = normalize_timestamp() + actual = await tensor.read(slice(None, None, None), t2) + expected = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # read latest value + actual = await tensor.read(slice(None, None, None)) + expected = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # seal on staled value + if is_async: + sealed = await tensor.seal(timestamp=t0) + info = await session.execute(sealed) + await info + actual = await session.fetch(sealed) + else: + sealed = await tensor.seal(timestamp=t0) + session.execute(sealed) + actual = session.fetch(sealed) + expected = np.array([[0, 0, 0, 0], [0, 0, 0, 0]]) + np.testing.assert_array_equal(expected, actual) + + # non exists after sealed + with pytest.raises(ValueError): + await tensor.seal() diff --git a/python/xorbits/_mars/services/mutable/utils.py b/python/xorbits/_mars/services/mutable/utils.py new file mode 100644 index 000000000..2be686205 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/utils.py @@ -0,0 +1,219 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import uuid +from datetime import datetime +from numbers import Integral +from typing import Optional + +import numpy as np + +from ...core import tile + + +def indexing_to_chunk_indices(output_chunk): + """ + Compute input_indices and value_indices when read from or write to + a tensor chunk. + + Parameters + ---------- + output_chunk: + A chunk in the output of the `__setitem__` op. + + Returns + ------- + The indices in the input chunk, and value_indices in the value block + that will be assigned. + """ + input_indices = [] # index in the chunk of the mutable tensor + value_indices = [] # index in the chunk of the assigned value + for d, s in zip(output_chunk.op.indexes, output_chunk.op.inputs[0].shape): + # expand the index (slice) + idx = np.r_[slice(*d.indices(s)) if isinstance(d, slice) else d] + input_indices.append(idx) + if not isinstance(d, Integral): + value_indices.append(np.arange(len(idx))) + return input_indices, value_indices + + +def compute_output_of_indexing(tensor, tensor_index): + """ + Compute the output information of `__{set,get}item__` on tensor for every chunk. + """ + from ...tensor.indexing.core import calc_shape, process_index + from ...tensor.indexing.getitem import TensorIndex + + tensor_index = process_index(tensor.ndim, tensor_index) + output_shape = calc_shape(tensor.shape, tensor_index) + + index_tensor_op = TensorIndex( + dtype=tensor.dtype, sparse=False, indexes=list(tensor_index) + ) + index_tensor = tile(index_tensor_op.new_tensor([tensor], shape=tuple(output_shape))) + output_chunks = index_tensor.chunks + + nsplits_acc = [ + np.cumsum( + (0,) + + tuple( + c.shape[i] + for c in output_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + ) + for i in range(len(output_chunks[0].shape)) + ] + return output_shape, output_chunks, nsplits_acc + + +def setitem_on_chunk_to_records(nsplits_acc, output_chunk, value, ts, is_scalar): + """ + Turns a `__setitem__` on chunk to a list of index-value records. + + Parameters + ---------- + nsplits_acc: + Accumulate nsplits arrays of the output tensor chunks. + + Returns + ------- + A list of `(index, value, timestamp)`, where `index` is the in-chunk index. + """ + input_indices, value_indices = indexing_to_chunk_indices(output_chunk) + + # normalize assigned value + if is_scalar: + chunk_value = value + else: + chunk_value_slice = tuple( + slice( + nsplits_acc[i][output_chunk.index[i]], + nsplits_acc[i][output_chunk.index[i] + 1], + ) + for i in range(len(output_chunk.index)) + ) + chunk_value = value[chunk_value_slice] + + records = [] + for chunk_idx, value_idx in zip( + itertools.product(*input_indices), itertools.product(*value_indices) + ): + new_value = chunk_value if is_scalar else chunk_value[value_idx] + index_in_chunk = np.ravel_multi_index( + chunk_idx, output_chunk.op.inputs[0].shape + ) + records.append((index_in_chunk, new_value, ts)) + return records + + +def setitem_to_records(tensor, tensor_index, value, timestamp): + """ + Compute the records of `__setitem__` on tensor for every chunk. 
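+
+    The assignment is tiled like a regular indexing operation; every output
+    chunk is then translated into `(flat in-chunk index, value, timestamp)`
+    records, grouped by the index of the mutable tensor chunk they fall into.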
+ + Returns + ------- + dict, a dict of chunk index to records in that chunk. + """ + output_shape, output_chunks, nsplits_acc = compute_output_of_indexing( + tensor, tensor_index + ) + + is_scalar = ( + np.isscalar(value) + or isinstance(value, tuple) + and tensor.dtype.fields is not None + ) + if not is_scalar: + value = np.broadcast_to(value, output_shape).astype(tensor.dtype) + + records = dict() + for output_chunk in output_chunks: + records_in_chunk = setitem_on_chunk_to_records( + nsplits_acc, output_chunk, value, timestamp, is_scalar=is_scalar + ) + records[output_chunk.op.inputs[0].index] = records_in_chunk + return records + + +def getitem_on_chunk_to_records(nsplits_acc, output_chunk): + """ + Turns a `__getitem__` on chunk to a list of index-value records. + + Parameters + ---------- + nsplits_acc: + Accumulate nsplits arrays of the output tensor chunks. + + Returns + ------- + records: A list of `(index, value_index)`, where `index` is the in-chunk index, and + `value_index` is the index in the final result block. + chunk_value_shape: shape of result of this chunk. + chunk_value_slice: index of result of this chunk in the whole result tensor. + """ + input_indices, value_indices = indexing_to_chunk_indices(output_chunk) + + chunk_value_slice = tuple( + slice( + nsplits_acc[i][output_chunk.index[i]], + nsplits_acc[i][output_chunk.index[i] + 1], + ) + for i in range(len(output_chunk.index)) + ) + + records = [] + for chunk_idx, value_idx in zip( + itertools.product(*input_indices), itertools.product(*value_indices) + ): + index_in_chunk = np.ravel_multi_index( + chunk_idx, output_chunk.op.inputs[0].shape + ) + records.append((index_in_chunk, value_idx)) + return records, output_chunk.shape, chunk_value_slice + + +def getitem_to_records(tensor, tensor_index): + """ + Compute the records of `__getitem__` on tensor for every chunk. + + Returns + ------- + records and output_chunk dict, records is a dict of chunk index to records + in that chunk. + """ + output_shape, output_chunks, nsplits_acc = compute_output_of_indexing( + tensor, tensor_index + ) + + records = dict() + for output_chunk in output_chunks: + records_in_chunk = getitem_on_chunk_to_records(nsplits_acc, output_chunk) + records[output_chunk.op.inputs[0].index] = records_in_chunk + return records, output_shape + + +def normalize_timestamp(timestamp=None): + if timestamp is None: + timestamp = np.datetime64(datetime.now()) + if isinstance(timestamp, datetime): + timestamp = np.datetime64(timestamp) + return timestamp + + +def normalize_name(name: Optional[str] = None): + if not name: + return str(uuid.uuid4()) + return name diff --git a/python/xorbits/_mars/services/mutable/worker/__init__.py b/python/xorbits/_mars/services/mutable/worker/__init__.py new file mode 100644 index 000000000..e6334983a --- /dev/null +++ b/python/xorbits/_mars/services/mutable/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .core import MutableTensorChunkActor diff --git a/python/xorbits/_mars/services/mutable/worker/core.py b/python/xorbits/_mars/services/mutable/worker/core.py new file mode 100644 index 000000000..926747a68 --- /dev/null +++ b/python/xorbits/_mars/services/mutable/worker/core.py @@ -0,0 +1,142 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bisect +import sys +from collections import defaultdict +from typing import List, Union + +import numpy as np + +from .... import oscar as mo +from ....typing import ChunkType + + +class MutableTensorChunkActor(mo.Actor): + def __init__( + self, + session_id: str, + manager_address: str, + chunks: List, + dtype: Union[np.dtype, str], + default_value: Union[int, float] = 0, + ) -> None: + self._session_id = session_id + self._manager_address = manager_address + self._chunks = chunks + self._dtype = dtype + self._default_value = default_value + + self._storage_api = None + self._meta_api = None + + self._index_to_chunk = None + + @classmethod + def gen_uid(cls, session_id: str, name: str, index: int): + return f"mutable-tensor-chunk-{session_id}-{name}-{index}" + + async def __post_create__(self): + from ...meta import MetaAPI + from ...storage import StorageAPI + + self._storage_api = await StorageAPI.create(self._session_id, self.address) + self._meta_api = await MetaAPI.create(self._session_id, self._manager_address) + + self._index_to_chunk = { + chunk.index: MutableTensorChunk( + chunk, + self._manager_address, + self.address, + default_value=self._default_value, + ) + for chunk in self._chunks + } + + async def write(self, chunk_index, records): + chunk: MutableTensorChunk = self._index_to_chunk[chunk_index] + await chunk.write(records) + + async def read(self, chunk_index, records, chunk_value_shape, timestamp): + chunk: MutableTensorChunk = self._index_to_chunk[chunk_index] + return await chunk.read(records, chunk_value_shape, timestamp) + + async def seal(self, timestamp): + for _, chunk in self._index_to_chunk.items(): + chunk_data = await chunk.seal(timestamp) + await self._storage_api.put(chunk.chunk.key, chunk_data) + await self._meta_api.set_chunk_meta( + chunk.chunk, bands=[(self.address, "numa-0")] + ) + + +class MutableTensorChunk: + def __init__( + self, + chunk: ChunkType, + manager_address: str, + worker_address: str, + default_value: Union[int, float] = 0, + ) -> None: + self._chunk = chunk + self._manager_address = manager_address + self._worker_address = worker_address + self._default_value = default_value + + self._records = defaultdict(list) + + @property + def chunk(self): + return self._chunk + + async def write(self, records): + for flat_index, value, ts in records: + self._records[flat_index].append((ts, value)) + + async def read(self, records, chunk_value_shape, timestamp): + result = np.full(shape=chunk_value_shape, fill_value=self._default_value) + for flat_index, value_index in records: + if flat_index not in self._records: + continue + # Find the newest one. 
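+            # Each entry of `self._records[flat_index]` is a (timestamp, value)
+            # pair, so sorting and bisecting on the timestamp yields the last
+            # value written at or before the requested timestamp.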
+ # + # FIXME Python doesn't have things like SortedDict or SortedList, + # we trigger a `sorted` here to ensure the correct semantic and try + # to be as efficient as possible. + self._records[flat_index].sort() + # bitsect will compare on first element in the tuple. + index = bisect.bisect_right( + self._records[flat_index], (timestamp, sys.float_info.max) + ) + if index == 0: + continue + result[value_index] = self._records[flat_index][index - 1][ + 1 + ] # take the value + return result + + async def seal(self, timestamp): + result = np.full(self._chunk.shape, self._default_value) + for flat_index, values in self._records.items(): + if flat_index not in self._records: + continue + # compute value + values.sort() + index = bisect.bisect_right(values, (timestamp, sys.float_info.max)) + if index == 0: + continue + # compute value index + value_index = np.unravel_index(flat_index, self._chunk.shape) + result[value_index] = values[index - 1][1] # take the value + return result diff --git a/python/xorbits/_mars/services/scheduling/__init__.py b/python/xorbits/_mars/services/scheduling/__init__.py new file mode 100644 index 000000000..679cdc435 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MockSchedulingAPI, SchedulingAPI diff --git a/python/xorbits/_mars/services/scheduling/api/__init__.py b/python/xorbits/_mars/services/scheduling/api/__init__.py new file mode 100644 index 000000000..da695e1bf --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .oscar import MockSchedulingAPI, SchedulingAPI diff --git a/python/xorbits/_mars/services/scheduling/api/core.py b/python/xorbits/_mars/services/scheduling/api/core.py new file mode 100644 index 000000000..41773db67 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/core.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import List, Optional + +from ..core import SubtaskScheduleSummary + + +class AbstractSchedulingAPI(ABC): + @abstractmethod + def get_subtask_schedule_summaries( + self, task_id: Optional[str] = None + ) -> List[SubtaskScheduleSummary]: + """ + Get details of scheduling for tasks + + Parameters + ---------- + task_id + + Returns + ------- + details + List of details for subtasks + """ diff --git a/python/xorbits/_mars/services/scheduling/api/oscar.py b/python/xorbits/_mars/services/scheduling/api/oscar.py new file mode 100644 index 000000000..96e3ea43f --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/oscar.py @@ -0,0 +1,202 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Type, TypeVar, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ...subtask import Subtask +from ..core import SubtaskScheduleSummary +from .core import AbstractSchedulingAPI + +APIType = TypeVar("APIType", bound="SchedulingAPI") + + +class SchedulingAPI(AbstractSchedulingAPI): + def __init__( + self, + session_id: str, + address: str, + manager_ref=None, + queueing_ref=None, + autoscaler_ref=None, + ): + self._session_id = session_id + self._address = address + + self._manager_ref = manager_ref + self._queueing_ref = queueing_ref + self._autoscaler = autoscaler_ref + + @classmethod + @alru_cache + async def create(cls: Type[APIType], session_id: str, address: str) -> APIType: + from ..supervisor.manager import SubtaskManagerActor + + manager_ref = await mo.actor_ref( + SubtaskManagerActor.gen_uid(session_id), address=address + ) + from ..supervisor.queueing import SubtaskQueueingActor + + queueing_ref = await mo.actor_ref( + SubtaskQueueingActor.gen_uid(session_id), address=address + ) + + from ...cluster import ClusterAPI + from ..supervisor.autoscale import AutoscalerActor + + cluster_api = await ClusterAPI.create(address) + [autoscaler] = await cluster_api.get_supervisor_refs( + [AutoscalerActor.default_uid()] + ) + scheduling_api = SchedulingAPI( + session_id, address, manager_ref, queueing_ref, autoscaler + ) + return scheduling_api + + async def get_subtask_schedule_summaries( + self, task_id: Optional[str] = None + ) -> List[SubtaskScheduleSummary]: + return await self._manager_ref.get_schedule_summaries(task_id) + + async def add_subtasks( + self, subtasks: List[Subtask], priorities: Optional[List[Tuple]] = None + ): + """ + Submit subtasks into scheduling service + + Parameters + ---------- + subtasks + list of subtasks to be submitted to service + priorities + list of priorities of subtasks + """ + if priorities is None: + priorities = [subtask.priority or tuple() for subtask in subtasks] + await self._manager_ref.add_subtasks(subtasks, priorities) + + @mo.extensible + async def update_subtask_priority(self, subtask_id: str, priority: 
Tuple): + """ + Update priorities of subtasks + + Parameters + ---------- + subtask_id + id of subtask to update priority + priority + list of priority of subtasks + """ + raise NotImplementedError + + @update_subtask_priority.batch + async def update_subtask_priority(self, args_list, kwargs_list): + await self._queueing_ref.update_subtask_priority.batch( + *( + self._queueing_ref.update_subtask_priority.delay(*args, **kwargs) + for args, kwargs in zip(args_list, kwargs_list) + ) + ) + + async def cancel_subtasks( + self, subtask_ids: List[str], kill_timeout: Union[float, int] = None + ): + """ + Cancel pending and running subtasks. + + Parameters + ---------- + subtask_ids + ids of subtasks to cancel + kill_timeout + timeout seconds to kill actor process forcibly + """ + await self._manager_ref.cancel_subtasks(subtask_ids, kill_timeout=kill_timeout) + + async def finish_subtasks( + self, + subtask_ids: List[str], + bands: List[Tuple] = None, + schedule_next: bool = True, + ): + """ + Mark subtasks as finished, letting scheduling service to schedule + next tasks in the ready queue + + Parameters + ---------- + subtask_ids + ids of subtasks to mark as finished + bands + bands of subtasks to mark as finished + schedule_next + whether to schedule succeeding subtasks + """ + await self._manager_ref.finish_subtasks(subtask_ids, bands, schedule_next) + + async def disable_autoscale_in(self): + """Disable autoscale in""" + await self._autoscaler.disable_autoscale_in() + + async def try_enable_autoscale_in(self): + """Try to enable autoscale in, the autoscale-in will be enabled only when last call corresponding + `disable_autoscale_in` has been invoked.""" + await self._autoscaler.try_enable_autoscale_in() + + +class MockSchedulingAPI(SchedulingAPI): + @classmethod + async def create(cls: Type[APIType], session_id: str, address: str) -> APIType: + from ..supervisor import AutoscalerActor, GlobalResourceManagerActor + + await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + AutoscalerActor, {}, uid=AutoscalerActor.default_uid(), address=address + ) + + from .... import resource as mars_resource + from ..worker import ( + SubtaskExecutionActor, + WorkerQuotaManagerActor, + WorkerSlotManagerActor, + ) + + await mo.create_actor( + SubtaskExecutionActor, + subtask_max_retries=0, + uid=SubtaskExecutionActor.default_uid(), + address=address, + ) + await mo.create_actor( + WorkerSlotManagerActor, + uid=WorkerSlotManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + WorkerQuotaManagerActor, + {"quota_size": mars_resource.virtual_memory().total}, + uid=WorkerQuotaManagerActor.default_uid(), + address=address, + ) + + from ..supervisor import SchedulingSupervisorService + + service = SchedulingSupervisorService({}, address) + await service.create_session(session_id) + return await super().create(session_id, address) diff --git a/python/xorbits/_mars/services/scheduling/api/web.py b/python/xorbits/_mars/services/scheduling/api/web.py new file mode 100644 index 000000000..725fc99a8 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/api/web.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Callable, List, Optional + +from ....lib.aio import alru_cache +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from ..core import SubtaskScheduleSummary +from .core import AbstractSchedulingAPI + + +class SchedulingWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P[^/]+)/scheduling" + + @alru_cache(cache_exceptions=False) + async def _get_cluster_api(self): + from ...cluster import ClusterAPI + + return await ClusterAPI.create(self._supervisor_addr) + + @alru_cache(cache_exceptions=False) + async def _get_oscar_scheduling_api(self, session_id: str): + from ..api import SchedulingAPI + + cluster_api = await self._get_cluster_api() + [address] = await cluster_api.get_supervisors_by_keys([session_id]) + return await SchedulingAPI.create(session_id, address) + + @web_api("subtasks", method="get") + async def get_subtask_schedule_summaries(self, session_id: str): + oscar_api = await self._get_oscar_scheduling_api(session_id) + task_id = self.get_argument("task_id", None) or None + + result = await oscar_api.get_subtask_schedule_summaries(task_id) + self.write( + json.dumps( + { + summary.subtask_id: { + "task_id": summary.task_id, + "subtask_id": summary.subtask_id, + "bands": [ + { + "endpoint": band[0], + "band_name": band[1], + } + for band in summary.bands + ], + "num_reschedules": summary.num_reschedules, + "is_finished": summary.is_finished, + "is_cancelled": summary.is_cancelled, + } + for summary in result + } + ) + ) + + +web_handlers = {SchedulingWebAPIHandler.get_root_pattern(): SchedulingWebAPIHandler} + + +class WebSchedulingAPI(AbstractSchedulingAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def get_subtask_schedule_summaries( + self, task_id: Optional[str] = None + ) -> List[SubtaskScheduleSummary]: + task_id = task_id or "" + path = ( + f"{self._address}/api/session/{self._session_id}/scheduling/subtasks" + f"?task_id={task_id}" + ) + + res = await self._request_url("GET", path) + res_json = json.loads(res.body) + + return [ + SubtaskScheduleSummary( + task_id=summary_json["task_id"], + subtask_id=summary_json["subtask_id"], + bands=[ + (band_json["endpoint"], band_json["band_name"]) + for band_json in summary_json["bands"] + ], + num_reschedules=summary_json["num_reschedules"], + is_finished=summary_json["is_finished"], + is_cancelled=summary_json["is_cancelled"], + ) + for summary_json in res_json.values() + ] diff --git a/python/xorbits/_mars/services/scheduling/core.py b/python/xorbits/_mars/services/scheduling/core.py new file mode 100644 index 000000000..7fd47fda0 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/core.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from ...serialization.serializables import ( + BoolField, + FieldTypes, + Int32Field, + ListField, + Serializable, + StringField, +) +from ...typing import BandType + + +class SubtaskScheduleSummary(Serializable): + task_id: str = StringField("task_id") + subtask_id: str = StringField("subtask_id") + bands: List[BandType] = ListField("bands", FieldTypes.tuple(FieldTypes.string)) + is_finished: bool = BoolField("is_finished", default=False) + is_cancelled: bool = BoolField("is_cancelled", default=False) + num_reschedules: int = Int32Field("num_reschedules", default=0) diff --git a/python/xorbits/_mars/services/scheduling/errors.py b/python/xorbits/_mars/services/scheduling/errors.py new file mode 100644 index 000000000..c06337481 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/errors.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError + + +class NoMatchingSlots(MarsError): + def __init__(self, slot_prefix): + self.slot_prefix = slot_prefix + + def __str__(self): + return str(self.slot_prefix) + + +class NoAvailableBand(MarsError): + pass diff --git a/python/xorbits/_mars/services/scheduling/supervisor/__init__.py b/python/xorbits/_mars/services/scheduling/supervisor/__init__.py new file mode 100644 index 000000000..aac494d7e --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .assigner import AssignerActor +from .autoscale import AutoscalerActor +from .globalresource import GlobalResourceManagerActor +from .manager import SubtaskManagerActor +from .queueing import SubtaskQueueingActor +from .service import SchedulingSupervisorService diff --git a/python/xorbits/_mars/services/scheduling/supervisor/assigner.py b/python/xorbits/_mars/services/scheduling/supervisor/assigner.py new file mode 100644 index 000000000..d5d004d01 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/assigner.py @@ -0,0 +1,292 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +from collections import defaultdict +from typing import Dict, List, Set + +import numpy as np + +from .... import oscar as mo +from ....core.operand import Fetch, FetchShuffle +from ....typing import BandType +from ...core import NodeRole +from ...subtask import Subtask +from ..errors import NoAvailableBand, NoMatchingSlots + + +class AssignerActor(mo.Actor): + _bands: List[BandType] + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_assigner" + + def __init__(self, session_id: str): + self._session_id = session_id + self._slots_ref = None + + self._cluster_api = None + self._meta_api = None + + self._bands = [] + self._address_to_bands = dict() + self._device_type_to_bands = dict() + self._band_watch_task = None + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + from ...meta.api import MetaAPI + + self._cluster_api = await ClusterAPI.create(self.address) + self._meta_api = await MetaAPI.create( + session_id=self._session_id, address=self.address + ) + + async def watch_bands(): + async for bands in self._cluster_api.watch_all_bands(NodeRole.WORKER): + self._update_bands(list(bands)) + + self._band_watch_task = asyncio.create_task(watch_bands()) + + async def __pre_destroy__(self): + if self._band_watch_task is not None: # pragma: no branch + self._band_watch_task.cancel() + + def _update_bands(self, bands: List[BandType]): + self._bands = bands + + grouped_bands = itertools.groupby(sorted(self._bands), key=lambda b: b[0]) + self._address_to_bands = {k: list(v) for k, v in grouped_bands} + + grouped_bands = itertools.groupby( + sorted(("numa" if b[1].startswith("numa") else "gpu", b) for b in bands), + key=lambda tp: tp[0], + ) + self._device_type_to_bands = { + k: [v[1] for v in tps] for k, tps in grouped_bands + } + + def _get_device_bands(self, is_gpu: bool): + band_prefix = "numa" if not is_gpu else "gpu" + filtered_bands = self._device_type_to_bands.get(band_prefix) or [] + if not filtered_bands: + raise NoMatchingSlots("gpu" if is_gpu else "cpu") + return filtered_bands + + def _get_random_band( + self, + is_gpu: bool, + exclude_bands: Set[BandType] = None, + random_when_unavailable: bool = True, + ): + bands = self._get_device_bands(is_gpu) + if exclude_bands: + avail_bands = [band for band in bands if band not in exclude_bands] + if avail_bands: + return 
avail_bands[np.random.choice(len(avail_bands))] + elif not random_when_unavailable: + raise NoAvailableBand( + f"No bands available after excluding bands {exclude_bands}" + ) + return bands[np.random.choice(len(bands))] + + async def assign_subtasks( + self, + subtasks: List[Subtask], + exclude_bands: Set[BandType] = None, + random_when_unavailable: bool = True, + ): + exclude_bands = exclude_bands or set() + inp_keys = set() + broadcaster_keys = set() + selected_bands = dict() + + if not self._bands: + self._update_bands( + list(await self._cluster_api.get_all_bands(NodeRole.WORKER)) + ) + + for subtask in subtasks: + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + if subtask.expect_bands: + # exclude expected but unready bands + expect_available_bands = [ + expect_band + for expect_band in subtask.expect_bands + if expect_band in self._bands and expect_band not in exclude_bands + ] + # fill in if all expected bands are unready + if not expect_available_bands: + expect_available_bands = [ + self._get_random_band( + is_gpu, exclude_bands, random_when_unavailable + ) + ] + selected_bands[subtask.subtask_id] = expect_available_bands + continue + for indep_chunk in subtask.chunk_graph.iter_indep(): + if isinstance(indep_chunk.op, Fetch): + if indep_chunk.is_broadcaster: + broadcaster_keys.add(indep_chunk.key) + inp_keys.add(indep_chunk.key) + elif isinstance(indep_chunk.op, FetchShuffle): + selected_bands[subtask.subtask_id] = [ + self._get_random_band( + is_gpu, exclude_bands, random_when_unavailable + ) + ] + break + + fields = ["store_size", "bands"] + inp_keys = list(inp_keys) + metas = await self._meta_api.get_chunk_meta.batch( + *(self._meta_api.get_chunk_meta.delay(key, fields) for key in inp_keys) + ) + + inp_metas = dict(zip(inp_keys, metas)) + if broadcaster_keys: + # set broadcaster's size as 0 to avoid assigning all successors to same band. 
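+            # A broadcast chunk feeds many (or all) successor subtasks, so counting
+            # its size in the per-band input totals below would pull every successor
+            # onto the single band that stores it.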
+ for key in broadcaster_keys: + inp_metas[key]["store_size"] = 0 + assigns = [] + for subtask in subtasks: + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + band_prefix = "numa" if not is_gpu else "gpu" + filtered_bands = self._get_device_bands(is_gpu) + + if subtask.subtask_id in selected_bands: + bands = selected_bands[subtask.subtask_id] + else: + band_sizes = defaultdict(lambda: 0) + for inp in subtask.chunk_graph.iter_indep(): + if not isinstance(inp.op, Fetch): # pragma: no cover + continue + meta = inp_metas[inp.key] + for band in meta["bands"]: + if not band[1].startswith(band_prefix): + sel_bands = [ + b + for b in self._address_to_bands[band[0]] + if b[1].startswith(band_prefix) + and b not in exclude_bands + ] + if sel_bands: + band = sel_bands[np.random.choice(len(sel_bands))] + if band not in filtered_bands or band in exclude_bands: + band = self._get_random_band( + is_gpu, exclude_bands, random_when_unavailable + ) + band_sizes[band] += meta["store_size"] + bands = [] + max_size = -1 + for band, size in band_sizes.items(): + if size > max_size: + bands = [band] + max_size = size + elif size == max_size: + bands.append(band) + band = bands[np.random.choice(len(bands))] + if ( + not random_when_unavailable and band in exclude_bands + ): # pragma: no cover + raise NoAvailableBand( + f"No bands available for subtask {subtask.subtask_id} after " + f"excluded {exclude_bands}" + ) + if subtask.bands_specified and band not in subtask.expect_bands: + raise NoAvailableBand( + f"No bands available for subtask {subtask.subtask_id} on bands {subtask.expect_bands} " + f"after excluded {exclude_bands}" + ) + assigns.append(band) + return assigns + + async def reassign_subtasks( + self, band_to_queued_num: Dict[BandType, int] + ) -> Dict[BandType, int]: + move_queued_subtasks = {} + for is_gpu in (False, True): + band_name_prefix = "numa" if not is_gpu else "gpu" + + filtered_bands = [ + band for band in self._bands if band[1].startswith(band_name_prefix) + ] + filtered_band_to_queued_num = { + k: v + for k, v in band_to_queued_num.items() + if k[1].startswith(band_name_prefix) + } + + if not filtered_bands: + continue + + num_used_bands = len(filtered_band_to_queued_num.keys()) + if num_used_bands == 1: + [(band, length)] = filtered_band_to_queued_num.items() + if length == 0: + move_queued_subtasks.update({band: 0}) + continue + # no need to balance when there's only one band initially + if len(filtered_bands) == 1 and band == filtered_bands[0]: + move_queued_subtasks.update({band: 0}) + continue + # unready bands recorded in band_num_queued_subtasks, some of them may hold 0 subtasks + unready_bands = list( + set(filtered_band_to_queued_num.keys()) - set(filtered_bands) + ) + # ready bands not recorded in band_num_queued_subtasks, all of them hold 0 subtasks + new_ready_bands = list( + set(filtered_bands) - set(filtered_band_to_queued_num.keys()) + ) + # when there are new ready bands, make all bands hold same amount of subtasks + # when there are no new ready bands now, move out subtasks left on them + if not new_ready_bands and unready_bands: + filtered_band_to_queued_num = { + k: filtered_band_to_queued_num[k] for k in unready_bands + } + # approximate total of subtasks moving to each ready band + num_all_subtasks = sum(filtered_band_to_queued_num.values()) + mean = int(num_all_subtasks / len(filtered_bands)) + # all_bands (namely) includes: + # a. ready bands recorded in band_num_queued_subtasks + # b. ready bands not recorded in band_num_queued_subtasks + # c. 
unready bands recorded in band_num_queued_subtasks + # a. + b. = self._bands, a. + c. = bands in band_num_queued_subtasks + all_bands = list( + set(filtered_bands) | set(filtered_band_to_queued_num.keys()) + ) + # calculate the differential steps of moving subtasks + # move < 0 means subtasks should move out and vice versa + # unready bands no longer hold subtasks + # assuming bands not recorded in band_num_queued_subtasks hold 0 subtasks + band_move_nums = {} + for band in all_bands: + if band in filtered_bands: + band_move_nums[band] = mean - filtered_band_to_queued_num.get( + band, 0 + ) + else: + band_move_nums[band] = -filtered_band_to_queued_num.get(band, 0) + # ensure the balance of moving in and out + total_move = sum(band_move_nums.values()) + # int() is going to be closer to zero, so `mean` is no more than actual mean value + # total_move = mean * len(self._bands) - num_all_subtasks + # <= actual_mean * len(self._bands) - num_all_subtasks = 0 + assert total_move <= 0 + if total_move != 0: + band_move_nums[self._get_random_band(False)] -= total_move + move_queued_subtasks.update(band_move_nums) + return dict(sorted(move_queued_subtasks.items(), key=lambda item: item[1])) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/autoscale.py b/python/xorbits/_mars/services/scheduling/supervisor/autoscale.py new file mode 100644 index 000000000..a442a69a6 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/autoscale.py @@ -0,0 +1,441 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import importlib +import logging +import random +import time +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ....typing import BandType +from ...cluster.api import ClusterAPI +from ...cluster.core import NodeRole, NodeStatus +from ..errors import NoAvailableBand + +logger = logging.getLogger(__name__) + + +class AutoscalerActor(mo.Actor): + def __init__(self, autoscale_conf: Dict[str, Any]): + self._enabled = autoscale_conf.get("enabled", False) + self._autoscale_conf = autoscale_conf + self._cluster_api = None + self.queueing_refs = dict() + self.global_resource_ref = None + self._dynamic_workers: Set[str] = set() + self._autoscale_in_disable_counter = 0 + + async def __post_create__(self): + strategy = self._autoscale_conf.get("strategy") + if strategy: # pragma: no cover + module, name = strategy.rsplit(".", 1) + strategy_cls = getattr(importlib.import_module(module), name) + else: + strategy_cls = PendingTaskBacklogStrategy + from ..supervisor import GlobalResourceManagerActor + + self.global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=self.address + ) + self._cluster_api = await ClusterAPI.create(self.address) + self._strategy = await strategy_cls.create(self._autoscale_conf, self) + if self._enabled: + logger.info(f"Auto scale strategy %s started", self._strategy) + await self._strategy.start() + + async def __pre_destroy__(self): + if self._enabled: + await self._strategy.stop() + + async def register_session(self, session_id: str, address: str): + from .queueing import SubtaskQueueingActor + + self.queueing_refs[session_id] = await mo.actor_ref( + SubtaskQueueingActor.gen_uid(session_id), address=address + ) + + async def unregister_session(self, session_id: str): + self.queueing_refs.pop(session_id, None) + + async def request_worker( + self, worker_cpu: int = None, worker_mem: int = None, timeout: int = None + ) -> str: + start_time = time.time() + worker_address = await self._cluster_api.request_worker( + worker_cpu, worker_mem, timeout + ) + if worker_address: + self._dynamic_workers.add(worker_address) + logger.warning( + "Requested new worker %s in %.4f seconds, current dynamic worker nums is %s", + worker_address, + time.time() - start_time, + self.get_dynamic_worker_nums(), + ) + return worker_address + else: + logger.warning( + "Request worker with resource %s failed in %.4f seconds.", + dict(worker_cpu=worker_cpu, worker_mem=worker_mem), + time.time() - start_time, + ) + + async def disable_autoscale_in(self): + self._autoscale_in_disable_counter += 1 + if self._enabled: + logger.info("Disabled autoscale_in") + + async def try_enable_autoscale_in(self): + self._autoscale_in_disable_counter -= 1 + if self._autoscale_in_disable_counter == 0 and self._enabled: + logger.info("Enabled autoscale_in") + + async def release_workers(self, addresses: List[str]) -> List[str]: + """ + Release a group of worker nodes. + Parameters + ---------- + addresses : List[str] + The addresses of the specified node. + """ + if self._autoscale_in_disable_counter > 0: + return [] + workers_bands = { + address: await self.get_worker_bands(address) for address in addresses + } + logger.info( + "Start to release workers %s which have bands %s.", + addresses, + workers_bands, + ) + for address in addresses: + await self._cluster_api.set_node_status( + node=address, role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + # Ensure global_slot_manager get latest bands timely, so that we can invoke `wait_band_idle` + # to ensure there won't be new tasks scheduled to the stopping worker. 
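+        # Each worker is then drained in turn: wait for all of its bands to become
+        # idle, migrate the data they hold to other bands, and only after that
+        # release the node from the cluster (see `release_worker` below).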
+ await self.global_resource_ref.refresh_bands() + excluded_bands = set(b for bands in workers_bands.values() for b in bands) + + async def release_worker(address): + logger.info("Start to release worker %s.", address) + worker_bands = workers_bands[address] + await asyncio.gather( + *[ + self.global_resource_ref.wait_band_idle(band) + for band in worker_bands + ] + ) + await self._migrate_data_of_bands(worker_bands, excluded_bands) + await self._cluster_api.release_worker(address) + self._dynamic_workers.remove(address) + logger.info("Released worker %s.", address) + + # Release workers one by one to ensure others workers which the current is moving data to + # is not being releasing. + for address in addresses: + await release_worker(address) + return addresses + + def get_dynamic_workers(self) -> Set[str]: + return self._dynamic_workers + + def get_dynamic_worker_nums(self) -> int: + return len(self._dynamic_workers) + + async def get_worker_bands(self, worker_address) -> List[BandType]: + node_info = ( + await self._cluster_api.get_nodes_info( + [worker_address], resource=True, exclude_statuses=set() + ) + )[worker_address] + return [ + (worker_address, resource_type) + for resource_type in node_info["resource"].keys() + ] + + async def _migrate_data_of_bands( + self, bands: List[BandType], excluded_bands: Set[BandType] + ): + """Move data from `bands` to other available bands""" + session_ids = list(self.queueing_refs.keys()) + for session_id in session_ids: + from ...meta import MetaAPI + + meta_api = await MetaAPI.create(session_id, self.address) + + batch_fetch, batch_delete = defaultdict(list), defaultdict(list) + batch_add_chunk_bands, batch_remove_chunk_bands = [], [] + for src_band in bands: + band_data_keys = await meta_api.get_band_chunks(src_band) + for data_key in band_data_keys: + dest_band = await self._select_target_band( + src_band, data_key, excluded_bands + ) + logger.debug( + "Move chunk % from band %s to band %s.", + data_key, + src_band, + dest_band, + ) + dest_storage_api = await self._get_storage_api( + session_id, dest_band[0] + ) + # For ray backend, there will only be meta update rather than data transfer + batch_fetch[dest_storage_api].append( + dest_storage_api.fetch.delay( + data_key, band_name=src_band[1], remote_address=src_band[0] + ) + ) + src_storage_api = await self._get_storage_api( + session_id, src_band[0] + ) + batch_delete[src_storage_api].append( + src_storage_api.delete.delay(data_key) + ) + batch_add_chunk_bands.append( + meta_api.add_chunk_bands.delay(data_key, [dest_band]) + ) + batch_remove_chunk_bands.append( + meta_api.remove_chunk_bands.delay(data_key, [src_band]) + ) + await asyncio.gather( + *[api.fetch.batch(*fetches) for api, fetches in batch_fetch.items()] + ) + await meta_api.add_chunk_bands.batch(*batch_add_chunk_bands) + await meta_api.remove_chunk_bands.batch(*batch_remove_chunk_bands) + await asyncio.gather( + *[api.delete.batch(*deletes) for api, deletes in batch_delete.items()] + ) + + async def _select_target_band( + self, band: BandType, data_key: str, excluded_bands: Set[BandType] + ): + all_bands = await self._cluster_api.get_all_bands() + bands = list( + b + for b in all_bands.keys() + if (b[1] == band[1] and b != band and b not in excluded_bands) + ) + if not bands: # pragma: no cover + raise NoAvailableBand( + f"No bands to migrate data to, " + f"all available bands is {all_bands}, " + f"current band is {band}, " + f"excluded bands are {excluded_bands}." 
+ ) + # TODO select band based on remaining store space size of other bands + return random.choice(bands) + + @alru_cache(cache_exceptions=False) + async def _get_storage_api(self, session_id: str, address: str): + from ...storage import StorageAPI + + return await StorageAPI.create(session_id, address) + + +class AbstractScaleStrategy(ABC): + @classmethod + @abstractmethod + async def create(cls, autoscale_conf: Dict[str, Any], autoscaler): + """Create a autoscale strategy which will decide when to scale in/.out""" + + @abstractmethod + async def start(self): + """Start auto scale""" + + @abstractmethod + async def stop(self): + """Stop auto scale""" + + +class PendingTaskBacklogStrategy(AbstractScaleStrategy): + _task: Optional[asyncio.Task] + + def __init__(self, autoscale_conf: Dict[str, Any], autoscaler): + self._autoscaler = autoscaler + self._scheduler_check_interval = autoscale_conf.get( + "scheduler_check_interval", 1 + ) + self._scheduler_backlog_timeout = autoscale_conf.get( + "scheduler_backlog_timeout", 20 + ) + self._sustained_scheduler_backlog_timeout = autoscale_conf.get( + "sustained_scheduler_backlog_timeout", self._scheduler_backlog_timeout + ) + # Make worker_idle_timeout greater than scheduler_backlog_timeout to + # avoid cluster fluctuate back and forth。 + self._worker_idle_timeout = autoscale_conf.get( + "worker_idle_timeout", 2 * self._scheduler_backlog_timeout + ) + self._min_workers = autoscale_conf.get("min_workers", 1) + assert self._min_workers >= 1, "Mars need at least 1 worker." + self._max_workers = autoscale_conf.get("max_workers", 100) + self._task = None + + @classmethod + async def create(cls, autoscale_conf: Dict[str, Any], autoscaler): + return cls(autoscale_conf, autoscaler) + + async def start(self): + self._task = asyncio.create_task(self._run()) + + async def _run(self): + try: + delta = self._min_workers - self._autoscaler.get_dynamic_worker_nums() + while delta > 0: + logger.info(f"Start to request %s initial workers.", delta) + initial_worker_addresses = await asyncio.gather( + *[self._autoscaler.request_worker() for _ in range(delta)] + ) + initial_worker_addresses = [ + addr for addr in initial_worker_addresses if addr is not None + ] + logger.info( + f"Requested %s initial workers %s", + len(initial_worker_addresses), + initial_worker_addresses, + ) + delta = self._min_workers - self._autoscaler.get_dynamic_worker_nums() + while True: + await asyncio.sleep(self._scheduler_check_interval) + await self._run_round() + except asyncio.CancelledError: # pragma: no cover + logger.info("Canceled pending task backlog strategy.") + except Exception as e: # pragma: no cover + logger.exception("Exception occurred when try to auto scale") + raise e + + async def _run_round(self): + queueing_refs = list(self._autoscaler.queueing_refs.values()) + if any([await queueing_ref.all_bands_busy() for queueing_ref in queueing_refs]): + await self._scale_out(queueing_refs) + else: + await self._scale_in() + + async def _scale_out(self, queueing_refs): + logger.info( + "Try to scale out, current dynamic workers %s", + self._autoscaler.get_dynamic_worker_nums(), + ) + start_time = time.time() + while not await self._autoscaler.request_worker(): + logger.warning( + "Request worker failed, wait %s seconds and retry.", + self._scheduler_check_interval, + ) + await asyncio.sleep(self._scheduler_check_interval) + await asyncio.sleep(self._scheduler_backlog_timeout) + rnd = 1 + while any( + [await queueing_ref.all_bands_busy() for queueing_ref in queueing_refs] + ): + 
worker_num = 2**rnd + if ( + self._autoscaler.get_dynamic_worker_nums() + worker_num + > self._max_workers + ): + worker_num = ( + self._max_workers - self._autoscaler.get_dynamic_worker_nums() + ) + while set( + await asyncio.gather( + *[self._autoscaler.request_worker() for _ in range(worker_num)] + ) + ) == {None}: + logger.warning( + "Request %s workers all failed, wait %s seconds and retry.", + worker_num, + self._scheduler_check_interval, + ) + await asyncio.sleep(self._scheduler_check_interval) + rnd += 1 + await asyncio.sleep(self._sustained_scheduler_backlog_timeout) + logger.info( + "Scale out finished in %s round, took %s seconds, current dynamic workers %s", + rnd, + time.time() - start_time, + self._autoscaler.get_dynamic_worker_nums(), + ) + + async def _scale_in(self): + idle_bands = set( + await self._autoscaler.global_resource_ref.get_idle_bands( + self._worker_idle_timeout + ) + ) + # exclude non-dynamic created workers and ensure all bands of the worker are idle + idle_bands = { + band + for band in idle_bands + if band[0] in self._autoscaler.get_dynamic_workers() + and idle_bands.issuperset( + set(await self._autoscaler.get_worker_bands(band[0])) + ) + } + worker_addresses = set(band[0] for band in idle_bands) + if worker_addresses: + logger.debug( + "Bands %s of workers % has been idle for as least %s seconds.", + idle_bands, + worker_addresses, + self._worker_idle_timeout, + ) + while ( + worker_addresses + and self._autoscaler.get_dynamic_worker_nums() - len(worker_addresses) + < self._min_workers + ): + worker_address = worker_addresses.pop() + logger.debug( + "Skip offline idle worker %s to keep at least %s dynamic workers. " + "Current total dynamic workers is %s.", + worker_address, + self._min_workers, + self._autoscaler.get_dynamic_worker_nums(), + ) + idle_bands.difference_update( + set(await self._autoscaler.get_worker_bands(worker_address)) + ) + if worker_addresses: + start_time = time.time() + logger.info( + "Try to offline idle workers %s with bands %s.", + worker_addresses, + idle_bands, + ) + try: + worker_addresses = await self._autoscaler.release_workers( + worker_addresses + ) + logger.info( + "Finished offline workers %s in %.4f seconds", + worker_addresses, + time.time() - start_time, + ) + except NoAvailableBand as e: # pragma: no cover + logger.warning( + "No enough bands, offline workers %s failed with exception %s.", + worker_addresses, + e, + ) + + async def stop(self): + self._task.cancel() + await self._task diff --git a/python/xorbits/_mars/services/scheduling/supervisor/globalresource.py b/python/xorbits/_mars/services/scheduling/supervisor/globalresource.py new file mode 100644 index 000000000..8e26889a6 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/globalresource.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import time +from collections import defaultdict +from typing import DefaultDict, Dict, List, Tuple + +from .... 
import oscar as mo +from ....resource import Resource, ZeroResource +from ....typing import BandType + +logger = logging.getLogger(__name__) + + +class GlobalResourceManagerActor(mo.Actor): + # {(address, resource_type): {(session_id, subtask_id): Resource(...)}} + _band_stid_resources: DefaultDict[BandType, Dict[Tuple[str, str], Resource]] + _band_used_resources: Dict[BandType, Resource] + _band_total_resources: Dict[BandType, Resource] + + def __init__(self): + self._band_stid_resources = defaultdict(dict) + self._band_used_resources = defaultdict(lambda: ZeroResource) + self._band_idle_start_time = dict() + self._band_idle_events = dict() + self._band_total_resources = dict() + self._cluster_api = None + self._band_watch_task = None + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + self._cluster_api = await ClusterAPI.create(self.address) + + async def watch_bands(): + async for bands in self._cluster_api.watch_all_bands(): + old_bands = set(self._band_total_resources.keys()) + self._band_total_resources = bands + new_bands = set(bands.keys()) - old_bands + for band in new_bands: + self._update_band_usage(band, ZeroResource) + + self._band_watch_task = asyncio.create_task(watch_bands()) + + async def __pre_destroy__(self): + self._band_watch_task.cancel() + + async def refresh_bands(self): + self._band_total_resources = await self._cluster_api.get_all_bands() + + @mo.extensible + async def apply_subtask_resources( + self, + band: BandType, + session_id: str, + subtask_ids: List[str], + subtask_resources: List[Resource], + ) -> List[str]: + if ( + not self._band_total_resources or band not in self._band_total_resources + ): # pragma: no cover + await self.refresh_bands() + idx = 0 + # only ready bands will pass + if band in self._band_total_resources: + total_resource = self._band_total_resources[band] + for stid, subtask_resource in zip(subtask_ids, subtask_resources): + band_used_resource = self._band_used_resources[band] + if band_used_resource + subtask_resource > total_resource: + break + self._band_stid_resources[band][(session_id, stid)] = subtask_resource + self._update_band_usage(band, subtask_resource) + idx += 1 + if idx == 0: + logger.debug( + "No resources available, status: %r, request: %r", + self._band_used_resources, + subtask_resources, + ) + return subtask_ids[:idx] + + @mo.extensible + def update_subtask_resources( + self, band: BandType, session_id: str, subtask_id: str, resource: Resource + ): + session_subtask_id = (session_id, subtask_id) + subtask_resources = self._band_stid_resources[band] + if session_subtask_id not in subtask_resources: + return + + resource_delta = resource - subtask_resources[session_subtask_id] + subtask_resources[session_subtask_id] = resource + self._update_band_usage(band, resource_delta) + + @mo.extensible + def release_subtask_resource( + self, band: BandType, session_id: str, subtask_id: str + ): + # todo ensure slots released when subtasks ends in all means + resource_delta = self._band_stid_resources[band].pop( + (session_id, subtask_id), ZeroResource + ) + self._update_band_usage(band, -resource_delta) + + def _update_band_usage(self, band: BandType, band_usage_delta: Resource): + self._band_used_resources[band] += band_usage_delta + # some code path doesn't call `apply_subtask_resources` + band_total_resource = self._band_total_resources.get(band) + if ( + band_total_resource is not None + and self._band_used_resources[band] > band_total_resource + ): # pragma: no cover + raise Exception( + 
f"Resource exceed: band used resource {self._band_used_resources[band]} " + f"band total resource {self._band_total_resources[band]}" + ) + if self._band_used_resources[band] <= ZeroResource: + self._band_used_resources.pop(band) + self._band_idle_start_time[band] = time.time() + if band in self._band_idle_events: + self._band_idle_events.pop(band).set() + else: + self._band_idle_start_time[band] = -1 + + def get_used_resources(self) -> Dict[BandType, Resource]: + return self._band_used_resources + + def get_remaining_resources(self) -> Dict[BandType, Resource]: + resources = {} + for band, resource in self._band_total_resources.items(): + used_resource = self.get_used_resources()[band] + resources[band] = resource - used_resource + return resources + + async def get_idle_bands(self, idle_duration: int): + """Return a band list which all bands has been idle for at least `idle_duration` seconds.""" + now = time.time() + idle_bands = [] + for band in self._band_total_resources.keys(): + idle_start_time = self._band_idle_start_time.get(band) + if idle_start_time is None: # pragma: no cover + # skip new requested band for this round scale in. + self._band_idle_start_time[band] = now + elif idle_start_time > 0 and now >= idle_start_time + idle_duration: + idle_bands.append(band) + return idle_bands + + async def wait_band_idle(self, band: BandType): + if self._band_idle_start_time[band] <= 0: + if band in self._band_idle_events: + event = self._band_idle_events[band] + else: + event = asyncio.Event() + self._band_idle_events[band] = event + return event.wait() diff --git a/python/xorbits/_mars/services/scheduling/supervisor/manager.py b/python/xorbits/_mars/services/scheduling/supervisor/manager.py new file mode 100644 index 000000000..8424937ea --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/manager.py @@ -0,0 +1,446 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import time +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....oscar.backends.context import ProfilingContext +from ....oscar.errors import MarsError +from ....oscar.profiling import MARS_ENABLE_PROFILING, ProfilingData +from ....typing import BandType +from ....utils import Timer, dataslots +from ...subtask import Subtask, SubtaskResult, SubtaskStatus +from ...task import TaskAPI +from ..core import SubtaskScheduleSummary +from ..utils import redirect_subtask_errors + +logger = logging.getLogger(__name__) + + +# the default times to reschedule subtask. 
+DEFAULT_SUBTASK_MAX_RESCHEDULES = 0 + + +@dataslots +@dataclass +class SubtaskScheduleInfo: + subtask: Subtask + band_futures: Dict[BandType, asyncio.Future] = field(default_factory=dict) + start_time: int = -1 + end_time: int = -1 + max_reschedules: int = 0 + num_reschedules: int = 0 + num_speculative_concurrent_run: int = 0 + + def to_summary(self, **kwargs) -> SubtaskScheduleSummary: + return SubtaskScheduleSummary( + task_id=self.subtask.task_id, + subtask_id=self.subtask.subtask_id, + bands=list(self.band_futures.keys()), + num_reschedules=self.num_reschedules, + **kwargs, + ) + + +class SubtaskManagerActor(mo.Actor): + _subtask_infos: Dict[str, SubtaskScheduleInfo] # subtask id -> schedule info + _subtask_summaries: Dict[str, SubtaskScheduleSummary] # subtask id -> summary + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_subtask_manager" + + def __init__( + self, + session_id: str, + subtask_max_reschedules: int = DEFAULT_SUBTASK_MAX_RESCHEDULES, + subtask_cancel_timeout: int = 5, + speculation_config: Dict[str, object] = None, + ): + self._session_id = session_id + self._subtask_infos = dict() + self._subtask_summaries = dict() + self._subtask_max_reschedules = subtask_max_reschedules + self._subtask_cancel_timeout = subtask_cancel_timeout + self._speculation_config = speculation_config or {} + self._queueing_ref = None + self._global_resource_ref = None + self._submitted_subtask_count = Metrics.counter( + "mars.scheduling.submitted_subtask_count", + "The count of submitted subtasks to all bands.", + ("session_id", "task_id", "stage_id"), + ) + self._finished_subtask_count = Metrics.counter( + "mars.scheduling.finished_subtask_count", + "The count of finished subtasks of all bands.", + ("session_id", "task_id", "stage_id"), + ) + self._canceled_subtask_count = Metrics.counter( + "mars.scheduling.canceled_subtask_count", + "The count of canceled subtasks of all bands.", + ("session_id", "task_id", "stage_id"), + ) + logger.info( + "Created SubtaskManager with subtask_max_reschedules %s, " + "speculation_config %s", + self._subtask_max_reschedules, + speculation_config, + ) + + async def __post_create__(self): + from .queueing import SubtaskQueueingActor + + self._queueing_ref = await mo.actor_ref( + SubtaskQueueingActor.gen_uid(self._session_id), address=self.address + ) + from ..supervisor import GlobalResourceManagerActor + + self._global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=self.address + ) + from .speculation import SpeculativeScheduler + + self._speculation_execution_scheduler = SpeculativeScheduler( + self._queueing_ref, self._global_resource_ref, self._speculation_config + ) + await self._speculation_execution_scheduler.start() + + async def __pre_destroy__(self): + await self._speculation_execution_scheduler.stop() + + @alru_cache + async def _get_task_api(self): + return await TaskAPI.create(self._session_id, self.address) + + async def add_subtasks(self, subtasks: List[Subtask], priorities: List[Tuple]): + async with redirect_subtask_errors(self, subtasks): + for subtask in subtasks: + # the extra_config may be None. the extra config overwrites the default value. 
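+                # Per-subtask `extra_config["subtask_max_reschedules"]` takes precedence;
+                # otherwise fall back to the manager-wide default passed in at creation.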
+ subtask_max_reschedules = ( + subtask.extra_config.get("subtask_max_reschedules") + if subtask.extra_config + else None + ) + if subtask_max_reschedules is None: + subtask_max_reschedules = self._subtask_max_reschedules + if subtask.subtask_id in self._subtask_infos: # pragma: no cover + raise KeyError(f"Subtask {subtask.subtask_id} already added.") + self._subtask_infos[subtask.subtask_id] = SubtaskScheduleInfo( + subtask, max_reschedules=subtask_max_reschedules + ) + + virtual_subtasks = [subtask for subtask in subtasks if subtask.virtual] + for subtask in virtual_subtasks: + task_api = await self._get_task_api() + await task_api.set_subtask_result( + SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + progress=1.0, + status=SubtaskStatus.succeeded, + ) + ) + await self._queueing_ref.add_subtasks( + [subtask for subtask in subtasks if not subtask.virtual], priorities + ) + await self._queueing_ref.submit_subtasks.tell() + + @alru_cache(maxsize=10000) + async def _get_execution_ref(self, band: BandType): + from ..worker.execution import SubtaskExecutionActor + + return await mo.actor_ref(SubtaskExecutionActor.default_uid(), address=band[0]) + + async def finish_subtasks( + self, + subtask_ids: List[str], + bands: List[BandType] = None, + schedule_next: bool = True, + ): + logger.debug("Finished subtasks %s.", subtask_ids) + band_tasks = defaultdict(lambda: 0) + bands = bands or [None] * len(subtask_ids) + for subtask_id, subtask_band in zip(subtask_ids, bands): + subtask_info = self._subtask_infos.get(subtask_id, None) + if subtask_info is not None: + self._finished_subtask_count.record( + 1, + { + "session_id": self._session_id, + "task_id": subtask_info.subtask.task_id, + "stage_id": subtask_info.subtask.stage_id, + }, + ) + self._subtask_summaries[subtask_id] = subtask_info.to_summary( + is_finished=True + ) + subtask_info.end_time = time.time() + self._speculation_execution_scheduler.finish_subtask(subtask_info) + # Cancel subtask on other bands. + aio_task = subtask_info.band_futures.pop(subtask_band, None) + if aio_task: + await aio_task + if schedule_next: + band_tasks[subtask_band] += 1 + if subtask_info.band_futures: + # Cancel subtask here won't change subtask status. + # See more in `TaskProcessorActor.set_subtask_result` + logger.info( + "Try to cancel subtask %s on bands %s.", + subtask_id, + set(subtask_info.band_futures.keys()), + ) + # Cancel subtask can be async and may need to kill slot which need more time. + # Can't use `tell` here because next line remove subtask info which is needed by + # `cancel_subtasks`. + yield self.ref().cancel_subtasks([subtask_id]) + # cancel subtask first then pop subtask info. 
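+                # Popping first would make the `cancel_subtasks` call above a no-op,
+                # since it looks the subtask up in `self._subtask_infos`.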
+ self._subtask_infos.pop(subtask_id, None) + if schedule_next: + for band in subtask_info.band_futures.keys(): + band_tasks[band] += 1 + await self._queueing_ref.remove_queued_subtasks(subtask_ids) + if band_tasks: + tasks = [] + for band, subtask_count in band_tasks.items(): + task = asyncio.ensure_future( + self._queueing_ref.submit_subtasks.tell(band, subtask_count) + ) + tasks.append(task) + await asyncio.wait(tasks) + + def _get_subtasks_by_ids(self, subtask_ids: List[str]) -> List[Optional[Subtask]]: + subtasks = [] + for stid in subtask_ids: + try: + subtasks.append(self._subtask_infos[stid].subtask) + except KeyError: + subtasks.append(None) + return subtasks + + async def submit_subtask_to_band(self, subtask_id: str, band: BandType): + if subtask_id not in self._subtask_infos: # pragma: no cover + logger.info( + "Subtask %s is not in added subtasks set, it may be finished or canceled, skip it.", + subtask_id, + ) + return + async with redirect_subtask_errors( + self, self._get_subtasks_by_ids([subtask_id]) + ): + try: + subtask_info = self._subtask_infos[subtask_id] + execution_ref = await self._get_execution_ref(band) + extra_config = subtask_info.subtask.extra_config + enable_profiling = MARS_ENABLE_PROFILING or ( + extra_config and extra_config.get("enable_profiling") + ) + profiling_context = ( + ProfilingContext(subtask_info.subtask.task_id) + if enable_profiling + else None + ) + self._submitted_subtask_count.record( + 1, + { + "session_id": self._session_id, + "task_id": subtask_info.subtask.task_id, + "stage_id": subtask_info.subtask.stage_id, + }, + ) + logger.debug("Start run subtask %s in band %s.", subtask_id, band) + with Timer() as timer: + task = asyncio.create_task( + execution_ref.run_subtask.options( + profiling_context=profiling_context + ).send(subtask_info.subtask, band[1], self.address) + ) + subtask_info.band_futures[band] = task + subtask_info.start_time = time.time() + self._speculation_execution_scheduler.add_subtask(subtask_info) + result = yield task + ProfilingData.collect_subtask( + subtask_info.subtask, band, timer.duration + ) + task_api = await self._get_task_api() + logger.debug("Finished subtask %s with result %s.", subtask_id, result) + await task_api.set_subtask_result(result) + except (OSError, MarsError) as ex: + # TODO: We should handle ServerClosed Error. 
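+                # OSError / MarsError are treated as potentially transient (e.g. the
+                # worker process went away); if the subtask is retryable and still has
+                # reschedules left, re-queue it while excluding the bands it already ran on.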
+ if ( + subtask_info.subtask.retryable + and subtask_info.num_reschedules < subtask_info.max_reschedules + ): + logger.error( + "Reschedule subtask %s due to %s", + subtask_info.subtask.subtask_id, + ex, + ) + subtask_info.num_reschedules += 1 + await self._queueing_ref.add_subtasks( + [subtask_info.subtask], + [subtask_info.subtask.priority or tuple()], + exclude_bands=set(subtask_info.band_futures.keys()), + ) + else: + raise ex + except asyncio.CancelledError: + raise + except Exception as ex: + if ( + subtask_info.subtask.retryable + and subtask_info.num_reschedules < subtask_info.max_reschedules + ): + logger.error( + "Failed to reschedule subtask %s, " + "num_reschedules: %s, max_reschedules: %s, unhandled exception: %s", + subtask_info.subtask.subtask_id, + subtask_info.num_reschedules, + subtask_info.max_reschedules, + ex, + ) + raise ex + finally: + # make sure slot is released before marking tasks as finished + await self._global_resource_ref.release_subtask_resource( + band, + subtask_info.subtask.session_id, + subtask_info.subtask.subtask_id, + ) + logger.debug( + "Slot released for band %s after subtask %s", + band, + subtask_info.subtask.subtask_id, + ) + # We should call submit_subtasks after the resource is released. + # If submit_subtasks runs before release_subtask_resource + # then the rescheduled subtask may not be submitted due to + # no available resource. The mars will hangs. + if subtask_info.num_reschedules > 0: + await self._queueing_ref.submit_subtasks.tell() + + async def cancel_subtasks( + self, subtask_ids: List[str], kill_timeout: Union[float, int] = None + ): + kill_timeout = kill_timeout or self._subtask_cancel_timeout + logger.info( + "Start to cancel subtasks %s, kill timeout is %s.", + subtask_ids, + kill_timeout, + ) + queued_subtask_ids = [] + single_cancel_tasks = [] + + task_api = await self._get_task_api() + + async def cancel_single_task(subtask, raw_tasks, cancel_tasks): + if cancel_tasks: + await asyncio.wait(cancel_tasks) + if raw_tasks: + dones, _ = await asyncio.wait(raw_tasks) + else: + dones = [] + if not dones or all(fut.cancelled() for fut in dones): + await task_api.set_subtask_result( + SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + status=SubtaskStatus.cancelled, + ) + ) + + for subtask_id in subtask_ids: + if subtask_id not in self._subtask_infos: + # subtask may already finished or not submitted at all + logger.info( + "Skip cancel subtask %s, it may already finished or not submitted at all", + subtask_id, + ) + continue + + subtask_info = self._subtask_infos[subtask_id] + raw_tasks_to_cancel = list(subtask_info.band_futures.values()) + + if not raw_tasks_to_cancel: + queued_subtask_ids.append(subtask_id) + single_cancel_tasks.append( + asyncio.create_task( + cancel_single_task(subtask_info.subtask, [], []) + ) + ) + else: + cancel_tasks = [] + for band in subtask_info.band_futures.keys(): + execution_ref = await self._get_execution_ref(band) + cancel_tasks.append( + asyncio.create_task( + execution_ref.cancel_subtask( + subtask_id, kill_timeout=kill_timeout + ) + ) + ) + single_cancel_tasks.append( + asyncio.create_task( + cancel_single_task( + subtask_info.subtask, raw_tasks_to_cancel, cancel_tasks + ) + ) + ) + if queued_subtask_ids: + # Don't use `finish_subtasks` because it may remove queued + await self._queueing_ref.remove_queued_subtasks(queued_subtask_ids) + if single_cancel_tasks: + yield asyncio.wait(single_cancel_tasks) + + for 
subtask_id in subtask_ids: + subtask_info = self._subtask_infos.pop(subtask_id, None) + if subtask_info is not None: + self._subtask_summaries[subtask_id] = subtask_info.to_summary( + is_finished=True, is_cancelled=True + ) + self._canceled_subtask_count.record( + 1, + { + "session_id": self._session_id, + "task_id": subtask_info.subtask.task_id, + "stage_id": subtask_info.subtask.stage_id, + }, + ) + await self._queueing_ref.submit_subtasks.tell() + logger.info("Subtasks %s canceled.", subtask_ids) + + def get_schedule_summaries(self, task_id: Optional[str] = None): + if task_id is not None: + summaries = { + subtask_id: summary + for subtask_id, summary in self._subtask_summaries.items() + if summary.task_id == task_id + } + else: + summaries = dict(self._subtask_summaries) + for info in self._subtask_infos.values(): + if task_id is None or info.subtask.task_id == task_id: + summaries[info.subtask.subtask_id] = info.to_summary() + return list(summaries.values()) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/queueing.py b/python/xorbits/_mars/services/scheduling/supervisor/queueing.py new file mode 100644 index 000000000..4f55f26ca --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/queueing.py @@ -0,0 +1,350 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import heapq +import logging +from collections import defaultdict +from dataclasses import dataclass +from typing import DefaultDict, Dict, List, Optional, Set, Tuple, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....resource import ZeroResource +from ....utils import dataslots +from ...subtask import Subtask +from ...task import TaskAPI +from ..utils import redirect_subtask_errors + +logger = logging.getLogger(__name__) + +_DEFAULT_SUBMIT_PERIOD = 0 + + +@dataslots +@dataclass +class HeapItem: + subtask: Subtask + priority: Tuple + + def __lt__(self, other: "HeapItem"): + return self.priority > other.priority + + +class SubtaskQueueingActor(mo.Actor): + _stid_to_bands: DefaultDict[str, List[Tuple]] + _stid_to_items: Dict[str, HeapItem] + _band_queues: DefaultDict[Tuple, List[HeapItem]] + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_subtask_queueing" + + def __init__(self, session_id: str, submit_period: Union[float, int] = None): + self._session_id = session_id + self._stid_to_bands = defaultdict(list) + self._stid_to_items = dict() + # Note that we need to ensure top item in every band heap queue is valid, + # so that we can ensure band queue is busy if the band queue is not empty. 
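+        # Each band queue is a heap of `HeapItem`s; `HeapItem.__lt__` deliberately
+        # compares priorities with `>`, so the stdlib min-heap behaves as a max-heap
+        # and `heappop` always yields the highest-priority subtask of the band.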
+        self._band_queues = defaultdict(list)
+
+        self._cluster_api = None
+        self._slots_ref = None
+        self._assigner_ref = None
+
+        self._band_to_resource = dict()
+        self._band_watch_task = None
+        self._max_enqueue_id = 0
+
+        self._periodical_submit_task = None
+        self._submit_period = submit_period or _DEFAULT_SUBMIT_PERIOD
+        self._submitted_subtask_number = Metrics.gauge(
+            "mars.band.submitted_subtask_number",
+            "The number of subtasks submitted to a band.",
+            ("session_id", "band"),
+        )
+        self._unsubmitted_subtask_number = Metrics.gauge(
+            "mars.band.unsubmitted_subtask_number",
+            "The number of subtasks not yet submitted to a band.",
+            ("session_id", "band"),
+        )
+
+    async def __post_create__(self):
+        from ...cluster import ClusterAPI
+
+        self._cluster_api = await ClusterAPI.create(self.address)
+        self._band_to_resource = {}
+
+        async def watch_bands():
+            async for bands in self._cluster_api.watch_all_bands():
+                # only act when the set of ready bands actually changed
+                if bands != self._band_to_resource:
+                    old_band_resource = self._band_to_resource
+                    self._band_to_resource = copy.deepcopy(bands)
+                    if self._band_queues:
+                        await self.balance_queued_subtasks()
+                        # Refresh the global slot manager so it sees the latest bands
+                        # and every band that received reassigned subtasks can get at
+                        # least one subtask submitted successfully.
+                        await self._slots_ref.refresh_bands()
+                    all_bands = {*bands.keys(), *old_band_resource.keys()}
+                    bands_delta = {}
+                    for b in all_bands:
+                        new_resource = bands.get(b, ZeroResource)
+                        old_resource = old_band_resource.get(b, ZeroResource)
+                        delta = new_resource - old_resource
+                        if delta != ZeroResource:
+                            bands_delta[b] = delta
+                    # Submit subtasks on new bands manually, otherwise some subtasks
+                    # will never get submitted. Note that we must ensure every new
+                    # band gets at least one subtask submitted successfully; later
+                    # submissions on a band are triggered by the success of previous
+                    # subtasks on the same band.
+ logger.info( + "Bands changed with delta %s, submit all bands.", + bands_delta, + ) + await self.ref().submit_subtasks() + + self._band_watch_task = asyncio.create_task(watch_bands()) + + from .globalresource import GlobalResourceManagerActor + + [self._slots_ref] = await self._cluster_api.get_supervisor_refs( + [GlobalResourceManagerActor.default_uid()] + ) + from .assigner import AssignerActor + + self._assigner_ref = await mo.actor_ref( + AssignerActor.gen_uid(self._session_id), address=self.address + ) + + if self._submit_period > 0: + self._periodical_submit_task = self.ref().periodical_submit.tell_delay( + delay=self._submit_period + ) + + async def __pre_destroy__(self): + self._band_watch_task.cancel() + if self._periodical_submit_task is not None: # pragma: no branch + self._periodical_submit_task.cancel() + + async def periodical_submit(self): + await self.ref().submit_subtasks.tell() + self._periodical_submit_task = self.ref().periodical_submit.tell_delay( + delay=self._submit_period + ) + + @alru_cache + async def _get_task_api(self): + return await TaskAPI.create(self._session_id, self.address) + + @alru_cache(cache_exceptions=False) + async def _get_manager_ref(self): + from .manager import SubtaskManagerActor + + return await mo.actor_ref( + SubtaskManagerActor.gen_uid(self._session_id), address=self.address + ) + + async def add_subtasks( + self, + subtasks: List[Subtask], + priorities: List[Tuple], + exclude_bands: Set[Tuple] = None, + random_when_unavailable: bool = True, + ): + bands = await self._assigner_ref.assign_subtasks( + subtasks, exclude_bands, random_when_unavailable + ) + for subtask, band, priority in zip(subtasks, bands, priorities): + assert band is not None + self._stid_to_bands[subtask.subtask_id].append(band) + heap_item = self._stid_to_items[subtask.subtask_id] = HeapItem( + subtask, priority + (self._max_enqueue_id,) + ) + self._max_enqueue_id += 1 + heapq.heappush(self._band_queues[band], heap_item) + logger.debug( + "Subtask %s enqueued to band %s excluded from %s.", + subtask.subtask_id, + band, + exclude_bands, + ) + logger.debug("%d subtasks enqueued", len(subtasks)) + + async def submit_subtasks(self, band: Tuple = None, limit: Optional[int] = None): + logger.debug("Submitting subtasks with limit %s", limit) + + if not limit and band not in self._band_to_resource: + self._band_to_resource = await self._cluster_api.get_all_bands() + + bands = [band] if band is not None else list(self._band_to_resource.keys()) + submit_aio_tasks = [] + manager_ref = await self._get_manager_ref() + + apply_delays = [] + submit_items_list = [] + submitted_bands = [] + + for band in bands: + band_limit = limit or ( + self._band_to_resource[band].num_cpus + or self._band_to_resource[band].num_gpus + ) + task_queue = self._band_queues[band] + submit_items = dict() + while ( + self._ensure_top_item_valid(task_queue) + and len(submit_items) < band_limit + ): + item = heapq.heappop(task_queue) + submit_items[item.subtask.subtask_id] = item + + subtask_ids = list(submit_items) + if not subtask_ids: + continue + + submitted_bands.append(band) + submit_items_list.append(submit_items) + + # Before hbo, when a manager finish a subtask, it will schedule one subtask successfully because + # there is a slot idle. But now we have memory requirements, so the subtask may apply resource + # from supervisor failed. In such cases, those subtasks will never got scheduled. + # TODO We can use `_periodical_submit_task` to submit those subtasks. 
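+            # Resource requests for the popped items are batched per band; the global
+            # resource manager grants only the prefix that fits its free resources, and
+            # anything it rejects is pushed back onto the band queue further below.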
+ subtask_resources = [ + item.subtask.required_resource for item in submit_items.values() + ] + apply_delays.append( + self._slots_ref.apply_subtask_resources.delay( + band, self._session_id, subtask_ids, subtask_resources + ) + ) + + async with redirect_subtask_errors( + self, + [ + item.subtask + for submit_items in submit_items_list + for item in submit_items.values() + ], + ): + submitted_ids_list = await self._slots_ref.apply_subtask_resources.batch( + *apply_delays + ) + + for band, submit_items, submitted_ids in zip( + submitted_bands, submit_items_list, submitted_ids_list + ): + subtask_ids = list(submit_items) + task_queue = self._band_queues[band] + + async with redirect_subtask_errors( + self, [item.subtask for item in submit_items.values()] + ): + non_submitted_ids = [k for k in submit_items if k not in submitted_ids] + tags = { + "session_id": self._session_id, + "band": band[0] if band else "", + } + self._submitted_subtask_number.record(len(submitted_ids), tags) + self._unsubmitted_subtask_number.record(len(non_submitted_ids), tags) + if submitted_ids: + for stid in subtask_ids: + if stid not in submitted_ids: + continue + item = submit_items[stid] + logger.debug("Submit subtask %r to band %r", item.subtask, band) + submit_aio_tasks.append( + asyncio.create_task( + manager_ref.submit_subtask_to_band.tell( + item.subtask.subtask_id, band + ) + ) + ) + await asyncio.sleep(0) + self.remove_queued_subtasks([item.subtask.subtask_id]) + else: + logger.debug("No slots available") + + for stid in non_submitted_ids: + # TODO if subtasks submit failed due to lacking memory/cpu/gpu resources, lower the priority so that + # other subtasks can be submitted. + heapq.heappush(task_queue, submit_items[stid]) + + if submit_aio_tasks: + yield asyncio.gather(*submit_aio_tasks) + + def _ensure_top_item_valid(self, task_queue): + """Clean invalid subtask item from the queue to ensure that when the queue is not empty, + there is always some subtasks waiting being scheduled.""" + while ( + task_queue and task_queue[0].subtask.subtask_id not in self._stid_to_items + ): + # skip removed items (as they may be re-pushed into the queue) + heapq.heappop(task_queue) + return bool(task_queue) + + @mo.extensible + def update_subtask_priority(self, subtask_id: str, priority: Tuple): + if subtask_id not in self._stid_to_bands: + return + for band in self._stid_to_bands[subtask_id]: + new_item = HeapItem(self._stid_to_items[subtask_id].subtask, priority) + self._stid_to_items[subtask_id] = new_item + heapq.heappush(self._band_queues[band], new_item) + + def remove_queued_subtasks(self, subtask_ids: List[str]): + for stid in subtask_ids: + bands = self._stid_to_bands.pop(stid, []) + self._stid_to_items.pop(stid, None) + for band in bands: + band_queue = self._band_queues.get(band) + self._ensure_top_item_valid(band_queue) + + async def all_bands_busy(self) -> bool: + """Return True if all bands queue has tasks waiting to be submitted.""" + bands = set(self._band_to_resource.keys()) + if set(self._band_queues.keys()).issuperset(bands): + return all(len(self._band_queues[band]) > 0 for band in bands) + return False + + async def balance_queued_subtasks(self): + # record length of band queues + band_num_queued_subtasks = { + band: len(queue) for band, queue in self._band_queues.items() + } + move_queued_subtasks = await self._assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + items = [] + # rewrite band queues according to feedbacks from assigner + for band, move in move_queued_subtasks.items(): + 
task_queue = self._band_queues[band] + assert move + len(task_queue) >= 0 + for _ in range(abs(move)): + if move < 0: + # TODO: pop item of low priority + item = heapq.heappop(task_queue) + self._stid_to_bands[item.subtask.subtask_id].remove(band) + items.append(item) + elif move > 0: + item = items.pop() + self._stid_to_bands[item.subtask.subtask_id].append(band) + heapq.heappush(task_queue, item) + if len(task_queue) == 0: + self._band_queues.pop(band) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/service.py b/python/xorbits/_mars/services/scheduling/supervisor/service.py new file mode 100644 index 000000000..4ec202daf --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/service.py @@ -0,0 +1,146 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +from .... import oscar as mo +from ...core import AbstractService +from .autoscale import AutoscalerActor +from .manager import DEFAULT_SUBTASK_MAX_RESCHEDULES + + +class SchedulingSupervisorService(AbstractService): + """ + Scheduling service on supervisor. + + Scheduling Configuration + ------------------------ + { + "scheduling" : { + "submit_period": 1, + "autoscale" : { + "enabled": false, + "scheduler_backlog_timeout": 20, + "sustained_scheduler_backlog_timeout": 20, + "worker_idle_timeout": 40, + "min_workers": 1, + "max_workers": 100 + } + } + } + """ + + async def start(self): + from .globalresource import GlobalResourceManagerActor + + await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=self._address, + ) + + autoscale_config = self._config.get("scheduling", {}).get("autoscale", {}) + await mo.create_actor( + AutoscalerActor, + autoscale_config, + uid=AutoscalerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + from .autoscale import AutoscalerActor + + await mo.destroy_actor( + mo.create_actor_ref( + uid=AutoscalerActor.default_uid(), address=self._address + ) + ) + + from .globalresource import GlobalResourceManagerActor + + await mo.destroy_actor( + mo.create_actor_ref( + uid=GlobalResourceManagerActor.default_uid(), address=self._address + ) + ) + + async def create_session(self, session_id: str): + service_config = self._config or dict() + scheduling_config = service_config.get("scheduling", {}) + subtask_max_reschedules = scheduling_config.get( + "subtask_max_reschedules", DEFAULT_SUBTASK_MAX_RESCHEDULES + ) + subtask_cancel_timeout = scheduling_config.get("subtask_cancel_timeout", 5) + speculation_config = scheduling_config.get("speculation", {}) + + from .assigner import AssignerActor + + assigner_coro = mo.create_actor( + AssignerActor, + session_id, + address=self._address, + uid=AssignerActor.gen_uid(session_id), + ) + + from .queueing import SubtaskQueueingActor + + queueing_coro = mo.create_actor( + SubtaskQueueingActor, + session_id, + scheduling_config.get("submit_period"), + address=self._address, + uid=SubtaskQueueingActor.gen_uid(session_id), + 
) + + await asyncio.gather(assigner_coro, queueing_coro) + + from .manager import SubtaskManagerActor + + await mo.create_actor( + SubtaskManagerActor, + session_id, + subtask_max_reschedules, + subtask_cancel_timeout, + speculation_config, + address=self._address, + uid=SubtaskManagerActor.gen_uid(session_id), + ) + + from ...cluster import ClusterAPI + from .autoscale import AutoscalerActor + + cluster_api = await ClusterAPI.create(self._address) + [autoscaler_ref] = await cluster_api.get_supervisor_refs( + [AutoscalerActor.default_uid()] + ) + await autoscaler_ref.register_session(session_id, self._address) + + async def destroy_session(self, session_id: str): + from .assigner import AssignerActor + from .autoscale import AutoscalerActor + from .manager import SubtaskManagerActor + from .queueing import SubtaskQueueingActor + + autoscaler_ref = await mo.actor_ref( + AutoscalerActor.default_uid(), address=self._address + ) + await autoscaler_ref.unregister_session(session_id) + + destroy_tasks = [] + for actor_cls in [SubtaskManagerActor, SubtaskQueueingActor, AssignerActor]: + ref = await mo.actor_ref( + actor_cls.gen_uid(session_id), address=self._address + ) + destroy_tasks.append(asyncio.create_task(ref.destroy())) + await asyncio.gather(*destroy_tasks) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/speculation.py b/python/xorbits/_mars/services/scheduling/supervisor/speculation.py new file mode 100644 index 000000000..5cdecfed0 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/speculation.py @@ -0,0 +1,277 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import time +from collections import defaultdict +from typing import Dict + +import numpy as np + +from ....utils import create_task_with_error_log, parse_readable_size +from ..errors import NoAvailableBand +from .manager import SubtaskScheduleInfo + +logger = logging.getLogger(__name__) + +# the default times for speculative subtask execution. 
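+# The constants below are fall-back values for the `speculation` section of the
+# scheduling config. An illustrative (not authoritative) configuration that
+# mirrors these defaults while enabling the feature could look like:
+#
+#     "speculation": {
+#         "enabled": True,
+#         "dry": False,
+#         "threshold": 0.75,
+#         "interval": 5,
+#         "min_task_runtime": 3,
+#         "multiplier": 1.5,
+#         "max_concurrent_run": 3,
+#     }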
+DEFAULT_SUBTASK_SPECULATION_THRESHOLD = 0.75 +DEFAULT_SUBTASK_SPECULATION_INTERVAL = 5 # time unit: seconds +DEFAULT_SUBTASK_SPECULATION_MIN_TASK_RUNTIME = 3 +DEFAULT_SUBTASK_SPECULATION_MULTIPLIER = 1.5 +DEFAULT_SUBTASK_MAX_CONCURRENT_RUN = 3 + + +class SpeculativeScheduler: + _grouped_unfinished_subtasks: Dict[ + str, Dict[str, SubtaskScheduleInfo] + ] # key is subtask logic key + _grouped_finished_subtasks: Dict[ + str, Dict[str, SubtaskScheduleInfo] + ] # key is subtask logic key + + def __init__( + self, queueing_ref, global_resource_ref, speculation_config: Dict[str, any] + ): + self._grouped_unfinished_subtasks = defaultdict(dict) + self._grouped_finished_subtasks = defaultdict(dict) + self._queueing_ref = queueing_ref + self._global_resource_ref = global_resource_ref + self._speculation_config = speculation_config + self._subtask_speculation_enabled = speculation_config.get("enabled", False) + assert self._subtask_speculation_enabled in (True, False) + self._subtask_speculation_dry = speculation_config.get("dry", False) + self._subtask_speculation_threshold = parse_readable_size( + speculation_config.get("threshold", DEFAULT_SUBTASK_SPECULATION_THRESHOLD) + )[0] + self._subtask_speculation_interval = speculation_config.get( + "interval", DEFAULT_SUBTASK_SPECULATION_INTERVAL + ) + self._subtask_speculation_min_task_runtime = speculation_config.get( + "min_task_runtime", DEFAULT_SUBTASK_SPECULATION_MIN_TASK_RUNTIME + ) + self._subtask_speculation_multiplier = speculation_config.get( + "multiplier", DEFAULT_SUBTASK_SPECULATION_MULTIPLIER + ) + self._subtask_speculation_max_concurrent_run = speculation_config.get( + "max_concurrent_run", DEFAULT_SUBTASK_MAX_CONCURRENT_RUN + ) + if self._subtask_speculation_enabled: + assert 1 >= self._subtask_speculation_threshold > 0 + assert self._subtask_speculation_interval > 0 + assert self._subtask_speculation_min_task_runtime > 0 + assert self._subtask_speculation_multiplier > 0 + assert self._subtask_speculation_max_concurrent_run > 0 + self._speculation_execution_task = None + + async def start(self): + if self._subtask_speculation_enabled: + self._speculation_execution_task = create_task_with_error_log( + self._speculative_execution_loop() + ) + logger.info( + "Speculative execution started with config %s.", + self._speculation_config, + ) + + async def stop(self): + if self._subtask_speculation_enabled: + self._speculation_execution_task.cancel() + try: + await self._speculation_execution_task + except asyncio.CancelledError: + pass + logger.info("Speculative execution stopped.") + + def add_subtask(self, subtask_info: SubtaskScheduleInfo): + # duplicate subtask add will be handled in `_speculative_execution`. 
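        # Bookkeeping note: subtasks sharing a logic_key form one group. An entry stays in
        # _grouped_unfinished_subtasks until finish_subtask() moves it to
        # _grouped_finished_subtasks; the speculation loop scans the finished groups to
        # decide when the remaining members look slow.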
+ subtask = subtask_info.subtask + self._grouped_unfinished_subtasks[subtask.logic_key][ + subtask.subtask_id + ] = subtask_info + + def finish_subtask(self, subtask_info: SubtaskScheduleInfo): + subtask = subtask_info.subtask + grouped_finished_subtasks = self._grouped_finished_subtasks[subtask.logic_key] + grouped_finished_subtasks[subtask.subtask_id] = subtask_info + self._grouped_unfinished_subtasks[subtask.logic_key].pop( + subtask.subtask_id, None + ) + if len(grouped_finished_subtasks) == subtask.logic_parallelism: + self._grouped_finished_subtasks.pop(subtask.logic_key) + self._grouped_unfinished_subtasks.pop(subtask.logic_key, None) + logger.info( + "Subtask group with logic key %s parallelism %s finished.", + subtask.logic_key, + subtask.logic_parallelism, + ) + + async def _speculative_execution_loop(self): + while True: + # check subtasks in the same group which has same logic key periodically, if some subtasks hasn't been + # finished in a considerably longer duration, then those subtasks maybe slow/hang subtasks, try resubmit + # it to other bands too. + await asyncio.sleep(self._subtask_speculation_interval) + await self._speculative_execution() + + async def _speculative_execution(self): + for logic_key, subtask_infos_dict in dict( + self._grouped_finished_subtasks + ).items(): + if not subtask_infos_dict: # pragma: no cover + continue + subtask_infos = subtask_infos_dict.values() + one_subtask = next(iter(subtask_infos)).subtask + parallelism = one_subtask.logic_parallelism + spec_threshold = max( + 1, int(self._subtask_speculation_threshold * parallelism) + ) + # if finished subtasks reached the spec_threshold, try to find slow/hang unfinished subtasks + if parallelism > len(subtask_infos) >= spec_threshold: + unfinished_subtask_infos = self._grouped_unfinished_subtasks[ + logic_key + ].values() + # sort finished subtasks by running time + duration_array = np.sort( + np.array( + [info.end_time - info.start_time for info in subtask_infos] + ) + ) + median = np.percentile(duration_array, 50) + duration_threshold = max( + median * self._subtask_speculation_multiplier, + self._subtask_speculation_min_task_runtime, + ) + now = time.time() + # find subtasks whose duration is large enough so that can be took as slow/hang subtasks + unfinished_subtask_infos = [ + info + for info in unfinished_subtask_infos + if info not in subtask_infos + and now - info.start_time > duration_threshold + ] + if not unfinished_subtask_infos: # pragma: no cover + continue + exclude_bands = set() + for info in unfinished_subtask_infos: + exclude_bands.update(info.band_futures.keys()) + remaining_resources = ( + await self._global_resource_ref.get_remaining_resources() + ) + logger.warning( + "%s subtasks in %s for group %s has not been finished in %s seconds on bands %s, " + "median duration is %s, average duration for %s finished subtasks " + "is %s. trying speculative running. " + "Current cluster remaining resources %s", + len(unfinished_subtask_infos), + parallelism, + logic_key, + duration_threshold, + exclude_bands, + median, + len(subtask_infos), + duration_array.mean(), + remaining_resources, + ) + # TODO(chaokunyang) If too many subtasks got stale on same node, mark the node as slow node. 
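                # Worked example (illustrative numbers, not from this change): with
                # parallelism=10, threshold=0.75 and multiplier=1.5, speculation starts once
                # max(1, int(0.75 * 10)) = 7 subtasks have finished; if their median runtime
                # is 20s and min_task_runtime=3, any still-running subtask older than
                # max(20 * 1.5, 3) = 30s was selected above as slow and is resubmitted by
                # the loop below.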
+ for subtask_info in unfinished_subtask_infos: + subtask = subtask_info.subtask + if subtask.retryable: + logger.warning( + "Subtask %s has not been finished in %s seconds on bands %s, " + "trying speculative running.", + subtask.subtask_id, + now - subtask_info.start_time, + list(subtask_info.band_futures.keys()), + ) + await self._submit_speculative_subtask( + subtask_info, exclude_bands + ) + else: + logger.warning( + "Unretryable subtask %s has not been finished in %s seconds " + "on bands %s, median duration is %s, it may hang.", + subtask.subtask_id, + (now - subtask_info.start_time), + list(subtask_info.band_futures.keys()), + median, + ) + await self._queueing_ref.submit_subtasks.tell() + + async def _submit_speculative_subtask(self, subtask_info, exclude_bands): + subtask = subtask_info.subtask + if ( + subtask_info.num_speculative_concurrent_run + == self._subtask_speculation_max_concurrent_run + ): + logger.debug( + "Subtask %s speculative run has reached max limit %s, " + "won't submit another speculative run.", + subtask.subtask_id, + self._subtask_speculation_max_concurrent_run, + ) + return + if not self._subtask_speculation_dry: + if ( + len(subtask_info.band_futures) + < subtask_info.num_speculative_concurrent_run + 1 + ): + # ensure same subtask won't be submitted to same worker. + logger.info( + "Speculative execution for subtask %s has not been submitted to worker," + "waiting for being submitted to worker." + "Cluster resources may be not enough after excluded %s", + subtask.subtask_id, + exclude_bands, + ) + return + try: + await self._queueing_ref.add_subtasks( + [subtask], + [subtask.priority or tuple()], + exclude_bands=exclude_bands, + random_when_unavailable=False, + ) + logger.info( + "Added subtask %s to queue excluded from %s.", + subtask.subtask_id, + exclude_bands, + ) + subtask_info.num_speculative_concurrent_run += 1 + if ( + subtask_info.num_speculative_concurrent_run + == self._subtask_speculation_max_concurrent_run + ): + logger.info( + "Subtask %s reached max speculative execution: %s", + subtask.subtask_id, + self._subtask_speculation_max_concurrent_run, + ) + except NoAvailableBand: + logger.warning( + "No bands available for subtask %s after excluded bands %s, " + "try resubmit later.", + subtask.subtask_id, + exclude_bands, + ) + except KeyError as e: # pragma: no cover + # if the subtask happen to be finished, it's input chunk may got gc, if assigning to band + # needs to know input meta, we'll get KeyError or something else, just ignore it. + logger.warning( + "Subtask %s may happen to be finished just now, cannot add it to " + "subtask queue, got error %s, just ignore it.", + subtask.subtask_id, + e, + ) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/__init__.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_assigner.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_assigner.py new file mode 100644 index 000000000..23dd8fcc9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_assigner.py @@ -0,0 +1,375 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +import numpy as np +import pytest + +from ..... import oscar as mo +from .....core import ChunkGraph +from .....tensor.arithmetic import TensorTreeAdd +from .....tensor.fetch import TensorFetch +from ....cluster import ClusterAPI +from ....cluster.core import NodeRole, NodeStatus +from ....cluster.supervisor.locator import SupervisorPeerLocatorActor +from ....cluster.supervisor.node_info import NodeInfoCollectorActor +from ....cluster.uploader import NodeInfoUploaderActor +from ....meta import MockMetaAPI +from ....session import MockSessionAPI +from ....subtask import Subtask +from ...errors import NoAvailableBand, NoMatchingSlots +from ...supervisor import AssignerActor + + +class MockNodeInfoCollectorActor(NodeInfoCollectorActor): + def __init__(self, timeout=None, check_interval=None, with_gpu=False): + super().__init__(timeout=timeout, check_interval=check_interval) + self.ready_bands = { + ("address0", "numa-0"): 2, + ("address1", "numa-0"): 2, + ("address2", "numa-0"): 2, + ("address3", "numa-0"): 2, + } + if with_gpu: + self.ready_bands[("address0", "gpu-0")] = 1 + self.all_bands = self.ready_bands.copy() + + async def update_node_info( + self, address, role, env=None, resource=None, detail=None, status=None + ): + if "address" in address and status == NodeStatus.STOPPING: + del self.ready_bands[(address, "numa-0")] + await super().update_node_info(address, role, env, resource, detail, status) + + def get_all_bands(self, role=None, statuses=None): + if statuses == {NodeStatus.READY}: + return self.ready_bands + else: + return self.all_bands + + +class FakeClusterAPI(ClusterAPI): + @classmethod + async def create(cls, address: str, **kw): + dones, _ = await asyncio.wait( + [ + mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + address, + uid=SupervisorPeerLocatorActor.default_uid(), + address=address, + ), + mo.create_actor( + MockNodeInfoCollectorActor, + with_gpu=kw.get("with_gpu", False), + uid=NodeInfoCollectorActor.default_uid(), + address=address, + ), + mo.create_actor( + NodeInfoUploaderActor, + NodeRole.WORKER, + interval=kw.get("upload_interval"), + band_to_resource=kw.get("band_to_resource"), + use_gpu=kw.get("use_gpu", False), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ), + ] + ) + + for task in dones: + try: + task.result() + except mo.ActorAlreadyExist: # pragma: no cover + pass + + api = await super().create(address=address) + await api.mark_node_ready() + return api + + +@pytest.fixture +async def 
actor_pool(request): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + with_gpu = request.param + + async with pool: + session_id = "test_session" + cluster_api = await FakeClusterAPI.create( + pool.external_address, with_gpu=with_gpu + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + assigner_ref = await mo.create_actor( + AssignerActor, + session_id, + uid=AssignerActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, assigner_ref, cluster_api, meta_api + finally: + await mo.destroy_actor(assigner_ref) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [False], indirect=True) +async def test_assign_cpu_tasks(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + input1 = TensorFetch(key="a", source_key="a", dtype=np.dtype(int)).new_chunk([]) + input2 = TensorFetch(key="b", source_key="b", dtype=np.dtype(int)).new_chunk([]) + input3 = TensorFetch(key="c", source_key="c", dtype=np.dtype(int)).new_chunk([]) + result_chunk = TensorTreeAdd(args=[input1, input2, input3]).new_chunk( + [input1, input2, input3] + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(input3) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + chunk_graph.add_edge(input3, result_chunk) + + await meta_api.set_chunk_meta( + input1, memory_size=200, store_size=200, bands=[("address0", "numa-0")] + ) + await meta_api.set_chunk_meta( + input2, memory_size=400, store_size=400, bands=[("address1", "numa-0")] + ) + await meta_api.set_chunk_meta( + input3, memory_size=400, store_size=400, bands=[("address2", "numa-0")] + ) + + await cluster_api.set_node_status( + node="address1", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + await cluster_api.set_node_status( + node="address3", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result in (("address0", "numa-0"), ("address2", "numa-0")) + + subtask.expect_bands = [("address0", "numa-0")] + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result == ("address0", "numa-0") + + subtask.expect_bands = [("address0", "numa-0"), ("address1", "numa-0")] + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result == ("address0", "numa-0") + + subtask.expect_bands = [("address1", "numa-0")] + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result in (("address0", "numa-0"), ("address2", "numa-0")) + + [result] = await assigner_ref.assign_subtasks( + [subtask], exclude_bands={("address0", "numa-0"), ("address2", "numa-0")} + ) + assert result in (("address0", "numa-0"), ("address2", "numa-0")) + [result] = await assigner_ref.assign_subtasks( + [subtask], exclude_bands={("address0", "numa-0")}, random_when_unavailable=False + ) + assert result == ("address2", "numa-0") + with pytest.raises(NoAvailableBand): + await assigner_ref.assign_subtasks( + [subtask], + exclude_bands={("address0", "numa-0"), ("address2", "numa-0")}, + random_when_unavailable=False, + ) + subtask.bands_specified = True + assert result == ("address2", "numa-0") + with pytest.raises(NoAvailableBand): + await assigner_ref.assign_subtasks([subtask]) + 
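    # Editor's note (inference from this test, not an authoritative statement of the
    # assigner API): with bands_specified=True the subtask appears pinned to its
    # expect_bands, so a STOPPING "address1" leaves no candidate band and the call above
    # raises NoAvailableBand; clearing the flag below restores the normal fallback behavior.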
subtask.bands_specified = False + + result_chunk.op.gpu = True + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + with pytest.raises(NoMatchingSlots) as err: + await assigner_ref.assign_subtasks([subtask]) + assert "gpu" in str(err.value) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [False], indirect=True) +async def test_assign_broadcaster(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + broadcaster = TensorFetch(key="x", source_key="x", dtype=np.dtype(int)).new_chunk( + [], is_broadcaster=True + ) + input_chunk = TensorFetch(key="a", source_key="a", dtype=np.dtype(int)).new_chunk( + [] + ) + result_chunk = TensorTreeAdd(args=[broadcaster, input_chunk]).new_chunk( + [broadcaster, input_chunk] + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(broadcaster) + chunk_graph.add_node(input_chunk) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(broadcaster, result_chunk) + chunk_graph.add_edge(input_chunk, result_chunk) + + await meta_api.set_chunk_meta( + broadcaster, memory_size=1000, store_size=200, bands=[("address0", "numa-0")] + ) + await meta_api.set_chunk_meta( + input_chunk, memory_size=200, store_size=200, bands=[("address1", "numa-0")] + ) + + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result == ("address1", "numa-0") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [True], indirect=True) +async def test_assign_gpu_tasks(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + input1 = TensorFetch(key="a", source_key="a", dtype=np.dtype(int)).new_chunk([]) + input2 = TensorFetch(key="b", source_key="b", dtype=np.dtype(int)).new_chunk([]) + result_chunk = TensorTreeAdd(args=[input1, input2], gpu=True).new_chunk( + [input1, input2] + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + + await meta_api.set_chunk_meta( + input1, memory_size=200, store_size=200, bands=[("address0", "numa-0")] + ) + await meta_api.set_chunk_meta( + input2, memory_size=200, store_size=200, bands=[("address0", "numa-0")] + ) + + subtask = Subtask("test_task", session_id, chunk_graph=chunk_graph) + [result] = await assigner_ref.assign_subtasks([subtask]) + assert result[1].startswith("gpu") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [False], indirect=True) +async def test_reassign_subtasks(actor_pool): + pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool + + # ('address0', 'numa-0'), ('address1', 'numa-0'), ('address2', 'numa-0') are ready + await cluster_api.set_node_status( + node="address3", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + band_num_queued_subtasks = {("address0", "numa-0"): 3, ("address1", "numa-0"): 4} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks in ( + { + ("address1", "numa-0"): -1, + ("address0", "numa-0"): -1, + ("address2", "numa-0"): 2, + }, + { + ("address1", "numa-0"): -2, + ("address0", "numa-0"): 0, + ("address2", "numa-0"): 2, + }, + { + ("address1", "numa-0"): -2, + ("address0", "numa-0"): -1, + ("address2", "numa-0"): 3, + }, + ) + + # ('address0', 'numa-0'), ('address2', 'numa-0') are ready + await 
cluster_api.set_node_status( + node="address1", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + band_num_queued_subtasks = { + ("address0", "numa-0"): 9, + ("address1", "numa-0"): 7, + ("address2", "numa-0"): 0, + } + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks in ( + { + ("address1", "numa-0"): -7, + ("address0", "numa-0"): 3, + ("address2", "numa-0"): 4, + }, + { + ("address1", "numa-0"): -7, + ("address0", "numa-0"): 4, + ("address2", "numa-0"): 3, + }, + ) + + band_num_queued_subtasks = {("address0", "numa-0"): 9, ("address1", "numa-0"): 7} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == { + ("address1", "numa-0"): -7, + ("address0", "numa-0"): -1, + ("address2", "numa-0"): 8, + } + + band_num_queued_subtasks = {("address1", "numa-0"): 8} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == { + ("address1", "numa-0"): -8, + ("address0", "numa-0"): 4, + ("address2", "numa-0"): 4, + } + + band_num_queued_subtasks = {("address1", "numa-0"): 0} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == {("address1", "numa-0"): 0} + + # only ('address0', 'numa-0') is ready, i.e. there's only one band initially + await cluster_api.set_node_status( + node="address2", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + band_num_queued_subtasks = {("address0", "numa-0"): 8} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == {("address0", "numa-0"): 0} + + band_num_queued_subtasks = {("address1", "numa-0"): 8} + move_queued_subtasks = await assigner_ref.reassign_subtasks( + band_num_queued_subtasks + ) + assert move_queued_subtasks == { + ("address1", "numa-0"): -8, + ("address0", "numa-0"): 8, + } diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_globalresource.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_globalresource.py new file mode 100644 index 000000000..84bb27a8e --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_globalresource.py @@ -0,0 +1,82 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio + +import pytest + +from ..... 
import oscar as mo +from .....resource import Resource +from ....cluster import ClusterAPI, MockClusterAPI +from ....session import MockSessionAPI +from ...supervisor import GlobalResourceManagerActor + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create(pool.external_address) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + + global_resource_ref = await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + + try: + yield pool, session_id, global_resource_ref + finally: + await mo.destroy_actor(global_resource_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_global_resource(actor_pool): + pool, session_id, global_resource_ref = actor_pool + + cluster_api = await ClusterAPI.create(pool.external_address) + bands = await cluster_api.get_all_bands() + band = (pool.external_address, "numa-0") + band_resource = bands[band] + + assert band in await global_resource_ref.get_idle_bands(0) + assert ["subtask0"] == await global_resource_ref.apply_subtask_resources( + band, session_id, ["subtask0"], [Resource(num_cpus=1)] + ) + assert band not in await global_resource_ref.get_idle_bands(0) + + await global_resource_ref.update_subtask_resources( + band, session_id, "subtask0", band_resource + ) + assert [] == await global_resource_ref.apply_subtask_resources( + band, session_id, ["subtask1"], [Resource(num_cpus=1)] + ) + + wait_coro = global_resource_ref.wait_band_idle(band) + (done, pending) = await asyncio.wait([wait_coro], timeout=0.5) + assert not done + await global_resource_ref.release_subtask_resource(band, session_id, "subtask0") + (done, pending) = await asyncio.wait([wait_coro], timeout=0.5) + assert done + assert band in await global_resource_ref.get_idle_bands(0) + assert ["subtask1"] == await global_resource_ref.apply_subtask_resources( + band, session_id, ["subtask1"], [Resource(num_cpus=1)] + ) + assert (await global_resource_ref.get_remaining_resources())[ + band + ] == band_resource - Resource(num_cpus=1) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_manager.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_manager.py new file mode 100644 index 000000000..d16c993f0 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_manager.py @@ -0,0 +1,200 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from collections import defaultdict +from typing import List, Set, Tuple + +import pytest + +from ..... 
import oscar as mo +from .....typing import BandType +from ....cluster import MockClusterAPI +from ....subtask import Subtask, SubtaskResult, SubtaskStatus +from ....task.supervisor.manager import TaskManagerActor +from ...supervisor import ( + GlobalResourceManagerActor, + SubtaskManagerActor, + SubtaskQueueingActor, +) +from ...worker import SubtaskExecutionActor + + +class MockTaskManagerActor(mo.Actor): + def __init__(self): + self._results = dict() + + def set_subtask_result(self, result: SubtaskResult): + self._results[result.subtask_id] = result + + def get_result(self, subtask_id: str) -> SubtaskResult: + return self._results[subtask_id] + + +class MockSubtaskQueueingActor(mo.Actor): + def __init__(self): + self._subtasks = dict() + self._error = None + + def add_subtasks( + self, + subtasks: List[Subtask], + priorities: List[Tuple], + exclude_bands: Set[Tuple] = None, + random_when_unavailable: bool = True, + ): + if self._error is not None: + raise self._error + for subtask, priority in zip(subtasks, priorities): + self._subtasks[subtask.subtask_id] = (subtask, priority) + + def submit_subtasks(self, band: BandType, limit: int): + pass + + def remove_queued_subtasks(self, subtask_ids: List[str]): + for stid in subtask_ids: + self._subtasks.pop(stid) + + def set_error(self, error): + self._error = error + + +class MockSubtaskExecutionActor(mo.StatelessActor): + def __init__(self): + self._subtask_aiotasks = defaultdict(dict) + self._run_subtask_events = {} + + async def set_run_subtask_event(self, subtask_id, event): + self._run_subtask_events[subtask_id] = event + + async def run_subtask( + self, subtask: Subtask, band_name: str, supervisor_address: str + ): + self._run_subtask_events[subtask.subtask_id].set() + task = self._subtask_aiotasks[subtask.subtask_id][ + band_name + ] = asyncio.create_task(asyncio.sleep(20)) + return await task + + def cancel_subtask(self, subtask_id: str, kill_timeout: int = 5): + for task in self._subtask_aiotasks[subtask_id].values(): + task.cancel() + + async def wait_subtask(self, subtask_id: str, band_name: str): + try: + yield self._subtask_aiotasks[subtask_id][band_name] + except asyncio.CancelledError: + pass + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create(pool.external_address) + queue_ref = await mo.create_actor( + MockSubtaskQueueingActor, + uid=SubtaskQueueingActor.gen_uid(session_id), + address=pool.external_address, + ) + slots_ref = await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + task_manager_ref = await mo.create_actor( + MockTaskManagerActor, + uid=TaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + execution_ref = await mo.create_actor( + MockSubtaskExecutionActor, + uid=SubtaskExecutionActor.default_uid(), + address=pool.external_address, + ) + submitter_ref = await mo.create_actor( + SubtaskManagerActor, + session_id, + uid=SubtaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, execution_ref, submitter_ref, queue_ref, task_manager_ref + finally: + await mo.destroy_actor(slots_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_subtask_manager(actor_pool): + ( + pool, + session_id, + execution_ref, + manager_ref, + queue_ref, + task_manager_ref, + ) = actor_pool + + subtask1 
= Subtask("subtask1", session_id) + subtask2 = Subtask("subtask2", session_id) + + await manager_ref.add_subtasks([subtask1, subtask2], [(1,), (2,)]) + run_subtask1_event, run_subtask2_event = asyncio.Event(), asyncio.Event() + await execution_ref.set_run_subtask_event(subtask1.subtask_id, run_subtask1_event) + await execution_ref.set_run_subtask_event(subtask2.subtask_id, run_subtask2_event) + + submit1 = asyncio.create_task( + manager_ref.submit_subtask_to_band( + subtask1.subtask_id, (pool.external_address, "gpu-0") + ) + ) + submit2 = asyncio.create_task( + manager_ref.submit_subtask_to_band( + subtask2.subtask_id, (pool.external_address, "gpu-1") + ) + ) + + await asyncio.gather(run_subtask1_event.wait(), run_subtask2_event.wait()) + + await manager_ref.cancel_subtasks([subtask1.subtask_id, subtask2.subtask_id]) + await asyncio.wait_for( + asyncio.gather( + execution_ref.wait_subtask(subtask1.subtask_id, "gpu-0"), + execution_ref.wait_subtask(subtask2.subtask_id, "gpu-1"), + ), + timeout=10, + ) + with pytest.raises(asyncio.CancelledError): + await submit1 + with pytest.raises(asyncio.CancelledError): + await submit2 + assert ( + await task_manager_ref.get_result(subtask1.subtask_id) + ).status == SubtaskStatus.cancelled + assert ( + await task_manager_ref.get_result(subtask2.subtask_id) + ).status == SubtaskStatus.cancelled + + subtask3 = Subtask("subtask3", session_id) + + await queue_ref.set_error(ValueError()) + await manager_ref.add_subtasks.tell([subtask3], [(3,)]) + await asyncio.sleep(0.1) + subtask3_result = await task_manager_ref.get_result(subtask3.subtask_id) + assert subtask3_result.status == SubtaskStatus.errored + assert isinstance(subtask3_result.error, ValueError) diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queue_balance.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queue_balance.py new file mode 100644 index 000000000..53f294678 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queue_balance.py @@ -0,0 +1,238 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from collections import defaultdict +from typing import List, Tuple + +import pytest + +from ..... 
import oscar as mo +from .....resource import Resource +from ....cluster import ClusterAPI +from ....cluster.core import NodeRole, NodeStatus +from ....cluster.supervisor.locator import SupervisorPeerLocatorActor +from ....cluster.supervisor.node_info import NodeInfoCollectorActor +from ....cluster.uploader import NodeInfoUploaderActor +from ....subtask import Subtask +from ...supervisor import ( + AssignerActor, + GlobalResourceManagerActor, + SubtaskManagerActor, + SubtaskQueueingActor, +) + + +class MockNodeInfoCollectorActor(NodeInfoCollectorActor): + def __init__(self, timeout=None, check_interval=None): + super().__init__(timeout=timeout, check_interval=check_interval) + self.ready_nodes = { + ("address0", "numa-0"): 2, + ("address1", "numa-0"): 2, + ("address2", "numa-0"): 2, + } + + async def update_node_info( + self, address, role, env=None, resource=None, detail=None, status=None + ): + if "address" in address and status == NodeStatus.STOPPING: + del self.ready_nodes[(address, "numa-0")] + await super().update_node_info(address, role, env, resource, detail, status) + + def get_all_bands(self, role=None, statuses=None): + if statuses == {NodeStatus.READY}: + return self.ready_nodes + else: + return { + ("address0", "numa-0"): 2, + ("address1", "numa-0"): 2, + ("address2", "numa-0"): 2, + } + + +class FakeClusterAPI(ClusterAPI): + @classmethod + async def create(cls, address: str, **kw): + dones, _ = await asyncio.wait( + [ + mo.create_actor( + SupervisorPeerLocatorActor, + "fixed", + address, + uid=SupervisorPeerLocatorActor.default_uid(), + address=address, + ), + mo.create_actor( + MockNodeInfoCollectorActor, + uid=NodeInfoCollectorActor.default_uid(), + address=address, + ), + mo.create_actor( + NodeInfoUploaderActor, + NodeRole.WORKER, + interval=kw.get("upload_interval"), + band_to_resource=kw.get("band_to_resource"), + use_gpu=kw.get("use_gpu", False), + uid=NodeInfoUploaderActor.default_uid(), + address=address, + ), + ] + ) + + for task in dones: + try: + task.result() + except mo.ActorAlreadyExist: # pragma: no cover + pass + + api = await super().create(address=address) + await api.mark_node_ready() + return api + + +class MockSlotsActor(mo.Actor): + @mo.extensible + def apply_subtask_resources( + self, + band: Tuple, + session_id: str, + subtask_ids: List[str], + subtask_slots: List[Resource], + ): + return subtask_ids + + def refresh_bands(self): + pass + + def get_used_resources(self): + return {} + + +class MockAssignerActor(mo.Actor): + def assign_subtasks( + self, subtasks: List[Subtask], exclude_bands=None, random_when_unavailable=True + ): + return [subtask.expect_bands[0] for subtask in subtasks] + + def reassign_subtasks(self, band_num_queued_subtasks): + if len(band_num_queued_subtasks.keys()) == 1: + [(band, _)] = band_num_queued_subtasks.items() + return {band: 0} + return { + ("address1", "numa-0"): -8, + ("address0", "numa-0"): 0, + ("address2", "numa-0"): 8, + } + + +class MockSubtaskManagerActor(mo.Actor): + def __init__(self): + self._submitted_subtask_ids = defaultdict(list) + + @mo.extensible + def submit_subtask_to_band(self, subtask_id: str, band: Tuple): + print(f"submit subtask {subtask_id} to band {band}") + self._submitted_subtask_ids[band].append(subtask_id) + + def dump_data(self): + return self._submitted_subtask_ids + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + cluster_api = await FakeClusterAPI.create(pool.external_address) + + 
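        # Editor's note (descriptive only): this fixture pairs a real SubtaskQueueingActor
        # with mocked assigner/manager/slots actors, so the balancing test exercises only
        # the queueing logic while reassign_subtasks returns a canned redistribution.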
# create assigner actor + await mo.create_actor( + MockAssignerActor, + uid=AssignerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create queueing actor + manager_ref = await mo.create_actor( + MockSubtaskManagerActor, + uid=SubtaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create slots actor + slots_ref = await mo.create_actor( + MockSlotsActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + # create queueing actor + queueing_ref = await mo.create_actor( + SubtaskQueueingActor, + session_id, + 1, + uid=SubtaskQueueingActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, cluster_api, queueing_ref, slots_ref, manager_ref + finally: + await mo.destroy_actor(queueing_ref) + + +async def _queue_subtasks(num_subtasks, expect_bands, queueing_ref): + if not num_subtasks: + return + subtasks = [Subtask(expect_bands[0] + "-" + str(i)) for i in range(num_subtasks)] + for subtask in subtasks: + subtask.expect_bands = [expect_bands] + subtask.required_resource = Resource(num_cpus=1) + priorities = [(i,) for i in range(num_subtasks)] + + await queueing_ref.add_subtasks(subtasks, priorities) + + +@pytest.mark.asyncio +async def test_subtask_queueing(actor_pool): + _pool, session_id, cluster_api, queueing_ref, slots_ref, manager_ref = actor_pool + nums_subtasks = [9, 8, 1] + expects_bands = [ + ("address0", "numa-0"), + ("address1", "numa-0"), + ("address2", "numa-0"), + ] + for num_subtasks, expect_bands in zip(nums_subtasks, expects_bands): + await _queue_subtasks(num_subtasks, expect_bands, queueing_ref) + + await cluster_api.set_node_status( + node="address1", role=NodeRole.WORKER, status=NodeStatus.STOPPING + ) + + # 9 subtasks on ('address0', 'numa-0') + await queueing_ref.submit_subtasks(band=("address0", "numa-0"), limit=10) + commited_subtask_ids = (await manager_ref.dump_data())[("address0", "numa-0")] + assert ( + len(commited_subtask_ids) == 9 + ), f"commited_subtask_ids {commited_subtask_ids}" + + # 0 subtasks on ('address1', 'numa-0') + await queueing_ref.submit_subtasks(band=("address1", "numa-0"), limit=10) + commited_subtask_ids = (await manager_ref.dump_data())[("address0", "numa-0")] + assert ( + len(commited_subtask_ids) == 9 + ), f"commited_subtask_ids {commited_subtask_ids}" + + # 9 subtasks on ('address2', 'numa-0') + await queueing_ref.submit_subtasks(band=("address2", "numa-0"), limit=10) + submitted_subtask_ids = await manager_ref.dump_data() + assert sum(len(v) for v in submitted_subtask_ids.values()) == 18 diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queueing.py b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queueing.py new file mode 100644 index 000000000..d6032cc74 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_queueing.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Tuple + +import pytest + +from ..... import oscar as mo +from .....resource import Resource +from ....cluster import MockClusterAPI +from ....subtask import Subtask +from ...supervisor import ( + AssignerActor, + GlobalResourceManagerActor, + SubtaskManagerActor, + SubtaskQueueingActor, +) + + +class MockSlotsActor(mo.Actor): + def __init__(self): + self._capacity = -1 + + def set_capacity(self, capacity: int): + self._capacity = capacity + + @mo.extensible + def apply_subtask_resources( + self, + band: Tuple, + session_id: str, + subtask_ids: List[str], + subtask_resources: List[Resource], + ): + idx = ( + min(self._capacity, len(subtask_ids)) + if self._capacity >= 0 + else len(subtask_ids) + ) + return subtask_ids[:idx] + + +class MockAssignerActor(mo.Actor): + def assign_subtasks( + self, subtasks: List[Subtask], exclude_bands=None, random_when_unavailable=True + ): + return [(self.address, "numa-0")] * len(subtasks) + + +class MockSubtaskManagerActor(mo.Actor): + def __init__(self): + self._subtask_ids, self._bands = [], [] + + @mo.extensible + def submit_subtask_to_band(self, subtask_id: str, band: Tuple): + self._subtask_ids.append(subtask_id) + self._bands.append(band) + + def dump_data(self): + return self._subtask_ids, self._bands + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create(pool.external_address) + + # create assigner actor + await mo.create_actor( + MockAssignerActor, + uid=AssignerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create queueing actor + manager_ref = await mo.create_actor( + MockSubtaskManagerActor, + uid=SubtaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + # create slots actor + slots_ref = await mo.create_actor( + MockSlotsActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + # create queueing actor + queueing_ref = await mo.create_actor( + SubtaskQueueingActor, + session_id, + uid=SubtaskQueueingActor.gen_uid(session_id), + address=pool.external_address, + ) + try: + yield pool, session_id, queueing_ref, slots_ref, manager_ref + finally: + await mo.destroy_actor(queueing_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_subtask_queueing(actor_pool): + _pool, session_id, queueing_ref, slots_ref, manager_ref = actor_pool + await slots_ref.set_capacity(2) + + subtasks = [Subtask(str(i)) for i in range(5)] + priorities = [(i,) for i in range(5)] + + await queueing_ref.add_subtasks(subtasks, priorities) + # queue: [4 3 2 1 0] + assert await queueing_ref.all_bands_busy() + await queueing_ref.submit_subtasks() + # queue: [2 1 0] + commited_subtask_ids, _commited_bands = await manager_ref.dump_data() + assert commited_subtask_ids == ["4", "3"] + + await queueing_ref.remove_queued_subtasks(["1"]) + # queue: [2 0] + await queueing_ref.update_subtask_priority.batch( + queueing_ref.update_subtask_priority.delay("0", (3,)), + queueing_ref.update_subtask_priority.delay("4", (5,)), + ) + # queue: [0(3) 2] + await queueing_ref.submit_subtasks() + # queue: [] + commited_subtasks, _commited_bands = await manager_ref.dump_data() + assert commited_subtasks == ["4", "3", "0", "2"] + assert not await queueing_ref.all_bands_busy() diff --git a/python/xorbits/_mars/services/scheduling/supervisor/tests/test_speculation.py 
b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_speculation.py new file mode 100644 index 000000000..ed666a483 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/supervisor/tests/test_speculation.py @@ -0,0 +1,151 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +from typing import List, Set, Tuple + +import pytest + +from ..... import oscar as mo +from ....cluster import MockClusterAPI +from ....subtask import Subtask +from ...errors import NoAvailableBand +from ...supervisor import GlobalResourceManagerActor +from ..manager import SubtaskScheduleInfo +from ..speculation import SpeculativeScheduler + + +class MockSubtaskQueueingActor(mo.Actor): + def __init__(self): + self._subtasks = [] + self._exceptions = [] + + async def add_subtasks( + self, + subtasks: List[Subtask], + priorities: List[Tuple], + exclude_bands: Set[Tuple] = None, + random_when_unavailable: bool = True, + ): + if { + ("addr0", "numa-0"), + ("addr1", "numa-0"), + ("addr2", "numa-0"), + } - exclude_bands == set(): + self._exceptions.append(NoAvailableBand()) + raise self._exceptions[-1] + self._subtasks.extend(subtasks) + + async def get_subtasks(self): + return self._subtasks + + async def get_exceptions(self): + return self._exceptions + + +@pytest.fixture +async def actor_pool(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + session_id = "test_session" + cluster_api = await MockClusterAPI.create(pool.external_address) + slots_ref = await mo.create_actor( + GlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + queue_ref = await mo.create_actor( + MockSubtaskQueueingActor, + address=pool.external_address, + ) + try: + yield pool, cluster_api, session_id, slots_ref, queue_ref + finally: + await mo.destroy_actor(queue_ref) + await MockClusterAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_speculation(actor_pool): + pool, cluster_api, session_id, slots_ref, queue_ref = actor_pool + speculation_conf = { + "enabled": True, + "interval": 1000, + "threshold": 0.2, + "min_task_runtime": 0.01, + "multiplier": 1.5, + "max_concurrent_run": 2, + } + speculative_scheduler = SpeculativeScheduler(queue_ref, slots_ref, speculation_conf) + await speculative_scheduler.start() + await speculative_scheduler._speculative_execution() + total_subtasks = 5 + subtasks = [ + Subtask(str(i), retryable=False, logic_key=f"logic_key1", logic_parallelism=5) + for i in range(total_subtasks) + ] + subtask_infos = [ + SubtaskScheduleInfo(subtask, max_reschedules=3) for subtask in subtasks + ] + # add unfinished subtasks + for subtask_info in subtask_infos: + speculative_scheduler.add_subtask(subtask_info) + await speculative_scheduler._speculative_execution() + assert len(speculative_scheduler._grouped_finished_subtasks.values()) == 0 + # finished some subtasks + for subtask_info in subtask_infos[:-1]: + 
speculative_scheduler.finish_subtask(subtask_info) + assert ( + len(next(iter(speculative_scheduler._grouped_finished_subtasks.values()))) + == total_subtasks - 1 + ) + assert ( + len(next(iter(speculative_scheduler._grouped_unfinished_subtasks.values()))) + == 1 + ) + await speculative_scheduler._speculative_execution() + subtask_infos[-1].subtask.retryable = True + # pretend subtask has been running on a band. + subtask_infos[-1].band_futures[("addr0", "numa-0")] = asyncio.ensure_future( + asyncio.sleep(1) + ) + await speculative_scheduler._speculative_execution() + submitted = await queue_ref.get_subtasks() + # assert stale subtasks resubmitted + assert subtask_infos[-1].subtask in submitted + await speculative_scheduler._speculative_execution() + # if resubmitted subtasks not running, don't resubmitted again. + assert 1 == len(await queue_ref.get_subtasks()) + # pretend subtask has been running on a band. + subtask_infos[-1].band_futures[("addr1", "numa-0")] = asyncio.ensure_future( + asyncio.sleep(1) + ) + await speculative_scheduler._speculative_execution() + # stale subtasks resubmitted again + assert 2 == len(await queue_ref.get_subtasks()) + # pretend subtask has been running on another band. + subtask_infos[-1].band_futures[("addr2", "numa-0")] = asyncio.ensure_future( + asyncio.sleep(1) + ) + # speculative run reached max limit `max_concurrent_run`, i.e. 2 + await speculative_scheduler._speculative_execution() + # assert raise queue_ref raise NoAvailableBand + speculative_scheduler._subtask_speculation_max_concurrent_run += 1 + await speculative_scheduler._speculative_execution() + assert isinstance((await queue_ref.get_exceptions())[0], NoAvailableBand) + # finish subtasks + speculative_scheduler.finish_subtask(subtask_infos[-1]) + assert len(speculative_scheduler._grouped_unfinished_subtasks) == 0 + await speculative_scheduler.stop() diff --git a/python/xorbits/_mars/services/scheduling/tests/__init__.py b/python/xorbits/_mars/services/scheduling/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/scheduling/tests/test_service.py b/python/xorbits/_mars/services/scheduling/tests/test_service.py new file mode 100644 index 000000000..4da089e25 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/tests/test_service.py @@ -0,0 +1,332 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import time +from collections import defaultdict + +import numpy as np +import pytest + +from .... import oscar as mo +from .... import remote as mr +from .... import tensor as mt +from ....core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from ....resource import Resource +from ... import NodeRole, start_services, stop_services +from ...session import SessionAPI +from ...storage import MockStorageAPI, StorageAPI +from ...subtask import Subtask, SubtaskResult, SubtaskStatus +from ...task import new_task_id +from ...task.supervisor.manager import TaskManagerActor +from ...web import WebActor +from .. import SchedulingAPI +from ..api.web import WebSchedulingAPI +from ..supervisor import GlobalResourceManagerActor + + +class FakeTaskManager(TaskManagerActor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._events = defaultdict(list) + self._results = dict() + + def set_subtask_result(self, subtask_result: SubtaskResult): + self._results[subtask_result.subtask_id] = subtask_result + for event in self._events[subtask_result.subtask_id]: + event.set() + self._events.pop(subtask_result.subtask_id, None) + + def _return_result(self, subtask_id: str): + result = self._results[subtask_id] + if result.status == SubtaskStatus.cancelled: + raise asyncio.CancelledError + elif result.status == SubtaskStatus.errored: + raise result.error.with_traceback(result.traceback) + return result + + async def wait_subtask_result(self, subtask_id: str): + if subtask_id in self._results: + return self._return_result(subtask_id) + + event = asyncio.Event() + self._events[subtask_id].append(event) + + async def waiter(): + await event.wait() + return self._return_result(subtask_id) + + return waiter() + + +def _gen_subtask(t, session_id): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + subtask = Subtask(new_task_id(), session_id, new_task_id(), chunk_graph) + subtask.required_resource = Resource(num_cpus=1) + + return subtask + + +def _approx_resource(actual, expect): + return ( + pytest.approx(actual.num_cpus) == expect.num_cpus + and pytest.approx(actual.num_gpus) == expect.num_cpus + and pytest.approx(actual.mem_bytes) == expect.mem_bytes + ) + + +@pytest.fixture +async def actor_pools(): + async def start_pool(is_worker: bool): + if is_worker: + kw = dict( + n_process=2, + labels=["main"] + ["numa-0"] * 2, + subprocess_start_method="spawn", + ) + else: + kw = dict(n_process=0, subprocess_start_method="spawn") + pool = await mo.create_actor_pool("127.0.0.1", **kw) + await pool.start() + return pool + + sv_pool, worker_pool = await asyncio.gather(start_pool(False), start_pool(True)) + + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + "web", + ], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + "scheduling": {}, + "subtask": {}, + } + await start_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(sv_pool.external_address) + await 
session_api.create_session(session_id) + ref = await mo.actor_ref( + FakeTaskManager.gen_uid(session_id), address=sv_pool.external_address + ) + await mo.destroy_actor(ref) + task_manager_ref = await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=sv_pool.external_address, + ) + await MockStorageAPI.create(session_id, worker_pool.external_address) + + try: + yield sv_pool, worker_pool, session_id, task_manager_ref + finally: + await session_api.delete_session(session_id) + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + await stop_services( + NodeRole.SUPERVISOR, config, address=sv_pool.external_address + ) + + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +async def _get_subtask_summaries_by_web(sv_pool_address, session_id, task_id=None): + web_actor = await mo.actor_ref(WebActor.default_uid(), address=sv_pool_address) + web_address = await web_actor.get_web_address() + web_scheduling_api = WebSchedulingAPI(session_id, web_address) + return await web_scheduling_api.get_subtask_schedule_summaries(task_id) + + +@pytest.mark.asyncio +async def test_schedule_success(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + storage_api = await StorageAPI.create(session_id, worker_pool.external_address) + + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask = _gen_subtask(b, session_id) + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + await scheduling_api.add_subtasks([subtask], [(0,)]) + await task_manager_ref.wait_subtask_result(subtask.subtask_id) + await scheduling_api.finish_subtasks([subtask.subtask_id]) + + result_key = next(subtask.chunk_graph.iter_indep(reverse=True)).key + result = await storage_api.get(result_key) + np.testing.assert_array_equal(np.ones((10, 10)) + 1, result) + + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) + + [summary] = await _get_subtask_summaries_by_web( + sv_pool.external_address, session_id, subtask.task_id + ) + assert summary.is_finished + assert subtask.expect_bands[0] in summary.bands + + +@pytest.mark.asyncio +async def test_schedule_queue(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + + finish_ids, finish_time = [], [] + + def _remote_fun(secs): + time.sleep(secs) + return secs + + async def _waiter_fun(subtask_id): + await task_manager_ref.wait_subtask_result(subtask_id) + await scheduling_api.finish_subtasks([subtask_id]) + finish_ids.append(subtask_id) + finish_time.append(time.time()) + + subtasks = [] + wait_tasks = [] + for task_id in range(6): + a = mr.spawn(_remote_fun, args=(0.5 + 0.01 * task_id,)) + subtask = _gen_subtask(a, session_id) + subtask.subtask_id = f"test_schedule_queue_subtask_{task_id}" + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + subtask.priority = (4 - task_id,) + 
wait_tasks.append(asyncio.create_task(_waiter_fun(subtask.subtask_id))) + subtasks.append(subtask) + + await scheduling_api.add_subtasks(subtasks) + await scheduling_api.update_subtask_priority(subtasks[-1].subtask_id, (6,)) + await asyncio.gather(*wait_tasks) + + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) + + +@pytest.mark.asyncio +async def test_schedule_error(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + + exc_types = [ValueError, asyncio.CancelledError, GeneratorExit] + for exc_type in exc_types: + + def _remote_fun(): + raise exc_type + + a = mr.spawn(_remote_fun) + subtask = _gen_subtask(a, session_id) + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + + await scheduling_api.add_subtasks([subtask]) + with pytest.raises(exc_type): + await task_manager_ref.wait_subtask_result(subtask.subtask_id) + + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) + + +@pytest.mark.asyncio +async def test_schedule_cancel(actor_pools): + sv_pool, worker_pool, session_id, task_manager_ref = actor_pools + global_resource_ref = await mo.actor_ref( + GlobalResourceManagerActor.default_uid(), address=sv_pool.external_address + ) + scheduling_api = await SchedulingAPI.create(session_id, sv_pool.external_address) + + def _remote_fun(secs): + time.sleep(secs) + return secs + + async def _waiter_fun(subtask_id): + await task_manager_ref.wait_subtask_result(subtask_id) + await scheduling_api.finish_subtasks([subtask_id]) + + subtasks = [] + wait_tasks = [] + for task_id in range(6): + a = mr.spawn(_remote_fun, args=(1 - 0.01 * task_id,)) + subtask = _gen_subtask(a, session_id) + subtask.subtask_id = f"test_schedule_queue_subtask_{task_id}" + subtask.expect_bands = [(worker_pool.external_address, "numa-0")] + subtask.priority = (4 - task_id,) + wait_tasks.append(asyncio.create_task(_waiter_fun(subtask.subtask_id))) + subtasks.append(subtask) + + await scheduling_api.add_subtasks(subtasks) + await asyncio.gather(*wait_tasks[:2]) + + await scheduling_api.cancel_subtasks( + [subtask.subtask_id for subtask in subtasks], kill_timeout=0.1 + ) + + for wait_task in wait_tasks[2:]: + with pytest.raises(asyncio.CancelledError): + await wait_task + + summaries = await _get_subtask_summaries_by_web( + sv_pool.external_address, session_id + ) + assert all( + summary.is_finished and summary.is_cancelled for summary in summaries[2:] + ) + # `cancel_subtask` will invoke `task_api.set_subtask_result` which is async, wait 1 second so that slot can be + # released. + await asyncio.sleep(1) + assert _approx_resource( + (await global_resource_ref.get_used_resources()).get( + (worker_pool.external_address, "numa-0"), Resource() + ), + Resource(), + ) diff --git a/python/xorbits/_mars/services/scheduling/utils.py b/python/xorbits/_mars/services/scheduling/utils.py new file mode 100644 index 000000000..271dd9ec3 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/utils.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. 
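The FakeTaskManager above resolves `wait_subtask_result` with one asyncio.Event per pending subtask id: `set_subtask_result` stores the result and wakes every registered waiter. A minimal, self-contained sketch of that register-and-wait pattern (class and method names here are illustrative, not part of the service API):

import asyncio
from collections import defaultdict


class ResultWaiter:
    """Toy version of the event-per-key wait pattern used by FakeTaskManager."""

    def __init__(self):
        self._events = defaultdict(list)
        self._results = {}

    def set_result(self, key, value):
        # store the result first, then wake every waiter registered for the key
        self._results[key] = value
        for event in self._events.pop(key, []):
            event.set()

    async def wait(self, key):
        if key in self._results:
            return self._results[key]
        event = asyncio.Event()
        self._events[key].append(event)
        await event.wait()
        return self._results[key]


async def _demo():
    waiter = ResultWaiter()
    pending = asyncio.create_task(waiter.wait("subtask-0"))
    waiter.set_result("subtask-0", 42)
    assert await pending == 42


asyncio.run(_demo())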
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import contextlib +import sys + +from ... import oscar as mo +from ...lib.aio import alru_cache +from ..subtask import SubtaskResult, SubtaskStatus +from ..task import TaskAPI + + +@alru_cache +async def _get_task_api(actor: mo.Actor): + return await TaskAPI.create(getattr(actor, "_session_id"), actor.address) + + +@contextlib.asynccontextmanager +async def redirect_subtask_errors(actor: mo.Actor, subtasks): + try: + yield + except: # noqa: E722 # pylint: disable=bare-except + _, error, traceback = sys.exc_info() + status = ( + SubtaskStatus.cancelled + if isinstance(error, asyncio.CancelledError) + else SubtaskStatus.errored + ) + task_api = await _get_task_api(actor) + coros = [] + for subtask in subtasks: + if subtask is None: # pragma: no cover + continue + coros.append( + task_api.set_subtask_result( + SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + progress=1.0, + status=status, + error=error, + traceback=traceback, + ) + ) + ) + tasks = [asyncio.ensure_future(coro) for coro in coros] + await asyncio.wait(tasks) + raise diff --git a/python/xorbits/_mars/services/scheduling/worker/__init__.py b/python/xorbits/_mars/services/scheduling/worker/__init__.py new file mode 100644 index 000000000..42c23612a --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .execution import SubtaskExecutionActor +from .quota import MemQuotaActor, QuotaActor, WorkerQuotaManagerActor +from .service import SchedulingWorkerService +from .workerslot import ( + BandSlotControlActor, + BandSlotManagerActor, + WorkerSlotManagerActor, +) diff --git a/python/xorbits/_mars/services/scheduling/worker/execution.py b/python/xorbits/_mars/services/scheduling/worker/execution.py new file mode 100644 index 000000000..09791d18b --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/execution.py @@ -0,0 +1,552 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
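`redirect_subtask_errors` above turns whatever escapes the wrapped block into a SubtaskResult with `cancelled` or `errored` status before re-raising. A stripped-down sketch of the same async-context-manager idiom, assuming only a generic `report` callback in place of the real task API:

import asyncio
import contextlib
import sys


@contextlib.asynccontextmanager
async def report_then_reraise(report):
    # run the wrapped block; on failure, report a status and re-raise unchanged
    try:
        yield
    except:  # noqa: E722
        _, error, _tb = sys.exc_info()
        status = (
            "cancelled" if isinstance(error, asyncio.CancelledError) else "errored"
        )
        await report(status, error)
        raise


async def _demo():
    async def report(status, error):
        print(f"reported {status}: {error!r}")

    try:
        async with report_then_reraise(report):
            raise ValueError("boom")
    except ValueError:
        pass


asyncio.run(_demo())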
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +import logging +import operator +import pprint +import sys +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, Optional + +from .... import oscar as mo +from ....core import ExecutionError +from ....core.graph import DAG +from ....core.operand import Fetch, FetchShuffle +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....oscar.errors import MarsError +from ....storage import StorageLevel +from ....utils import dataslots, get_chunk_key_to_data_keys, wrap_exception +from ...cluster import ClusterAPI +from ...meta import MetaAPI +from ...storage import StorageAPI +from ...subtask import Subtask, SubtaskAPI, SubtaskResult, SubtaskStatus +from .quota import QuotaActor +from .workerslot import BandSlotManagerActor + +logger = logging.getLogger(__name__) + +# the default times to run subtask. +DEFAULT_SUBTASK_MAX_RETRIES = 0 + + +@dataslots +@dataclass +class SubtaskExecutionInfo: + aio_task: asyncio.Task + band_name: str + supervisor_address: str + result: SubtaskResult = field(default_factory=SubtaskResult) + cancelling: bool = False + max_retries: int = 0 + num_retries: int = 0 + slot_id: Optional[int] = None + kill_timeout: Optional[int] = None + + +async def _retry_run( + subtask: Subtask, subtask_info: SubtaskExecutionInfo, target_async_func, *args +): + assert subtask_info.num_retries >= 0 + assert subtask_info.max_retries >= 0 + + while True: + try: + return await target_async_func(*args) + except (OSError, MarsError) as ex: + if subtask_info.num_retries < subtask_info.max_retries: + logger.error( + "Rerun[%s/%s] the %s of subtask %s due to %s.", + subtask_info.num_retries, + subtask_info.max_retries, + target_async_func, + subtask.subtask_id, + ex, + ) + subtask_info.num_retries += 1 + continue + if subtask_info.max_retries > 0: + message = ( + f"Exceed max rerun[{subtask_info.num_retries}/{subtask_info.max_retries}]:" + f" {target_async_func} of subtask {subtask.subtask_id} due to {ex}." + ) + logger.error(message) + + raise wrap_exception(ex, wrap_name="_ExceedMaxRerun", message=message) + else: + raise ex + except asyncio.CancelledError: + raise + except Exception as ex: + if subtask_info.max_retries > 0: + message = ( + f"Failed to rerun the {target_async_func} of subtask {subtask.subtask_id}, " + f"num_retries: {subtask_info.num_retries}, max_retries: {subtask_info.max_retries} " + f"due to unhandled exception: {ex}." 
+ ) + logger.error(message) + + raise wrap_exception( + ex, wrap_name="_UnhandledException", message=message + ) + else: + raise ex + + +def _fill_subtask_result_with_exception( + subtask: Subtask, subtask_info: SubtaskExecutionInfo +): + _, exc, tb = sys.exc_info() + if isinstance(exc, ExecutionError): + exc = exc.nested_error + tb = exc.__traceback__ + + exc_info = (type(exc), exc, tb) + if isinstance(exc, asyncio.CancelledError): + status = SubtaskStatus.cancelled + logger.exception( + "Cancel run subtask %s on band %s", + subtask.subtask_id, + subtask_info.band_name, + exc_info=exc_info, + ) + else: + status = SubtaskStatus.errored + logger.exception( + "Failed to run subtask %s on band %s", + subtask.subtask_id, + subtask_info.band_name, + exc_info=exc_info, + ) + subtask_info.result.status = status + subtask_info.result.progress = 1.0 + subtask_info.result.error = exc + subtask_info.result.traceback = tb + + +class SubtaskExecutionActor(mo.StatelessActor): + _subtask_info: Dict[str, SubtaskExecutionInfo] + + def __init__( + self, + subtask_max_retries: int = DEFAULT_SUBTASK_MAX_RETRIES, + enable_kill_slot: bool = True, + data_prepare_timeout: int = 600, + ): + self._cluster_api = None + self._global_resource_ref = None + self._subtask_max_retries = subtask_max_retries + self._enable_kill_slot = enable_kill_slot + self._data_prepare_timeout = data_prepare_timeout + + self._subtask_info = dict() + self._submitted_subtask_count = Metrics.counter( + "mars.band.submitted_subtask_count", + "The count of submitted subtasks to the current band.", + ("band",), + ) + self._finished_subtask_count = Metrics.counter( + "mars.band.finished_subtask_count", + "The count of finished subtasks of the current band.", + ("band",), + ) + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + + @alru_cache(cache_exceptions=False) + async def _get_slot_manager_ref( + self, band: str + ) -> mo.ActorRefType[BandSlotManagerActor]: + return await mo.actor_ref( + BandSlotManagerActor.gen_uid(band), address=self.address + ) + + @alru_cache(cache_exceptions=False) + async def _get_band_quota_ref(self, band: str) -> mo.ActorRefType[QuotaActor]: + return await mo.actor_ref(QuotaActor.gen_uid(band), address=self.address) + + async def _prepare_input_data(self, subtask: Subtask, band_name: str): + queries = [] + shuffle_queries = [] + storage_api = await StorageAPI.create( + subtask.session_id, address=self.address, band_name=band_name + ) + chunk_key_to_data_keys = get_chunk_key_to_data_keys(subtask.chunk_graph) + for chunk in subtask.chunk_graph: + if chunk.key in subtask.pure_depend_keys: + continue + if chunk.op.gpu: # pragma: no cover + to_fetch_band = band_name + else: + to_fetch_band = "numa-0" + if isinstance(chunk.op, Fetch): + queries.append( + storage_api.fetch.delay(chunk.key, band_name=to_fetch_band) + ) + elif isinstance(chunk.op, FetchShuffle): + for key in chunk_key_to_data_keys[chunk.key]: + shuffle_queries.append( + storage_api.fetch.delay( + key, band_name=to_fetch_band, error="ignore" + ) + ) + if queries: + await storage_api.fetch.batch(*queries) + if shuffle_queries: + # TODO(hks): The batch method doesn't accept different error arguments, + # combine them when it can. 
+ + await storage_api.fetch.batch(*shuffle_queries) + + async def _collect_input_sizes( + self, subtask: Subtask, supervisor_address: str, band_name: str + ): + graph = subtask.chunk_graph + sizes = dict() + + fetch_keys = list( + set( + n.key + for n in graph.iter_indep() + if isinstance(n.op, Fetch) and n.key not in subtask.pure_depend_keys + ) + ) + if not fetch_keys: + return sizes + + storage_api = await StorageAPI.create( + subtask.session_id, address=self.address, band_name=band_name + ) + meta_api = await MetaAPI.create(subtask.session_id, address=supervisor_address) + + fetch_metas = await meta_api.get_chunk_meta.batch( + *( + meta_api.get_chunk_meta.delay(k, fields=["memory_size", "store_size"]) + for k in fetch_keys + ) + ) + data_infos = await storage_api.get_infos.batch( + *(storage_api.get_infos.delay(k) for k in fetch_keys) + ) + + # compute memory quota size. when data located in shared memory, the cost + # should be differences between deserialized memory cost and serialized cost, + # otherwise we should take deserialized memory cost + for key, meta, infos in zip(fetch_keys, fetch_metas, data_infos): + level = functools.reduce(operator.or_, (info.level for info in infos)) + if level & StorageLevel.MEMORY: + mem_cost = max(0, meta["memory_size"] - meta["store_size"]) + else: + mem_cost = meta["memory_size"] + sizes[key] = (meta["store_size"], mem_cost) + + return sizes + + @classmethod + def _estimate_sizes(cls, subtask: Subtask, input_sizes: Dict): + size_context = dict(input_sizes.items()) + graph = subtask.chunk_graph + + key_to_ops = defaultdict(set) + chunk_key_to_sizes = defaultdict(lambda: 0) + for n in graph: + key_to_ops[n.op.key].add(n.op) + chunk_key_to_sizes[n.key] += 1 + key_to_ops = {k: list(v) for k, v in key_to_ops.items()} + + # condense op key graph + op_key_graph = DAG() + for n in graph.topological_iter(): + if n.key in subtask.pure_depend_keys: + continue + if n.op.key not in op_key_graph: + op_key_graph.add_node(n.op.key) + for succ in graph.iter_successors(n): + if succ.op.key not in op_key_graph: + op_key_graph.add_node(succ.op.key) + op_key_graph.add_edge(n.op.key, succ.op.key) + + key_stack = list(op_key_graph.iter_indep()) + pred_ref_count = {k: op_key_graph.count_predecessors(k) for k in op_key_graph} + succ_ref_count = {k: op_key_graph.count_successors(k) for k in op_key_graph} + + visited_op_keys = set() + total_memory_cost = 0 + max_memory_cost = sum(calc_size for _, calc_size in size_context.values()) + while key_stack: + key = key_stack.pop() + op = key_to_ops[key][0] + + if not isinstance(op, Fetch): + op.estimate_size(size_context, op) + + calc_cost = sum(size_context[out.key][1] for out in op.outputs) + total_memory_cost += calc_cost + max_memory_cost = max(total_memory_cost, max_memory_cost) + + if not isinstance(op, Fetch): + # when calculation result is stored, memory cost of calculation + # can be replaced with result memory cost + result_cost = sum(size_context[out.key][0] for out in op.outputs) + total_memory_cost += result_cost - calc_cost + + visited_op_keys.add(key) + + for succ_op_key in op_key_graph.iter_successors(key): + pred_ref_count[succ_op_key] -= 1 + if pred_ref_count[succ_op_key] == 0: + key_stack.append(succ_op_key) + + for pred_op_key in op_key_graph.iter_predecessors(key): + succ_ref_count[pred_op_key] -= 1 + if succ_ref_count[pred_op_key] == 0: + pred_op = key_to_ops[pred_op_key][0] + outs = key_to_ops[pred_op_key][0].outputs + for out in outs: + chunk_key_to_sizes[out.key] -= 1 + # when clearing fetches, subtract 
memory size, otherwise subtract store size + account_idx = 1 if isinstance(pred_op, Fetch) else 0 + pop_result_cost = 0 + for out in outs: + # corner case exist when a fetch op and another op has same chunk key + # but their op keys are different + if chunk_key_to_sizes[out.key] == 0: + pop_result_cost += size_context.pop(out.key, (0, 0))[ + account_idx + ] + else: + pop_result_cost += size_context.get(out.key, (0, 0))[ + account_idx + ] + total_memory_cost -= pop_result_cost + return sum(t[0] for t in size_context.values()), max_memory_cost + + @classmethod + def _check_cancelling(cls, subtask_info: SubtaskExecutionInfo): + if subtask_info.cancelling: + raise asyncio.CancelledError + + async def internal_run_subtask(self, subtask: Subtask, band_name: str): + subtask_api = SubtaskAPI(self.address) + subtask_info = self._subtask_info[subtask.subtask_id] + subtask_info.result = SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + status=SubtaskStatus.pending, + ) + try: + logger.debug("Preparing data for subtask %s", subtask.subtask_id) + prepare_data_task = asyncio.create_task( + _retry_run( + subtask, subtask_info, self._prepare_input_data, subtask, band_name + ) + ) + await asyncio.wait_for( + prepare_data_task, timeout=self._data_prepare_timeout + ) + + input_sizes = await self._collect_input_sizes( + subtask, subtask_info.supervisor_address, band_name + ) + _store_size, calc_size = await asyncio.to_thread( + self._estimate_sizes, subtask, input_sizes + ) + self._check_cancelling(subtask_info) + + batch_quota_req = {(subtask.session_id, subtask.subtask_id): calc_size} + logger.debug("Start actual running of subtask %s", subtask.subtask_id) + subtask_info.result = await self._retry_run_subtask( + subtask, band_name, subtask_api, batch_quota_req + ) + except: # noqa: E722 # pylint: disable=bare-except + _fill_subtask_result_with_exception(subtask, subtask_info) + finally: + # make sure new slot usages are uploaded in time + try: + slot_manager_ref = await self._get_slot_manager_ref(band_name) + await slot_manager_ref.upload_slot_usages(periodical=False) + except: # noqa: E722 # pylint: disable=bare-except + _fill_subtask_result_with_exception(subtask, subtask_info) + finally: + # pop the subtask info at the end is to cancel the job. 
+ self._subtask_info.pop(subtask.subtask_id, None) + return subtask_info.result + + async def _retry_run_subtask( + self, subtask: Subtask, band_name: str, subtask_api: SubtaskAPI, batch_quota_req + ): + quota_ref = await self._get_band_quota_ref(band_name) + slot_manager_ref = await self._get_slot_manager_ref(band_name) + subtask_info = self._subtask_info[subtask.subtask_id] + assert subtask_info.num_retries >= 0 + assert subtask_info.max_retries >= 0 + + async def _run_subtask_once(): + aiotask = None + slot_id = None + try: + await quota_ref.request_batch_quota(batch_quota_req) + self._check_cancelling(subtask_info) + + slot_id = await slot_manager_ref.acquire_free_slot( + (subtask.session_id, subtask.subtask_id) + ) + subtask_info.slot_id = slot_id + self._check_cancelling(subtask_info) + + subtask_info.result.status = SubtaskStatus.running + aiotask = asyncio.create_task( + subtask_api.run_subtask_in_slot(band_name, slot_id, subtask) + ) + return await asyncio.shield(aiotask) + except asyncio.CancelledError as ex: + try: + if aiotask is not None: + logger.info( + "Start to cancel subtask %s in slot %s on band %s.", + subtask.subtask_id, + slot_id, + band_name, + ) + await asyncio.wait_for( + asyncio.shield( + subtask_api.cancel_subtask_in_slot(band_name, slot_id) + ), + subtask_info.kill_timeout, + ) + except asyncio.TimeoutError: + logger.info( + "Wait for subtask to cancel timed out (%s). " + "Start killing slot %d", + subtask_info.kill_timeout, + slot_id, + ) + await slot_manager_ref.kill_slot(slot_id) + sub_pool_address = await slot_manager_ref.get_slot_address(slot_id) + await mo.wait_actor_pool_recovered(sub_pool_address, self.address) + except: # pragma: no cover + logger.exception("Unexpected errors raised when handling cancel") + raise + finally: + raise ex + except (OSError, MarsError) as ex: + if slot_id is not None: + # may encounter subprocess memory error + sub_pool_address = await slot_manager_ref.get_slot_address(slot_id) + await mo.wait_actor_pool_recovered(sub_pool_address, self.address) + raise ex + finally: + # make sure allocated slots are traced + if slot_id is None: # pragma: no cover + slot_id = await slot_manager_ref.get_subtask_slot( + (subtask.session_id, subtask.subtask_id) + ) + logger.debug( + "Subtask %s running ended, slot_id=%r", subtask.subtask_id, slot_id + ) + if slot_id is not None: + await slot_manager_ref.release_free_slot( + slot_id, (subtask.session_id, subtask.subtask_id) + ) + logger.debug( + "Released slot %d for subtask %s", slot_id, subtask.subtask_id + ) + await quota_ref.release_quotas(tuple(batch_quota_req.keys())) + + # TODO(fyrestone): For the retryable op, we should rerun it when + # any exceptions occurred. + if subtask.retryable: + return await _retry_run(subtask, subtask_info, _run_subtask_once) + else: + try: + return await _run_subtask_once() + except Exception as e: + unretryable_op = [ + chunk.op + for chunk in subtask.chunk_graph + if not getattr(chunk.op, "retryable", True) + ] + message = ( + f"Run subtask failed due to {e}, the subtask {subtask.subtask_id} is " + f"not retryable, it contains unretryable op: \n" + f"{pprint.pformat(unretryable_op)}" + ) + logger.error(message) + + raise wrap_exception( + e, wrap_name="_UnretryableException", message=message + ) + + async def run_subtask( + self, subtask: Subtask, band_name: str, supervisor_address: str + ): + if subtask.subtask_id in self._subtask_info: # pragma: no cover + raise Exception( + f"Subtask {subtask.subtask_id} is already running on this band[{self.address}]." 
+ ) + logger.debug( + "Start to schedule subtask %s on %s.", subtask.subtask_id, self.address + ) + self._submitted_subtask_count.record(1, {"band": self.address}) + with mo.debug.no_message_trace(): + task = asyncio.create_task( + self.ref().internal_run_subtask(subtask, band_name) + ) + + logger.debug("Subtask %r accepted in worker %s", subtask, self.address) + # the extra_config may be None. the extra config overwrites the default value. + subtask_max_retries = ( + subtask.extra_config.get("subtask_max_retries") + if subtask.extra_config + else None + ) + if subtask_max_retries is None: + subtask_max_retries = self._subtask_max_retries + + self._subtask_info[subtask.subtask_id] = SubtaskExecutionInfo( + task, band_name, supervisor_address, max_retries=subtask_max_retries + ) + result = await task + self._subtask_info.pop(subtask.subtask_id, None) + self._finished_subtask_count.record(1, {"band": self.address}) + logger.debug("Subtask %s finished with result %s", subtask.subtask_id, result) + return result + + async def cancel_subtask(self, subtask_id: str, kill_timeout: Optional[int] = 5): + try: + subtask_info = self._subtask_info[subtask_id] + except KeyError: + logger.info("Subtask %s not exists, skip cancel.", subtask_id) + return + logger.info( + "Start to cancel subtask %s in slot %s, kill_timeout is %s", + subtask_id, + subtask_info.slot_id, + kill_timeout, + ) + + kill_timeout = kill_timeout if self._enable_kill_slot else None + if not subtask_info.cancelling: + subtask_info.kill_timeout = kill_timeout + subtask_info.cancelling = True + subtask_info.aio_task.cancel() + + await subtask_info.aio_task + self._subtask_info.pop(subtask_id, None) diff --git a/python/xorbits/_mars/services/scheduling/worker/quota.py b/python/xorbits/_mars/services/scheduling/worker/quota.py new file mode 100644 index 000000000..f5e048071 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/quota.py @@ -0,0 +1,428 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +import logging +import time +from collections import OrderedDict, namedtuple +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +from .... import oscar as mo +from .... 
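Cancellation in the execution actor above relies on `asyncio.shield`: cancelling the actor-side task raises CancelledError in the wrapper without killing the running subtask, which is first asked to stop gracefully and only escalated after `kill_timeout`. A self-contained sketch of that shield-plus-timeout idiom (function names are illustrative; the real actor escalates by killing the worker slot):

import asyncio


async def run_shielded(work_coro, graceful_stop, kill_timeout=1.0):
    # Shield the real work: cancelling this wrapper raises CancelledError here
    # without cancelling `work` itself, so a graceful stop can be attempted first.
    work = asyncio.ensure_future(work_coro)
    try:
        return await asyncio.shield(work)
    except asyncio.CancelledError:
        try:
            await asyncio.wait_for(graceful_stop(), kill_timeout)
        except asyncio.TimeoutError:
            # graceful stop took too long; escalate by cancelling the work outright
            work.cancel()
        raise


async def _demo():
    async def work():
        await asyncio.sleep(10)

    async def graceful_stop():
        await asyncio.sleep(0.05)

    runner = asyncio.create_task(run_shielded(work(), graceful_stop))
    await asyncio.sleep(0.1)
    runner.cancel()
    try:
        await runner
    except asyncio.CancelledError:
        print("wrapper cancelled; graceful stop attempted")


asyncio.run(_demo())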
import resource as mars_resource +from ....typing import BandType +from ...cluster import QuotaInfo + +logger = logging.getLogger(__name__) + +QuotaDumpType = namedtuple("QuotaDumpType", "allocations requests hold_sizes") + + +@dataclass +class QuotaRequest: + req_size: Tuple + delta: int + req_time: float + event: asyncio.Event + + +class QuotaActor(mo.Actor): + @classmethod + def gen_uid(cls, band_name: str): + return f"{band_name}_quota" + + def __init__(self, band: BandType, quota_size: int, **kw): + super().__init__() + self._requests = OrderedDict() + + self._cluster_api = None + + self._band = band + self._band_name = band[1] + + self._quota_size = quota_size + self._allocations = dict() + self._total_allocated = 0 + + self._hold_sizes = dict() + self._total_hold = 0 + + if kw: # pragma: no cover + logger.warning("Keywords for QuotaActor %r not used", list(kw.keys())) + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + try: + self._cluster_api = await ClusterAPI.create(self.address) + self._report_quota_info() + except mo.ActorNotExist: + pass + + async def _has_space(self, delta: int): + return self._total_allocated + delta <= self._quota_size + + def _log_allocate(self, msg: str, *args, **kwargs): + args += (self._total_allocated, self._quota_size) + logger.debug(msg + " Allocated: %s, Total size: %s", *args, **kwargs) + + def _report_quota_info(self): + if self._cluster_api is not None: + quota_info = QuotaInfo( + quota_size=self._quota_size, + allocated_size=self._total_allocated, + hold_size=self._total_hold, + ) + asyncio.create_task( + self._cluster_api.set_band_quota_info(self._band_name, quota_info) + ) + + async def request_batch_quota(self, batch: Dict): + """ + Request for resources in a batch + :param batch: the request dict in form {request_key: request_size, ...} + :return: if request is returned immediately, return True, otherwise False + """ + all_allocated = True + # check if the request is already allocated + for key, size in batch.items(): + if key not in self._allocations or size > self._allocations.get(key): + all_allocated = False + break + + self._log_allocate("Receive batch quota request %r on %s.", batch, self.uid) + sorted_req = sorted(batch.items(), key=lambda tp: tp[0]) + keys = tuple(tp[0] for tp in sorted_req) + quota_sizes = tuple(tp[1] for tp in sorted_req) + delta = sum(v - self._allocations.get(k, 0) for k, v in batch.items()) + + # if all requested and allocation can still be applied, apply directly + if all_allocated and await self._has_space(delta): + self._log_allocate("Quota request %r already allocated.", batch) + return + + if delta > self._quota_size: + raise ValueError( + f"Cannot allocate quota size {delta} " + f"larger than total capacity {self._quota_size}." 
+ ) + + if keys in self._requests: + event = self._requests[keys].event + else: + has_space = await self._has_space(delta) + if has_space and not self._requests: + # if no previous requests, we can apply directly + self._log_allocate( + "Quota request met for key %r on %s.", keys, self.uid + ) + await self.alter_allocations(keys, quota_sizes, allocate=True) + return + else: + # current free space cannot satisfy the request, the request is queued + if not has_space: + self._log_allocate( + "Quota request unmet for key %r on %s.", keys, self.uid + ) + else: + self._log_allocate( + "Quota request queued for key %r on %s.", keys, self.uid + ) + event = asyncio.Event() + quota_request = QuotaRequest(quota_sizes, delta, time.time(), event) + if keys not in self._requests: + self._requests[keys] = quota_request + + async def waiter(): + try: + await event.wait() + except asyncio.CancelledError as ex: + await self.ref().remove_requests.tell(keys) + raise ex + + return waiter() + + async def remove_requests(self, keys: Tuple): + self._requests.pop(keys, None) + await self._process_requests() + + def hold_quotas(self, keys: Tuple): + """ + Mark request quota as already been hold + + Parameters + ---------- + keys : Tuple + request keys + """ + for key in keys: + try: + alloc_size = self._allocations[key] + except KeyError: + continue + self._total_hold += alloc_size - self._hold_sizes.get(key, 0) + self._hold_sizes[key] = alloc_size + + async def release_quotas(self, keys: Tuple): + """ + Release allocated quota in batch + + Parameters + ---------- + keys : Tuple + request keys + """ + total_alloc_size = 0 + + for key in keys: + try: + alloc_size = self._allocations.pop(key) + total_alloc_size += alloc_size + except KeyError: + continue + self._total_hold -= self._hold_sizes.pop(key, 0) + + self._total_allocated -= total_alloc_size + if total_alloc_size: + await self._process_requests() + + self._report_quota_info() + self._log_allocate("Quota keys %s released on %s.", keys, self.uid) + + def dump_data(self): + return QuotaDumpType(self._allocations, self._requests, self._hold_sizes) + + def get_allocated_size(self): + # get total allocated size, for debug purpose + return self._total_allocated + + async def alter_allocations( + self, + keys: Tuple, + quota_sizes: Tuple, + handle_shrink: bool = True, + allocate: bool = False, + ): + """ + Alter multiple requests + + Parameters + ---------- + keys : Tuple + keys to update + quota_sizes : Tuple + new quota sizes, if None, no changes will be made + handle_shrink : bool + if True and the quota size less than the original, process requests in the queue + allocate : bool + if True, will allocate resources for new items + """ + quota_sizes = quota_sizes or itertools.repeat(None) + total_old_size, total_diff = 0, 0 + for k, s in zip(keys, quota_sizes): + old_size = self._allocations.get(k, 0) + size_diff = 0 + + if not allocate and k not in self._allocations: + total_old_size += old_size + continue + + if s != old_size: + s = int(s) + size_diff = s - old_size + self._total_allocated += size_diff + self._allocations[k] = s + try: + self._total_hold += s - self._hold_sizes[k] + self._hold_sizes[k] = s + except KeyError: + pass + + total_old_size += old_size + total_diff += size_diff + if handle_shrink and total_diff < 0: + await self._process_requests() + + self._report_quota_info() + self._log_allocate( + "Quota keys %r applied on %s. 
Total old Size: %s, Total diff: %s,", + keys, + self.uid, + total_old_size, + total_diff, + ) + + async def _process_requests(self): + """ + Process quota requests in the queue + """ + removed = [] + for k, req in self._requests.items(): + if await self._has_space(req.delta): + await self.alter_allocations( + k, req.req_size, handle_shrink=False, allocate=True + ) + req.event.set() + removed.append(k) + else: + # Quota left cannot satisfy the next request, we quit + break + for k in removed: + self._requests.pop(k, None) + + +class MemQuotaActor(QuotaActor): + """ + Actor handling worker memory quota + """ + + def __init__( + self, + band: BandType, + quota_size: int, + hard_limit: int = None, + refresh_time: Union[int, float] = None, + enable_kill_slot: bool = True, + ): + super().__init__(band, quota_size) + self._hard_limit = hard_limit + self._last_memory_available = 0 + self._refresh_time = refresh_time or 1 + + self._enable_kill_slot = enable_kill_slot + + self._stat_refresh_task = None + self._slot_manager_ref = None + + async def __post_create__(self): + await super().__post_create__() + self._stat_refresh_task = self.ref().update_mem_stats.tell_delay( + delay=self._refresh_time + ) + + from .workerslot import BandSlotManagerActor + + try: + self._slot_manager_ref = await mo.actor_ref( + uid=BandSlotManagerActor.gen_uid(self._band[1]), address=self.address + ) + except mo.ActorNotExist: # pragma: no cover + pass + + async def __pre_destroy__(self): + self._stat_refresh_task.cancel() + + async def update_mem_stats(self): + """ + Refresh memory usage + """ + cur_mem_available = mars_resource.virtual_memory().available + if cur_mem_available > self._last_memory_available: + # memory usage reduced: try reallocate existing requests + await self._process_requests() + self._last_memory_available = cur_mem_available + self._report_quota_info() + self._stat_refresh_task = self.ref().update_mem_stats.tell_delay( + delay=self._refresh_time + ) + + async def _has_space(self, delta: int): + if self._hard_limit is None: + return await super()._has_space(delta) + + mem_stats = mars_resource.virtual_memory() + # calc available physical memory + available_size = ( + mem_stats.available + - max(0, mem_stats.total - self._hard_limit) + - (self._total_allocated - self._total_hold) + ) + if max(delta, 0) >= available_size: + logger.warning( + "%s met hard memory limitation: request %d, available %d, hard limit %d", + self.uid, + delta, + available_size, + self._hard_limit, + ) + + if self._enable_kill_slot and self._slot_manager_ref is not None: + logger.info("Restarting free slots to obtain more memory") + await self._slot_manager_ref.restart_free_slots() + return False + return await super()._has_space(delta) + + def _log_allocate(self, msg: str, *args, **kwargs): # pragma: no cover + if logger.getEffectiveLevel() > logging.DEBUG: + return + + if self._hard_limit is None: + return super()._log_allocate(msg, *args, **kwargs) + + mem_stats = mars_resource.virtual_memory() + # calc available physical memory + available_size = ( + mem_stats.available + - max(0, mem_stats.total - self._hard_limit) + - (self._total_allocated - self._total_hold) + ) + args += ( + self._total_allocated, + self._quota_size, + mem_stats.available, + available_size, + self._hard_limit, + self._total_hold, + ) + + logger.debug( + msg + + " Allocated: %s, Quota size: %s, Phy available: %s, Hard available: %s," + " Hard limit: %s, Holding: %s", + *args, + **kwargs, + ) + + +class WorkerQuotaManagerActor(mo.Actor): + def 
__init__(self, default_config: Dict, band_configs: Optional[Dict] = None): + self._cluster_api = None + self._default_config = default_config + self._band_configs = band_configs or dict() + + self._band_quota_refs = dict() # type: Dict[str, mo.ActorRef] + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + self._cluster_api = await ClusterAPI.create(self.address) + + band_to_resource = await self._cluster_api.get_bands() + for band in band_to_resource.keys(): + band_config = self._band_configs.get(band[1], self._default_config) + hard_limit = band_config.get("hard_limit") + actor_cls = MemQuotaActor if hard_limit else QuotaActor + self._band_quota_refs[band] = await mo.create_actor( + actor_cls, + band, + **band_config, + uid=MemQuotaActor.gen_uid(band[1]), + address=self.address, + ) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._band_quota_refs.values()] + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/service.py b/python/xorbits/_mars/services/scheduling/worker/service.py new file mode 100644 index 000000000..208deb96f --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/service.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ....utils import calc_size_by_str +from ...core import AbstractService +from .execution import DEFAULT_SUBTASK_MAX_RETRIES, SubtaskExecutionActor +from .quota import WorkerQuotaManagerActor +from .workerslot import WorkerSlotManagerActor + + +class SchedulingWorkerService(AbstractService): + """ + Scheduling service on worker. + + Service Configuration + --------------------- + { + "scheduling": { + "mem_quota_size": "80%", + "mem_hard_limit": "95%", + "enable_kill_slot": true, + "data_prepare_timeout": 600, + "subtask_max_retries": 1 + } + } + """ + + async def start(self): + from .... 
import resource as mars_resource + + scheduling_config = self._config.get("scheduling", {}) + address = self._address + + total_mem = mars_resource.virtual_memory().total + mem_quota_size = calc_size_by_str( + scheduling_config.get("mem_quota_size", "80%"), total_mem + ) + mem_hard_limit = calc_size_by_str( + scheduling_config.get("mem_hard_limit", "95%"), total_mem + ) + enable_kill_slot = scheduling_config.get("enable_kill_slot", True) + subtask_max_retries = scheduling_config.get( + "subtask_max_retries", DEFAULT_SUBTASK_MAX_RETRIES + ) + data_prepare_timeout = scheduling_config.get("data_prepare_timeout", 600) + + await mo.create_actor( + WorkerSlotManagerActor, + uid=WorkerSlotManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + WorkerQuotaManagerActor, + default_config=dict( + quota_size=mem_quota_size, + hard_limit=mem_hard_limit, + enable_kill_slot=enable_kill_slot, + ), + uid=WorkerQuotaManagerActor.default_uid(), + address=address, + ) + await mo.create_actor( + SubtaskExecutionActor, + subtask_max_retries=subtask_max_retries, + enable_kill_slot=enable_kill_slot, + data_prepare_timeout=data_prepare_timeout, + uid=SubtaskExecutionActor.default_uid(), + address=address, + ) + + async def stop(self): + address = self._address + + await mo.destroy_actor( + mo.create_actor_ref( + uid=SubtaskExecutionActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerQuotaManagerActor.default_uid(), address=address + ) + ) + await mo.destroy_actor( + mo.create_actor_ref( + uid=WorkerSlotManagerActor.default_uid(), address=address + ) + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/__init__.py b/python/xorbits/_mars/services/scheduling/worker/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py new file mode 100644 index 000000000..cee67f91f --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py @@ -0,0 +1,534 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
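The worker service above derives its memory quota and hard limit from percentage strings via `calc_size_by_str`. A rough sketch of the assumed conversion, written outside the real utility (percent-of-total semantics and the helper name are assumptions here):

def size_from_str(value, total):
    # assumed behaviour: "NN%" means NN percent of `total`, otherwise raw bytes
    if isinstance(value, str) and value.endswith("%"):
        return int(total * float(value[:-1]) / 100)
    return int(value)


total_mem = 16 * 1024 ** 3  # pretend the worker sees 16 GiB of RAM
mem_quota_size = size_from_str("80%", total_mem)  # soft budget for quota requests
mem_hard_limit = size_from_str("95%", total_mem)  # hard ceiling used by MemQuotaActor
print(mem_quota_size, mem_hard_limit)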
+ +import asyncio +import os +import tempfile +import time +import uuid +from contextlib import asynccontextmanager +from typing import Tuple + +import numpy as np +import pandas as pd +import pytest + +from ..... import oscar as mo +from ..... import remote as mr +from .....core import ( + ChunkGraph, + ChunkGraphBuilder, + OutputType, + TileableGraph, + TileableGraphBuilder, +) +from .....remote.core import RemoteFunction +from .....resource import Resource +from .....tensor.arithmetic import TensorTreeAdd +from .....tensor.fetch import TensorFetch +from .....utils import Timer +from ....cluster import MockClusterAPI +from ....lifecycle import MockLifecycleAPI +from ....meta import MockMetaAPI, MockWorkerMetaAPI +from ....mutable import MockMutableAPI +from ....session import MockSessionAPI +from ....storage import MockStorageAPI +from ....storage.handler import StorageHandlerActor +from ....subtask import MockSubtaskAPI, Subtask, SubtaskStatus +from ....task.supervisor.manager import TaskManagerActor +from ...supervisor import GlobalResourceManagerActor +from ...worker import BandSlotManagerActor, QuotaActor, SubtaskExecutionActor + + +class CancelDetectActorMixin: + @asynccontextmanager + async def _delay_method(self): + delay_fetch_event = getattr(self, "_delay_fetch_event", None) + delay_wait_event = getattr(self, "_delay_wait_event", None) + try: + if delay_fetch_event is not None: + delay_fetch_event.set() + if delay_wait_event is not None: + await delay_wait_event.wait() + yield + except asyncio.CancelledError: + self._is_cancelled = True + raise + + def set_delay_fetch_event( + self, fetch_event: asyncio.Event, wait_event: asyncio.Event + ): + setattr(self, "_delay_fetch_event", fetch_event) + setattr(self, "_delay_wait_event", wait_event) + + def get_is_cancelled(self): + return getattr(self, "_is_cancelled", False) + + +class MockStorageHandlerActor(StorageHandlerActor, CancelDetectActorMixin): + async def fetch_batch(self, *args, **kwargs): + async with self._delay_method(): + return super().fetch_batch(*args, **kwargs) + + +class MockQuotaActor(QuotaActor, CancelDetectActorMixin): + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self._batch_quota_reqs = [] + + async def request_batch_quota(self, batch): + self._batch_quota_reqs.append(batch) + async with self._delay_method(): + return super().request_batch_quota(batch) + + def get_batch_quota_reqs(self): + return self._batch_quota_reqs + + +class MockBandSlotManagerActor(BandSlotManagerActor, CancelDetectActorMixin): + async def acquire_free_slot(self, session_stid: Tuple[str, str], block=True): + if getattr(self, "_delay_function", None) != "acquire_free_slot": + return super().acquire_free_slot(session_stid, block) + else: + async with self._delay_method(): + return super().acquire_free_slot(session_stid, block) + + async def upload_slot_usages(self, periodical: bool = False): + if ( + getattr(self, "_delay_function", None) != "upload_slot_usages" + or periodical is True + ): + return super().upload_slot_usages(periodical) + else: + async with self._delay_method(): + return super().upload_slot_usages(periodical) + + def set_delay_function(self, name): + self._delay_function = name + + +class MockGlobalResourceManagerActor( + GlobalResourceManagerActor, CancelDetectActorMixin +): + async def __post_create__(self): + pass + + async def __pre_destroy__(self): + pass + + @mo.extensible + async def update_subtask_resources( + self, band, session_id: str, subtask_id: str, resources: Resource + ): + pass + + 
+class MockTaskManager(mo.Actor): + def __init__(self): + self._results = [] + + def set_subtask_result(self, result): + self._results.append(result) + + def get_results(self): + return self._results + + +@pytest.fixture +async def actor_pool(request): + n_slots, enable_kill = request.param + pool = await mo.create_actor_pool( + "127.0.0.1", labels=[None] + ["numa-0"] * n_slots, n_process=n_slots + ) + + async with pool: + session_id = "test_session" + await MockClusterAPI.create( + pool.external_address, + band_to_resource={"numa-0": Resource(num_cpus=n_slots)}, + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + worker_meta_api = await MockWorkerMetaAPI.create( + session_id, pool.external_address + ) + await MockLifecycleAPI.create(session_id, pool.external_address) + await MockSubtaskAPI.create(pool.external_address) + await MockMutableAPI.create(session_id, pool.external_address) + storage_api = await MockStorageAPI.create( + session_id, + pool.external_address, + storage_handler_cls=MockStorageHandlerActor, + ) + + # create assigner actor + execution_ref = await mo.create_actor( + SubtaskExecutionActor, + subtask_max_retries=0, + enable_kill_slot=enable_kill, + uid=SubtaskExecutionActor.default_uid(), + address=pool.external_address, + ) + # create quota actor + quota_ref = await mo.create_actor( + MockQuotaActor, + "numa-0", + 102400, + uid=QuotaActor.gen_uid("numa-0"), + address=pool.external_address, + ) + # create dispatcher actor + band_slot_ref = await mo.create_actor( + MockBandSlotManagerActor, + (pool.external_address, "numa-0"), + n_slots, + uid=BandSlotManagerActor.gen_uid("numa-0"), + address=pool.external_address, + ) + + # create global slot manager actor + global_resource_ref = await mo.create_actor( + MockGlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + + # create mock task manager actor + task_manager_ref = await mo.create_actor( + MockTaskManager, + uid=TaskManagerActor.gen_uid(session_id), + address=pool.external_address, + ) + + try: + yield pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref + finally: + await mo.destroy_actor(task_manager_ref) + await mo.destroy_actor(band_slot_ref) + await mo.destroy_actor(global_resource_ref) + await mo.destroy_actor(quota_ref) + await mo.destroy_actor(execution_ref) + await MockStorageAPI.cleanup(pool.external_address) + await MockSubtaskAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) + await MockMutableAPI.cleanup(session_id, pool.external_address) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, True)], indirect=True) +async def test_execute_tensor(actor_pool): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + + data1 = np.random.rand(10, 10) + data2 = np.random.rand(10, 10) + + input1 = TensorFetch( + key="input1", source_key="input2", dtype=np.dtype(int) + ).new_chunk([]) + input2 = TensorFetch( + key="input2", source_key="input2", dtype=np.dtype(int) + ).new_chunk([]) + result_chunk = TensorTreeAdd(args=[input1, input2]).new_chunk( + [input1, input2], shape=data1.shape, dtype=data1.dtype + ) + + await meta_api.set_chunk_meta( + input1, + memory_size=data1.nbytes, + store_size=data1.nbytes, + bands=[(pool.external_address, "numa-0")], + ) + await meta_api.set_chunk_meta( + input2, + memory_size=data1.nbytes, + store_size=data2.nbytes, + 
bands=[(pool.external_address, "numa-0")], + ) + # todo use different storage level when storage ready + await storage_api.put(input1.key, data1) + await storage_api.put(input2.key, data2) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + + subtask = Subtask("test_subtask", session_id=session_id, chunk_graph=chunk_graph) + await execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + + # check if results are correct + result = await storage_api.get(result_chunk.key) + np.testing.assert_array_equal(data1 + data2, result) + + # check if quota computations are correct + quota_ref = await mo.actor_ref( + QuotaActor.gen_uid("numa-0"), address=pool.external_address + ) + [quota] = await quota_ref.get_batch_quota_reqs() + assert quota[(subtask.session_id, subtask.subtask_id)] == data1.nbytes + + # check if metas are correct + result_meta = await worker_meta_api.get_chunk_meta(result_chunk.key) + assert result_meta["object_id"] == result_chunk.key + assert result_meta["shape"] == result.shape + + +_cancel_phases = [ + "prepare", + "quota", + "slot", + "execute", + "finally", + "immediately", +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "actor_pool,cancel_phase", + [((1, True), phase) for phase in _cancel_phases], + indirect=["actor_pool"], +) +async def test_execute_with_cancel(actor_pool, cancel_phase): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + delay_fetch_event = asyncio.Event() + delay_wait_event = asyncio.Event() + + # config for different phases + ref_to_delay = None + if cancel_phase == "prepare": + ref_to_delay = await mo.actor_ref( + StorageHandlerActor.gen_uid("numa-0"), address=pool.external_address + ) + elif cancel_phase == "quota": + ref_to_delay = await mo.actor_ref( + QuotaActor.gen_uid("numa-0"), address=pool.external_address + ) + elif cancel_phase == "slot": + ref_to_delay = await mo.actor_ref( + BandSlotManagerActor.gen_uid("numa-0"), address=pool.external_address + ) + await ref_to_delay.set_delay_function("acquire_free_slot") + elif cancel_phase == "finally": + ref_to_delay = await mo.actor_ref( + BandSlotManagerActor.gen_uid("numa-0"), address=pool.external_address + ) + await ref_to_delay.set_delay_function("upload_slot_usages") + if ref_to_delay: + await ref_to_delay.set_delay_fetch_event(delay_fetch_event, delay_wait_event) + else: + delay_fetch_event.set() + + def delay_fun(delay, _inp1): + if not ref_to_delay: + time.sleep(delay) + return (delay,) + + input1 = TensorFetch( + key="input1", source_key="input1", dtype=np.dtype(int) + ).new_chunk([]) + remote_result = RemoteFunction( + function=delay_fun, function_args=[100, input1], function_kwargs={}, n_output=1 + ).new_chunk([input1]) + + data1 = np.random.rand(10, 10) + await meta_api.set_chunk_meta( + input1, + memory_size=data1.nbytes, + store_size=data1.nbytes, + bands=[(pool.external_address, "numa-0")], + ) + await storage_api.put(input1.key, data1) + + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(input1) + chunk_graph.add_node(remote_result) + chunk_graph.add_edge(input1, remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + aiotask = asyncio.create_task( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + ) + if ref_to_delay: + await 
delay_fetch_event.wait() + else: + if cancel_phase != "immediately": + await asyncio.sleep(1) + + with Timer() as timer: + await asyncio.wait_for( + execution_ref.cancel_subtask(subtask.subtask_id, kill_timeout=1), + timeout=30, + ) + r = await asyncio.wait_for(aiotask, timeout=30) + assert r.status == SubtaskStatus.cancelled + assert timer.duration < 15 + + # check for different phases + if ref_to_delay is not None: + assert await ref_to_delay.get_is_cancelled() + delay_wait_event.set() + + # test if slot is restored + remote_tileable = mr.spawn(delay_fun, args=(0.5, None)) + graph = TileableGraph([remote_tileable.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + + subtask = Subtask( + f"test_subtask2_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + await asyncio.wait_for( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address), timeout=30 + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, True)], indirect=True) +async def test_execute_with_pure_deps(actor_pool): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + + dep = TensorFetch(key="input1", dtype=np.dtype(int)).new_chunk([]) + + def main_fun(): + return session_id + + remote_result = RemoteFunction( + function=main_fun, function_args=[], function_kwargs={} + ).new_chunk([dep]) + # mark `dep` as pure dependency + remote_result.op._pure_depends = [True] + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(dep) + chunk_graph.add_node(remote_result) + chunk_graph.add_edge(dep, remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + # subtask shall run well without data of `dep` available + await execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + res = await storage_api.get(remote_result.key) + assert res == session_id + + +def test_estimate_size(): + from .....dataframe.arithmetic import DataFrameAdd + from .....dataframe.fetch import DataFrameFetch + from .....dataframe.utils import parse_index + from ..execution import SubtaskExecutionActor + + index_value = parse_index(pd.Index([10, 20, 30], dtype=np.int64)) + + input1 = DataFrameFetch( + output_types=[OutputType.series], + ).new_chunk( + [], _key="INPUT1", shape=(np.nan,), dtype=np.dtype("O"), index_value=index_value + ) + input2 = DataFrameFetch( + output_types=[OutputType.series], + ).new_chunk( + [], _key="INPUT2", shape=(np.nan,), dtype=np.dtype("O"), index_value=index_value + ) + result_chunk = DataFrameAdd( + axis=0, output_types=[OutputType.series], lhs=input1, rhs=input2 + ).new_chunk( + [input1, input2], + _key="ADD_RESULT", + shape=(np.nan,), + dtype=np.dtype("O"), + index_value=index_value, + ) + + chunk_graph = ChunkGraph([result_chunk]) + chunk_graph.add_node(input1) + chunk_graph.add_node(input2) + chunk_graph.add_node(result_chunk) + chunk_graph.add_edge(input1, result_chunk) + chunk_graph.add_edge(input2, result_chunk) + + input_sizes = { + "INPUT1": (1024, 1024), + "INPUT2": (1024, 1024), + } + + subtask = Subtask("test_subtask", session_id="session_id", chunk_graph=chunk_graph) + result = SubtaskExecutionActor._estimate_sizes(subtask, input_sizes) + assert result[0] == 1024 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, False)], indirect=True) +async def test_cancel_without_kill(actor_pool): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = 
actor_pool + executed_file = os.path.join( + tempfile.gettempdir(), f"mars_test_cancel_without_kill_{os.getpid()}.tmp" + ) + + def delay_fun(delay): + import mars + + open(executed_file, "w").close() + time.sleep(delay) + mars._slot_marker = 1 + return delay + + def check_fun(): + import mars + + return getattr(mars, "_slot_marker", False) + + remote_result = RemoteFunction( + function=delay_fun, function_args=[2], function_kwargs={} + ).new_chunk([]) + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + aiotask = asyncio.create_task( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + ) + await asyncio.sleep(0.5) + + await asyncio.wait_for( + execution_ref.cancel_subtask(subtask.subtask_id, kill_timeout=1), + timeout=30, + ) + r = await asyncio.wait_for(aiotask, timeout=30) + assert r.status == SubtaskStatus.cancelled + + remote_result = RemoteFunction( + function=check_fun, function_args=[], function_kwargs={} + ).new_chunk([]) + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(remote_result) + + subtask = Subtask( + f"test_subtask_{uuid.uuid4()}", session_id=session_id, chunk_graph=chunk_graph + ) + await asyncio.wait_for( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address), timeout=30 + ) + + # check if slots not killed (or slot assignment may be cancelled) + if os.path.exists(executed_file): + assert await storage_api.get(remote_result.key) + os.unlink(executed_file) diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py new file mode 100644 index 000000000..9a9143c41 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py @@ -0,0 +1,183 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import time + +import pytest + +from ..... 
import oscar as mo +from .....tests.core import mock +from .....utils import get_next_port +from ...worker import BandSlotManagerActor, MemQuotaActor, QuotaActor + + +class MockBandSlotManagerActor(mo.Actor): + def get_restart_record(self): + return getattr(self, "_restart_record", False) + + def restart_free_slots(self): + self._restart_record = True + + +@pytest.fixture +async def actor_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "fork") if sys.platform != "win32" else None + ) + pool = await mo.create_actor_pool( + f"127.0.0.1:{get_next_port()}", + n_process=0, + subprocess_start_method=start_method, + ) + await pool.start() + try: + yield pool + finally: + await pool.stop() + + +@pytest.mark.asyncio +async def test_quota(actor_pool): + quota_ref = await mo.create_actor( + QuotaActor, + (actor_pool.external_address, "numa-0"), + 300, + uid=QuotaActor.gen_uid("cpu-0"), + address=actor_pool.external_address, + ) # type: mo.ActorRefType[QuotaActor] + + # test quota options with non-existing keys + await quota_ref.hold_quotas(["non_exist"]) + await quota_ref.release_quotas(["non_exist"]) + + with pytest.raises(ValueError): + await quota_ref.request_batch_quota({"ERROR": 1000}) + + # test quota request with immediate return + await quota_ref.request_batch_quota({"0": 100}) + await quota_ref.request_batch_quota({"0": 50}) + await quota_ref.request_batch_quota({"0": 200}) + + # test request with process_quota=True + await quota_ref.request_batch_quota({"0": 200}) + await quota_ref.alter_allocations(["0"], [190]) + assert (await quota_ref.dump_data()).allocations["0"] == 190 + + await quota_ref.hold_quotas(["0"]) + assert "0" in (await quota_ref.dump_data()).hold_sizes + + req_task1 = asyncio.create_task(quota_ref.request_batch_quota({"1": 150})) + req_task2 = asyncio.create_task(quota_ref.request_batch_quota({"2": 50})) + asyncio.create_task(quota_ref.request_batch_quota({"3": 200})) + asyncio.create_task(quota_ref.request_batch_quota({"3": 180})) + + await asyncio.sleep(0.1) + assert "2" not in (await quota_ref.dump_data()).allocations + + req_task1.cancel() + with pytest.raises(asyncio.CancelledError): + await req_task1 + + await asyncio.wait_for(req_task2, timeout=1) + assert "1" not in (await quota_ref.dump_data()).allocations + assert "2" in (await quota_ref.dump_data()).allocations + assert "3" not in (await quota_ref.dump_data()).allocations + + await quota_ref.release_quotas(["0"]) + assert "3" in (await quota_ref.dump_data()).allocations + + req_task4 = asyncio.create_task(quota_ref.request_batch_quota({"4": 180})) + await asyncio.sleep(0) + assert "4" not in (await quota_ref.dump_data()).allocations + + await quota_ref.alter_allocations(["3"], [50]) + await req_task4 + assert "4" in (await quota_ref.dump_data()).allocations + + +@pytest.mark.asyncio +async def test_batch_quota_allocation(actor_pool): + quota_ref = await mo.create_actor( + QuotaActor, + (actor_pool.external_address, "numa-0"), + 300, + uid=QuotaActor.gen_uid("cpu-0"), + address=actor_pool.external_address, + ) # type: mo.ActorRefType[QuotaActor] + + end_time = [] + + async def task_fun(b): + await quota_ref.request_batch_quota(b) + await asyncio.sleep(0.5) + assert set(b.keys()) == set((await quota_ref.dump_data()).allocations.keys()) + await quota_ref.release_quotas(list(b.keys())) + end_time.append(time.time()) + + tasks = [] + for idx in (0, 1): + keys = [f"{idx}_0", f"{idx}_1"] + batch = dict((k, 100) for k in keys) + tasks.append(asyncio.create_task(task_fun(batch))) + await 
asyncio.wait_for(asyncio.gather(*tasks), timeout=10) + + assert abs(end_time[0] - end_time[1]) > 0.4 + assert await quota_ref.get_allocated_size() == 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("enable_kill_slot", [False, True]) +async def test_mem_quota_allocation(actor_pool, enable_kill_slot): + from .....utils import AttributeDict + + mock_mem_stat = AttributeDict(dict(total=300, available=50, used=0, free=50)) + mock_band_slot_manager_ref = await mo.create_actor( + MockBandSlotManagerActor, + uid=BandSlotManagerActor.gen_uid("numa-0"), + address=actor_pool.external_address, + ) + quota_ref = await mo.create_actor( + MemQuotaActor, + (actor_pool.external_address, "numa-0"), + 300, + hard_limit=300, + refresh_time=0.1, + enable_kill_slot=enable_kill_slot, + uid=MemQuotaActor.gen_uid("cpu-0"), + address=actor_pool.external_address, + ) # type: mo.ActorRefType[QuotaActor] + + with mock.patch("mars.resource.virtual_memory", new=lambda: mock_mem_stat): + time_recs = [time.time()] + + async def task_fun(): + await quota_ref.request_batch_quota({"req": 100}) + await quota_ref.release_quotas(["req"]) + time_recs.append(time.time()) + + task = asyncio.create_task(task_fun()) + await asyncio.sleep(0.2) + assert "req" not in (await quota_ref.dump_data()).allocations + + mock_mem_stat["available"] = 150 + mock_mem_stat["free"] = 150 + await asyncio.wait_for(task, timeout=1) + assert 0.15 < abs(time_recs[0] - time_recs[1]) < 1 + assert ( + bool(await mock_band_slot_manager_ref.get_restart_record()) + == enable_kill_slot + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_workerslot.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_workerslot.py new file mode 100644 index 000000000..9f8d17b1b --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_workerslot.py @@ -0,0 +1,353 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import time +from typing import Tuple + +import pandas as pd +import psutil +import pytest + +from ..... 
import oscar as mo +from .....oscar import ServerClosed +from .....oscar.backends.allocate_strategy import IdleLabel +from .....oscar.errors import NoFreeSlot, SlotStateError +from .....resource import Resource +from .....tests.core import wait_for_condition +from .....utils import get_next_port +from ...supervisor import GlobalResourceManagerActor +from ...worker import BandSlotControlActor, BandSlotManagerActor + + +class MockGlobalResourceManagerActor(mo.Actor): + def __init__(self): + self._result = None + + @mo.extensible + def update_subtask_resources( + self, band: Tuple, session_id: str, subtask_id: str, resources: Resource + ): + self._result = (band, session_id, subtask_id, resources) + + def get_result(self): + return self._result + + +@pytest.fixture +async def actor_pool(request): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + n_slots = request.param + pool = await mo.create_actor_pool( + f"127.0.0.1:{get_next_port()}", + n_process=n_slots, + labels=[None] + ["numa-0"] * n_slots, + subprocess_start_method=start_method, + ) + + async with pool: + global_resource_ref = await mo.create_actor( + MockGlobalResourceManagerActor, + uid=GlobalResourceManagerActor.default_uid(), + address=pool.external_address, + ) + slot_manager_ref = await mo.create_actor( + BandSlotManagerActor, + (pool.external_address, "numa-0"), + n_slots, + global_resource_ref, + uid=BandSlotManagerActor.gen_uid("numa-0"), + address=pool.external_address, + ) + try: + yield pool, slot_manager_ref + finally: + await slot_manager_ref.destroy() + + +ActorPoolType = Tuple[mo.MainActorPoolType, mo.ActorRefType[BandSlotManagerActor]] + + +class TaskActor(mo.Actor): + def __init__(self, call_logs, slot_id=0): + self._call_logs = call_logs + self._dispatch_ref = None + self._slot_id = slot_id + + @classmethod + def gen_uid(cls, slot_id): + return f"{slot_id}_task_actor" + + async def __post_create__(self): + self._dispatch_ref = await mo.actor_ref( + BandSlotManagerActor.gen_uid("numa-0"), address=self.address + ) + await self._dispatch_ref.register_slot.tell(self._slot_id, os.getpid()) + + async def queued_call(self, key, session_stid, delay): + try: + self._call_logs[key] = time.time() + await asyncio.sleep(delay) + finally: + if session_stid is not None: + await self._dispatch_ref.release_free_slot(self._slot_id, session_stid) + + def get_call_logs(self): + return self._call_logs + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [0], indirect=True) +async def test_slot_assign(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + call_logs = dict() + group_size = 4 + delay = 1 + await asyncio.gather( + *( + mo.create_actor( + TaskActor, + call_logs, + slot_id=slot_id, + uid=TaskActor.gen_uid(slot_id), + address=pool.external_address, + ) + for slot_id in range(group_size) + ) + ) + assert len((await slot_manager_ref.dump_data()).free_slots) == group_size + + async def task_fun(idx): + session_stid = ("session_id", f"subtask_id{idx}") + slot_id = await slot_manager_ref.acquire_free_slot(session_stid) + assert slot_id == await slot_manager_ref.get_subtask_slot(session_stid) + ref = await mo.actor_ref( + uid=TaskActor.gen_uid(slot_id), address=pool.external_address + ) + await ref.queued_call(idx, session_stid, delay) + + tasks = [] + start_time = time.time() + for idx in range(group_size + 1): + tasks.append(asyncio.create_task(task_fun(idx))) + await asyncio.gather(*tasks) + + log_series = 
pd.Series(call_logs).sort_index() - start_time + assert len(log_series) == group_size + 1 + assert log_series.iloc[:group_size].max() < delay / 4 + assert log_series.iloc[group_size:].min() > delay / 4 + + call_logs.clear() + tasks = [] + start_time = time.time() + for idx in range(group_size * 2 + 1): + tasks.append(asyncio.create_task(task_fun(idx))) + await asyncio.sleep(delay / 10) + tasks[group_size].cancel() + await asyncio.wait(tasks) + + with pytest.raises(asyncio.CancelledError): + tasks[group_size].result() + + log_series = pd.Series(call_logs).sort_index() - start_time + + assert len(log_series) == group_size * 2 + assert log_series.iloc[:group_size].max() < delay / 4 + assert log_series.iloc[group_size:].min() > delay / 4 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def test_slot_kill(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + strategy = IdleLabel("numa-0", "task_actor") + task_ref = await mo.create_actor( + TaskActor, {}, allocate_strategy=strategy, address=pool.external_address + ) + + assert await mo.actor_ref( + BandSlotControlActor.gen_uid("numa-0", 0), address=pool.external_address + ) + delayed_task = asyncio.create_task(task_ref.queued_call("key", None, 10)) + await asyncio.sleep(0.1) + + # check if process hosting the actor is closed + kill_task = asyncio.create_task(slot_manager_ref.kill_slot(0)) + await asyncio.sleep(0) + kill_task2 = asyncio.create_task(slot_manager_ref.kill_slot(0)) + + with pytest.raises(ServerClosed): + await delayed_task + + # check if slot actor is restored + await kill_task + # check if secondary task makes no change + await kill_task2 + + assert await mo.actor_ref( + BandSlotControlActor.gen_uid("numa-0", 0), address=pool.external_address + ) + + async def check_alive(): + assert await mo.actor_ref(task_ref) + return True + + await wait_for_condition(check_alive) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [3], indirect=True) +async def test_slot_restart(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + strategy = IdleLabel("numa-0", "task_actor") + task_refs = [] + for idx in range(3): + ref = await mo.create_actor( + TaskActor, + {}, + slot_id=idx, + allocate_strategy=strategy, + address=pool.external_address, + ) + await ref.queued_call("idx", None, idx) + task_refs.append(ref) + + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id1")) + slot_id2 = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id2")) + await slot_manager_ref.release_free_slot(slot_id2, ("session_id", "subtask_id2")) + + async def record_finish_time(coro): + await coro + return time.time() + + restart_task1 = asyncio.create_task( + record_finish_time(slot_manager_ref.restart_free_slots()) + ) + await asyncio.sleep(0) + restart_task2 = asyncio.create_task( + record_finish_time(slot_manager_ref.restart_free_slots()) + ) + acquire_task = asyncio.create_task( + record_finish_time( + slot_manager_ref.acquire_free_slot(("session_id", "subtask_id3")) + ) + ) + + await asyncio.gather(restart_task1, restart_task2, acquire_task) + + # check only slots with running records are restarted + assert len(await task_refs[0].get_call_logs()) > 0 + assert len(await task_refs[1].get_call_logs()) == 0 + assert len(await task_refs[2].get_call_logs()) > 0 + + assert abs(restart_task1.result() - acquire_task.result()) < 0.1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def 
test_report_usage(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + await asyncio.sleep(1.3) + + global_resource_ref = await mo.actor_ref( + uid=GlobalResourceManagerActor.default_uid(), address=pool.external_address + ) + _band, session_id, subtask_id, resources = await global_resource_ref.get_result() + assert resources.num_cpus == pytest.approx(1.0) + assert session_id == "session_id" + assert subtask_id == "subtask_id" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def test_slot_fault_tolerance(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + # acquire -> slot restarted = can't acquire more. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + await slot_manager_ref.register_slot(slot_id, os.getpid()) + with pytest.raises(NoFreeSlot): + await slot_manager_ref.acquire_free_slot( + ("session_id", "subtask_id"), block=False + ) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id")) + + # acquire -> release -> slot restarted = can only acquire once. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id2")) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id2")) + await slot_manager_ref.register_slot(slot_id, os.getpid()) + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id2")) + with pytest.raises(NoFreeSlot): + await slot_manager_ref.acquire_free_slot( + ("session_id", "subtask_id2"), block=False + ) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id2")) + + # acquire -> release -> acquire -> slot restarted = can't acquire more. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id3")) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id3")) + await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id3")) + await slot_manager_ref.register_slot(slot_id, os.getpid()) + with pytest.raises(NoFreeSlot): + await slot_manager_ref.acquire_free_slot( + ("session_id", "subtask_id3"), block=False + ) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id3")) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [1], indirect=True) +async def test_slot_exception(actor_pool: ActorPoolType): + pool, slot_manager_ref = actor_pool + + # make sure the BandSlotControlActor has registered. + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id")) + + if sys.platform == "win32": + with pytest.raises(ValueError): + await slot_manager_ref.register_slot(1, -1) + else: + with pytest.raises((psutil.AccessDenied, psutil.NoSuchProcess)): + await slot_manager_ref.register_slot(1, 0) + + dump_data = await slot_manager_ref.dump_data() + # after the register_slot is correctly handled, + # we can assert 1 not in free slots. + assert 1 in dump_data.free_slots + + slot_id = await slot_manager_ref.acquire_free_slot(("session_id", "subtask_id")) + with pytest.raises(SlotStateError): + # release session_stid not matched the acquired value. + await slot_manager_ref.release_free_slot(slot_id, ("session_id", "subtask_id1")) + + dump_data = await slot_manager_ref.dump_data() + # the slot is not released. 
+ assert slot_id not in dump_data.free_slots + + not_acquired_slot = next(iter(dump_data.free_slots)) + with pytest.raises(SlotStateError): + await slot_manager_ref.release_free_slot( + not_acquired_slot, ("session_id", "subtask_id1") + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/workerslot.py b/python/xorbits/_mars/services/scheduling/worker/workerslot.py new file mode 100644 index 000000000..60134f1f5 --- /dev/null +++ b/python/xorbits/_mars/services/scheduling/worker/workerslot.py @@ -0,0 +1,339 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import os +import time +from typing import Dict, List, NamedTuple, Set, Tuple + +import psutil + +from .... import oscar as mo +from ....oscar.backends.allocate_strategy import IdleLabel +from ....oscar.errors import NoFreeSlot, SlotStateError +from ....resource import Resource +from ....typing import BandType +from ...cluster import ClusterAPI, WorkerSlotInfo + +logger = logging.getLogger(__name__) + + +class DispatchDumpType(NamedTuple): + free_slots: Set + fresh_slots: Set + + +class WorkerSlotManagerActor(mo.Actor): + _band_slot_infos: Dict[str, List[WorkerSlotInfo]] + + def __init__(self): + self._cluster_api = None + self._global_resource_ref = None + + self._band_slot_managers = dict() # type: Dict[str, mo.ActorRef] + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + + band_to_resource = await self._cluster_api.get_bands() + for band, resource in band_to_resource.items(): + self._band_slot_managers[band] = await mo.create_actor( + BandSlotManagerActor, + band, + int(resource.num_cpus or resource.num_gpus), + self._global_resource_ref, + uid=BandSlotManagerActor.gen_uid(band[1]), + address=self.address, + ) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._band_slot_managers.values()] + ) + + +class BandSlotManagerActor(mo.Actor): + _free_slots: Set[int] + _fresh_slots: Set[int] + + @classmethod + def gen_uid(cls, band_name: str): + return f"{band_name}_band_slot_manager" + + def __init__( + self, band: BandType, n_slots: int, global_resource_ref: mo.ActorRef = None + ): + super().__init__() + self._cluster_api = None + + self._band = band + self._band_name = band[1] + self._global_resource_ref = global_resource_ref + self._n_slots = n_slots + + self._semaphore = asyncio.Semaphore(0) + self._slot_control_refs = dict() + self._free_slots = set() + self._fresh_slots = set() + self._slot_kill_events = dict() + + self._restarting = False + self._restart_done_event = asyncio.Event() + + self._session_stid_to_slot = dict() + self._slot_to_session_stid = dict() + self._last_report_time = time.time() + + self._slot_to_proc = dict() + self._usage_upload_task = None + + async def __post_create__(self): + try: + self._cluster_api = await ClusterAPI.create(self.address) + except mo.ActorNotExist: + pass + + strategy = IdleLabel(self._band_name, f"worker_slot_control") 
+ for slot_id in range(self._n_slots): + self._slot_control_refs[slot_id] = await mo.create_actor( + BandSlotControlActor, + self.ref(), + self._band_name, + slot_id, + uid=BandSlotControlActor.gen_uid(self._band_name, slot_id), + address=self.address, + allocate_strategy=strategy, + ) + self._fresh_slots.add(slot_id) + + self._upload_slot_usage_with_delay() + + async def __pre_destroy__(self): + self._usage_upload_task.cancel() + + async def _get_global_resource_ref(self): + if self._global_resource_ref is not None: + return self._global_resource_ref + + from ..supervisor import GlobalResourceManagerActor + + try: + [self._global_resource_ref] = await self._cluster_api.get_supervisor_refs( + [GlobalResourceManagerActor.default_uid()] + ) + except mo.ActorNotExist: + self._global_resource_ref = None + return self._global_resource_ref + + def get_slot_address(self, slot_id: int): + return self._slot_control_refs[slot_id].address + + def get_subtask_slot(self, session_stid: Tuple[str, str]): + return self._session_stid_to_slot.get(session_stid) + + async def acquire_free_slot(self, session_stid: Tuple[str, str], block=True): + if not block and self._semaphore.locked(): + raise NoFreeSlot(f"No free slot for {session_stid}") + yield self._semaphore.acquire() + if self._restarting: + yield self._restart_done_event.wait() + + slot_id = self._free_slots.pop() + self._fresh_slots.difference_update([slot_id]) + self._slot_to_session_stid[slot_id] = session_stid + self._session_stid_to_slot[session_stid] = slot_id + logger.debug("Slot %d acquired for subtask %r", slot_id, session_stid) + raise mo.Return(slot_id) + + def release_free_slot(self, slot_id: int, session_stid: Tuple[str, str]): + acquired_session_stid = self._slot_to_session_stid.pop(slot_id, None) + if acquired_session_stid is None: + raise SlotStateError(f"Slot {slot_id} is not acquired.") + if acquired_session_stid != session_stid: + raise SlotStateError( + f"Slot {slot_id} releasing state incorrect, " + f"the acquired session_stid: {acquired_session_stid}, " + f"the releasing session_stid: {session_stid}" + ) + acquired_slot_id = self._session_stid_to_slot.pop(acquired_session_stid) + assert ( + acquired_slot_id == slot_id + ), f"{acquired_session_stid}: acquired_slot_id {acquired_slot_id} != slot_id {slot_id}" + + logger.debug("Slot %d released", slot_id) + + if slot_id not in self._free_slots: + self._free_slots.add(slot_id) + self._semaphore.release() + + def register_slot(self, slot_id: int, pid: int): + try: + self._fresh_slots.add(slot_id) + if slot_id in self._slot_kill_events: + event = self._slot_kill_events.pop(slot_id) + event.set() + if slot_id in self._slot_to_session_stid: + # We should release the slot by one role, if the slot is + # acquired by the SubtaskExecutionActor, then the slot + # should be released by it, too. + session_stid = self._slot_to_session_stid[slot_id] + logger.info( + "Slot %s registered by pid %s, current acquired session_stid is %s", + slot_id, + pid, + session_stid, + ) + else: + if slot_id not in self._free_slots: + self._free_slots.add(slot_id) + self._semaphore.release() + finally: + # psutil may raises exceptions, but currently we can't handle the register exception, + # so put it to the finally. + # TODO(fyrestone): handle register_slot failure. 
+ self._slot_to_proc[slot_id] = proc = psutil.Process(pid) + # collect initial stats for the process + proc.cpu_percent(interval=None) + + async def _kill_slot(self, slot_id: int): + if slot_id in self._slot_kill_events: + await self._slot_kill_events[slot_id].wait() + return + + event = self._slot_kill_events[slot_id] = asyncio.Event() + # TODO(fyrestone): Make it more reliable. e.g. kill_actor + # success but the actor does not restart. + try: + await mo.kill_actor(self._slot_control_refs[slot_id]) + except ConnectionError: + pass + await event.wait() + + async def kill_slot(self, slot_id: int): + self._free_slots.difference_update([slot_id]) + yield self._kill_slot(slot_id) + + async def restart_free_slots(self): + if self._restarting: + yield self._restart_done_event.wait() + return + + self._restart_done_event = asyncio.Event() + self._restarting = True + slot_ids = [ + slot_id for slot_id in self._free_slots if slot_id not in self._fresh_slots + ] + if slot_ids: + yield asyncio.gather(*[self._kill_slot(slot_id) for slot_id in slot_ids]) + logger.info("%d idle slots restarted", len(slot_ids)) + + self._restarting = False + self._restart_done_event.set() + + def _upload_slot_usage_with_delay(self, delay: int = 1): + self._usage_upload_task = self.ref().upload_slot_usages.tell_delay( + periodical=True, delay=delay + ) + + async def upload_slot_usages(self, periodical: bool = False): + delays = [] + slot_infos = [] + global_resource_ref = await self._get_global_resource_ref() + + if global_resource_ref is None: # pragma: no cover + if periodical: + self._upload_slot_usage_with_delay() + return + + for slot_id, proc in self._slot_to_proc.items(): + if slot_id not in self._slot_to_session_stid: + continue + session_id, subtask_id = self._slot_to_session_stid[slot_id] + cpu_usage, gpu_usage, processor_usage = 0, 0, 0 + if self._band_name.startswith("gpu"): + processor_usage = gpu_usage = 1 + else: + try: + processor_usage = cpu_usage = ( + proc.cpu_percent(interval=None) / 100.0 + ) + except psutil.NoSuchProcess: # pragma: no cover + continue + except psutil.AccessDenied as e: # pragma: no cover + logger.warning("Access denied when getting cpu percent: %s", e) + processor_usage = cpu_usage = 0.0 + + slot_infos.append( + WorkerSlotInfo( + slot_id=slot_id, + session_id=session_id, + subtask_id=subtask_id, + processor_usage=processor_usage, + ) + ) + + if global_resource_ref is not None: # pragma: no branch + # FIXME fix band slot mistake + delays.append( + global_resource_ref.update_subtask_resources.delay( + self._band[1], + session_id, + subtask_id, + Resource( + num_cpus=max(1.0, cpu_usage), num_gpus=max(1.0, gpu_usage) + ), + ) + ) + + if delays: # pragma: no branch + yield global_resource_ref.update_subtask_resources.batch(*delays) + if self._cluster_api is not None: + await self._cluster_api.set_band_slot_infos(self._band_name, slot_infos) + + if periodical: + self._upload_slot_usage_with_delay() + + def dump_data(self): + """ + Get all refs of slots of a queue + """ + return DispatchDumpType(self._free_slots, self._fresh_slots) + + +class BandSlotControlActor(mo.Actor): + @classmethod + def gen_uid(cls, band_name: str, slot_id: int): + return f"{band_name}_{slot_id}_band_slot_control" + + def __init__(self, manager_ref, band_name, slot_id: int): + self._manager_ref = manager_ref + self._band_name = band_name + self._slot_id = slot_id + self._report_task = None + + async def __post_create__(self): + self._report_task = asyncio.create_task(self._report_slot_ready()) + + async def 
_report_slot_ready(self): + from ...cluster.api import ClusterAPI + + try: + self._cluster_api = await ClusterAPI.create(self.address) + await self._cluster_api.wait_node_ready() + except mo.ActorNotExist: + pass + + await mo.wait_actor_pool_recovered(self.address) + await self._manager_ref.register_slot.tell(self._slot_id, os.getpid()) diff --git a/python/xorbits/_mars/services/session/__init__.py b/python/xorbits/_mars/services/session/__init__.py new file mode 100644 index 000000000..d46c9cf03 --- /dev/null +++ b/python/xorbits/_mars/services/session/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractSessionAPI, MockSessionAPI, SessionAPI, WebSessionAPI +from .supervisor import SessionActor, SessionManagerActor diff --git a/python/xorbits/_mars/services/session/api/__init__.py b/python/xorbits/_mars/services/session/api/__init__.py new file mode 100644 index 000000000..ed925376b --- /dev/null +++ b/python/xorbits/_mars/services/session/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractSessionAPI +from .oscar import MockSessionAPI, SessionAPI +from .web import WebSessionAPI diff --git a/python/xorbits/_mars/services/session/api/core.py b/python/xorbits/_mars/services/session/api/core.py new file mode 100644 index 000000000..e8bc35687 --- /dev/null +++ b/python/xorbits/_mars/services/session/api/core.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Dict, List, Union + +from ..core import SessionInfo + + +class AbstractSessionAPI(ABC): + @abstractmethod + async def get_sessions(self) -> List[SessionInfo]: + """ + Get information of all sessions + + Returns + ------- + session_infos : List[SessionInfo] + List of session infos. 
+ """ + + @abstractmethod + async def create_session(self, session_id: str) -> str: + """ + Create session and return address. + + Parameters + ---------- + session_id : str + Session ID + + Returns + ------- + address : str + Session address. + """ + + @abstractmethod + async def delete_session(self, session_id: str): + """ + Delete session. + + Parameters + ---------- + session_id : str + Session ID. + """ + + @abstractmethod + async def delete_all_sessions(self): + """ + Delete all sessions. + """ + + @abstractmethod + async def get_last_idle_time( + self, session_id: Union[str, None] = None + ) -> Union[float, None]: + """ + Get session last idle time. + + Parameters + ---------- + session_id : str, None + Session ID. None for all sessions. + + Returns + ------- + last_idle_time: str + The last idle time if the session(s) is idle else None. + """ + + @abstractmethod + async def fetch_tileable_op_logs( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key_to_offsets: Dict[str, List[int]], + chunk_op_key_to_sizes: Dict[str, List[int]], + ) -> Dict: + """ + Fetch tileable op's logs + + Parameters + ---------- + session_id : str + Session ID. + tileable_op_key : str + Tileable op key. + chunk_op_key_to_offsets : str or int or list of int + Fetch offsets. + chunk_op_key_to_sizes : str or int or list of int + Fetch sizes. + + Returns + ------- + logs : dict + chunk op key to result. + """ diff --git a/python/xorbits/_mars/services/session/api/oscar.py b/python/xorbits/_mars/services/session/api/oscar.py new file mode 100644 index 000000000..69d0a3c1d --- /dev/null +++ b/python/xorbits/_mars/services/session/api/oscar.py @@ -0,0 +1,207 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Union + +from .... import oscar as mo +from ....lib.aio import alru_cache +from ....utils import parse_readable_size +from ..core import SessionInfo +from ..supervisor import CustomLogMetaActor, SessionActor, SessionManagerActor +from ..worker import CustomLogActor +from .core import AbstractSessionAPI + + +class SessionAPI(AbstractSessionAPI): + def __init__( + self, address: str, session_manager: mo.ActorRefType[SessionManagerActor] + ): + self._address = address + self._session_manager_ref = session_manager + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, address: str, **kwargs) -> "SessionAPI": + if kwargs: # pragma: no cover + raise TypeError(f"SessionAPI.create got unknown arguments: {list(kwargs)}") + session_manager = await mo.actor_ref(address, SessionManagerActor.default_uid()) + return SessionAPI(address, session_manager) + + async def create_session(self, session_id: str) -> str: + session_actor_ref = await self._session_manager_ref.create_session(session_id) + return session_actor_ref.address + + async def get_sessions(self) -> List[SessionInfo]: + return await self._session_manager_ref.get_sessions() + + async def has_session(self, session_id: str) -> bool: + """ + Check if session created. 
+ + Parameters + ---------- + session_id : str + Session ID. + + Returns + ------- + if_exists : bool + """ + return await self._session_manager_ref.has_session(session_id) + + async def delete_session(self, session_id: str): + await self._session_manager_ref.delete_session(session_id) + + async def delete_all_sessions(self): + await self._session_manager_ref.delete_all_sessions() + + @alru_cache(cache_exceptions=False) + async def get_session_address(self, session_id: str) -> str: + """ + Get session address. + + Parameters + ---------- + session_id : str + Session ID. + + Returns + ------- + address : str + Session address. + """ + return (await self._session_manager_ref.get_session_ref(session_id)).address + + async def get_last_idle_time( + self, session_id: Union[str, None] = None + ) -> Union[float, None]: + return await self._session_manager_ref.get_last_idle_time(session_id) + + @alru_cache(cache_exceptions=False) + async def _get_session_ref(self, session_id: str) -> mo.ActorRefType[SessionActor]: + return await self._session_manager_ref.get_session_ref(session_id) + + async def create_remote_object( + self, session_id: str, name: str, object_cls, *args, **kwargs + ): + session = await self._get_session_ref(session_id) + return await session.create_remote_object(name, object_cls, *args, **kwargs) + + async def get_remote_object(self, session_id: str, name: str): + session = await self._get_session_ref(session_id) + return await session.get_remote_object(name) + + async def destroy_remote_object(self, session_id: str, name: str): + session = await self._get_session_ref(session_id) + return await session.destroy_remote_object(name) + + @alru_cache(cache_exceptions=False) + async def _get_custom_log_meta_ref( + self, session_id: str + ) -> mo.ActorRefType[CustomLogMetaActor]: + session = await self._get_session_ref(session_id) + return await mo.actor_ref( + mo.ActorRef(session.address, CustomLogMetaActor.gen_uid(session_id)) + ) + + async def register_custom_log_path( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + custom_log_meta_ref = await self._get_custom_log_meta_ref(session_id) + return await custom_log_meta_ref.register_custom_log_path( + tileable_op_key, chunk_op_key, worker_address, log_path + ) + + @classmethod + async def new_custom_log_dir(cls, address: str, session_id: str): + try: + ref = await mo.actor_ref(mo.ActorRef(address, CustomLogActor.default_uid())) + except mo.ActorNotExist: + return + return await ref.new_custom_log_dir(session_id) + + async def fetch_tileable_op_logs( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key_to_offsets: Dict[str, List[int]], + chunk_op_key_to_sizes: Dict[str, List[int]], + ) -> Dict: + custom_log_meta_ref = await self._get_custom_log_meta_ref(session_id) + chunk_op_key_to_arr_paths = await custom_log_meta_ref.get_tileable_op_log_paths( + tileable_op_key + ) + if chunk_op_key_to_arr_paths is None: + return + worker_to_kwds = dict() + for chunk_op_key, addr_path in chunk_op_key_to_arr_paths.items(): + worker_address, log_path = addr_path + if isinstance(chunk_op_key_to_offsets, dict): + offset = chunk_op_key_to_offsets.get(chunk_op_key, 0) + elif isinstance(chunk_op_key_to_offsets, str): + offset = int(parse_readable_size(chunk_op_key_to_offsets)[0]) + elif isinstance(chunk_op_key_to_offsets, int): + offset = chunk_op_key_to_offsets + else: + offset = 0 + if isinstance(chunk_op_key_to_sizes, dict): + size = chunk_op_key_to_sizes.get(chunk_op_key, 
-1) + elif isinstance(chunk_op_key_to_sizes, str): + size = int(parse_readable_size(chunk_op_key_to_sizes)[0]) + elif isinstance(chunk_op_key_to_sizes, int): + size = chunk_op_key_to_sizes + else: + size = -1 + if worker_address not in worker_to_kwds: + worker_to_kwds[worker_address] = { + "chunk_op_keys": [], + "log_paths": [], + "offsets": [], + "sizes": [], + } + kwds = worker_to_kwds[worker_address] + kwds["chunk_op_keys"].append(chunk_op_key) + kwds["log_paths"].append(log_path) + kwds["offsets"].append(offset) + kwds["sizes"].append(size) + result = dict() + for worker, kwds in worker_to_kwds.items(): + custom_log_ref = await mo.actor_ref( + mo.ActorRef(worker, CustomLogActor.default_uid()) + ) + chunk_op_keys = kwds.pop("chunk_op_keys") + logs = await custom_log_ref.fetch_logs(**kwds) + for chunk_op_key, log_result in zip(chunk_op_keys, logs): + result[chunk_op_key] = log_result + return result + + +class MockSessionAPI(SessionAPI): + @classmethod + async def create(cls, address: str, **kwargs) -> "SessionAPI": + session_id = kwargs.pop("session_id") + if kwargs: # pragma: no cover + raise TypeError(f"SessionAPI.create got unknown arguments: {list(kwargs)}") + + session_manager = await mo.create_actor( + SessionManagerActor, address=address, uid=SessionManagerActor.default_uid() + ) + if session_id: + await session_manager.create_session(session_id, create_services=False) + return MockSessionAPI(address, session_manager) diff --git a/python/xorbits/_mars/services/session/api/web.py b/python/xorbits/_mars/services/session/api/web.py new file mode 100644 index 000000000..4fdda8b23 --- /dev/null +++ b/python/xorbits/_mars/services/session/api/web.py @@ -0,0 +1,176 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +from typing import Callable, Dict, List, Union + +from ....utils import parse_readable_size +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from ..core import SessionInfo +from .core import AbstractSessionAPI + + +def _encode_size(size: Union[str, Dict[str, List[int]]]) -> str: + if not isinstance(size, dict): + return size + else: + return ",".join(f"{k}={v}" for k, v in size.items()) + + +def _decode_size(encoded: str) -> Union[int, str, Dict[str, Union[int, List[int]]]]: + if not encoded: + return 0 + if "," not in encoded and "=" not in encoded: + try: + return int(encoded) + except ValueError: + return int(parse_readable_size(encoded)[0]) + else: + ret = dict() + for kv in encoded.split(","): + k, v = kv.split("=", 1) + ret[k] = int(parse_readable_size(v)[0]) + return ret + + +class SessionWebAPIBaseHandler(MarsServiceWebAPIHandler): + async def _get_oscar_session_api(self): + from .oscar import SessionAPI + + return await self._get_api_by_key(SessionAPI, "Session", with_key_arg=False) + + +class SessionWebAPIHandler(SessionWebAPIBaseHandler): + @classmethod + def get_root_pattern(cls): + return "/api/session(?:/(?P[^/]*)$|$)" + + @web_api("(?P[^/]+)", method="put") + async def create_session(self, session_id: str): + oscar_api = await self._get_oscar_session_api() + addr = await oscar_api.create_session(session_id) + self.write(addr) + + @web_api("(?P[^/]+)", method="delete") + async def delete_session(self, session_id: str): + oscar_api = await self._get_oscar_session_api() + await oscar_api.delete_session(session_id) + + @web_api("", method="delete") + async def delete_all_sessions(self): + oscar_api = await self._get_oscar_session_api() + await oscar_api.delete_all_sessions() + + @web_api( + "(?P[^/]+)", method="get", arg_filter={"action": "check_exist"} + ) + async def has_session(self, session_id: str): + oscar_api = await self._get_oscar_session_api() + res = await oscar_api.has_session(session_id) + self.write("1" if res else "0") + + @web_api( + "(?P[^/]*)", + method="get", + arg_filter={"action": "get_last_idle_time"}, + ) + async def get_last_idle_time(self, session_id: str): + session_id = session_id or None + oscar_api = await self._get_oscar_session_api() + res = await oscar_api.get_last_idle_time(session_id) + self.write(str(res) if res else "") + + @web_api("", method="get") + async def get_sessions(self): + oscar_api = await self._get_oscar_session_api() + res = await oscar_api.get_sessions() + self.write( + json.dumps({"sessions": [{"session_id": info.session_id} for info in res]}) + ) + + +class SessionWebLogAPIHandler(SessionWebAPIBaseHandler): + _root_pattern = "/api/session/(?P[^/]+)/op/(?P[^/]+)/log" + + @web_api("", method="get") + async def fetch_tileable_op_logs(self, session_id: str, op_key: str): + oscar_api = await self._get_oscar_session_api() + offsets = _decode_size(self.get_argument("offsets", None)) + sizes = _decode_size(self.get_argument("sizes", None)) + log_result = await oscar_api.fetch_tileable_op_logs( + session_id, op_key, offsets, sizes + ) + self.write(json.dumps(log_result)) + + +web_handlers = { + SessionWebAPIHandler.get_root_pattern(): SessionWebAPIHandler, + SessionWebLogAPIHandler.get_root_pattern(): SessionWebLogAPIHandler, +} + + +class WebSessionAPI(AbstractSessionAPI, MarsWebAPIClientMixin): + def __init__(self, address: str, request_rewriter: Callable = None): + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def get_sessions(self) -> 
List[SessionInfo]: + addr = f"{self._address}/api/session" + res = await self._request_url("GET", addr) + res_obj = json.loads(res.body.decode()) + return [SessionInfo(**kw) for kw in res_obj["sessions"]] + + async def create_session(self, session_id: str) -> str: + addr = f"{self._address}/api/session/{session_id}" + res = await self._request_url(path=addr, method="PUT", data=b"") + return res.body.decode() + + async def delete_session(self, session_id: str): + addr = f"{self._address}/api/session/{session_id}" + await self._request_url(path=addr, method="DELETE") + + async def delete_all_sessions(self): + addr = f"{self._address}/api/session" + await self._request_url(path=addr, method="DELETE") + + async def has_session(self, session_id: str): + addr = f"{self._address}/api/session/{session_id}" + params = dict(action="check_exist") + res = await self._request_url("GET", addr, params=params) + return bool(int(res.body.decode())) + + async def get_last_idle_time( + self, session_id: Union[str, None] = None + ) -> Union[float, None]: + session_id = session_id or "" + addr = f"{self._address}/api/session/{session_id}" + params = dict(action="get_last_idle_time") + res = await self._request_url("GET", addr, params=params) + content = res.body.decode() + return float(content) if content else None + + async def fetch_tileable_op_logs( + self, + session_id: str, + tileable_op_key: str, + chunk_op_key_to_offsets: Dict[str, List[int]], + chunk_op_key_to_sizes: Dict[str, List[int]], + ) -> Dict: + addr = f"{self._address}/api/session/{session_id}/op/{tileable_op_key}/log" + params = dict( + offsets=_encode_size(chunk_op_key_to_offsets), + sizes=_encode_size(chunk_op_key_to_sizes), + ) + res = await self._request_url("GET", addr, params=params) + return json.loads(res.body.decode()) diff --git a/python/xorbits/_mars/services/session/core.py b/python/xorbits/_mars/services/session/core.py new file mode 100644 index 000000000..417c92618 --- /dev/null +++ b/python/xorbits/_mars/services/session/core.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...serialization.serializables import Serializable, StringField + + +class SessionInfo(Serializable): + session_id: str = StringField("session_id") diff --git a/python/xorbits/_mars/services/session/supervisor/__init__.py b/python/xorbits/_mars/services/session/supervisor/__init__.py new file mode 100644 index 000000000..c229ed8d4 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import SessionActor, SessionManagerActor +from .custom_log import CustomLogMetaActor +from .service import SessionSupervisorService diff --git a/python/xorbits/_mars/services/session/supervisor/core.py b/python/xorbits/_mars/services/session/supervisor/core.py new file mode 100644 index 000000000..f8398cd62 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/core.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +import logging +import os +import time +from typing import Dict, List, Optional + +from .... import oscar as mo +from ....utils import to_binary +from ...cluster import ClusterAPI +from ...core import NodeRole, create_service_session, destroy_service_session +from ..core import SessionInfo + +logger = logging.getLogger(__name__) + + +class SessionManagerActor(mo.Actor): + def __init__(self, service_config: Optional[Dict] = None): + self._session_refs: Dict[str, mo.ActorRef] = dict() + self._cluster_api: Optional[ClusterAPI] = None + self._service_config = service_config or dict() + self._stored_last_idle_time = None + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(self.address) + self._stored_last_idle_time = time.time() + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._session_refs.values()] + ) + + async def create_session(self, session_id: str, create_services: bool = True): + if session_id in self._session_refs: + raise mo.Return(self._session_refs[session_id]) + + [address] = await self._cluster_api.get_supervisors_by_keys([session_id]) + try: + session_actor_ref = await mo.create_actor( + SessionActor, + session_id, + self._service_config, + address=address, + uid=SessionActor.gen_uid(session_id), + allocate_strategy=mo.allocate_strategy.RandomSubPool(), + ) + except IndexError: + # when there is only one supervisor process, strategy RandomSubPool + # fails with IndexError. So we need to retry using strategy Random. 
+ session_actor_ref = await mo.create_actor( + SessionActor, + session_id, + self._service_config, + address=address, + uid=SessionActor.gen_uid(session_id), + allocate_strategy=mo.allocate_strategy.Random(), + ) + self._session_refs[session_id] = session_actor_ref + + # sync ref to other managers + for supervisor_address in await self._cluster_api.get_supervisors(): + if supervisor_address == self.address: + continue + session_manager_ref = await mo.actor_ref( + supervisor_address, SessionManagerActor.default_uid() + ) + await session_manager_ref.add_session_ref(session_id, session_actor_ref) + + # let session actor create session-related services + if create_services: + yield session_actor_ref.create_services() + + raise mo.Return(session_actor_ref) + + def get_sessions(self) -> List[SessionInfo]: + return [ + SessionInfo(session_id=session_id) + for session_id in self._session_refs.keys() + ] + + def get_session_ref(self, session_id: str): + return self._session_refs[session_id] + + def add_session_ref(self, session_id: str, session_actor_ref: mo.ActorRef): + self._session_refs[session_id] = session_actor_ref + + def remove_session_ref(self, session_id: str): + del self._session_refs[session_id] + + def has_session(self, session_id: str): + return session_id in self._session_refs + + async def delete_session(self, session_id): + session_actor_ref = self._session_refs.pop(session_id) + await session_actor_ref.remove() + await mo.destroy_actor(session_actor_ref) + + # sync removing to other managers + for supervisor_address in await self._cluster_api.get_supervisors(): + if supervisor_address == self.address: + continue + session_manager_ref = await mo.actor_ref( + supervisor_address, SessionManagerActor.default_uid() + ) + await session_manager_ref.remove_session_ref(session_id) + + async def delete_all_sessions(self): + for session_id in list(self._session_refs): + await self.delete_session(session_id) + + async def get_last_idle_time(self, session_id=None): + if session_id is not None: + session = self._session_refs[session_id] + raise mo.Return(await session.get_last_idle_time()) + else: + all_last_idle_time = yield asyncio.gather( + *[ + session.get_last_idle_time() + for session in self._session_refs.values() + ] + ) + if any(last_idle_time is None for last_idle_time in all_last_idle_time): + raise mo.Return(None) + else: + self._stored_last_idle_time = max( + [self._stored_last_idle_time] + all_last_idle_time + ) + raise mo.Return(self._stored_last_idle_time) + + +class SessionActor(mo.Actor): + def __init__(self, session_id: str, service_config: Dict): + self._session_id = session_id + + self._meta_api = None + self._lifecycle_api = None + self._task_api = None + self._scheduling_api = None + + self._service_config = service_config + + self._custom_log_meta_ref = None + + @classmethod + def gen_uid(cls, session_id): + return f"{session_id}_session_actor" + + async def __post_create__(self): + from .custom_log import CustomLogMetaActor + + self._custom_log_meta_ref = await mo.create_actor( + CustomLogMetaActor, + self._session_id, + address=self.address, + uid=CustomLogMetaActor.gen_uid(self._session_id), + ) + logger.debug( + "Session %s actor created on pid: %s", + self._session_id, + os.getpid(), + ) + + async def remove(self): + await destroy_service_session( + NodeRole.SUPERVISOR, self._service_config, self._session_id, self.address + ) + + async def __pre_destroy__(self): + await mo.destroy_actor(self._custom_log_meta_ref) + + async def create_services(self): + from ...task 
import TaskAPI + + await create_service_session( + NodeRole.SUPERVISOR, self._service_config, self._session_id, self.address + ) + if "task" in self._service_config["services"]: + self._task_api = await TaskAPI.create( + session_id=self._session_id, address=self.address + ) + + async def get_last_idle_time(self): + if self._task_api is None: + return None + return await self._task_api.get_last_idle_time() + + async def create_remote_object(self, name: str, object_cls, *args, **kwargs): + return await mo.create_actor( + RemoteObjectActor, + object_cls, + args, + kwargs, + address=self.address, + uid=to_binary(name), + ) + + async def get_remote_object(self, name: str): + return await mo.actor_ref(mo.ActorRef(self.address, to_binary(name))) + + async def destroy_remote_object(self, name: str): + return await mo.destroy_actor(mo.ActorRef(self.address, to_binary(name))) + + +class RemoteObjectActor(mo.Actor): + def __init__(self, object_cls, args, kwargs): + self._object = object_cls(*args, **kwargs) + + def __getattr__(self, attr): + func = getattr(self._object, attr) + if not callable(func): # pragma: no cover + return object.__getattribute__(self._object, attr) + + @functools.wraps(func) + async def wrap(*args, **kwargs): + # return coroutine to not block current actor + if asyncio.iscoroutinefunction(func): + return func(*args, **kwargs) + else: + # for sync call, running in thread + return asyncio.to_thread(func, *args, **kwargs) + + return wrap diff --git a/python/xorbits/_mars/services/session/supervisor/custom_log.py b/python/xorbits/_mars/services/session/supervisor/custom_log.py new file mode 100644 index 000000000..b09abc456 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/custom_log.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path +from collections import defaultdict +from typing import Dict, Tuple + +from .... 
import oscar as mo + + +class CustomLogMetaActor(mo.Actor): + # {tileable_op_key -> {chunk_op_key -> (worker_addr, path)}} + _custom_log_path_store: Dict[str, Dict[str, Tuple[str, str]]] + + def __init__(self, session_id: str): + self._session_id = session_id + self._custom_log_path_store = dict() + + @classmethod + def gen_uid(cls, session_id: str): + return f"custom_log_{session_id}" + + async def __post_create__(self): + from ..worker.custom_log import CustomLogActor + + worker_address_to_paths = defaultdict(set) + for address, path in self._custom_log_path_store.values(): + log_dir = os.path.dirname(path) + worker_address_to_paths[address].add(log_dir) + for address, paths in worker_address_to_paths.items(): + ref = await mo.actor_ref(address, CustomLogActor.default_uid()) + await ref.clear_custom_log_dirs(list(paths)) + + def register_custom_log_path( + self, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + if tileable_op_key not in self._custom_log_path_store: + self._custom_log_path_store[tileable_op_key] = dict() + self._custom_log_path_store[tileable_op_key][chunk_op_key] = ( + worker_address, + log_path, + ) + + def get_tileable_op_log_paths( + self, tileable_op_key: str + ) -> Dict[str, Tuple[str, str]]: + return self._custom_log_path_store.get(tileable_op_key) diff --git a/python/xorbits/_mars/services/session/supervisor/service.py b/python/xorbits/_mars/services/session/supervisor/service.py new file mode 100644 index 000000000..f3a865451 --- /dev/null +++ b/python/xorbits/_mars/services/session/supervisor/service.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .core import SessionManagerActor + + +class SessionSupervisorService(AbstractService): + """ + Session service on supervisor. + + Session Configuration + --------------------- + { + "session" : { + } + } + """ + + async def start(self): + await mo.create_actor( + SessionManagerActor, + self._config, + uid=SessionManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=SessionManagerActor.default_uid(), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/session/tests/__init__.py b/python/xorbits/_mars/services/session/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/session/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/session/tests/test_service.py b/python/xorbits/_mars/services/session/tests/test_service.py new file mode 100644 index 000000000..7d9b7b086 --- /dev/null +++ b/python/xorbits/_mars/services/session/tests/test_service.py @@ -0,0 +1,206 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading +import time + +import numpy as np +import pytest + +from .... import oscar as mo +from .... import remote as mr +from ....core import TileableGraph, TileableGraphBuilder +from ....resource import Resource +from ....utils import get_next_port +from ... import NodeRole, start_services, stop_services +from ...task.api import TaskAPI +from .. import SessionAPI, WebSessionAPI + + +@pytest.mark.parametrize("test_web", [False, True]) +@pytest.mark.asyncio +async def test_session_service(test_web): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + config = { + "services": ["cluster", "session", "meta"], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + } + if test_web: + config["services"] += ["web"] + config["web"] = {"port": get_next_port()} + + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + if not test_web: + session_api = await SessionAPI.create(pool.external_address) + else: + session_api = WebSessionAPI(f'http://127.0.0.1:{config["web"]["port"]}') + session_id = "test_session" + session_address = await session_api.create_session(session_id) + assert session_address == pool.external_address + assert await session_api.has_session(session_id) is True + assert (await session_api.get_sessions())[0].session_id == session_id + if not test_web: + assert await session_api.get_session_address(session_id) == session_address + await session_api.delete_session(session_id) + assert await session_api.has_session(session_id) is False + assert await session_api.get_sessions() == [] + await session_api.delete_all_sessions() + assert await session_api.has_session(session_id) is False + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + +@pytest.mark.asyncio +async def test_get_last_idle_time(): + sv_pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + worker_pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + labels=["main"] + ["numa-0"] * 2, + subprocess_start_method="spawn", + ) + async with sv_pool, worker_pool: + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + } + await start_services( + NodeRole.SUPERVISOR, config, address=sv_pool.external_address + ) + await start_services( + 
NodeRole.WORKER, config, address=worker_pool.external_address + ) + + start_time = time.time() + session_api = await SessionAPI.create(sv_pool.external_address) + assert await session_api.get_last_idle_time() < start_time + + session_id = "test_session" + await session_api.create_session(session_id) + # check last idle time is not None + last_idle_time = await session_api.get_last_idle_time(session_id) + assert last_idle_time is not None + assert await session_api.get_last_idle_time(session_id) == last_idle_time + # submit a task + task_api = await TaskAPI.create(session_id, sv_pool.external_address) + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + await task_api.wait_task(task_id) + task_result = await task_api.get_task_result(task_id) + + # the error is Actor b'StorageHandlerActor' does not exist + assert task_result.error is not None + + # the last idle time is changed + new_last_idle_time = await session_api.get_last_idle_time() + assert new_last_idle_time is not None + assert new_last_idle_time != last_idle_time + assert await session_api.get_last_idle_time() == new_last_idle_time + assert new_last_idle_time > last_idle_time + + # blocking task. + def f4(): + import time + + time.sleep(10) + + r4 = mr.spawn(f4) + graph = TileableGraph([r4.data]) + next(TileableGraphBuilder(graph).build()) + await task_api.submit_tileable_graph(graph, fuse_enabled=False) + assert await session_api.get_last_idle_time() is None + + await stop_services( + NodeRole.WORKER, config, address=worker_pool.external_address + ) + await stop_services( + NodeRole.SUPERVISOR, config, address=sv_pool.external_address + ) + + +@pytest.mark.asyncio +async def test_dmap(): + pool = await mo.create_actor_pool("127.0.0.1", n_process=0) + + async with pool: + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": pool.external_address, + }, + "meta": {"store": "dict"}, + } + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + + session_api = await SessionAPI.create(pool.external_address) + + session_id = "test_session" + await session_api.create_session(session_id) + lock = await session_api.create_remote_object( + session_id, "my_lock", threading.Lock + ) + await lock.acquire() + lock = await session_api.get_remote_object(session_id, "my_lock") + await lock.release() + with pytest.raises(AttributeError): + await lock.abc() + await session_api.destroy_remote_object(session_id, "my_lock") + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) diff --git a/python/xorbits/_mars/services/session/worker/__init__.py b/python/xorbits/_mars/services/session/worker/__init__.py new file mode 100644 index 000000000..1467357ae --- /dev/null +++ b/python/xorbits/_mars/services/session/worker/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
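Editor's note: a minimal sketch of how the last-idle-time probe exercised by the test above could be used, e.g. to decide whether a cluster can be torn down. The import path and the helper name supervisor_idle_for are assumptions; the None/timestamp semantics follow the test.

    import time

    from xorbits._mars.services.session import SessionAPI


    async def supervisor_idle_for(supervisor_address: str, seconds: float) -> bool:
        # get_last_idle_time() returns None while a task is still running,
        # otherwise an epoch timestamp of when the supervisor last became idle
        session_api = await SessionAPI.create(supervisor_address)
        last_idle_time = await session_api.get_last_idle_time()
        return last_idle_time is not None and time.time() - last_idle_time >= seconds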
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .custom_log import CustomLogActor +from .service import SessionWorkerService diff --git a/python/xorbits/_mars/services/session/worker/custom_log.py b/python/xorbits/_mars/services/session/worker/custom_log.py new file mode 100644 index 000000000..8441a86d3 --- /dev/null +++ b/python/xorbits/_mars/services/session/worker/custom_log.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile +from typing import Any, Dict, List + +from .... import oscar as mo +from ....lib.aio import AioFileObject + + +class CustomLogActor(mo.Actor): + def __init__(self, custom_log_dir: str): + self._custom_log_dir = custom_log_dir + + @staticmethod + def _get_custom_log_dir(custom_log_dir: str, session_id: str): + if custom_log_dir == "auto": + return tempfile.mkdtemp(prefix=f"marslog-{session_id}") + elif custom_log_dir is None: + return + else: + return os.path.join(custom_log_dir, session_id) + + def new_custom_log_dir(self, session_id: str): + custom_log_dir = self._get_custom_log_dir(self._custom_log_dir, session_id) + if custom_log_dir: + os.makedirs(custom_log_dir, exist_ok=True) + return custom_log_dir + + @classmethod + def clear_custom_log_dirs(cls, paths: List[str]): + _ = [shutil.rmtree(path, ignore_errors=True) for path in paths] + + @classmethod + async def fetch_logs( + cls, log_paths: List[str], offsets: List[int], sizes: List[int] + ) -> List[Dict[str, Any]]: + result = [] + for i, log_path in enumerate(log_paths): + log_result = dict() + + offset = offsets[i] + size = sizes[i] + + async with AioFileObject(open(log_path)) as f: + if offset < 0: + # process negative offset + offset = max(os.path.getsize(log_path) + offset, 0) + + if offset: + await f.seek(offset) + + log_result["log"] = await f.read(size) + log_result["offset"] = await f.tell() + + result.append(log_result) + + return result diff --git a/python/xorbits/_mars/services/session/worker/service.py b/python/xorbits/_mars/services/session/worker/service.py new file mode 100644 index 000000000..6cf44f496 --- /dev/null +++ b/python/xorbits/_mars/services/session/worker/service.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
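Editor's note: a small sketch of the negative-offset convention implemented by CustomLogActor.fetch_logs above (offsets below zero are taken relative to the end of the file, like a tail read). It assumes this patch is installed so the module is importable and uses a throwaway temp file.

    import asyncio
    import tempfile

    from xorbits._mars.services.session.worker.custom_log import CustomLogActor


    async def demo():
        with tempfile.NamedTemporaryFile("w", suffix=".log", delete=False) as f:
            f.write("line1\nline2\nline3\n")
            path = f.name
        # a negative offset is resolved against the file size, so this reads
        # the last 12 bytes ("line2\nline3\n") and reports the new offset
        [result] = await CustomLogActor.fetch_logs([path], [-12], [1024])
        print(result["log"], result["offset"])


    asyncio.run(demo())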
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .custom_log import CustomLogActor + + +class SessionWorkerService(AbstractService): + """ + Session service on worker. + + Service Configuration + --------------------- + { + "session" : { + } + } + """ + + async def start(self): + session_config = self._config.get("session", dict()) + custom_log_dir = session_config.get("custom_log_dir") + await mo.create_actor( + CustomLogActor, + custom_log_dir, + address=self._address, + uid=CustomLogActor.default_uid(), + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref(uid=CustomLogActor.default_uid(), address=self._address) + ) diff --git a/python/xorbits/_mars/services/storage/__init__.py b/python/xorbits/_mars/services/storage/__init__.py new file mode 100644 index 000000000..9743eaac7 --- /dev/null +++ b/python/xorbits/_mars/services/storage/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MockStorageAPI, StorageAPI, WebStorageAPI +from .core import DataInfo +from .errors import DataNotExist diff --git a/python/xorbits/_mars/services/storage/api/__init__.py b/python/xorbits/_mars/services/storage/api/__init__.py new file mode 100644 index 000000000..5c0ef6c3f --- /dev/null +++ b/python/xorbits/_mars/services/storage/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractStorageAPI +from .oscar import MockStorageAPI, StorageAPI +from .web import WebStorageAPI diff --git a/python/xorbits/_mars/services/storage/api/core.py b/python/xorbits/_mars/services/storage/api/core.py new file mode 100644 index 000000000..401859776 --- /dev/null +++ b/python/xorbits/_mars/services/storage/api/core.py @@ -0,0 +1,81 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
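Editor's note: a sketch of the worker-side service config consumed by SessionWorkerService above. The cluster/meta values mirror the test configs elsewhere in this diff; the lookup address is a placeholder. "custom_log_dir" may be "auto" (a per-session temp dir), a base directory, or left unset to skip creating log dirs.

    from xorbits._mars.services import NodeRole, start_services

    config = {
        "services": ["cluster", "session", "meta"],
        # placeholder supervisor address; "fixed" backend as in the tests
        "cluster": {"backend": "fixed", "lookup_address": "127.0.0.1:7777"},
        "meta": {"store": "dict"},
        # read by SessionWorkerService and passed to CustomLogActor
        "session": {"custom_log_dir": "auto"},
    }
    # inside an async context on the worker pool:
    # await start_services(NodeRole.WORKER, config, address=worker_pool_address)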
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Any, List
+
+from ....storage.base import StorageLevel
+from ..core import DataInfo
+
+
+class AbstractStorageAPI(ABC):
+    @abstractmethod
+    async def get(
+        self, data_key: str, conditions: List = None, error: str = "raise"
+    ) -> Any:
+        """
+        Get object by data key.
+
+        Parameters
+        ----------
+        data_key: str
+            data key to get.
+
+        conditions: List
+            Index conditions to push down
+
+        error: str
+            raise or ignore
+
+        Returns
+        -------
+        object
+        """
+
+    @abstractmethod
+    async def put(
+        self, data_key: str, obj: object, level: StorageLevel = StorageLevel.MEMORY
+    ) -> DataInfo:
+        """
+        Put object into storage.
+
+        Parameters
+        ----------
+        data_key: str
+            data key to put.
+        obj: object
+            object to put.
+        level: StorageLevel
+            the storage level to put into, MEMORY as default
+
+        Returns
+        -------
+        object information: ObjectInfo
+            the put object information
+        """
+
+    @abstractmethod
+    async def get_infos(self, data_key: str) -> List[DataInfo]:
+        """
+        Get data information items for specific data key
+
+        Parameters
+        ----------
+        data_key
+
+        Returns
+        -------
+        out
+            List of information for specified key
+        """
diff --git a/python/xorbits/_mars/services/storage/api/oscar.py b/python/xorbits/_mars/services/storage/api/oscar.py
new file mode 100644
index 000000000..18922151b
--- /dev/null
+++ b/python/xorbits/_mars/services/storage/api/oscar.py
@@ -0,0 +1,363 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import Any, List, Tuple, Type, TypeVar, Union
+
+from ....
import oscar as mo +from ....lib.aio import alru_cache +from ....storage.base import StorageFileObject, StorageLevel +from ...cluster import StorageInfo +from ..core import ( + DataInfo, + DataManagerActor, + StorageManagerActor, + WrappedStorageFileObject, +) +from ..handler import StorageHandlerActor +from .core import AbstractStorageAPI + +_is_windows = sys.platform.lower().startswith("win") +APIType = TypeVar("APIType", bound="StorageAPI") + + +class StorageAPI(AbstractStorageAPI): + _storage_handler_ref: mo.ActorRefType[StorageHandlerActor] + _data_manager_ref: mo.ActorRefType[DataManagerActor] + + def __init__(self, address: str, session_id: str, band_name: str): + self._address = address + self._session_id = session_id + self._band_name = band_name + + async def _init(self): + self._storage_handler_ref = await mo.actor_ref( + self._address, StorageHandlerActor.gen_uid(self._band_name) + ) + self._data_manager_ref = await mo.actor_ref( + self._address, DataManagerActor.default_uid() + ) + + @classmethod + @alru_cache(cache_exceptions=False) + async def create( + cls: Type[APIType], + session_id: str, + address: str, + band_name: str = "numa-0", + **kwargs, + ) -> APIType: + """ + Create storage API. + + Parameters + ---------- + session_id: str + session id + + address: str + worker address + + band_name: str + name of band, default as 'numa-0' + + Returns + ------- + storage_api + Storage api. + """ + if kwargs: # pragma: no cover + raise TypeError(f'Got unexpected arguments: {",".join(kwargs)}') + api = StorageAPI(address, session_id, band_name) + await api._init() + return api + + async def is_seekable(self, storage_level: StorageLevel = None) -> bool: + """ + If storage backend is seekable. + """ + return await self._storage_handler_ref.is_seekable(storage_level) + + @mo.extensible + async def get( + self, data_key: str, conditions: List = None, error: str = "raise" + ) -> Any: + return await self._storage_handler_ref.get( + self._session_id, data_key, conditions, error + ) + + @get.batch + async def batch_get(self, args_list, kwargs_list): + gets = [] + for args, kwargs in zip(args_list, kwargs_list): + gets.append( + self._storage_handler_ref.get.delay(self._session_id, *args, **kwargs) + ) + return await self._storage_handler_ref.get.batch(*gets) + + @mo.extensible + async def put( + self, data_key: str, obj: object, level: StorageLevel = None + ) -> DataInfo: + return await self._storage_handler_ref.put( + self._session_id, data_key, obj, level + ) + + @put.batch + async def batch_put(self, args_list, kwargs_list): + puts = [] + for args, kwargs in zip(args_list, kwargs_list): + puts.append( + self._storage_handler_ref.put.delay(self._session_id, *args, **kwargs) + ) + return await self._storage_handler_ref.put.batch(*puts) + + @mo.extensible + async def get_infos(self, data_key: str) -> List[DataInfo]: + """ + Get data information items for specific data key + + Parameters + ---------- + data_key + + Returns + ------- + out + List of information for specified key + """ + return await self._data_manager_ref.get_data_infos( + self._session_id, data_key, self._band_name + ) + + @mo.extensible + async def delete(self, data_key: str, error: str = "raise"): + """ + Delete object. 
+ + Parameters + ---------- + data_key: str + object key to delete + error: str + raise or ignore + """ + await self._storage_handler_ref.delete(self._session_id, data_key, error=error) + + @delete.batch + async def batch_delete(self, args_list, kwargs_list): + deletes = [] + for args, kwargs in zip(args_list, kwargs_list): + deletes.append( + self._storage_handler_ref.delete.delay( + self._session_id, *args, **kwargs + ) + ) + return await self._storage_handler_ref.delete.batch(*deletes) + + @mo.extensible + async def fetch( + self, + data_key: Union[str, Tuple], + level: StorageLevel = None, + band_name: str = None, + remote_address: str = None, + error: str = "raise", + ): + """ + Fetch object from remote worker or load object from disk. + + Parameters + ---------- + data_key: str or tuple + data key(tuple when is shuffle key) to fetch to current worker + with specific level. + level: StorageLevel + the storage level to put into, MEMORY as default + band_name: BandType + put data on specific band + remote_address: + remote address that stores the data + error: str + raise or ignore + """ + await self._storage_handler_ref.fetch_batch( + self._session_id, [data_key], level, band_name, remote_address, error + ) + + @fetch.batch + async def batch_fetch(self, args_list, kwargs_list): + extracted_args = [] + data_keys = [] + for args, kwargs in zip(args_list, kwargs_list): + data_key, level, band_name, dest_address, error = self.fetch.bind( + *args, **kwargs + ) + if extracted_args: + assert extracted_args == (level, band_name, dest_address, error) + extracted_args = (level, band_name, dest_address, error) + data_keys.append(data_key) + await self._storage_handler_ref.fetch_batch( + self._session_id, data_keys, *extracted_args + ) + + @mo.extensible + async def unpin(self, data_key: str, error: str = "raise"): + """ + Unpin the data, allow storage to release the data. + + Parameters + ---------- + data_key: str + data key to unpin + error: str + raise or ignore + """ + await self._storage_handler_ref.unpin(self._session_id, data_key, error) + + @unpin.batch + async def batch_unpin(self, args_list, kwargs_list): + unpins = [] + for args, kwargs in zip(args_list, kwargs_list): + data_key, error = self.unpin.bind(*args, **kwargs) + unpins.append( + self._storage_handler_ref.unpin.delay(self._session_id, data_key, error) + ) + return await self._storage_handler_ref.unpin.batch(*unpins) + + async def open_reader(self, data_key: str) -> StorageFileObject: + """ + Return a file-like object for reading. + + Parameters + ---------- + data_key: str + data key + + Returns + ------- + return a file-like object. + """ + return await self._storage_handler_ref.open_reader(self._session_id, data_key) + + async def open_writer( + self, data_key: Union[Tuple, str], size: int, level: StorageLevel = None + ) -> WrappedStorageFileObject: + """ + Return a file-like object for writing data. + + Parameters + ---------- + data_key: str or tuple + data key + size: int + the total size of data + level: StorageLevel + the storage level to write + + Returns + ------- + return a file-like object. + """ + return await self._storage_handler_ref.open_writer( + self._session_id, data_key, size, level + ) + + async def list(self, level: StorageLevel) -> List: + """ + List all stored data_keys in storage. 
+ + Parameters + ---------- + level: StorageLevel + the storage level to list all objects + + Returns + ------- + list of data keys + """ + return await self._storage_handler_ref.list(level=level) + + async def get_storage_level_info(self, level: StorageLevel) -> StorageInfo: + """ + Get storage level's info. + + Parameters + ---------- + level : StorageLevel + Storage level. + + Returns + ------- + storage_level_info : StorageInfo + """ + return await self._storage_handler_ref.get_storage_level_info(level) + + async def get_storage_info(self, level: StorageLevel) -> dict: + """ + Get the customized storage backend info of requested storage backend. + + Parameters + ---------- + level: StorageLevel + the storage level to fetch the backend info. + + Returns + ------- + info : dict + Customized storage backend info dict. + """ + return await self._storage_handler_ref.get_storage_backend_info(level) + + +class MockStorageAPI(StorageAPI): + @classmethod + async def create( + cls: Type[APIType], session_id: str, address: str, **kwargs + ) -> APIType: + from ..core import StorageManagerActor + + storage_configs = kwargs.get("storage_configs") + if not storage_configs: + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=10 * 1024 * 1024, + plasma_directory=plasma_dir, + check_dir_size=False, + ) + if _is_windows: + storage_configs = {"shared_memory": {}} + else: + storage_configs = { + "plasma": plasma_setup_params, + } + + storage_handler_cls = kwargs.pop("storage_handler_cls", StorageHandlerActor) + await mo.create_actor( + StorageManagerActor, + storage_configs, + storage_handler_cls=storage_handler_cls, + uid=StorageManagerActor.default_uid(), + address=address, + ) + return await super().create(address=address, session_id=session_id) + + @classmethod + async def cleanup(cls: Type[APIType], address: str): + await mo.destroy_actor( + await mo.actor_ref(address, StorageManagerActor.default_uid()) + ) diff --git a/python/xorbits/_mars/services/storage/api/web.py b/python/xorbits/_mars/services/storage/api/web.py new file mode 100644 index 000000000..bca97ff7d --- /dev/null +++ b/python/xorbits/_mars/services/storage/api/web.py @@ -0,0 +1,170 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from typing import Any, Callable, List + +from .... 
import oscar as mo
+from ....storage import StorageLevel
+from ....utils import deserialize_serializable, serialize_serializable
+from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api
+from ..core import DataInfo
+from .core import AbstractStorageAPI
+
+
+class StorageWebAPIHandler(MarsServiceWebAPIHandler):
+    _root_pattern = "/api/session/(?P<session_id>[^/]+)/storage"
+
+    async def _get_oscar_meta_api(self, session_id: str):
+        from ...meta import MetaAPI
+
+        return await self._get_api_by_key(MetaAPI, session_id)
+
+    async def _get_storage_api_by_object_id(self, session_id: str, object_id: str):
+        from .oscar import StorageAPI
+
+        meta_api = await self._get_oscar_meta_api(session_id)
+        bands = (await meta_api.get_chunk_meta(object_id, ["bands"])).get("bands")
+        if not bands:
+            raise KeyError
+        return await StorageAPI.create(session_id, bands[0][0], bands[0][1])
+
+    @web_api("(?P<data_key>[^/]+)", method="get")
+    async def get_data(self, session_id: str, data_key: str):
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        result = await oscar_api.get(data_key)
+        self.write(serialize_serializable(result))
+
+    @web_api("batch/get", method="post")
+    async def get_batch_data(self, session_id: str):
+        body_args = deserialize_serializable(self.request.body)
+        storage_api_to_gets = defaultdict(list)
+        storage_api_to_idx = defaultdict(list)
+        results = [None] * len(body_args)
+        for i, (data_key, conditions, error) in enumerate(body_args):
+            oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+            storage_api_to_idx[oscar_api].append(i)
+            storage_api_to_gets[oscar_api].append(
+                oscar_api.get.delay(data_key, conditions=conditions, error=error)
+            )
+        for api, fetches in storage_api_to_gets.items():
+            data_list = await api.get.batch(*fetches)
+            for idx, data in zip(storage_api_to_idx[api], data_list):
+                results[idx] = data
+        res_data = serialize_serializable(results)
+        self.write(res_data)
+
+    @web_api("(?P<data_key>[^/]+)", method="post")
+    async def get_data_by_post(self, session_id: str, data_key: str):
+        body_args = (
+            deserialize_serializable(self.request.body) if self.request.body else None
+        )
+        conditions = body_args.get("conditions")
+
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        result = await oscar_api.get(data_key, conditions)
+        self.write(serialize_serializable(result))
+
+    @web_api("(?P<data_key>[^/]+)", method="put")
+    async def put_data(self, session_id: str, data_key: str):
+        level = self.get_argument("level", None) or "MEMORY"
+        level = getattr(StorageLevel, level.upper())
+
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        res = await oscar_api.put(
+            data_key, deserialize_serializable(self.request.body), level
+        )
+        self.write(serialize_serializable(res))
+
+    @web_api("(?P<data_key>[^/]+)", method="get", arg_filter={"action": "get_infos"})
+    async def get_infos(self, session_id: str, data_key: str):
+        oscar_api = await self._get_storage_api_by_object_id(session_id, data_key)
+        res = await oscar_api.get_infos(data_key)
+        self.write(serialize_serializable(res))
+
+
+web_handlers = {StorageWebAPIHandler.get_root_pattern(): StorageWebAPIHandler}
+
+
+class WebStorageAPI(AbstractStorageAPI, MarsWebAPIClientMixin):
+    def __init__(
+        self,
+        session_id: str,
+        address: str,
+        band_name: str,
+        request_rewriter: Callable = None,
+    ):
+        self._session_id = session_id
+        self._address = address.rstrip("/")
+        self._band_name = band_name
+        self.request_rewriter = request_rewriter
+
+    @mo.extensible
+    async def get(
+ self, data_key: str, conditions: List = None, error: str = "raise" + ) -> Any: + path = f"{self._address}/api/session/{self._session_id}/storage/{data_key}" + params = dict(error=error) + if conditions is not None: + params["conditions"] = conditions + body = serialize_serializable(params) + res = await self._request_url( + path=path, + method="POST", + headers={"Content-Type": "application/octet-stream"}, + data=body, + ) + return deserialize_serializable(res.body) + + @get.batch + async def get_batch(self, args_list, kwargs_list): + get_chunks = [] + for args, kwargs in zip(args_list, kwargs_list): + data_key, conditions, error = self.get.bind(*args, **kwargs) + get_chunks.append([data_key, conditions, error]) + + path = f"{self._address}/api/session/{self._session_id}/storage/batch/get" + res = await self._request_url( + path=path, + method="POST", + data=serialize_serializable(get_chunks), + ) + return deserialize_serializable(res.body) + + @mo.extensible + async def put( + self, data_key: str, obj: object, level: StorageLevel = StorageLevel.MEMORY + ) -> DataInfo: + params = dict(level=level.name.lower()) + path = f"{self._address}/api/session/{self._session_id}/storage/{data_key}" + res = await self._request_url( + path=path, + method="PUT", + params=params, + headers={"Content-Type": "application/octet-stream"}, + data=serialize_serializable(obj), + ) + return deserialize_serializable(res.body) + + @mo.extensible + async def get_infos(self, data_key: str) -> List[DataInfo]: + path = f"{self._address}/api/session/{self._session_id}/storage/{data_key}" + params = dict(action="get_infos") + res = await self._request_url( + path=path, + method="GET", + headers={"Content-Type": "application/octet-stream"}, + params=params, + ) + return deserialize_serializable(res.body) diff --git a/python/xorbits/_mars/services/storage/core.py b/python/xorbits/_mars/services/storage/core.py new file mode 100644 index 000000000..caded2679 --- /dev/null +++ b/python/xorbits/_mars/services/storage/core.py @@ -0,0 +1,654 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +from ... 
import oscar as mo +from ...lib.aio import AioFileObject +from ...oscar.backends.allocate_strategy import IdleLabel, NoIdleSlot +from ...resource import cuda_card_stats +from ...storage import StorageLevel, get_storage_backend +from ...storage.base import ObjectInfo, StorageBackend +from ...storage.core import StorageFileObject +from ...utils import dataslots +from .errors import DataNotExist, StorageFull + +logger = logging.getLogger(__name__) + + +def build_data_info(storage_info: ObjectInfo, level, size, band_name=None): + # todo handle multiple + if band_name is None: + band_name = ( + "numa-0" if storage_info.device is None else f"gpu-{storage_info.device}" + ) + if storage_info.size is None: + store_size = size + else: + store_size = storage_info.size + return DataInfo(storage_info.object_id, level, size, store_size, band_name) + + +class WrappedStorageFileObject(AioFileObject): + """ + Wrap to hold ref after write close + """ + + def __init__( + self, + file: StorageFileObject, + level: StorageLevel, + size: int, + session_id: str, + data_key: Union[str, Tuple], + data_manager: mo.ActorRefType["DataManagerActor"], + storage_handler: StorageBackend, + ): + self._object_id = file.object_id + super().__init__(file) + self._size = size + self._level = level + self._session_id = session_id + self._data_key = data_key + self._data_manager = data_manager + self._storage_handler = storage_handler + # infos for multiple data + self._sub_key_infos = dict() + + def __getattr__(self, item): + return getattr(self._file, item) + + def commit_once(self, sub_key: Tuple, offset: int, size: int): + self._sub_key_infos[sub_key] = (offset, size) + + async def clean_up(self): + self._file.close() + + async def close(self): + self._file.close() + if self._object_id is None: + # for some backends like vineyard, + # object id is generated after write close + self._object_id = self._file.object_id + if "w" in self._file.mode: + object_info = await self._storage_handler.object_info(self._object_id) + object_info.size = self._size + data_info = build_data_info(object_info, self._level, self._size) + await self._data_manager.put_data_info( + self._session_id, + self._data_key, + data_info, + object_info, + self._sub_key_infos, + ) + + +class StorageQuotaActor(mo.Actor): + def __init__( + self, + data_manager: mo.ActorRefType["DataManagerActor"], + level: StorageLevel, + total_size: Optional[Union[int, float]], + ): + self._data_manager = data_manager + self._total_size = total_size if total_size is None else total_size * 0.95 + self._used_size = 0 + self._level = level + + @classmethod + def gen_uid(cls, band_name: str, level: StorageLevel): + return f"storage_quota_{band_name}_{level}" + + def update_quota(self, size: int): + self._used_size += size + logger.debug( + "Update %s bytes of %s, used size now is %s", + size, + self._level, + self._used_size, + ) + + def request_quota(self, size: int) -> bool: + if self._total_size is not None and size > self._total_size: # pragma: no cover + raise StorageFull( + f"Request size {size} is larger than total size {self._total_size}" + ) + if self._total_size is not None and self._used_size + size > self._total_size: + logger.debug( + "Request %s bytes of %s, used size now is %s," + "space is not enough for the request", + size, + self._level, + self._used_size, + ) + return False + else: + self._used_size += size + logger.debug( + "Request %s bytes of %s, used size now is %s, total size is %s", + size, + self._level, + self._used_size, + self._total_size, + ) + return 
True + + def release_quota(self, size: int): + self._used_size -= size + logger.debug( + "Release %s bytes of %s, used size now is %s, total size is %s", + size, + self._level, + self._used_size, + self._total_size, + ) + + def get_quota(self) -> Tuple[float, float]: + return self._total_size, self._used_size + + +@dataslots +@dataclass +class DataInfo: + object_id: object + level: StorageLevel + memory_size: int + store_size: int + band: str = None + offset: int = None + + +@dataslots +@dataclass +class InternalDataInfo: + data_info: DataInfo + object_info: ObjectInfo + + +@dataslots +@dataclass +class SubInfo: + store_key: str + offset: int + size: int + + +class DataManagerActor(mo.Actor): + _data_key_to_infos: Dict[Tuple, List[InternalDataInfo]] + _data_info_list: Dict[Tuple, Dict] + _spill_strategy: Dict[Tuple, Any] + _sub_key_to_sub_info: Dict[Tuple, SubInfo] + _store_key_to_sub_infos: Dict[Tuple, Dict[Tuple, SubInfo]] + + def __init__(self, bands: List): + from .spill import FIFOStrategy + + # mapping key is (session_id, data_key) + # mapping value is list of InternalDataInfo + self._bands = bands + self._data_key_to_infos = defaultdict(list) + self._data_info_list = dict() + self._spill_strategy = dict() + # data key may be a tuple in shuffle cases, + # we record the mapping from main key to sub keys, + # it's used when decref mapper data using main key + self._main_key_to_sub_keys = defaultdict(set) + # we may store multiple small data into one file, + # it records offset and size. + self._sub_key_to_sub_info = dict() + self._store_key_to_sub_infos = dict() + for level in StorageLevel.__members__.values(): + for band_name in bands: + self._data_info_list[level, band_name] = dict() + self._spill_strategy[level, band_name] = FIFOStrategy(level) + + @mo.extensible + def get_data_infos( + self, + session_id: str, + data_key: Union[str, Tuple], + band_name: str, + error: str = "raise", + ) -> Optional[Union[List[DataInfo], Dict]]: + if (session_id, data_key) in self._data_key_to_infos: + available_infos = [] + for info in self._data_key_to_infos[session_id, data_key]: + info_band = info.data_info.band + if info_band.startswith("gpu-"): # pragma: no cover + # not available for different GPU bands + if info_band == band_name: + available_infos.append(info.data_info) + else: + available_infos.append(info.data_info) + return available_infos + else: + if error == "raise": + raise DataNotExist(f"Data key {session_id, data_key} not exists.") + else: + return + + @mo.extensible + def get_data_info( + self, + session_id: str, + data_key: Union[str, Tuple], + band_name: str = None, + error: str = "raise", + ) -> Union[DataInfo, None]: + sub_info = None + if (session_id, data_key) in self._sub_key_to_sub_info: + sub_info = self._sub_key_to_sub_info[(session_id, data_key)] + data_key = sub_info.store_key + + # if the data is stored in multiply levels, + # return the lowest level info + infos = self.get_data_infos(session_id, data_key, band_name, error) + if not infos: + return + info = sorted(infos, key=lambda x: x.level)[0] + if sub_info is not None: + return DataInfo( + info.object_id, + info.level, + sub_info.size, + sub_info.size, + info.band, + sub_info.offset, + ) + else: + return info + + @mo.extensible + def put_data_info( + self, + session_id: str, + data_key: Union[str, Tuple], + data_info: DataInfo, + object_info: ObjectInfo = None, + sub_key_infos: Dict = None, + ): + info = InternalDataInfo(data_info, object_info) + self._data_key_to_infos[(session_id, data_key)].append(info) + 
self._data_info_list[data_info.level, data_info.band][ + (session_id, data_key) + ] = object_info + self._spill_strategy[data_info.level, data_info.band].record_put_info( + (session_id, data_key), data_info.store_size + ) + if sub_key_infos: + for key, (offset, size) in sub_key_infos.items(): + self._sub_key_to_sub_info[(session_id, key)] = SubInfo( + data_key, offset, size + ) + self._store_key_to_sub_infos[(session_id, data_key)] = sub_key_infos + if isinstance(data_key, tuple): + self._main_key_to_sub_keys[(session_id, data_key[0])].add(data_key) + + @mo.extensible + def delete_data_info( + self, + session_id: str, + data_key: Union[str, Tuple], + level: StorageLevel, + band_name: str, + ): + if (session_id, data_key) in self._data_key_to_infos: + self._data_info_list[level, band_name].pop((session_id, data_key)) + self._spill_strategy[level, band_name].record_delete_info( + (session_id, data_key) + ) + infos = self._data_key_to_infos[(session_id, data_key)] + rest = [info for info in infos if info.data_info.level != level] + if len(rest) == 0: + del self._data_key_to_infos[(session_id, data_key)] + else: # pragma: no cover + self._data_key_to_infos[(session_id, data_key)] = rest + + @mo.extensible + def get_store_key(self, session_id: str, data_key: Union[str, Tuple, List]): + if (session_id, data_key) in self._sub_key_to_sub_info: + return self._sub_key_to_sub_info[(session_id, data_key)].store_key + elif (session_id, data_key) in self._main_key_to_sub_keys: + # only into when delete mapper main key + return list(self._main_key_to_sub_keys[(session_id, data_key)]) + else: + return data_key + + @mo.extensible + def get_sub_infos(self, session_id: str, store_key: str): + if (session_id, store_key) in self._store_key_to_sub_infos: + return self._store_key_to_sub_infos[(session_id, store_key)] + else: + return None + + def list(self, level: StorageLevel, band_name: str): + return list(self._data_info_list[level, band_name].keys()) + + @mo.extensible + def pin(self, session_id, data_key, band_name, error="raise"): + info = self.get_data_info(session_id, data_key, band_name, error=error) + if info is not None: + self._spill_strategy[info.level, info.band].pin_data((session_id, data_key)) + + @mo.extensible + def unpin( + self, + session_id: str, + data_keys: List[str], + band_name: str, + error: str = "raise", + ): + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + levels = set() + for data_key in data_keys: + info = self.get_data_info(session_id, data_key, band_name, error) + if info: + level = info.level + self._spill_strategy[level, info.band].unpin_data( + (session_id, data_key) + ) + levels.add(level) + return list(levels) + + def get_spillable_size(self, level: StorageLevel, band_name: str): + return self._spill_strategy[level, band_name].get_spillable_size() + + async def get_spill_keys(self, level: StorageLevel, band_name: str, size: int): + return self._spill_strategy[level, band_name].get_spill_keys(size) + + +class StorageManagerActor(mo.StatelessActor): + """ + Storage manager actor, created only on main process, mainly to setup storage backends + and create all the necessary actors for storage service. 
+ """ + + _data_manager: mo.ActorRefType[DataManagerActor] + + def __init__( + self, storage_configs: Dict, transfer_block_size: int = None, **kwargs + ): + from .handler import StorageHandlerActor + + self._handler_cls = kwargs.pop("storage_handler_cls", StorageHandlerActor) + self._storage_configs = storage_configs + self._all_bands = None + self._cluster_api = None + self._upload_task = None + + # params to init and teardown + self._init_params = defaultdict(dict) + self._teardown_params = defaultdict(dict) + self._supervisor_address = None + + # transfer config + self._transfer_block_size = transfer_block_size + self._quotas = None + self._spill_managers = None + + async def __post_create__(self): + from ..cluster.api import ClusterAPI + from .handler import StorageHandlerActor + + try: + self._cluster_api = cluster_api = await ClusterAPI.create(self.address) + band_to_resource = await cluster_api.get_bands() + self._all_bands = [band[1] for band in band_to_resource] + except mo.ActorNotExist: + # in some test cases, cluster service is not available + self._all_bands = ["numa-0"] + + # stores the mapping from data key to storage info + self._data_manager = await mo.create_actor( + DataManagerActor, + self._all_bands, + uid=DataManagerActor.default_uid(), + address=self.address, + ) + + # setup storage backend + await self._setup_storage_backends() + + # create in main process + default_band_name = "numa-0" + await mo.create_actor( + self._handler_cls, + self._init_params[default_band_name], + self._data_manager, + self._spill_managers[default_band_name], + self._quotas[default_band_name], + default_band_name, + uid=StorageHandlerActor.gen_uid(default_band_name), + address=self.address, + ) + + # create handler actors for every process + await self._create_storage_handler_actors() + # create actor for transfer + await self._create_transfer_actors() + await self.upload_disk_info() + # create task for uploading storage usages + self._upload_task = asyncio.create_task(self.upload_storage_info()) + + async def __pre_destroy__(self): + if self._upload_task: + self._upload_task.cancel() + for _, params in self._teardown_params.items(): + for backend, teardown_params in params.items(): + backend_cls = get_storage_backend(backend) + await backend_cls.teardown(**teardown_params) + + async def _setup_storage_backends(self): + from .spill import SpillManagerActor + + self._quotas = quotas = defaultdict(dict) + self._spill_managers = spill_managers = defaultdict(dict) + for backend, setup_params in self._storage_configs.items(): + if backend == "cuda": # pragma: no cover + cuda_infos = await asyncio.to_thread(cuda_card_stats) + storage_bands = [s for s in self._all_bands if s.startswith("gpu-")] + clients = [] + for gpu_band in storage_bands: + index = int(gpu_band[4:]) + size = cuda_infos[index].fb_mem_info.available + params = dict(size=size, **setup_params) + clients.append(await self._setup_storage(gpu_band, backend, params)) + else: + storage_bands = ["numa-0"] + clients = [ + await self._setup_storage(band_name, backend, setup_params) + for band_name in storage_bands + ] + + for level in StorageLevel.__members__.values(): + for client, storage_band in zip(clients, storage_bands): + if client.level & level: + logger.debug( + "Create quota manager for %s, total size is %s", + level, + client.size, + ) + quotas[storage_band][level] = await mo.create_actor( + StorageQuotaActor, + self._data_manager, + level, + client.size, + uid=StorageQuotaActor.gen_uid(storage_band, level), + 
address=self.address, + ) + spill_managers[storage_band][level] = await mo.create_actor( + SpillManagerActor, + level, + uid=SpillManagerActor.gen_uid(storage_band, level), + address=self.address, + ) + + async def _create_storage_handler_actors(self): + from .handler import StorageHandlerActor + from .transfer import ReceiverManagerActor, SenderManagerActor + + for band_name in self._init_params: + strategy = IdleLabel(band_name, "StorageHandler") + sender_strategy = IdleLabel(band_name, "sender") + receiver_strategy = IdleLabel(band_name, "receiver") + init_params = self._get_band_init_params(band_name) + band_spill_managers = self._get_band_spill_managers(band_name) + band_quotas = self._get_band_quota_refs(band_name) + while True: + try: + handler_ref = await mo.create_actor( + self._handler_cls, + init_params, + self._data_manager, + band_spill_managers, + band_quotas, + band_name, + uid=StorageHandlerActor.gen_uid(band_name), + address=self.address, + allocate_strategy=strategy, + ) + # create transfer actor for GPU bands + if band_name.startswith("gpu-"): # pragma: no cover + await mo.create_actor( + SenderManagerActor, + band_name, + data_manager_ref=self._data_manager, + storage_handler_ref=handler_ref, + uid=SenderManagerActor.gen_uid(band_name), + address=self.address, + allocate_strategy=sender_strategy, + ) + await mo.create_actor( + ReceiverManagerActor, + band_quotas, + handler_ref, + address=self.address, + uid=ReceiverManagerActor.gen_uid(band_name), + allocate_strategy=receiver_strategy, + ) + except NoIdleSlot: + break + + async def _create_transfer_actors(self): + from .handler import StorageHandlerActor + from .transfer import ReceiverManagerActor, SenderManagerActor + + default_band_name = "numa-0" + sender_strategy = IdleLabel("io", "sender") + receiver_strategy = IdleLabel("io", "receiver") + handler_strategy = IdleLabel("io", "handler") + while True: + try: + handler_ref = await mo.create_actor( + self._handler_cls, + self._init_params[default_band_name], + self._data_manager, + self._spill_managers[default_band_name], + self._quotas[default_band_name], + default_band_name, + uid=StorageHandlerActor.gen_uid(default_band_name), + address=self.address, + allocate_strategy=handler_strategy, + ) + await mo.create_actor( + SenderManagerActor, + data_manager_ref=self._data_manager, + storage_handler_ref=handler_ref, + uid=SenderManagerActor.gen_uid(default_band_name), + address=self.address, + allocate_strategy=sender_strategy, + ) + + await mo.create_actor( + ReceiverManagerActor, + self._quotas[default_band_name], + handler_ref, + address=self.address, + uid=ReceiverManagerActor.gen_uid(default_band_name), + allocate_strategy=receiver_strategy, + ) + except NoIdleSlot: + break + + def _get_band_init_params(self, band_name): + init_params = self._init_params["numa-0"].copy() + init_params.update(self._init_params[band_name]) + return init_params + + def _get_band_quota_refs(self, band_name): + band_quotas = self._quotas[band_name].copy() + band_quotas.update(self._quotas["numa-0"]) + return band_quotas + + def _get_band_spill_managers(self, band_name): + band_spill_managers = self._spill_managers[band_name].copy() + band_spill_managers.update(self._spill_managers["numa-0"]) + return band_spill_managers + + async def _setup_storage( + self, band_name: str, storage_backend: str, storage_config: Dict + ): + backend = get_storage_backend(storage_backend) + storage_config = storage_config or dict() + init_params, teardown_params = await backend.setup(**storage_config) + 
client = backend(**init_params) + self._init_params[band_name][storage_backend] = init_params + self._teardown_params[band_name][storage_backend] = teardown_params + return client + + def get_client_params(self): + return self._init_params + + async def upload_storage_info(self): + from ..cluster import StorageInfo + + if self._cluster_api is not None: + while True: + upload_tasks = [] + for band, level_to_quota in self._quotas.items(): + for level, quota_ref in level_to_quota.items(): + total, used = await quota_ref.get_quota() + used = int(used) + if total is not None: + total = int(total) + storage_info = StorageInfo( + storage_level=level, total_size=total, used_size=used + ) + upload_tasks.append( + self._cluster_api.set_band_storage_info.delay( + band, storage_info + ) + ) + await self._cluster_api.set_band_storage_info.batch(*upload_tasks) + await asyncio.sleep(0.5) + + async def upload_disk_info(self): + from ..cluster import DiskInfo + + disk_infos = [] + if ( + self._cluster_api is not None + and "filesystem" in self._init_params["numa-0"] + ): + if self._init_params["numa-0"]["filesystem"]["level"] == StorageLevel.DISK: + params = self._init_params["numa-0"]["filesystem"] + size = params["size"] + for path in params["root_dirs"]: + disk_infos.append(DiskInfo(path=path, limit_size=size)) + await self._cluster_api.set_node_disk_info(disk_infos) diff --git a/python/xorbits/_mars/services/storage/errors.py b/python/xorbits/_mars/services/storage/errors.py new file mode 100644 index 000000000..3c31adce6 --- /dev/null +++ b/python/xorbits/_mars/services/storage/errors.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError +from ...storage.errors import DataNotExist + +DataNotExist = DataNotExist + + +class NoDataToSpill(MarsError): + pass + + +class StorageFull(MarsError): + pass diff --git a/python/xorbits/_mars/services/storage/handler.py b/python/xorbits/_mars/services/storage/handler.py new file mode 100644 index 000000000..f8ad75e49 --- /dev/null +++ b/python/xorbits/_mars/services/storage/handler.py @@ -0,0 +1,700 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from collections import defaultdict +from typing import Any, Dict, List, Union + +from ... 
import oscar as mo +from ...serialization import AioDeserializer +from ...storage import StorageLevel, get_storage_backend +from ...storage.core import StorageFileObject +from ...typing import BandType +from ...utils import calc_data_size, lazy_import +from ..cluster import ClusterAPI, StorageInfo +from ..meta import MetaAPI +from .core import ( + DataInfo, + DataManagerActor, + StorageQuotaActor, + WrappedStorageFileObject, + build_data_info, +) +from .errors import DataNotExist, NoDataToSpill + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") + +logger = logging.getLogger(__name__) + + +class StorageHandlerActor(mo.Actor): + """ + Storage handler actor, provide methods like `get`, `put`, etc. + This actor is stateful and created on worker's sub pools. + """ + + def __init__( + self, + storage_init_params: Dict, + data_manager_ref: mo.ActorRefType[DataManagerActor], + spill_manager_refs, + quota_refs: Dict[StorageLevel, mo.ActorRefType[StorageQuotaActor]], + band_name: str = "numa-0", + ): + from .spill import SpillManagerActor + + self._storage_init_params = storage_init_params + self._data_manager_ref = data_manager_ref + self._spill_manager_refs: Dict[ + StorageLevel, mo.ActorRefType[SpillManagerActor] + ] = spill_manager_refs + self._quota_refs = quota_refs + self._band_name = band_name + self._supervisor_address = None + + @classmethod + def gen_uid(cls, band_name: str): + return f"storage_handler_{band_name}" + + @property + def highest_level(self): + return min(self._quota_refs) + + async def __post_create__(self): + self._clients = clients = dict() + for backend, init_params in self._storage_init_params.items(): + logger.debug("Start storage %s with params %s", backend, init_params) + storage_cls = get_storage_backend(backend) + client = storage_cls(**init_params) + for level in StorageLevel.__members__.values(): + if client.level & level: + clients[level] = client + + def is_seekable(self, level: StorageLevel): + if level is None: + level = self.highest_level + return self._clients[level].is_seekable + + async def _get_data(self, data_info: DataInfo, conditions: List[Any]): + if data_info.offset is not None: + reader = await self._clients[data_info.level].open_reader( + data_info.object_id + ) + await reader.seek(data_info.offset) + res = await AioDeserializer(reader).run() + if conditions is not None: + try: + res = res.iloc[tuple(conditions)] + except AttributeError: # pragma: no cover + res = res[tuple(conditions)] + elif conditions is None: + res = yield self._clients[data_info.level].get(data_info.object_id) + else: + try: + res = yield self._clients[data_info.level].get( + data_info.object_id, conditions=conditions + ) + except NotImplementedError: + data = yield self._clients[data_info.level].get(data_info.object_id) + try: + sliced_value = data.iloc[tuple(conditions)] + except AttributeError: + sliced_value = data[tuple(conditions)] + res = sliced_value + raise mo.Return(res) + + @mo.extensible + async def get( + self, + session_id: str, + data_key: str, + conditions: List = None, + error: str = "raise", + ): + try: + data_info = await self._data_manager_ref.get_data_info( + session_id, data_key, self._band_name + ) + data = yield self._get_data(data_info, conditions) + raise mo.Return(data) + except DataNotExist: + if error == "raise": + raise + + def _get_data_info( + self, + session_id: str, + data_key: str, + conditions: List = None, + error: str = "raise", + ): + info = self._data_manager_ref.get_data_info.delay( + session_id, data_key, self._band_name, error 
+ ) + return info, conditions + + @get.batch + async def batch_get(self, args_list, kwargs_list): + infos = [] + conditions_list = [] + for args, kwargs in zip(args_list, kwargs_list): + info, conditions = self._get_data_info(*args, **kwargs) + infos.append(info) + conditions_list.append(conditions) + data_infos = await self._data_manager_ref.get_data_info.batch(*infos) + results = [] + writer_args = [ + (info.object_id, info.level) + for info in data_infos + if info is not None and info.offset is not None + ] + object_id_to_reader = dict() + for object_id, level in writer_args: + object_id_to_reader[object_id] = await self._clients[level].open_reader( + object_id + ) + for data_info, conditions in zip(data_infos, conditions_list): + if data_info is None: + results.append(None) + elif data_info.offset is not None: + reader = object_id_to_reader[data_info.object_id] + await reader.seek(data_info.offset) + result = await AioDeserializer(reader).run() + results.append(result) + else: + result = yield self._get_data(data_info, conditions) + results.append(result) + raise mo.Return(results) + + def _get_default_level(self, obj): + obj = obj[0] if isinstance(obj, (list, tuple)) else obj + if self.highest_level != StorageLevel.GPU: + return self.highest_level + else: # pragma: no cover + if cudf is not None and isinstance( + obj, (cudf.DataFrame, cudf.Series, cudf.Index) + ): + return StorageLevel.GPU + elif cupy is not None and isinstance(obj, cupy.ndarray): + return StorageLevel.GPU + else: + return StorageLevel.MEMORY + + @mo.extensible + async def put( + self, session_id: str, data_key: str, obj: object, level: StorageLevel = None + ) -> DataInfo: + if level is None: + level = self._get_default_level(obj) + size = await asyncio.to_thread(calc_data_size, obj) + await self.request_quota_with_spill(level, size) + object_info = await self._clients[level].put(obj) + data_info = build_data_info(object_info, level, size, self._band_name) + await self._data_manager_ref.put_data_info( + session_id, data_key, data_info, object_info + ) + if object_info.size is not None and data_info.memory_size != object_info.size: + await self._quota_refs[level].update_quota( + object_info.size - data_info.memory_size + ) + await self.notify_spillable_space(level) + return data_info + + @put.batch + async def batch_put(self, args_list, kwargs_list): + objs = [] + data_keys = [] + session_id = None + level = last_level = None + sizes = [] + for args, kwargs in zip(args_list, kwargs_list): + session_id, data_key, obj, level = self.put.bind(*args, **kwargs) + if level is None: + level = self._get_default_level(obj) + size = await asyncio.to_thread(calc_data_size, obj) + if last_level is not None: + assert last_level == level + last_level = level + objs.append(obj) + data_keys.append(data_key) + sizes.append(size) + + await self.request_quota_with_spill(level, sum(sizes)) + + data_infos = [] + put_infos = [] + quota_delta = 0 + for size, data_key, obj in zip(sizes, data_keys, objs): + object_info = await self._clients[level].put(obj) + data_info = build_data_info(object_info, level, size, self._band_name) + data_infos.append(data_info) + if ( + object_info.size is not None + and data_info.memory_size != object_info.size + ): + # we request memory size before putting, when put finishes, + # update quota to the true store size + quota_delta += object_info.size - data_info.memory_size + put_infos.append( + self._data_manager_ref.put_data_info.delay( + session_id, data_key, data_info, object_info + ) + ) + await 
self._quota_refs[level].update_quota(quota_delta) + await self._data_manager_ref.put_data_info.batch(*put_infos) + await self.notify_spillable_space(level) + return data_infos + + async def delete_object( + self, + session_id: str, + data_key: Any, + data_size: Union[int, float], + object_id: Any, + level: StorageLevel, + ): + data_key = await self._data_manager_ref.get_store_key(session_id, data_key) + await self._data_manager_ref.delete_data_info( + session_id, data_key, level, self._band_name + ) + await self._clients[level].delete(object_id) + await self._quota_refs[level].release_quota(data_size) + + @mo.extensible + async def delete(self, session_id: str, data_key: str, error: str = "raise"): + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + + data_key = await self._data_manager_ref.get_store_key(session_id, data_key) + if isinstance(data_key, list): + # delete mapper main key + data_keys = data_key + else: + data_keys = [data_key] + for data_key in data_keys: + all_infos = await self._data_manager_ref.get_data_infos( + session_id, data_key, self._band_name, error + ) + if not all_infos: + return + + key_to_infos = ( + all_infos if isinstance(all_infos, dict) else {data_key: all_infos} + ) + + for key, infos in key_to_infos.items(): + for info in infos: + level = info.level + await self._data_manager_ref.delete_data_info( + session_id, key, level, self._band_name + ) + await self._clients[level].delete(info.object_id) + await self._quota_refs[level].release_quota(info.store_size) + + @delete.batch + async def batch_delete(self, args_list, kwargs_list): + session_id = None + error = None + data_keys = [] + for args, kwargs in zip(args_list, kwargs_list): + session_id, data_key, error = self.delete.bind(*args, **kwargs) + data_keys.append( + self._data_manager_ref.get_store_key.delay(session_id, data_key) + ) + store_keys = await self._data_manager_ref.get_store_key.batch(*data_keys) + data_keys = set() + for k in store_keys: + if isinstance(k, list): + data_keys.update(set(k)) + else: + data_keys.add(k) + + infos_list = await self._data_manager_ref.get_data_infos.batch( + *[ + self._data_manager_ref.get_data_infos.delay( + session_id, data_key, self._band_name, error + ) + for data_key in data_keys + ] + ) + + delete_infos = [] + to_removes = [] + level_sizes = defaultdict(lambda: 0) + for all_infos, data_key in zip(infos_list, data_keys): + if not all_infos: + # data not exist and error == 'ignore' + continue + key_to_infos = ( + all_infos if isinstance(all_infos, dict) else {data_key: all_infos} + ) + + for key, infos in key_to_infos.items(): + for info in infos: + level = info.level + delete_infos.append( + self._data_manager_ref.delete_data_info.delay( + session_id, key, level, info.band + ) + ) + to_removes.append((level, info.object_id)) + level_sizes[level] += info.store_size + + if not delete_infos: + # no data to remove + return + + await self._data_manager_ref.delete_data_info.batch(*delete_infos) + await asyncio.gather( + *[self._clients[level].delete(object_id) for level, object_id in to_removes] + ) + for level, size in level_sizes.items(): + await self._quota_refs[level].release_quota(size) + + @mo.extensible + async def open_reader(self, session_id: str, data_key: str) -> StorageFileObject: + data_info = await self._data_manager_ref.get_data_info( + session_id, data_key, self._band_name + ) + reader = await self._clients[data_info.level].open_reader(data_info.object_id) + return reader + + @open_reader.batch 
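+ # Like the other batch handlers in this actor (e.g. batch_get, batch_put), the method below receives + # the (args, kwargs) pairs collected through open_reader.delay(...) and serves them in one batched call.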
+ async def batch_open_readers(self, args_list, kwargs_list): + get_data_infos = [] + for args, kwargs in zip(args_list, kwargs_list): + get_data_infos.append( + self._data_manager_ref.get_data_info.delay( + *args, band_name=self._band_name, **kwargs + ) + ) + data_infos = await self._data_manager_ref.get_data_info.batch(*get_data_infos) + return await asyncio.gather( + *[ + self._clients[data_info.level].open_reader(data_info.object_id) + for data_info in data_infos + ] + ) + + @mo.extensible + async def open_writer( + self, + session_id: str, + data_key: str, + size: int, + level: StorageLevel, + request_quota=True, + ) -> WrappedStorageFileObject: + if level is None: + level = self.highest_level + if request_quota: + await self.request_quota_with_spill(level, size) + writer = await self._clients[level].open_writer(size) + return WrappedStorageFileObject( + writer, + level, + size, + session_id, + data_key, + self._data_manager_ref, + self._clients[level], + ) + + @open_writer.batch + async def batch_open_writers(self, args_list, kwargs_list): + extracted_args = None + data_keys, sizes = [], [] + for args, kwargs in zip(args_list, kwargs_list): + session_id, data_key, size, level, request_quota = self.open_writer.bind( + *args, **kwargs + ) + if extracted_args: + assert extracted_args == (session_id, level, request_quota) + extracted_args = (session_id, level, request_quota) + data_keys.append(data_key) + sizes.append(size) + session_id, level, request_quota = extracted_args + if level is None: # pragma: no cover + level = self.highest_level + if request_quota: # pragma: no cover + await self.request_quota_with_spill(level, sum(sizes)) + writers = await asyncio.gather( + *[self._clients[level].open_writer(size) for size in sizes] + ) + wrapped_writers = [] + for writer, size, data_key in zip(writers, sizes, data_keys): + wrapped_writers.append( + WrappedStorageFileObject( + writer, + level, + size, + session_id, + data_key, + self._data_manager_ref, + self._clients[level], + ) + ) + return wrapped_writers + + async def _get_meta_api(self, session_id: str): + if self._supervisor_address is None: + cluster_api = await ClusterAPI.create(self.address) + [self._supervisor_address] = await cluster_api.get_supervisors_by_keys( + [session_id] + ) + + return await MetaAPI.create( + session_id=session_id, address=self._supervisor_address + ) + + async def _fetch_remote( + self, + session_id: str, + data_keys: List[Union[str, tuple]], + remote_band: BandType, + error: str, + ): + remote_manager_ref: mo.ActorRefType[DataManagerActor] = await mo.actor_ref( + uid=DataManagerActor.default_uid(), address=remote_band[0] + ) + get_data_infos = [] + for data_key in data_keys: + get_data_infos.append( + remote_manager_ref.get_data_info.delay(session_id, data_key, error) + ) + data_infos = await remote_manager_ref.get_data_info.batch(*get_data_infos) + data_infos, data_keys = zip( + *[ + (data_info, data_key) + for data_info, data_key in zip(data_infos, data_keys) + if data_info is not None + ] + ) + put_data_info_delays = [] + fetch_tasks = [] + for data_info, data_key in zip(data_infos, data_keys): + put_data_info_delays.append( + self._data_manager_ref.put_data_info.delay( + session_id, data_key, data_info, None + ) + ) + fetch_tasks.append( + self._clients[StorageLevel.REMOTE].fetch(data_info.object_id) + ) + await self._data_manager_ref.put_data_info.batch(*put_data_info_delays) + await asyncio.gather(*fetch_tasks) + + async def _fetch_via_transfer( + self, + session_id: str, + data_keys: 
List[Union[str, tuple]], + level: StorageLevel, + remote_band: BandType, + fetch_band_name: str, + error: str, + ): + from .transfer import SenderManagerActor + + logger.debug("Begin to fetch %s from band %s", data_keys, remote_band) + sender_ref: mo.ActorRefType[SenderManagerActor] = await mo.actor_ref( + address=remote_band[0], uid=SenderManagerActor.gen_uid(remote_band[1]) + ) + await sender_ref.send_batch_data( + session_id, + data_keys, + self._data_manager_ref.address, + level, + fetch_band_name, + error=error, + ) + logger.debug("Finish fetching %s from band %s", data_keys, remote_band) + + async def fetch_batch( + self, + session_id: str, + data_keys: List[str], + level: StorageLevel, + band_name: str, + address: str, + error: str, + ): + if error not in ("raise", "ignore"): # pragma: no cover + raise ValueError("error must be raise or ignore") + + meta_api = await self._get_meta_api(session_id) + remote_keys = defaultdict(set) + missing_keys = [] + get_metas = [] + get_info_delays = [] + for data_key in data_keys: + get_info_delays.append( + self._data_manager_ref.get_data_info.delay( + session_id, data_key, band_name, error="ignore" + ) + ) + data_infos = await self._data_manager_ref.get_data_info.batch(*get_info_delays) + pin_delays = [] + for data_key, info in zip(data_keys, data_infos): + # for gpu bands, need transfer between gpu cards + if info is not None: + if band_name and band_name != info.band: + missing_keys.append(data_key) + else: + pin_delays.append( + self._data_manager_ref.pin.delay( + session_id, data_key, self._band_name + ) + ) + else: + # Not exists in local, fetch from remote worker + missing_keys.append(data_key) + if address is None or band_name is None: + # some mapper keys are absent, specify error='ignore' + # remember that meta only records those main keys + get_metas = [ + ( + meta_api.get_chunk_meta.delay( + data_key[0] if isinstance(data_key, tuple) else data_key, + fields=["bands"], + error="ignore", + ) + ) + for data_key in missing_keys + ] + await self._data_manager_ref.pin.batch(*pin_delays) + + if get_metas: + metas = await meta_api.get_chunk_meta.batch(*get_metas) + else: # pragma: no cover + metas = [{"bands": [(address, band_name)]}] * len(missing_keys) + assert len(metas) == len(missing_keys) + for data_key, bands in zip(missing_keys, metas): + if bands is not None: + remote_keys[bands["bands"][0]].add(data_key) + transfer_tasks = [] + fetch_keys = [] + for band, keys in remote_keys.items(): + if StorageLevel.REMOTE in self._quota_refs: + # if storage support remote level, just fetch object id + transfer_tasks.append( + self._fetch_remote(session_id, list(keys), band, error) + ) + else: + # fetch via transfer + transfer_tasks.append( + self._fetch_via_transfer( + session_id, list(keys), level, band, band_name or band[1], error + ) + ) + fetch_keys.extend(list(keys)) + + await asyncio.gather(*transfer_tasks) + + set_meta_keys = set() + for data_key in fetch_keys: + # skip shuffle keys + if isinstance(data_key, tuple): + set_meta_keys.add(data_key[0]) + else: + set_meta_keys.add(data_key) + append_bands_delays = [ + meta_api.add_chunk_bands.delay(key, [(self.address, self._band_name)]) + for key in set_meta_keys + ] + + if append_bands_delays: + await meta_api.add_chunk_bands.batch(*append_bands_delays) + + async def request_quota_with_spill(self, level: StorageLevel, size: int): + if await self._quota_refs[level].request_quota(size): + return + else: + total, used = await self._quota_refs[level].get_quota() + await self.spill(level, int(used 
+ size - total), size) + await self._quota_refs[level].request_quota(size) + logger.debug( + "Spill is triggered, request %s bytes of %s finished", size, level + ) + + async def notify_spillable_space(self, level): + if await self._spill_manager_refs[level].has_spill_task(): + total, used = await self._quota_refs[level].get_quota() + tasks = [] + if total is not None: + spillable_size = await self._data_manager_ref.get_spillable_size( + level, self._band_name + ) + tasks.append( + self._spill_manager_refs[level].notify_spillable_space( + spillable_size, total - used + ) + ) + await asyncio.gather(*tasks) + + async def spill(self, level: StorageLevel, request_size: int, object_size: int): + from .spill import spill + + try: + await spill( + request_size, level, self._band_name, self._data_manager_ref, self + ) + except NoDataToSpill: + logger.warning( + "No data to spill %s bytes, waiting more space", request_size + ) + size = await self._spill_manager_refs[level].wait_for_space(object_size) + await spill(size, level, self._band_name, self._data_manager_ref, self) + + async def list(self, level: StorageLevel) -> List: + return await self._data_manager_ref.list(level, self._band_name) + + @mo.extensible + async def unpin(self, session_id: str, data_key: str, error: str = "raise"): + levels = await self._data_manager_ref.unpin( + session_id, [data_key], self._band_name, error + ) + if levels: + await self.notify_spillable_space(levels[0]) + + @unpin.batch + async def batch_unpin(self, args_list, kwargs_list): + extracted_args = [] + data_keys = [] + for args, kw in zip(args_list, kwargs_list): + session_id, data_key, error = self.unpin.bind(*args, **kw) + if extracted_args: + assert extracted_args == (session_id, error) + extracted_args = session_id, error + data_keys.append(data_key) + if extracted_args: + session_id, error = extracted_args + levels = await self._data_manager_ref.unpin( + session_id, data_keys, self._band_name, error + ) + for level in levels: + await self.notify_spillable_space(level) + + async def get_storage_level_info(self, level: StorageLevel) -> StorageInfo: + quota_ref = self._quota_refs[level] + total_size, used_size = await quota_ref.get_quota() + return StorageInfo( + storage_level=level, + total_size=int(total_size) if total_size else total_size, + used_size=int(used_size), + ) + + async def get_storage_backend_info(self, level: StorageLevel) -> dict: + return self._clients[level].backend_info diff --git a/python/xorbits/_mars/services/storage/spill.py b/python/xorbits/_mars/services/storage/spill.py new file mode 100644 index 000000000..544d77a91 --- /dev/null +++ b/python/xorbits/_mars/services/storage/spill.py @@ -0,0 +1,209 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import List, Tuple + +from ... 
import oscar as mo +from ...storage import StorageLevel +from .core import DataManagerActor +from .errors import NoDataToSpill +from .handler import StorageHandlerActor + +logger = logging.getLogger(__name__) + +DEFAULT_SPILL_BLOCK_SIZE = 128 * 1024 + + +class SpillStrategy(ABC): + @abstractmethod + def record_put_info(self, key, data_size: int): + """ + Record the data key and data size when putting into storage + """ + + @abstractmethod + def record_delete_info(self, key): + """ + Record the data key that is removed from storage + """ + + @abstractmethod + def get_spill_keys(self, size: int) -> Tuple[List, List]: + """ + Return sizes and keys for spilling according to the requested spill size + """ + + +class FIFOStrategy(SpillStrategy): + def __init__(self, level: StorageLevel): + self._level = level + self._data_sizes = dict() + self._pinned_keys = defaultdict(int) + self._spilling_keys = set() + + def record_put_info(self, key, data_size: int): + self._data_sizes[key] = data_size + + def record_delete_info(self, key): + self._data_sizes.pop(key, None) + if key in self._spilling_keys: + self._spilling_keys.remove(key) + + def pin_data(self, key): + self._pinned_keys[key] += 1 + + def unpin_data(self, key): + if key not in self._pinned_keys: + return + self._pinned_keys[key] -= 1 + if self._pinned_keys[key] <= 0: + del self._pinned_keys[key] + + def get_spillable_size(self): + total_size = 0 + for data_key, data_size in self._data_sizes.items(): + if ( + data_key not in self._pinned_keys + and data_key not in self._spilling_keys + ): + total_size += data_size + return total_size + + def get_spill_keys(self, size: int) -> Tuple[List, List]: + spill_sizes = [] + spill_keys = [] + spill_size = 0 + for data_key, data_size in self._data_sizes.items(): + if spill_size >= size: + break + if data_key in self._pinned_keys: + continue + if data_key in self._spilling_keys: + continue + spill_sizes.append(data_size) + spill_keys.append(data_key) + spill_size += data_size + + if spill_size < size: # pragma: no cover + pinned_sizes = dict((k, self._data_sizes[k]) for k in self._pinned_keys) + spilling_keys = dict((k, self._data_sizes[k]) for k in self._spilling_keys) + logger.debug( + "No data can be spilled for level: %s, pinned keys: %s," + " spilling keys: %s", + self._level, + pinned_sizes, + spilling_keys, + ) + raise NoDataToSpill(f"No data can be spilled for level: {self._level}") + self._spilling_keys.update(set(spill_keys)) + return spill_sizes, spill_keys + + +class SpillManagerActor(mo.StatelessActor): + """ + The actor that handles the race condition when NoDataToSpill happens. + Spill raises `NoDataToSpill` in two situations: quota has been allocated but the + objects have not been put into storage yet, or some objects are pinned and cannot + be spilled. When there are not enough objects to spill, we create an asyncio event; + whenever a put or unpin happens we re-check the spillable size, and if it is large + enough for spilling we call event.set() to wake up the waiting spill task.
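+ The concrete flow is: StorageHandlerActor.spill catches `NoDataToSpill` (raised from + `get_spill_keys`) and calls `wait_for_space(size)` on this actor, which creates the event; + later put/unpin operations on the handler invoke `notify_spillable_space` here, which sets + the event once enough spillable bytes become available so that the blocked spill can retry.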
+ """ + + def __init__(self, level: StorageLevel): + self._level = level + self._event = None + self._lock = asyncio.Lock() + + @classmethod + def gen_uid(cls, band_name: str, level: StorageLevel): + return f"spill_manager_{band_name}_{level}" + + def has_spill_task(self): + return self._event is not None + + def notify_spillable_space(self, spillable_size: int, quota_left: int): + event = self._event + if event is None: + return + logger.debug("Notify to check if has space for spilling") + if spillable_size + quota_left > event.size: + logger.debug( + "Check pass, wake up spill task, spill bytes is %s", + event.size - quota_left, + ) + event.size = event.size - quota_left + event.set() + + async def wait_for_space(self, size: int): + # make sure only one spilling task is waiting the event + async with self._lock: + self._event = event = asyncio.Event() + event.size = size + await self._event.wait() + size = self._event.size + self._event = None + return size + + +async def spill( + request_size: int, + level: StorageLevel, + band_name: str, + data_manager: mo.ActorRefType[DataManagerActor], + storage_handler: mo.ActorRefType[StorageHandlerActor], + block_size=None, + multiplier=1.1, +): + logger.debug( + "%s is full, need to spill %s bytes, multiplier is %s", + level, + request_size, + multiplier, + ) + request_size *= multiplier + block_size = block_size or DEFAULT_SPILL_BLOCK_SIZE + spill_level = level.spill_level() + spill_sizes, spill_keys = await data_manager.get_spill_keys( + level, band_name, request_size + ) + logger.debug( + "Decide to spill %s bytes, data keys are %s", sum(spill_sizes), spill_keys + ) + + for (session_id, key), size in zip(spill_keys, spill_sizes): + reader = await storage_handler.open_reader(session_id, key) + writer = await storage_handler.open_writer(session_id, key, size, spill_level) + async with reader: + async with writer: + while True: + block_data = await reader.read(block_size) + if not block_data: + break + else: + await writer.write(block_data) + try: + await storage_handler.delete_object( + session_id, key, size, reader.object_id, level + ) + except KeyError: # pragma: no cover + # workaround for the case that the object + # has been deleted during spill + logger.debug("Data %s %s is deleted during spill", session_id, key) + await storage_handler.delete(session_id, key, error="ignore") + logger.debug("Spill finishes, release %s bytes of %s", sum(spill_sizes), level) diff --git a/python/xorbits/_mars/services/storage/supervisor/__init__.py b/python/xorbits/_mars/services/storage/supervisor/__init__.py new file mode 100644 index 000000000..cb61fbb57 --- /dev/null +++ b/python/xorbits/_mars/services/storage/supervisor/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...core import EmptyService + + +class StorageSupervisorService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/storage/tests/__init__.py b/python/xorbits/_mars/services/storage/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/storage/tests/test_api.py b/python/xorbits/_mars/services/storage/tests/test_api.py new file mode 100644 index 000000000..a0d926339 --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_api.py @@ -0,0 +1,191 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import tempfile + +import numpy as np +import pandas as pd +import pytest + +from .... import oscar as mo +from .... 
import tensor as mt +from ....core import tile +from ....serialization import AioDeserializer, AioSerializer +from ....storage import StorageLevel +from ....tests.core import require_ray +from ....utils import get_next_port, lazy_import +from ...cluster import MockClusterAPI +from ...meta import MockMetaAPI +from ...session import MockSessionAPI +from ...web import WebActor +from ..api import MockStorageAPI, WebStorageAPI + +ray = lazy_import("ray") +vineyard = lazy_import("vineyard") + +require_lib = lambda x: x +storage_configs = [] + +# plasma backend +plasma_storage_size = 10 * 1024 * 1024 +if sys.platform == "darwin": + plasma_dir = "/tmp" +else: + plasma_dir = "/dev/shm" +plasma_setup_params = dict( + store_memory=plasma_storage_size, plasma_directory=plasma_dir, check_dir_size=False +) +if not sys.platform.lower().startswith("win"): + storage_configs.append({"plasma": plasma_setup_params}) + +# ray backend +if ray is not None: + require_lib = require_ray + storage_configs.append({"ray": dict()}) + +# vineyard +if vineyard is not None: + storage_configs.append({"vineyard": dict(vineyard_size="256M")}) + +# shared_memory +storage_configs.append({"shared_memory": dict()}) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("storage_configs", storage_configs) +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +@require_lib +async def test_storage_mock_api(ray_start_regular, storage_configs): + start_method = "fork" if sys.platform != "win32" else None + pool = await mo.create_actor_pool( + "127.0.0.1", + 2, + labels=["main", "numa-0", "io"], + subprocess_start_method=start_method, + ) + async with pool: + session_id = "mock_session_id" + storage_api = await MockStorageAPI.create( + address=pool.external_address, + session_id=session_id, + storage_configs=storage_configs, + ) + + # test put and get + value1 = np.random.rand(10, 10) + await storage_api.put("data1", value1) + get_value1 = await storage_api.get("data1") + np.testing.assert_array_equal(value1, get_value1) + + value2 = pd.DataFrame( + { + "col1": [str(i) for i in range(10)], + "col2": np.random.randint(0, 100, (10,)), + } + ) + await storage_api.put("data2", value2) + get_value2 = await storage_api.get("data2") + pd.testing.assert_frame_equal(value2, get_value2) + + sliced_value = await storage_api.get( + "data2", conditions=[slice(3, 5), slice(None, None)] + ) + pd.testing.assert_frame_equal(value2.iloc[3:5, :], sliced_value) + + infos = await storage_api.get_infos("data2") + assert infos[0].store_size > 0 + + await storage_api.delete("data2") + buffers = await AioSerializer(value2).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + # test open_reader and open_writer + writer = await storage_api.open_writer("write_key", size, StorageLevel.MEMORY) + async with writer: + for buf in buffers: + await writer.write(buf) + + reader = await storage_api.open_reader("write_key") + async with reader: + read_value = await AioDeserializer(reader).run() + + pd.testing.assert_frame_equal(value2, read_value) + + await MockStorageAPI.cleanup(pool.external_address) + + +@pytest.mark.asyncio +async def test_web_storage_api(): + from ..api.web import StorageWebAPIHandler + + tempdir = tempfile.mkdtemp() + start_method = "fork" if sys.platform != "win32" else None + pool = await mo.create_actor_pool( + "127.0.0.1", 1, subprocess_start_method=start_method + ) + async with pool: + session_id = "mock_session_id" + await MockClusterAPI.create(address=pool.external_address) + await 
MockSessionAPI.create( + session_id=session_id, address=pool.external_address + ) + meta_api = await MockMetaAPI.create( + session_id=session_id, address=pool.external_address + ) + await MockStorageAPI.create( + address=pool.external_address, + session_id=session_id, + storage_configs={ + "shared_memory": dict(), + "disk": dict(root_dirs=[tempdir]), + }, + ) + + web_config = { + "port": get_next_port(), + "web_handlers": { + StorageWebAPIHandler.get_root_pattern(): StorageWebAPIHandler + }, + } + await mo.create_actor(WebActor, web_config, address=pool.external_address) + + web_storage_api = WebStorageAPI( + session_id, f'http://127.0.0.1:{web_config["port"]}', "numa-0" + ) + + value = np.random.rand(10, 10) + t = mt.random.rand(10, 10) + t = tile(t) + await meta_api.set_chunk_meta( + t.chunks[0], bands=[(pool.external_address, "numa-0")] + ) + await web_storage_api.put(t.chunks[0].key, value) + + ret_value = await web_storage_api.get(t.chunks[0].key) + np.testing.assert_array_equal(value, ret_value) + + sliced_value = await web_storage_api.get( + t.chunks[0].key, conditions=[slice(3, 5), slice(None, None)] + ) + np.testing.assert_array_equal(value[3:5, :], sliced_value) + + infos = await web_storage_api.get_infos(t.chunks[0].key) + assert len(infos) == 1 + assert infos[0].level == StorageLevel.MEMORY + assert infos[0].memory_size == t.chunks[0].nbytes + + await MockStorageAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) diff --git a/python/xorbits/_mars/services/storage/tests/test_service.py b/python/xorbits/_mars/services/storage/tests/test_service.py new file mode 100644 index 000000000..63fa2285f --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_service.py @@ -0,0 +1,209 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import numpy as np +import pandas as pd +import pytest + +from .... import oscar as mo +from ....resource import Resource +from ....serialization import AioDeserializer, AioSerializer +from ....storage import StorageLevel +from ....tests.core import require_cudf, require_cupy +from ... import NodeRole, start_services, stop_services +from ...cluster import MockClusterAPI +from .. 
import StorageAPI + +_is_windows = sys.platform.lower().startswith("win") + + +@pytest.fixture +async def actor_pools(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + subprocess_start_method=start_method, + labels=["main", "numa-0", "io"], + ) + await pool.start() + return pool + + worker_pool = await start_pool() + try: + yield worker_pool + finally: + await worker_pool.stop() + + +@pytest.mark.asyncio +async def test_storage_service(actor_pools): + worker_pool = actor_pools + + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=10 * 1024 * 1024, plasma_directory=plasma_dir, check_dir_size=False + ) + + config = { + "services": ["storage"], + "storage": { + "backends": ["plasma" if not _is_windows else "shared_memory"], + "plasma": plasma_setup_params, + }, + } + + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + api = await StorageAPI.create("mock_session", worker_pool.external_address) + value1 = np.random.rand(10, 10) + await api.put("data1", value1) + get_value1 = await api.get("data1") + np.testing.assert_array_equal(value1, get_value1) + + # test api in subpool + subpool_address = list(worker_pool._sub_processes.keys())[0] + api2 = await StorageAPI.create("mock_session", subpool_address) + assert api2._storage_handler_ref.address == subpool_address + + get_value1 = await api2.get("data1") + np.testing.assert_array_equal(value1, get_value1) + + sliced_value = await api2.get("data1", conditions=[slice(None, None), slice(0, 4)]) + np.testing.assert_array_equal(value1[:, :4], sliced_value) + + await api.unpin("data1") + + value2 = pd.DataFrame(value1) + await api2.put("data2", value2) + + get_value2 = await api.get("data2") + pd.testing.assert_frame_equal(value2, get_value2) + + # test writer and read + buffers = await AioSerializer(value2).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + # test open_reader and open_writer + writer = await api.open_writer("write_key", size, StorageLevel.MEMORY) + async with writer: + for buf in buffers: + await writer.write(buf) + + reader = await api.open_reader("write_key") + async with reader: + read_value = await AioDeserializer(reader).run() + + pd.testing.assert_frame_equal(value2, read_value) + + await stop_services( + NodeRole.WORKER, address=worker_pool.external_address, config=config + ) + + +@pytest.fixture +async def actor_pools_with_gpu(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=3, + subprocess_start_method=start_method, + labels=["main", "numa-0", "gpu-0", "io"], + ) + await pool.start() + return pool + + worker_pool = await start_pool() + try: + yield worker_pool + finally: + await worker_pool.stop() + + +@require_cupy +@require_cudf +@pytest.mark.asyncio +async def test_storage_service_with_cuda(actor_pools_with_gpu): + import cudf + import cupy + + worker_pool = actor_pools_with_gpu + + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=10 * 1024 * 1024, plasma_directory=plasma_dir, check_dir_size=False + ) + + config = { + "services": ["storage"], + "storage": { + "backends": ["plasma" if not 
_is_windows else "shared_memory", "cuda"], + "plasma": plasma_setup_params, + "cuda": dict(), + }, + } + + await MockClusterAPI.create( + worker_pool.external_address, + band_to_resource={ + "numa-0": Resource(num_cpus=1), + "gpu-0": Resource(num_gpus=1), + }, + use_gpu=True, + ) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + storage_api = await StorageAPI.create( + "mock_session", worker_pool.external_address, band_name="gpu-0" + ) + data1 = cupy.asarray(np.random.rand(10, 10)) + await storage_api.put("mock_cupy_key", data1, level=StorageLevel.GPU) + get_data1 = await storage_api.get("mock_cupy_key") + assert isinstance(get_data1, cupy.ndarray) + cupy.testing.assert_array_equal(data1, get_data1) + + data2 = cudf.DataFrame( + pd.DataFrame( + { + "col1": np.arange(10), + "col2": [f"str{i}" for i in range(10)], + "col3": np.random.rand(10), + }, + ) + ) + await storage_api.put("mock_cudf_key", data2, level=StorageLevel.GPU) + get_data2 = await storage_api.get("mock_cudf_key") + assert isinstance(get_data2, cudf.DataFrame) + cudf.testing.assert_frame_equal(data2, get_data2) + + await MockClusterAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER, config, address=worker_pool.external_address) diff --git a/python/xorbits/_mars/services/storage/tests/test_spill.py b/python/xorbits/_mars/services/storage/tests/test_spill.py new file mode 100644 index 000000000..002799739 --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_spill.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import tempfile + +import numpy as np +import pytest + +from .... 
import oscar as mo +from ....storage import PlasmaStorage, StorageLevel +from ....utils import calc_data_size +from ...cluster import MockClusterAPI +from ...cluster.supervisor.node_info import NodeInfoCollectorActor +from ...cluster.uploader import NodeInfoUploaderActor +from ..core import StorageManagerActor, StorageQuotaActor, build_data_info +from ..handler import StorageHandlerActor + +# todo enable this test module when spill support added +# on storage quotas +if sys.platform.lower().startswith("win"): + pytestmark = pytest.mark.skip + +MEMORY_SIZE = 100 * 1024 + + +@pytest.fixture +async def actor_pool(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + labels=["main", "numa-0", "io"], + subprocess_start_method=start_method, + ) + await pool.start() + return pool + + worker_pool = await start_pool() + try: + yield worker_pool + finally: + await worker_pool.stop() + + +def _build_storage_config(): + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=MEMORY_SIZE, plasma_directory=plasma_dir, check_dir_size=False + ) + tempdir = tempfile.mkdtemp() + disk_setup_params = dict(root_dirs=tempdir, level="disk") + storage_configs = {"plasma": plasma_setup_params, "filesystem": disk_setup_params} + return storage_configs + + +@pytest.fixture +async def create_actors(actor_pool): + _ = await MockClusterAPI.create(address=actor_pool.external_address) + storage_configs = _build_storage_config() + manager_ref = await mo.create_actor( + StorageManagerActor, + storage_configs, + uid=StorageManagerActor.default_uid(), + address=actor_pool.external_address, + ) + + sub_processes = list(actor_pool.sub_processes) + yield actor_pool.external_address, sub_processes[0], sub_processes[1] + await mo.destroy_actor(manager_ref) + + +@pytest.mark.asyncio +async def test_spill(create_actors): + worker_address, _, _ = create_actors + storage_handler = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address + ) + + storage_manager = await mo.actor_ref( + uid=StorageManagerActor.default_uid(), address=worker_address + ) + + init_params = (await storage_manager.get_client_params())["numa-0"] + plasma_init_params = init_params["plasma"] + plasma_handler = PlasmaStorage(**plasma_init_params) + memory_quota = await mo.actor_ref( + StorageQuotaActor, + StorageLevel.MEMORY, + MEMORY_SIZE, + address=worker_address, + uid=StorageQuotaActor.gen_uid("numa-0", StorageLevel.MEMORY), + ) + + # fill to trigger spill + session_id = "mock_session" + data_list = [] + key_list = [] + for i in range(10): + data = np.random.randint(0, 10000, (8000,), np.int16) + key = f"mock_key_{i}" + await storage_handler.put(session_id, key, data, StorageLevel.MEMORY) + used = (await memory_quota.get_quota())[1] + assert used < MEMORY_SIZE + data_list.append(data) + key_list.append(key) + + memory_object_list = await storage_handler.list(StorageLevel.MEMORY) + disk_object_list = await storage_handler.list(StorageLevel.DISK) + assert len(memory_object_list) == 3 + assert len(disk_object_list) == 7 + + for key, data in zip(key_list, data_list): + get_data = await storage_handler.get(session_id, key) + np.testing.assert_array_equal(data, get_data) + + plasma_list = await plasma_handler.list() + assert len(plasma_list) == len(memory_object_list) + + +@pytest.mark.asyncio +async 
def test_disk_info(create_actors): + worker_address, _, _ = create_actors + uploader_ref = await mo.actor_ref( + address=worker_address, uid=NodeInfoUploaderActor.default_uid() + ) + await uploader_ref.upload_node_info() + collector_ref = await mo.actor_ref( + address=worker_address, uid=NodeInfoCollectorActor.default_uid() + ) + storage_manager = await mo.actor_ref( + uid=StorageManagerActor.default_uid(), address=worker_address + ) + init_params = (await storage_manager.get_client_params())["numa-0"] + assert "filesystem" in init_params + assert "level" in init_params["filesystem"] + assert init_params["filesystem"]["level"] == StorageLevel.DISK + + node_info = await collector_ref.get_nodes_info(detail=True) + disk_partitions = node_info[worker_address]["detail"]["disk"]["partitions"] + assert disk_partitions + for _, info in disk_partitions.items(): + assert "inode_used" in info + + +class DelayPutStorageHandler(StorageHandlerActor): + async def put( + self, session_id: str, data_key: str, obj: object, level: StorageLevel + ): + size = calc_data_size(obj) + await self.request_quota_with_spill(level, size) + # sleep to trigger `NoDataToSpill` + await asyncio.sleep(0.5) + object_info = await self._clients[level].put(obj) + data_info = build_data_info(object_info, level, size) + await self._data_manager_ref.put_data_info( + session_id, data_key, data_info, object_info + ) + if object_info.size is not None and data_info.memory_size != object_info.size: + await self._quota_refs[level].update_quota( + object_info.size - data_info.memory_size + ) + await self.notify_spillable_space(level) + return data_info + + +@pytest.fixture +async def create_actors_with_delay(actor_pool): + storage_configs = _build_storage_config() + manager_ref = await mo.create_actor( + StorageManagerActor, + storage_configs, + storage_handler_cls=DelayPutStorageHandler, + uid=StorageManagerActor.default_uid(), + address=actor_pool.external_address, + ) + + sub_processes = list(actor_pool.sub_processes) + yield actor_pool.external_address, sub_processes[0], sub_processes[1] + await mo.destroy_actor(manager_ref) + + +@pytest.mark.asyncio +async def test_spill_event(create_actors_with_delay): + worker_address, sub_pool_address1, sub_pool_address2 = create_actors_with_delay + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=sub_pool_address1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=sub_pool_address2 + ) + # total store size is 65536, single data size is around 40000 + # we put two data simultaneously + data = np.random.randint(0, 10000, (5000,)) + session_id = "mock_session" + key1 = "mock_key1" + key2 = "mock_key2" + put1 = asyncio.create_task( + storage_handler1.put(session_id, key1, data, StorageLevel.MEMORY) + ) + put2 = asyncio.create_task( + storage_handler2.put(session_id, key2, data, StorageLevel.MEMORY) + ) + await asyncio.gather(put1, put2) + + get_data = await storage_handler2.get(session_id, key1) + np.testing.assert_array_equal(data, get_data) + get_data = await storage_handler1.get(session_id, key2) + np.testing.assert_array_equal(data, get_data) + + memory_object_list = await storage_handler1.list(StorageLevel.MEMORY) + disk_object_list = await storage_handler1.list(StorageLevel.DISK) + assert len(memory_object_list) == 1 + assert len(disk_object_list) == 1 diff --git a/python/xorbits/_mars/services/storage/tests/test_transfer.py b/python/xorbits/_mars/services/storage/tests/test_transfer.py new file mode 
100644 index 000000000..2e19be19c --- /dev/null +++ b/python/xorbits/_mars/services/storage/tests/test_transfer.py @@ -0,0 +1,322 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys + +import numpy as np +import pandas as pd +import pytest + +from .... import oscar as mo +from ....oscar.backends.allocate_strategy import IdleLabel +from ....storage import StorageLevel +from ..core import DataManagerActor, StorageManagerActor, StorageQuotaActor +from ..errors import DataNotExist +from ..handler import StorageHandlerActor +from ..transfer import ReceiverManagerActor, SenderManagerActor + +_is_windows = sys.platform.lower().startswith("win") + + +@pytest.fixture +async def actor_pools(): + async def start_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=2, + labels=["main", "numa-0", "io"], + subprocess_start_method=start_method, + ) + await pool.start() + return pool + + worker_pool_1 = await start_pool() + worker_pool_2 = await start_pool() + try: + yield worker_pool_1, worker_pool_2 + finally: + await worker_pool_1.stop() + await worker_pool_2.stop() + + +@pytest.fixture +async def create_actors(actor_pools): + worker_pool_1, worker_pool_2 = actor_pools + + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + plasma_setup_params = dict( + store_memory=5 * 1024 * 1024, plasma_directory=plasma_dir, check_dir_size=False + ) + storage_configs = ( + {"plasma": plasma_setup_params} if not _is_windows else {"shared_memory": {}} + ) + + manager_ref1 = await mo.create_actor( + StorageManagerActor, + storage_configs, + uid=StorageManagerActor.default_uid(), + address=worker_pool_1.external_address, + ) + + manager_ref2 = await mo.create_actor( + StorageManagerActor, + storage_configs, + uid=StorageManagerActor.default_uid(), + address=worker_pool_2.external_address, + ) + yield worker_pool_1.external_address, worker_pool_2.external_address + await mo.destroy_actor(manager_ref1) + await mo.destroy_actor(manager_ref2) + + +@pytest.mark.asyncio +async def test_simple_transfer(create_actors): + worker_address_1, worker_address_2 = create_actors + + session_id = "mock_session" + data1 = np.random.rand(100, 100) + data2 = pd.DataFrame(np.random.randint(0, 100, (500, 10))) + + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_2 + ) + + await storage_handler1.put(session_id, "data_key1", data1, StorageLevel.MEMORY) + await storage_handler1.put(session_id, "data_key2", data2, StorageLevel.MEMORY) + await storage_handler2.put(session_id, "data_key3", data2, StorageLevel.MEMORY) + + sender_actor = await mo.actor_ref( + address=worker_address_1, uid=SenderManagerActor.gen_uid("numa-0") + ) + + # send 
data to worker2 from worker1 + await sender_actor.send_batch_data( + session_id, + ["data_key1"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + + await sender_actor.send_batch_data( + session_id, + ["data_key2"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + + get_data1 = await storage_handler2.get(session_id, "data_key1") + np.testing.assert_array_equal(data1, get_data1) + + get_data2 = await storage_handler2.get(session_id, "data_key2") + pd.testing.assert_frame_equal(data2, get_data2) + + # send data to worker1 from worker2 + sender_actor = await mo.actor_ref( + address=worker_address_2, uid=SenderManagerActor.gen_uid("numa-0") + ) + await sender_actor.send_batch_data( + session_id, ["data_key3"], worker_address_1, StorageLevel.MEMORY + ) + get_data3 = await storage_handler1.get(session_id, "data_key3") + pd.testing.assert_frame_equal(data2, get_data3) + + +# test for cancelling happens when writing +class MockReceiverManagerActor(ReceiverManagerActor): + async def do_write(self, *args, **kw): + await asyncio.sleep(3) + await super().do_write(*args, **kw) + + +class MockSenderManagerActor(SenderManagerActor): + @staticmethod + async def get_receiver_ref(address: str, band_name: str): + return await mo.actor_ref( + address=address, uid=MockReceiverManagerActor.default_uid() + ) + + +# test for cancelling happens when creating writer +class MockReceiverManagerActor2(ReceiverManagerActor): + async def create_writers(self, session_id, data_keys, data_sizes, level, sub_infos): + await asyncio.sleep(3) + return await super().create_writers( + session_id, data_keys, data_sizes, level, sub_infos + ) + + +class MockSenderManagerActor2(SenderManagerActor): + @staticmethod + async def get_receiver_ref(address: str, band_name: str): + return await mo.actor_ref( + address=address, uid=MockReceiverManagerActor2.default_uid() + ) + + +@pytest.mark.parametrize( + "mock_sender, mock_receiver", + [ + (MockSenderManagerActor, MockReceiverManagerActor), + (MockSenderManagerActor2, MockReceiverManagerActor2), + ], +) +@pytest.mark.asyncio +async def test_cancel_transfer(create_actors, mock_sender, mock_receiver): + worker_address_1, worker_address_2 = create_actors + + quota_refs = { + StorageLevel.MEMORY: await mo.actor_ref( + StorageQuotaActor, + StorageLevel.MEMORY, + 5 * 1024 * 1024, + address=worker_address_2, + uid=StorageQuotaActor.gen_uid("numa-0", StorageLevel.MEMORY), + ) + } + data_manager_ref = await mo.actor_ref( + uid=DataManagerActor.default_uid(), address=worker_address_1 + ) + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_2 + ) + + sender_actor = await mo.create_actor( + mock_sender, + data_manager_ref=data_manager_ref, + uid=mock_sender.default_uid(), + address=worker_address_1, + allocate_strategy=IdleLabel("io", "mock_sender"), + ) + await mo.create_actor( + mock_receiver, + quota_refs, + uid=mock_receiver.default_uid(), + address=worker_address_2, + allocate_strategy=IdleLabel("io", "mock_receiver"), + ) + + data1 = np.random.rand(10, 10) + await storage_handler1.put("mock", "data_key1", data1, StorageLevel.MEMORY) + data2 = pd.DataFrame(np.random.rand(100, 100)) + await storage_handler1.put("mock", "data_key2", data2, StorageLevel.MEMORY) + + used_before = (await quota_refs[StorageLevel.MEMORY].get_quota())[1] + + send_task = asyncio.create_task( + 
sender_actor.send_batch_data( + "mock", ["data_key1"], worker_address_2, StorageLevel.MEMORY + ) + ) + + await asyncio.sleep(0.5) + send_task.cancel() + + with pytest.raises(asyncio.CancelledError): + await send_task + + used = (await quota_refs[StorageLevel.MEMORY].get_quota())[1] + assert used == used_before + + with pytest.raises(DataNotExist): + await storage_handler2.get("mock", "data_key1") + + send_task = asyncio.create_task( + sender_actor.send_batch_data( + "mock", ["data_key1"], worker_address_2, StorageLevel.MEMORY + ) + ) + await send_task + get_data = await storage_handler2.get("mock", "data_key1") + np.testing.assert_array_equal(data1, get_data) + + # cancel when fetch the same data Simultaneously + if mock_sender is MockSenderManagerActor: + send_task1 = asyncio.create_task( + sender_actor.send_batch_data( + "mock", ["data_key2"], worker_address_2, StorageLevel.MEMORY + ) + ) + send_task2 = asyncio.create_task( + sender_actor.send_batch_data( + "mock", ["data_key2"], worker_address_2, StorageLevel.MEMORY + ) + ) + await asyncio.sleep(0.5) + send_task1.cancel() + with pytest.raises(asyncio.CancelledError): + await send_task1 + await send_task2 + get_data2 = await storage_handler2.get("mock", "data_key2") + pd.testing.assert_frame_equal(get_data2, data2) + + +@pytest.mark.asyncio +async def test_transfer_same_data(create_actors): + worker_address_1, worker_address_2 = create_actors + + session_id = "mock_session" + data1 = np.random.rand(100, 100) + storage_handler1 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_1 + ) + storage_handler2 = await mo.actor_ref( + uid=StorageHandlerActor.gen_uid("numa-0"), address=worker_address_2 + ) + + await storage_handler1.put(session_id, "data_key1", data1, StorageLevel.MEMORY) + sender_actor = await mo.actor_ref( + address=worker_address_1, uid=SenderManagerActor.gen_uid("numa-0") + ) + + # send data to worker2 from worker1 + task1 = asyncio.create_task( + sender_actor.send_batch_data( + session_id, + ["data_key1"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + ) + task2 = asyncio.create_task( + sender_actor.send_batch_data( + session_id, + ["data_key1"], + worker_address_2, + StorageLevel.MEMORY, + block_size=1000, + ) + ) + await asyncio.gather(task1, task2) + get_data1 = await storage_handler2.get(session_id, "data_key1") + np.testing.assert_array_equal(data1, get_data1) diff --git a/python/xorbits/_mars/services/storage/transfer.py b/python/xorbits/_mars/services/storage/transfer.py new file mode 100644 index 000000000..3af253cf9 --- /dev/null +++ b/python/xorbits/_mars/services/storage/transfer.py @@ -0,0 +1,351 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from dataclasses import dataclass +from typing import Dict, List + +from ... 
import oscar as mo +from ...lib.aio import alru_cache +from ...storage import StorageLevel +from ...utils import dataslots +from .core import DataManagerActor, WrappedStorageFileObject +from .handler import StorageHandlerActor + +DEFAULT_TRANSFER_BLOCK_SIZE = 4 * 1024**2 + + +logger = logging.getLogger(__name__) + + +class SenderManagerActor(mo.StatelessActor): + def __init__( + self, + band_name: str = "numa-0", + transfer_block_size: int = None, + data_manager_ref: mo.ActorRefType[DataManagerActor] = None, + storage_handler_ref: mo.ActorRefType[StorageHandlerActor] = None, + ): + self._band_name = band_name + self._data_manager_ref = data_manager_ref + self._storage_handler = storage_handler_ref + self._transfer_block_size = transfer_block_size or DEFAULT_TRANSFER_BLOCK_SIZE + + @classmethod + def gen_uid(cls, band_name: str): + return f"sender_manager_{band_name}" + + async def __post_create__(self): + if self._storage_handler is None: # for test + self._storage_handler = await mo.actor_ref( + self.address, StorageHandlerActor.gen_uid("numa-0") + ) + + @staticmethod + @alru_cache + async def get_receiver_ref(address: str, band_name: str): + return await mo.actor_ref( + address=address, uid=ReceiverManagerActor.gen_uid(band_name) + ) + + async def _send_data( + self, + receiver_ref: mo.ActorRefType["ReceiverManagerActor"], + session_id: str, + data_keys: List[str], + block_size: int, + ): + class BufferedSender: + def __init__(self): + self._buffers = [] + self._send_keys = [] + self._eof_marks = [] + + async def flush(self): + if self._buffers: + await receiver_ref.receive_part_data( + self._buffers, session_id, self._send_keys, self._eof_marks + ) + + self._buffers = [] + self._send_keys = [] + self._eof_marks = [] + + async def send(self, buffer, eof_mark, key): + self._eof_marks.append(eof_mark) + self._buffers.append(buffer) + self._send_keys.append(key) + if sum(len(b) for b in self._buffers) >= block_size: + await self.flush() + + sender = BufferedSender() + open_reader_tasks = [] + for data_key in data_keys: + open_reader_tasks.append( + self._storage_handler.open_reader.delay(session_id, data_key) + ) + readers = await self._storage_handler.open_reader.batch(*open_reader_tasks) + + for data_key, reader in zip(data_keys, readers): + while True: + part_data = await reader.read(block_size) + # Notes on [How to decide whether the reader reaches EOF?] + # + # In some storage backend, e.g., the reported memory usage (i.e., the + # `store_size`) may not same with the byte size that need to be transferred + # when moving to a remote worker. Thus, we think the reader reaches EOF + # when a `read` request returns nothing, rather than comparing the `sent_size` + # and the `store_size`. 
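+ # For example, a backend may pad or page-align its allocations, so the store_size it + # reports can differ from the length of the serialized stream read back here; an empty + # read is therefore a more reliable end-of-stream signal than a byte count.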
+ # + is_eof = not part_data # can be non-empty bytes, empty bytes and None + await sender.send(part_data, is_eof, data_key) + if is_eof: + break + await sender.flush() + + @mo.extensible + async def send_batch_data( + self, + session_id: str, + data_keys: List[str], + address: str, + level: StorageLevel, + band_name: str = "numa-0", + block_size: int = None, + error: str = "raise", + ): + logger.debug( + "Begin to send data (%s, %s) to %s", session_id, data_keys, address + ) + + tasks = [] + for key in data_keys: + tasks.append(self._data_manager_ref.get_store_key.delay(session_id, key)) + data_keys = await self._data_manager_ref.get_store_key.batch(*tasks) + data_keys = list(set(data_keys)) + sub_infos = await self._data_manager_ref.get_sub_infos.batch( + *[ + self._data_manager_ref.get_sub_infos.delay(session_id, key) + for key in data_keys + ] + ) + + block_size = block_size or self._transfer_block_size + receiver_ref: mo.ActorRefType[ + ReceiverManagerActor + ] = await self.get_receiver_ref(address, band_name) + get_infos = [] + pin_tasks = [] + for data_key in data_keys: + get_infos.append( + self._data_manager_ref.get_data_info.delay( + session_id, data_key, self._band_name, error + ) + ) + pin_tasks.append( + self._data_manager_ref.pin.delay( + session_id, data_key, self._band_name, error + ) + ) + await self._data_manager_ref.pin.batch(*pin_tasks) + infos = await self._data_manager_ref.get_data_info.batch(*get_infos) + filtered = [ + (data_info, data_key) + for data_info, data_key in zip(infos, data_keys) + if data_info is not None + ] + if filtered: + infos, data_keys = zip(*filtered) + else: # pragma: no cover + # no data to be transferred + return + data_sizes = [info.store_size for info in infos] + if level is None: + level = infos[0].level + is_transferring_list = await receiver_ref.open_writers( + session_id, data_keys, data_sizes, level, sub_infos + ) + to_send_keys = [] + to_wait_keys = [] + for data_key, is_transferring in zip(data_keys, is_transferring_list): + if is_transferring: + to_wait_keys.append(data_key) + else: + to_send_keys.append(data_key) + + if to_send_keys: + await self._send_data(receiver_ref, session_id, to_send_keys, block_size) + if to_wait_keys: + await receiver_ref.wait_transfer_done(session_id, to_wait_keys) + unpin_tasks = [] + for data_key in data_keys: + unpin_tasks.append( + self._data_manager_ref.unpin.delay( + session_id, [data_key], self._band_name, error="ignore" + ) + ) + await self._data_manager_ref.unpin.batch(*unpin_tasks) + logger.debug( + "Finish sending data (%s, %s) to %s, total size is %s", + session_id, + data_keys, + address, + sum(data_sizes), + ) + + +@dataslots +@dataclass +class WritingInfo: + writer: WrappedStorageFileObject + size: int + level: StorageLevel + event: asyncio.Event + ref_counts: int + + +class ReceiverManagerActor(mo.StatelessActor): + def __init__( + self, + quota_refs: Dict, + storage_handler_ref: mo.ActorRefType[StorageHandlerActor] = None, + ): + self._quota_refs = quota_refs + self._storage_handler = storage_handler_ref + self._writing_infos: Dict[tuple, WritingInfo] = dict() + self._lock = asyncio.Lock() + + async def __post_create__(self): + if self._storage_handler is None: # for test + self._storage_handler = await mo.actor_ref( + self.address, StorageHandlerActor.gen_uid("numa-0") + ) + + @classmethod + def gen_uid(cls, band_name: str): + return f"receiver_manager_{band_name}" + + def _decref_writing_key(self, session_id: str, data_key: str): + self._writing_infos[(session_id, data_key)].ref_counts 
-= 1 + if self._writing_infos[(session_id, data_key)].ref_counts == 0: + del self._writing_infos[(session_id, data_key)] + + async def create_writers( + self, + session_id: str, + data_keys: List[str], + data_sizes: List[int], + level: StorageLevel, + sub_infos: List, + ): + tasks = dict() + key_to_sub_infos = dict() + data_key_to_size = dict() + being_processed = [] + for data_key, data_size, sub_info in zip(data_keys, data_sizes, sub_infos): + data_key_to_size[data_key] = data_size + if (session_id, data_key) not in self._writing_infos: + being_processed.append(False) + tasks[data_key] = self._storage_handler.open_writer.delay( + session_id, data_key, data_size, level, request_quota=False + ) + key_to_sub_infos[data_key] = sub_info + else: + being_processed.append(True) + self._writing_infos[(session_id, data_key)].ref_counts += 1 + if tasks: + writers = await self._storage_handler.open_writer.batch( + *tuple(tasks.values()) + ) + for data_key, writer in zip(tasks, writers): + self._writing_infos[(session_id, data_key)] = WritingInfo( + writer, data_key_to_size[data_key], level, asyncio.Event(), 1 + ) + if key_to_sub_infos[data_key] is not None: + writer._sub_key_infos = key_to_sub_infos[data_key] + return being_processed + + async def open_writers( + self, + session_id: str, + data_keys: List[str], + data_sizes: List[int], + level: StorageLevel, + sub_infos: List, + ): + async with self._lock: + await self._storage_handler.request_quota_with_spill(level, sum(data_sizes)) + future = asyncio.create_task( + self.create_writers(session_id, data_keys, data_sizes, level, sub_infos) + ) + try: + return await future + except asyncio.CancelledError: + await self._quota_refs[level].release_quota(sum(data_sizes)) + future.cancel() + raise + + async def do_write( + self, data: list, session_id: str, data_keys: List[str], eof_marks: List[bool] + ): + # close may be a high-cost operation, use create_task + close_tasks = [] + finished_keys = [] + for data, data_key, is_eof in zip(data, data_keys, eof_marks): + writer = self._writing_infos[(session_id, data_key)].writer + if data: + await writer.write(data) + if is_eof: + close_tasks.append(writer.close()) + finished_keys.append(data_key) + await asyncio.gather(*close_tasks) + async with self._lock: + for data_key in finished_keys: + event = self._writing_infos[(session_id, data_key)].event + event.set() + self._decref_writing_key(session_id, data_key) + + async def receive_part_data( + self, data: list, session_id: str, data_keys: List[str], eof_marks: List[bool] + ): + write_task = asyncio.create_task( + self.do_write(data, session_id, data_keys, eof_marks) + ) + try: + await asyncio.shield(write_task) + except asyncio.CancelledError: + async with self._lock: + for data_key in data_keys: + if (session_id, data_key) in self._writing_infos: + if self._writing_infos[(session_id, data_key)].ref_counts == 1: + info = self._writing_infos[(session_id, data_key)] + await self._quota_refs[info.level].release_quota(info.size) + await self._storage_handler.delete( + session_id, data_key, error="ignore" + ) + await info.writer.clean_up() + info.event.set() + self._decref_writing_key(session_id, data_key) + write_task.cancel() + await write_task + raise + + async def wait_transfer_done(self, session_id, data_keys): + await asyncio.gather( + *[self._writing_infos[(session_id, key)].event.wait() for key in data_keys] + ) + async with self._lock: + for data_key in data_keys: + self._decref_writing_key(session_id, data_key) diff --git 
a/python/xorbits/_mars/services/storage/worker/__init__.py b/python/xorbits/_mars/services/storage/worker/__init__.py new file mode 100644 index 000000000..d128f2194 --- /dev/null +++ b/python/xorbits/_mars/services/storage/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import StorageWorkerService diff --git a/python/xorbits/_mars/services/storage/worker/service.py b/python/xorbits/_mars/services/storage/worker/service.py new file mode 100644 index 000000000..d47d25225 --- /dev/null +++ b/python/xorbits/_mars/services/storage/worker/service.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from ..core import StorageManagerActor + + +class StorageWorkerService(AbstractService): + """ + Storage service on worker + + Service Configuration + --------------------- + { + "storage": { + "backends": ["plasma"], + "": "", + } + } + """ + + async def start(self): + storage_configs = self._config["storage"] + backends = storage_configs.get("backends") + options = storage_configs.get("default_config", dict()) + transfer_block_size = options.get("transfer_block_size", None) + backend_config = {} + for backend in backends: + storage_config = storage_configs.get(backend, dict()) + backend_config[backend] = storage_config + if backend == "ray": + # Specifying the supervisor as the Ray owner would be costly when Mars does a shuffle, since there + # would be m*n objects that need the supervisor as their owner. So this is enabled only when autoscaling + # is on, to avoid losing data when scaling in. This limit can be removed once Ray supports ownership transfer. + if ( + self._config.get("scheduling", {}) + .get("autoscale", {}) + .get("enabled", False) + ): + try: + from ...cluster.api import ClusterAPI + + cluster_api = await ClusterAPI.create(self._address) + supervisor_address = (await cluster_api.get_supervisors())[0] + # The Ray storage backend needs to set the supervisor as owner to avoid losing data when a worker dies.
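+ # (Editorial note) this path is only reached with a config shaped roughly like
+ # {"storage": {"backends": ["ray"], ...}, "scheduling": {"autoscale": {"enabled": True}}};
+ # otherwise, or when no supervisor can be found, the worker's own address is
+ # used as the owner below.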
+ owner = supervisor_address + except mo.ActorNotExist: + owner = self._address + else: + owner = self._address + storage_config["owner"] = owner + + await mo.create_actor( + StorageManagerActor, + backend_config, + transfer_block_size, + uid=StorageManagerActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + address=self._address, uid=StorageManagerActor.default_uid() + ) + ) diff --git a/python/xorbits/_mars/services/subtask/__init__.py b/python/xorbits/_mars/services/subtask/__init__.py new file mode 100644 index 000000000..bc8e89895 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import MockSubtaskAPI, SubtaskAPI +from .core import Subtask, SubtaskGraph, SubtaskResult, SubtaskStatus +from .errors import SlotOccupiedAlready, SubtaskNotExist diff --git a/python/xorbits/_mars/services/subtask/api.py b/python/xorbits/_mars/services/subtask/api.py new file mode 100644 index 000000000..f2c3b83d0 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/api.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import oscar as mo +from ...lib.aio import alru_cache +from ...oscar.backends.context import ProfilingContext +from ...oscar.profiling import MARS_ENABLE_PROFILING +from .core import Subtask + + +class SubtaskAPI: + def __init__(self, address: str): + self._address = address + + @classmethod + async def create(cls, address: str) -> "SubtaskAPI": + return SubtaskAPI(address) + + @alru_cache(cache_exceptions=False) + async def _get_runner_ref(self, band_name: str, slot_id: int): + from .worker.runner import SubtaskRunnerActor + + return await mo.actor_ref( + SubtaskRunnerActor.gen_uid(band_name, slot_id), address=self._address + ) + + @alru_cache(cache_exceptions=False) + async def _get_subtask_processor_ref(self, session_id: str, slot_address: str): + from .worker.processor import SubtaskProcessorActor + + return await mo.actor_ref( + SubtaskProcessorActor.gen_uid(session_id), address=slot_address + ) + + async def run_subtask_in_slot(self, band_name: str, slot_id: int, subtask: Subtask): + """ + Run subtask in current worker + + Parameters + ---------- + band_name + subtask + slot_id + + Returns + ------- + + """ + ref = await self._get_runner_ref(band_name, slot_id) + extra_config = subtask.extra_config + enable_profiling = MARS_ENABLE_PROFILING or ( + extra_config and extra_config.get("enable_profiling") + ) + profiling_context = ( + ProfilingContext(task_id=subtask.task_id) if enable_profiling else None + ) + return await ref.run_subtask.options(profiling_context=profiling_context).send( + subtask + ) + + async def cancel_subtask_in_slot(self, band_name: str, slot_id: int): + """ + Cancel subtask running in a worker slot and wait until it is cancelled + + Parameters + ---------- + band_name : str + name of a worker band, for instance, 'numa-0' + slot_id : int + index of a slot in a band + """ + ref = await self._get_runner_ref(band_name, slot_id) + await ref.cancel_subtask() + + async def set_running_operand_progress( + self, session_id: str, op_key: str, slot_address: str, progress: float + ): + ref = await self._get_subtask_processor_ref(session_id, slot_address) + await ref.set_running_op_progress(op_key, progress) + + +class MockSubtaskAPI(SubtaskAPI): + @classmethod + async def create(cls, address: str) -> "SubtaskAPI": + from .worker.manager import SubtaskRunnerManagerActor + + await mo.create_actor( + SubtaskRunnerManagerActor, + address, + None, + uid=SubtaskRunnerManagerActor.default_uid(), + address=address, + ) + return await super().create(address) + + @classmethod + async def cleanup(cls, address: str): + from .worker.manager import SubtaskRunnerManagerActor + + await mo.destroy_actor( + mo.create_actor_ref( + uid=SubtaskRunnerManagerActor.default_uid(), address=address + ) + ) diff --git a/python/xorbits/_mars/services/subtask/core.py b/python/xorbits/_mars/services/subtask/core.py new file mode 100644 index 000000000..a7c862231 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/core.py @@ -0,0 +1,223 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum +from typing import Iterable, List, Optional, Set, Tuple + +from ...core import DAG, ChunkData, ChunkGraph +from ...resource import Resource +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Float64Field, + Int32Field, + Int64Field, + ListField, + ReferenceField, + Serializable, + StringField, + TupleField, +) +from ...serialization.serializables.field_type import TupleType +from ...typing import BandType, ChunkType + + +class SubtaskStatus(Enum): + pending = 0 + running = 1 + succeeded = 2 + errored = 3 + cancelled = 4 + + @property + def is_done(self) -> bool: + return self in ( + SubtaskStatus.succeeded, + SubtaskStatus.errored, + SubtaskStatus.cancelled, + ) + + +class Subtask(Serializable): + __slots__ = ("_repr", "_pure_depend_keys", "runtime") + + subtask_id: str = StringField("subtask_id") + subtask_name: str = StringField("subtask_name") + session_id: str = StringField("session_id") + task_id: str = StringField("task_id") + chunk_graph: ChunkGraph = ReferenceField("chunk_graph", ChunkGraph) + expect_bands: List[BandType] = ListField( + "expect_bands", TupleType(FieldTypes.string, FieldTypes.string) + ) + virtual: bool = BoolField("virtual") + retryable: bool = BoolField("retryable") + priority: Tuple[int, int] = TupleField("priority", FieldTypes.int32) + extra_config: dict = DictField("extra_config") + stage_id: str = StringField("stage_id") + # chunks that need meta updated + update_meta_chunks: List[ChunkType] = ListField( + "update_meta_chunks", FieldTypes.reference(ChunkData) + ) + # A unique and deterministic key for subtask compute logic. See logic_key in operator.py. + logic_key: str = StringField("logic_key") + # index for subtask with same compute logic. + logic_index: int = Int32Field("logic_index") + # parallelism for subtask with same compute logic. + logic_parallelism: int = Int32Field("logic_parallelism") + # subtask can only run in specified bands in `expect_bands` + bands_specified: bool = BoolField("bands_specified") + required_resource: Resource = AnyField("required_resource", Resource) + # The count of result chunks that are the stage's results. 
+ stage_n_outputs: int = Int32Field("stage_n_outputs") + + def __init__( + self, + subtask_id: str = None, + session_id: str = None, + task_id: str = None, + chunk_graph: ChunkGraph = None, + subtask_name: str = None, + expect_bands: List[BandType] = None, + priority: Tuple[int, int] = None, + virtual: bool = False, + retryable: bool = True, + extra_config: dict = None, + stage_id: str = None, + update_meta_chunks: List[ChunkType] = None, + logic_key: str = None, + logic_index: int = None, + logic_parallelism: int = None, + bands_specified: bool = False, + required_resource: Resource = None, + stage_n_outputs: int = 0, + ): + super().__init__( + subtask_id=subtask_id, + subtask_name=subtask_name, + session_id=session_id, + task_id=task_id, + chunk_graph=chunk_graph, + expect_bands=expect_bands, + priority=priority, + virtual=virtual, + retryable=retryable, + extra_config=extra_config, + stage_id=stage_id, + update_meta_chunks=update_meta_chunks, + logic_key=logic_key, + logic_index=logic_index, + logic_parallelism=logic_parallelism, + bands_specified=bands_specified, + required_resource=required_resource, + stage_n_outputs=stage_n_outputs, + ) + self._pure_depend_keys = None + self._repr = None + self.runtime = None + + def __on_deserialize__(self): + super(Subtask, self).__on_deserialize__() + self._pure_depend_keys = None + self._repr = None + self.runtime = None + + @property + def expect_band(self): + if self.expect_bands: + return self.expect_bands[0] + + @property + def pure_depend_keys(self) -> Set[str]: + if self._pure_depend_keys is not None: + return self._pure_depend_keys + pure_dep_keys = set() + for n in self.chunk_graph: + pure_dep_keys.update( + inp.key + for inp, pure_dep in zip(n.inputs, n.op.pure_depends) + if pure_dep + ) + self._pure_depend_keys = pure_dep_keys + return pure_dep_keys + + def __repr__(self): + if self._repr is not None: + return self._repr + + if self.chunk_graph: + result_chunk_repr = " ".join( + [ + f"{type(chunk.op).__name__}({chunk.key})" + for chunk in self.chunk_graph.result_chunks + ] + ) + else: # pragma: no cover + result_chunk_repr = None + self._repr = f"" + return self._repr + + +class SubtaskResult(Serializable): + subtask_id: str = StringField("subtask_id") + session_id: str = StringField("session_id") + task_id: str = StringField("task_id") + stage_id: str = StringField("stage_id") + status: SubtaskStatus = ReferenceField("status", SubtaskStatus) + progress: float = Float64Field("progress", default=0.0) + data_size: int = Int64Field("data_size", default=None) + bands: List[BandType] = ListField("band", FieldTypes.tuple, default=None) + error = AnyField("error", default=None) + traceback = AnyField("traceback", default=None) + # The following is the execution information of the subtask + execution_start_time: float = Float64Field("execution_start_time") + execution_end_time: float = Float64Field("execution_end_time") + + def update(self, result: Optional["SubtaskResult"]): + if result and result.bands: + bands = self.bands or [] + self.bands = sorted(set(bands + result.bands)) + self.execution_start_time = result.execution_start_time + if hasattr(result, "execution_end_time"): + self.execution_end_time = result.execution_end_time + return self + + +class SubtaskGraph(DAG, Iterable[Subtask]): + """ + Subtask graph. 
+ """ + + def __init__(self): + super().__init__() + self._proxy_subtasks = [] + + @classmethod + def _extract_operands(cls, node: Subtask): + from ...core.operand import Fetch, FetchShuffle + + for node in node.chunk_graph: + if isinstance(node.op, (Fetch, FetchShuffle)): + continue + yield node.op + + def add_shuffle_proxy_subtask(self, proxy_subtask): + self._proxy_subtasks.append(proxy_subtask) + + def num_shuffles(self) -> int: + return len(self._proxy_subtasks) + + def get_shuffle_proxy_subtasks(self): + return self._proxy_subtasks diff --git a/python/xorbits/_mars/services/subtask/errors.py b/python/xorbits/_mars/services/subtask/errors.py new file mode 100644 index 000000000..c27607ad6 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/errors.py @@ -0,0 +1,21 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class SubtaskNotExist(Exception): + pass + + +class SlotOccupiedAlready(Exception): + pass diff --git a/python/xorbits/_mars/services/subtask/supervisor/__init__.py b/python/xorbits/_mars/services/subtask/supervisor/__init__.py new file mode 100644 index 000000000..62c018d71 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/supervisor/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import EmptyService + + +class SubtaskSupervisorService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/subtask/tests/__init__.py b/python/xorbits/_mars/services/subtask/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/services/subtask/tests/test_service.py b/python/xorbits/_mars/services/subtask/tests/test_service.py new file mode 100644 index 000000000..928aab651 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/tests/test_service.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import time + +import numpy as np +import pytest + +from .... import oscar as mo +from .... import remote as mr +from .... import tensor as mt +from ....core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from ....resource import Resource +from ....utils import Timer +from ... import NodeRole, start_services, stop_services +from ...meta import MetaAPI +from ...session import SessionAPI +from ...storage import MockStorageAPI +from ...task import new_task_id +from ...task.supervisor.manager import TaskManagerActor +from .. import Subtask, SubtaskAPI, SubtaskResult + + +class FakeTaskManager(TaskManagerActor): + def set_subtask_result(self, subtask_result: SubtaskResult): + return + + +def _gen_subtask(t, session_id): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + subtask = Subtask(new_task_id(), session_id, new_task_id(), chunk_graph) + + return subtask + + +@pytest.fixture +async def actor_pools(): + async def start_pool(is_worker: bool): + if is_worker: + kw = dict( + n_process=2, + labels=["main"] + ["numa-0"] * 2, + subprocess_start_method="spawn", + ) + else: + kw = dict(n_process=0, subprocess_start_method="spawn") + pool = await mo.create_actor_pool("127.0.0.1", **kw) + await pool.start() + return pool + + try: + sv_pool, worker_pool = await asyncio.gather(start_pool(False), start_pool(True)) + yield sv_pool, worker_pool + finally: + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +@pytest.mark.asyncio +async def test_subtask_service(actor_pools): + sv_pool, worker_pool = actor_pools + + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": sv_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + "scheduling": {}, + "subtask": {}, + } + await start_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(sv_pool.external_address) + await session_api.create_session(session_id) + ref = await mo.actor_ref( + FakeTaskManager.gen_uid(session_id), address=sv_pool.external_address + ) + await mo.destroy_actor(ref) + await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=sv_pool.external_address, + ) + + subtask_api = await 
SubtaskAPI.create(worker_pool.external_address) + # create mock meta and storage APIs + meta_api = await MetaAPI.create(session_id, sv_pool.external_address) + storage_api = await MockStorageAPI.create(session_id, worker_pool.external_address) + + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask = _gen_subtask(b, session_id) + assert "TensorAdd" in repr(subtask) + await subtask_api.run_subtask_in_slot("numa-0", 0, subtask) + + # check storage + expected = np.ones((10, 10)) + 1 + result_key = subtask.chunk_graph.results[0].key + result = await storage_api.get(result_key) + np.testing.assert_array_equal(expected, result) + + # check meta + chunk_meta = await meta_api.get_chunk_meta(result_key) + assert chunk_meta is not None + assert chunk_meta["bands"][0] == (worker_pool.external_address, "numa-0") + + def sleep(timeout: int): + time.sleep(timeout) + return timeout + + b = mr.spawn(sleep, 1) + + subtask2 = _gen_subtask(b, session_id) + asyncio.create_task(subtask_api.run_subtask_in_slot("numa-0", 0, subtask2)) + await asyncio.sleep(0.2) + with Timer() as timer: + # normal cancel by cancel asyncio Task + await asyncio.wait_for( + subtask_api.cancel_subtask_in_slot("numa-0", 0), timeout=2 + ) + # need 1 sec to reach timeout, then killing actor and wait for auto recovering + # the time would not be over 5 sec + assert timer.duration < 2 + + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER, config, address=worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, address=sv_pool.external_address) diff --git a/python/xorbits/_mars/services/subtask/utils.py b/python/xorbits/_mars/services/subtask/utils.py new file mode 100644 index 000000000..f71893933 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/utils.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Dict, Iterator, List, Tuple + +from ...core import ChunkGraph +from ...core.operand import Fetch, FetchShuffle, MapReduceOperand, VirtualOperand +from .core import Subtask + + +def iter_input_data_keys( + subtask: Subtask, + chunk_graph: ChunkGraph, + chunk_key_to_data_keys: Dict[str, List[str]], +) -> Iterator[Tuple[str, bool]]: + """An iterator that yields (input data key, is shuffle) pairs.""" + data_keys = set() + for chunk in chunk_graph.iter_indep(): + if isinstance(chunk.op, Fetch) and chunk.key not in subtask.pure_depend_keys: + data_keys.add(chunk.key) + yield chunk.key, False + elif isinstance(chunk.op, FetchShuffle): + for key in chunk_key_to_data_keys[chunk.key]: + if key not in data_keys: + data_keys.add(key) + yield key, True + + +def get_mapper_data_keys(key: str, context: Dict[str, Any]) -> List[str]: + """Get the mapper data keys of `key` from the context.""" + return [ + store_key + for store_key in context + if isinstance(store_key, tuple) and store_key[0] == key + ] + + +def iter_output_data( + chunk_graph: ChunkGraph, context: Dict[str, Any] +) -> Iterator[Tuple[str, Any, bool]]: + """An iterator that yields (output chunk key, output data, is shuffle) tuples.""" + data_keys = set() + for result_chunk in chunk_graph.result_chunks: + # skip virtual operands for result chunks + if isinstance(result_chunk.op, VirtualOperand): + continue + key = result_chunk.key + if key in context: + # non shuffle op + data = context[key] + # update meta + if not isinstance(data, tuple): + result_chunk.params = result_chunk.get_params_from_data(data) + # check key after update meta + if key in data_keys: + continue + yield key, data, False + data_keys.add(key) + else: + assert isinstance(result_chunk.op, MapReduceOperand) + keys = get_mapper_data_keys(key, context) + for key in keys: + if key in data_keys: + continue + # shuffle op + data = context[key] + yield key, data, True + data_keys.add(key) diff --git a/python/xorbits/_mars/services/subtask/worker/__init__.py b/python/xorbits/_mars/services/subtask/worker/__init__.py new file mode 100644 index 000000000..41ac782f1 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +from typing import Type + +from .... import oscar as mo +from ....oscar.backends.allocate_strategy import IdleLabel +from .runner import SubtaskRunnerActor + + +class SubtaskRunnerManagerActor(mo.Actor): + def __init__(self, worker_address: str, subtask_processor_cls: Type): + # specify subtask process class + # for test purpose + self._worker_address = worker_address + self._subtask_processor_cls = subtask_processor_cls + self._cluster_api = None + + self._band_slot_runner_refs = dict() + + async def __post_create__(self): + from ...cluster.api import ClusterAPI + + self._cluster_api = await ClusterAPI.create(self.address) + + band_to_resource = await self._cluster_api.get_bands() + for band, resource in band_to_resource.items(): + await self._create_band_runner_actors( + band[1], int(resource.num_cpus or resource.num_gpus) + ) + + async def _create_band_runner_actors(self, band_name: str, n_slots: int): + strategy = IdleLabel(band_name, "subtask_runner") + band = (self.address, band_name) + for slot_id in range(n_slots): + self._band_slot_runner_refs[(band_name, slot_id)] = await mo.create_actor( + SubtaskRunnerActor, + band, + worker_address=self._worker_address, + subtask_processor_cls=self._subtask_processor_cls, + uid=SubtaskRunnerActor.gen_uid(band_name, slot_id), + address=self.address, + allocate_strategy=strategy, + ) + + async def __pre_destroy__(self): + await asyncio.gather( + *[mo.destroy_actor(ref) for ref in self._band_slot_runner_refs.values()] + ) diff --git a/python/xorbits/_mars/services/subtask/worker/processor.py b/python/xorbits/_mars/services/subtask/worker/processor.py new file mode 100644 index 000000000..1f8242695 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/processor.py @@ -0,0 +1,763 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import sys +import time +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set, Tuple, Type + +from .... 
import oscar as mo +from ....core import ChunkGraph, ExecutionError, OperandType, enter_mode +from ....core.context import get_context +from ....core.operand import Fetch, FetchShuffle, execute +from ....lib.aio import alru_cache +from ....metrics import Metrics +from ....optimization.physical import optimize +from ....serialization import AioSerializer +from ....typing import BandType, ChunkType +from ....utils import calc_data_size, get_chunk_key_to_data_keys +from ...context import ThreadedServiceContext +from ...meta.api import MetaAPI, WorkerMetaAPI +from ...session import SessionAPI +from ...storage import StorageAPI +from ...task import TaskAPI, task_options +from ..core import Subtask, SubtaskResult, SubtaskStatus +from ..utils import get_mapper_data_keys, iter_input_data_keys, iter_output_data + +logger = logging.getLogger(__name__) + + +class ProcessorContext(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._current_chunk = None + + def __getattr__(self, attr): + ctx = get_context() + return getattr(ctx, attr) + + def set_current_chunk(self, chunk: ChunkType): + """Set current executing chunk.""" + self._current_chunk = chunk + + def get_current_chunk(self) -> ChunkType: + """Get current executing chunk.""" + return self._current_chunk + + +BASIC_META_FIELDS = ["memory_size", "store_size", "bands", "object_ref"] + + +class SubtaskProcessor: + _chunk_graph: ChunkGraph + _chunk_key_to_data_keys: Dict[str, List[str]] + + def __init__( + self, + subtask: Subtask, + session_api: SessionAPI, + storage_api: StorageAPI, + meta_api: MetaAPI, + worker_meta_api: WorkerMetaAPI, + band: BandType, + supervisor_address: str, + engines: List[str] = None, + ): + self.subtask = subtask + self._session_id = self.subtask.session_id + self._chunk_graph = subtask.chunk_graph + self._actual_chunk_count = len( + [ + chunk + for chunk in subtask.chunk_graph + if not isinstance(chunk.op, (Fetch, FetchShuffle)) + ] + ) + self._band = band + self._supervisor_address = supervisor_address + self._engines = engines if engines is not None else task_options.runtime_engines + + # result + self.result = SubtaskResult( + subtask_id=subtask.subtask_id, + session_id=subtask.session_id, + task_id=subtask.task_id, + stage_id=subtask.stage_id, + status=SubtaskStatus.pending, + bands=[self._band], + progress=0.0, + execution_start_time=time.time(), + ) + self.is_done = asyncio.Event() + + # status and intermediate states + # operand progress, from op key to progress + self._op_progress: Dict[str, float] = defaultdict(lambda: 0.0) + # temp data store that holds chunk data during computation + self._processor_context = ProcessorContext() + # chunk key to real data keys + self._chunk_key_to_data_keys = dict() + + # other service APIs + self._session_api = session_api + self._storage_api = storage_api + self._meta_api = meta_api + self._worker_meta_api = worker_meta_api + + # add metrics + self._subtask_execution_time = Metrics.gauge( + "mars.subtask_execution_time_secs", + "Time consuming in seconds to execute a subtask", + ("session_id", "subtask_id"), + ) + + @property + def status(self): + return self.result.status + + @property + def subtask_id(self): + return self.subtask.subtask_id + + async def _load_input_data(self): + keys, gets, accept_nones = [], [], [] + for key, is_shuffle in iter_input_data_keys( + self.subtask, self._chunk_graph, self._chunk_key_to_data_keys + ): + keys.append(key) + accept_nones.append(not is_shuffle) + gets_params = {"error": "ignore"} if is_shuffle 
else {} + gets.append(self._storage_api.get.delay(key, **gets_params)) + if keys: + logger.debug( + "Start getting input data, keys: %.500s, subtask id: %s", + keys, + self.subtask.subtask_id, + ) + inputs = await self._storage_api.get.batch(*gets) + self._processor_context.update( + { + key: get + for key, get, accept_none in zip(keys, inputs, accept_nones) + if accept_none or get is not None + } + ) + logger.debug( + "Finish getting input data keys: %.500s, subtask id: %s", + keys, + self.subtask.subtask_id, + ) + return keys + + @staticmethod + async def notify_task_manager_result( + supervisor_address: str, result: SubtaskResult + ): + task_api = await TaskAPI.create(result.session_id, supervisor_address) + # notify task service + await task_api.set_subtask_result(result) + + def _init_ref_counts(self) -> Dict[str, int]: + chunk_graph = self._chunk_graph + ref_counts = defaultdict(lambda: 0) + # set 1 for result chunks + for result_chunk in chunk_graph.result_chunks: + ref_counts[result_chunk.key] += 1 + # iter graph to set ref counts + for chunk in chunk_graph: + ref_counts[chunk.key] += chunk_graph.count_successors(chunk) + return ref_counts + + async def _async_execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if not isinstance(op, (Fetch, FetchShuffle)): + self._op_progress[op.key] = 0.0 + get_context().set_running_operand_key(self._session_id, op.key) + return asyncio.to_thread(self._execute_operand, ctx, op) + + def set_op_progress(self, op_key: str, progress: float): + if op_key in self._op_progress: # pragma: no branch + self._op_progress[op_key] = progress + + @enter_mode(build=False, kernel=True) + def _execute_operand( + self, ctx: Dict[str, Any], op: OperandType + ): # noqa: R0201 # pylint: disable=no-self-use + try: + return execute(ctx, op) + except BaseException as ex: + # wrap exception in execution to avoid side effects + raise ExecutionError(ex).with_traceback(ex.__traceback__) from None + + async def _execute_graph(self, chunk_graph: ChunkGraph): + loop = asyncio.get_running_loop() + ref_counts = self._init_ref_counts() + + # from data_key to results + for chunk in chunk_graph.topological_iter(): + if chunk.key not in self._processor_context: + # since `op.execute` may be a time-consuming operation, + # we make it run in a thread pool to not block current thread. 
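+ # (Editorial note) `_async_execute_operand` returns an `asyncio.to_thread(...)`
+ # coroutine, so the `asyncio.create_task(await ...)` call below runs the blocking
+ # `op.execute` on the default thread-pool executor while this coroutine keeps the
+ # event loop free, e.g. to observe cancellation.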
+ logger.debug( + "Start executing operand: %s, chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + self._processor_context.set_current_chunk(chunk) + future = asyncio.create_task( + await self._async_execute_operand(self._processor_context, chunk.op) + ) + to_wait = loop.create_future() + + def cb(fut): + if not to_wait.done(): + if fut.exception(): + to_wait.set_exception(fut.exception()) + else: + to_wait.set_result(fut.result()) + + future.add_done_callback(cb) + + try: + await to_wait + logger.debug( + "Finish executing operand: %s, chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + except asyncio.CancelledError: + logger.debug( + "Receive cancel instruction for operand: %s," + "chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + # wait for this computation to finish + await future + # if cancelled, stop next computation + logger.debug( + "Cancelled operand: %s, chunk: %s, subtask id: %s", + chunk.op, + chunk, + self.subtask.subtask_id, + ) + self.result.status = SubtaskStatus.cancelled + raise + + self.set_op_progress(chunk.op.key, 1.0) + + for inp in chunk_graph.iter_predecessors(chunk): + ref_counts[inp.key] -= 1 + if ref_counts[inp.key] == 0: + # ref count reaches 0, remove it + for key in self._chunk_key_to_data_keys[inp.key]: + if key in self._processor_context: + del self._processor_context[key] + + async def _unpin_data(self, data_keys): + # unpin input keys + unpins = [] + shuffle_unpins = [] + for key in data_keys: + if isinstance(key, tuple): + # a tuple key means it's a shuffle key, + # some shuffle data is None and not stored in storage + shuffle_unpins.append( + self._storage_api.unpin.delay(key, error="ignore") + ) + else: + unpins.append(self._storage_api.unpin.delay(key)) + if unpins: + await self._storage_api.unpin.batch(*unpins) + if shuffle_unpins: + # TODO(hks): The batch method doesn't accept different error arguments, + # combine them when it can. 
+ await self._storage_api.unpin.batch(*shuffle_unpins) + + async def _store_data(self, chunk_graph: ChunkGraph): + # store data into storage + data_key_to_puts = {} + shuffle_key_to_data = {} + is_storage_seekable = await self._storage_api.is_seekable() + for key, data, _ in iter_output_data(chunk_graph, self._processor_context): + if isinstance(key, tuple) and is_storage_seekable: + shuffle_key_to_data[key] = data + else: + put = self._storage_api.put.delay(key, data) + data_key_to_puts[key] = put + + stored_keys = list(data_key_to_puts.keys()) + puts = data_key_to_puts.values() + logger.debug( + "Start putting data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + data_key_to_store_size = dict() + data_key_to_memory_size = dict() + data_key_to_object_id = dict() + if puts: + put_infos = asyncio.create_task(self._storage_api.put.batch(*puts)) + try: + store_infos = await put_infos + for store_key, store_info in zip(stored_keys, store_infos): + data_key_to_store_size[store_key] = store_info.store_size + data_key_to_memory_size[store_key] = store_info.memory_size + data_key_to_object_id[store_key] = store_info.object_id + logger.debug( + "Finish putting data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + except asyncio.CancelledError: + logger.debug( + "Cancelling put data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + put_infos.cancel() + + logger.debug( + "Cancelled put data keys: %s, subtask id: %s", + stored_keys, + self.subtask.subtask_id, + ) + self.result.status = SubtaskStatus.cancelled + raise + + if shuffle_key_to_data: + await self._store_mapper_data( + shuffle_key_to_data, + data_key_to_store_size, + data_key_to_memory_size, + data_key_to_object_id, + ) + # clear data + self._processor_context = ProcessorContext() + return ( + stored_keys, + data_key_to_store_size, + data_key_to_memory_size, + data_key_to_object_id, + ) + + async def _write_aggregated_mapper_data( + self, key_and_band: Tuple, objects: List, data_keys: List + ): + serialization_tasks = [AioSerializer(obj).run() for obj in objects] + + def calc_memory_size(objs): + return sum(calc_data_size(obj) for obj in objs) + + memory_size = await asyncio.to_thread(calc_memory_size, objects) + + buffer_list = await asyncio.gather(*serialization_tasks) + sizes = [ + sum(b.size if hasattr(b, "size") else len(b) for b in buf) + for buf in buffer_list + ] + writer = await self._storage_api.open_writer(key_and_band, sum(sizes)) + offset = 0 + for buffers, size, data_key in zip(buffer_list, sizes, data_keys): + for buf in buffers: + await writer.write(buf) + writer.commit_once(data_key, offset, size) + offset += size + await writer.close() + return key_and_band, memory_size, sum(sizes), writer._object_id + + async def _store_mapper_data( + self, + shuffle_key_to_data: Dict, + data_key_to_store_size: Dict, + data_key_to_memory_size: Dict, + data_key_to_object_id: Dict, + ): + band_to_mapper_key = defaultdict(list) + for result_chunk in self._chunk_graph.result_chunks: + map_reduce_id = getattr(result_chunk, "extra_params", dict()).get( + "analyzer_map_reduce_id" + ) + if map_reduce_id is None: + continue + reducer_index_to_bands = await self._gen_reducer_index_to_bands( + self._session_id, + self._supervisor_address, + self.subtask.task_id, + map_reduce_id, + ) + for reducer_index, band in reducer_index_to_bands.items(): + # mapper key is a tuple + band_to_mapper_key[(result_chunk.key, band)].append( + (result_chunk.key, reducer_index) + ) + + 
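+ # (Editorial note) mapper outputs destined for the same (result chunk key, band)
+ # pair are serialized and written through a single writer in
+ # `_write_aggregated_mapper_data`; `commit_once` records each reducer key's
+ # (offset, size) within the aggregated block.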
write_tasks = [] + for key_and_band, shuffle_keys in band_to_mapper_key.items(): + objects = [shuffle_key_to_data[key] for key in shuffle_keys] + write_tasks.append( + self._write_aggregated_mapper_data(key_and_band, objects, shuffle_keys) + ) + infos = await asyncio.gather(*write_tasks) + for key, memory_size, store_size, object_id in infos: + data_key_to_memory_size[key] = memory_size + data_key_to_store_size[key] = store_size + data_key_to_object_id[key] = object_id + + async def _store_meta( + self, + chunk_graph: ChunkGraph, + data_key_to_store_size: Dict, + data_key_to_memory_size: Dict, + data_key_to_object_id: Dict, + update_meta_chunks: Set[ChunkType], + ): + # store meta + set_chunk_metas = [] + set_worker_chunk_metas = [] + result_data_size = 0 + set_meta_keys = [] + for result_chunk in chunk_graph.result_chunks: + chunk_key = result_chunk.key + set_meta_keys.append(chunk_key) + if chunk_key in data_key_to_store_size: + # normal chunk + store_size = data_key_to_store_size[chunk_key] + memory_size = data_key_to_memory_size[chunk_key] + result_data_size += memory_size + object_ref = data_key_to_object_id[chunk_key] + else: + # mapper chunk + mapper_keys = get_mapper_data_keys(chunk_key, data_key_to_store_size) + store_size = sum(data_key_to_store_size[k] for k in mapper_keys) + memory_size = sum(data_key_to_memory_size[k] for k in mapper_keys) + # Skip meta for shuffle + object_ref = None + # for worker, if chunk in update_meta_chunks + # save meta including dtypes_value etc, otherwise, + # save basic meta only + if result_chunk in update_meta_chunks: + set_worker_chunk_metas.append( + self._worker_meta_api.set_chunk_meta.delay( + result_chunk, + memory_size=memory_size, + store_size=store_size, + bands=[self._band], + chunk_key=chunk_key, + exclude_fields=["object_ref"], + ) + ) + # for supervisor, only save basic meta that is small like memory_size etc + set_chunk_metas.append( + self._meta_api.set_chunk_meta.delay( + result_chunk, + memory_size=memory_size, + store_size=store_size, + bands=[self._band], + chunk_key=chunk_key, + object_ref=object_ref, + fields=BASIC_META_FIELDS, + ) + ) + logger.debug( + "Start storing chunk metas for data keys: %s, subtask id: %s", + set_meta_keys, + self.subtask.subtask_id, + ) + if set_chunk_metas: + f = asyncio.get_running_loop().create_future() + + async def set_chunks_meta(): + coros = [] + if set_worker_chunk_metas: + coros.append( + self._worker_meta_api.set_chunk_meta.batch( + *set_worker_chunk_metas + ) + ) + coros.append(self._meta_api.set_chunk_meta.batch(*set_chunk_metas)) + await asyncio.gather(*coros) + logger.debug( + "Finish store chunk metas for data keys: %s, subtask id: %s", + set_meta_keys, + self.subtask.subtask_id, + ) + f.set_result(None) + + try: + # Since we don't delete chunk data on this worker, + # we need to ensure chunk meta are recorded + # in meta service, so that `processor.decref_stage` + # can delete the chunk data finally. 
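+ # (Editorial note) `asyncio.shield` lets `set_chunks_meta()` keep running even if
+ # this coroutine is cancelled; the handler below then awaits the future `f`, which
+ # is only resolved at the end of `set_chunks_meta`, so the metas are fully recorded
+ # before the cancellation propagates.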
+ await asyncio.shield(set_chunks_meta()) + except asyncio.CancelledError: # pragma: no cover + await f + raise + # set result data size + self.result.data_size = result_data_size + + @classmethod + @alru_cache(cache_exceptions=False) + async def _gen_reducer_index_to_bands( + cls, session_id: str, supervisor_address: str, task_id: str, map_reduce_id: int + ) -> Dict[Tuple[int], BandType]: + task_api = await TaskAPI.create(session_id, supervisor_address) + map_reduce_info = await task_api.get_map_reduce_info(task_id, map_reduce_id) + assert len(map_reduce_info.reducer_indexes) == len( + map_reduce_info.reducer_bands + ) + return { + reducer_index: band + for reducer_index, band in zip( + map_reduce_info.reducer_indexes, map_reduce_info.reducer_bands + ) + } + + async def done(self): + if self.result.status == SubtaskStatus.running: + self.result.status = SubtaskStatus.succeeded + # Only update end time when subtask succeeded + self.result.execution_end_time = time.time() + self.result.progress = 1.0 + self.is_done.set() + + async def run(self): + self.result.status = SubtaskStatus.running + input_keys = None + unpinned = False + try: + raw_result_chunks = list(self._chunk_graph.result_chunks) + chunk_graph = optimize(self._chunk_graph, self._engines) + self._chunk_key_to_data_keys = get_chunk_key_to_data_keys(chunk_graph) + report_progress = asyncio.create_task(self.report_progress_periodically()) + + result_chunk_to_optimized = { + c: o for c, o in zip(raw_result_chunks, chunk_graph.result_chunks) + } + raw_update_meta_chunks = self.subtask.update_meta_chunks + if raw_update_meta_chunks is None: + raw_update_meta_chunks = raw_result_chunks + update_meta_chunks = { + result_chunk_to_optimized[c] for c in raw_update_meta_chunks + } + + # load inputs data + input_keys = await self._load_input_data() + try: + # execute chunk graph + await self._execute_graph(chunk_graph) + finally: + # unpin inputs data + unpinned = True + await self._unpin_data(input_keys) + # store results data + ( + stored_keys, + store_sizes, + memory_sizes, + data_key_to_object_id, + ) = await self._store_data(chunk_graph) + # store meta + await self._store_meta( + chunk_graph, + store_sizes, + memory_sizes, + data_key_to_object_id, + update_meta_chunks, + ) + except asyncio.CancelledError: + self.result.status = SubtaskStatus.cancelled + self.result.progress = 1.0 + raise + except ( + BaseException + ) as ex: # noqa: E722 # nosec # pylint: disable=bare-except + self.result.status = SubtaskStatus.errored + self.result.progress = 1.0 + if isinstance(ex, ExecutionError): + self.result.error = ex.nested_error + self.result.traceback = ex.nested_error.__traceback__ + else: # pragma: no cover + _, self.result.error, self.result.traceback = sys.exc_info() + await self.done() + raise + finally: + if input_keys is not None and not unpinned: + await self._unpin_data(input_keys) + + await self.done() + if self.result.status == SubtaskStatus.succeeded: + cost_time_secs = ( + self.result.execution_end_time - self.result.execution_start_time + ) + logger.info( + "Time consuming to execute a subtask is %ss with session_id %s, subtask_id %s", + cost_time_secs, + self._session_id, + self.subtask.subtask_id, + ) + self._subtask_execution_time.record( + cost_time_secs, + {"session_id": self._session_id, "subtask_id": self.subtask.subtask_id}, + ) + report_progress.cancel() + try: + await report_progress + except asyncio.CancelledError: + pass + return self.result + + async def report_progress_periodically(self, interval=0.5, eps=0.001): + 
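+ # (Editorial note) poll the aggregated operand progress every `interval` seconds
+ # and push an update to the task manager whenever it has moved by at least `eps`
+ # since the previous poll.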
last_progress = self.result.progress + while not self.result.status.is_done: + size = self._actual_chunk_count + progress = sum(self._op_progress.values()) / size + assert progress <= 1 + self.result.progress = progress + if abs(last_progress - progress) >= eps: + # report progress + if not self.result.status.is_done: + fut = self.notify_task_manager_result( + self._supervisor_address, self.result + ) + if fut: + await fut + await asyncio.sleep(interval) + last_progress = progress + + +class SubtaskProcessorActor(mo.Actor): + _session_api: Optional[SessionAPI] + _storage_api: Optional[StorageAPI] + _meta_api: Optional[MetaAPI] + _worker_meta_api: Optional[WorkerMetaAPI] + _processor: Optional[SubtaskProcessor] + _last_processor: Optional[SubtaskProcessor] + _running_aio_task: Optional[asyncio.Task] + + def __init__( + self, + session_id: str, + band: BandType, + supervisor_address: str, + worker_address: str, + subtask_processor_cls: Type[SubtaskProcessor], + ): + self._session_id = session_id + self._band = band + self._supervisor_address = supervisor_address + self._worker_address = worker_address + self._subtask_processor_cls = subtask_processor_cls + + # current processor + self._processor = None + self._last_processor = None + self._running_aio_task = None + + self._session_api = None + self._storage_api = None + self._meta_api = None + self._worker_meta_api = None + + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_subtask_processor" + + async def __post_create__(self): + coros = [ + SessionAPI.create(self._supervisor_address), + StorageAPI.create(self._session_id, self.address, self._band[1]), + MetaAPI.create(self._session_id, self._supervisor_address), + WorkerMetaAPI.create(self._session_id, self.address), + ] + coros = [asyncio.ensure_future(coro) for coro in coros] + await asyncio.gather(*coros) + self._session_api, self._storage_api, self._meta_api, self._worker_meta_api = [ + coro.result() for coro in coros + ] + + async def _init_context(self, session_id: str) -> ThreadedServiceContext: + loop = asyncio.get_running_loop() + context = ThreadedServiceContext( + session_id, + self._supervisor_address, + self._worker_address, + self.address, + loop, + band=self._band, + ) + await context.init() + return context + + async def run(self, subtask: Subtask): + logger.info( + "Start to run subtask: %r on %s. 
chunk graph contains %s", + subtask, + self.address, + [c for c in subtask.chunk_graph], + ) + + assert subtask.session_id == self._session_id + + # init context + ctx = await self._init_context(self._session_id) + with ctx: + processor = self._subtask_processor_cls( + subtask, + self._session_api, + self._storage_api, + self._meta_api, + self._worker_meta_api, + self._band, + self._supervisor_address, + ) + self._processor = self._last_processor = processor + self._running_aio_task = asyncio.create_task(processor.run()) + try: + result = yield self._running_aio_task + logger.info("Finished subtask: %s", subtask.subtask_id) + raise mo.Return(result) + finally: + self._processor = self._running_aio_task = None + + async def wait(self): + return self._processor.is_done.wait() + + async def result(self): + return self._last_processor.result + + async def cancel(self): + logger.info("Cancelling subtask: %s", self._processor.subtask_id) + + aio_task = self._running_aio_task + aio_task.cancel() + + async def waiter(): + try: + await aio_task + except asyncio.CancelledError: + pass + + # return asyncio task to not block current actor + return waiter() + + def get_running_subtask_id(self): + return self._processor.subtask_id + + def set_running_op_progress(self, op_key: str, progress: float): + self._processor.set_op_progress(op_key, progress) diff --git a/python/xorbits/_mars/services/subtask/worker/runner.py b/python/xorbits/_mars/services/subtask/worker/runner.py new file mode 100644 index 000000000..8fa3c1263 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/runner.py @@ -0,0 +1,143 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import importlib +import logging +from typing import Dict, Optional, Type + +from .... 
import oscar as mo +from ....lib.aio import alru_cache +from ....typing import BandType +from ...cluster import ClusterAPI +from ..core import Subtask, SubtaskResult +from ..errors import SlotOccupiedAlready +from .processor import SubtaskProcessor, SubtaskProcessorActor + +logger = logging.getLogger(__name__) + + +SubtaskRunnerRef = mo.ActorRefType["SubtaskRunnerActor"] + + +class SubtaskRunnerActor(mo.Actor): + _session_id_to_processors: Dict[str, mo.ActorRefType[SubtaskProcessorActor]] + _running_processor: Optional[mo.ActorRefType[SubtaskProcessorActor]] + _last_processor: Optional[mo.ActorRefType[SubtaskProcessorActor]] + + @classmethod + def gen_uid(cls, band_name: str, slot_id: int): + return f"slot_{band_name}_{slot_id}_subtask_runner" + + def __init__( + self, band: BandType, worker_address: str, subtask_processor_cls: Type = None + ): + self._band = band + self._worker_address = worker_address + self._subtask_processor_cls = self._get_subtask_processor_cls( + subtask_processor_cls + ) + + self._cluster_api = None + + self._session_id_to_processors = dict() + self._running_processor = None + self._last_processor = None + + async def __post_create__(self): + self._cluster_api = await ClusterAPI.create(address=self.address) + + async def __pre_destroy__(self): + try: + await asyncio.gather( + *[ + mo.destroy_actor(ref) + for ref in self._session_id_to_processors.values() + ] + ) + except mo.ActorNotExist: # pragma: no cover + # deleted, ignore + pass + + @classmethod + def _get_subtask_processor_cls(cls, subtask_processor_cls): + if subtask_processor_cls is None: + return SubtaskProcessor + else: + assert isinstance(subtask_processor_cls, str) + module, class_name = subtask_processor_cls.rsplit(".", 1) + return getattr(importlib.import_module(module), class_name) + + async def _run_subtask(self, subtask: Subtask): + processor = await self._init_subtask_processor(subtask) + self._subtask_info.processor = processor + return await processor.run() + + @alru_cache(cache_exceptions=False) + async def _get_supervisor_address(self, session_id: str): + [address] = await self._cluster_api.get_supervisors_by_keys([session_id]) + return address + + async def run_subtask(self, subtask: Subtask): + if self._running_processor is not None: # pragma: no cover + running_subtask_id = await self._running_processor.get_running_subtask_id() + # current subtask is still running + raise SlotOccupiedAlready( + f"There is subtask(id: {running_subtask_id}) running in {self.uid} " + f"at {self.address}, cannot run subtask {subtask.subtask_id}" + ) + + session_id = subtask.session_id + supervisor_address = await self._get_supervisor_address(session_id) + if session_id not in self._session_id_to_processors: + try: + self._session_id_to_processors[session_id] = await mo.create_actor( + SubtaskProcessorActor, + session_id, + self._band, + supervisor_address, + self._worker_address, + self._subtask_processor_cls, + uid=SubtaskProcessorActor.gen_uid(session_id), + address=self.address, + ) + except mo.ActorAlreadyExist: + # when recovering actor pools, the actor created in sub pools + # may be recovered already + self._session_id_to_processors[session_id] = await mo.actor_ref( + uid=SubtaskProcessorActor.gen_uid(session_id), + address=self.address, + ) + processor = self._session_id_to_processors[session_id] + try: + self._running_processor = self._last_processor = processor + result = yield self._running_processor.run(subtask) + finally: + self._running_processor = None + raise mo.Return(result) + + async def 
get_subtask_result(self) -> SubtaskResult: + return self._last_processor.result() + + def is_runner_free(self): + return self._running_processor is None + + async def cancel_subtask(self): + if self._running_processor is None: + return + running_subtask_id = await self._running_processor.get_running_subtask_id() + logger.info("Start to cancel subtask %s.", running_subtask_id) + await self._running_processor.cancel() + self._running_processor = None + logger.info("Canceled subtask %s.", running_subtask_id) diff --git a/python/xorbits/_mars/services/subtask/worker/service.py b/python/xorbits/_mars/services/subtask/worker/service.py new file mode 100644 index 000000000..2a45f9c7f --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/service.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .manager import SubtaskRunnerManagerActor + + +class SubtaskWorkerService(AbstractService): + """ + Subtask service on worker. + + Service Configuration + --------------------- + { + "subtask" : { + + } + } + """ + + async def start(self): + subtask_config = self._config.get("subtask", dict()) + subtask_processor_cls = subtask_config.get("subtask_processor_cls") + await mo.create_actor( + SubtaskRunnerManagerActor, + worker_address=self._address, + subtask_processor_cls=subtask_processor_cls, + address=self._address, + uid=SubtaskRunnerManagerActor.default_uid(), + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=SubtaskRunnerManagerActor.default_uid(), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/subtask/worker/tests/__init__.py b/python/xorbits/_mars/services/subtask/worker/tests/__init__.py new file mode 100644 index 000000000..76a74ffc0 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/tests/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .subtask_processor import CheckedSubtaskProcessor diff --git a/python/xorbits/_mars/services/subtask/worker/tests/subtask_processor.py b/python/xorbits/_mars/services/subtask/worker/tests/subtask_processor.py new file mode 100644 index 000000000..823a69031 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/tests/subtask_processor.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. 
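# A minimal usage sketch of the slot model implemented by SubtaskRunnerActor and
# SubtaskRunnerManagerActor above: every (band, slot) pair owns one runner actor,
# addressed via gen_uid(band_name, slot_id), and a runner executes at most one
# subtask at a time (a second call raises SlotOccupiedAlready). This assumes a
# worker pool with the subtask service already started; `subtask` and
# `worker_address` are placeholders, and the import paths follow the package
# layout in this patch.
from xorbits._mars import oscar as mo
from xorbits._mars.services.subtask.worker.runner import (
    SubtaskRunnerActor,
    SubtaskRunnerRef,
)


async def run_on_slot(subtask, worker_address: str):
    # resolve the runner bound to slot 0 of band "numa-0" on this worker
    runner: SubtaskRunnerRef = await mo.actor_ref(
        SubtaskRunnerActor.gen_uid("numa-0", 0), address=worker_address
    )
    await runner.run_subtask(subtask)
    return await runner.get_subtask_result()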
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +from .....core import OperandType +from .....tests.core import ObjectCheckMixin, _check_args +from ...worker.processor import SubtaskProcessor + + +class CheckStorageAPI: + def __init__(self, storage_api): + self._storage_api = storage_api + self._put_data_keys = set() + + def __getattr__(self, item): + return getattr(self._storage_api, item) + + @property + def put(self): + owner = self + put = self._storage_api.put + + class _PutWrapper: + def delay(self, data_key: str, obj: object, level=None): + if data_key in owner._put_data_keys: + raise Exception(f"Duplicate data put: {data_key}, obj: {obj}") + else: + owner._put_data_keys.add(data_key) + return put.delay(data_key, obj, level) + + def __getattr__(self, item): + return getattr(put, item) + + return _PutWrapper() + + +class CheckedSubtaskProcessor(ObjectCheckMixin, SubtaskProcessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + check_options = dict() + if self.subtask.extra_config: + kwargs = self.subtask.extra_config.copy() + else: + kwargs = dict() + self._operand_executors = operand_executors = kwargs.pop( + "operand_executors", dict() + ) + for op, executor in operand_executors.items(): + op.register_executor(executor) + for key in _check_args: + check_options[key] = kwargs.get(key, True) + self._check_options = check_options + self._check_keys = kwargs.get("check_keys") + self._storage_api = CheckStorageAPI(self._storage_api) + + def _execute_operand(self, ctx: Dict[str, Any], op: OperandType): + super()._execute_operand(ctx, op) + if self._check_options.get("check_all", True): + for out in op.outputs: + if out not in self._chunk_graph.result_chunks: + continue + if self._check_keys and out.key not in self._check_keys: + continue + # The first char of key is a letter. + assert out.key[0] in {"c", "d", "e", "f"}, out.key + if out.key not in ctx and any( + k[0] == out.key for k in ctx if isinstance(k, tuple) + ): + # both shuffle mapper and reducer + continue + self.assert_object_consistent(out, ctx[out.key]) + + async def done(self): + await super().done() + for op in self._operand_executors: + try: + op.unregister_executor() + except KeyError: + pass diff --git a/python/xorbits/_mars/services/subtask/worker/tests/test_subtask.py b/python/xorbits/_mars/services/subtask/worker/tests/test_subtask.py new file mode 100644 index 000000000..2f3addfe1 --- /dev/null +++ b/python/xorbits/_mars/services/subtask/worker/tests/test_subtask.py @@ -0,0 +1,311 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys +import time + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import oscar as mo +from ..... import remote as mr +from ..... import tensor as mt +from .....core import ChunkGraph, ExecutionError +from .....core.context import get_context +from .....core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from .....core.operand import OperandStage +from .....resource import Resource +from .....utils import Timer +from ....cluster import MockClusterAPI +from ....lifecycle import MockLifecycleAPI +from ....meta import MockMetaAPI, MockWorkerMetaAPI +from ....mutable import MockMutableAPI +from ....scheduling import MockSchedulingAPI +from ....session import MockSessionAPI +from ....storage import MockStorageAPI +from ....task import MapReduceInfo, new_task_id +from ....task.supervisor.manager import TaskConfigurationActor, TaskManagerActor +from ... import Subtask, SubtaskResult, SubtaskStatus +from ...worker.manager import SubtaskRunnerManagerActor +from ...worker.runner import SubtaskRunnerActor, SubtaskRunnerRef + + +class FakeTaskManager(TaskManagerActor): + def set_subtask_result(self, subtask_result: SubtaskResult): + return + + def get_map_reduce_info(self, task_id: str, map_reduce_id: int) -> MapReduceInfo: + return MapReduceInfo( + map_reduce_id=0, + reducer_indexes=[(0, 0)], + reducer_bands=[(self.address, "numa-0")], + ) + + +@pytest.fixture +async def actor_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=3, + labels=["main"] + ["numa-0"] * 2 + ["io"], + subprocess_start_method=start_method, + ) + + async with pool: + session_id = "test_session" + # create mock APIs + await MockClusterAPI.create( + pool.external_address, band_to_resource={"numa-0": Resource(num_cpus=2)} + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + await MockWorkerMetaAPI.create(session_id, pool.external_address) + await MockLifecycleAPI.create(session_id, pool.external_address) + storage_api = await MockStorageAPI.create(session_id, pool.external_address) + await MockSchedulingAPI.create(session_id, pool.external_address) + await MockMutableAPI.create(session_id, pool.external_address) + + # create configuration + await mo.create_actor( + TaskConfigurationActor, + dict(), + dict(), + uid=TaskConfigurationActor.default_uid(), + address=pool.external_address, + ) + await mo.create_actor( + FakeTaskManager, + session_id, + uid=FakeTaskManager.gen_uid(session_id), + address=pool.external_address, + ) + manager = await mo.create_actor( + SubtaskRunnerManagerActor, + pool.external_address, + None, + uid=SubtaskRunnerManagerActor.default_uid(), + address=pool.external_address, + ) + try: + yield pool, session_id, meta_api, storage_api, manager + finally: + await MockStorageAPI.cleanup(pool.external_address) + await 
MockClusterAPI.cleanup(pool.external_address) + await MockMutableAPI.cleanup(session_id, pool.external_address) + + +def _gen_subtask(t, session_id): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + subtask = Subtask(new_task_id(), session_id, new_task_id(), chunk_graph) + + return subtask + + +@pytest.mark.asyncio +async def test_subtask_success(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask = _gen_subtask(b, session_id) + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + await subtask_runner.run_subtask(subtask) + result = await subtask_runner.get_subtask_result() + assert result.status == SubtaskStatus.succeeded + + # check storage + expected = np.ones((10, 10)) + 1 + result_key = subtask.chunk_graph.results[0].key + result = await storage_api.get(result_key) + np.testing.assert_array_equal(expected, result) + + # check meta + chunk_meta = await meta_api.get_chunk_meta(result_key) + assert chunk_meta is not None + assert chunk_meta["bands"][0] == (pool.external_address, "numa-0") + assert await subtask_runner.is_runner_free() is True + + +@pytest.mark.asyncio +async def test_shuffle_subtask(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + + pdf = pd.DataFrame({"f1": ["a", "b", "a"], "f2": [1, 2, 3]}) + df = md.DataFrame(pdf) + result = df.groupby("f1").sum(method="shuffle") + + graph = TileableGraph([result.data]) + next(TileableGraphBuilder(graph).build()) + chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + result_chunks = [] + new_chunk_graph = ChunkGraph(result_chunks) + chunk_graph_iter = chunk_graph.topological_iter() + curr = None + for _ in range(3): + prev = curr + curr = next(chunk_graph_iter) + new_chunk_graph.add_node(curr) + if prev is not None: + new_chunk_graph.add_edge(prev, curr) + assert curr.op.stage == OperandStage.map + curr.op.extra_params = {"analyzer_map_reduce_id": 0} + result_chunks.append(curr) + subtask = Subtask(new_task_id(), session_id, new_task_id(), new_chunk_graph) + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + await subtask_runner.run_subtask(subtask) + result = await subtask_runner.get_subtask_result() + assert result.status == SubtaskStatus.succeeded + + +@pytest.mark.asyncio +async def test_subtask_failure(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + + # test execution error + with mt.errstate(divide="raise"): + a = mt.ones((10, 10), chunk_size=10) + c = a / 0 + + subtask = _gen_subtask(c, session_id) + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + with pytest.raises(ExecutionError) as ex_info: + await subtask_runner.run_subtask(subtask) + assert isinstance(ex_info.value.nested_error, FloatingPointError) + result = await subtask_runner.get_subtask_result() + assert result.status == SubtaskStatus.errored + assert isinstance(result.error, FloatingPointError) + assert await subtask_runner.is_runner_free() is True + + +@pytest.mark.asyncio +async def test_cancel_subtask(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + 
SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + + def sleep(timeout: int): + time.sleep(timeout) + return timeout + + b = mr.spawn(sleep, 100) + + subtask = _gen_subtask(b, session_id) + asyncio.create_task(subtask_runner.run_subtask(subtask)) + await asyncio.sleep(0.2) + with Timer() as timer: + # normal cancel by cancel asyncio Task + aio_task = asyncio.create_task( + asyncio.wait_for(asyncio.shield(subtask_runner.cancel_subtask()), timeout=1) + ) + assert await subtask_runner.is_runner_free() is False + with pytest.raises(asyncio.TimeoutError): + await aio_task + # need 1 sec to reach timeout, then killing actor and wait for auto recovering + # the time would not be over 5 sec + assert timer.duration < 5 + + async def wait_slot_restore(): + while True: + try: + assert await subtask_runner.is_runner_free() is True + except (mo.ServerClosed, ConnectionRefusedError, mo.ActorNotExist): + await asyncio.sleep(0.5) + else: + break + + await mo.kill_actor(subtask_runner) + await wait_slot_restore() + + a = mr.spawn(sleep, 2) + + subtask2 = _gen_subtask(a, session_id) + asyncio.create_task(subtask_runner.run_subtask(subtask2)) + await asyncio.sleep(0.2) + with Timer() as timer: + # normal cancel by cancel asyncio Task + await asyncio.wait_for(subtask_runner.cancel_subtask(), timeout=6) + # do not need to wait 10 sec + assert timer.duration < 10 + assert await subtask_runner.is_runner_free() is True + + +@pytest.mark.asyncio +async def test_subtask_op_progress(actor_pool): + pool, session_id, meta_api, storage_api, manager = actor_pool + subtask_runner: SubtaskRunnerRef = await mo.actor_ref( + SubtaskRunnerActor.gen_uid("numa-0", 0), address=pool.external_address + ) + + def progress_sleep(interval: float, count: int): + for idx in range(count): + time.sleep(interval) + get_context().set_progress((1 + idx) * 1.0 / count) + + b = mr.spawn(progress_sleep, args=(0.75, 2)) + + subtask = _gen_subtask(b, session_id) + aio_task = asyncio.create_task(subtask_runner.run_subtask(subtask)) + try: + await asyncio.sleep(0.5) + result = await subtask_runner.get_subtask_result() + assert result.progress == 0.0 + + await asyncio.sleep(0.75) + result = await subtask_runner.get_subtask_result() + assert result.progress == 0.5 + finally: + await aio_task + + result = await subtask_runner.get_subtask_result() + assert result.progress == 1.0 + + +def test_update_subtask_result(): + subtask_result = SubtaskResult( + subtask_id="test_subtask_abc", + status=SubtaskStatus.pending, + progress=0.0, + bands=[("127.0.0.1", "numa-0")], + ) + new_result = SubtaskResult( + subtask_id="test_subtask_abc", + status=SubtaskStatus.succeeded, + progress=1.0, + bands=[("127.0.0.1", "numa-0")], + execution_start_time=1646125099.622051, + execution_end_time=1646125104.448726, + ) + subtask_result.update(new_result) + assert subtask_result.execution_start_time == new_result.execution_start_time + assert subtask_result.execution_end_time == new_result.execution_end_time diff --git a/python/xorbits/_mars/services/task/__init__.py b/python/xorbits/_mars/services/task/__init__.py new file mode 100644 index 000000000..13030132e --- /dev/null +++ b/python/xorbits/_mars/services/task/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
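# The cancellation path in SubtaskProcessorActor.cancel() above does not await
# the cancelled work inline: it cancels the running asyncio task and returns a
# separate "waiter" coroutine, so the actor stays responsive while the subtask
# winds down. A standalone sketch of that pattern (plain asyncio, no Mars
# objects involved; names are illustrative):
import asyncio


async def long_running():
    await asyncio.sleep(100)


async def main():
    task = asyncio.create_task(long_running())
    await asyncio.sleep(0)  # let the task start before cancelling it

    def cancel():
        task.cancel()

        async def waiter():
            try:
                await task
            except asyncio.CancelledError:
                pass

        # hand back the coroutine instead of awaiting it here
        return waiter()

    pending = cancel()  # returns immediately
    await pending       # the caller decides when (or whether) to wait


asyncio.run(main())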
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import AbstractTaskAPI, TaskAPI, WebTaskAPI +from .config import task_options +from .core import MapReduceInfo, Task, TaskResult, TaskStatus, new_task_id +from .errors import TaskNotExist diff --git a/python/xorbits/_mars/services/task/analyzer/__init__.py b/python/xorbits/_mars/services/task/analyzer/__init__.py new file mode 100644 index 000000000..a2e0c4cfd --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .analyzer import GraphAnalyzer diff --git a/python/xorbits/_mars/services/task/analyzer/analyzer.py b/python/xorbits/_mars/services/task/analyzer/analyzer.py new file mode 100644 index 000000000..10d82f5fb --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/analyzer.py @@ -0,0 +1,546 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging +from collections import defaultdict, deque +from typing import Dict, List, Tuple, Type, Union + +from ....config import Config +from ....core import ChunkGraph, ChunkType, enter_mode +from ....core.operand import ( + Fetch, + LogicKeyGenerator, + MapReduceOperand, + OperandStage, + ShuffleFetchType, + ShuffleProxy, + VirtualOperand, +) +from ....lib.ordered_set import OrderedSet +from ....resource import Resource +from ....typing import BandType, OperandType +from ....utils import build_fetch, build_fetch_shuffle, tokenize +from ...subtask import Subtask, SubtaskGraph +from ..core import MapReduceInfo, Task, new_task_id +from .assigner import AbstractGraphAssigner, GraphAssigner +from .fusion import Coloring + +logger = logging.getLogger(__name__) + + +def need_reassign_worker(op: OperandType) -> bool: + # NOTE(qinxuye): special process for reducer + # We'd better set reducer op's stage to reduce, however, + # in many case, we copy a reducer op from tileable op, + # then set stage as reducer one, + # it would be quite nasty to take over the __setattr__ and + # make reassign_worker True etc. 
+ return op.reassign_worker or ( + isinstance(op, MapReduceOperand) and op.stage == OperandStage.reduce + ) + + +class GraphAnalyzer: + """ + An subtask graph builder which build subtask graph for chunk graph based on passed band_resource. + + If push shuffle is used, this builder will validate predecessors orders consistency of ShuffleProxy between + chunk graph and generated subtask graph. + """ + + _map_reduce_id = itertools.count() + + def __init__( + self, + chunk_graph: ChunkGraph, + band_resource: Dict[BandType, Resource], + task: Task, + config: Config, + chunk_to_subtasks: Dict[ChunkType, Subtask], + graph_assigner_cls: Type[AbstractGraphAssigner] = None, + stage_id: str = None, + map_reduce_id_to_infos: Dict[int, MapReduceInfo] = None, + shuffle_fetch_type: ShuffleFetchType = ShuffleFetchType.FETCH_BY_KEY, + ): + self._chunk_graph = chunk_graph + self._final_result_chunks_set = set(self._chunk_graph.result_chunks) + self._band_resource = band_resource + self._task = task + self._stage_id = stage_id + self._config = config + self._shuffle_fetch_type = shuffle_fetch_type + self._has_shuffle = any( + isinstance(c.op, MapReduceOperand) for c in self._chunk_graph + ) + self._fuse_enabled = task.fuse_enabled + self._extra_config = task.extra_config + self._chunk_to_subtasks = chunk_to_subtasks + self._map_reduce_id_to_infos = map_reduce_id_to_infos + if graph_assigner_cls is None: + graph_assigner_cls = GraphAssigner + self._graph_assigner_cls = graph_assigner_cls + self._chunk_to_copied = dict() + self._logic_key_generator = LogicKeyGenerator() + + @classmethod + def next_map_reduce_id(cls) -> int: + return next(cls._map_reduce_id) + + @classmethod + def _iter_start_ops(cls, chunk_graph: ChunkGraph): + visited = set() + op_keys = set() + start_chunks = deque(chunk_graph.iter_indep()) + stack = deque([start_chunks.popleft()]) + + while stack: + chunk = stack.popleft() + if chunk not in visited: + inp_chunks = chunk_graph.predecessors(chunk) + if not inp_chunks or all( + inp_chunk in visited for inp_chunk in inp_chunks + ): + if len(inp_chunks) == 0: + op_key = chunk.op.key + if op_key not in op_keys: + op_keys.add(op_key) + yield chunk.op + visited.add(chunk) + stack.extend(c for c in chunk_graph[chunk] if c not in visited) + else: + stack.appendleft(chunk) + stack.extendleft( + reversed( + [ + c + for c in chunk_graph.predecessors(chunk) + if c not in visited + ] + ) + ) + if not stack and start_chunks: + stack.appendleft(start_chunks.popleft()) + + def _gen_input_chunks( + self, + inp_chunks: List[ChunkType], + chunk_to_fetch_chunk: Dict[ChunkType, ChunkType], + ) -> List[ChunkType]: + # gen fetch chunks for input chunks + inp_fetch_chunks = [] + for inp_chunk in inp_chunks: + if inp_chunk in chunk_to_fetch_chunk: + inp_fetch_chunks.append(chunk_to_fetch_chunk[inp_chunk]) + elif isinstance(inp_chunk.op, Fetch): + chunk_to_fetch_chunk[inp_chunk] = inp_chunk + inp_fetch_chunks.append(inp_chunk) + elif isinstance(inp_chunk.op, ShuffleProxy): + n_reducers = inp_chunk.op.n_reducers + fetch_chunk = build_fetch_shuffle( + inp_chunk, + n_reducers=n_reducers, + shuffle_fetch_type=self._shuffle_fetch_type, + ).data + chunk_to_fetch_chunk[inp_chunk] = fetch_chunk + inp_fetch_chunks.append(fetch_chunk) + else: + fetch_chunk = build_fetch(inp_chunk).data + chunk_to_fetch_chunk[inp_chunk] = fetch_chunk + inp_fetch_chunks.append(fetch_chunk) + + return inp_fetch_chunks + + @staticmethod + def _to_band(band_or_worker: Union[BandType, str]) -> BandType: + if isinstance(band_or_worker, tuple) and 
len(band_or_worker) == 2: + # band already + return band_or_worker + else: + return band_or_worker, "numa-0" + + @staticmethod + def _get_expect_band(op: OperandType): + if op.expect_band is not None: + return op.expect_band + elif op.expect_worker is not None: + return GraphAnalyzer._to_band(op.expect_worker) + + def _gen_subtask_info( + self, + chunks: List[ChunkType], + chunk_to_subtask: Dict[ChunkType, Subtask], + chunk_to_bands: Dict[ChunkType, BandType], + chunk_to_fetch_chunk: Dict[ChunkType, ChunkType], + ) -> Tuple[Subtask, List[Subtask], bool]: + # gen subtask and its input subtasks + chunks_set = set(chunks) + result_chunks = [] + result_chunks_set = set() + chunk_graph = ChunkGraph(result_chunks) + out_of_scope_chunks = [] + chunk_to_copied = self._chunk_to_copied + update_meta_chunks = [] + # subtask properties + band = None + is_virtual = None + retryable = True + chunk_priority = None + expect_band = None + bands_specified = None + processed = set() + for chunk in chunks: + if chunk in processed: + continue + if expect_band is None: + expect_band = self._get_expect_band(chunk.op) + bands_specified = expect_band is not None + else: # pragma: no cover + curr_expect_band = self._get_expect_band(chunk.op) + assert curr_expect_band is None or expect_band == curr_expect_band, ( + f"expect_band {curr_expect_band} conflicts with chunks that have same color: " + f"{expect_band}" + ) + # process band + chunk_band = chunk_to_bands.get(chunk) + if chunk_band is not None: + assert ( + band is None or band == chunk_band + ), "band conflicts with chunks that have same color" + band = chunk_band + # process is_virtual + if isinstance(chunk.op, VirtualOperand): + assert is_virtual is None, "only 1 virtual operand can exist" + is_virtual = True + else: + is_virtual = False + # process retryable + if not chunk.op.retryable: + retryable = False + # process priority + if chunk.op.priority is not None: + assert ( + chunk_priority is None or chunk_priority == chunk.op.priority + ), "priority conflicts with chunks that have same color" + chunk_priority = chunk.op.priority + # process input chunks + inp_chunks = [] + build_fetch_index_to_chunks = dict() + for i, inp_chunk in enumerate(chunk.inputs): + if inp_chunk in chunks_set: + inp_chunks.append(chunk_to_copied[inp_chunk]) + else: + build_fetch_index_to_chunks[i] = inp_chunk + inp_chunks.append(None) + if not isinstance(inp_chunk.op, Fetch): + out_of_scope_chunks.append(inp_chunk) + fetch_chunks = self._gen_input_chunks( + list(build_fetch_index_to_chunks.values()), chunk_to_fetch_chunk + ) + for i, fetch_chunk in zip(build_fetch_index_to_chunks, fetch_chunks): + inp_chunks[i] = fetch_chunk + copied_op = chunk.op.copy() + copied_op._key = chunk.op.key + out_chunks = [ + c.data + for c in copied_op.new_chunks( + inp_chunks, kws=[c.params.copy() for c in chunk.op.outputs] + ) + ] + for src_chunk, out_chunk in zip(chunk.op.outputs, out_chunks): + processed.add(src_chunk) + out_chunk._key = src_chunk.key + chunk_graph.add_node(out_chunk) + # cannot be copied twice + assert src_chunk not in chunk_to_copied + chunk_to_copied[src_chunk] = out_chunk + if src_chunk in self._final_result_chunks_set: + if out_chunk not in result_chunks_set: + # add to result chunks + result_chunks.append(out_chunk) + # chunk is in the result chunks of full chunk graph + # meta need to be updated + update_meta_chunks.append(out_chunk) + result_chunks_set.add(out_chunk) + if not is_virtual: + # skip adding fetch chunk to chunk graph when op is virtual operand + for c in 
inp_chunks: + if c not in chunk_graph: + chunk_graph.add_node(c) + chunk_graph.add_edge(c, out_chunk) + stage_n_outputs = len(result_chunks) + # add chunks with no successors into result chunks + result_chunks.extend( + c + for c in chunk_graph.iter_indep(reverse=True) + if c not in result_chunks_set + ) + expect_bands = ( + [expect_band] if bands_specified else ([band] if band is not None else None) + ) + # calculate priority + if out_of_scope_chunks: + inp_subtasks = [] + for out_of_scope_chunk in out_of_scope_chunks: + copied_out_of_scope_chunk = chunk_to_copied[out_of_scope_chunk] + inp_subtask = chunk_to_subtask[out_of_scope_chunk] + if ( + copied_out_of_scope_chunk + not in inp_subtask.chunk_graph.result_chunks + ): + # make sure the chunk that out of scope + # is in the input subtask's results, + # or the meta may be lost + inp_subtask.chunk_graph.result_chunks.append( + copied_out_of_scope_chunk + ) + inp_subtasks.append(inp_subtask) + depth = max(st.priority[0] for st in inp_subtasks) + 1 + else: + inp_subtasks = [] + depth = 0 + priority = (depth, chunk_priority or 0) + + subtask = Subtask( + subtask_id=new_task_id(), + stage_id=self._stage_id, + logic_key=self._gen_logic_key(chunks), + session_id=self._task.session_id, + task_id=self._task.task_id, + chunk_graph=chunk_graph, + expect_bands=expect_bands, + bands_specified=bands_specified, + virtual=is_virtual, + priority=priority, + retryable=retryable, + update_meta_chunks=update_meta_chunks, + extra_config=self._extra_config, + stage_n_outputs=stage_n_outputs, + ) + + is_shuffle_proxy = False + if self._has_shuffle: + proxy_chunks = [c for c in result_chunks if isinstance(c.op, ShuffleProxy)] + if proxy_chunks: + assert len(proxy_chunks) <= 1, proxy_chunks + is_shuffle_proxy = True + return subtask, inp_subtasks, is_shuffle_proxy + + def _gen_logic_key(self, chunks: List[ChunkType]): + return tokenize( + *[self._logic_key_generator.get_logic_key(chunk.op) for chunk in chunks] + ) + + def _gen_map_reduce_info( + self, chunk: ChunkType, assign_results: Dict[ChunkType, BandType] + ): + reducer_ops = OrderedSet( + [ + c.op + for c in self._chunk_graph.successors(chunk) + if c.op.stage == OperandStage.reduce + ] + ) + map_chunks = [ + c + for c in self._chunk_graph.predecessors(chunk) + if (c.op.stage == OperandStage.map) or c.is_mapper + ] + map_reduce_id = self.next_map_reduce_id() + for map_chunk in map_chunks: + # record analyzer map reduce id for mapper op + # copied chunk exists because map chunk must have + # been processed before shuffle proxy + copied_map_chunk = self._chunk_to_copied[map_chunk] + if not hasattr(copied_map_chunk, "extra_params"): # pragma: no cover + copied_map_chunk.extra_params = dict() + copied_map_chunk.extra_params["analyzer_map_reduce_id"] = map_reduce_id + reducer_bands = [assign_results[r.outputs[0]] for r in reducer_ops] + map_reduce_info = MapReduceInfo( + map_reduce_id=map_reduce_id, + reducer_indexes=[reducer_op.reducer_index for reducer_op in reducer_ops], + reducer_bands=reducer_bands, + ) + self._map_reduce_id_to_infos[map_reduce_id] = map_reduce_info + + @enter_mode(build=True) + def gen_subtask_graph( + self, op_to_bands: Dict[str, BandType] = None + ) -> SubtaskGraph: + """ + Analyze chunk graph and generate subtask graph. + + Returns + ------- + subtask_graph: SubtaskGraph + Subtask graph. 
+ """ + # reassign worker when specified reassign_worker = True + # or it's a reducer operands + reassign_worker_ops = [ + chunk.op for chunk in self._chunk_graph if need_reassign_worker(chunk.op) + ] + start_ops = ( + list(self._iter_start_ops(self._chunk_graph)) + if len(self._chunk_graph) > 0 + else [] + ) + + # assign start chunks + to_assign_ops = start_ops + reassign_worker_ops + assigner = self._graph_assigner_cls( + self._chunk_graph, to_assign_ops, self._band_resource + ) + # assign expect bands + cur_assigns = { + op.key: self._get_expect_band(op) + for op in start_ops + if op.expect_band is not None or op.expect_worker is not None + } + if op_to_bands: + cur_assigns.update(op_to_bands) + logger.debug( + "Start to assign %s start chunks for task %s", + len(start_ops), + self._task.task_id, + ) + chunk_to_bands = assigner.assign(cur_assigns=cur_assigns) + logger.debug( + "Assigned %s start chunks for task %s", len(start_ops), self._task.task_id + ) + # assign expect workers for those specified with `expect_worker` or `expect_band` + # skip `start_ops`, which have been assigned before + start_ops_set = set(start_ops) + for chunk in self._chunk_graph: + if chunk not in start_ops_set: + if chunk.op.expect_band is not None: + chunk_to_bands[chunk] = chunk.op.expect_band + elif chunk.op.expect_worker is not None: + chunk_to_bands[chunk] = self._to_band(chunk.op.expect_worker) + + # color nodes + if self._fuse_enabled: + logger.debug("Start to fuse chunks for task %s", self._task.task_id) + # sort start chunks in coloring as start_ops + op_key_to_chunks = defaultdict(list) + for chunk in self._chunk_graph: + op_key_to_chunks[chunk.op.key].append(chunk) + init_chunk_to_bands = dict() + for start_op in start_ops: + for start_chunk in op_key_to_chunks[start_op.key]: + init_chunk_to_bands[start_chunk] = chunk_to_bands[start_chunk] + if ( + self._has_shuffle + and self._shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX + ): + # ensure no shuffle mapper chunks fused into same subtask. + initial_same_color_num = 1 + else: + initial_same_color_num = getattr( + self._config, "initial_same_color_num", None + ) + coloring = Coloring( + self._chunk_graph, + list(self._band_resource), + init_chunk_to_bands, + initial_same_color_num=initial_same_color_num, + as_broadcaster_successor_num=getattr( + self._config, "as_broadcaster_successor_num", None + ), + ) + chunk_to_colors = coloring.color() + else: + # if not fuse enabled, color all chunks with different colors + op_to_colors = dict() + chunk_to_colors = dict() + color_gen = itertools.count() + for c in self._chunk_graph.topological_iter(): + if c.op not in op_to_colors: + chunk_to_colors[c] = op_to_colors[c.op] = next(color_gen) + else: + chunk_to_colors[c] = op_to_colors[c.op] + color_to_chunks = defaultdict(list) + for chunk, color in chunk_to_colors.items(): + if not isinstance(chunk.op, Fetch): + color_to_chunks[color].append(chunk) + + # gen subtask graph + subtask_graph = SubtaskGraph() + chunk_to_fetch_chunk = dict() + chunk_to_subtask = self._chunk_to_subtasks + # states + visited = set() + logic_key_to_subtasks = defaultdict(list) + if self._shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX: + for chunk in self._chunk_graph.topological_iter(): + if not isinstance(chunk.op, ShuffleProxy): + continue + # Can't use `OperandStage.map` to find mappers directly, since `stage` of some operand + # such as `DataFrameIndexAlign` are `OperandStage.map` but not a shuffle mapper sometimes. 
+ mapper_chunks = self._chunk_graph.predecessors(chunk) + for mapper_chunk in mapper_chunks: + chunk_color = chunk_to_colors[mapper_chunk] + same_color_chunks = color_to_chunks[chunk_color] + mappers = [ + c + for c in same_color_chunks + if c.op.stage == OperandStage.map + and any( + isinstance(succ.op, ShuffleProxy) + for succ in self._chunk_graph.iter_successors(c) + ) + ] + if len(mappers) > 1: + # ensure every subtask contains only at most one mapper + for mapper in mappers: + same_color_chunks.remove(mapper) + mapper_color = coloring.next_color() + chunk_to_colors[mapper] = mapper_color + color_to_chunks[mapper_color] = [mapper] + for chunk in self._chunk_graph.topological_iter(): + if chunk in visited or isinstance(chunk.op, Fetch): + # skip fetch chunk + continue + + color = chunk_to_colors[chunk] + same_color_chunks = color_to_chunks[color] + if all(isinstance(c.op, Fetch) for c in same_color_chunks): + # all fetch ops, no need to gen subtask + continue + subtask, inp_subtasks, is_shuffle_proxy = self._gen_subtask_info( + same_color_chunks, + chunk_to_subtask, + chunk_to_bands, + chunk_to_fetch_chunk, + ) + subtask_graph.add_node(subtask) + if is_shuffle_proxy: + subtask_graph.add_shuffle_proxy_subtask(subtask) + logic_key_to_subtasks[subtask.logic_key].append(subtask) + for inp_subtask in inp_subtasks: + subtask_graph.add_edge(inp_subtask, subtask) + + for c in same_color_chunks: + chunk_to_subtask[c] = subtask + if self._map_reduce_id_to_infos is not None and isinstance( + chunk.op, ShuffleProxy + ): + self._gen_map_reduce_info(chunk, chunk_to_bands) + visited.update(same_color_chunks) + + for subtasks in logic_key_to_subtasks.values(): + for logic_index, subtask in enumerate(subtasks): + subtask.logic_index = logic_index + subtask.logic_parallelism = len(subtasks) + return subtask_graph diff --git a/python/xorbits/_mars/services/task/analyzer/assigner.py b/python/xorbits/_mars/services/task/analyzer/assigner.py new file mode 100644 index 000000000..870dc4df1 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/assigner.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from collections import defaultdict +from operator import itemgetter +from typing import Dict, List, Union + +import numpy as np + +from ....core import ChunkData, ChunkGraph +from ....core.operand import Fetch, Operand +from ....lib.ordered_set import OrderedSet +from ....resource import Resource +from ....typing import BandType +from ....utils import implements + + +class AbstractGraphAssigner(ABC): + """ + Assign start nodes. + """ + + def __init__( + self, + chunk_graph: ChunkGraph, + start_ops: List[Operand], + band_resource: Dict[BandType, Resource], + ): + self._chunk_graph = chunk_graph + self._start_ops = start_ops + self._band_resource = band_resource + + @abstractmethod + def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType]: + """ + Assign start nodes to bands. 
+ + cur_assigns : dict + op already assigned. + + Returns + ------- + node_to_bands : dict + From node to band. + """ + + def _is_gpu_band(self) -> bool: + gpu_ops = ( + [op for op in self._start_ops if not isinstance(op, Fetch)] + if self._start_ops + else [] + ) + if gpu_ops and all(op.gpu for op in gpu_ops): + return True + return False + + def get_device_band_slots(self) -> Dict[BandType, int]: + if self._is_gpu_band(): # pragma: no cover + band_prefix = "gpu" + else: + band_prefix = "numa" + return { + band: resource.num_cpus or resource.num_gpus + for band, resource in self._band_resource.items() + if band[1].startswith(band_prefix) + } + + +class GraphAssigner(AbstractGraphAssigner): + def __init__( + self, + chunk_graph: ChunkGraph, + start_ops: List[Operand], + band_resource: Dict[BandType, Resource], + ): + super().__init__(chunk_graph, start_ops, band_resource) + self._op_keys: OrderedSet = OrderedSet([start_op.key for start_op in start_ops]) + + def _calc_band_assign_limits( + self, initial_count: int, occupied: Dict[BandType, int] + ) -> Dict[BandType, int]: + """ + Calculate limitation of number of initial operands for bands. + + Parameters + ---------- + initial_count : int + Number of nodes that is ready for running. + occupied : dict + Band to those initials that already assigned. + + Returns + ------- + slot_assign_limits: dict + Slot to limitation of number of initial operands. + """ + actual_count: int = initial_count - sum(occupied.values()) + band_slots = sorted( + self.get_device_band_slots().items(), key=itemgetter(1), reverse=True + ) + bands: List[BandType] = [it[0] for it in band_slots] + slots = np.asarray([it[1] for it in band_slots], dtype=np.float32) + + # remove assigned nodes from limitations + counts = initial_count * slots / slots.sum() + for i, band in enumerate(bands): + counts[i] = max(0, counts[i] - occupied.get(band, 0)) + + # all assigned, nothing to do + if counts.sum() == 0: + return {band: 0 for band in bands} + + # assign remaining nodes + counts = (actual_count * counts / counts.sum()).astype(np.int32) + pos = 0 + rest = actual_count - counts.sum() + while rest > 0: + counts[pos] += 1 + rest -= 1 + pos = (pos + 1) % len(counts) + return dict(zip(bands, counts)) + + @classmethod + def _assign_by_bfs( + cls, + undirected_chunk_graph: ChunkGraph, + start: ChunkData, + band: BandType, + initial_sizes: Dict[BandType, int], + spread_limits: Dict[BandType, float], + key_to_assign: OrderedSet, + assigned_record: Dict[str, Union[str, BandType]], + ): + """ + Assign initial nodes using breath-first search given initial sizes and + limitations of spread range. + """ + if initial_sizes[band] <= 0: + return + + assigned = 0 + spread_range = 0 + for chunk in undirected_chunk_graph.bfs(start=start, visit_predicate="all"): + op_key = chunk.op.key + if op_key in assigned_record: + continue + spread_range += 1 + # `op_key` may not be in `key_to_assign`, + # but we need to record it to avoid iterate the node repeatedly. 
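# A simplified, self-contained rendering of the quota-driven spreading performed
# by _assign_by_bfs together with GraphAssigner.assign: the band with the largest
# remaining quota takes the next candidate chunk and spreads over nearby nodes of
# the undirected graph, so related chunks land on the same band. The adjacency
# map, candidate names and quotas below are made up for illustration.
from collections import deque


def spread(adjacency, candidates, quotas):
    assigned = {}
    remaining = list(candidates)
    while remaining and max(quotas.values()) > 0:
        band = max(quotas, key=quotas.get)
        start = remaining.pop()
        queue, seen = deque([start]), set()
        while queue and quotas[band] > 0:
            node = queue.popleft()
            if node in seen or node in assigned:
                continue
            seen.add(node)
            if node in candidates:
                assigned[node] = band
                quotas[band] -= 1
            queue.extend(adjacency.get(node, ()))
    return assigned


adjacency = {"a": ["b"], "b": ["a", "c"], "c": ["b"], "x": ["y"], "y": ["x"]}
print(spread(adjacency, ["a", "c", "x"], {"band0": 2, "band1": 1}))
# -> {'x': 'band0', 'c': 'band0', 'a': 'band1'}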
+ assigned_record[op_key] = band + if op_key not in key_to_assign: + continue + assigned += 1 + if spread_range >= spread_limits[band] or assigned >= initial_sizes[band]: + break + initial_sizes[band] -= assigned + + def _build_undirected_chunk_graph( + self, chunk_to_assign: List[ChunkData] + ) -> ChunkGraph: + chunk_graph = self._chunk_graph.copy() + # remove edges for all chunk_to_assign which may contain chunks + # that need be reassigned + for chunk in chunk_to_assign: + if chunk_graph.count_predecessors(chunk) > 0: + for pred in list(chunk_graph.predecessors(chunk)): + chunk_graph.remove_edge(pred, chunk) + return chunk_graph.build_undirected() + + @implements(AbstractGraphAssigner.assign) + def assign( + self, cur_assigns: Dict[str, BandType] = None + ) -> Dict[ChunkData, BandType]: + graph = self._chunk_graph + assign_result = dict() + cur_assigns = cur_assigns or dict() + # assigned by expect worker or band + initial_assigned_op_keys = set(cur_assigns) + + op_key_to_chunks = defaultdict(list) + for chunk in graph: + op_key_to_chunks[chunk.op.key].append(chunk) + + op_keys = OrderedSet(self._op_keys) + chunk_to_assign = [ + op_key_to_chunks[op_key][0] + for op_key in op_keys + if op_key not in cur_assigns + ] + assigned_counts = defaultdict(lambda: 0) + for band in cur_assigns.values(): + assigned_counts[band] += 1 + + # build undirected graph + undirected_chunk_graph = self._build_undirected_chunk_graph(chunk_to_assign) + + # calculate the number of chunks to be assigned to each band + # given number of bands and existing assignments + band_quotas = self._calc_band_assign_limits( + len(chunk_to_assign) + sum(assigned_counts.values()), assigned_counts + ) + + # calculate expected descendant count (spread range) of + # every band and subtract assigned number from it + average_spread_range = len(graph) * 1.0 / len(self.get_device_band_slots()) + spread_ranges = defaultdict(lambda: average_spread_range) + # assign from other chunks to be assigned + # TODO: sort by what? + sorted_candidates = chunk_to_assign.copy() + while max(band_quotas.values()): + band = max(band_quotas, key=lambda k: band_quotas[k]) + cur = sorted_candidates.pop() + while cur.op.key in cur_assigns: + cur = sorted_candidates.pop() + self._assign_by_bfs( + undirected_chunk_graph, + cur, + band, + band_quotas, + spread_ranges, + op_keys, + cur_assigns, + ) + + key_to_assign = {n.op.key for n in chunk_to_assign} | initial_assigned_op_keys + for op_key, band in cur_assigns.items(): + if op_key in key_to_assign: + for chunk in op_key_to_chunks[op_key]: + assign_result[chunk] = band + + return assign_result diff --git a/python/xorbits/_mars/services/task/analyzer/fusion.py b/python/xorbits/_mars/services/task/analyzer/fusion.py new file mode 100644 index 000000000..653a3d83a --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/fusion.py @@ -0,0 +1,194 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
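# A standalone rendering of the limit computation in _calc_band_assign_limits
# above: initial operands are split across bands in proportion to their slot
# counts, after subtracting assignments that already exist. Band addresses and
# the numbers below are made up for illustration.
import numpy as np


def calc_limits(initial_count, band_slots, occupied):
    bands = sorted(band_slots, key=band_slots.get, reverse=True)
    slots = np.asarray([band_slots[b] for b in bands], dtype=np.float32)
    counts = initial_count * slots / slots.sum()
    for i, band in enumerate(bands):
        counts[i] = max(0, counts[i] - occupied.get(band, 0))
    if counts.sum() == 0:
        return {band: 0 for band in bands}
    actual = initial_count - sum(occupied.values())
    counts = (actual * counts / counts.sum()).astype(np.int32)
    pos, rest = 0, actual - counts.sum()
    while rest > 0:
        counts[pos] += 1
        rest -= 1
        pos = (pos + 1) % len(counts)
    return dict(zip(bands, counts.tolist()))


# 10 initial chunks, two bands with 4 and 2 slots, one chunk already on w1:
# the remaining 9 are split roughly in the 2:1 slot ratio.
print(calc_limits(10, {("w1", "numa-0"): 4, ("w2", "numa-0"): 2}, {("w1", "numa-0"): 1}))
# -> {('w1', 'numa-0'): 6, ('w2', 'numa-0'): 3}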
+ +import itertools +from collections import defaultdict +from typing import Dict, List + +from ....config import options +from ....core import ChunkGraph +from ....core.operand import VirtualOperand +from ....typing import BandType, ChunkType, OperandType + + +class Coloring: + """ + Coloring a chunk graph according to an algorithm + described in https://github.com/mars-project/mars/issues/2435 + """ + + def __init__( + self, + chunk_graph: ChunkGraph, + all_bands: List[BandType], + chunk_to_bands: Dict[ChunkType, BandType], + initial_same_color_num: int = None, + as_broadcaster_successor_num: int = None, + ): + self.chunk_graph = chunk_graph + self.all_bands = all_bands + self.chunk_to_bands = chunk_to_bands + if initial_same_color_num is None: + has_gpu = any(c.op.gpu for c in chunk_graph) + if not has_gpu: + initial_same_color_num = max(options.combine_size // 2, 1) + else: + # if gpu exists, we try to fuse more node to reduce cost + initial_same_color_num = max(options.combine_size * 2, 1) + self.initial_same_color_num = initial_same_color_num + if as_broadcaster_successor_num is None: + as_broadcaster_successor_num = options.combine_size * 2 + self.successor_same_color_num = as_broadcaster_successor_num + + self._coloring_iter = itertools.count() + + def next_color(self) -> int: + return next(self._coloring_iter) + + @classmethod + def _can_color_same(cls, chunk: ChunkType, predecessors: List[ChunkType]) -> bool: + if ( + # VirtualOperand cannot be fused + any(isinstance(n.op, VirtualOperand) for n in [chunk] + predecessors) + # allocated on different bands + or len({n.op.gpu for n in [chunk] + predecessors}) > 1 + # expect worker changed + or len({n.op.expect_worker for n in [chunk] + predecessors}) > 1 + # scheduling hint tells that cannot be fused + or ( + chunk.op.scheduling_hint is not None + and not chunk.op.scheduling_hint.can_be_fused() + ) + ): + return False + return True + + def _color_init_nodes(self) -> Dict[OperandType, int]: + # for initial op with same band but different priority + # we color them w/ different colors, + # to prevent from wrong fusion. + # e.g. 
md.read_csv ensure incremental index by generating + # chunks with ascending priorities (smaller one has higher priority), + # chunk 0 has higher priority than chunk 1, + # so that when chunk 1 executing, it would know chunk 0's shape + # TODO: make it general instead handle priority as a special case + band_priority_to_colors = dict() + for chunk, band in self.chunk_to_bands.items(): + band_priority = (band, chunk.op.priority) + if band_priority not in band_priority_to_colors: + band_priority_to_colors[band_priority] = self.next_color() + + band_priority_to_color_list = defaultdict(list) + for (band, priority), color in band_priority_to_colors.items(): + band_priority_to_color_list[band, priority].append(color) + color_to_size = defaultdict(lambda: 0) + op_to_colors = dict() + for chunk, band in self.chunk_to_bands.items(): + priority = chunk.op.priority + color = band_priority_to_color_list[band, priority][-1] + size = color_to_size[color] + if size >= self.initial_same_color_num: + color = self.next_color() + band_priority_to_color_list[band, priority].append(color) + color_to_size[color] += 1 + op_to_colors[chunk.op] = color + return op_to_colors + + def color(self) -> Dict[ChunkType, int]: + chunk_to_colors = dict() + + # step 1: Coloring the initial nodes according to the bands that assigned by assigner + op_to_colors = self._color_init_nodes() + + # step2: Propagate color in the topological order, + # if the input nodes have same color, color it with the same color; + # otherwise, color with a new color. + broadcaster_chunk_set = set() + for chunk in self.chunk_graph.topological_iter(): + if self.chunk_graph.count_successors(chunk) > self.successor_same_color_num: + # is broadcaster + broadcaster_chunk_set.add(chunk) + + if chunk.op in op_to_colors: + # colored + chunk_to_colors[chunk] = op_to_colors[chunk.op] + continue + + predecessors = self.chunk_graph.predecessors(chunk) + pred_colors = {op_to_colors[pred.op] for pred in predecessors} + if len(predecessors) == 1 and predecessors[0] in broadcaster_chunk_set: + # TODO: handle situation that chunks which specify reassign_workers + # predecessor is broadcaster, just allocate a new color + color = self.next_color() + elif len(pred_colors) == 1: + if self._can_color_same(chunk, predecessors): + # predecessors have only 1 color, will color with same one + color = next(iter(pred_colors)) + else: + color = self.next_color() + else: + # has more than 1 color, color a new one + assert len(pred_colors) > 1 + color = self.next_color() + + op_to_colors[chunk.op] = chunk_to_colors[chunk] = color + + # step 3: Propagate with reversed topological order, + # check a node with its inputs, if all inputs have different color with itself, skip; + # otherwise, if some of inputs have the same color, but some others have different color, + # color the input nodes with same one with a new color, and propagate to its inputs and so on. 
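# A toy rendering of the forward propagation rule from step 2 of Coloring.color()
# above: a chunk keeps its predecessors' color only when they all share a single
# color (and fusion is allowed); otherwise it opens a fresh color. The little
# graph below is illustrative only.
import itertools

color_gen = itertools.count()
preds = {"c": ["a", "b"], "d": ["c"]}  # toy edges: a -> c, b -> c, c -> d
colors = {"a": next(color_gen), "b": next(color_gen)}  # initial chunks on two bands

for chunk in ("c", "d"):  # topological order
    pred_colors = {colors[p] for p in preds[chunk]}
    colors[chunk] = pred_colors.pop() if len(pred_colors) == 1 else next(color_gen)

print(colors)  # a and b differ, so c starts a new color; d then joins c's group
# -> {'a': 0, 'b': 1, 'c': 2, 'd': 2}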
+ for chunk in self.chunk_graph.topological_iter(reverse=True): + pred_colors = { + op_to_colors[pred.op] + for pred in self.chunk_graph.iter_successors(chunk) + } + chunk_color = chunk_to_colors[chunk] + if chunk_color in pred_colors and len(pred_colors) > 1: + # conflict + # color the successors with new colors + stack = [] + for succ in self.chunk_graph.iter_successors(chunk): + if chunk_to_colors[succ] == chunk_color: + new_color = op_to_colors[succ.op] = self.next_color() + for c in succ.op.outputs: + if c not in self.chunk_graph: # pragma: no cover + continue + chunk_to_colors[c] = new_color + stack.extend(self.chunk_graph.successors(c)) + # color the descendants with same color to the new one + # the descendants will not be visited more than 2 times + while len(stack) > 0: + node = stack.pop() + node_color = chunk_to_colors[node] + if node_color == chunk_color: + # same color, recolor to the new one + node_pred_colors = list( + { + op_to_colors[inp.op] + for inp in self.chunk_graph.iter_predecessors(node) + } + ) + node_input_same_color = len(node_pred_colors) == 1 + if node_input_same_color: + node_new_color = node_pred_colors[0] + else: + node_new_color = self.next_color() + op_to_colors[node.op] = node_new_color + for c in node.op.outputs: + if c not in self.chunk_graph: # pragma: no cover + continue + chunk_to_colors[c] = node_new_color + stack.extend(self.chunk_graph.successors(c)) + + return chunk_to_colors diff --git a/python/xorbits/_mars/services/task/analyzer/tests/__init__.py b/python/xorbits/_mars/services/task/analyzer/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/analyzer/tests/test_analyzer.py b/python/xorbits/_mars/services/task/analyzer/tests/test_analyzer.py new file mode 100644 index 000000000..e00690a31 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/tests/test_analyzer.py @@ -0,0 +1,81 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import dataframe as md +from ..... 
import tensor as mt +from .....config import Config +from .....core.operand.shuffle import ShuffleFetchType, ShuffleProxy +from .....resource import Resource +from ...core import Task +from ..analyzer import GraphAnalyzer + +t1 = mt.random.RandomState(0).rand(31, 27, chunk_size=10) +t2 = t1.reshape(27, 31) +t2.op.extra_params["_reshape_with_shuffle"] = True +df1 = md.DataFrame(t1, columns=[f"c{i}" for i in range(t1.shape[1])]) +df2 = df1.groupby(["c1"]).apply(lambda pdf: pdf.sum()) + + +@pytest.mark.parametrize("tileable", [df1.describe(), df2, t2]) +@pytest.mark.parametrize("fuse", [True, False]) +def test_shuffle_graph(tileable, fuse): + # can't test df.groupby and mt.bincount, those chunk graph build depend on ctx.get_chunks_meta/get_chunks_result + chunk_graph = tileable.build_graph(tile=True) + assert len(chunk_graph) > 0 + all_bands = [(f"address_{i}", "numa-0") for i in range(5)] + band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands) + task = Task("mock_task", "mock_session", fuse_enabled=fuse) + analyzer = GraphAnalyzer( + chunk_graph, + band_resource, + task, + Config(), + dict(), + shuffle_fetch_type=ShuffleFetchType.FETCH_BY_INDEX, + ) + subtask_graph = analyzer.gen_subtask_graph() + proxy_subtasks = [] + for subtask in subtask_graph: + for c in subtask.chunk_graph.results: + if isinstance(c.op, ShuffleProxy): + assert len(subtask.chunk_graph.results) == 1 + proxy_subtasks.append(subtask) + proxy_chunks = [ + c + for subtask in proxy_subtasks + for c in chunk_graph + if subtask.chunk_graph.results[0].key == c.key + ] + assert len(proxy_subtasks) == len(proxy_chunks) + assert len(proxy_subtasks) > 0 + assert len(proxy_subtasks) == len(subtask_graph.get_shuffle_proxy_subtasks()) + for proxy_chunk, proxy_subtask in zip(proxy_chunks, proxy_subtasks): + reducer_subtasks = subtask_graph.successors(proxy_subtask) + for reducer_subtask in reducer_subtasks: + start_chunks = list(reducer_subtask.chunk_graph.iter_indep()) + assert len(start_chunks) == 1 + assert ( + start_chunks[0].op.shuffle_fetch_type == ShuffleFetchType.FETCH_BY_INDEX + ) + reducer_chunks = chunk_graph.successors(proxy_chunk) + # single reducer may have multiple output chunks, see `PSRSShuffle._execute_reduce + if len(reducer_subtasks) != len(reducer_chunks): + assert len(reducer_subtasks) == len(set(c.op for c in reducer_chunks)) + mapper_subtasks = subtask_graph.predecessors(proxy_subtask) + for mapper_subtask in mapper_subtasks: + assert len(mapper_subtask.chunk_graph.results) == 1 + mapper_chunks = chunk_graph.predecessors(proxy_chunk) + assert len(mapper_subtasks) == len(mapper_chunks) diff --git a/python/xorbits/_mars/services/task/analyzer/tests/test_assigner.py b/python/xorbits/_mars/services/task/analyzer/tests/test_assigner.py new file mode 100644 index 000000000..cc64f0cb8 --- /dev/null +++ b/python/xorbits/_mars/services/task/analyzer/tests/test_assigner.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +from ..... import dataframe as md +from .....config import Config +from .....core import ChunkGraph +from .....core.graph.builder.utils import build_graph +from .....core.operand import OperandStage +from .....resource import Resource +from .....tensor.arithmetic import TensorAdd +from .....tensor.fetch import TensorFetch +from .....tensor.random import TensorRand +from ...core import Task +from ..analyzer import GraphAnalyzer, need_reassign_worker +from ..assigner import GraphAssigner + + +def test_assigner_with_fetch_inputs(): + band_num = 8 + all_bands = [(f"address_{i}", "numa-0") for i in range(band_num)] + inputs = [ + TensorFetch(key=str(i), source_key=str(i), dtype=np.dtype(int)).new_chunk([]) + for i in range(band_num) + ] + no_fetch_inputs = [TensorRand(i).new_chunk([]) for i in range(4)] + results = [TensorAdd(lhs=inp, rhs=1).new_chunk([inp]) for inp in inputs] + cur_assigns = dict( + (fetch_chunk.op.key, band[0][0]) + for fetch_chunk, band in zip(reversed(inputs), all_bands) + ) + + chunk_graph = ChunkGraph() + for fetch_chunk, add_chunk in zip(inputs, results): + chunk_graph.add_node(fetch_chunk) + chunk_graph.add_node(add_chunk) + chunk_graph.add_edge(fetch_chunk, add_chunk) + for inp in no_fetch_inputs: + results.append(inp) + chunk_graph.add_node(inp) + chunk_graph.results = results + + band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands) + + task = Task("mock_task", "mock_session") + analyzer = GraphAnalyzer(chunk_graph, band_resource, task, Config(), dict()) + subtask_graph = analyzer.gen_subtask_graph(cur_assigns) + + assigner = GraphAssigner( + chunk_graph, list(GraphAnalyzer._iter_start_ops(chunk_graph)), band_resource + ) + assigns = assigner.assign(cur_assigns) + key_to_assign = dict((c.key, band) for c, band in assigns.items()) + for subtask in subtask_graph: + input_chunks = list(subtask.chunk_graph.iter_indep()) + if all(isinstance(inp.op, TensorFetch) for inp in input_chunks): + # all inputs are fetch, expect band should be None + assert subtask.expect_band is None + else: + # if subtask has truly initial chunks, expect band should be + # same as assign results + for inp in input_chunks: + if not isinstance(inp.op, TensorFetch): + assert subtask.expect_band == key_to_assign[inp.key] + + +def test_shuffle_assign(): + band_num = 8 + all_bands = [(f"address_{i}", "numa-0") for i in range(band_num)] + + pdf = pd.DataFrame(np.random.rand(32, 4)) + df = md.DataFrame(pdf, chunk_size=4) + r = df.groupby(0).sum(method="shuffle") + chunk_graph = build_graph([r], tile=True) + + band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands) + + reassign_worker_ops = [ + chunk.op for chunk in chunk_graph if need_reassign_worker(chunk.op) + ] + start_ops = list(GraphAnalyzer._iter_start_ops(chunk_graph)) + to_assign_ops = start_ops + reassign_worker_ops + + assigner = GraphAssigner(chunk_graph, to_assign_ops, band_resource) + assigns = assigner.assign() + assert len(assigns) == 16 + init_assigns = set() + reducer_assigns = set() + for chunk, assign in assigns.items(): + if chunk.op.stage == OperandStage.reduce: + reducer_assigns.add(assign) + else: + init_assigns.add(assign) + # init and reducers are assigned on all bands + assert len(init_assigns) == len(reducer_assigns) == 8 diff --git a/python/xorbits/_mars/services/task/analyzer/tests/test_fusion.py b/python/xorbits/_mars/services/task/analyzer/tests/test_fusion.py new file mode 100644 index 000000000..ece9c18ac --- /dev/null +++ 
b/python/xorbits/_mars/services/task/analyzer/tests/test_fusion.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .....core import ChunkGraph +from .....tensor.arithmetic import TensorTreeAdd +from ..fusion import Coloring + + +def test_simple_coloring(): + # graph: https://user-images.githubusercontent.com/357506/132340029-b595afcf-3cec-44cb-b1c3-aac379e2e607.png + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(8) + ] + graph = ChunkGraph([chunks[3], chunks[7]]) + for c in chunks: + graph.add_node(c) + chunks[2].op._inputs = [chunks[0], chunks[1]] + graph.add_edge(chunks[0], chunks[2]) + graph.add_edge(chunks[1], chunks[2]) + chunks[3].op._inputs = [chunks[2]] + graph.add_edge(chunks[2], chunks[3]) + chunks[6].op._inputs = [chunks[4], chunks[5]] + graph.add_edge(chunks[4], chunks[6]) + graph.add_edge(chunks[5], chunks[6]) + chunks[7].op._inputs = [chunks[6]] + graph.add_edge(chunks[6], chunks[7]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], + chunks[1]: all_bands[0], + chunks[4]: all_bands[1], + chunks[5]: all_bands[1], + } + + # allocate node 0, 1 with band 0, node 4, 5 with band 1 + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 2 + assert ( + chunk_to_colors[chunks[0]] + == chunk_to_colors[chunks[1]] + == chunk_to_colors[chunks[2]] + == chunk_to_colors[chunks[3]] + ) + assert ( + chunk_to_colors[chunks[4]] + == chunk_to_colors[chunks[5]] + == chunk_to_colors[chunks[6]] + == chunk_to_colors[chunks[7]] + ) + + # initial nodes all have different colors + coloring = Coloring(graph, all_bands, chunk_to_bands, initial_same_color_num=1) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 6 + assert ( + len( + { + chunk_to_colors[chunks[0]], + chunk_to_colors[chunks[1]], + chunk_to_colors[chunks[2]], + } + ) + == 3 + ) + assert chunk_to_colors[chunks[2]] == chunk_to_colors[chunks[3]] + + +def test_coloring_with_gpu_attr(): + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(8) + ] + graph = ChunkGraph([chunks[3], chunks[7]]) + for c in chunks: + graph.add_node(c) + + # two lines, one line can be fused as one task, + # the other cannot, because gpu attributes are different + chunks[0].op.gpu = True + chunks[1].op.gpu = True + chunks[1].op._inputs = [chunks[0]] + graph.add_edge(chunks[0], chunks[1]) + chunks[2].op._inputs = [chunks[1]] + graph.add_edge(chunks[1], chunks[2]) + chunks[3].op._inputs = [chunks[2]] + graph.add_edge(chunks[2], chunks[3]) + chunks[5].op._inputs = [chunks[4]] + graph.add_edge(chunks[4], chunks[5]) + chunks[6].op._inputs = [chunks[5]] + graph.add_edge(chunks[5], chunks[6]) + chunks[7].op._inputs = [chunks[6]] + graph.add_edge(chunks[6], chunks[7]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], 
+ chunks[4]: all_bands[1], + } + + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 3 + assert chunk_to_colors[chunks[0]] == chunk_to_colors[chunks[1]] + assert chunk_to_colors[chunks[2]] == chunk_to_colors[chunks[3]] + assert ( + chunk_to_colors[chunks[4]] + == chunk_to_colors[chunks[5]] + == chunk_to_colors[chunks[6]] + == chunk_to_colors[chunks[7]] + ) + + +def test_complex_coloring(): + # graph: https://user-images.githubusercontent.com/357506/132340055-f08106dd-b507-4e24-bc79-8364d6e1ef79.png + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data + for n in range(13) + ] + graph = ChunkGraph([chunks[7], chunks[12]]) + for c in chunks: + graph.add_node(c) + chunks[2].op._inputs = [chunks[0], chunks[1]] + graph.add_edge(chunks[0], chunks[2]) + graph.add_edge(chunks[1], chunks[2]) + chunks[10].op._inputs = [chunks[8], chunks[9]] + graph.add_edge(chunks[8], chunks[10]) + graph.add_edge(chunks[9], chunks[10]) + chunks[3].op._inputs = [chunks[2]] + graph.add_edge(chunks[2], chunks[3]) + chunks[4].op._inputs = [chunks[3]] + graph.add_edge(chunks[3], chunks[4]) + chunks[5].op._inputs = [chunks[2], chunks[10]] + graph.add_edge(chunks[2], chunks[5]) + graph.add_edge(chunks[10], chunks[5]) + chunks[6].op._inputs = [chunks[5]] + graph.add_edge(chunks[5], chunks[6]) + chunks[7].op._inputs = [chunks[4], chunks[6]] + graph.add_edge(chunks[4], chunks[7]) + graph.add_edge(chunks[6], chunks[7]) + chunks[11].op._inputs = [chunks[10]] + graph.add_edge(chunks[10], chunks[11]) + chunks[12].op._inputs = [chunks[6], chunks[11]] + graph.add_edge(chunks[6], chunks[12]) + graph.add_edge(chunks[11], chunks[12]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], + chunks[1]: all_bands[0], + chunks[8]: all_bands[1], + chunks[9]: all_bands[1], + } + # allocate node 0, 1 with band 0, node 8, 9 with band 1 + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 7 + assert ( + chunk_to_colors[chunks[0]] + == chunk_to_colors[chunks[1]] + == chunk_to_colors[chunks[2]] + ) + assert chunk_to_colors[chunks[3]] == chunk_to_colors[chunks[4]] + assert chunk_to_colors[chunks[5]] == chunk_to_colors[chunks[6]] + assert ( + chunk_to_colors[chunks[8]] + == chunk_to_colors[chunks[9]] + == chunk_to_colors[chunks[10]] + ) + assert ( + len( + { + chunk_to_colors[chunks[0]], + chunk_to_colors[chunks[3]], + chunk_to_colors[chunks[5]], + chunk_to_colors[chunks[7]], + chunk_to_colors[chunks[8]], + chunk_to_colors[chunks[11]], + chunk_to_colors[chunks[12]], + } + ) + == 7 + ) + + +def test_coloring_broadcaster(): + chunks = [ + TensorTreeAdd(args=[], _key=str(n)).new_chunk(None, None).data for n in range(3) + ] + graph = ChunkGraph([chunks[2]]) + for c in chunks: + graph.add_node(c) + chunks[1].op._inputs = [chunks[0]] + graph.add_edge(chunks[0], chunks[1]) + chunks[2].op._inputs = [chunks[0]] + graph.add_edge(chunks[0], chunks[2]) + + all_bands = [("127.0.0.1", "0"), ("127.0.0.1", "1")] + chunk_to_bands = { + chunks[0]: all_bands[0], + } + + coloring = Coloring(graph, all_bands, chunk_to_bands) + chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 1 + assert ( + chunk_to_colors[chunks[0]] + == chunk_to_colors[chunks[1]] + == chunk_to_colors[chunks[2]] + ) + coloring = Coloring( + graph, all_bands, chunk_to_bands, as_broadcaster_successor_num=1 + ) + 
chunk_to_colors = coloring.color() + assert len(set(chunk_to_colors.values())) == 3 + assert ( + len( + { + chunk_to_colors[chunks[0]], + chunk_to_colors[chunks[1]], + chunk_to_colors[chunks[2]], + } + ) + == 3 + ) diff --git a/python/xorbits/_mars/services/task/api/__init__.py b/python/xorbits/_mars/services/task/api/__init__.py new file mode 100644 index 000000000..d1e1792c9 --- /dev/null +++ b/python/xorbits/_mars/services/task/api/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import AbstractTaskAPI +from .oscar import TaskAPI +from .web import WebTaskAPI diff --git a/python/xorbits/_mars/services/task/api/core.py b/python/xorbits/_mars/services/task/api/core.py new file mode 100644 index 000000000..47c7330ac --- /dev/null +++ b/python/xorbits/_mars/services/task/api/core.py @@ -0,0 +1,147 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import List, Union + +from ....core import Tileable +from ..core import TaskResult, TileableGraph + + +class AbstractTaskAPI(ABC): + @abstractmethod + async def get_task_results(self, progress: bool = False) -> List[TaskResult]: + """ + Get results of all tasks in the session + + Parameters + ---------- + progress : bool + If True, will return task progress + + Returns + ------- + task_results: List[TaskResult] + List of task results + """ + + @abstractmethod + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = True, + extra_config: dict = None, + ) -> str: + """ + Submit a tileable graph + + Parameters + ---------- + graph : TileableGraph + Tileable graph. + task_name : str + Task name + fuse_enabled : bool + Enable fuse optimization + extra_config : dict + Extra config. + + Returns + ------- + task_id : str + Task ID. + """ + + @abstractmethod + async def wait_task(self, task_id: str, timeout: float = None): + """ + Wait for a task to finish. + + Parameters + ---------- + task_id : str + Task ID + timeout: float + Second to timeout + """ + + @abstractmethod + async def cancel_task(self, task_id: str): + """ + Cancel task. + + Parameters + ---------- + task_id : str + Task ID. + """ + + @abstractmethod + async def get_task_result(self, task_id: str) -> TaskResult: + """ + Get task status. + + Parameters + ---------- + task_id : str + Task ID. + + Returns + ------- + result : TaskResult + Task result. 
+ """ + + @abstractmethod + async def get_task_progress(self, task_id: str) -> float: + """ + Get task progress. + + Parameters + ---------- + task_id : str + Task ID. + + Returns + ------- + progress : float + Get task progress. + """ + + @abstractmethod + async def get_fetch_tileables(self, task_id: str) -> List[Tileable]: + """ + Get fetch tileable for a task. + + Parameters + ---------- + task_id : str + Task ID. + + Returns + ------- + fetch_tileable_list + Fetch tileable list. + """ + + @abstractmethod + async def get_last_idle_time(self) -> Union[float, None]: + """ + Get last idle time from task manager. + + Returns + ------- + last_idle_time: float + The last idle time if the task manager is idle else None. + """ diff --git a/python/xorbits/_mars/services/task/api/oscar.py b/python/xorbits/_mars/services/task/api/oscar.py new file mode 100644 index 000000000..9750f5590 --- /dev/null +++ b/python/xorbits/_mars/services/task/api/oscar.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +from .... import oscar as mo +from ....core import Tileable +from ....lib.aio import alru_cache +from ...subtask import SubtaskResult +from ..core import MapReduceInfo, TaskResult, TileableGraph +from ..supervisor.manager import TaskManagerActor +from .core import AbstractTaskAPI + + +class TaskAPI(AbstractTaskAPI): + def __init__( + self, session_id: str, task_manager_ref: mo.ActorRefType[TaskManagerActor] + ): + self._session_id = session_id + self._task_manager_ref = task_manager_ref + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls, session_id: str, address: str) -> "TaskAPI": + """ + Create Task API. + + Parameters + ---------- + session_id : str + Session ID + address : str + Supervisor address. + + Returns + ------- + task_api + Task API. 
+ """ + task_manager_ref = await mo.actor_ref( + address, TaskManagerActor.gen_uid(session_id) + ) + return TaskAPI(session_id, task_manager_ref) + + async def get_task_results(self, progress: bool = False) -> List[TaskResult]: + return await self._task_manager_ref.get_task_results(progress) + + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = None, + extra_config: dict = None, + ) -> str: + try: + return await self._task_manager_ref.submit_tileable_graph( + graph, fuse_enabled=fuse_enabled, extra_config=extra_config + ) + except mo.ActorNotExist: + raise RuntimeError("Session closed already") + + async def get_tileable_graph_as_json(self, task_id: str): + return await self._task_manager_ref.get_tileable_graph_dict_by_task_id(task_id) + + async def get_tileable_details(self, task_id: str): + return await self._task_manager_ref.get_tileable_details(task_id) + + async def get_tileable_subtasks( + self, task_id: str, tileable_id: str, with_input_output: bool + ): + return await self._task_manager_ref.get_tileable_subtasks( + task_id, tileable_id, with_input_output + ) + + async def wait_task(self, task_id: str, timeout: float = None): + return await self._task_manager_ref.wait_task(task_id, timeout=timeout) + + async def get_task_result(self, task_id: str) -> TaskResult: + return await self._task_manager_ref.get_task_result(task_id) + + async def get_task_progress(self, task_id: str) -> float: + return await self._task_manager_ref.get_task_progress(task_id) + + async def cancel_task(self, task_id: str): + return await self._task_manager_ref.cancel_task(task_id) + + async def get_fetch_tileables(self, task_id: str) -> List[Tileable]: + return await self._task_manager_ref.get_task_result_tileables(task_id) + + async def set_subtask_result(self, subtask_result: SubtaskResult): + return await self._task_manager_ref.set_subtask_result.tell(subtask_result) + + async def get_last_idle_time(self) -> Union[float, None]: + return await self._task_manager_ref.get_last_idle_time() + + async def remove_tileables(self, tileable_keys: List[str]): + return await self._task_manager_ref.remove_tileables(tileable_keys) + + async def get_map_reduce_info( + self, task_id: str, map_reduce_id: int + ) -> MapReduceInfo: + return await self._task_manager_ref.get_map_reduce_info(task_id, map_reduce_id) diff --git a/python/xorbits/_mars/services/task/api/web.py b/python/xorbits/_mars/services/task/api/web.py new file mode 100644 index 000000000..86f10d01a --- /dev/null +++ b/python/xorbits/_mars/services/task/api/web.py @@ -0,0 +1,296 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import base64 +import json +from typing import Callable, List, Optional, Union + +from ....core import Tileable, TileableGraph +from ....lib.tbcode import dump_traceback_code, load_traceback_code +from ....utils import deserialize_serializable, serialize_serializable +from ...web import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, web_api +from ..core import TaskResult, TaskStatus +from .core import AbstractTaskAPI + + +def _json_serial_task_result(result: Optional[TaskResult]): + if result is None: + return {} + res_json = { + "task_id": result.task_id, + "session_id": result.session_id, + "stage_id": result.stage_id, + "start_time": result.start_time, + "end_time": result.end_time, + "progress": result.progress, + "status": result.status.value, + "profiling": result.profiling, + } + if result.error is not None: + res_json["error"] = base64.b64encode( + serialize_serializable(result.error) + ).decode() + res_json["traceback"] = base64.b64encode( + serialize_serializable(result.traceback) + ).decode() + res_json["traceback_code"] = dump_traceback_code(result.traceback) + return res_json + + +def _json_deserial_task_result(d: dict) -> Optional[TaskResult]: + if not d: + return None + if "error" in d: + d["error"] = deserialize_serializable(base64.b64decode(d["error"])) + d["traceback"] = deserialize_serializable(base64.b64decode(d["traceback"])) + load_traceback_code(d.pop("traceback_code")) + d["status"] = TaskStatus(d["status"]) + return TaskResult(**d) + + +class TaskWebAPIHandler(MarsServiceWebAPIHandler): + _root_pattern = "/api/session/(?P[^/]+)/task" + + async def _get_oscar_task_api(self, session_id: str): + from .oscar import TaskAPI + + return await self._get_api_by_key(TaskAPI, session_id) + + @web_api("", method="post") + async def submit_tileable_graph(self, session_id: str): + body_args = ( + deserialize_serializable(self.request.body) if self.request.body else None + ) + + fuse_enabled = body_args.get("fuse") + + graph = body_args["graph"] + extra_config = body_args.get("extra_config", None) + if extra_config: + extra_config = deserialize_serializable(extra_config) + + oscar_api = await self._get_oscar_task_api(session_id) + task_id = await oscar_api.submit_tileable_graph( + graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + self.write(task_id) + + @web_api("", method="get", cache_blocking=True) + async def get_task_results(self, session_id: str): + progress = bool(int(self.get_argument("progress", "0"))) + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_task_results(progress=progress) + self.write(json.dumps({"tasks": [_json_serial_task_result(r) for r in res]})) + + @web_api( + "(?P[^/]+)", + method="get", + arg_filter={"action": "fetch_tileables"}, + cache_blocking=True, + ) + async def get_fetch_tileables(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_fetch_tileables(task_id) + self.write(serialize_serializable(res)) + + @web_api("(?P[^/]+)", method="get", cache_blocking=True) + async def get_task_result(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_task_result(task_id) + self.write(json.dumps(_json_serial_task_result(res))) + + @web_api( + "(?P[^/]+)/tileable_graph", + method="get", + arg_filter={"action": "get_tileable_graph_as_json"}, + cache_blocking=True, + ) + async def get_tileable_graph_as_json(self, session_id: str, task_id: str): + 
oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_tileable_graph_as_json(task_id) + self.write(json.dumps(res)) + + @web_api("(?P[^/]+)/tileable_detail", method="get", cache_blocking=True) + async def get_tileable_details(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_tileable_details(task_id) + self.write(json.dumps(res)) + + @web_api( + "(?P[^/]+)/(?P[^/]+)/subtask", + method="get", + cache_blocking=True, + ) + async def get_tileable_subtasks( + self, session_id: str, task_id: str, tileable_id: str + ): + with_input_output = self.get_argument("with_input_output", "false") == "true" + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_tileable_subtasks( + task_id, tileable_id, with_input_output + ) + self.write(json.dumps(res)) + + @web_api( + "(?P[^/]+)", + method="get", + arg_filter={"action": "progress"}, + cache_blocking=True, + ) + async def get_task_progress(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_task_progress(task_id) + self.write(str(res)) + + @web_api("", method="get", arg_filter={"action": "last_idle_time"}) + async def get_last_idle_time(self, session_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + res = await oscar_api.get_last_idle_time() + if res: + self.write(str(res)) + + @web_api("(?P[^/]+)", method="get", arg_filter={"action": "wait"}) + async def wait_task(self, session_id: str, task_id: str): + timeout = self.get_argument("timeout", None) or None + timeout = float(timeout) if timeout is not None else None + oscar_api = await self._get_oscar_task_api(session_id) + if timeout: + try: + res = await asyncio.wait_for( + asyncio.shield(oscar_api.wait_task(task_id, timeout)), + timeout=timeout, + ) + self.write(json.dumps(_json_serial_task_result(res))) + except asyncio.TimeoutError: + self.write(json.dumps({})) + else: + res = await oscar_api.wait_task(task_id, timeout) + self.write(json.dumps(_json_serial_task_result(res))) + + @web_api("(?P[^/]+)", method="delete") + async def cancel_task(self, session_id: str, task_id: str): + oscar_api = await self._get_oscar_task_api(session_id) + await oscar_api.cancel_task(task_id) + + +web_handlers = {TaskWebAPIHandler.get_root_pattern(): TaskWebAPIHandler} + + +class WebTaskAPI(AbstractTaskAPI, MarsWebAPIClientMixin): + def __init__( + self, session_id: str, address: str, request_rewriter: Callable = None + ): + self._session_id = session_id + self._address = address.rstrip("/") + self.request_rewriter = request_rewriter + + async def get_task_results(self, progress: bool = False) -> List[TaskResult]: + path = f"{self._address}/api/session/{self._session_id}/task" + params = {"progress": int(progress)} + res = await self._request_url("GET", path, params=params) + return [ + _json_deserial_task_result(d) + for d in json.loads(res.body.decode())["tasks"] + ] + + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = True, + extra_config: dict = None, + ) -> str: + path = f"{self._address}/api/session/{self._session_id}/task" + extra_config_ser = ( + serialize_serializable(extra_config) if extra_config else None + ) + body = serialize_serializable( + { + "fuse": fuse_enabled, + "graph": graph, + "extra_config": extra_config_ser, + } + ) + res = await self._request_url( + path=path, + method="POST", + headers={"Content-Type": "application/octet-stream"}, + 
data=body, + ) + return res.body.decode().strip() + + async def get_fetch_tileables(self, task_id: str) -> List[Tileable]: + path = ( + f"{self._address}/api/session/{self._session_id}/task/{task_id}" + f"?action=fetch_tileables" + ) + res = await self._request_url("GET", path) + return deserialize_serializable(res.body) + + async def get_task_result(self, task_id: str) -> TaskResult: + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + res = await self._request_url("GET", path) + return _json_deserial_task_result(json.loads(res.body.decode())) + + async def get_task_progress(self, task_id: str) -> float: + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + params = dict(action="progress") + res = await self._request_url("GET", path, params=params) + return float(res.body.decode()) + + async def get_last_idle_time(self) -> Union[float, None]: + path = f"{self._address}/api/session/{self._session_id}/task" + params = dict(action="last_idle_time") + res = await self._request_url("GET", path, params=params) + content = res.body.decode() + return float(content) if content else None + + async def wait_task(self, task_id: str, timeout: float = None): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + # increase client timeout to handle network overhead during entire request + client_timeout = timeout + 3 if timeout else 0 + params = {"action": "wait", "timeout": "" if timeout is None else str(timeout)} + res = await self._request_url( + "GET", path, params=params, request_timeout=client_timeout + ) + return _json_deserial_task_result(json.loads(res.body.decode())) + + async def cancel_task(self, task_id: str): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}" + await self._request_url(path=path, method="DELETE") + + async def get_tileable_graph_as_json(self, task_id: str): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}/tileable_graph" + params = dict(action="get_tileable_graph_as_json") + res = await self._request_url(path=path, params=params, method="GET") + return json.loads(res.body.decode()) + + async def get_tileable_details(self, task_id: str): + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}/tileable_detail" + res = await self._request_url(path=path, method="GET") + return json.loads(res.body.decode()) + + async def get_tileable_subtasks( + self, task_id: str, tileable_id: str, with_input_output: bool + ): + with_input_output = "true" if with_input_output else "false" + path = f"{self._address}/api/session/{self._session_id}/task/{task_id}/{tileable_id}/subtask" + params = { + "action": "fetch_graph", + "with_input_output": with_input_output, + } + res = await self._request_url(path=path, params=params, method="GET") + return json.loads(res.body.decode()) diff --git a/python/xorbits/_mars/services/task/config.py b/python/xorbits/_mars/services/task/config.py new file mode 100644 index 000000000..7fb76001f --- /dev/null +++ b/python/xorbits/_mars/services/task/config.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...config import Config, is_bool, is_integer, is_list + +task_options = Config() + +# supervisor +task_options.register_option("optimize_tileable_graph", True, validator=is_bool) +task_options.register_option("optimize_chunk_graph", True, validator=is_bool) +task_options.register_option("fuse_enabled", True, validator=is_bool) +task_options.register_option("reserved_finish_tasks", 25, validator=is_integer) + +# worker +task_options.register_option("runtime_engines", ["numexpr", "cupy"], validator=is_list) diff --git a/python/xorbits/_mars/services/task/core.py b/python/xorbits/_mars/services/task/core.py new file mode 100644 index 000000000..5c871128d --- /dev/null +++ b/python/xorbits/_mars/services/task/core.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from enum import Enum +from string import ascii_letters, digits +from typing import Any, Dict, List, Optional, Tuple + +from ...core import TileableGraph +from ...serialization.serializables import ( + AnyField, + BoolField, + DictField, + FieldTypes, + Float64Field, + Int32Field, + ListField, + ReferenceField, + Serializable, + StringField, +) +from ...typing import BandType + + +class TaskStatus(Enum): + pending = 0 + running = 1 + terminated = 2 + + +class Task(Serializable): + task_id: str = StringField("task_id") + session_id: str = StringField("session_id") + tileable_graph: TileableGraph = ReferenceField("tileable_graph", TileableGraph) + fuse_enabled: bool = BoolField("fuse_enabled") + extra_config: dict = DictField("extra_config") + + def __init__( + self, + task_id: str = None, + session_id: str = None, + tileable_graph: TileableGraph = None, + fuse_enabled: bool = True, + extra_config: dict = None, + ): + super().__init__( + task_id=task_id, + session_id=session_id, + tileable_graph=tileable_graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + + +class TaskResult(Serializable): + task_id: str = StringField("task_id") + session_id: str = StringField("session_id") + stage_id: str = StringField("stage_id") + start_time: Optional[float] = Float64Field("start_time") + end_time: Optional[float] = Float64Field("end_time") + progress: Optional[float] = Float64Field("progress") + status: TaskStatus = ReferenceField("status", TaskStatus) + error = AnyField("error") + traceback = AnyField("traceback") + profiling: Dict = DictField("profiling") + + def __init__( + self, + task_id: str = None, + session_id: str = None, + stage_id: str = None, + start_time: Optional[float] = None, + end_time: Optional[float] = None, + progress: 
Optional[float] = None, + status: TaskStatus = None, + error: Any = None, + traceback: Any = None, + profiling: Dict = None, + ): + super().__init__( + task_id=task_id, + session_id=session_id, + stage_id=stage_id, + start_time=start_time, + end_time=end_time, + progress=progress, + status=status, + error=error, + traceback=traceback, + profiling=profiling, + ) + + +def new_task_id(): + return "".join(random.choice(ascii_letters + digits) for _ in range(24)) + + +class MapReduceInfo(Serializable): + # record map reduce info during analyzing + # record reducer indexes, and assigned bands + map_reduce_id: int = Int32Field("map_reduce_id") + reducer_indexes: List[Tuple[int]] = ListField( + "reducer_indexes", FieldTypes.tuple(FieldTypes.int64), default_factory=list + ) + reducer_bands: List[BandType] = ListField( + "reducer_bands", + FieldTypes.tuple(FieldTypes.string, FieldTypes.string), + default_factory=list, + ) diff --git a/python/xorbits/_mars/services/task/errors.py b/python/xorbits/_mars/services/task/errors.py new file mode 100644 index 000000000..e431bf07d --- /dev/null +++ b/python/xorbits/_mars/services/task/errors.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.base import MarsError + + +class TaskNotExist(MarsError): + pass diff --git a/python/xorbits/_mars/services/task/execution/__init__.py b/python/xorbits/_mars/services/task/execution/__init__.py new file mode 100644 index 000000000..61f9a12fe --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .mars import * +from .ray import * diff --git a/python/xorbits/_mars/services/task/execution/api.py b/python/xorbits/_mars/services/task/execution/api.py new file mode 100644 index 000000000..718e0805a --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/api.py @@ -0,0 +1,256 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Type, Union + +from ....core import Chunk, ChunkGraph, TileContext +from ....core.operand.shuffle import ShuffleFetchType +from ....resource import Resource +from ....typing import BandType +from ....utils import merge_dict +from ...subtask import SubtaskGraph, SubtaskResult + + +class ExecutionConfig: + """ + The config for execution backends. + + This class should ONLY provide the APIs for the parts other than + just the execution. Each backend may have a different implementation + of the API. + + If some configuration is for a specific backend, it should be in + the backend config, e.g. `get_mars_special_config()` should be in + the `MarsExecutionConfig`. + """ + + name = None + + def __init__(self, config: Dict): + """ + An example of config: + { + "backend": "mars", + "mars": { + "n_worker": 1, + "n_cpu": 2, + ... + }, + } + """ + self._config = config + + def merge_from(self, execution_config: "ExecutionConfig") -> "ExecutionConfig": + assert isinstance(execution_config, ExecutionConfig) + assert self.backend == execution_config.backend + merge_dict( + self._config, + execution_config.get_config_dict(), + ) + return self + + @property + def backend(self) -> str: + """The backend from config.""" + return self._config["backend"] + + def get_config_dict(self) -> Dict: + """Get the execution config dict.""" + return self._config + + @abstractmethod + def get_deploy_band_resources(self) -> List[Dict[str, Resource]]: + """Get the band resources for deployment.""" + + @abstractmethod + def get_shuffle_fetch_type(self) -> ShuffleFetchType: + """Get shuffle fetch type for shuffle execution.""" + + @classmethod + def from_config(cls, config: Dict, backend: str = None) -> "ExecutionConfig": + """Construct an execution config instance from config.""" + execution_config = config["task"]["execution_config"] + return cls.from_execution_config(execution_config, backend) + + @classmethod + def from_execution_config( + cls, execution_config: Union[Dict, "ExecutionConfig"], backend: str = None + ) -> "ExecutionConfig": + """Construct an execution config instance from execution config.""" + if isinstance(execution_config, ExecutionConfig): + assert backend is None + return execution_config + if backend is not None: + name = execution_config["backend"] = backend + else: + name = execution_config.setdefault("backend", "mars") + config_cls = _name_to_config_cls[name] + execution_config.setdefault(name, {}) + return config_cls(execution_config) + + @classmethod + def from_params( + cls, + backend: str, + n_worker: int, + n_cpu: int, + mem_bytes: int = 0, + cuda_devices: List[List[int]] = None, + **kwargs, + ) -> "ExecutionConfig": + """Construct an execution config instance from params.""" + execution_config = { + "backend": backend, + backend: dict( + { + "n_worker": n_worker, + "n_cpu": n_cpu, + "mem_bytes": mem_bytes, + "cuda_devices": cuda_devices, + }, + **kwargs, + ), + } + return cls.from_execution_config(execution_config) + + +_name_to_config_cls: Dict[str, Type[ExecutionConfig]] = {} + + +def register_config_cls(config_cls: Type[ExecutionConfig]): + _name_to_config_cls[config_cls.name] = config_cls + return config_cls + + +@dataclass +class ExecutionChunkResult: + meta: Dict # The chunk meta for iterative tiling. + context: Any # The context info, e.g. ray.ObjectRef.
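Editor's note: a minimal usage sketch for the `ExecutionConfig` above, assuming only the classmethods shown in this file (`from_params`, `from_execution_config`, `get_config_dict`) and that a backend config class named "mars" has been registered via `register_config_cls`; it is illustrative and not part of the original change.

    # Build a config for the "mars" backend from plain parameters (sketch only).
    config = ExecutionConfig.from_params(backend="mars", n_worker=1, n_cpu=2)
    assert config.backend == "mars"
    # The underlying dict keeps the documented shape: {"backend": "mars", "mars": {...}}.
    raw = config.get_config_dict()
    # A dict of that shape can be turned back into a config instance.
    config2 = ExecutionConfig.from_execution_config(raw)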
+ + +class TaskExecutor(ABC): + name = None + + @classmethod + @abstractmethod + async def create( + cls, + config: Union[Dict, ExecutionConfig], + *, + session_id: str, + address: str, + task, + tile_context: TileContext, + **kwargs, + ) -> "TaskExecutor": + backend_config = ExecutionConfig.from_execution_config(config) + executor_cls = _name_to_task_executor_cls[backend_config.backend] + if executor_cls.create.__func__ is TaskExecutor.create.__func__: + raise NotImplementedError( + f"The {executor_cls} should implement the abstract classmethod `create`." + ) + return await executor_cls.create( + backend_config, + session_id=session_id, + address=address, + task=task, + tile_context=tile_context, + **kwargs, + ) + + @abstractmethod + def get_execution_config(self) -> ExecutionConfig: + """Return execution config.""" + + def destroy(self): + """Destroy the executor.""" + + async def __aenter__(self): + """Called when begin to execute the task.""" + + @abstractmethod + async def execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + tile_context: TileContext, + context: Any = None, + ) -> Dict[Chunk, ExecutionChunkResult]: + """Execute a subtask graph and returns result.""" + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Called when finish the task.""" + + @abstractmethod + async def get_available_band_resources(self) -> Dict[BandType, Resource]: + """Get available band resources.""" + + @abstractmethod + async def get_progress(self) -> float: + """Get the execution progress.""" + + @abstractmethod + async def cancel(self): + """Cancel execution.""" + + # The following APIs are for compatible with mars backend, they + # will be removed as soon as possible. + async def set_subtask_result(self, subtask_result: SubtaskResult): + """Set the subtask result.""" + + def get_stage_processors(self): + """Get stage processors.""" + + +_name_to_task_executor_cls: Dict[str, Type[TaskExecutor]] = {} + + +def register_executor_cls(executor_cls: Type[TaskExecutor]): + _name_to_task_executor_cls[executor_cls.name] = executor_cls + return executor_cls + + +class Fetcher: + """The data fetcher for execution backends.""" + + name = None + required_meta_keys = () # The required meta keys. + + @abstractmethod + def __init__(self, **kwargs): + pass + + @abstractmethod + async def append(self, chunk_key: str, chunk_meta: Dict, conditions: List = None): + """Append chunk key and related infos.""" + + @abstractmethod + async def get(self): + """Get all the data of appended chunk keys.""" + + @classmethod + def create(cls, backend: str, **kwargs) -> "Fetcher": + fetcher_cls = _name_to_fetcher_cls[backend] + return fetcher_cls(**kwargs) + + +_name_to_fetcher_cls: Dict[str, Type[Fetcher]] = {} + + +def register_fetcher_cls(fetcher_cls: Type[Fetcher]): + _name_to_fetcher_cls[fetcher_cls.name] = fetcher_cls + return fetcher_cls diff --git a/python/xorbits/_mars/services/task/execution/mars/__init__.py b/python/xorbits/_mars/services/task/execution/mars/__init__.py new file mode 100644 index 000000000..b0df63861 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import MarsExecutionConfig +from .executor import MarsTaskExecutor +from .fetcher import MarsFetcher diff --git a/python/xorbits/_mars/services/task/execution/mars/config.py b/python/xorbits/_mars/services/task/execution/mars/config.py new file mode 100644 index 000000000..962b7d898 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/config.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +from .....core.operand.shuffle import ShuffleFetchType +from .....resource import Resource +from ..api import ExecutionConfig, register_config_cls +from ..utils import get_band_resources_from_config + + +@register_config_cls +class MarsExecutionConfig(ExecutionConfig): + name = "mars" + + def __init__(self, execution_config: Dict): + super().__init__(execution_config) + self._mars_execution_config = execution_config[self.backend] + + def get_deploy_band_resources(self) -> List[Dict[str, Resource]]: + return get_band_resources_from_config(self._mars_execution_config) + + def get_shuffle_fetch_type(self) -> ShuffleFetchType: + return ShuffleFetchType.FETCH_BY_KEY diff --git a/python/xorbits/_mars/services/task/execution/mars/executor.py b/python/xorbits/_mars/services/task/execution/mars/executor.py new file mode 100644 index 000000000..fb5b4d198 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/executor.py @@ -0,0 +1,461 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +import sys +from collections import defaultdict +from typing import Dict, List, Optional, Set + +from ..... 
import oscar as mo +from .....core import ChunkGraph, TileContext +from .....core.operand import Fetch, MapReduceOperand, OperandStage, ShuffleProxy +from .....lib.aio import alru_cache +from .....oscar.profiling import ProfilingData +from .....resource import Resource +from .....typing import BandType, TileableType +from .....utils import Timer +from ....cluster.api import ClusterAPI +from ....context import ThreadedServiceContext +from ....lifecycle.api import LifecycleAPI +from ....meta.api import MetaAPI +from ....scheduling import SchedulingAPI +from ....subtask import Subtask, SubtaskGraph, SubtaskResult, SubtaskStatus +from ...core import Task +from ..api import TaskExecutor, register_executor_cls +from ..utils import ResultTileablesLifecycle +from .config import MarsExecutionConfig +from .resource import ResourceEvaluator +from .stage import TaskStageProcessor + +logger = logging.getLogger(__name__) + + +def _get_n_reducers(subtask: Subtask) -> int: + return len( + [ + r + for r in subtask.chunk_graph + if isinstance(r.op, MapReduceOperand) and r.op.stage == OperandStage.reduce + ] + ) + + +@register_executor_cls +class MarsTaskExecutor(TaskExecutor): + name = "mars" + _stage_processors: List[TaskStageProcessor] + _stage_tile_progresses: List[float] + _cur_stage_processor: Optional[TaskStageProcessor] + _meta_updated_tileables: Set[TileableType] + _ctx: ThreadedServiceContext + + def __init__( + self, + config: MarsExecutionConfig, + task: Task, + tile_context: TileContext, + cluster_api: ClusterAPI, + lifecycle_api: LifecycleAPI, + scheduling_api: SchedulingAPI, + meta_api: MetaAPI, + resource_evaluator: ResourceEvaluator, + ctx: ThreadedServiceContext, + ): + self._config = config + self._task = task + self._tileable_graph = task.tileable_graph + self._raw_tile_context = tile_context.copy() + self._tile_context = tile_context + self._session_id = task.session_id + + # api + self._cluster_api = cluster_api + self._lifecycle_api = lifecycle_api + self._scheduling_api = scheduling_api + self._meta_api = meta_api + + self._stage_processors = [] + self._stage_tile_progresses = [] + self._cur_stage_processor = None + self._result_tileables_lifecycle = None + self._subtask_decref_events = dict() + self._meta_updated_tileables = set() + + # Evaluate and initialize subtasks required resource. 
+ self._resource_evaluator = resource_evaluator + + # context + self._ctx = ctx + + @classmethod + async def create( + cls, + config: MarsExecutionConfig, + *, + session_id: str, + address: str, + task: Task, + tile_context: TileContext, + **kwargs, + ) -> "MarsTaskExecutor": + assert ( + len(kwargs) == 0 + ), f"Unexpected kwargs for {cls.__name__}.create: {kwargs}" + cluster_api, lifecycle_api, scheduling_api, meta_api = await cls._get_apis( + session_id, address + ) + resource_evaluator = await ResourceEvaluator.create( + config.get_config_dict(), + session_id=task.session_id, + task_id=task.task_id, + cluster_api=cluster_api, + ) + ctx = await cls._init_context(session_id, address) + return cls( + config, + task, + tile_context, + cluster_api, + lifecycle_api, + scheduling_api, + meta_api, + resource_evaluator, + ctx, + ) + + def get_execution_config(self): + return self._config + + @classmethod + @alru_cache(cache_exceptions=False) + async def _get_apis(cls, session_id: str, address: str): + return await asyncio.gather( + ClusterAPI.create(address), + LifecycleAPI.create(session_id, address), + SchedulingAPI.create(session_id, address), + MetaAPI.create(session_id, address), + ) + + @classmethod + async def _init_context( + cls, session_id: str, address: str + ) -> ThreadedServiceContext: + loop = asyncio.get_running_loop() + context = ThreadedServiceContext( + session_id, address, address, address, loop=loop + ) + await context.init() + return context + + async def __aenter__(self): + profiling = ProfilingData[self._task.task_id, "general"] + # incref fetch tileables to ensure fetch data not deleted + with Timer() as timer: + await self._incref_fetch_tileables() + profiling.set("incref_fetch_tileables", timer.duration) + self._result_tileables_lifecycle = ResultTileablesLifecycle( + self._tileable_graph, self._tile_context, self._lifecycle_api + ) + self._ctx.__enter__() + + async def execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + tile_context: TileContext, + context=None, + ): + available_bands = await self.get_available_band_resources() + await self._result_tileables_lifecycle.incref_tiled() + stage_processor = TaskStageProcessor( + stage_id, + self._task, + chunk_graph, + subtask_graph, + list(available_bands), + tile_context, + self._scheduling_api, + self._meta_api, + ) + await self._incref_stage(stage_processor) + await self._resource_evaluator.evaluate(stage_processor) + self._stage_processors.append(stage_processor) + self._cur_stage_processor = stage_processor + # get the tiled progress for current stage + prev_progress = sum(self._stage_tile_progresses) + curr_tile_progress = self._tile_context.get_all_progress() - prev_progress + self._stage_tile_progresses.append(curr_tile_progress) + return await stage_processor.run() + + async def __aexit__(self, exc_type, exc_val, exc_tb): + # clean-ups + decrefs = [] + error_or_cancelled = False + for stage_processor in self._stage_processors: + if stage_processor.error_or_cancelled(): + error_or_cancelled = True + decrefs.append(self._decref_stage.delay(stage_processor)) + await self._decref_stage.batch(*decrefs) + # revert fetch incref + await self._decref_fetch_tileables() + if error_or_cancelled: + # revert result incref if error or cancelled + await self._result_tileables_lifecycle.decref_tracked() + await self._resource_evaluator.report() + self._ctx.__exit__(exc_type, exc_val, exc_tb) + + async def get_available_band_resources(self) -> Dict[BandType, Resource]: + async 
for bands in self._cluster_api.watch_all_bands(): + if bands: + return bands + + async def get_progress(self) -> float: + # get progress of stages + executor_progress = 0.0 + assert len(self._stage_tile_progresses) == len(self._stage_processors) + for stage_processor, stage_tile_progress in zip( + self._stage_processors, self._stage_tile_progresses + ): + if stage_processor.subtask_graph is None: # pragma: no cover + # generating subtask + continue + n_subtask = len(stage_processor.subtask_graph) + if n_subtask == 0: # pragma: no cover + continue + progress = sum( + result.progress for result in stage_processor.subtask_results.values() + ) + progress += sum( + result.progress + for subtask_key, result in stage_processor.subtask_snapshots.items() + if subtask_key not in stage_processor.subtask_results + ) + subtask_progress = progress / n_subtask + executor_progress += subtask_progress * stage_tile_progress + return executor_progress + + async def cancel(self): + if self._cur_stage_processor is not None: + await self._cur_stage_processor.cancel() + + async def set_subtask_result(self, subtask_result: SubtaskResult): + if self._cur_stage_processor is None or ( + subtask_result.stage_id + and self._cur_stage_processor.stage_id != subtask_result.stage_id + ): + logger.warning( + "Stage %s for subtask %s does not exist, got stale subtask result %s which may be " + "speculative execution from previous stages, just ignore it.", + subtask_result.stage_id, + subtask_result.subtask_id, + subtask_result, + ) + return + stage_processor = self._cur_stage_processor + subtask = stage_processor.subtask_id_to_subtask[subtask_result.subtask_id] + + prev_result = stage_processor.subtask_results.get(subtask) + if prev_result and ( + prev_result.status == SubtaskStatus.succeeded + or prev_result.progress > subtask_result.progress + ): + logger.info( + "Skip set subtask %s with result %s, previous result is %s.", + subtask.subtask_id, + subtask_result, + prev_result, + ) + # For duplicate runs of subtasks, if the progress is smaller or the subtask has finished or been canceled + # in task speculation, just do nothing. + # TODO(chaokunyang) If a duplicate run of a subtask fails, it may be a fault in the worker node; + # print the exception, and if there are multiple failures on the same node, remove the node from the cluster. + return + if subtask_result.bands: + [band] = subtask_result.bands + else: + band = None + stage_processor.subtask_snapshots[subtask] = subtask_result.update( + stage_processor.subtask_snapshots.get(subtask) + ) + if subtask_result.status.is_done: + # update stage_processor.subtask_results to avoid concurrent set_subtask_result + # since we release lock when `_decref_input_subtasks`. + stage_processor.subtask_results[subtask] = subtask_result.update( + stage_processor.subtask_results.get(subtask) + ) + try: + # Since every worker will call supervisor to set subtask result, + # we need to release actor lock to make `decref_chunks` parallel to avoid blocking + # other `set_subtask_result` calls. + # If speculative execution is enabled, concurrent subtasks may raise errors since input chunks may + # have been deleted. But it's OK because the current subtask run has succeeded.
+ if subtask.subtask_id not in stage_processor.decref_subtask: + stage_processor.decref_subtask.add(subtask.subtask_id) + await self._decref_input_subtasks( + subtask, stage_processor.subtask_graph + ) + + except: # noqa: E722 # nosec # pylint: disable=bare-except # pragma: no cover + logger.debug( + "Decref input subtasks for subtask %s failed.", subtask.subtask_id + ) + _, err, tb = sys.exc_info() + if subtask_result.status not in ( + SubtaskStatus.errored, + SubtaskStatus.cancelled, + ): + subtask_result.status = SubtaskStatus.errored + subtask_result.error = err + subtask_result.traceback = tb + await stage_processor.set_subtask_result(subtask_result, band=band) + + def get_stage_processors(self): + return self._stage_processors + + async def _incref_fetch_tileables(self): + # incref fetch tileables in tileable graph to prevent them from deleting + to_incref_tileable_keys = [ + tileable.op.source_key + for tileable in self._tileable_graph + if isinstance(tileable.op, Fetch) and tileable in self._raw_tile_context + ] + await self._lifecycle_api.incref_tileables(to_incref_tileable_keys) + + async def _decref_fetch_tileables(self): + fetch_tileable_keys = [ + tileable.op.source_key + for tileable in self._tileable_graph + if isinstance(tileable.op, Fetch) and tileable in self._raw_tile_context + ] + await self._lifecycle_api.decref_tileables(fetch_tileable_keys) + + async def _incref_stage(self, stage_processor: "TaskStageProcessor"): + subtask_graph = stage_processor.subtask_graph + incref_chunk_key_to_counts = defaultdict(lambda: 0) + for subtask in subtask_graph: + # for subtask has successors, incref number of successors + n = subtask_graph.count_successors(subtask) + for c in subtask.chunk_graph.results: + incref_chunk_key_to_counts[c.key] += n + # process reducer, incref mapper chunks + for pre_graph in subtask_graph.iter_predecessors(subtask): + for chk in pre_graph.chunk_graph.results: + if isinstance(chk.op, ShuffleProxy): + n_reducers = _get_n_reducers(subtask) + for map_chunk in chk.inputs: + incref_chunk_key_to_counts[map_chunk.key] += n_reducers + result_chunks = stage_processor.chunk_graph.result_chunks + for c in result_chunks: + incref_chunk_key_to_counts[c.key] += 1 + logger.debug( + "Incref chunks for stage %s: %s", + stage_processor.stage_id, + incref_chunk_key_to_counts, + ) + await self._lifecycle_api.incref_chunks( + list(incref_chunk_key_to_counts), + counts=list(incref_chunk_key_to_counts.values()), + ) + + @classmethod + def _get_decref_stage_chunk_key_to_counts( + cls, stage_processor: "TaskStageProcessor" + ) -> Dict[str, int]: + decref_chunk_key_to_counts = defaultdict(lambda: 0) + error_or_cancelled = stage_processor.error_or_cancelled() + if stage_processor.subtask_graph: + subtask_graph = stage_processor.subtask_graph + if error_or_cancelled: + # error or cancel, rollback incref for subtask results + for subtask in subtask_graph: + if subtask.subtask_id in stage_processor.decref_subtask: + continue + stage_processor.decref_subtask.add(subtask.subtask_id) + # if subtask not executed, rollback incref of predecessors + for inp_subtask in subtask_graph.predecessors(subtask): + for c in inp_subtask.chunk_graph.results: + decref_chunk_key_to_counts[c.key] += 1 + # decref result of chunk graphs + for c in stage_processor.chunk_graph.results: + decref_chunk_key_to_counts[c.key] += 1 + return decref_chunk_key_to_counts + + @mo.extensible + async def _decref_stage(self, stage_processor: "TaskStageProcessor"): + decref_chunk_key_to_counts = 
self._get_decref_stage_chunk_key_to_counts( stage_processor ) + logger.debug( + "Decref chunks when stage %s finishes: %s", + stage_processor.stage_id, + decref_chunk_key_to_counts, + ) + await self._lifecycle_api.decref_chunks( + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + + @_decref_stage.batch + async def _decref_stage(self, args_list, kwargs_list): + decref_chunk_key_to_counts = defaultdict(lambda: 0) + for args, kwargs in zip(args_list, kwargs_list): + chunk_key_to_counts = self._get_decref_stage_chunk_key_to_counts( + *args, **kwargs + ) + for k, c in chunk_key_to_counts.items(): + decref_chunk_key_to_counts[k] += c + logger.debug("Decref chunks when stages finish: %s", decref_chunk_key_to_counts) + await self._lifecycle_api.decref_chunks( + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + + async def _decref_input_subtasks( + self, subtask: Subtask, subtask_graph: SubtaskGraph + ): + # make sure subtasks are decreffed only once + if subtask.subtask_id not in self._subtask_decref_events: + self._subtask_decref_events[subtask.subtask_id] = asyncio.Event() + else: # pragma: no cover + await self._subtask_decref_events[subtask.subtask_id].wait() + return + + decref_chunk_key_to_counts = defaultdict(lambda: 0) + for in_subtask in subtask_graph.iter_predecessors(subtask): + for result_chunk in in_subtask.chunk_graph.results: + # for a reducer chunk, decref the mapper chunks + if isinstance(result_chunk.op, ShuffleProxy): + n_reducers = _get_n_reducers(subtask) + for inp in result_chunk.inputs: + decref_chunk_key_to_counts[inp.key] += n_reducers + decref_chunk_key_to_counts[result_chunk.key] += 1 + logger.debug( + "Decref chunks %s when subtask %s finishes", + decref_chunk_key_to_counts, + subtask.subtask_id, + ) + await self._lifecycle_api.decref_chunks( + list(decref_chunk_key_to_counts), + counts=list(decref_chunk_key_to_counts.values()), + ) + + # `set_subtask_result` will be called when a subtask finishes, + # but progress reporting calls `set_subtask_result` too, + # so there is a risk of decrementing some subtask input object references twice, + # which would drive their reference counts below zero. + # TODO(Catch-Bull): Pop asyncio.Event when current subtask `set_subtask_result` + # will never be called + self._subtask_decref_events[subtask.subtask_id].set() diff --git a/python/xorbits/_mars/services/task/execution/mars/fetcher.py b/python/xorbits/_mars/services/task/execution/mars/fetcher.py new file mode 100644 index 000000000..957ea2558 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/fetcher.py @@ -0,0 +1,56 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
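Editor's note: a brief sketch of how a fetcher such as the `MarsFetcher` below is typically driven, based on the `Fetcher` interface in execution/api.py (`create`, `append`, `get`); `get_storage_api` and `chunk_key_to_meta` are hypothetical names used only for illustration, and the calls would run inside an async function.

    # Create the backend-specific fetcher, then batch up delayed gets and fetch them together.
    fetcher = Fetcher.create("mars", get_storage_api=get_storage_api)
    for chunk_key, chunk_meta in chunk_key_to_meta.items():
        # for the mars backend the chunk meta must carry "bands" (see required_meta_keys)
        await fetcher.append(chunk_key, chunk_meta)
    data = await fetcher.get()  # results come back in append order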
+ +import operator +from collections import defaultdict, namedtuple +from typing import Dict, List + +from ..api import Fetcher, register_fetcher_cls + +_GetWithIndex = namedtuple("GetWithIndex", ["get", "index"]) + + +@register_fetcher_cls +class MarsFetcher(Fetcher): + name = "mars" + required_meta_keys = ("bands",) + + def __init__(self, get_storage_api, **kwargs): + self._get_storage_api = get_storage_api + self._storage_api_to_gets = defaultdict(list) + self._counter = 0 + + async def append(self, chunk_key: str, chunk_meta: Dict, conditions: List = None): + band = None + if chunk_meta: + bands = chunk_meta.get("bands") + if bands: + band = bands[0] + storage_api = await self._get_storage_api(band) + get = _GetWithIndex( + storage_api.get.delay(chunk_key, conditions=conditions), self._counter + ) + self._storage_api_to_gets[storage_api].append(get) + self._counter += 1 + + async def get(self): + results = [None] * self._counter + for storage_api in self._storage_api_to_gets: + gets = self._storage_api_to_gets[storage_api] + fetched_data = await storage_api.get.batch( + *map(operator.itemgetter(0), gets) + ) + for get, data in zip(gets, fetched_data): + results[get.index] = data + return results diff --git a/python/xorbits/_mars/services/task/execution/mars/resource.py b/python/xorbits/_mars/services/task/execution/mars/resource.py new file mode 100644 index 000000000..e3e85c089 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/resource.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Any, Dict, Type + +from .....resource import Resource + +_name_to_resource_evaluator: Dict[str, Type["ResourceEvaluator"]] = {} + + +def register_resource_evaluator(evaluator_cls: Type["ResourceEvaluator"]): + _name_to_resource_evaluator[evaluator_cls.name] = evaluator_cls + return evaluator_cls + + +def init_default_resource_for_subtask(subtask_graph: "SubtaskGraph"): # noqa: F821 + for subtask in subtask_graph.iter_nodes(): + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + subtask.required_resource = ( + Resource(num_gpus=1) if is_gpu else Resource(num_cpus=1) + ) + + +class ResourceEvaluator(ABC): + """ + Resource evaluator is used to estimate and set resources required by + subtasks. It can be an internal service or an external service. If it + is an internal service, we can set default of adjustable resources for + subtasks. If it is an external service, we should report the running + result of the task to the external service, so that it can accurately + predict the required resources of subtasks based on the historical + running information, we call it HBO. + + Best practice + ---------- + You can follow the steps below to add a new resource evaluator: + * Inherit `ResourceEvaluator` and implement `create`, `evaluate` + and `report` methods. The `create` method is to create a new + resource evaluator instance. 
The `evaluate` method is to estimate + and set required resources for the subtasks of a task stage. And + this method must be implemented. The `report` method is to report + the running information and result of the task. And this method + does not have to be implemented. + + * Add default configs of the new evaluator needed in `base_config.xml` + or its descendant files. + + * Set the `resource_evaluator` to choose a resource evaluator in + `base_config.xml` when running a mars job. + """ + + name = None + + @classmethod + @abstractmethod + async def create(cls, config: Dict[str, Any], **kwargs) -> "ResourceEvaluator": + name = config.get("resource_evaluator", "default") + evaluator_config = config.get(name, {}) + evaluator_cls = _name_to_resource_evaluator[name] + return await evaluator_cls.create(evaluator_config, **kwargs) + + @abstractmethod + async def evaluate(self, stage_processor: "TaskStageProcessor"): # noqa: F821 + """Called before executing a task stage.""" + + @abstractmethod + async def report(self): + """Called after executing a task.""" + + +@register_resource_evaluator +class DefaultEvaluator(ResourceEvaluator): + name = "default" + + @classmethod + async def create(cls, config, **kwargs) -> "ResourceEvaluator": + return cls() + + async def evaluate(self, stage_processor: "TaskStageProcessor"): # noqa: F821 + init_default_resource_for_subtask(stage_processor.subtask_graph) + + async def report(self): + pass diff --git a/python/xorbits/_mars/services/task/execution/mars/stage.py b/python/xorbits/_mars/services/task/execution/mars/stage.py new file mode 100644 index 000000000..afa2ec9f9 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/stage.py @@ -0,0 +1,351 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import itertools +import logging +import time +from collections import defaultdict +from typing import Dict, List + +from ..... 
import oscar as mo +from .....core import Chunk, ChunkGraph +from .....core.operand import Fetch, Fuse +from .....metrics import Metrics +from .....typing import BandType, TileableType +from .....utils import get_chunk_params +from ....meta import MetaAPI, WorkerMetaAPI +from ....scheduling import SchedulingAPI +from ....subtask import Subtask, SubtaskGraph, SubtaskResult, SubtaskStatus +from ....task.core import Task, TaskResult, TaskStatus +from ..api import ExecutionChunkResult + +logger = logging.getLogger(__name__) + + +class TaskStageProcessor: + def __init__( + self, + stage_id: str, + task: Task, + chunk_graph: ChunkGraph, + subtask_graph: SubtaskGraph, + bands: List[BandType], + tile_context: Dict[TileableType, TileableType], + scheduling_api: SchedulingAPI, + meta_api: MetaAPI, + ): + self.stage_id = stage_id + self.task = task + self.chunk_graph = chunk_graph + self.subtask_graph = subtask_graph + self._bands = bands + self._tile_context = tile_context + + # APIs + self._scheduling_api = scheduling_api + self._meta_api = meta_api + + # gen subtask_id to subtask + self.subtask_id_to_subtask = { + subtask.subtask_id: subtask for subtask in subtask_graph + } + self._subtask_to_bands: Dict[Subtask, BandType] = dict() + self.subtask_snapshots: Dict[Subtask, SubtaskResult] = dict() + self.subtask_results: Dict[Subtask, SubtaskResult] = dict() + self._submitted_subtask_ids = set() + + # All subtask IDs whose input chunk reference count is reduced. + self.decref_subtask = set() + + self._band_manager: Dict[BandType, mo.ActorRef] = dict() + + # result + self.result = TaskResult( + task.task_id, + task.session_id, + self.stage_id, + status=TaskStatus.pending, + start_time=time.time(), + ) + # status + self._done = asyncio.Event() + self._cancelled = asyncio.Event() + + # add metrics + self._stage_execution_time = Metrics.gauge( + "mars.stage_execution_time_secs", + "Time consuming in seconds to execute a stage", + ("session_id", "task_id", "stage_id"), + ) + + def is_cancelled(self): + return self._cancelled.is_set() + + async def _schedule_subtasks(self, subtasks: List[Subtask]): + subtasks = [ + subtask + for subtask in subtasks + if subtask.subtask_id not in self._submitted_subtask_ids + ] + if not subtasks: + return + self._submitted_subtask_ids.update(subtask.subtask_id for subtask in subtasks) + return await self._scheduling_api.add_subtasks( + subtasks, [subtask.priority for subtask in subtasks] + ) + + async def _get_stage_result(self): + chunks = [] + get_meta = [] + results_chunks = self.chunk_graph.result_chunks + for chunk in results_chunks: + if isinstance(chunk.op, Fetch): + continue + chunks.append(chunk) + if isinstance(chunk.op, Fuse): + chunk = chunk.chunk + get_meta.append( + self._meta_api.get_chunk_meta.delay( + chunk.key, + # only fetch bands from supervisor meta + fields=["bands"], + ) + ) + metas = await self._meta_api.get_chunk_meta.batch(*get_meta) + execution_chunk_results = { + chunk: ExecutionChunkResult(meta=meta, context=None) + for chunk, meta in zip(chunks, metas) + } + await self._update_result_meta(execution_chunk_results) + return execution_chunk_results + + def _schedule_done(self): + self._done.set() + + async def set_subtask_result(self, result: SubtaskResult, band: BandType = None): + assert result.status.is_done + subtask = self.subtask_id_to_subtask[result.subtask_id] + # update subtask_results in `TaskProcessorActor.set_subtask_result` + self._submitted_subtask_ids.difference_update([result.subtask_id]) + + all_done = len(self.subtask_results) == 
len(self.subtask_graph) + error_or_cancelled = result.status in ( + SubtaskStatus.errored, + SubtaskStatus.cancelled, + ) + + if all_done or error_or_cancelled: + # tell scheduling to finish subtasks + await self._scheduling_api.finish_subtasks( + [result.subtask_id], bands=[band], schedule_next=not error_or_cancelled + ) + if self.result.status != TaskStatus.terminated: + self.result = TaskResult( + self.task.task_id, + self.task.session_id, + self.stage_id, + start_time=self.result.start_time, + end_time=time.time(), + status=TaskStatus.terminated, + error=result.error, + traceback=result.traceback, + ) + if not all_done and error_or_cancelled: + if result.status == SubtaskStatus.errored: + logger.exception( + "Subtask %s errored", + subtask.subtask_id, + exc_info=( + type(result.error), + result.error, + result.traceback, + ), + ) + if result.status == SubtaskStatus.cancelled: # pragma: no cover + logger.warning( + "Subtask %s from band %s canceled.", + subtask.subtask_id, + band, + ) + logger.info( + "Start to cancel stage %s of task %s.", self.stage_id, self.task + ) + # if error or cancel, cancel all submitted subtasks + await self._scheduling_api.cancel_subtasks( + list(self._submitted_subtask_ids) + ) + self._schedule_done() + cost_time_secs = self.result.end_time - self.result.start_time + logger.info( + "Time consuming to execute a stage is %ss with " + "session id %s, task id %s, stage id %s", + cost_time_secs, + self.result.session_id, + self.result.task_id, + self.result.stage_id, + ) + self._stage_execution_time.record( + cost_time_secs, + { + "session_id": self.result.session_id, + "task_id": self.result.task_id, + "stage_id": self.result.stage_id, + }, + ) + else: + # not terminated, push success subtasks to queue if they are ready + to_schedule_subtasks = [] + for succ_subtask in self.subtask_graph.successors(subtask): + if succ_subtask in self.subtask_results: # pragma: no cover + continue + pred_subtasks = self.subtask_graph.predecessors(succ_subtask) + if all( + pred_subtask in self.subtask_results + for pred_subtask in pred_subtasks + ): + # all predecessors finished + to_schedule_subtasks.append(succ_subtask) + await self._schedule_subtasks(to_schedule_subtasks) + await self._scheduling_api.finish_subtasks( + [result.subtask_id], bands=[band] + ) + + async def run(self): + try: + if self.subtask_graph.num_shuffles() > 0: + # disable scale-in when shuffle is executing so that we can skip + # store shuffle meta in supervisor. 
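Editor's aside: when a subtask finishes without error, `set_subtask_result` above only submits those successors whose predecessors have all produced results. A tiny sketch of that readiness check on a hypothetical three-node graph (not part of the patch):

```python
from collections import defaultdict

# a -> c, b -> c: "c" may only be scheduled after both "a" and "b" finish
successors = {"a": ["c"], "b": ["c"], "c": []}
predecessors = defaultdict(list)
for node, succs in successors.items():
    for succ in succs:
        predecessors[succ].append(node)

finished = set()


def on_subtask_finished(node):
    finished.add(node)
    # schedule a successor only when every one of its predecessors has finished
    return [
        succ
        for succ in successors[node]
        if all(pred in finished for pred in predecessors[succ])
    ]


assert on_subtask_finished("a") == []     # "c" still waits for "b"
assert on_subtask_finished("b") == ["c"]  # now all inputs of "c" are ready
```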
+ await self._scheduling_api.disable_autoscale_in() + return await self._run() + finally: + if self.subtask_graph.num_shuffles() > 0: + await self._scheduling_api.try_enable_autoscale_in() + + async def _run(self): + if len(self.subtask_graph) == 0: + # no subtask to schedule, set status to done + self._schedule_done() + self.result.status = TaskStatus.terminated + return {} + + # schedule independent subtasks + indep_subtasks = list(self.subtask_graph.iter_indep()) + await self._schedule_subtasks(indep_subtasks) + + # wait for completion + await self._done.wait() + if self.error_or_cancelled(): + if self.result.error is not None: + raise self.result.error.with_traceback(self.result.traceback) + else: + raise asyncio.CancelledError() + return await self._get_stage_result() + + async def cancel(self): + logger.info("Start to cancel stage %s of task %s.", self.stage_id, self.task) + if self._done.is_set(): # pragma: no cover + # already finished, ignore cancel + return + self._cancelled.set() + # cancel running subtasks + await self._scheduling_api.cancel_subtasks(list(self._submitted_subtask_ids)) + self._done.set() + + def error_or_cancelled(self) -> bool: + if self.result.error is not None: + return True + if self.is_cancelled(): + return True + return False + + async def _update_result_meta( + self, chunk_to_result: Dict[Chunk, ExecutionChunkResult] + ): + session_id = self.task.session_id + tile_context = self._tile_context + + update_meta_chunks = chunk_to_result.keys() - set( + itertools.chain.from_iterable( + (c.data for c in tiled_tileable.chunks) + for tiled_tileable in tile_context.values() + ) + ) + + worker_meta_api_to_chunk_delays = defaultdict(dict) + for c in update_meta_chunks: + address = chunk_to_result[c].meta["bands"][0][0] + meta_api = await WorkerMetaAPI.create(session_id, address) + call = meta_api.get_chunk_meta.delay( + c.key, fields=list(get_chunk_params(c).keys()) + ) + worker_meta_api_to_chunk_delays[meta_api][c] = call + for tileable in tile_context.values(): + chunks = [c.data for c in tileable.chunks] + for c, params_fields in zip(chunks, self._get_params_fields(tileable)): + address = chunk_to_result[c].meta["bands"][0][0] + meta_api = await WorkerMetaAPI.create(session_id, address) + call = meta_api.get_chunk_meta.delay(c.key, fields=params_fields) + worker_meta_api_to_chunk_delays[meta_api][c] = call + coros = [] + for worker_meta_api, chunk_delays in worker_meta_api_to_chunk_delays.items(): + coros.append(worker_meta_api.get_chunk_meta.batch(*chunk_delays.values())) + worker_metas = await asyncio.gather(*coros) + for chunk_delays, metas in zip( + worker_meta_api_to_chunk_delays.values(), worker_metas + ): + for c, meta in zip(chunk_delays, metas): + chunk_to_result[c].meta = meta + + @classmethod + def _get_params_fields(cls, tileable: TileableType): + from .....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE + from .....tensor.core import TENSOR_TYPE + + params_fields = [] + fields = get_chunk_params(tileable.chunks[0]) + if isinstance(tileable, DATAFRAME_TYPE): + for c in tileable.chunks: + cur_fields = set(fields) + if c.index[1] > 0: + # skip fetch index_value for i >= 1 on column axis + cur_fields.remove("index_value") + if c.index[0] > 0: + # skip fetch dtypes_value for i >= 1 on index axis + cur_fields.remove("dtypes_value") + if c.index[0] > 0 and c.index[1] > 0: + # fetch shape only for i == 0 on index or column axis + cur_fields.remove("shape") + params_fields.append(list(cur_fields)) + elif isinstance(tileable, SERIES_TYPE): + for c in 
tileable.chunks: + cur_fields = set(fields) + if c.index[0] > 0: + # skip fetch name and dtype for i >= 1 + cur_fields.remove("name") + cur_fields.remove("dtype") + params_fields.append(list(cur_fields)) + elif isinstance(tileable, TENSOR_TYPE): + for i, c in enumerate(tileable.chunks): + cur_fields = set(fields) + if c.ndim > 1 and all(j > 0 for j in c.index): + cur_fields.remove("shape") + if i > 0: + cur_fields.remove("dtype") + cur_fields.remove("order") + params_fields.append(list(cur_fields)) + else: + for _ in tileable.chunks: + params_fields.append(list(fields)) + return params_fields diff --git a/python/xorbits/_mars/services/task/execution/mars/tests/__init__.py b/python/xorbits/_mars/services/task/execution/mars/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/execution/mars/tests/test_resource.py b/python/xorbits/_mars/services/task/execution/mars/tests/test_resource.py new file mode 100644 index 000000000..9c893ffaf --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/mars/tests/test_resource.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +import numpy as np +import pytest + +from ...... import dataframe as md +from ...... import tensor as mt +from ......config import Config +from ......core import ChunkGraphBuilder, Tileable, TileableGraph +from ......resource import Resource +from .... 
import Task +from ....analyzer import GraphAnalyzer +from ..resource import DefaultEvaluator, ResourceEvaluator, register_resource_evaluator +from ..stage import TaskStageProcessor + + +@register_resource_evaluator +class MockedEvaluator(ResourceEvaluator): + name = "mock" + + def __init__(self, config, **kwargs): + self._config = config + + @classmethod + async def create(cls, config: Dict[str, Any], **kwargs) -> "ResourceEvaluator": + return cls(config, **kwargs) + + async def evaluate(self, stage_processor: "TaskStageProcessor"): + pass + + async def report(self): + pass + + +def _build_chunk_graph(tileable_graph: TileableGraph): + return next(ChunkGraphBuilder(tileable_graph).build()) + + +async def _gen_stage_processor(t): + tileable_graph = t.build_graph(tile=False) + chunk_graph = _build_chunk_graph(tileable_graph) + bands = [(f"address_{i}", "numa-0") for i in range(4)] + band_resource = dict((band, Resource(num_cpus=1)) for band in bands) + task = Task("mock_task", "mock_session", tileable_graph) + analyzer = GraphAnalyzer(chunk_graph, band_resource, task, Config(), dict()) + subtask_graph = analyzer.gen_subtask_graph() + stage_processor = TaskStageProcessor( + "stage_id", task, chunk_graph, subtask_graph, bands, None, None, None + ) + return stage_processor + + +async def _test_default_evaluator(config: Dict[str, Any], t: Tileable): + resource_evaluator = await ResourceEvaluator.create(config) + assert resource_evaluator is not None + assert isinstance(resource_evaluator, DefaultEvaluator) + stage_processor = await _gen_stage_processor(t) + await resource_evaluator.evaluate(stage_processor) + for subtask in stage_processor.subtask_graph.iter_nodes(): + is_gpu = any(c.op.gpu for c in subtask.chunk_graph) + assert ( + subtask.required_resource == Resource(num_gpus=1) + if is_gpu + else Resource(num_cpus=1) + ) + assert await resource_evaluator.report() is None + + +@pytest.mark.asyncio +async def test_resource_evaluator(): + # test mocked resource evaluator + resource_evaluator = await ResourceEvaluator.create({"resource_evaluator": "mock"}) + assert resource_evaluator is not None + assert isinstance(resource_evaluator, MockedEvaluator) + + # test default resource evaluator + t = mt.ones((10, 10), chunk_size=5) + 1 + await _test_default_evaluator({}, t) + await _test_default_evaluator({"resource_evaluator": "default"}, t) + df = md.DataFrame( + np.random.randint(0, 100, size=(100_000, 4)), + columns=list("ABCD"), + chunk_size=1000, + ) + df = df[df["A"] > 50] + await _test_default_evaluator({}, df) diff --git a/python/xorbits/_mars/services/task/execution/ray/__init__.py b/python/xorbits/_mars/services/task/execution/ray/__init__.py new file mode 100644 index 000000000..84e0ac757 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
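Editor's aside: the `MockedEvaluator` in the test above works because `register_resource_evaluator` records each class under its `name`, and `ResourceEvaluator.create` dispatches on the `resource_evaluator` config key. A simplified, self-contained sketch of that dispatch (it skips the async factory and per-evaluator config sub-section of the real code):

```python
_registry = {}


def register(cls):
    _registry[cls.name] = cls
    return cls


@register
class DefaultEvaluator:
    name = "default"


@register
class HboEvaluator:  # hypothetical history-based (HBO) evaluator
    name = "hbo"


def create(config):
    name = config.get("resource_evaluator", "default")
    return _registry[name]()


assert isinstance(create({}), DefaultEvaluator)
assert isinstance(create({"resource_evaluator": "hbo"}), HboEvaluator)
```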
+ +from .config import RayExecutionConfig +from .executor import RayTaskExecutor +from .fetcher import RayFetcher diff --git a/python/xorbits/_mars/services/task/execution/ray/config.py b/python/xorbits/_mars/services/task/execution/ray/config.py new file mode 100644 index 000000000..ab1c374a3 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/config.py @@ -0,0 +1,99 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import Dict, List, Union + +from .....core.operand import ShuffleFetchType +from .....resource import Resource +from ..api import ExecutionConfig, register_config_cls +from ..utils import get_band_resources_from_config + +logger = logging.getLogger(__name__) + +IN_RAY_CI = os.environ.get("MARS_CI_BACKEND", "mars") == "ray" +# The default interval seconds to update progress and collect garbage. +DEFAULT_MONITOR_INTERVAL_SECONDS = 0 if IN_RAY_CI else 1 +DEFAULT_LOG_INTERVAL_SECONDS = 60 +DEFAULT_CHECK_SLOW_SUBTASKS_INTERVAL_SECONDS = 120 + + +@register_config_cls +class RayExecutionConfig(ExecutionConfig): + name = "ray" + + def __init__(self, execution_config: Dict): + super().__init__(execution_config) + self._ray_execution_config = execution_config[self.backend] + + def get_band_resources(self): + """ + Get the band resources from config for generating ray virtual + resources. + """ + return get_band_resources_from_config(self._ray_execution_config) + + def get_deploy_band_resources(self) -> List[Dict[str, Resource]]: + return [] + + def get_subtask_max_retries(self): + return self._ray_execution_config["subtask_max_retries"] + + def get_subtask_num_cpus(self) -> Union[int, float]: + return self._ray_execution_config.get("subtask_num_cpus", 1) + + def get_subtask_memory(self) -> Union[int, float]: + return self._ray_execution_config.get("subtask_memory", None) + + def get_n_cpu(self): + return self._ray_execution_config["n_cpu"] + + def get_n_worker(self): + return self._ray_execution_config["n_worker"] + + def get_monitor_interval_seconds(self): + """ + The interval seconds for the monitor task to update progress and + collect garbage. 
+ """ + return self._ray_execution_config.get( + "monitor_interval_seconds", DEFAULT_MONITOR_INTERVAL_SECONDS + ) + + def get_log_interval_seconds(self): + return self._ray_execution_config.get( + "log_interval_seconds", DEFAULT_LOG_INTERVAL_SECONDS + ) + + def get_check_slow_subtasks_interval_seconds(self) -> float: + return self._ray_execution_config.get( + "check_slow_subtasks_interval_seconds", + DEFAULT_CHECK_SLOW_SUBTASKS_INTERVAL_SECONDS, + ) + + def get_check_slow_subtask_iqr_ratio(self) -> float: + # https://en.wikipedia.org/wiki/Box_plot + # iqr = q3 - q1 + # duration_threshold = q3 + check_slow_subtasks_iqr_ratio * (q3 - q1) + # So, the value == 3, extremely slow(probably hang); value == 1.5, slow + return self._ray_execution_config.get("check_slow_subtasks_iqr_ratio", 3) + + def get_shuffle_fetch_type(self) -> ShuffleFetchType: + return ShuffleFetchType.FETCH_BY_INDEX + + def get_gc_method(self): + method = self._ray_execution_config.get("gc_method", "submitted") + assert method in ["submitted", "completed"] + return method diff --git a/python/xorbits/_mars/services/task/execution/ray/context.py b/python/xorbits/_mars/services/task/execution/ray/context.py new file mode 100644 index 000000000..5b6f2f4b3 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/context.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from dataclasses import asdict +from typing import Callable, Dict, List + +from .....core.context import Context +from .....storage.base import StorageLevel +from .....typing import ChunkType +from .....utils import implements, lazy_import, sync_to_async +from ....context import ThreadedServiceContext +from .config import RayExecutionConfig + +ray = lazy_import("ray") +logger = logging.getLogger(__name__) + + +class RayRemoteObjectManager: + """The remote object manager in task state actor.""" + + def __init__(self): + self._named_remote_objects = {} + + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + remote_object = object_cls(*args, **kwargs) + self._named_remote_objects[name] = remote_object + + def destroy_remote_object(self, name: str): + self._named_remote_objects.pop(name, None) + + async def call_remote_object(self, name: str, attr: str, *args, **kwargs): + remote_object = self._named_remote_objects[name] + meth = getattr(remote_object, attr) + async_meth = sync_to_async(meth) + return await async_meth(*args, **kwargs) + + +class _RayRemoteObjectWrapper: + def __init__(self, task_state_actor: "ray.actor.ActorHandle", name: str): + self._task_state_actor = task_state_actor + self._name = name + + def __getattr__(self, attr): + def wrap(*args, **kwargs): + r = self._task_state_actor.call_remote_object.remote( + self._name, attr, *args, **kwargs + ) + return ray.get(r) + + return wrap + + +class _RayRemoteObjectContext: + def __init__( + self, + get_or_create_actor: Callable[[], "ray.actor.ActorHandle"], + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + self._get_or_create_actor = get_or_create_actor + self._task_state_actor = None + + def _get_task_state_actor(self) -> "ray.actor.ActorHandle": + # Get the RayTaskState actor, this is more clear and faster than wraps + # the `get_or_create_actor` by lru_cache in __init__ because this method + # is called as needed. + if self._task_state_actor is None: + self._task_state_actor = self._get_or_create_actor() + return self._task_state_actor + + @implements(Context.create_remote_object) + def create_remote_object(self, name: str, object_cls, *args, **kwargs): + task_state_actor = self._get_task_state_actor() + r = task_state_actor.create_remote_object.remote( + name, object_cls, *args, **kwargs + ) + # Make sure the actor is created. The remote object may not be created + # when get_remote_object from worker because the callers of + # create_remote_object and get_remote_object are not in the same worker. + # Use sync Ray actor requires this `ray.get`, too. + ray.get(r) + return _RayRemoteObjectWrapper(task_state_actor, name) + + @implements(Context.get_remote_object) + def get_remote_object(self, name: str): + task_state_actor = self._get_task_state_actor() + return _RayRemoteObjectWrapper(task_state_actor, name) + + @implements(Context.destroy_remote_object) + def destroy_remote_object(self, name: str): + task_state_actor = self._get_task_state_actor() + task_state_actor.destroy_remote_object.remote(name) + + +# TODO(fyrestone): Implement more APIs for Ray. 
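Editor's aside: `_RayRemoteObjectWrapper` above turns every attribute access into a call that is forwarded to the task state actor. The sketch below reproduces that `__getattr__` forwarding pattern without Ray, with a stub backend standing in for `call_remote_object.remote(...)` plus `ray.get`; all names are hypothetical.

```python
class StubBackend:
    def call_remote_object(self, name, attr, *args, **kwargs):
        # stands in for task_state_actor.call_remote_object.remote(...) + ray.get(...)
        return f"{name}.{attr}{args}"


class RemoteObjectProxy:
    def __init__(self, backend, name):
        self._backend, self._name = backend, name

    def __getattr__(self, attr):
        def wrap(*args, **kwargs):
            return self._backend.call_remote_object(self._name, attr, *args, **kwargs)

        return wrap


counter = RemoteObjectProxy(StubBackend(), "my_counter")
assert counter.increment(3) == "my_counter.increment(3,)"
```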
+class RayExecutionContext(_RayRemoteObjectContext, ThreadedServiceContext): + """The context for tiling.""" + + def __init__( + self, + config: RayExecutionConfig, + task_context: Dict, + task_chunks_meta: Dict, + worker_addresses: List[str], + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + self._config = config + self._task_context = task_context + self._task_chunks_meta = task_chunks_meta + self._worker_addresses = worker_addresses + + @implements(Context.get_chunks_result) + def get_chunks_result(self, data_keys: List[str], fetch_only: bool = False) -> List: + logger.info("Getting %s chunks result.", len(data_keys)) + object_refs = [self._task_context[key] for key in data_keys] + result = ray.get(object_refs) + logger.info("Got %s chunks result.", len(result)) + return result if not fetch_only else None + + @implements(Context.get_chunks_meta) + def get_chunks_meta( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + if not self._task_chunks_meta: + result = self._call( + self._get_chunks_meta_from_service( + data_keys, fields=fields, error=error + ) + ) + else: + result = [{}] * len(data_keys) + missing_key_indexes = [] + missing_keys = [] + for idx, key in enumerate(data_keys): + try: + chunk_meta = self._task_chunks_meta[key] + except KeyError: + missing_key_indexes.append(idx) + missing_keys.append(key) + else: + meta = asdict(chunk_meta) + meta = {f: meta.get(f) for f in fields} + result[idx] = meta + if missing_keys: + missing_meta = self._call( + self._get_chunks_meta_from_service( + missing_keys, fields=fields, error=error + ) + ) + for idx, meta in zip(missing_key_indexes, missing_meta): + result[idx] = meta + return result + + async def _get_chunks_meta_from_service( + self, data_keys: List[str], fields: List[str] = None, error="raise" + ) -> List[Dict]: + get_metas = [ + self._meta_api.get_chunk_meta.delay(data_key, fields=fields, error=error) + for data_key in data_keys + ] + return await self._meta_api.get_chunk_meta.batch(*get_metas) + + @implements(Context.get_total_n_cpu) + def get_total_n_cpu(self) -> int: + # TODO(fyrestone): Support auto scaling. + return self._config.get_n_cpu() * self._config.get_n_worker() + + @implements(Context.get_worker_addresses) + def get_worker_addresses(self) -> List[str]: + # Returns virtual worker addresses. + return self._worker_addresses + + +# TODO(fyrestone): Implement more APIs for Ray. 
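Editor's aside: `get_chunks_meta` above answers from the local `task_chunks_meta` cache first and only asks the meta service for the keys it misses, splicing the fetched entries back by index so the result order matches the request. A minimal sketch of that lookup strategy with a dummy fetch function (not part of the patch):

```python
def lookup(keys, cache, fetch_missing):
    result = [None] * len(keys)
    missing_indexes, missing_keys = [], []
    for idx, key in enumerate(keys):
        if key in cache:
            result[idx] = cache[key]
        else:
            missing_indexes.append(idx)
            missing_keys.append(key)
    # one batched round trip for everything the cache could not answer
    for idx, value in zip(missing_indexes, fetch_missing(missing_keys)):
        result[idx] = value
    return result


cache = {"a": 1, "c": 3}
fetched = lookup(["a", "b", "c"], cache, lambda ks: [f"svc:{k}" for k in ks])
assert fetched == [1, "svc:b", 3]
```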
+class RayExecutionWorkerContext(_RayRemoteObjectContext, dict): + """The context for executing operands.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._current_chunk = None + + @classmethod + @implements(Context.new_custom_log_dir) + def new_custom_log_dir(cls): + logger.info( + "%s does not support register_custom_log_path / new_custom_log_dir", + cls.__name__, + ) + return None + + @staticmethod + @implements(Context.register_custom_log_path) + def register_custom_log_path( + session_id: str, + tileable_op_key: str, + chunk_op_key: str, + worker_address: str, + log_path: str, + ): + raise NotImplementedError + + @classmethod + @implements(Context.set_progress) + def set_progress(cls, progress: float): + logger.info( + "%s does not support set_running_operand_key / set_progress", cls.__name__ + ) + + @staticmethod + @implements(Context.set_running_operand_key) + def set_running_operand_key(session_id: str, op_key: str): + raise NotImplementedError + + @classmethod + @implements(Context.get_storage_info) + def get_storage_info( + cls, address: str = None, level: StorageLevel = StorageLevel.MEMORY + ): + logger.info("%s does not support get_storage_info", cls.__name__) + return {} + + def set_current_chunk(self, chunk: ChunkType): + """Set current executing chunk.""" + self._current_chunk = chunk + + def get_current_chunk(self) -> ChunkType: + """Set current executing chunk.""" + return self._current_chunk diff --git a/python/xorbits/_mars/services/task/execution/ray/executor.py b/python/xorbits/_mars/services/task/execution/ray/executor.py new file mode 100644 index 000000000..3be339a57 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/executor.py @@ -0,0 +1,1086 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
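Editor's aside: `RayExecutionWorkerContext` above is essentially a dict keyed by chunk key (or by `(mapper_index, reducer_index)` for shuffle blocks) plus a slot for the chunk currently being executed. A hypothetical usage sketch of that shape, outside of Ray:

```python
class WorkerContext(dict):
    """Chunk-key -> data mapping plus the chunk currently being executed."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._current_chunk = None

    def set_current_chunk(self, chunk):
        self._current_chunk = chunk

    def get_current_chunk(self):
        return self._current_chunk


ctx = WorkerContext({"input-chunk": [1, 2, 3]})
ctx.set_current_chunk("sum-chunk")
ctx["sum-chunk"] = sum(ctx["input-chunk"])  # an operand writes its output back
assert ctx.get_current_chunk() == "sum-chunk" and ctx["sum-chunk"] == 6
```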
+ +import asyncio +import collections +import enum +import functools +import itertools +import logging +import operator +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List + +import numpy as np + +from .....core import Chunk, ChunkGraph, TileContext +from .....core.context import set_context +from .....core.operand import Fetch, Fuse, VirtualOperand, execute +from .....core.operand.fetch import FetchShuffle +from .....lib.aio import alru_cache +from .....metrics.api import Metrics, init_metrics +from .....resource import Resource +from .....serialization import deserialize, serialize +from .....typing import BandType +from .....utils import ( + aiotask_wrapper, + calc_data_size, + classproperty, + get_chunk_params, + lazy_import, +) +from ....lifecycle.api import LifecycleAPI +from ....meta.api import MetaAPI +from ....subtask import Subtask, SubtaskGraph +from ....subtask.utils import iter_output_data +from ...core import Task +from ..api import ExecutionChunkResult, TaskExecutor, register_executor_cls +from ..utils import ResultTileablesLifecycle +from .config import IN_RAY_CI, RayExecutionConfig +from .context import ( + RayExecutionContext, + RayExecutionWorkerContext, + RayRemoteObjectManager, +) +from .shuffle import ShuffleManager + +ray = lazy_import("ray") +logger = logging.getLogger(__name__) + + +class RayMetrics: + _submitted_subtask_number = None + _started_subtask_number = None + _completed_subtask_number = None + + @classproperty + def submitted_subtask_number(self): + if RayMetrics._submitted_subtask_number is None: + RayMetrics._submitted_subtask_number = Metrics.counter( + "mars.ray_dag.submitted_subtask_number", + "The number of submitted subtask.", + ("session_id", "task_id", "stage_id"), + ) + return RayMetrics._submitted_subtask_number + + @classproperty + def started_subtask_number(self): + if RayMetrics._started_subtask_number is None: + RayMetrics._started_subtask_number = Metrics.counter( + "mars.ray_dag.started_subtask_number", + "The number of started subtask.", + ) + return RayMetrics._started_subtask_number + + @classproperty + def completed_subtask_number(self): + if RayMetrics._completed_subtask_number is None: + RayMetrics._completed_subtask_number = Metrics.counter( + "mars.ray_dag.completed_subtask_number", + "The number of completed subtask.", + ) + return RayMetrics._completed_subtask_number + + +class RayTaskState(RayRemoteObjectManager): + handle = None + + @classmethod + def get_handle(cls): + """Get the RayTaskState actor handle.""" + logger.info("Getting RayTaskState handle.") + return ray.get_actor(cls.__name__) + + @classmethod + def create(cls): + """Create a RayTaskState actor.""" + logger.info("Creating RayTaskState actor.") + name = cls.__name__ + try: + cls.handle = ray.get_actor(name) + except ValueError: + # Attempt to create it (may race with other attempts). + try: + cls.handle = ray.remote(cls).options(name=name).remote() + except ValueError: # pragma: no cover + # We lost the creation race, ignore. 
+ cls.handle = ray.get_actor(name) + return cls.handle + + +_optimize_physical = None + + +def _optimize_subtask_graph(subtask_graph): + global _optimize_physical + + if _optimize_physical is None: + from .....optimization.physical import optimize as _optimize_physical + return _optimize_physical(subtask_graph) + + +class _SubtaskGC: + """GC the inputs of subtask chunk.""" + + def __init__( + self, + subtask_chunk_graph: ChunkGraph, + context: RayExecutionWorkerContext, + ): + self._subtask_chunk_graph = subtask_chunk_graph + self._context = context + ref_counts = collections.defaultdict(lambda: 0) + # Set 1 for result chunks. + for result_chunk in subtask_chunk_graph.result_chunks: + ref_counts[result_chunk.key] += 1 + # Iter graph to set ref counts. + for chunk in subtask_chunk_graph: + ref_counts[chunk.key] += subtask_chunk_graph.count_successors(chunk) + self._chunk_key_ref_counts = ref_counts + + def gc_inputs(self, chunk: Chunk): + ref_counts = self._chunk_key_ref_counts + for inp in self._subtask_chunk_graph.iter_predecessors(chunk): + ref_counts[inp.key] -= 1 + if ref_counts[inp.key] == 0: + self._context.pop(inp.key, None) + + +def execute_subtask( + subtask_id: str, + subtask_chunk_graph: ChunkGraph, + output_meta_n_keys: int, + is_mapper, + *inputs, +): + """ + The function used for execute subtask in ray task. + + Parameters + ---------- + subtask_id: str + id of subtask + subtask_chunk_graph: ChunkGraph + chunk graph for subtask + output_meta_n_keys: int + will be 0 if subtask is a shuffle mapper. + is_mapper: bool + Whether current subtask is a shuffle mapper. Note that shuffle reducers such as `DataFrameDropDuplicates` + can be a mapper at the same time. + inputs: + inputs for current subtask + + Returns + ------- + subtask outputs and meta for outputs if `output_meta_keys` is provided. + """ + init_metrics("ray") + RayMetrics.started_subtask_number.record(1) + ray_task_id = ray.get_runtime_context().task_id + subtask_chunk_graph = deserialize(*subtask_chunk_graph) + logger.info("Start subtask: %s, ray task id: %s.", subtask_id, ray_task_id) + # Optimize chunk graph. + subtask_chunk_graph = _optimize_subtask_graph(subtask_chunk_graph) + fetch_chunks, shuffle_fetch_chunk = _get_fetch_chunks(subtask_chunk_graph) + context = RayExecutionWorkerContext(RayTaskState.get_handle) + if shuffle_fetch_chunk is not None: + # The subtask is a reducer subtask. + n_mappers = shuffle_fetch_chunk.op.n_mappers + # Some reducer may have multiple output chunks, see `PSRSshuffle._execute_reduce` and + # https://user-images.githubusercontent.com/12445254/168569524-f09e42a7-653a-4102-bdf0-cc1631b3168d.png + reducer_chunks = subtask_chunk_graph.successors(shuffle_fetch_chunk) + reducer_operands = set(c.op for c in reducer_chunks) + if len(reducer_operands) != 1: # pragma: no cover + raise ValueError( + f"Subtask {subtask_id} has more than 1 reduce operands: {subtask_chunk_graph.to_dot()}" + ) + reducer_operand = reducer_chunks[0].op + reducer_index = reducer_operand.reducer_index + # Virtual shuffle keys, keep this in sync with `MapReducerOperand#_iter_mapper_key_idx_pairs` + context.update( + {(i, reducer_index): block for i, block in enumerate(inputs[-n_mappers:])} + ) + inputs = inputs[:-n_mappers] + shuffle_input_key_count = len(context) + # Create a subtask GC object. + subtask_gc = _SubtaskGC(subtask_chunk_graph, context) + # Update non shuffle inputs to context. 
+ context.update(zip((start_chunk.key for start_chunk in fetch_chunks), inputs)) + + for chunk in subtask_chunk_graph.topological_iter(): + if chunk.key not in context: + try: + context.set_current_chunk(chunk) + execute(context, chunk.op) + except Exception: + logger.exception( + "Execute operand %s of graph %s failed.", + chunk.op, + subtask_chunk_graph.to_dot(), + ) + raise + subtask_gc.gc_inputs(chunk) + + # For non-mapper subtask, output context is chunk key to results. + # For mapper subtasks, output context is data key to results. + # `iter_output_data` must ensure values order since we only return values. + normal_output = {} + mapper_output = {} + for key, data, is_mapper_block in iter_output_data(subtask_chunk_graph, context): + if is_mapper_block: + mapper_output[key] = data + else: + normal_output[key] = data + + # The inputs are referenced by the Ray worker in _raylet.pyx, GC them in Mars is useless. + # So, subtask GC has skipped GC shuffle input keys in order to simplify the implementation. + expect_context_count = ( + len(normal_output) + len(mapper_output) + shuffle_input_key_count + ) + assert ( + len(context) == expect_context_count + ), f"The remaining context count mismatch: {len(context)}(actual) != {expect_context_count}(expected)." + + output_values = [] + # assert output keys order consistent + if is_mapper: + # mapper may produce outputs which isn't shuffle blocks, such as TensorUnique._execute_agg_reduce. + mapper_main_keys = set(k[0] for k in mapper_output.keys()) + assert len(mapper_main_keys) == 1, mapper_main_keys + # sorted reducer_index's consistency with reducer_ordinal is checked in + # `OperandTilesHandler._check_shuffle_reduce_chunks`. + # So sort keys by reducer_index to ensure mapper outputs consist with reducer_ordinal, + # then downstream can fetch shuffle blocks by reducer_ordinal. + mapper_output = dict(sorted(mapper_output.items(), key=lambda item: item[0][1])) + if output_meta_n_keys: + output_meta = {} + # for non-shuffle subtask, record meta in supervisor. + for chunk in subtask_chunk_graph.result_chunks[:output_meta_n_keys]: + chunk_key = chunk.key + if chunk_key not in output_meta: + if isinstance(chunk.op, Fuse): # pragma: no cover + # fuse op + chunk = chunk.chunk + data = context[chunk_key] + memory_size = calc_data_size(data) + output_meta[chunk_key] = get_chunk_params(chunk), memory_size + output_values.append(output_meta) + output_values.extend(normal_output.values()) + output_values.extend(mapper_output.values()) + logger.info("Complete subtask: %s, ray task id: %s.", subtask_id, ray_task_id) + RayMetrics.completed_subtask_number.record(1) + return output_values[0] if len(output_values) == 1 else output_values + + +def _get_fetch_chunks(chunk_graph): + fetch_chunks = [] + shuffle_fetch_chunk = None + for start_chunk in chunk_graph.iter_indep(): + if isinstance(start_chunk.op, FetchShuffle): + assert shuffle_fetch_chunk is None, shuffle_fetch_chunk + shuffle_fetch_chunk = start_chunk + elif isinstance(start_chunk.op, Fetch): + fetch_chunks.append(start_chunk) + return sorted(fetch_chunks, key=operator.attrgetter("key")), shuffle_fetch_chunk + + +def _get_subtask_out_info( + subtask_chunk_graph: ChunkGraph, is_mapper: bool, n_reducers: int = None +): + # output_keys might be duplicate in chunk graph, use dict to deduplicate. + # output_keys order should be consistent with remote `execute_subtask`, + # dict can preserve insert order. 
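Editor's aside: the `_SubtaskGC` helper used in the execution loop above drops an intermediate input from the worker context as soon as its last consumer inside the subtask chunk graph has executed, with result chunks keeping one extra reference. A standalone sketch of the same reference-counting idea on a made-up two-input graph:

```python
from collections import defaultdict

# a -> c, b -> c; "c" is the only result chunk of the subtask
consumers = {"a": ["c"], "b": ["c"], "c": []}
result_keys = {"c"}
data = {"a": "A", "b": "B"}  # materialized inputs

ref_counts = defaultdict(int)
for key in result_keys:
    ref_counts[key] += 1  # result chunks keep an extra reference
for key, succs in consumers.items():
    ref_counts[key] += len(succs)


def gc_inputs(executed_key):
    # decref every producer feeding the chunk we just executed; free at zero
    for key, succs in consumers.items():
        if executed_key in succs:
            ref_counts[key] -= 1
            if ref_counts[key] == 0:
                data.pop(key, None)


data["c"] = data["a"] + data["b"]  # "execute" chunk c
gc_inputs("c")
assert "a" not in data and "b" not in data and data["c"] == "AB"
```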
+ output_keys = {} + shuffle_chunk = None + if is_mapper: + assert n_reducers is not None + if len(subtask_chunk_graph.result_chunks) == 1: + return set(), n_reducers + for chunk in subtask_chunk_graph.result_chunks: + if not chunk.is_mapper: + output_keys[chunk.key] = 1 + # mapper may produce outputs which isn't shuffle blocks, such as TensorUnique._execute_agg_reduce + # which is mapper too, but some outputs are not mapper blocks: + # https://user-images.githubusercontent.com/12445254/184132642-a19259fd-43d6-4a27-a033-4aaa97d7586e.svg + else: + assert shuffle_chunk is None, (shuffle_chunk, chunk) + shuffle_chunk = chunk + return output_keys.keys(), len(output_keys) + n_reducers + for chunk in subtask_chunk_graph.result_chunks: + if isinstance( + chunk.op, VirtualOperand + ): # FIXME(chaokunyang) no need to check this? + continue + else: + output_keys[chunk.key] = 1 + return output_keys.keys(), len(output_keys) + + +class OrderedSet: + def __init__(self): + self._d = set() + self._l = list() + + def add(self, item): + self._d.add(item) + self._l.append(item) + assert len(self._d) == len(self._l) + + def update(self, items): + tmp = list(items) if isinstance(items, collections.Iterator) else items + self._l.extend(tmp) + self._d.update(tmp) + assert len(self._d) == len(self._l) + + def __contains__(self, item): + return item in self._d + + def __getitem__(self, item): + return self._l[item] + + def __len__(self): + return len(self._d) + + +class _RayExecutionStage(enum.Enum): + INIT = 0 + SUBMITTING = 1 + WAITING = 2 + + +@dataclass +class _RayChunkMeta: + memory_size: int + + +@dataclass +class _RayMonitorContext: + stage: _RayExecutionStage = _RayExecutionStage.INIT + submitted_subtasks: OrderedSet = field(default_factory=OrderedSet) + completed_subtasks: OrderedSet = field(default_factory=OrderedSet) + # The shuffle manager for monitor task to GC the object refs of shuffles. + shuffle_manager: ShuffleManager = None + # The first output object ref of a Subtask to the Subtask. + object_ref_to_subtask: Dict["ray.ObjectRef", Subtask] = field(default_factory=dict) + # Stage chunk keys may be duplicate. + # TODO(fyrestone): Remove this if Mars chunk keys are unique. 
+ chunk_key_ref_count: Dict[str, int] = field( + default_factory=lambda: collections.defaultdict(int) + ) + + +@dataclass +class _RaySubtaskRuntime: + start_time: float = 0.0 + + +class _RaySlowSubtaskChecker: + @dataclass + class _CheckInfo: + count: int + duration_threshold: float + + def __init__( + self, + total_subtask_count: int, + submitted_subtasks: OrderedSet, + completed_subtasks: OrderedSet, + interquartile_range_ratio: float = 3, + ): + self._total_subtask_count = total_subtask_count + self._submitted_subtasks = submitted_subtasks + self._completed_subtasks = completed_subtasks + self._logic_key_to_subtask_costs = collections.defaultdict(list) + self._logic_key_to_check_info = dict() + self._ratio = interquartile_range_ratio + + def update(self): + i = 0 + j = 0 + while i < self._total_subtask_count or j < self._total_subtask_count: + curr_time = time.time() + while i < len(self._submitted_subtasks): + subtask = self._submitted_subtasks[i] + subtask.runtime.start_time = curr_time + i += 1 + while j < len(self._completed_subtasks): + subtask = self._completed_subtasks[j] + self._logic_key_to_subtask_costs[subtask.logic_key].append( + curr_time - subtask.runtime.start_time + ) + j += 1 + yield + + def is_slow(self, subtask: Subtask): + logic_key = subtask.logic_key + if logic_key not in self._logic_key_to_subtask_costs: + # The subtask logic key has no costs. + return False + logic_parallelism = subtask.logic_parallelism + if not logic_parallelism: + # Invalid parallelism. + return False + subtask_costs = self._logic_key_to_subtask_costs[logic_key] + complete_count = len(subtask_costs) + if complete_count / logic_parallelism < 0.75: + # Too few complete subtasks. + return False + check_info = self._logic_key_to_check_info.get(logic_key) + if check_info is None or check_info.count != complete_count: + arr = np.array(subtask_costs) + # Please refer to: https://en.wikipedia.org/wiki/Box_plot + q1, q3 = np.quantile(arr, 0.25), np.quantile(arr, 0.75) + duration_threshold = q3 + self._ratio * (q3 - q1) + self._logic_key_to_check_info[ + logic_key + ] = _RaySlowSubtaskChecker._CheckInfo(complete_count, duration_threshold) + else: + duration_threshold = check_info.duration_threshold + assert subtask.runtime.start_time > 0 + return time.time() - subtask.runtime.start_time > duration_threshold + + +@register_executor_cls +class RayTaskExecutor(TaskExecutor): + name = "ray" + + def __init__( + self, + config: RayExecutionConfig, + task: Task, + tile_context: TileContext, + task_context: Dict[str, "ray.ObjectRef"], + task_chunks_meta: Dict[str, _RayChunkMeta], + lifecycle_api: LifecycleAPI, + meta_api: MetaAPI, + ): + logger.info( + "Start task %s with GC method %s.", + task.task_id, + config.get_gc_method(), + ) + self._config = config + self._task = task + self._tile_context = tile_context + self._task_context = task_context + self._task_chunks_meta = task_chunks_meta + self._ray_executor = self._get_ray_executor() + + # API + self._lifecycle_api = lifecycle_api + self._meta_api = meta_api + + self._available_band_resources = None + self._result_tileables_lifecycle = None + + # For progress and task cancel + self._stage_index = 0 + self._pre_all_stages_progress = 0.0 + self._pre_all_stages_tile_progress = 0.0 + self._cur_stage_progress = 0.0 + self._cur_stage_tile_progress = 0.0 + self._execute_subtask_graph_aiotask = None + self._cancelled = False + + @classmethod + async def create( + cls, + config: RayExecutionConfig, + *, + session_id: str, + address: str, + task: Task, + tile_context: 
TileContext, + **kwargs, + ) -> "RayTaskExecutor": + lifecycle_api, meta_api = await cls._get_apis(session_id, address) + task_context = {} + task_chunks_meta = {} + + executor = cls( + config, + task, + tile_context, + task_context, + task_chunks_meta, + lifecycle_api, + meta_api, + ) + available_band_resources = await executor.get_available_band_resources() + worker_addresses = list( + map(operator.itemgetter(0), available_band_resources.keys()) + ) + await cls._init_context( + config, + task_context, + task_chunks_meta, + RayTaskState.create, + worker_addresses, + session_id, + address, + ) + return executor + + def get_execution_config(self): + return self._config + + # noinspection DuplicatedCode + def destroy(self): + logger.info("Complete task %s.", self._task.task_id) + self._task = None + self._tile_context = None + self._task_context = {} + self._task_chunks_meta = {} + self._ray_executor = None + + # API + self._lifecycle_api = None + self._meta_api = None + + self._available_band_resources = None + self._result_tileables_lifecycle = None + + # For progress and task cancel + self._stage_index = 0 + self._pre_all_stages_progress = 1.0 + self._pre_all_stages_tile_progress = 1.0 + self._cur_stage_progress = 1.0 + self._cur_stage_tile_progress = 1.0 + self._execute_subtask_graph_aiotask = None + self._cancelled = None + self._config = None + + @classmethod + @alru_cache(cache_exceptions=False) + async def _get_apis(cls, session_id: str, address: str): + return await asyncio.gather( + LifecycleAPI.create(session_id, address), + MetaAPI.create(session_id, address), + ) + + @staticmethod + @functools.lru_cache(maxsize=None) # Specify maxsize=None to make it faster + def _get_ray_executor(): + # Export remote function once. + return ray.remote(execute_subtask) + + @classmethod + async def _init_context( + cls, + config: RayExecutionConfig, + task_context: Dict[str, "ray.ObjectRef"], + task_chunks_meta: Dict[str, _RayChunkMeta], + create_task_state_actor: Callable[[], "ray.actor.ActorHandle"], + worker_addresses: List[str], + session_id: str, + address: str, + ): + loop = asyncio.get_running_loop() + context = RayExecutionContext( + config, + task_context, + task_chunks_meta, + worker_addresses, + create_task_state_actor, + session_id, + address, + address, + address, + loop=loop, + ) + await context.init() + set_context(context) + + async def __aenter__(self): + self._result_tileables_lifecycle = ResultTileablesLifecycle( + self._task.tileable_graph, self._tile_context, self._lifecycle_api + ) + + async def execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + tile_context: TileContext, + context: Any = None, + ) -> Dict[Chunk, ExecutionChunkResult]: + if self._cancelled is True: # pragma: no cover + raise asyncio.CancelledError() + self._stage_index += 1 + stage_id = f"{self._stage_index}:{stage_id}" + logger.info("Start stage %s.", stage_id) + self._execute_subtask_graph_aiotask = asyncio.current_task() + + monitor_context = _RayMonitorContext() + monitor_aiotask = asyncio.create_task( + self._update_progress_and_collect_garbage( + stage_id, + subtask_graph, + chunk_graph, + monitor_context, + self._config.get_monitor_interval_seconds(), + self._config.get_gc_method(), + ) + ) + try: + # Previous execution may have duplicate tileable ids, the tileable may be decref + # during execution, so we should track and incref the result tileables before execute. 
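Editor's aside: `execute_subtask_graph` above runs a background monitor coroutine (progress updates and garbage collection) alongside the stage and cancels it in the `finally` block so it never outlives the stage. A minimal asyncio sketch of that pattern with made-up intervals:

```python
import asyncio


async def monitor(interval_seconds):
    while True:  # periodically update progress / collect garbage
        await asyncio.sleep(interval_seconds)


async def run_stage():
    monitor_task = asyncio.create_task(monitor(0.01))
    try:
        await asyncio.sleep(0.05)  # stands in for waiting on subtask results
        return "done"
    finally:
        monitor_task.cancel()  # the monitor is always torn down with its stage


assert asyncio.run(run_stage()) == "done"
```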
+ await self._result_tileables_lifecycle.incref_tiled() + return await self._execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, monitor_context + ) + except asyncio.CancelledError: + logger.info( + "Cancel %s ray tasks of stage %s.", + len(monitor_context.object_ref_to_subtask), + stage_id, + ) + for object_ref in monitor_context.object_ref_to_subtask.keys(): + ray.cancel(object_ref, force=True) + raise + finally: + logger.info("Clear stage %s.", stage_id) + monitor_aiotask.cancel() + for subtask in subtask_graph: + subtask.runtime = None + for key in self._task_context.keys() - self._task_chunks_meta.keys(): + self._task_context.pop(key) + + async def _execute_subtask_graph( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + monitor_context: _RayMonitorContext, + ) -> Dict[Chunk, ExecutionChunkResult]: + task_context = self._task_context + self._pre_all_stages_tile_progress = ( + self._pre_all_stages_tile_progress + self._cur_stage_tile_progress + ) + self._cur_stage_tile_progress = ( + self._tile_context.get_all_progress() - self._pre_all_stages_tile_progress + ) + shuffle_manager = ShuffleManager(subtask_graph) + monitor_context.stage = _RayExecutionStage.SUBMITTING + monitor_context.shuffle_manager = shuffle_manager + logger.info( + "Submitting %s subtasks of stage %s which contains shuffles: %s", + len(subtask_graph), + stage_id, + shuffle_manager.info(), + ) + subtask_max_retries = self._config.get_subtask_max_retries() + subtask_num_cpus = self._config.get_subtask_num_cpus() + subtask_memory = self._config.get_subtask_memory() + metrics_tags = { + "session_id": self._task.session_id, + "task_id": self._task.task_id, + "stage_id": stage_id, + } + output_meta_object_refs = [] + for subtask in subtask_graph.topological_iter(): + if subtask.virtual: + continue + subtask_chunk_graph = subtask.chunk_graph + input_object_refs = await self._load_subtask_inputs( + stage_id, subtask, task_context, shuffle_manager + ) + # Can't use `subtask_graph.count_successors(subtask) == 0` to check output meta, because a subtask + # may have some outputs which are dependent by downstream, but other outputs are not. see + # https://user-images.githubusercontent.com/12445254/168484663-a4caa3f4-0ccc-4cd7-bf20-092356815073.png + is_mapper, n_reducers = shuffle_manager.is_mapper(subtask), None + if is_mapper: + n_reducers = shuffle_manager.get_n_reducers(subtask) + output_keys, out_count = _get_subtask_out_info( + subtask_chunk_graph, is_mapper, n_reducers + ) + if is_mapper: + # shuffle meta won't be recorded in meta service. 
+ output_count = out_count + else: + output_count = out_count + bool(subtask.stage_n_outputs) + assert output_count != 0 + subtask_max_retries = subtask_max_retries if subtask.retryable else 0 + output_object_refs = self._ray_executor.options( + num_cpus=subtask_num_cpus, + num_returns=output_count, + max_retries=subtask_max_retries, + memory=subtask_memory, + scheduling_strategy="DEFAULT" if len(input_object_refs) else "SPREAD", + ).remote( + subtask.subtask_id, + serialize(subtask_chunk_graph, context={"serializer": "ray"}), + subtask.stage_n_outputs, + is_mapper, + *input_object_refs, + ) + await asyncio.sleep(0) + if output_count == 1: + output_object_refs = [output_object_refs] + RayMetrics.submitted_subtask_number.record(1, metrics_tags) + monitor_context.submitted_subtasks.add(subtask) + monitor_context.object_ref_to_subtask[output_object_refs[0]] = subtask + subtask.runtime = _RaySubtaskRuntime() + if subtask.stage_n_outputs: + meta_object_ref, *output_object_refs = output_object_refs + # TODO(fyrestone): Fetch(not get) meta object here. + output_meta_object_refs.append(meta_object_ref) + if is_mapper: + shuffle_manager.add_mapper_output_refs( + subtask, output_object_refs[-n_reducers:] + ) + output_object_refs = output_object_refs[:-n_reducers] + # Mars chunk keys may be duplicate, so we should track the ref count. + for chunk_key, object_ref in zip(output_keys, output_object_refs): + if chunk_key in task_context: + monitor_context.chunk_key_ref_count[chunk_key] += 1 + task_context[chunk_key] = object_ref + logger.info("Submitted %s subtasks of stage %s.", len(subtask_graph), stage_id) + + monitor_context.stage = _RayExecutionStage.WAITING + key_to_meta = {} + if len(output_meta_object_refs) > 0: + # TODO(fyrestone): Optimize update meta by fetching partial meta. + meta_count = len(output_meta_object_refs) + logger.info("Getting %s metas of stage %s.", meta_count, stage_id) + meta_list = await asyncio.gather(*output_meta_object_refs) + for meta in meta_list: + for key, (params, memory_size) in meta.items(): + key_to_meta[key] = params + self._task_chunks_meta[key] = _RayChunkMeta(memory_size=memory_size) + logger.info("Got %s metas of stage %s.", meta_count, stage_id) + + chunk_to_meta = {} + # ray.wait requires the object ref list is unique. + output_object_refs = set() + for chunk in chunk_graph.result_chunks: + chunk_key = chunk.key + # The result chunk may be in previous stage result, + # then the chunk does not have to be processed. + if chunk_key in task_context: + object_ref = task_context[chunk_key] + output_object_refs.add(object_ref) + chunk_params = key_to_meta.get(chunk_key) + if chunk_params is not None: + chunk_to_meta[chunk] = ExecutionChunkResult( + chunk_params, object_ref + ) + + logger.info("Waiting for stage %s complete.", stage_id) + # Patched the asyncio.to_thread for Python < 3.9 at mars/lib/aio/__init__.py + await asyncio.to_thread(ray.wait, list(output_object_refs), fetch_local=False) + + logger.info("Complete stage %s.", stage_id) + return chunk_to_meta + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + await self._result_tileables_lifecycle.decref_tracked() + try: + await self.cancel() + except BaseException: # noqa: E722 # nosec # pylint: disable=bare-except + pass + return + + # Update info if no exception occurs. 
+ update_metas = [] + for tileable in self._task.tileable_graph.result_tileables: + tileable = tileable.data if hasattr(tileable, "data") else tileable + chunk_keys = [] + for chunk in self._tile_context[tileable].chunks: + chunk_key = chunk.key + chunk_keys.append(chunk_key) + if ( + chunk_key in self._task_context + and chunk_key in self._task_chunks_meta + ): + # Some tileable graph may have result chunks that not be executed, + # for example: + # r, b = cut(series, bins, retbins=True) + # r_result = r.execute().fetch() + # b_result = b.execute().fetch() <- This is the case + object_ref = self._task_context[chunk_key] + chunk_meta = self._task_chunks_meta[chunk_key] + update_metas.append( + self._meta_api.set_chunk_meta.delay( + chunk, + bands=[], + object_ref=object_ref, + memory_size=chunk_meta.memory_size, + ) + ) + if update_metas: + await self._meta_api.set_chunk_meta.batch(*update_metas) + + async def get_available_band_resources(self) -> Dict[BandType, Resource]: + if self._available_band_resources is None: + band_resources = self._config.get_band_resources() + virtual_band_resources = {} + idx = 0 + for band_resource in band_resources: + for band, resource in band_resource.items(): + virtual_band_resources[ + (f"ray_virtual_address_{idx}:0", band) + ] = resource + idx += 1 + self._available_band_resources = virtual_band_resources + + return self._available_band_resources + + async def get_progress(self) -> float: + """Get the execution progress.""" + return self._cur_stage_progress + + async def cancel(self): + """Cancel the task execution.""" + logger.info("Start to cancel task %s.", self._task) + if self._task is None or self._cancelled is True: + return + self._cancelled = True + if self._execute_subtask_graph_aiotask is not None: + self._execute_subtask_graph_aiotask.cancel() + + async def _load_subtask_inputs( + self, + stage_id: str, + subtask: Subtask, + context: Dict, + shuffle_manager: ShuffleManager, + ): + """ + Load input object refs of subtask from context. + + It updates the context if the input object refs are fetched from + the meta service. + """ + normal_object_refs = [] + shuffle_object_refs = [] + key_to_get_meta = {} + # for non-shuffle chunks, chunk key will be used for indexing object refs. + # for shuffle chunks, mapper subtasks will have only one mapper chunk, and all outputs for mapper + # subtask will be shuffle blocks, the downstream reducers will receive inputs in the mappers order. + fetch_chunks, shuffle_fetch_chunk = _get_fetch_chunks(subtask.chunk_graph) + for index, fetch_chunk in enumerate(fetch_chunks): + chunk_key = fetch_chunk.key + # pure_depend data is not used, skip it. + if chunk_key in subtask.pure_depend_keys: + normal_object_refs.append(None) + elif chunk_key in context: + normal_object_refs.append(context[chunk_key]) + else: + normal_object_refs.append(None) + key_to_get_meta[index] = self._meta_api.get_chunk_meta.delay( + chunk_key, fields=["object_refs"] + ) + if shuffle_fetch_chunk is not None: + # shuffle meta won't be recorded in meta service, query it from shuffle manager. 
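+ # `get_reducer_input_refs` returns one ObjectRef per mapper (a column of the
+ # mapper-output matrix) in mapper order, which is the order the reducer
+ # expects its shuffle blocks in.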
+ shuffle_object_refs = list(shuffle_manager.get_reducer_input_refs(subtask)) + + if key_to_get_meta: + logger.debug( + "Fetch %s metas and update context of stage %s.", + len(key_to_get_meta), + stage_id, + ) + meta_list = await self._meta_api.get_chunk_meta.batch( + *key_to_get_meta.values() + ) + for index, meta in zip(key_to_get_meta.keys(), meta_list): + object_ref = meta["object_refs"][0] + normal_object_refs[index] = object_ref + context[fetch_chunks[index].key] = object_ref + return normal_object_refs + shuffle_object_refs + + @aiotask_wrapper(exit_if_exception=IN_RAY_CI) + async def _update_progress_and_collect_garbage( + self, + stage_id: str, + subtask_graph: SubtaskGraph, + chunk_graph: ChunkGraph, + monitor_context: _RayMonitorContext, + interval_seconds: float, + method: str, + ): + total = sum(not subtask.virtual for subtask in subtask_graph) + completed_subtasks = monitor_context.completed_subtasks + submitted_subtasks = monitor_context.submitted_subtasks + result_chunk_keys = {chunk.key for chunk in chunk_graph.result_chunks} + chunk_key_ref_count = monitor_context.chunk_key_ref_count + object_ref_to_subtask = monitor_context.object_ref_to_subtask + slow_subtask_checker = _RaySlowSubtaskChecker( + total, + submitted_subtasks, + completed_subtasks, + self._config.get_check_slow_subtask_iqr_ratio(), + ) + + def gc(): + """ + Consume the completed subtasks and collect garbage. + + GC the output object refs of the subtask which successors are submitted + (not completed as above) can reduce the memory peaks, but we can't cancel + and rerun slow subtasks because the input object refs of running subtasks + may be deleted. + """ + i = 0 + gc_subtasks = set() + gc_targets = ( + submitted_subtasks if method == "submitted" else completed_subtasks + ) + + while i < total: + while i >= len(gc_targets): + yield + # Iterate the completed subtasks once. + subtask = gc_targets[i] + i += 1 + logger.debug("GC[stage=%s] subtask: %s", stage_id, subtask) + + # Note: There may be a scenario in which delayed gc occurs. + # When a subtask has more than one predecessor, like A, B, + # and in the `for ... in ...` loop we get A firstly while + # B's successors are completed, A's not. Then we cannot remove + # B's results chunks before A's. + for pred in subtask_graph.iter_predecessors(subtask): + if pred in gc_subtasks: + continue + for succ in subtask_graph.iter_successors(pred): + while succ not in gc_targets: + yield + if pred.virtual: + # For virtual subtask, remove all the predecessors if it is + # completed. + ppreds = subtask_graph.predecessors(pred) + gc_subtasks.update(ppreds) + gc_chunks = itertools.chain( + *(p.chunk_graph.results for p in ppreds) + ) + # Remove object refs from shuffle manager. + for p in ppreds: + logger.debug("GC[stage=%s] shuffle: %s", stage_id, p) + monitor_context.shuffle_manager.remove_object_refs(p) + else: + gc_subtasks.add(pred) + gc_chunks = pred.chunk_graph.results + # We use ref count to handle duplicate chunk keys, so here decref + # should be the same as incref, use deduped chunk keys of a subtask. + pred_result_keys = set() + for chunk in gc_chunks: + chunk_key = chunk.key + if chunk_key in pred_result_keys: + continue + pred_result_keys.add(chunk_key) + # We need to check the GC chunk key is not in the + # result meta keys, because there are some special + # cases that the result meta keys are not the leaves. 
+ # + # example: test_cut_execution + if chunk_key not in result_chunk_keys: + logger.debug("GC[stage=%s] chunk: %s", stage_id, chunk) + ref_count = chunk_key_ref_count.get(chunk_key, 0) + if ref_count == 0: + self._task_context.pop(chunk_key, None) + else: + chunk_key_ref_count[chunk_key] = ref_count - 1 + + # TODO(fyrestone): Check the remaining self._task_context.keys() + # in the result subtasks + + collect_garbage = gc() + update_subtask_cost = slow_subtask_checker.update() + last_log_time = last_check_slow_time = time.time() + log_interval_seconds = self._config.get_log_interval_seconds() + check_slow_subtasks_interval_seconds = ( + self._config.get_check_slow_subtasks_interval_seconds() + ) + stage_to_log_func = { + _RayExecutionStage.SUBMITTING: lambda: logger.info( + "Submitted [%s/%s] subtasks of stage %s.", + len(submitted_subtasks), + total, + stage_id, + ), + _RayExecutionStage.WAITING: lambda: logger.info( + "Completed [%s/%s] subtasks of stage %s, one of waiting ray tasks: %s", + len(completed_subtasks), + total, + stage_id, + next(iter(object_ref_to_subtask)).task_id() + if object_ref_to_subtask + else None, + ), + } + + while len(completed_subtasks) < total: + curr_time = time.time() + if monitor_context.stage != _RayExecutionStage.INIT: + if curr_time - last_log_time > log_interval_seconds: # pragma: no cover + stage_to_log_func[monitor_context.stage]() + last_log_time = curr_time + + if len(object_ref_to_subtask) <= 0: # pragma: no cover + await asyncio.sleep(interval_seconds) + # We should run ray.wait after at least one Ray task is submitted. + # Please refer to: https://github.com/mars-project/mars/issues/3274 + continue + + # Only wait for unready subtask object refs. + ready_objects, unready_objects = await asyncio.to_thread( + ray.wait, + list(object_ref_to_subtask.keys()), + num_returns=len(object_ref_to_subtask), + timeout=0, + fetch_local=False, + ) + + # Pop the completed subtasks from object_ref_to_subtask. + completed_subtasks.update(map(object_ref_to_subtask.pop, ready_objects)) + # Update progress. + stage_progress = ( + len(completed_subtasks) / total * self._cur_stage_tile_progress + ) + self._cur_stage_progress = self._pre_all_stages_progress + stage_progress + # Update subtask cost group by the logic key to logic_key_to_subtask_costs. + for _ in update_subtask_cost: + break + # Collect garbage, use `for ... in ...` to avoid raising StopIteration. + for _ in collect_garbage: + break + # Check slow subtasks, after update_subtask_cost. + if monitor_context.stage == _RayExecutionStage.WAITING: + if len(completed_subtasks) > 0 and ( + curr_time - last_check_slow_time + > check_slow_subtasks_interval_seconds + ): + slow_objects = [] + for obj in unready_objects: + maybe_slow_subtask = object_ref_to_subtask[obj] + slow = slow_subtask_checker.is_slow(maybe_slow_subtask) + if slow: + slow_objects.append(obj) + if len(slow_objects) > 0: + logger.info( + "Slow tasks(%s): %s", + len(slow_objects), + [o.task_id() for o in slow_objects[:5]], + ) + else: + logger.debug( + "No slow tasks in %s unready tasks.", len(unready_objects) + ) + last_check_slow_time = curr_time + # Fast to next loop and give it a chance to update object_ref_to_subtask. 
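+ # Sleep 0 when something completed so the next iteration runs immediately;
+ # otherwise back off by `interval_seconds` to avoid busy-waiting on ray.wait.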
+ await asyncio.sleep(interval_seconds if len(ready_objects) == 0 else 0) diff --git a/python/xorbits/_mars/services/task/execution/ray/fetcher.py b/python/xorbits/_mars/services/task/execution/ray/fetcher.py new file mode 100644 index 000000000..3636c2968 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/fetcher.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import functools +from collections import namedtuple +from typing import Dict, List + +from .....utils import lazy_import +from ..api import Fetcher, register_fetcher_cls + +ray = lazy_import("ray") +_FetchInfo = namedtuple("FetchInfo", ["key", "object_ref", "conditions"]) + + +def _query_object_with_condition(o, conditions): + try: + return o.iloc[conditions] + except AttributeError: + return o[conditions] + + +@register_fetcher_cls +class RayFetcher(Fetcher): + name = "ray" + required_meta_keys = ("object_refs",) + + def __init__(self, **kwargs): + self._fetch_info_list = [] + self._no_conditions = True + + @staticmethod + @functools.lru_cache(maxsize=None) # Specify maxsize=None to make it faster + def _remote_query_object_with_condition(): + # Export remote function once. + return ray.remote(_query_object_with_condition) + + async def append(self, chunk_key: str, chunk_meta: Dict, conditions: List = None): + if conditions is not None: + self._no_conditions = False + self._fetch_info_list.append( + _FetchInfo(chunk_key, chunk_meta["object_refs"][0], conditions) + ) + + async def get(self): + if self._no_conditions: + return await asyncio.gather( + *(info.object_ref for info in self._fetch_info_list) + ) + refs = [None] * len(self._fetch_info_list) + for index, fetch_info in enumerate(self._fetch_info_list): + if fetch_info.conditions is None: + refs[index] = fetch_info.object_ref + else: + refs[index] = self._remote_query_object_with_condition().remote( + fetch_info.object_ref, tuple(fetch_info.conditions) + ) + return await asyncio.gather(*refs) diff --git a/python/xorbits/_mars/services/task/execution/ray/shuffle.py b/python/xorbits/_mars/services/task/execution/ray/shuffle.py new file mode 100644 index 000000000..81d1b4c26 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/shuffle.py @@ -0,0 +1,176 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
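+
+# How the Ray executor is expected to drive this module (an illustrative sketch
+# only; `subtask_graph`, `subtask`, `output_refs` and `reducer_subtask` stand for
+# local variables of the executor, see executor.py for the real call sites):
+#
+#     shuffle_manager = ShuffleManager(subtask_graph)
+#     if shuffle_manager.is_mapper(subtask):
+#         n_reducers = shuffle_manager.get_n_reducers(subtask)
+#         # the last n_reducers returned refs of a mapper task are its shuffle blocks
+#         shuffle_manager.add_mapper_output_refs(subtask, output_refs[-n_reducers:])
+#     ...
+#     input_refs = shuffle_manager.get_reducer_input_refs(reducer_subtask)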
+ +from typing import Iterable, List + +import numpy as np + +from .....core.operand import MapReduceOperand, OperandStage +from .....utils import lazy_import +from ....subtask import Subtask, SubtaskGraph + +ray = lazy_import("ray") + + +class ShuffleManager: + """Manage shuffle execution for ray by resolve dependencies between mappers outputs and reducers inputs based on + mapper and reducer index. + """ + + def __init__(self, subtask_graph: SubtaskGraph): + self._subtask_graph = subtask_graph + self._proxy_subtasks = subtask_graph.get_shuffle_proxy_subtasks() + self._num_shuffles = subtask_graph.num_shuffles() + self._mapper_output_refs = [] + self._mapper_indices = {} + self._reducer_indices = {} + for shuffle_index, proxy_subtask in enumerate(self._proxy_subtasks): + # Note that the reducers can also be mappers such as `DuplicateOperand`. + mapper_subtasks = subtask_graph.predecessors(proxy_subtask) + reducer_subtasks = subtask_graph.successors(proxy_subtask) + n_mappers = len(mapper_subtasks) + n_reducers = proxy_subtask.chunk_graph.results[0].op.n_reducers + mapper_output_arr = np.empty((n_mappers, n_reducers), dtype=object) + self._mapper_output_refs.append(mapper_output_arr) + self._mapper_indices.update( + { + subtask: (shuffle_index, mapper_index) + for mapper_index, subtask in enumerate(mapper_subtasks) + } + ) + # reducers subtask should be sorted by reducer_index and MapReduceOperand.map should insert shuffle block + # in reducers order, otherwise shuffle blocks will be sent to wrong reducers. + sorted_filled_reducer_subtasks = self._get_sorted_filled_reducers( + reducer_subtasks, n_reducers + ) + self._reducer_indices.update( + { + subtask: (shuffle_index, reducer_ordinal) + for reducer_ordinal, subtask in enumerate( + sorted_filled_reducer_subtasks + ) + } + ) + + @staticmethod + def _get_sorted_filled_reducers( + reducer_subtasks: Iterable[Subtask], n_reducers: int + ): + # For operands such as `PSRSAlign`, sometimes `reducer_subtasks` might be less than `n_reducers`. + # fill missing reducers with `None`. + filled_reducers = [None] * n_reducers + for subtask in reducer_subtasks: + reducer_ordinal = _get_reducer_operand(subtask.chunk_graph).reducer_ordinal + filled_reducers[reducer_ordinal] = subtask + return filled_reducers + + def has_shuffle(self): + """ + Whether current subtask graph has shuffles to execute. + """ + return self._num_shuffles > 0 + + def add_mapper_output_refs( + self, subtask: Subtask, output_object_refs: List["ray.ObjectRef"] + ): + """ + Record mapper output ObjectRefs which will be used by reducers later. + + Parameters + ---------- + subtask + output_object_refs : List["ray.ObjectRef"] + Mapper output ObjectRefs. + """ + shuffle_index, mapper_index = self._mapper_indices[subtask] + self._mapper_output_refs[shuffle_index][mapper_index] = np.array( + output_object_refs + ) + + def get_reducer_input_refs(self, subtask: Subtask) -> List["ray.ObjectRef"]: + """ + Get the reducer inputs ObjectRefs output by mappers. + + Parameters + ---------- + subtask : Subtask + A reducer subtask. + Returns + ------- + input_refs : List["ray.ObjectRef"] + The reducer inputs ObjectRefs output by mappers. + """ + shuffle_index, reducer_ordinal = self._reducer_indices[subtask] + return self._mapper_output_refs[shuffle_index][:, reducer_ordinal] + + def get_n_reducers(self, subtask: Subtask): + """ + Get the number of shuffle blocks that a mapper operand outputs, + which is also the number of the reducers when tiling shuffle operands. 
+ Note that this might be greater than actual number of the reducers in the subtask graph, + because some reducers may not be added to chunk graph. + + Parameters + ---------- + subtask : Subtask + A mapper or reducer subtask. + Returns + ------- + n_reducers : int + The number of shuffle blocks that a mapper operand outputs. + """ + index = self._mapper_indices.get(subtask) or self._reducer_indices.get(subtask) + if index is None: + raise ValueError(f"The {subtask} should be a mapper or a reducer.") + else: + shuffle_index, _ = index + return self._mapper_output_refs[shuffle_index].shape[1] + + def is_mapper(self, subtask: Subtask): + """ + Check whether a subtask is a mapper subtask. Note the even this a mapper subtask, it can be a reducer subtask + at the same time such as `DuplicateOperand`, see + https://user-images.githubusercontent.com/12445254/174305282-f7c682a9-0346-47fe-a34c-1e384e6a1775.svg + """ + return subtask in self._mapper_indices + + def info(self): + """ + A list of (mapper count, reducer count). + """ + return [shuffle_mapper.shape for shuffle_mapper in self._mapper_output_refs] + + def remove_object_refs(self, subtask: Subtask): + """ + Set the object refs to None by subtask. + """ + index = self._mapper_indices.get(subtask) + if index is not None: + shuffle_index, mapper_index = index + self._mapper_output_refs[shuffle_index][mapper_index].fill(None) + return + index = self._reducer_indices.get(subtask) + if index is not None: + shuffle_index, reducer_ordinal = index + self._mapper_output_refs[shuffle_index][:, reducer_ordinal].fill(None) + return + raise ValueError(f"The {subtask} should be a mapper or a reducer.") + + +def _get_reducer_operand(subtask_chunk_graph): + return next( + c.op + for c in subtask_chunk_graph + if isinstance(c.op, MapReduceOperand) and c.op.stage == OperandStage.reduce + ) diff --git a/python/xorbits/_mars/services/task/execution/ray/tests/__init__.py b/python/xorbits/_mars/services/task/execution/ray/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/execution/ray/tests/test_ray_execution_backend.py b/python/xorbits/_mars/services/task/execution/ray/tests/test_ray_execution_backend.py new file mode 100644 index 000000000..97bf6e3fd --- /dev/null +++ b/python/xorbits/_mars/services/task/execution/ray/tests/test_ray_execution_backend.py @@ -0,0 +1,723 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import time +from collections import Counter + +import numpy as np +import pandas as pd +import pytest + +from ...... import dataframe as md +from ...... import tensor as mt +from ......config import Config +from ......core import TileContext +from ......core.context import get_context +from ......core.graph import ChunkGraphBuilder, TileableGraph, TileableGraphBuilder +from ......core.operand import ShuffleFetchType +from ......lib.aio.isolation import new_isolation, stop_isolation +from ......resource import Resource +from ......serialization import serialize +from ......tests.core import mock, require_ray +from ......utils import get_chunk_params, lazy_import +from .....context import ThreadedServiceContext +from .....subtask import Subtask +from ....analyzer import GraphAnalyzer +from ....core import Task, new_task_id +from ..config import RayExecutionConfig +from ..context import ( + RayExecutionContext, + RayExecutionWorkerContext, + RayRemoteObjectManager, + _RayRemoteObjectContext, +) +from ..executor import ( + OrderedSet, + RayTaskExecutor, + RayTaskState, + _RayChunkMeta, + _RaySlowSubtaskChecker, + _RaySubtaskRuntime, + execute_subtask, +) +from ..fetcher import RayFetcher +from ..shuffle import ShuffleManager + +ray = lazy_import("ray") + + +def _gen_subtask_chunk_graph(t): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + return next(ChunkGraphBuilder(graph, fuse_enabled=False).build()) + + +def _gen_subtask_graph(t): + tileable_graph = t.build_graph(tile=False) + chunk_graph = next(ChunkGraphBuilder(tileable_graph).build()) + bands = [(f"address_{i}", "numa-0") for i in range(4)] + band_resource = dict((band, Resource(num_cpus=1)) for band in bands) + task = Task("mock_task", "mock_session", tileable_graph) + analyzer = GraphAnalyzer( + chunk_graph, + band_resource, + task, + Config(), + dict(), + shuffle_fetch_type=ShuffleFetchType.FETCH_BY_INDEX, + ) + subtask_graph = analyzer.gen_subtask_graph() + return chunk_graph, subtask_graph + + +class MockRayTaskExecutor(RayTaskExecutor): + def __init__(self, *args, **kwargs): + self._set_attrs = Counter() + self._monitor_tasks = [] + super().__init__(*args, **kwargs) + + @classmethod + async def _get_apis(cls, session_id: str, address: str): + return None, None + + @staticmethod + def _get_ray_executor(): + # Export remote function once. + return None + + async def get_available_band_resources(self): + return {} + + async def execute_subtask_graph(self, *args, **kwargs): + self._monitor_tasks.clear() + return await super().execute_subtask_graph(*args, **kwargs) + + async def _update_progress_and_collect_garbage(self, *args, **kwargs): + # Infinite loop to test monitor task cancel. 
+ self._monitor_tasks.append(asyncio.current_task()) + return await super()._update_progress_and_collect_garbage(*args, **kwargs) + + def monitor_tasks(self): + return self._monitor_tasks + + def set_attr_counter(self): + return self._set_attrs + + def __setattr__(self, key, value): + super().__setattr__(key, value) + self._set_attrs[key] += 1 + + +class MockTileContext(TileContext): + def get_all_progress(self) -> float: + return 1.0 + + +@require_ray +@pytest.mark.asyncio +@mock.patch("mars.services.task.execution.ray.executor.RayTaskState.create") +@mock.patch("mars.services.task.execution.ray.context.RayExecutionContext.init") +@mock.patch("ray.get") +async def test_ray_executor_create( + mock_ray_get, mock_execution_context_init, mock_task_state_actor_create +): + task = Task("mock_task", "mock_session", TileableGraph([])) + + # Create RayTaskState actor as needed by default. + mock_config = RayExecutionConfig.from_execution_config({"backend": "ray"}) + executor = await MockRayTaskExecutor.create( + mock_config, + session_id="mock_session_id", + address="mock_address", + task=task, + tile_context=TileContext(), + ) + assert isinstance(executor, MockRayTaskExecutor) + assert mock_task_state_actor_create.call_count == 0 + ctx = get_context() + assert isinstance(ctx, RayExecutionContext) + ctx.create_remote_object("abc", lambda: None) + assert mock_ray_get.call_count == 1 + assert mock_task_state_actor_create.call_count == 1 + + +@require_ray +@pytest.mark.asyncio +async def test_ray_executor_destroy(): + task = Task("mock_task", "mock_session", TileableGraph([])) + mock_config = RayExecutionConfig.from_execution_config({"backend": "ray"}) + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=TileContext(), + task_context={}, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + counter = executor.set_attr_counter() + assert len(counter) > 0 + keys = executor.__dict__.keys() + assert counter.keys() >= keys + counter.clear() + executor.destroy() + keys = set(keys) - {"_set_attrs", "_monitor_tasks"} + assert counter.keys() == keys, "Some keys are not reset in destroy()." + for k, v in counter.items(): + assert v == 1 + assert await executor.get_progress() == 1.0 + + +@require_ray +@mock.patch("ray.get_runtime_context") +def test_ray_execute_subtask_basic(_): + raw = np.ones((10, 10)) + raw_expect = raw + 1 + a = mt.ones((10, 10), chunk_size=10) + b = a + 1 + + subtask_id = new_task_id() + subtask_chunk_graph = _gen_subtask_chunk_graph(b) + r = execute_subtask(subtask_id, serialize(subtask_chunk_graph), 0, False) + np.testing.assert_array_equal(r, raw_expect) + test_get_meta_chunk = subtask_chunk_graph.result_chunks[0] + r = execute_subtask(subtask_id, serialize(subtask_chunk_graph), 1, False) + assert len(r) == 2 + meta_dict, r = r + assert len(meta_dict) == 1 + assert meta_dict[test_get_meta_chunk.key][0] == get_chunk_params( + test_get_meta_chunk + ) + np.testing.assert_array_equal(r, raw_expect) + + +@require_ray +@pytest.mark.asyncio +async def test_ray_fetcher(ray_start_regular_shared2): + pd_value = pd.DataFrame( + { + "col1": [str(i) for i in range(10)], + "col2": np.random.randint(0, 100, (10,)), + } + ) + pd_object_ref = ray.put(pd_value) + np_value = np.asarray([1, 3, 6, 2, 4]) + np_object_ref = ray.put(np_value) + # Test RayFetcher to fetch mixed values. 
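+ # Record the monitor task so the tests below can await it or assert it is done.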
+ fetcher = RayFetcher() + await fetcher.append("pd_key", {"object_refs": [pd_object_ref]}) + await fetcher.append("np_key", {"object_refs": [np_object_ref]}) + await fetcher.append("pd_key", {"object_refs": [pd_object_ref]}, [slice(1, 3, 1)]) + await fetcher.append("np_key", {"object_refs": [np_object_ref]}, [slice(1, 3, 1)]) + results = await fetcher.get() + pd.testing.assert_frame_equal(results[0], pd_value) + np.testing.assert_array_equal(results[1], np_value) + pd.testing.assert_frame_equal(results[2], pd_value.iloc[1:3]) + np.testing.assert_array_equal(results[3], np_value[1:3]) + + +@require_ray +@pytest.mark.asyncio +async def test_ray_remote_object(ray_start_regular_shared2): + class _TestRemoteObject: + def __init__(self, i): + self._i = i + + def value(self): + return self._i + + def foo(self, a, b): + return self._i + a + b + + async def bar(self, a, b): + return self._i * a * b + + # Test RayTaskState reference + state = RayTaskState.create() + await state.create_remote_object.remote("aaa", _TestRemoteObject, 123) + assert await state.call_remote_object.remote("aaa", "value") == 123 + state = RayTaskState.create() + assert await state.call_remote_object.remote("aaa", "value") == 123 + + # Test RayRemoteObjectManager + name = "abc" + manager = RayRemoteObjectManager() + manager.create_remote_object(name, _TestRemoteObject, 2) + r = await manager.call_remote_object(name, "foo", 3, 4) + assert r == 9 + r = await manager.call_remote_object(name, "bar", 3, 4) + assert r == 24 + manager.destroy_remote_object(name) + with pytest.raises(KeyError): + await manager.call_remote_object(name, "foo", 3, 4) + + # Test _RayRemoteObjectContext + context = _RayRemoteObjectContext(lambda: RayTaskState.create()) + context.create_remote_object(name, _TestRemoteObject, 2) + remote_object = context.get_remote_object(name) + r = remote_object.foo(3, 4) + assert r == 9 + r = remote_object.bar(3, 4) + assert r == 24 + context.destroy_remote_object(name) + with pytest.raises(KeyError): + remote_object.foo(3, 4) + + class MyException(Exception): + pass + + class _ErrorRemoteObject: + def __init__(self): + raise MyException() + + with pytest.raises(MyException): + context.create_remote_object(name, _ErrorRemoteObject) + + handle = RayTaskState.get_handle() + assert handle is not None + + +@require_ray +def test_ray_execution_context(ray_start_regular_shared2): + value = 123 + o = ray.put(value) + + def fake_init(self): + pass + + async def fake_get_chunks_meta_from_service( + self, data_keys, fields=None, error="raise" + ): + mock_meta = {"meta_1": {fields[0]: 1}, "meta_3": {fields[0]: 3}} + return [mock_meta[k] for k in data_keys] + + with mock.patch.object( + ThreadedServiceContext, "__init__", new=fake_init + ), mock.patch.object( + RayExecutionContext, + "_get_chunks_meta_from_service", + new=fake_get_chunks_meta_from_service, + ): + mock_config = RayExecutionConfig.from_execution_config({"backend": "ray"}) + mock_worker_addresses = ["mock_worker_address"] + isolation = new_isolation("test", threaded=True) + try: + context = RayExecutionContext( + mock_config, {"abc": o}, {}, mock_worker_addresses, lambda: None + ) + context._loop = isolation.loop + r = context.get_chunks_result(["abc"]) + assert r == [value] + + r = context.get_worker_addresses() + assert r == mock_worker_addresses + + r = context.get_chunks_meta(["meta_1"], fields=["memory_size"]) + assert r == [{"memory_size": 1}] + + context._task_chunks_meta["meta_1"] = _RayChunkMeta(memory_size=2) + r = context.get_chunks_meta(["meta_1", 
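+ # Appends without conditions are fetched as-is; appends with conditions are routed
+ # through the remote `_query_object_with_condition` task, which slices the object
+ # (`.iloc` for DataFrames, plain indexing otherwise) before returning it.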
"meta_3"], fields=["memory_size"]) + assert r == [{"memory_size": 2}, {"memory_size": 3}] + finally: + stop_isolation("test") + + +def test_ray_execution_worker_context(): + context = RayExecutionWorkerContext(lambda: None) + with pytest.raises(NotImplementedError): + context.set_running_operand_key("mock_session_id", "mock_op_key") + with pytest.raises(NotImplementedError): + context.register_custom_log_path( + "mock_session_id", + "mock_tileable_op_key", + "mock_chunk_op_key", + "mock_worker_address", + "mock_log_path", + ) + + assert context.set_progress(0.1) is None + assert context.new_custom_log_dir() is None + assert context.get_storage_info("mock_address") == {} + + +@require_ray +@pytest.mark.asyncio +async def test_ray_execution_config(ray_start_regular_shared2): + t1 = mt.random.randint(10, size=(100, 10), chunk_size=100) + chunk_graph, subtask_graph = _gen_subtask_graph(t1) + + real_executor = RayTaskExecutor._get_ray_executor() + + class MockExecutor: + opt = {} + + @classmethod + def options(cls, *args, **kwargs): + cls.opt = kwargs + return real_executor.options(*args, **kwargs) + + task = Task("mock_task", "mock_session", TileableGraph([])) + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "subtask_max_retries": 4, + "subtask_num_cpus": 0.8, + "subtask_memory": 1001, + "n_cpu": 1, + "n_worker": 1, + }, + } + ) + tile_context = MockTileContext() + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context={}, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + executor._ray_executor = MockExecutor + async with executor: + await executor.execute_subtask_graph( + "mock_stage", subtask_graph, chunk_graph, tile_context + ) + + assert MockExecutor.opt["num_cpus"] == 0.8 + assert MockExecutor.opt["max_retries"] == 4 + assert MockExecutor.opt["memory"] == 1001 + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("gc_method", ["submitted", "completed"]) +async def test_executor_context_gc(ray_start_regular_shared2, gc_method): + popped_seq = [] + + class MockTaskContext(dict): + def pop(self, k, d=None): + popped_seq.append(k) + return super().pop(k, d) + + t1 = mt.random.randint(10, size=(100, 10), chunk_size=100) + t2 = mt.random.randint(10, size=(100, 10), chunk_size=50) + t3 = t2 + t1 + t4 = t3.sum(0) + chunk_graph, subtask_graph = _gen_subtask_graph(t4) + task = Task("mock_task", "mock_session", TileableGraph([]), fuse_enabled=True) + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "log_interval_seconds": 0, + "subtask_max_retries": 0, + "n_cpu": 1, + "n_worker": 1, + "gc_method": gc_method, + }, + } + ) + tile_context = MockTileContext() + task_context = MockTaskContext() + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context=task_context, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + executor._ray_executor = RayTaskExecutor._get_ray_executor() + + original_execute_subtask_graph = executor._execute_subtask_graph + + async def _wait_gc_execute_subtask_graph(*args, **kwargs): + # Mock _execute_subtask_graph to wait the monitor task done. 
+ await original_execute_subtask_graph(*args, **kwargs) + await executor.monitor_tasks()[0] + + with mock.patch.object( + executor, "_execute_subtask_graph", _wait_gc_execute_subtask_graph + ): + async with executor: + await executor.execute_subtask_graph( + "mock_stage", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + + assert len(task_context) == 1 + assert len(popped_seq) == 6 + subtasks = list(subtask_graph.topological_iter()) + chunk_keys1 = set( + map( + lambda c: c.key, + ( + subtasks[0].chunk_graph.results + + subtasks[1].chunk_graph.results + + subtasks[3].chunk_graph.results + ), + ) + ) + chunk_keys2 = set( + map( + lambda c: c.key, + (subtasks[2].chunk_graph.results + subtasks[4].chunk_graph.results), + ) + ) + assert chunk_keys1 == set(popped_seq[0:4]) + assert chunk_keys2 == set(popped_seq[4:]) + + task_context.clear() + + original_update_progress_and_collect_garbage = ( + executor._update_progress_and_collect_garbage + ) + + async def infinite_update_progress_and_collect_garbage(*args, **kwargs): + # Mock _update_progress_and_collect_garbage that never done. + await original_update_progress_and_collect_garbage(*args, **kwargs) + while True: + await asyncio.sleep(0) + + with mock.patch("logging.Logger.info") as log_patch, mock.patch.object( + executor, + "_update_progress_and_collect_garbage", + infinite_update_progress_and_collect_garbage, + ): + async with executor: + await executor.execute_subtask_graph( + "mock_stage2", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + assert log_patch.call_count > 0 + args = [c.args[0] for c in log_patch.call_args_list] + assert any("Submitted [%s/%s]" in a for a in args) + assert any("Completed [%s/%s]" in a for a in args) + + assert len(task_context) == 1 + + task_context.clear() + + # Test the monitor aiotask is done even an exception is raised. + async def _raise_load_subtask_inputs(*args, **kwargs): + # Mock _load_subtask_inputs to raise an exception. 
+ await asyncio.sleep(0) + 1 / 0 + + with mock.patch.object( + executor, "_load_subtask_inputs", _raise_load_subtask_inputs + ): + async with executor: + with pytest.raises(ZeroDivisionError): + await executor.execute_subtask_graph( + "mock_stage3", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + + +@require_ray +@pytest.mark.asyncio +@pytest.mark.parametrize("gc_method", ["submitted", "completed"]) +async def test_execute_shuffle(ray_start_regular_shared2, gc_method): + chunk_size, n_rows = 10, 50 + df = md.DataFrame( + pd.DataFrame(np.random.rand(n_rows, 3), columns=list("abc")), + chunk_size=chunk_size, + ) + df2 = df.groupby(["a"]).apply(lambda x: x) + chunk_graph, subtask_graph = _gen_subtask_graph(df2) + task = Task("mock_task", "mock_session", TileableGraph([]), fuse_enabled=True) + + class MockRayExecutor: + @staticmethod + def options(**kwargs): + num_returns = kwargs["num_returns"] + + class _Wrapper: + @staticmethod + def remote(*args): + args = [ + ray.get(a) if isinstance(a, ray.ObjectRef) else a for a in args + ] + r = execute_subtask(*args) + assert len(r) == num_returns + return [ray.put(i) for i in r] + + return _Wrapper + + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "subtask_max_retries": 0, + "n_cpu": 1, + "n_worker": 1, + "gc_method": gc_method, + }, + } + ) + tile_context = MockTileContext() + task_context = {} + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context=task_context, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + executor._ray_executor = MockRayExecutor + + # Test ShuffleManager.remove_object_refs + sm = ShuffleManager(subtask_graph) + sm._mapper_output_refs[0].fill(1) + sm.remove_object_refs(next(iter(sm._reducer_indices.keys()))) + assert pd.isnull(sm._mapper_output_refs[0][:, 0]).all() + sm._mapper_output_refs[0].fill(1) + sm.remove_object_refs(next(iter(sm._mapper_indices.keys()))) + assert pd.isnull(sm._mapper_output_refs[0][0]).all() + with pytest.raises(ValueError): + sm.remove_object_refs(None) + + original_execute_subtask_graph = executor._execute_subtask_graph + + async def _wait_gc_execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, monitor_context + ): + # Mock _execute_subtask_graph to wait the monitor task done. 
+ await original_execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, monitor_context + ) + await executor.monitor_tasks()[0] + assert pd.isnull(monitor_context.shuffle_manager._mapper_output_refs[0]).all() + + with mock.patch.object( + executor, "_execute_subtask_graph", _wait_gc_execute_subtask_graph + ), mock.patch("ray.get_runtime_context"): + async with executor: + await executor.execute_subtask_graph( + "mock_stage", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + + assert len(task_context) == len(chunk_graph.results) + + +@require_ray +@pytest.mark.asyncio +async def test_slow_subtask_checker(): + subtasks = [ + Subtask(str(i), logic_key=f"logic_key1", logic_parallelism=5) for i in range(5) + ] + for s in subtasks: + s.runtime = _RaySubtaskRuntime() + submitted = OrderedSet() + completed = OrderedSet() + now = time.time() + checker = _RaySlowSubtaskChecker(5, submitted, completed) + updater = checker.update() + for s in subtasks: + submitted.add(s) + for _ in updater: + break + assert all(s.runtime.start_time >= now for s in subtasks) + await asyncio.sleep(0.01) + assert not any(checker.is_slow(s) for s in subtasks) + completed.add(subtasks[0]) + completed.add(subtasks[1]) + for _ in updater: + break + await asyncio.sleep(0.01) + completed.add(subtasks[2]) + assert not any(checker.is_slow(s) for s in subtasks[3:]) + completed.add(subtasks[3]) + for _ in updater: + break + assert not checker.is_slow(subtasks[4]) + await asyncio.sleep(0.1) + assert checker.is_slow(subtasks[4]) + + +@require_ray +@pytest.mark.asyncio +async def test_execute_slow_task(ray_start_regular_shared2): + t1 = mt.random.randint(10, size=(100, 10), chunk_size=10) + t2 = mt.random.randint(10, size=(100, 10), chunk_size=30) + t3 = t2 + t1 + t4 = t3.sum(0) + chunk_graph, subtask_graph = _gen_subtask_graph(t4) + task = Task("mock_task", "mock_session", TileableGraph([]), fuse_enabled=True) + mock_config = RayExecutionConfig.from_execution_config( + { + "backend": "ray", + "ray": { + "monitor_interval_seconds": 0, + "log_interval_seconds": 0, + "check_slow_subtasks_interval_seconds": 0, + "subtask_max_retries": 0, + "n_cpu": 1, + "n_worker": 1, + }, + } + ) + tile_context = MockTileContext() + executor = MockRayTaskExecutor( + config=mock_config, + task=task, + tile_context=tile_context, + task_context={}, + task_chunks_meta={}, + lifecycle_api=None, + meta_api=None, + ) + slow_subtask_id = list(subtask_graph)[-1].subtask_id + + def mock_execute_subtask(subtask_id, *args): + if subtask_id == slow_subtask_id: + time.sleep(1) + return execute_subtask(subtask_id, *args) + + executor._ray_executor = ray.remote(mock_execute_subtask) + + with mock.patch("logging.Logger.info") as log_patch: + async with executor: + await executor.execute_subtask_graph( + "mock_stage2", subtask_graph, chunk_graph, tile_context + ) + await asyncio.sleep(0) + assert len(executor.monitor_tasks()) == 1 + assert executor.monitor_tasks()[0].done() + assert log_patch.call_count > 0 + slow_ray_object_refs = set() + for c in log_patch.call_args_list: + if c.args[0] == "Slow tasks(%s): %s": + count, object_refs = c.args[1:] + assert count >= 1 + slow_ray_object_refs.update(object_refs) + assert len(slow_ray_object_refs) >= 1 diff --git a/python/xorbits/_mars/services/task/execution/utils.py b/python/xorbits/_mars/services/task/execution/utils.py new file mode 100644 index 000000000..854539d9e --- /dev/null +++ 
b/python/xorbits/_mars/services/task/execution/utils.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Dict, List + +from ....core import TileableGraph, TileContext +from ....resource import Resource +from ...lifecycle.api import LifecycleAPI + + +def get_band_resources_from_config( + backend_execution_config: Dict, +) -> List[Dict[str, Resource]]: + config = backend_execution_config + n_worker: int = config["n_worker"] + n_cpu: int = config["n_cpu"] + mem_bytes: int = config["mem_bytes"] + cuda_devices: List[List[int]] = config.get("cuda_devices") + + bands_to_resource = [] + worker_cpus = n_cpu // n_worker + cuda_devices = cuda_devices or ([[]] * n_worker) + if sum(len(devices) for devices in cuda_devices) == 0: + assert worker_cpus > 0, ( + f"{n_cpu} cpus are not enough " f"for {n_worker}, try to decrease workers." + ) + mem_bytes = mem_bytes // n_worker + for _, devices in zip(range(n_worker), cuda_devices): + worker_band_to_resource = dict() + if worker_cpus > 0: + worker_band_to_resource["numa-0"] = Resource( + num_cpus=worker_cpus, mem_bytes=mem_bytes + ) + for i in devices: + worker_band_to_resource[f"gpu-{i}"] = Resource(num_gpus=1) + bands_to_resource.append(worker_band_to_resource) + return bands_to_resource + + +class ResultTileablesLifecycle: + def __init__( + self, + tileable_graph: TileableGraph, + tile_context: TileContext, + lifecycle_api: LifecycleAPI, + ): + self._tileable_graph = tileable_graph + self._tile_context = tile_context + self._lifecycle_api = lifecycle_api + self._lifecycle_tracked_tileables = set() + self._lifecycle_untracked_tileables = set(self._tileable_graph.result_tileables) + + async def incref_tiled(self): + # track and incref result tileables if tiled + tracks = [], [] + new_track_tileables = set() + for tileable in self._lifecycle_untracked_tileables: + try: + tiled_tileable = self._tile_context[tileable] + except KeyError: + # not tiled, skip + pass + else: + tileable_key = tileable.key + tracks[0].append(tileable_key) + tracks[1].append( + self._lifecycle_api.track.delay( + tileable_key, [c.key for c in tiled_tileable.chunks] + ) + ) + new_track_tileables.add(tileable) + + if any(tracks): + # TODO(fyrestone): make the decref cancellation safe or + # make all the tileable ids unique. + self._lifecycle_untracked_tileables -= new_track_tileables + self._lifecycle_tracked_tileables |= new_track_tileables + await self._lifecycle_api.track.batch(*tracks[1]) + await self._lifecycle_api.incref_tileables(tracks[0]) + + async def decref_tracked(self): + await self._lifecycle_api.decref_tileables( + [t.key for t in self._lifecycle_tracked_tileables] + ) diff --git a/python/xorbits/_mars/services/task/supervisor/__init__.py b/python/xorbits/_mars/services/task/supervisor/__init__.py new file mode 100644 index 000000000..bd174cbc7 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .service import TaskSupervisorService diff --git a/python/xorbits/_mars/services/task/supervisor/graph_visualizer.py b/python/xorbits/_mars/services/task/supervisor/graph_visualizer.py new file mode 100644 index 000000000..0d09da9ad --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/graph_visualizer.py @@ -0,0 +1,148 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from io import StringIO +from typing import Dict, List + +from ....core.operand import Fetch, FetchShuffle +from ...subtask import Subtask, SubtaskGraph + + +class GraphVisualizer: + @classmethod + def to_dot(cls, subtask_graphs: List[SubtaskGraph]): + sio = StringIO() + sio.write("digraph {\n") + sio.write("splines=curved\n") + sio.write("rankdir=BT\n") + sio.write("graph [compound=true];\n") + subgraph_index = 0 + current_stage = 0 + result_chunk_to_subtask = dict() + line_colors = dict() + color_iter = iter(itertools.cycle(range(1, 9))) + for stage_line in itertools.combinations(range(len(subtask_graphs))[::-1], 2): + line_colors[stage_line] = f'"/spectral9/{next(color_iter)}"' + + for subtask_graph in subtask_graphs: + for subtask in subtask_graph.topological_iter(): + current_cluster = f"cluster_{subgraph_index}" + sio.write( + cls._export_subtask_to_dot( + subtask, + current_cluster, + current_stage, + line_colors, + result_chunk_to_subtask, + ) + ) + for c in subtask.chunk_graph.results: + result_chunk_to_subtask[c.key] = [current_stage, current_cluster] + subgraph_index += 1 + current_stage += 1 + sio.write("}") + return sio.getvalue() + + @classmethod + def _gen_chunk_key(cls, chunk, trunc_key): + if "_" in chunk.key: + key, index = chunk.key.split("_", 1) + return "_".join([key[:trunc_key], index]) + else: # pragma: no cover + return chunk.key[:trunc_key] + + @classmethod + def _export_subtask_to_dot( + cls, + subtask: Subtask, + subgraph_name: str, + current_stage: int, + line_colors: Dict, + chunk_key_to_subtask: Dict[str, List], + trunc_key: int = 5, + ): + chunk_graph = subtask.chunk_graph + sio = StringIO() + chunk_style = "[shape=box]" + operand_style = "[shape=circle]" + + visited = set() + all_nodes = [] + for node in chunk_graph.iter_nodes(): + op = node.op + if isinstance(node.op, (Fetch, FetchShuffle)): + continue + op_name = type(op).__name__ + if op.stage is not None: + op_name = f"{op_name}:{op.stage.name}" + if op.key in visited: + continue + for input_chunk in op.inputs or []: + if 
input_chunk.key not in visited and not isinstance( + input_chunk.op, (Fetch, FetchShuffle) + ): # pragma: no cover + node_name = f'"Chunk:{cls._gen_chunk_key(input_chunk, trunc_key)}"' + sio.write(f"{node_name} {chunk_style}\n") + all_nodes.append(node_name) + visited.add(input_chunk.key) + if op.key not in visited: + node_name = f'"{op_name}:{op.key[:trunc_key]}"' + sio.write(f"{node_name} {operand_style}\n") + all_nodes.append(node_name) + visited.add(op.key) + if ( + isinstance(input_chunk.op, (Fetch, FetchShuffle)) + and input_chunk.key in chunk_key_to_subtask + ): + stage, tail_cluster = chunk_key_to_subtask[input_chunk.key] + if stage == current_stage: + line_style = "style=bold" + else: + line_style = ( + f"style=bold color={line_colors[(current_stage, stage)]}" + ) + sio.write( + f'"Chunk:{cls._gen_chunk_key(input_chunk, trunc_key)}" ->' + f' "{op_name}:{op.key[:trunc_key]}" ' + f"[lhead={subgraph_name} ltail={tail_cluster} {line_style}];\n" + ) + else: + sio.write( + f'"Chunk:{cls._gen_chunk_key(input_chunk, trunc_key)}" -> ' + f'"{op_name}:{op.key[:trunc_key]}"\n' + ) + + for output_chunk in op.outputs or []: + if output_chunk.key not in visited: + node_name = f'"Chunk:{cls._gen_chunk_key(output_chunk, trunc_key)}"' + sio.write(f"{node_name} {chunk_style}\n") + all_nodes.append(node_name) + visited.add(output_chunk.key) + if op.key not in visited: + node_name = f'"{op_name}:{op.key[:trunc_key]}"' + sio.write(f"{node_name} {operand_style}\n") + all_nodes.append(node_name) + visited.add(op.key) + sio.write( + f'"{op_name}:{op.key[:trunc_key]}" -> ' + f'"Chunk:{cls._gen_chunk_key(output_chunk, trunc_key)}"\n' + ) + # write subgraph info + sio.write(f"subgraph {subgraph_name} {{\n") + nodes_str = " ".join(all_nodes) + sio.write(f"{nodes_str};\n") + sio.write(f'label="{subtask.subtask_id}";\n}}') + sio.write("\n") + return sio.getvalue() diff --git a/python/xorbits/_mars/services/task/supervisor/manager.py b/python/xorbits/_mars/services/task/supervisor/manager.py new file mode 100644 index 000000000..e92626b50 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/manager.py @@ -0,0 +1,389 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import contextlib +import importlib +import logging +import time +import weakref +from collections import defaultdict, deque +from dataclasses import dataclass +from typing import Any, Dict, List, Type + +from .... 
import oscar as mo +from ....core import TileableGraph, TileableType, TileContext, enter_mode +from ....core.operand import Fetch +from ....oscar.errors import ActorNotExist, ServerClosed +from ....utils import _is_ci, aiotask_wrapper +from ...subtask import SubtaskGraph, SubtaskResult +from ..config import task_options +from ..core import MapReduceInfo, Task, TaskStatus, new_task_id +from ..errors import TaskNotExist +from .preprocessor import TaskPreprocessor +from .processor import TaskProcessor +from .task import TaskProcessorActor + +logger = logging.getLogger(__name__) + + +class TaskConfigurationActor(mo.Actor): + def __init__( + self, + task_conf: Dict[str, Any], + execution_config: Dict[str, Any], + task_processor_cls: Type[TaskProcessor] = None, + task_preprocessor_cls: Type[TaskPreprocessor] = None, + ): + for name, value in task_conf.items(): + setattr(task_options, name, value) + self._execution_config = execution_config + self._task_processor_cls = task_processor_cls + self._task_preprocessor_cls = task_preprocessor_cls + + def get_config(self): + return { + "task_options": task_options, + "execution_config": self._execution_config, + "task_processor_cls": self._task_processor_cls, + "task_preprocessor_cls": self._task_preprocessor_cls, + } + + +class _RefHolder: + pass + + +@dataclass +class ResultTileableInfo: + tileable: TileableType + processor_ref: mo.ActorRefType[TaskProcessorActor] + ref_holder: _RefHolder + + +class TaskManagerActor(mo.Actor): + _task_id_to_processor_ref: Dict[str, mo.ActorRefType[TaskProcessorActor]] + _result_tileable_key_to_info: Dict[str, List[ResultTileableInfo]] + + def __init__(self, session_id: str): + self._session_id = session_id + + self._config = None + self._execution_config = None + self._task_processor_cls = None + self._task_preprocessor_cls = None + self._last_idle_time = None + + self._task_id_to_processor_ref = dict() + self._result_tileable_key_to_info = defaultdict(list) + + async def __post_create__(self): + # get config + configuration_ref = await mo.actor_ref( + TaskConfigurationActor.default_uid(), address=self.address + ) + task_conf = await configuration_ref.get_config() + ( + self._config, + self._execution_config, + self._task_processor_cls, + self._task_preprocessor_cls, + ) = ( + task_conf["task_options"], + task_conf["execution_config"], + task_conf["task_processor_cls"], + task_conf["task_preprocessor_cls"], + ) + self._task_preprocessor_cls = self._get_task_preprocessor_cls() + reserved_finish_tasks = task_conf["task_options"].reserved_finish_tasks + logger.info("Task manager reserves %s finish tasks.", reserved_finish_tasks) + self._reserved_finish_tasks = deque(maxlen=reserved_finish_tasks) + + async def __pre_destroy__(self): + # Avoid RuntimeError: dictionary changed size during iteration. 
+ coros = [ + processor_ref.destroy() + for processor_ref in self._task_id_to_processor_ref.values() + ] + await asyncio.gather(*coros) + + @staticmethod + def gen_uid(session_id): + return f"{session_id}_task_manager" + + @enter_mode(kernel=True) + async def submit_tileable_graph( + self, + graph: TileableGraph, + fuse_enabled: bool = None, + extra_config: dict = None, + ) -> str: + self._last_idle_time = None + # new task with task_name + task_id = new_task_id() + + uid = TaskProcessorActor.gen_uid(self._session_id, task_id) + # gen main task which mean each submission from user + processor_ref = await mo.create_actor( + TaskProcessorActor, + self._session_id, + task_id, + task_processor_cls=self._task_processor_cls, + address=self.address, + uid=uid, + ) + self._task_id_to_processor_ref[task_id] = processor_ref + + if fuse_enabled is None: + fuse_enabled = self._config.fuse_enabled + # gen task + task = Task( + task_id, + self._session_id, + graph, + fuse_enabled=fuse_enabled, + extra_config=extra_config, + ) + # gen task processor + tiled_context = await self._gen_tiled_context(graph) + await processor_ref.add_task( + task, + tiled_context, + self._config, + self._execution_config, + self._task_preprocessor_cls, + ) + + def _on_finalize(): + # The loop may be closed before the weakref is dead. + if loop.is_running(): + loop.create_task( + self._move_task_to_reserved(loop, task_id, processor_ref) + ) + + loop = asyncio.get_running_loop() + task_ref = _RefHolder() + weakref.finalize(task_ref, _on_finalize) + for tileable in graph.result_tileables: + info = ResultTileableInfo( + tileable=tileable, processor_ref=processor_ref, ref_holder=task_ref + ) + logger.debug( + "Add tileable info, task id: %s, tileable key: %s", + task_id, + tileable.key, + ) + self._result_tileable_key_to_info[tileable.key].append(info) + + return task_id + + @aiotask_wrapper(exit_if_exception=_is_ci) + async def _move_task_to_reserved(self, loop, task_id, processor_ref): + # TODO(fyrestone): Find a better way to wait and destroy the processor actor. 
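+ # Runs once no ResultTileableInfo references this task any more (scheduled by the
+ # weakref finalizer in submit_tileable_graph): wait for the task to finish, then
+ # park a fresh holder in the bounded `_reserved_finish_tasks` deque; when that
+ # holder is evicted, its own finalizer drops the task entry and destroys the
+ # processor actor.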
+ with contextlib.suppress(ActorNotExist, ServerClosed, ConnectionRefusedError): + await processor_ref.wait() + + logger.debug("Move task %s to reserved.", task_id) + ref_holder = _RefHolder() + self._reserved_finish_tasks.append(ref_holder) + + @aiotask_wrapper(exit_if_exception=_is_ci) + async def _destroy_actor(): + with contextlib.suppress( + ActorNotExist, ServerClosed, ConnectionRefusedError + ): + await processor_ref.destroy() + + def _remove_task(): + logger.debug("Remove task %s.", task_id) + self._task_id_to_processor_ref.pop(task_id, None) + if loop.is_running(): + loop.create_task(_destroy_actor()) + + weakref.finalize(ref_holder, _remove_task) + + async def get_subtask_graphs(self, task_id: str) -> List[SubtaskGraph]: + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return processor_ref.get_subtask_graphs(task_id) + + async def get_tileable_graph_dict_by_task_id(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: + raise TaskNotExist(f"Task {task_id} does not exist") + + res = await processor_ref.get_tileable_graph_as_dict() + return res + + async def get_tileable_details(self, task_id): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_tileable_details() + + async def get_tileable_subtasks(self, task_id, tileable_id, with_input_output): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_tileable_subtasks(tileable_id, with_input_output) + + async def _gen_tiled_context(self, graph: TileableGraph) -> TileContext: + # process graph, add fetch node to tiled context + tiled_context = TileContext() + for tileable in graph: + if isinstance(tileable.op, Fetch) and tileable.is_coarse(): + info_list = self._result_tileable_key_to_info[tileable.key] + assert info_list, f"The tileable {tileable.key} has no info." 
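+ # Resolve the fetch node against the most recent submission of this tileable key.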
+ info = info_list[-1] + tiled_context[tileable] = await info.processor_ref.get_result_tileable( + tileable.key + ) + return tiled_context + + def _get_task_preprocessor_cls(self): + if self._task_preprocessor_cls is not None: + assert isinstance(self._task_preprocessor_cls, str) + module, name = self._task_preprocessor_cls.rsplit(".", 1) + return getattr(importlib.import_module(module), name) + else: + return TaskPreprocessor + + async def wait_task(self, task_id: str, timeout: int = None): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return processor_ref.wait(timeout) + + async def cancel_task(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + yield processor_ref.cancel() + + async def get_task_results(self, progress: bool = False): + if not self._task_id_to_processor_ref: + raise mo.Return([]) + + results = yield asyncio.gather( + *[ref.result() for ref in self._task_id_to_processor_ref.values()] + ) + + if progress: + task_to_result = {res.task_id: res for res in results} + + progress_task_ids = [] + for res in results: + if res.status != TaskStatus.terminated: + progress_task_ids.append(res.task_id) + else: + res.progress = 1.0 + + progresses = yield asyncio.gather( + *[ + self._task_id_to_processor_ref[task_id].progress() + for task_id in progress_task_ids + ] + ) + for task_id, progress in zip(progress_task_ids, progresses): + task_to_result[task_id].progress = progress + + raise mo.Return(results) + + async def get_task_result(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.result() + + async def get_task_result_tileables(self, task_id: str): + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_result_tileables() + + async def set_subtask_result(self, subtask_result: SubtaskResult): + task_id = subtask_result.task_id + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + # raise TaskNotExist(f'Task {task_id} does not exist') + logger.warning( + "Current task is finished, got stale result %s for subtask %s " + "which may be speculative execution from previous tasks, just ignore it.", + subtask_result.subtask_id, + subtask_result, + ) + return + + yield processor_ref.set_subtask_result(subtask_result) + + @mo.extensible + async def get_task_progress(self, task_id: str) -> float: + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.progress() + + async def get_last_idle_time(self): + if self._last_idle_time is None: + for processor_ref in self._task_id_to_processor_ref.values(): + if not await processor_ref.is_done(): + break + else: + self._last_idle_time = time.time() + return self._last_idle_time + + async def remove_tileables(self, tileable_keys: List[str]): + # TODO(fyrestone) yield if needed. 
+ logger.debug("Remove tileable info: %s", tileable_keys) + for key in tileable_keys: + info_list = self._result_tileable_key_to_info.pop(key, []) + if info_list: + processor_is_done = await asyncio.gather( + *(info.processor_ref.is_done() for info in info_list) + ) + not_done_info = [ + info + for info, is_done in zip(info_list, processor_is_done) + if not is_done + ] + self._result_tileable_key_to_info[key] = not_done_info + + async def get_map_reduce_info( + self, task_id: str, map_reduce_id: int + ) -> MapReduceInfo: + try: + processor_ref = self._task_id_to_processor_ref[task_id] + except KeyError: # pragma: no cover + raise TaskNotExist(f"Task {task_id} does not exist") + + return await processor_ref.get_map_reduce_info(map_reduce_id) diff --git a/python/xorbits/_mars/services/task/supervisor/preprocessor.py b/python/xorbits/_mars/services/task/supervisor/preprocessor.py new file mode 100644 index 000000000..3cfdd566e --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/preprocessor.py @@ -0,0 +1,264 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import asyncio +import logging +from functools import partial +from typing import Callable, Dict, Iterable, List, Set + +from ....config import Config +from ....core import ChunkGraph, ChunkGraphBuilder, TileableGraph, TileContext +from ....core.graph.builder.chunk import Tiler, _TileableHandler +from ....core.operand import Fetch, ShuffleFetchType +from ....resource import Resource +from ....typing import BandType, ChunkType, TileableType +from ...subtask import Subtask, SubtaskGraph +from ..analyzer import GraphAnalyzer +from ..core import MapReduceInfo, Task + +logger = logging.getLogger(__name__) + + +class CancellableTiler(Tiler): + def __init__( + self, + tileable_graph: TileableGraph, + tile_context: TileContext, + processed_chunks: Set[str], + chunk_to_fetch: Dict[ChunkType, ChunkType], + add_nodes: Callable, + cancelled: asyncio.Event = None, + check_duplicated_submission: bool = False, + ): + super().__init__( + tileable_graph, tile_context, processed_chunks, chunk_to_fetch, add_nodes + ) + self._cancelled = cancelled + self._check_duplicated_submission = check_duplicated_submission + + @property + def cancelled(self): + return self._cancelled.is_set() + + def _gen_tileable_handlers(self, next_tileable_handlers: List[_TileableHandler]): + for tile_handler in super()._gen_tileable_handlers(next_tileable_handlers): + if not self.cancelled: + yield tile_handler + else: + break + + def _gen_result_chunks( + self, + chunk_graph: ChunkGraph, + next_tileable_handlers: List[_TileableHandler], + ): + if not self.cancelled: + return super()._gen_result_chunks(chunk_graph, next_tileable_handlers) + else: + return + + def _iter_without_check(self): + while self._tileable_handlers: + to_update_tileables = self._iter() + if not self.cancelled: + yield self._cur_chunk_graph + if not self.cancelled: + for t in to_update_tileables: + t.refresh_params() + else: + break + + def 
_iter_with_check(self): + chunk_set = set() + chunk_graphs = [] + for chunk_graph in self._iter_without_check(): + chunk_graphs.append(chunk_graph) + chunks = [] + for chunk in chunk_graph: + if isinstance(chunk.op, Fetch): + continue + if chunk in chunk_set: + raise RuntimeError(f"chunk {chunk} submitted repeatedly") + chunks.append(chunk) + chunk_set.update(chunks) + yield chunk_graph + + def __iter__(self): + if not self._check_duplicated_submission: + return self._iter_without_check() + else: + return self._iter_with_check() + + +class TaskPreprocessor: + __slots__ = ( + "_task", + "tileable_graph", + "tile_context", + "_config", + "tileable_optimization_records", + "chunk_optimization_records_list", + "_cancelled", + "_done", + "map_reduce_id_to_infos", + ) + + tile_context: TileContext + map_reduce_id_to_infos: Dict[int, MapReduceInfo] + + def __init__( + self, + task: Task, + tiled_context: TileContext = None, + config: Config = None, + ): + self._task = task + self.tileable_graph = task.tileable_graph + self._config = config + + self.tile_context = tiled_context + self.tileable_optimization_records = None + self.chunk_optimization_records_list = [] + self.map_reduce_id_to_infos = dict() + + self._cancelled = asyncio.Event() + self._done = asyncio.Event() + + def optimize(self) -> TileableGraph: + """ + Optimize tileable graph. + + Returns + ------- + optimized_graph: TileableGraph + + """ + from ....optimization.logical.tileable import ( + optimize as optimize_tileable_graph, + ) + + if self._config.optimize_tileable_graph: + # enable optimization + self.tileable_optimization_records = optimize_tileable_graph( + self.tileable_graph + ) + return self.tileable_graph + + def _fill_fetch_tileable_with_chunks(self, tileable_graph: TileableGraph): + for t in tileable_graph: + if isinstance(t.op, Fetch) and t in self.tile_context: + tiled = self.tile_context[t] + t._chunks = tiled.chunks + t._nsplits = tiled.nsplits + + def _get_tiler_cls(self) -> Callable: + extra_config = self._task.extra_config or dict() + check_duplicated_submission = extra_config.get( + "check_duplicated_submission", False + ) + return partial( + CancellableTiler, + cancelled=self._cancelled, + check_duplicated_submission=check_duplicated_submission, + ) + + def tile(self, tileable_graph: TileableGraph) -> Iterable[ChunkGraph]: + """ + Generate chunk graphs + + Returns + ------- + chunk_graph_generator: Generator + Chunk graphs. 
+ """ + from ....optimization.logical.chunk import optimize as optimize_chunk_graph + + self._fill_fetch_tileable_with_chunks(tileable_graph) + # iterative chunk graph builder + chunk_graph_builder = ChunkGraphBuilder( + tileable_graph, + fuse_enabled=self._task.fuse_enabled, + tile_context=self.tile_context, + tiler_cls=self._get_tiler_cls(), + ) + optimize = self._config.optimize_chunk_graph + for t in tileable_graph: + if hasattr(t.op, "logic_key") and t.op.logic_key is None: + t.op.logic_key = t.op.get_logic_key() + for chunk_graph in chunk_graph_builder.build(): + if len(chunk_graph) == 0: + continue + # optimize chunk graph + if optimize: + self.chunk_optimization_records_list.append( + optimize_chunk_graph(chunk_graph) + ) + yield chunk_graph + + def post_chunk_graph_execution(self): # pylint: disable=no-self-use + """Post calling after execution of current chunk graph""" + + def analyze( + self, + chunk_graph: ChunkGraph, + chunk_to_subtasks: Dict[ChunkType, Subtask], + available_bands: Dict[BandType, Resource], + stage_id: str = None, + op_to_bands: Dict[str, BandType] = None, + shuffle_fetch_type: ShuffleFetchType = None, + ) -> SubtaskGraph: + logger.debug("Start to gen subtask graph for task %s", self._task.task_id) + task = self._task + analyzer = GraphAnalyzer( + chunk_graph, + available_bands, + task, + self._config, + chunk_to_subtasks, + stage_id=stage_id, + shuffle_fetch_type=shuffle_fetch_type, + map_reduce_id_to_infos=self.map_reduce_id_to_infos, + ) + graph = analyzer.gen_subtask_graph(op_to_bands) + logger.debug( + "Generated subtask graph of %s subtasks for task %s", + len(graph), + self._task.task_id, + ) + return graph + + def _get_done(self): + return self._done.is_set() + + def _set_done(self, is_done: bool): + if is_done: + self._done.set() + else: # pragma: no cover + self._done.clear() + + done = property(_get_done, _set_done) + + def cancel(self): + self._cancelled.set() + + def get_tiled(self, tileable: TileableType): + tileable = tileable.data if hasattr(tileable, "data") else tileable + return self.tile_context[tileable] + + def get_map_reduce_info(self, map_reduce_id: int) -> MapReduceInfo: + return self.map_reduce_id_to_infos[map_reduce_id] + + def __await__(self): + return self._done.wait().__await__() diff --git a/python/xorbits/_mars/services/task/supervisor/processor.py b/python/xorbits/_mars/services/task/supervisor/processor.py new file mode 100644 index 000000000..90910c436 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/processor.py @@ -0,0 +1,471 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
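+# TaskProcessor drives a single task end to end: it optimizes the tileable graph,
+# tiles it into chunk graphs stage by stage, turns each chunk graph into a
+# subtask graph via the preprocessor's analyzer, submits that graph to the
+# TaskExecutor, and finally refreshes chunk/tileable metadata while recording
+# timing metrics and optional profiling data.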
+ +import asyncio +import logging +import os +import tempfile +import time +from typing import Dict, Iterator, List, Optional, Set + +from ....core import Chunk, ChunkGraph, TileableGraph, TileContext +from ....core.operand import Fetch +from ....metrics import Metrics +from ....optimization.logical import OptimizationRecords +from ....oscar.profiling import MARS_ENABLE_PROFILING, ProfilingData +from ....typing import ChunkType, TileableType +from ....utils import Timer +from ...subtask import Subtask, SubtaskResult +from ..core import MapReduceInfo, Task, TaskResult, TaskStatus, new_task_id +from ..execution.api import ExecutionChunkResult, TaskExecutor +from .preprocessor import TaskPreprocessor + +logger = logging.getLogger(__name__) + +MARS_ENABLE_DUMPING_SUBTASK_GRAPH = int(os.environ.get("MARS_DUMP_SUBTASK_GRAPH", 0)) + + +class TaskProcessor: + _tileable_to_subtasks: Dict[TileableType, List[Subtask]] + _tileable_id_to_tileable: Dict[str, TileableType] + _chunk_to_subtasks: Dict[ChunkType, Subtask] + _stage_tileables: Set[TileableType] + + def __init__( + self, + task: Task, + preprocessor: TaskPreprocessor, + executor: TaskExecutor, + ): + self._task = task + self._preprocessor = preprocessor + self._executor = executor + + self._tileable_id_to_tileable = dict() + self._chunk_to_subtasks = dict() + self._stage_tileables = set() + + if MARS_ENABLE_PROFILING: + ProfilingData.init(task.task_id) + elif task.extra_config and task.extra_config.get("enable_profiling"): + ProfilingData.init(task.task_id, task.extra_config["enable_profiling"]) + + self._dump_subtask_graph = False + self._subtask_graphs = [] + if MARS_ENABLE_DUMPING_SUBTASK_GRAPH or ( + task.extra_config and task.extra_config.get("dump_subtask_graph") + ): + self._dump_subtask_graph = True + + self.result = TaskResult( + task_id=task.task_id, + session_id=task.session_id, + start_time=time.time(), + status=TaskStatus.pending, + ) + self.done = asyncio.Event() + + # add metrics + self._chunk_graph_gen_time = Metrics.gauge( + "mars.chunk_graph_gen_time_secs", + "Time consuming in seconds to generate a chunk graph", + ("session_id", "task_id"), + ) + self._subtask_graph_gen_time = Metrics.gauge( + "mars.subtask_graph_gen_time_secs", + "Time consuming in seconds to generate a subtask graph", + ("session_id", "task_id", "stage_id"), + ) + self._task_execution_time = Metrics.gauge( + "mars.task_execution_time_secs", + "Time consuming in seconds to execute a task", + ("session_id", "task_id"), + ) + + @property + def task_id(self): + return self._task.task_id + + @property + def tileable_graph(self): + return self._preprocessor.tileable_graph + + @property + def tileable_id_to_tileable(self): + return self._tileable_id_to_tileable + + @property + def tile_context(self) -> TileContext: + return self._preprocessor.tile_context + + @property + def stage_processors(self): + # TODO(fyrestone): Remove it. 
+ return self._executor.get_stage_processors() + + def get_tiled(self, tileable: TileableType): + return self._preprocessor.get_tiled(tileable) + + def get_subtasks(self, chunks: List[ChunkType]) -> List[Subtask]: + return [self._chunk_to_subtasks[chunk] for chunk in chunks] + + def get_tileable_to_subtasks(self) -> Dict[TileableType, List[Subtask]]: + tile_context = self.tile_context + result = dict() + for tileable, infos in tile_context.get_tileable_tile_infos().items(): + subtasks = [] + for info in infos: + chunks = [ + c for c in info.generated_chunks if not isinstance(c.op, Fetch) + ] + subtasks.extend(self.get_subtasks(chunks)) + result[tileable] = subtasks + return result + + @staticmethod + async def _get_next_chunk_graph( + chunk_graph_iter: Iterator[ChunkGraph], + ) -> Optional[ChunkGraph]: + def next_chunk_graph(): + try: + return next(chunk_graph_iter) + except StopIteration: + return + + fut = asyncio.to_thread(next_chunk_graph) + chunk_graph = await fut + return chunk_graph + + async def _iter_stage_chunk_graph(self): + tileable_graph = self._preprocessor.tileable_graph + chunk_graph_iter = iter(self._preprocessor.tile(tileable_graph)) + while True: + with Timer() as stage_timer: + with Timer() as timer: + chunk_graph = await self._get_next_chunk_graph(chunk_graph_iter) + if chunk_graph is None: + # tile finished + self._preprocessor.done = True + return + stage_id = new_task_id() + stage_profiler = ProfilingData[self._task.task_id, "general"].nest( + f"stage_{stage_id}" + ) + stage_profiler.set(f"tile({len(chunk_graph)})", timer.duration) + logger.info( + "Time consuming to gen a chunk graph is %ss with session id %s, task id %s", + timer.duration, + self._task.session_id, + self._task.task_id, + ) + self._chunk_graph_gen_time.record( + timer.duration, + { + "session_id": self._task.session_id, + "task_id": self._task.task_id, + }, + ) + yield stage_id, stage_profiler, chunk_graph + + stage_profiler.set("total", stage_timer.duration) + + async def _process_stage_chunk_graph( + self, + stage_id: str, + stage_profiler, + chunk_graph: ChunkGraph, + ): + available_bands = await self._executor.get_available_band_resources() + meta_api = self._executor._meta_api + get_meta_tasks = [] + fetch_op_keys = [] + for c in chunk_graph.iter_indep(): + if isinstance(c.op, Fetch): + get_meta_tasks.append( + meta_api.get_chunk_meta.delay(c.key, fields=["bands"]) + ) + fetch_op_keys.append(c.op.key) + # TODO(fyrestone): A more general way to get the key to bands + # for all execution backends. 
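+        # Resolve which band currently holds the data of each Fetch chunk so the
+        # analyzer can take data locality into account; if the metadata is
+        # missing, fall back to an empty mapping below.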
+ try: + key_to_bands = await meta_api.get_chunk_meta.batch(*get_meta_tasks) + fetch_op_to_bands = dict( + (key, meta["bands"][0]) + for key, meta in zip(fetch_op_keys, key_to_bands) + ) + except (KeyError, IndexError): + fetch_op_to_bands = {} + shuffle_fetch_type = ( + self._executor.get_execution_config().get_shuffle_fetch_type() + ) + with Timer() as timer: + subtask_graph = await asyncio.to_thread( + self._preprocessor.analyze, + chunk_graph, + self._chunk_to_subtasks, + available_bands, + stage_id=stage_id, + op_to_bands=fetch_op_to_bands, + shuffle_fetch_type=shuffle_fetch_type, + ) + if self._dump_subtask_graph: + self._subtask_graphs.append(subtask_graph) + stage_profiler.set(f"gen_subtask_graph({len(subtask_graph)})", timer.duration) + logger.info( + "Time consuming to gen a subtask graph is %ss with session id %s, task id %s, stage id %s", + timer.duration, + self._task.session_id, + self._task.task_id, + stage_id, + ) + self._subtask_graph_gen_time.record( + timer.duration, + { + "session_id": self._task.session_id, + "task_id": self._task.task_id, + "stage_id": stage_id, + }, + ) + + tile_context = await asyncio.to_thread( + self._get_stage_tile_context, + {c for c in chunk_graph.result_chunks if not isinstance(c.op, Fetch)}, + ) + + with Timer() as timer: + chunk_to_result = await self._executor.execute_subtask_graph( + stage_id, subtask_graph, chunk_graph, tile_context + ) + stage_profiler.set("run", timer.duration) + + self._preprocessor.post_chunk_graph_execution() + if self._preprocessor.chunk_optimization_records_list: + optimization_records = self._preprocessor.chunk_optimization_records_list[ + -1 + ] + else: + optimization_records = None + self._update_stage_meta(chunk_to_result, tile_context, optimization_records) + + def _get_stage_tile_context(self, result_chunks: Set[Chunk]) -> TileContext: + collected = self._stage_tileables + tile_context = TileContext() + for tileable in self.tileable_graph: + if tileable in collected: + continue + tiled_tileable = self._preprocessor.tile_context.get(tileable) + if tiled_tileable is not None: + tileable_chunks = [c.data for c in tiled_tileable.chunks] + if any(c not in result_chunks for c in tileable_chunks): + continue + tile_context[tileable] = tiled_tileable + collected.add(tileable) + return tile_context + + @classmethod + def _update_stage_meta( + cls, + chunk_to_result: Dict[Chunk, ExecutionChunkResult], + tile_context: TileContext, + optimization_records: OptimizationRecords, + ): + for tiled_tileable in tile_context.values(): + cls._update_result_meta(chunk_to_result, tiled_tileable) + + for c, r in chunk_to_result.items(): + c.params = r.meta + original_chunk = ( + optimization_records and optimization_records.get_original_entity(c) + ) + if original_chunk is not None: + original_chunk.params = r.meta + + for tileable, tiled_tileable in tile_context.items(): + tiled_tileable.refresh_params() + tileable.params = tiled_tileable.params + + @classmethod + def _update_result_meta( + cls, chunk_to_result: Dict[Chunk, ExecutionChunkResult], tileable: TileableType + ): + from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE + from ....tensor.core import TENSOR_TYPE + + chunks = [c.data for c in tileable.chunks] + if isinstance(tileable, DATAFRAME_TYPE): + for c in chunks: + i, j = c.index + meta = chunk_to_result[c].meta + shape = meta.get("shape") + update_shape = shape is None + shape = shape if not update_shape else [None, None] + if i > 0: + # update dtypes_value + c0j = chunk_to_result[tileable.cix[0, j].data].meta + 
meta["dtypes_value"] = c0j["dtypes_value"] + if update_shape: + shape[1] = c0j["shape"][1] + if j > 0: + # update index_value + ci0 = chunk_to_result[tileable.cix[i, 0].data].meta + meta["index_value"] = ci0["index_value"] + if update_shape: + shape[0] = ci0["shape"][0] + if update_shape: + meta["shape"] = tuple(shape) + elif isinstance(tileable, SERIES_TYPE): + first_meta = chunk_to_result[chunks[0]].meta + for c in chunks: + i = c.index[0] + meta = chunk_to_result[c].meta + if i > 0: + meta["name"] = first_meta["name"] + meta["dtype"] = first_meta["dtype"] + elif isinstance(tileable, TENSOR_TYPE): + ndim = tileable.ndim + for i, c in enumerate(chunks): + meta = chunk_to_result[c].meta + if "shape" not in meta: + shape = [] + for i, ind in enumerate(c.index): + ind0 = [0] * ndim + ind0[i] = ind + c0 = tileable.cix[tuple(ind0)].data + shape.append(chunk_to_result[c0].meta["shape"][i]) + meta["shape"] = tuple(shape) + if i > 0: + first = chunk_to_result[chunks[0]].meta + meta["dtype"] = first["dtype"] + meta["order"] = first["order"] + + async def run(self): + try: + profiling = ProfilingData[self.task_id, "general"] + self.result.status = TaskStatus.running + # optimization + with Timer() as timer: + # optimization, run it in executor, + # since optimization may be a CPU intensive operation + await asyncio.to_thread(self._preprocessor.optimize) + + profiling.set("optimize", timer.duration) + + self._tileable_id_to_tileable = await asyncio.to_thread( + self._get_tileable_id_to_tileable, self._preprocessor.tileable_graph + ) + + async with self._executor: + async for stage_args in self._iter_stage_chunk_graph(): + await self._process_stage_chunk_graph(*stage_args) + except Exception as ex: + self.result.error = ex + self.result.traceback = ex.__traceback__ + finally: + self._gen_result() + self._finish() + + async def get_progress(self) -> float: + # get tileable proportion that is tiled + return await self._executor.get_progress() + + async def cancel(self): + self._preprocessor.cancel() + await self._executor.cancel() + + async def set_subtask_result(self, subtask_result: SubtaskResult): + await self._executor.set_subtask_result(subtask_result) + + @staticmethod + def _get_tileable_id_to_tileable( + tileable_graph: TileableGraph, + ) -> Dict[str, TileableType]: + tileable_id_to_tileable = dict() + + for tileable in tileable_graph: + tileable_id_to_tileable[str(tileable.key)] = tileable + + return tileable_id_to_tileable + + def _gen_result(self): + self.result.status = TaskStatus.terminated + self.result.end_time = time.time() + cost_time_secs = self.result.end_time - self.result.start_time + logger.info( + "Time consuming to execute a task is %ss with session id %s, task id %s", + cost_time_secs, + self._task.session_id, + self._task.task_id, + ) + self._task_execution_time.record( + cost_time_secs, + {"session_id": self._task.session_id, "task_id": self._task.task_id}, + ) + + def get_map_reduce_info(self, map_reduce_id: int) -> MapReduceInfo: + return self._preprocessor.get_map_reduce_info(map_reduce_id) + + def dump_subtask_graph(self): + from .graph_visualizer import GraphVisualizer + + try: # pragma: no cover + import graphviz + except ImportError: + graphviz = None + + dot = GraphVisualizer.to_dot(self._subtask_graphs) + directory = os.environ.get("MARS_DUMP_SUBTASK_GRAPH_DIR") + if directory is None: + directory = tempfile.gettempdir() + os.makedirs(directory, exist_ok=True) + file_name = f"mars-{self.task_id}" + logger.info( + "Subtask graph of task %s is stored in %s", + 
self._task.task_id, + os.path.join(directory, file_name), + ) + if graphviz is not None: # pragma: no cover + try: + g = graphviz.Source(dot) + g.view(file_name, directory=directory) + return + except graphviz.ExecutableNotFound: # pragma: no cover + logger.info("dot executable is not found, dump dot file instead.") + + with open(os.path.join(directory, file_name), "w") as f: + f.write(dot) + + def _finish(self): + self._executor.destroy() + self.done.set() + if self._dump_subtask_graph: + self.dump_subtask_graph() + if MARS_ENABLE_PROFILING or ( + self._task.extra_config and self._task.extra_config.get("enable_profiling") + ): + ProfilingData[self._task.task_id, "general"].set( + "total", time.time() - self.result.start_time + ) + serialization = ProfilingData[self._task.task_id, "serialization"] + if not serialization.empty(): + serialization.set( + "total", + sum(serialization.values()), + ) + data = ProfilingData.pop(self._task.task_id) + self.result.profiling = { + "supervisor": data, + } + + def is_done(self) -> bool: + return self.done.is_set() diff --git a/python/xorbits/_mars/services/task/supervisor/service.py b/python/xorbits/_mars/services/task/supervisor/service.py new file mode 100644 index 000000000..e42738f8a --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/service.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import oscar as mo +from ...core import AbstractService +from .manager import TaskConfigurationActor, TaskManagerActor + + +class TaskSupervisorService(AbstractService): + """ + Task service on supervisor. 
+ + Service Configuration + --------------------- + { + "task": { + "default_config": { + "optimize_tileable_graph": True, + "optimize_chunk_graph": True, + "fuse_enabled": True, + "reserved_finish_tasks": 10 + }, + "execution_config": { + "backend": "mars", + "mars": {}, + } + } + } + """ + + async def start(self): + task_config = self._config.get("task", dict()) + options = task_config.get("default_config", dict()) + execution_config = task_config.get("execution_config", dict()) + task_processor_cls = task_config.get("task_processor_cls") + task_preprocessor_cls = task_config.get("task_preprocessor_cls") + await mo.create_actor( + TaskConfigurationActor, + options, + execution_config=execution_config, + task_processor_cls=task_processor_cls, + task_preprocessor_cls=task_preprocessor_cls, + address=self._address, + uid=TaskConfigurationActor.default_uid(), + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref( + uid=TaskConfigurationActor.default_uid(), address=self._address + ) + ) + + async def create_session(self, session_id: str): + await mo.create_actor( + TaskManagerActor, + session_id, + address=self._address, + uid=TaskManagerActor.gen_uid(session_id), + ) + + async def destroy_session(self, session_id: str): + task_manager_ref = await mo.actor_ref( + self._address, TaskManagerActor.gen_uid(session_id) + ) + return await mo.destroy_actor(task_manager_ref) diff --git a/python/xorbits/_mars/services/task/supervisor/task.py b/python/xorbits/_mars/services/task/supervisor/task.py new file mode 100644 index 000000000..11dad0d3f --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/task.py @@ -0,0 +1,428 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import dataclasses +import importlib +import logging +from typing import Any, Dict, List, Optional, Set, Type + +from .... 
import oscar as mo +from ....config import Config +from ....core import TileContext +from ....core.operand import Fetch +from ....typing import TileableType +from ....utils import build_fetch +from ...subtask import SubtaskGraph, SubtaskResult, SubtaskStatus +from ..core import MapReduceInfo, Task, TaskStatus +from ..execution.api import TaskExecutor +from .preprocessor import TaskPreprocessor +from .processor import TaskProcessor + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class _TileableStageInfo: + progress: float + subtask_ids: Set[str] + + +@dataclasses.dataclass +class _TileableDetailInfo: + progress: float + subtask_count: int + status: int + properties: Dict[str, Any] + + +class _TaskInfoProcessorMixin: + _task_id_to_processor: Dict[str, TaskProcessor] + _tileable_to_details_cache: Dict[TileableType, _TileableDetailInfo] + + def _init_cache(self): + try: + return self._tileable_to_details_cache + except AttributeError: + cache = self._tileable_to_details_cache = dict() + return cache + + def _get_all_subtask_results(self) -> Dict[str, SubtaskResult]: + subtask_results = dict() + for processor in self._task_id_to_processor.values(): + for stage in processor.stage_processors: + for subtask, result in stage.subtask_results.items(): + subtask_results[subtask.subtask_id] = result + for subtask, result in stage.subtask_snapshots.items(): + if subtask.subtask_id in subtask_results: + continue + subtask_results[subtask.subtask_id] = result + return subtask_results + + def _get_tileable_infos(self) -> Dict[TileableType, _TileableDetailInfo]: + cache = self._init_cache() + + tileable_to_stage_infos: Dict[TileableType, List[_TileableStageInfo]] = dict() + for processor in self._task_id_to_processor.values(): + tile_context = processor.tile_context + for tileable, infos in tile_context.get_tileable_tile_infos().items(): + tileable_to_stage_infos[tileable] = [] + if tileable in cache: + # cached + continue + for info in infos: + chunks = [ + c for c in info.generated_chunks if not isinstance(c.op, Fetch) + ] + try: + subtask_ids = { + st.subtask_id for st in processor.get_subtasks(chunks) + } + except KeyError: # pragma: no cover + subtask_ids = None + stage_info = _TileableStageInfo( + progress=info.tile_progress, subtask_ids=subtask_ids + ) + tileable_to_stage_infos[tileable].append(stage_info) + + tileable_to_defails = dict() + subtask_id_to_results = self._get_all_subtask_results() + for tileable, infos in tileable_to_stage_infos.items(): + if tileable in cache: + # cached + tileable_to_defails[tileable] = cache[tileable] + continue + + statuses = set() + progress = 0.0 if not isinstance(tileable.op, Fetch) else 1.0 + n_subtask = 0 + for stage_info in infos: + tile_progress = stage_info.progress + stage_progress = 0.0 + if stage_info.subtask_ids is None: + continue + for subtask_id in stage_info.subtask_ids: + try: + result = subtask_id_to_results[subtask_id] + stage_progress += result.progress * tile_progress + statuses.add(result.status) + except KeyError: + # pending + statuses.add(SubtaskStatus.pending) + n_subtask += len(stage_info.subtask_ids) + if stage_info.subtask_ids: + progress += stage_progress / len(stage_info.subtask_ids) + else: + progress += tile_progress + + # calc status + if (not statuses or statuses == {SubtaskStatus.succeeded}) and abs( + progress - 1.0 + ) < 1e-3: + status = SubtaskStatus.succeeded + elif statuses == {SubtaskStatus.cancelled}: + status = SubtaskStatus.cancelled + elif statuses == {SubtaskStatus.pending}: + status = 
SubtaskStatus.pending + elif SubtaskStatus.errored in statuses: + status = SubtaskStatus.errored + else: + status = SubtaskStatus.running + + props = tileable.op.to_kv( + exclude_fields=("_key", "_id"), accept_value_types=(int, float, str) + ) + info = _TileableDetailInfo( + progress=progress, + subtask_count=n_subtask, + status=status.value, + properties=props, + ) + tileable_to_defails[tileable] = info + if status.is_done and tileable not in cache: + cache[tileable] = info + + return tileable_to_defails + + async def get_tileable_details(self): + tileable_to_details = yield asyncio.to_thread(self._get_tileable_infos) + raise mo.Return( + { + t.key: { + "progress": info.progress, + "subtaskCount": info.subtask_count, + "status": info.status, + "properties": info.properties, + } + for t, info in tileable_to_details.items() + } + ) + + def _get_tileable_graph_as_dict(self): + processor = list(self._task_id_to_processor.values())[-1] + tileable_graph = processor.tileable_graph + + node_list = [] + edge_list = [] + + visited = set() + + for chunk in tileable_graph: + if chunk.key in visited: # pragma: no cover + continue + visited.add(chunk.key) + + node_name = str(chunk.op) + + node_list.append({"tileableId": chunk.key, "tileableName": node_name}) + for inp, is_pure_dep in zip(chunk.inputs, chunk.op.pure_depends): + if inp not in tileable_graph: # pragma: no cover + continue + edge_list.append( + { + "fromTileableId": inp.key, + "toTileableId": chunk.key, + "linkType": 1 if is_pure_dep else 0, + } + ) + + graph_dict = {"tileables": node_list, "dependencies": edge_list} + return graph_dict + + async def get_tileable_graph_as_dict(self): + return await asyncio.to_thread(self._get_tileable_graph_as_dict) + + def _get_tileable_subtasks(self, tileable_id: str, with_input_output: bool): + returned_subtasks = dict() + subtask_id_to_types = dict() + + subtask_details = dict() + subtask_graph = subtask_results = subtask_snapshots = None + for processor in self._task_id_to_processor.values(): + tileable_to_subtasks = processor.get_tileable_to_subtasks() + tileable_id_to_tileable = processor.tileable_id_to_tileable + for stage in processor.stage_processors: + if tileable_id in tileable_id_to_tileable: + tileable = tileable_id_to_tileable[tileable_id] + returned_subtasks = { + subtask.subtask_id: subtask + for subtask in tileable_to_subtasks[tileable] + } + subtask_graph = stage.subtask_graph + subtask_results = stage.subtask_results + subtask_snapshots = stage.subtask_snapshots + break + if returned_subtasks: + break + + if subtask_graph is None: # pragma: no cover + return {} + + if with_input_output: + for subtask in list(returned_subtasks.values()): + for pred in subtask_graph.iter_predecessors(subtask): + if pred.subtask_id in returned_subtasks: # pragma: no cover + continue + returned_subtasks[pred.subtask_id] = pred + subtask_id_to_types[pred.subtask_id] = "Input" + for succ in subtask_graph.iter_successors(subtask): + if succ.subtask_id in returned_subtasks: # pragma: no cover + continue + returned_subtasks[succ.subtask_id] = succ + subtask_id_to_types[succ.subtask_id] = "Output" + + for subtask in returned_subtasks.values(): + subtask_result = subtask_results.get( + subtask, + subtask_snapshots.get( + subtask, + SubtaskResult( + progress=0.0, + status=SubtaskStatus.pending, + stage_id=subtask.stage_id, + ), + ), + ) + subtask_details[subtask.subtask_id] = { + "name": subtask.subtask_name, + "status": subtask_result.status.value, + "progress": subtask_result.progress, + "nodeType": 
subtask_id_to_types.get(subtask.subtask_id, "Calculation"), + } + + for subtask in returned_subtasks.values(): + pred_ids = [] + for pred in subtask_graph.iter_predecessors(subtask): + if pred.subtask_id in returned_subtasks: + pred_ids.append(pred.subtask_id) + subtask_details[subtask.subtask_id]["fromSubtaskIds"] = pred_ids + return subtask_details + + async def get_tileable_subtasks(self, tileable_id: str, with_input_output: bool): + return await asyncio.to_thread( + self._get_tileable_subtasks, tileable_id, with_input_output + ) + + +class TaskProcessorActor(mo.Actor, _TaskInfoProcessorMixin): + _task_id_to_processor: Dict[str, TaskProcessor] + _cur_processor: Optional[TaskProcessor] + + def __init__( + self, + session_id: str, + task_id: str, + task_processor_cls: Type[TaskPreprocessor] = None, + ): + self.session_id = session_id + self.task_id = task_id + + self._task_processor_cls = self._get_task_processor_cls(task_processor_cls) + self._task_id_to_processor = dict() + self._cur_processor = None + + @classmethod + def gen_uid(cls, session_id: str, task_id: str): + return f"task_processor_{session_id}_{task_id}" + + async def add_task( + self, + task: Task, + tiled_context: TileContext, + config: Config, + execution_config: Dict, + task_preprocessor_cls: Type[TaskPreprocessor], + ): + task_preprocessor = task_preprocessor_cls( + task, tiled_context=tiled_context, config=config + ) + task_executor = await TaskExecutor.create( + execution_config, + task=task, + session_id=self.session_id, + address=self.address, + tile_context=task_preprocessor.tile_context, + ) + processor = self._task_processor_cls( + task, + task_preprocessor, + task_executor, + ) + self._task_id_to_processor[task.task_id] = processor + + # tell self to start running + await self.ref().start.tell() + + @classmethod + def _get_task_processor_cls(cls, task_processor_cls): + if task_processor_cls is not None: # pragma: no cover + if isinstance(task_processor_cls, type): + return task_processor_cls + assert isinstance(task_processor_cls, str) + module, name = task_processor_cls.rsplit(".", 1) + return getattr(importlib.import_module(module), name) + else: + return TaskProcessor + + def _get_unprocessed_task_processor(self): + for processor in self._task_id_to_processor.values(): + if processor.result.status == TaskStatus.pending: + return processor + + async def start(self): + if self._cur_processor is not None: # pragma: no cover + # some processor is running + return + + processor = self._get_unprocessed_task_processor() + if processor is None: # pragma: no cover + return + self._cur_processor = processor + try: + yield processor.run() + finally: + self._cur_processor = None + + async def wait(self, timeout: int = None): + fs = [ + asyncio.ensure_future(processor.done.wait()) + for processor in self._task_id_to_processor.values() + ] + + _, pending = yield asyncio.wait(fs, timeout=timeout) + if not pending: + raise mo.Return(self.result()) + else: + _ = [fut.cancel() for fut in pending] + + async def cancel(self): + if self._cur_processor: + await self._cur_processor.cancel() + + def result(self): + terminated_result = None + for processor in self._task_id_to_processor.values(): + if processor.result.status != TaskStatus.terminated: + return processor.result + else: + terminated_result = processor.result + return terminated_result + + async def progress(self): + processor_progresses = [ + await processor.get_progress() + for processor in self._task_id_to_processor.values() + ] + return sum(processor_progresses) / 
len(processor_progresses) + + def get_result_tileables(self): + processor = list(self._task_id_to_processor.values())[-1] + tileable_graph = processor.tileable_graph + result = [] + for result_tileable in tileable_graph.result_tileables: + tiled = processor.get_tiled(result_tileable) + result.append(build_fetch(tiled)) + return result + + def get_subtask_graphs(self, task_id: str) -> List[SubtaskGraph]: + return [ + stage_processor.subtask_graph + for stage_processor in self._task_id_to_processor[task_id].stage_processors + ] + + def get_result_tileable(self, tileable_key: str): + processor = list(self._task_id_to_processor.values())[-1] + tileable_graph = processor.tileable_graph + for result_tileable in tileable_graph.result_tileables: + if result_tileable.key == tileable_key: + tiled = processor.get_tiled(result_tileable) + return build_fetch(tiled) + raise KeyError(f"Tileable {tileable_key} does not exist") # pragma: no cover + + async def set_subtask_result(self, subtask_result: SubtaskResult): + logger.debug( + "Set subtask %s with result %s.", subtask_result.subtask_id, subtask_result + ) + if self._cur_processor is not None: + yield self._cur_processor.set_subtask_result(subtask_result) + + async def get_map_reduce_info(self, map_reduce_id: int) -> MapReduceInfo: + for processor in self._task_id_to_processor.values(): + return processor.get_map_reduce_info(map_reduce_id) + + def is_done(self) -> bool: + for processor in self._task_id_to_processor.values(): + if not processor.is_done(): + return False + return True diff --git a/python/xorbits/_mars/services/task/supervisor/tests/__init__.py b/python/xorbits/_mars/services/task/supervisor/tests/__init__.py new file mode 100644 index 000000000..a0122bfa4 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .task_preprocessor import CheckedTaskPreprocessor diff --git a/python/xorbits/_mars/services/task/supervisor/tests/task_preprocessor.py b/python/xorbits/_mars/services/task/supervisor/tests/task_preprocessor.py new file mode 100644 index 000000000..c98f73f10 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/task_preprocessor.py @@ -0,0 +1,259 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
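+# CheckedTaskPreprocessor adds test-only consistency checks on top of
+# TaskPreprocessor: it validates that nsplits agree with chunk shapes after
+# tiling, optionally rejects duplicated operand/chunk submissions, and verifies
+# that shuffle reduce chunks keep reducer_index ordering consistent with
+# reducer_ordinal.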
+ +import itertools +from collections import defaultdict +from functools import partial +from typing import Callable, Dict, List + +import numpy as np + +from .....core import ( + OBJECT_TYPE, + ChunkGraph, + TileableType, + enter_mode, + register, + unregister, +) +from .....core.operand import Fetch, ShuffleProxy +from .....core.operand.shuffle import ShuffleFetchType +from .....resource import Resource +from .....tests.core import ObjectCheckMixin, _check_args +from .....typing import BandType, ChunkType +from ....subtask import Subtask, SubtaskGraph +from ...analyzer import GraphAnalyzer +from ..preprocessor import CancellableTiler, TaskPreprocessor + + +class CheckedTaskPreprocessor(ObjectCheckMixin, TaskPreprocessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._raw_chunk_shapes = dict() + self._tileable_checked = dict() + + check_options = dict() + kwargs = self._task.extra_config or dict() + self._operand_tile_handlers = operand_tile_handlers = kwargs.pop( + "operand_tile_handlers", dict() + ) + for op, tile_handler in operand_tile_handlers.items(): + register(op, tile_handler) + for key in _check_args: + check_options[key] = kwargs.get(key, True) + self._check_options = check_options + self._check_duplicated_operand_keys = bool( + kwargs.get("check_duplicated_operand_keys") + ) + + def _get_done(self): + return super()._get_done() + + def _set_done(self, is_done: bool): + super()._set_done(is_done) + for op in self._operand_tile_handlers: + unregister(op) + + done = property(_get_done, _set_done) + + def _check_nsplits(self, tiled: TileableType): + if tiled.nsplits is None or (tiled.nsplits == () and len(tiled.chunks) == 1): + return + + nsplit_chunk_shape = tuple(len(s) for s in tiled.nsplits) + if nsplit_chunk_shape != tiled.chunk_shape: + raise AssertionError( + "Operand %r: shape of nsplits %r not consistent with chunk shape %r" + % (tiled.op, nsplit_chunk_shape, tiled.chunk_shape) + ) from None + + nsplit_shape = tuple(np.sum(s) for s in tiled.nsplits) + try: + self.assert_shape_consistent(nsplit_shape, tiled.shape) + except AssertionError: + raise AssertionError( + "Operand %r: shape computed from nsplits %r -> %r not consistent with real shape %r" + % (tiled.op, tiled.nsplits, nsplit_shape, tiled.shape) + ) from None + + for c in tiled.chunks: + try: + tiled_c = tiled.cix[c.index] + except ValueError as ex: + raise AssertionError( + "Operand %r: Malformed index %r, nsplits is %r. 
Raw error is %r" + % (c.op, c.index, tiled.nsplits, ex) + ) from None + + if tiled_c is not c: + raise AssertionError( + "Operand %r: Cannot spot chunk via index %r, nsplits is %r" + % (c.op, c.index, tiled.nsplits) + ) + for cid, shape in enumerate(itertools.product(*tiled.nsplits)): + chunk_shape = ( + self._raw_chunk_shapes.get(tiled.chunks[cid].key) + or tiled.chunks[cid].shape + ) + if len(shape) != len(chunk_shape): + raise AssertionError( + "Operand %r: Shape in nsplits %r does not meet shape in chunk %r" + % (tiled.chunks[cid].op, shape, chunk_shape) + ) + for s1, s2 in zip(shape, chunk_shape): + if (not (np.isnan(s1) and np.isnan(s2))) and s1 != s2: + raise AssertionError( + "Operand %r: Shape in nsplits %r does not meet shape in chunk %r" + % (tiled.chunks[cid].op, shape, chunk_shape) + ) + + def post_chunk_graph_execution(self): + for tileable in self.tileable_graph: + tiled_tileable = self.tile_context.get(tileable) + if ( + tiled_tileable is not None + and self._check_options["check_nsplits"] + and tileable.key not in self._tileable_checked + and not isinstance(tileable, OBJECT_TYPE) + ): + self._check_nsplits(tiled_tileable) + self._tileable_checked[tileable.key] = True + + def _get_tiler_cls(self) -> Callable: + extra_config = self._task.extra_config or dict() + check_duplicated_submission = extra_config.get( + "check_duplicated_submission", True + ) + return partial( + CancellableTiler, + cancelled=self._cancelled, + check_duplicated_submission=check_duplicated_submission, + ) + + @enter_mode(build=True) + def analyze( + self, + chunk_graph: ChunkGraph, + chunk_to_subtasks: Dict[ChunkType, Subtask], + available_bands: Dict[BandType, Resource], + stage_id: str, + op_to_bands: Dict[str, BandType] = None, + shuffle_fetch_type: ShuffleFetchType = None, + ) -> SubtaskGraph: + checked_chunks = set() + for tileable in self.tileable_graph: + try: + tiled = self.get_tiled(tileable) + self._check_shuffle_reduce_chunks(tiled.chunks, checked_chunks) + except KeyError: + pass + + # check if duplicated operand keys exist + if self._check_duplicated_operand_keys and len( + {c.key for c in chunk_graph} + ) < len( + chunk_graph + ): # pragma: no cover + raise AssertionError("Duplicated operands exist") + # record shapes generated in tile + for n in chunk_graph: + self._raw_chunk_shapes[n.key] = getattr(n, "shape", None) + task = self._task + analyzer = GraphAnalyzer( + chunk_graph, + available_bands, + task, + self._config, + chunk_to_subtasks, + shuffle_fetch_type=shuffle_fetch_type, + map_reduce_id_to_infos=self.map_reduce_id_to_infos, + ) + subtask_graph = analyzer.gen_subtask_graph() + results = set( + analyzer._chunk_to_copied[c] + for c in chunk_graph.results + if not isinstance(c.op, Fetch) + ) + for subtask in subtask_graph: + if subtask.extra_config is None: + subtask.extra_config = dict() + if all(c not in results for c in subtask.chunk_graph.results): + subtask.extra_config["check_all"] = False + else: + subtask.extra_config["check_keys"] = [ + c.key for c in subtask.chunk_graph.results if c in results + ] + proxy_chunks = [ + c for c in subtask.chunk_graph if isinstance(c.op, ShuffleProxy) + ] + if proxy_chunks: + assert len(proxy_chunks) == 1, proxy_chunks + proxy_chunk_key = proxy_chunks[0].key + proxy_chunk = next(c for c in chunk_graph if c.key == proxy_chunk_key) + reducer_chunks = chunk_graph.successors(proxy_chunk) + n_reducers_list = [c.op.n_reducers for c in reducer_chunks] + n_reducers = n_reducers_list[0] + reducer_ordinals = [c.op.reducer_ordinal for c in 
reducer_chunks] + assert set(reducer_ordinals).issubset(list(range(n_reducers))), ( + reducer_ordinals, + n_reducers, + ) + assert len(set(n_reducers_list)) == 1, n_reducers_list + mapper_chunks = chunk_graph.predecessors(proxy_chunk) + assert proxy_chunk.op.n_mappers == len(mapper_chunks), ( + proxy_chunk.op.n_mappers, + mapper_chunks, + ) + # If some reducer data are not used by downstream, then it won't be included in the chunk graph. + assert proxy_chunk.op.n_reducers >= n_reducers, ( + proxy_chunk.op.n_reducers, + n_reducers, + ) + return subtask_graph + + @classmethod + def _check_shuffle_reduce_chunks(cls, chunks: List, checked_chunks): + """Check shuffle reduce chunks sorted reducer_index consistent with reducer_ordinal. So shuffle mapper blocks + can be sorted by reducer_index, and the reducer can fetch mapper data by reducer_ordinal. + """ + chunks = [c for c in chunks or [] if c not in checked_chunks] + if not chunks: + return + from .....core.operand import MapReduceOperand, OperandStage, ShuffleProxy + + reduce_chunks = defaultdict(list) + for c in chunks: + checked_chunks.add(c) + if isinstance(c.op, MapReduceOperand) and c.op.stage == OperandStage.reduce: + shuffle_proxies = [ + c for c in c.inputs if isinstance(c.op, ShuffleProxy) + ] + assert len(shuffle_proxies) == 1, (c.inputs, shuffle_proxies) + reduce_chunks[shuffle_proxies[0]].append(c) + else: + cls._check_shuffle_reduce_chunks(c.inputs, checked_chunks) + for _, reduce_chunks in reduce_chunks.items(): + sorted_chunks_by_indices = sorted( + reduce_chunks, key=lambda c: c.op.reducer_index + ) + sorted_chunks_by_ordinals = sorted( + reduce_chunks, key=lambda c: c.op.reducer_ordinal + ) + for c1, c2 in zip(sorted_chunks_by_indices, sorted_chunks_by_ordinals): + assert c1.op.reducer_index == c2.op.reducer_index, ( + sorted_chunks_by_indices, + sorted_chunks_by_ordinals, + ) + for c in reduce_chunks: + cls._check_shuffle_reduce_chunks(c.inputs, checked_chunks) diff --git a/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager.py b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager.py new file mode 100644 index 000000000..caff6ab21 --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager.py @@ -0,0 +1,706 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import gc +import os +import sys +import tempfile +import time + +import numpy as np +import pandas as pd +import pytest + +from ..... import dataframe as md +from ..... import oscar as mo +from ..... import remote as mr +from ..... 
import tensor as mt +from .....conftest import MARS_CI_BACKEND +from .....core import Tileable, TileableGraph, TileableGraphBuilder +from .....core.operand import Fetch +from .....oscar.backends.allocate_strategy import MainPool +from .....resource import Resource +from .....storage import StorageLevel +from .....utils import Timer, merge_chunks +from ....cluster import MockClusterAPI +from ....lifecycle import MockLifecycleAPI +from ....meta import MetaAPI, MockMetaAPI, MockWorkerMetaAPI +from ....mutable import MockMutableAPI +from ....scheduling import MockSchedulingAPI +from ....session import MockSessionAPI +from ....storage import MockStorageAPI, StorageAPI +from ....subtask import MockSubtaskAPI +from ...core import TaskResult, TaskStatus +from ...execution.api import ExecutionConfig, Fetcher +from ..manager import TaskConfigurationActor, TaskManagerActor + + +@pytest.fixture +async def actor_pool(): + backend = MARS_CI_BACKEND + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", + n_process=3, + labels=["main"] + ["numa-0"] * 2 + ["io"], + subprocess_start_method=start_method, + ) + + async with pool: + session_id = "test_session" + # create mock APIs + await MockClusterAPI.create( + pool.external_address, band_to_resource={"numa-0": Resource(num_cpus=2)} + ) + await MockSessionAPI.create(pool.external_address, session_id=session_id) + meta_api = await MockMetaAPI.create(session_id, pool.external_address) + await MockWorkerMetaAPI.create(session_id, pool.external_address) + lifecycle_api = await MockLifecycleAPI.create(session_id, pool.external_address) + storage_api = await MockStorageAPI.create(session_id, pool.external_address) + await MockSchedulingAPI.create(session_id, pool.external_address) + await MockSubtaskAPI.create(pool.external_address) + await MockMutableAPI.create(session_id, pool.external_address) + + # create configuration + config = ExecutionConfig.from_params( + backend=backend, + n_worker=1, + n_cpu=2, + subtask_max_retries=3, + ) + await mo.create_actor( + TaskConfigurationActor, + dict(), + config.get_config_dict(), + uid=TaskConfigurationActor.default_uid(), + address=pool.external_address, + ) + # create task manager + manager = await mo.create_actor( + TaskManagerActor, + session_id, + uid=TaskManagerActor.gen_uid(session_id), + address=pool.external_address, + allocate_strategy=MainPool(), + ) + try: + yield backend, pool, session_id, meta_api, lifecycle_api, storage_api, manager + finally: + await MockStorageAPI.cleanup(pool.external_address) + await MockClusterAPI.cleanup(pool.external_address) + await MockMutableAPI.cleanup(session_id, pool.external_address) + + +async def _merge_data( + execution_backend: str, + fetch_tileable: Tileable, + meta_api: MetaAPI, + storage_api: StorageAPI, +): + async def _get_storage_api(band): + return storage_api + + fetcher = Fetcher.create(execution_backend, get_storage_api=_get_storage_api) + get_metas = [] + for chunk in fetch_tileable.chunks: + get_metas.append( + meta_api.get_chunk_meta.delay(chunk.key, fields=fetcher.required_meta_keys) + ) + metas = await meta_api.get_chunk_meta.batch(*get_metas) + for chunk, meta in zip(fetch_tileable.chunks, metas): + await fetcher.append(chunk.key, meta) + data = await fetcher.get() + index_data = [(c.index, d) for c, d in zip(fetch_tileable.chunks, data)] + return merge_chunks(index_data) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def 
test_run_task(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + + graph = TileableGraph([b.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, raw + 1) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([b.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_run_tasks_with_same_name(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = np.random.RandomState(0).rand(10, 10) + a = mt.tensor(raw, chunk_size=5) + b = a + 1 + c = a * 2 + + for t, e in zip([b, c], [raw + 1, raw * 2]): + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, e) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_error_task(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + with mt.errstate(divide="raise"): + a = mt.ones((10, 10), chunk_size=5) + c = a / 0 + + graph = TileableGraph([c.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + assert task_result.error is not None + assert isinstance(task_result.error, FloatingPointError) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([c.key]))[0] == 0 + assert len(await lifecycle_api.get_all_chunk_ref_counts()) == 0 + + +@pytest.mark.asyncio +async def test_cancel_task(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + def func(): + time.sleep(200) + + rs = [mr.spawn(func) for _ in range(10)] + + graph = TileableGraph([r.data for r in rs]) + 
next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await asyncio.sleep(0.5) + + with Timer() as timer: + await manager.cancel_task(task_id) + await manager.wait_task(task_id) + result = await manager.get_task_result(task_id) + assert result.status == TaskStatus.terminated + + assert timer.duration < 25 + + keys = [r.key for r in rs] + del rs + gc.collect() + await asyncio.sleep(0.5) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts(keys)) == [0] * len(keys) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_iterative_tiling(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + rs = np.random.RandomState(0) + raw_a = rs.rand(10, 10) + raw_b = rs.rand(10, 10) + a = mt.tensor(raw_a, chunk_size=5) + b = mt.tensor(raw_b, chunk_size=5) + + d = a[a[:, 0] < 3] + b[b[:, 0] < 3] + graph = TileableGraph([d.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = raw_a[raw_a[:, 0] < 3] + raw_b[raw_b[:, 0] < 3] + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, expect) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([d.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + + +@pytest.mark.asyncio +async def test_prune_in_iterative_tiling(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = pd.DataFrame(np.random.RandomState(0).rand(1000, 10)) + df = md.DataFrame(raw, chunk_size=100) + df2 = df.groupby(0).agg("sum") + + graph = TileableGraph([df2.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=True) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = raw.groupby(0).agg("sum") + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + pd.testing.assert_frame_equal(expect, result) + + subtask_graphs = await manager.get_subtask_graphs(task_id) + assert len(subtask_graphs) == 2 + + # the first subtask graph should have only 2 subtasks after pruning + assert len(subtask_graphs[0]) == 2 + nodes = [ + n + for st in subtask_graphs[0] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(nodes) == 8 + result_nodes = [n for st in subtask_graphs[0] for n in st.chunk_graph.results] + 
assert len(result_nodes) == 4 + assert all("GroupByAgg" in str(n.op) for n in result_nodes) + + # second subtask graph + assert len(subtask_graphs[1]) == 6 + all_nodes = nodes + [ + n + for st in subtask_graphs[1] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(all_nodes) == 28 + assert len({n.key for n in all_nodes}) == 28 + + df3 = df[df[0] < 1].rechunk(200) + + graph = TileableGraph([df3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=True) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + pd.testing.assert_frame_equal(raw, result) + + subtask_graphs = await manager.get_subtask_graphs(task_id) + assert len(subtask_graphs) == 2 + + # the first subtask graph + assert len(subtask_graphs[0]) == 5 + nodes = [ + n + for st in subtask_graphs[0] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(nodes) == 40 + result_nodes = [n for st in subtask_graphs[0] for n in st.chunk_graph.results] + assert len(result_nodes) == 10 + + # second subtask graph + assert len(subtask_graphs[1]) == 5 + all_nodes = nodes + [ + n + for st in subtask_graphs[1] + for n in st.chunk_graph + if not isinstance(n.op, Fetch) + ] + assert len(all_nodes) == 45 + assert len({n.key for n in all_nodes}) == 45 + + +@pytest.mark.asyncio +async def test_shuffle(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + rs = np.random.RandomState(0) + raw = rs.rand(10, 10) + raw2 = rs.randint(10, size=(10,)) + a = mt.tensor(raw, chunk_size=5) + b = mt.tensor(raw2, chunk_size=5) + c = a[b] + + graph = TileableGraph([c.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = raw[raw2] + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, expect) + + # test generating map reduce info + subtask_graphs = (await manager.get_subtask_graphs(task_id))[0] + map_reduce_ids = [] + for subtask in subtask_graphs: + for chunk in subtask.chunk_graph.result_chunks: + map_reduce_id = getattr(chunk, "extra_params", dict()).get( + "analyzer_map_reduce_id" + ) + if map_reduce_id is not None: + map_reduce_ids.append(map_reduce_id) + assert len(map_reduce_ids) > 0 + map_reduce_info = await manager.get_map_reduce_info(task_id, map_reduce_ids[0]) + assert ( + len(set(map_reduce_info.reducer_indexes)) + == len(map_reduce_info.reducer_indexes) + == len(map_reduce_info.reducer_bands) + > 0 + ) + + # test ref counts + 
assert (await lifecycle_api.get_tileable_ref_counts([c.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + await lifecycle_api.decref_tileables([c.key]) + ref_counts = await lifecycle_api.get_all_chunk_ref_counts() + assert len(ref_counts) == 0 + + # test if exists in storage + assert len(await storage_api.list(level=StorageLevel.MEMORY)) == 0 + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_numexpr(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + raw = np.random.rand(10, 10) + t = mt.tensor(raw, chunk_size=5) + t2 = (t + 1) * 2 - 0.3 + + graph = TileableGraph([t2.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph, fuse_enabled=True) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = (raw + 1) * 2 - 0.3 + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + np.testing.assert_array_equal(result, expect) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([t2.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileable.chunks] + ) + ) == [1] * len(result_tileable.chunks) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_optimization(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + + pdf = pd.DataFrame( + { + "a": [3, 4, 5, 3, 5, 4, 1, 2, 3], + "b": [1, 3, 4, 5, 6, 5, 4, 4, 4], + "c": list("aabaaddce"), + "d": list("abaaaddce"), + } + ) + pdf.to_csv(file_path, index=False) + + df = md.read_csv(file_path, incremental_index=True) + df2 = df.groupby("c").agg({"a": "sum"}) + df3 = df[["b", "a"]] + + graph = TileableGraph([df2.data, df3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph(graph) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + task_result: TaskResult = await manager.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + assert await manager.get_task_progress(task_id) == 1.0 + + expect = pdf.groupby("c").agg({"a": "sum"}) + result_tileables = await manager.get_task_result_tileables(task_id) + result1 = result_tileables[0] + result = await _merge_data(execution_backend, result1, meta_api, storage_api) + np.testing.assert_array_equal(result, expect) + + expect = pdf[["b", "a"]] + result2 = result_tileables[1] + result = await _merge_data(execution_backend, result2, meta_api, storage_api) + np.testing.assert_array_equal(result, expect) + + # test ref counts + assert (await lifecycle_api.get_tileable_ref_counts([df3.key]))[0] == 1 + assert ( + await lifecycle_api.get_chunk_ref_counts( + [c.key for c in result_tileables[1].chunks] + ) + ) == [1] * 
len(result_tileables[1].chunks) + + +@pytest.mark.asyncio +@pytest.mark.ray_dag +async def test_dump_subtask_graph(actor_pool): + ( + execution_backend, + pool, + session_id, + meta_api, + lifecycle_api, + storage_api, + manager, + ) = actor_pool + + rs = np.random.RandomState(0) + raw = pd.DataFrame( + { + "c1": rs.randint(20, size=100), + "c2": rs.choice(["a", "b", "c"], (100,)), + "c3": rs.rand(100), + } + ) + mdf = md.DataFrame(raw, chunk_size=20) + # groupby will generate multiple tasks + r = mdf.groupby("c2").agg("sum") + graph = TileableGraph([r.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await manager.submit_tileable_graph( + graph, + fuse_enabled=True, + extra_config={"dump_subtask_graph": True}, + ) + assert isinstance(task_id, str) + + await manager.wait_task(task_id) + + result_tileable = (await manager.get_task_result_tileables(task_id))[0] + result = await _merge_data( + execution_backend, result_tileable, meta_api, storage_api + ) + pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c2").agg("sum")) + + # read dot file + file_path = os.path.join(tempfile.gettempdir(), f"mars-{task_id}") + with open(file_path) as f: + text = f.read() + assert "style=bold" in text + assert 'color="/spectral9/' in text + for c in result_tileable.chunks: + assert c.key[:5] in text + os.remove(file_path) + + pdf_path = os.path.join(tempfile.gettempdir(), f"mars-{task_id}.pdf") + if os.path.exists(pdf_path): + os.remove(pdf_path) diff --git a/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager_on_ray.py b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager_on_ray.py new file mode 100644 index 000000000..b58fb2b6a --- /dev/null +++ b/python/xorbits/_mars/services/task/supervisor/tests/test_task_manager_on_ray.py @@ -0,0 +1,52 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from ..... import oscar as mo +from .....oscar.backends.ray.utils import placement_group_info_to_addresses +from .....tests.core import require_ray +from .....utils import lazy_import +from ..manager import TaskConfigurationActor + +ray = lazy_import("ray") + + +@require_ray +@pytest.mark.asyncio +async def test_task_manager_creation(ray_start_regular): + mo.setup_cluster( + address_to_resources=placement_group_info_to_addresses( + "test_cluster", [{"CPU": 2}] + ) + ) + # the pool is an ActorHandle; it does not have an async context.
+ pool = await mo.create_actor_pool( + "ray://test_cluster/0/0", n_process=2, labels=[None] + ["numa-0"] * 2 + ) + assert pool + + # create configuration + await mo.create_actor( + TaskConfigurationActor, + dict(), + dict(), + uid=TaskConfigurationActor.default_uid(), + address="ray://test_cluster/0/0", + ) + + configuration_ref = await mo.actor_ref( + TaskConfigurationActor.default_uid(), address="ray://test_cluster/0/0" + ) + await configuration_ref.get_config() diff --git a/python/xorbits/_mars/services/task/tests/__init__.py b/python/xorbits/_mars/services/task/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/task/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/task/tests/test_service.py b/python/xorbits/_mars/services/task/tests/test_service.py new file mode 100644 index 000000000..fdf981f8a --- /dev/null +++ b/python/xorbits/_mars/services/task/tests/test_service.py @@ -0,0 +1,633 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import time + +import numpy as np +import pandas as pd +import pytest + +from .... import dataframe as md +from .... import oscar as mo +from .... import remote as mr +from .... import tensor as mt +from ....core import TileableGraph, TileableGraphBuilder, TileStatus, recursive_tile +from ....core.context import get_context +from ....resource import Resource +from ....tensor.core import TensorOrder +from ....tensor.operands import TensorOperand, TensorOperandMixin +from ....utils import Timer, build_fetch +from ... import NodeRole, start_services, stop_services +from ...meta import MetaAPI +from ...session import SessionAPI +from ...storage import MockStorageAPI +from ...subtask import SubtaskStatus +from ...web import WebActor +from .. 
import TaskAPI, TaskStatus, WebTaskAPI +from ..errors import TaskNotExist +from ..supervisor.processor import TaskProcessor + + +@pytest.fixture +async def actor_pools(): + async def start_pool(is_worker: bool): + if is_worker: + kw = dict( + n_process=3, + labels=["main"] + ["numa-0"] * 2 + ["io"], + subprocess_start_method="spawn", + ) + else: + kw = dict(n_process=1, subprocess_start_method="spawn") + pool = await mo.create_actor_pool("127.0.0.1", **kw) + await pool.start() + return pool + + sv_pool, worker_pool = await asyncio.gather(start_pool(False), start_pool(True)) + try: + yield sv_pool, worker_pool + finally: + await asyncio.gather(sv_pool.stop(), worker_pool.stop()) + + +async def _start_services( + supervisor_pool, worker_pool, request, task_processor_cls=None +): + config = { + "services": [ + "cluster", + "session", + "meta", + "lifecycle", + "scheduling", + "subtask", + "task", + "mutable", + ], + "cluster": { + "backend": "fixed", + "lookup_address": supervisor_pool.external_address, + "resource": {"numa-0": Resource(num_cpus=2)}, + }, + "meta": {"store": "dict"}, + "scheduling": {}, + "task": {}, + } + if task_processor_cls: + config["task"]["task_processor_cls"] = task_processor_cls + if request: + config["services"].append("web") + await start_services( + NodeRole.SUPERVISOR, config, address=supervisor_pool.external_address + ) + await start_services(NodeRole.WORKER, config, address=worker_pool.external_address) + + session_id = "test_session" + session_api = await SessionAPI.create(supervisor_pool.external_address) + await session_api.create_session(session_id) + + if not request.param: + task_api = await TaskAPI.create(session_id, supervisor_pool.external_address) + else: + web_actor = await mo.actor_ref( + WebActor.default_uid(), address=supervisor_pool.external_address + ) + web_address = await web_actor.get_web_address() + task_api = WebTaskAPI(session_id, web_address) + + assert await task_api.get_task_results() == [] + + # create mock meta and storage APIs + _ = await MetaAPI.create(session_id, supervisor_pool.external_address) + storage_api = await MockStorageAPI.create(session_id, worker_pool.external_address) + return task_api, storage_api, config + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture(params=[False, True]) +async def start_test_service(actor_pools, request): + sv_pool, worker_pool = actor_pools + + task_api, storage_api, config = await _start_services(sv_pool, worker_pool, request) + + try: + yield sv_pool.external_address, task_api, storage_api + finally: + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER, config, worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, sv_pool.external_address) + + +class MockTaskProcessor(TaskProcessor): + @classmethod + def _get_decref_stage_chunk_keys(cls, stage_processor): + import time + + # time.sleep to block async thread + time.sleep(5) + return super()._get_decref_stage_chunk_keys(stage_processor) + + +@pytest.mark.parametrize(indirect=True) +@pytest.fixture(params=[True]) +async def start_test_service_with_mock(actor_pools, request): + sv_pool, worker_pool = actor_pools + + task_api, storage_api, config = await _start_services( + sv_pool, + worker_pool, + request, + task_processor_cls="xorbits._mars.services.task.tests.test_service.MockTaskProcessor", + ) + + try: + yield sv_pool.external_address, task_api, storage_api + finally: + await MockStorageAPI.cleanup(worker_pool.external_address) + await stop_services(NodeRole.WORKER,
config, worker_pool.external_address) + await stop_services(NodeRole.SUPERVISOR, config, sv_pool.external_address) + + +@pytest.mark.asyncio +async def test_task_timeout_execution(start_test_service_with_mock): + _sv_pool_address, task_api, storage_api = start_test_service_with_mock + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + assert await task_api.get_last_idle_time() is None + assert isinstance(task_id, str) + + await task_api.wait_task(task_id, timeout=2) + task_result = await task_api.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + + +@pytest.mark.asyncio +async def test_task_execution(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + assert await task_api.get_last_idle_time() is None + assert isinstance(task_id, str) + + await task_api.wait_task(task_id) + task_result = await task_api.get_task_result(task_id) + + assert task_result.status == TaskStatus.terminated + assert await task_api.get_last_idle_time() is not None + if task_result.error is not None: + raise task_result.error.with_traceback(task_result.traceback) + + result_tileable = (await task_api.get_fetch_tileables(task_id))[0] + data_key = result_tileable.chunks[0].key + assert await storage_api.get(data_key) == 45 + + +@pytest.mark.asyncio +async def test_task_error(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + # test job cancel + def f1(): + raise SystemError + + rs = [mr.spawn(f1) for _ in range(10)] + + graph = TileableGraph([r.data for r in rs]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await task_api.wait_task(task_id, timeout=10) + results = await task_api.get_task_results(progress=True) + assert isinstance(results[0].error, SystemError) + + +@pytest.mark.asyncio +async def test_task_cancel(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + # test job cancel + def f1(): + time.sleep(100) + + rs = [mr.spawn(f1) for _ in range(10)] + + graph = TileableGraph([r.data for r in rs]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + await asyncio.sleep(1) + with Timer() as timer: + await task_api.cancel_task(task_id) + await asyncio.sleep(1) + result = await task_api.get_task_result(task_id) + assert result.status == TaskStatus.terminated + assert timer.duration < 20 + await asyncio.sleep(0.1) + assert await task_api.get_last_idle_time() is not None + + results = await task_api.get_task_results(progress=True) + assert all(result.status == TaskStatus.terminated for result in results) + + +class _ProgressController: + def __init__(self): + self._step_event = asyncio.Event() + + async def wait(self): + await self._step_event.wait() 
+ self._step_event.clear() + + def set(self): + self._step_event.set() + + +@pytest.mark.asyncio +async def test_task_progress(start_test_service): + sv_pool_address, task_api, storage_api = start_test_service + + session_api = await SessionAPI.create(address=sv_pool_address) + ref = await session_api.create_remote_object( + task_api._session_id, "progress_controller", _ProgressController + ) + + def f1(count: int): + progress_controller = get_context().get_remote_object("progress_controller") + for idx in range(count): + progress_controller.wait() + get_context().set_progress((1 + idx) * 1.0 / count) + + r = mr.spawn(f1, args=(2,)) + + graph = TileableGraph([r.data]) + next(TileableGraphBuilder(graph).build()) + + await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await asyncio.sleep(0.2) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 0.0 + + await ref.set() + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 0.5 + + await ref.set() + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 1.0 + + +class _TileProgressOperand(TensorOperand, TensorOperandMixin): + @classmethod + def tile(cls, op: "_TileProgressOperand"): + progress_controller = get_context().get_remote_object("progress_controller") + + t = yield from recursive_tile(mt.random.rand(10, 10, chunk_size=5)) + yield TileStatus(t.chunks, progress=0.25) + progress_controller.wait() + + new_op = op.copy() + params = op.outputs[0].params.copy() + params["chunks"] = t.chunks + params["nsplits"] = t.nsplits + return new_op.new_tileables(t.inputs, kws=[params]) + + +@pytest.mark.asyncio +async def test_task_tile_progress(start_test_service): + sv_pool_address, task_api, storage_api = start_test_service + + session_api = await SessionAPI.create(address=sv_pool_address) + ref = await session_api.create_remote_object( + task_api._session_id, "progress_controller", _ProgressController + ) + + t = _TileProgressOperand(dtype=np.dtype(np.float64)).new_tensor( + None, (10, 10), order=TensorOrder.C_ORDER + ) + + graph = TileableGraph([t.data]) + next(TileableGraphBuilder(graph).build()) + + await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 0.25 + + await ref.set() + await asyncio.sleep(1) + results = await task_api.get_task_results(progress=True) + assert results[0].progress == 1.0 + + +@pytest.mark.asyncio +async def test_get_tileable_graph(start_test_service): + _sv_pool_address, task_api, storage_api = start_test_service + + def f1(): + return np.arange(5) + + def f2(): + return np.arange(5, 10) + + def f3(f1r, f2r): + return np.concatenate([f1r, f2r]).sum() + + r1 = mr.spawn(f1) + r2 = mr.spawn(f2) + r3 = mr.spawn(f3, args=(r1, r2)) + + graph = TileableGraph([r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + try: + with pytest.raises(TaskNotExist): + await task_api.get_tileable_graph_as_json("non_exist") + + tileable_detail = await task_api.get_tileable_graph_as_json(task_id) + + num_tileable = len(tileable_detail.get("tileables")) + num_dependencies = len(tileable_detail.get("dependencies")) + assert num_tileable > 0 + assert num_dependencies <= (num_tileable / 2) * (num_tileable / 2) + + assert (num_tileable == 1 and num_dependencies == 0) or ( + 
num_tileable > 1 and num_dependencies > 0 + ) + + graph_nodes = [] + graph_dependencies = [] + for node in graph.iter_nodes(): + graph_nodes.append(node.key) + + for node_successor in graph.iter_successors(node): + graph_dependencies.append( + { + "fromTileableId": node.key, + "toTileableId": node_successor.key, + "linkType": 0, + } + ) + + for tileable in tileable_detail.get("tileables"): + graph_nodes.remove(tileable.get("tileableId")) + + assert len(graph_nodes) == 0 + + for i in range(num_dependencies): + dependency = tileable_detail.get("dependencies")[i] + assert graph_dependencies[i] == dependency + finally: + await task_api.wait_task(task_id, timeout=120) + + +@pytest.mark.asyncio +async def test_get_tileable_details(start_test_service): + sv_pool_address, task_api, storage_api = start_test_service + + session_api = await SessionAPI.create(address=sv_pool_address) + ref = await session_api.create_remote_object( + task_api._session_id, "progress_controller", _ProgressController + ) + + with pytest.raises(TaskNotExist): + await task_api.get_tileable_details("non_exist") + + def f(*_args, raises=False): + get_context().set_progress(0.5) + if raises: + raise ValueError + progress_controller = get_context().get_remote_object("progress_controller") + progress_controller.wait() + get_context().set_progress(1.0) + + # test non-fused DAGs + r1 = mr.spawn(f) + r2 = mr.spawn(f, args=(r1, 0)) + r3 = mr.spawn(f, args=(r1, 1)) + + graph = TileableGraph([r2.data, r3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + def _get_fields(details, field, wrapper=None): + rs = [r1, r2, r3] + ret = [details[r.key][field] for r in rs] + if wrapper: + ret = [wrapper(v) for v in ret] + return ret + + await asyncio.sleep(1) + details = await task_api.get_tileable_details(task_id) + assert _get_fields(details, "progress") == [0.5, 0.0, 0.0] + assert ( + _get_fields(details, "status", SubtaskStatus) + == [SubtaskStatus.running] + [SubtaskStatus.pending] * 2 + ) + + await ref.set() + await asyncio.sleep(1) + details = await task_api.get_tileable_details(task_id) + assert _get_fields(details, "progress") == [1.0, 0.5, 0.5] + assert ( + _get_fields(details, "status", SubtaskStatus) + == [SubtaskStatus.succeeded] + [SubtaskStatus.running] * 2 + ) + + await ref.set() + await task_api.wait_task(task_id) + + # test fused DAGs + r5 = mr.spawn(f, args=(0,)) + r6 = mr.spawn(f, args=(r5,)) + + graph = TileableGraph([r6.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + + await asyncio.sleep(1) + details = await task_api.get_tileable_details(task_id) + assert details[r5.key]["progress"] == details[r6.key]["progress"] == 0.25 + + await ref.set() + await asyncio.sleep(0.1) + await ref.set() + await task_api.wait_task(task_id) + + # test raises + r7 = mr.spawn(f, kwargs={"raises": 1}) + + graph = TileableGraph([r7.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + await task_api.wait_task(task_id) + details = await task_api.get_tileable_details(task_id) + assert details[r7.key]["status"] == SubtaskStatus.errored.value + + for tileable in details.keys(): + for property_key, property_value in ( + details.get(tileable).get("properties").items() + ): + assert property_key != "key" + assert property_key != "id" + assert isinstance(property_value, (int, float, str)) + + # test merge + d1 = 
pd.DataFrame({"a": np.random.rand(100), "b": np.random.randint(3, size=100)}) + d2 = pd.DataFrame({"c": np.random.rand(100), "b": np.random.randint(3, size=100)}) + df1 = md.DataFrame(d1, chunk_size=10) + df2 = md.DataFrame(d2, chunk_size=10) + + graph = TileableGraph([df1.data, df2.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + await task_api.wait_task(task_id) + details = await task_api.get_tileable_details(task_id) + assert details[df1.key]["progress"] == details[df2.key]["progress"] == 1.0 + + f1 = build_fetch(df1) + f2 = build_fetch(df2) + df3 = f1.merge(f2, auto_merge="none", bloom_filter=False) + graph = TileableGraph([df3.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=True) + await task_api.wait_task(task_id) + for _ in range(2): + # get twice to ensure cache work + details = await task_api.get_tileable_details(task_id) + assert ( + details[df3.key]["progress"] + == details[f1.key]["progress"] + == details[f2.key]["progress"] + == 1.0 + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("with_input_output", [False, True]) +async def test_get_tileable_subtasks(start_test_service, with_input_output): + sv_pool_address, task_api, storage_api = start_test_service + + def a(): + return md.DataFrame([[1, 2], [3, 4]]) + + def b(): + return md.DataFrame([[1, 2, 3, 4], [4, 3, 2, 1]]) + + def c(a, b): + return ( + a.sum() + * a.product() + * b.sum() + * a.sum() + / a.sum() + * b.product() + / a.product() + ) + + ra = mr.spawn(a) + rb = mr.spawn(b) + rc = mr.spawn(c, args=(ra, rb)) + + graph = TileableGraph([rc.data]) + next(TileableGraphBuilder(graph).build()) + + task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False) + + await asyncio.sleep(1) + + try: + tileable_graph_json = await task_api.get_tileable_graph_as_json(task_id) + for tileable_json in tileable_graph_json["tileables"]: + tileable_id = tileable_json["tileableId"] + subtask_details = await task_api.get_tileable_subtasks( + task_id, tileable_id, True + ) + + subtask_deps = [] + for subtask_id, subtask_detail in subtask_details.items(): + for from_subtask_id in subtask_detail.get("fromSubtaskIds", ()): + subtask_deps.append((from_subtask_id, subtask_id)) + assert len(subtask_details) > 0 + + for from_id, to_id in subtask_deps: + assert from_id in subtask_details + assert to_id in subtask_details + + if with_input_output: + tileable_inputs = [ + dep["fromTileableId"] + for dep in tileable_graph_json["dependencies"] + if dep["toTileableId"] == tileable_id + ] + tileable_outputs = [ + dep["toTileableId"] + for dep in tileable_graph_json["dependencies"] + if dep["fromTileableId"] == tileable_id + ] + if tileable_inputs: + assert any( + detail["nodeType"] == "Input" + for detail in subtask_details.values() + ) + if tileable_outputs: + assert any( + detail["nodeType"] == "Output" + for detail in subtask_details.values() + ) + finally: + await task_api.wait_task(task_id, timeout=120) diff --git a/python/xorbits/_mars/services/task/worker/__init__.py b/python/xorbits/_mars/services/task/worker/__init__.py new file mode 100644 index 000000000..55b7ebca7 --- /dev/null +++ b/python/xorbits/_mars/services/task/worker/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import EmptyService + + +class TaskWorkerService(EmptyService): + pass diff --git a/python/xorbits/_mars/services/tests/__init__.py b/python/xorbits/_mars/services/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/fault_injection_manager.py b/python/xorbits/_mars/services/tests/fault_injection_manager.py new file mode 100644 index 000000000..e6e6497f6 --- /dev/null +++ b/python/xorbits/_mars/services/tests/fault_injection_manager.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +import os +import uuid +from abc import ABC, abstractmethod + +from ...core.base import MarsError +from ..session import SessionAPI + + +class ExtraConfigKey: + FAULT_INJECTION_MANAGER_NAME = "fault_injection_manager_name" + + +class FaultPosition(enum.Enum): + ON_EXECUTE_OPERAND = 0 + ON_RUN_SUBTASK = 1 + + +class FaultType(enum.Enum): + NoFault = 0 + Exception = 1 + UnhandledException = 2 + ProcessExit = 3 + + +class FaultInjectionError(MarsError): + pass + + +class FaultInjectionUnhandledError(Exception): + pass + + +def handle_fault(fault): + if fault == FaultType.Exception: + raise FaultInjectionError("Fault Injection") + elif fault == FaultType.UnhandledException: + raise FaultInjectionUnhandledError("Fault Injection Unhandled") + elif fault == FaultType.ProcessExit: + # used to simulate process crash, no cleanup. + os._exit(-1) + assert fault == FaultType.NoFault, f"Got unexpected fault: {fault}" + + +class AbstractFaultInjectionManager(ABC): + """ + The abstract base of fault injection manager for test. + """ + + name = str(uuid.uuid4()) + + @abstractmethod + def get_fault(self, pos: FaultPosition, ctx=None) -> FaultType: + """ + Get fault at position. + + Parameters + ---------- + pos + The fault position. + ctx + The fault context. + + Returns + ------- + The fault type. 
+ """ + pass + + @classmethod + async def create(cls, session_id, supervisor_address): + """ + Create the fault injection manager on supervisor. + + Parameters + ---------- + session_id + The session id. + supervisor_address + The supervisor address. + ------- + """ + session_api = await SessionAPI.create(supervisor_address) + await session_api.create_remote_object(session_id, cls.name, cls) diff --git a/python/xorbits/_mars/services/tests/fault_injection_patch.py b/python/xorbits/_mars/services/tests/fault_injection_patch.py new file mode 100644 index 000000000..8d09f0dc1 --- /dev/null +++ b/python/xorbits/_mars/services/tests/fault_injection_patch.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +from ... import oscar as mo +from ...core import OperandType +from ...lib.aio import alru_cache +from ...tests.core import patch_cls +from ...tests.core import patch_super as super +from ..scheduling.worker.execution import SubtaskExecutionActor +from ..session import SessionAPI +from ..subtask import Subtask +from ..subtask.worker.processor import SubtaskProcessor +from ..tests.fault_injection_manager import ( + AbstractFaultInjectionManager, + ExtraConfigKey, + FaultPosition, + handle_fault, +) + + +@patch_cls(SubtaskExecutionActor) +class FaultInjectedSubtaskExecutionActor(SubtaskExecutionActor): + @alru_cache(cache_exceptions=False) + async def _get_fault_injection_manager_ref( + self, supervisor_address: str, session_id: str, name: str + ) -> mo.ActorRefType[AbstractFaultInjectionManager]: + session_api = await self._get_session_api(supervisor_address) + return await session_api.get_remote_object(session_id, name) + + @staticmethod + @alru_cache(cache_exceptions=False) + async def _get_session_api(supervisor_address: str): + return await SessionAPI.create(supervisor_address) + + async def internal_run_subtask(self, subtask: Subtask, band_name: str): + # fault injection + if subtask.extra_config: + fault_injection_manager_name = subtask.extra_config.get( + ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME + ) + if fault_injection_manager_name is not None: + subtask_info = self._subtask_info[subtask.subtask_id] + fault_injection_manager = await self._get_fault_injection_manager_ref( + subtask_info.supervisor_address, + subtask.session_id, + fault_injection_manager_name, + ) + fault = await fault_injection_manager.get_fault( + FaultPosition.ON_RUN_SUBTASK, {"subtask": subtask} + ) + handle_fault(fault) + return super().internal_run_subtask(subtask, band_name) + + +@patch_cls(SubtaskProcessor) +class FaultInjectionSubtaskProcessor(SubtaskProcessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._fault_injection_manager_ref: mo.ActorRefType[ + AbstractFaultInjectionManager + ] = None + + async def run(self): + if self.subtask.extra_config: + fault_injection_manager_name = self.subtask.extra_config.get( + ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME + ) + if 
fault_injection_manager_name is not None: + self._fault_injection_manager_ref = ( + await self._session_api.get_remote_object( + self._session_id, fault_injection_manager_name + ) + ) + return await super().run() + + async def _async_execute_operand(self, ctx: Dict[str, Any], op: OperandType): + if self._fault_injection_manager_ref is not None: + fault = await self._fault_injection_manager_ref.get_fault( + FaultPosition.ON_EXECUTE_OPERAND, + {"subtask": self.subtask, "operand": op}, + ) + handle_fault(fault) + return await super()._async_execute_operand(ctx, op) diff --git a/python/xorbits/_mars/services/tests/test_core.py b/python/xorbits/_mars/services/tests/test_core.py new file mode 100644 index 000000000..8e5c8857a --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_core.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from tornado import httpclient + +from ... import oscar as mo +from ...utils import get_next_port +from .. import ( + NodeRole, + create_service_session, + destroy_service_session, + start_services, + stop_services, +) + + +@pytest.fixture +async def actor_pool_context(): + pool = await mo.create_actor_pool(f"127.0.0.1:{get_next_port()}", n_process=0) + await pool.start() + try: + yield pool + finally: + await pool.stop() + + +@pytest.mark.asyncio +async def test_start_service(actor_pool_context): + from .test_svcs.test_svc1.supervisor import SvcSessionActor1 + + pool = actor_pool_context + web_port = get_next_port() + config = { + "services": [["test_svc1"], "test_svc2", "test_warn_svc", "web"], + "modules": "xorbits._mars.services.tests.test_svcs", + "test_svc1": {"uid": "TestActor1", "arg1": "val1"}, + "test_svc2": {"uid": "TestActor2", "arg2": "val2", "ref": "TestActor1"}, + "web": {"port": web_port}, + } + with pytest.warns(RuntimeWarning) as record: + await start_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + assert "test_warn_svc" in str(record[-1].message) + + ref1 = await mo.actor_ref("TestActor1", address=pool.external_address) + ref2 = await mo.actor_ref("TestActor2", address=pool.external_address) + assert await ref1.get_arg() == "val1" + assert await ref2.get_arg() == "val1:val2" + + with pytest.raises(ImportError): + await start_services( + NodeRole.SUPERVISOR, + {"services": ["non-exist-svc"]}, + address=pool.external_address, + ) + + session_id = "test_session" + await create_service_session( + NodeRole.SUPERVISOR, + config, + session_id=session_id, + address=pool.external_address, + ) + assert await mo.has_actor( + mo.create_actor_ref( + uid=SvcSessionActor1.gen_uid(session_id), address=pool.external_address + ) + ) + await destroy_service_session( + NodeRole.SUPERVISOR, + config, + session_id=session_id, + address=pool.external_address, + ) + assert not await mo.has_actor( + mo.create_actor_ref( + uid=SvcSessionActor1.gen_uid(session_id), address=pool.external_address + ) + ) + + client = httpclient.AsyncHTTPClient() + resp = await
client.fetch(f"http://127.0.0.1:{web_port}/test_actor1/test_api") + assert resp.body.decode() == "val1" + + await stop_services(NodeRole.SUPERVISOR, config, address=pool.external_address) + assert not await mo.has_actor( + mo.create_actor_ref("TestActor1", address=pool.external_address) + ) + assert not await mo.has_actor( + mo.create_actor_ref("TestActor2", address=pool.external_address) + ) diff --git a/python/xorbits/_mars/services/tests/test_patch.py b/python/xorbits/_mars/services/tests/test_patch.py new file mode 100644 index 000000000..ad2552e37 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_patch.py @@ -0,0 +1,110 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +class A: + def __init__(self): + self.value = ["A"] + + def test_method(self): + return ["A"] + + @classmethod + def test_classmethod(cls): + return ["A"] + + +class B(A): + def __init__(self): + super().__init__() + self.value += ["B"] + + def test_method(self): + return super().test_method() + ["B"] + + def test_method2(self): + return super().test_method() + ["BB"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + ["B"] + + @classmethod + def test_classmethod2(cls): + return super().test_classmethod() + ["BB"] + + +class C(B): + def __init__(self): + super().__init__() + self.value += ["C"] + + def test_method(self): + return super().test_method() + ["C"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + ["C"] + + +class Dummy: + pass + + +def test_patch_super(): + from ...tests.core import patch_cls + from ...tests.core import patch_super as super + + @patch_cls(B) + class D(B): + def __init__(self): + super().__init__() + self.value += ["D"] + + def test_method(self): + return super().test_method() + super().test_method2() + ["D"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + super().test_classmethod2() + ["D"] + + b = B() + assert B.test_classmethod() == ["A", "B", "A", "BB", "D"] + assert b.test_method() == ["A", "B", "A", "BB", "D"] + assert b.value == ["A", "B", "D"] + + c = C() + assert C.test_classmethod() == ["A", "B", "A", "BB", "D", "C"] + assert c.test_method() == ["A", "B", "A", "BB", "D", "C"] + assert c.value == ["A", "B", "D", "C"] + + @patch_cls(Dummy) + class E: + def __init__(self): + super().__init__() + + def test_method(self): + return super().test_method() + ["D"] + + @classmethod + def test_classmethod(cls): + return super().test_classmethod() + ["D"] + + dummy = Dummy() + with pytest.raises(AttributeError): + dummy.test_method() + with pytest.raises(AttributeError): + Dummy.test_classmethod() diff --git a/python/xorbits/_mars/services/tests/test_svcs/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/web.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/web.py new file mode 100644 index 000000000..50f38c99b --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/api/web.py @@ -0,0 +1,27 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...... 
import oscar as mo +from .....web import MarsRequestHandler + + +class TestWebHandler(MarsRequestHandler): + async def get(self): + ref = await mo.actor_ref("TestActor1", address=self._supervisor_addr) + self.write(str(await ref.get_arg())) + + +web_handlers = { + "/test_actor1/test_api": TestWebHandler, +} diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc1/supervisor.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/supervisor.py new file mode 100644 index 000000000..a7a4beb25 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc1/supervisor.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..... import oscar as mo +from ....core import AbstractService + + +class SvcActor1(mo.Actor): + def __init__(self, arg): + super().__init__() + self._arg = arg + + def get_arg(self): + return self._arg + + +class SvcSessionActor1(mo.Actor): + @classmethod + def gen_uid(cls, session_id: str): + return f"{session_id}_svc_session_actor1" + + +class TestService1(AbstractService): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + async def start(self): + svc_config = self._config["test_svc1"] + await mo.create_actor( + SvcActor1, + uid=svc_config["uid"], + arg=svc_config["arg1"], + address=self._address, + ) + + async def stop(self): + svc_config = self._config["test_svc1"] + await mo.destroy_actor( + mo.create_actor_ref(uid=svc_config["uid"], address=self._address) + ) + + async def create_session(self, session_id: str): + await mo.create_actor( + SvcSessionActor1, + uid=SvcSessionActor1.gen_uid(session_id), + address=self._address, + ) + + async def destroy_session(self, session_id: str): + await mo.destroy_actor( + mo.create_actor_ref( + uid=SvcSessionActor1.gen_uid(session_id), address=self._address + ) + ) diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc2/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_svc2/supervisor.py b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/supervisor.py new file mode 100644 index 000000000..efe5b8480 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_svc2/supervisor.py @@ -0,0 +1,50 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..... import oscar as mo +from ....core import AbstractService + + +class SvcActor2(mo.Actor): + def __init__(self, arg, ref_uid): + super().__init__() + self._arg = arg + self._ref_uid = ref_uid + self._ref = None + + async def __post_create__(self): + self._ref = await mo.actor_ref(self._ref_uid, address=self.address) + + async def get_arg(self): + return await self._ref.get_arg() + ":" + self._arg + + +class TestService2(AbstractService): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._ref = None + + async def start(self): + svc_config = self._config["test_svc2"] + self._ref = await mo.create_actor( + SvcActor2, + uid=svc_config["uid"], + arg=svc_config["arg2"], + ref_uid=svc_config["ref"], + address=self._address, + ) + + async def stop(self): + assert self._ref is not None + await self._ref.destroy() diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/__init__.py b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/supervisor.py b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/supervisor.py new file mode 100644 index 000000000..6566ff30e --- /dev/null +++ b/python/xorbits/_mars/services/tests/test_svcs/test_warn_svc/supervisor.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ....core import AbstractService + + +class AbsDeriveService(AbstractService): + pass diff --git a/python/xorbits/_mars/services/web/__init__.py b/python/xorbits/_mars/services/web/__init__.py new file mode 100644 index 000000000..4021fe30a --- /dev/null +++ b/python/xorbits/_mars/services/web/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import OscarWebAPI +from .core import ( + MarsRequestHandler, + MarsServiceWebAPIHandler, + MarsWebAPIClientMixin, + web_api, +) + +try: + from .supervisor import WebActor +except ImportError: # pragma: no cover + pass diff --git a/python/xorbits/_mars/services/web/api/__init__.py b/python/xorbits/_mars/services/web/api/__init__.py new file mode 100644 index 000000000..807c8487e --- /dev/null +++ b/python/xorbits/_mars/services/web/api/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .oscar import OscarWebAPI diff --git a/python/xorbits/_mars/services/web/api/oscar.py b/python/xorbits/_mars/services/web/api/oscar.py new file mode 100644 index 000000000..53df33fca --- /dev/null +++ b/python/xorbits/_mars/services/web/api/oscar.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +from .... 
import oscar as mo +from ....lib.aio import alru_cache + + +class OscarWebAPI: + def __init__(self, address: str, web_ref: mo.ActorRef): + self._address = address + self._web_ref = web_ref + + @classmethod + @alru_cache(cache_exceptions=False) + async def create(cls: Type["OscarWebAPI"], address: str) -> "OscarWebAPI": + from ..supervisor import WebActor + + ref = await mo.actor_ref(WebActor.default_uid(), address=address) + return cls(address, ref) + + async def get_web_address(self) -> str: + return await self._web_ref.get_web_address() diff --git a/python/xorbits/_mars/services/web/api/web.py b/python/xorbits/_mars/services/web/api/web.py new file mode 100644 index 000000000..e8905e23a --- /dev/null +++ b/python/xorbits/_mars/services/web/api/web.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +from ..core import MarsRequestHandler + + +class MarsApiEntryHandler(MarsRequestHandler): + def get(self): + import mars + + version = mars.__version__ + self.write(json.dumps({"mars_version": version})) + + +web_handlers = {"/api": MarsApiEntryHandler} diff --git a/python/xorbits/_mars/services/web/core.py b/python/xorbits/_mars/services/web/core.py new file mode 100644 index 000000000..bdbbf7c76 --- /dev/null +++ b/python/xorbits/_mars/services/web/core.py @@ -0,0 +1,273 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
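OscarWebAPI above is the programmatic way to discover the web frontend from a supervisor address: it resolves the WebActor by its default uid and proxies get_web_address(). A minimal usage sketch, assuming a supervisor with the web service already running at a placeholder address:

    import asyncio

    from xorbits._mars.services.web import OscarWebAPI

    async def show_web_address(supervisor_address: str) -> str:
        # resolves the WebActor on the supervisor and asks for its HTTP endpoint
        api = await OscarWebAPI.create(supervisor_address)
        return await api.get_web_address()

    # e.g. asyncio.run(show_web_address("127.0.0.1:12345"))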
+ +import asyncio +import functools +import inspect +import logging +import re +import sys +import urllib.parse +from collections import defaultdict +from typing import Callable, Dict, List, NamedTuple, Optional, Type, Union + +from tornado import httpclient, web +from tornado.simple_httpclient import HTTPRequest, HTTPTimeoutError + +from ...lib.aio import alru_cache +from ...utils import deserialize_serializable, serialize_serializable + +if sys.version_info[:2] == (3, 6): + # make sure typing works + re.Pattern = type(re.compile(r".*")) + +logger = logging.getLogger(__name__) +_ROOT_PLACEHOLDER = "ROOT_PLACEHOLDER" + + +class MarsRequestHandler(web.RequestHandler): # pragma: no cover + def initialize(self, supervisor_addr: str = None): + self._supervisor_addr = supervisor_addr + + +class _WebApiDef(NamedTuple): + sub_pattern: str + sub_pattern_compiled: re.Pattern + method: str + arg_filter: Optional[Dict] = None + + +def web_api( + sub_pattern: str, + method: Union[str, List[str]], + arg_filter: Optional[Dict] = None, + cache_blocking: bool = False, +): + if not sub_pattern.endswith("$"): # pragma: no branch + sub_pattern += "$" + methods = method if isinstance(method, list) else [method] + + def wrapper(func): + @functools.wraps(func) + async def wrapped(self: "MarsServiceWebAPIHandler", *args, **kwargs): + try: + if not inspect.iscoroutinefunction(func): + return func(self, *args, **kwargs) + elif not cache_blocking or self.request.method.lower() != "get": + res = await func(self, *args, **kwargs) + else: + res = await self._create_or_get_url_future( + func, self, *args, **kwargs + ) + return res + except GeneratorExit: + raise + except: # noqa: E722 # nosec # pylint: disable=bare-except + exc_type, exc, tb = sys.exc_info() + err_msg = ( + f"{exc_type.__name__} when handling request with " + f"{type(self).__name__}.{func.__name__}" + ) + logger.exception(err_msg) + self.write(serialize_serializable((exc, tb))) + self.set_status(500, err_msg) + + wrapped._web_api_defs = [ + _WebApiDef(sub_pattern, re.compile(sub_pattern), m, arg_filter) + for m in methods + ] + return wrapped + + return wrapper + + +@alru_cache(cache_exceptions=False) +async def _get_cluster_api(address: str): + from ..cluster import ClusterAPI + + return await ClusterAPI.create(address) + + +@alru_cache(cache_exceptions=False) +async def _get_api_by_key( + api_cls: Type, session_id: str, address: str, with_key_arg: bool = True +): + cluster_api = await _get_cluster_api(address) + [address] = await cluster_api.get_supervisors_by_keys([session_id]) + if with_key_arg: + return await api_cls.create(session_id, address) + else: + return await api_cls.create(address) + + +class MarsServiceWebAPIHandler(MarsRequestHandler): + _root_pattern: str = None + _method_to_handlers: Dict[str, Dict[Callable, _WebApiDef]] = None + _uri_to_futures: Dict[str, asyncio.Task] = None + + def __init__(self, *args, **kwargs): + self._collect_services() + super().__init__(*args, **kwargs) + + def _get_api_by_key( + self, api_cls: Type, session_id: str, with_key_arg: bool = True + ): + return _get_api_by_key( + api_cls, + session_id, + address=self._supervisor_addr, + with_key_arg=with_key_arg, + ) + + def _create_or_get_url_future(self, func, *args, **kw): + if self._uri_to_futures is None: + type(self)._uri_to_futures = dict() + + uri = self.request.uri + if uri in self._uri_to_futures: + return self._uri_to_futures[uri] + + def _future_remover(_fut): + self._uri_to_futures.pop(uri, None) + + task = self._uri_to_futures[uri] = 
asyncio.create_task(func(*args, **kw)) + task.add_done_callback(_future_remover) + return task + + @classmethod + def _collect_services(cls): + if cls._method_to_handlers is not None: + return + + cls._method_to_handlers = defaultdict(dict) + for attr in dir(cls): + handle_func = getattr(cls, attr, None) + if not hasattr(handle_func, "_web_api_defs"): + continue + web_api_defs = getattr( + handle_func, "_web_api_defs" + ) # type: List[_WebApiDef] + for api_def in web_api_defs: + cls._method_to_handlers[api_def.method.lower()][handle_func] = api_def + + def prepare(self): + self.set_header("Content-Type", "application/octet-stream") + + @classmethod + def get_root_pattern(cls): + return cls._root_pattern + "(?:/(?P.*)$|$)" + + @functools.lru_cache(100) + def _route_sub_path(self, http_method: str, sub_path: str): + handlers = self._method_to_handlers[http_method.lower()] + method, kwargs = None, None + for handler_method, web_api_def in handlers.items(): + match = web_api_def.sub_pattern_compiled.match(sub_path) + if match is not None: + if web_api_def.arg_filter is not None: + if not all( + self.get_argument(k, None) == v + for k, v in web_api_def.arg_filter.items() + ): + continue + method, kwargs = handler_method, dict(match.groupdict()) + elif method is None: + # method matched with arg_filter shall not be overwritten + method, kwargs = handler_method, dict(match.groupdict()) + if method is not None: + return method, kwargs + else: + raise web.HTTPError( + 404, + f"{sub_path} does not match any defined APIs " + f"with method {http_method.upper()}", + ) + + def _make_handle_http_method(http_method: str): + async def _handle_http_method(self: "MarsServiceWebAPIHandler", **kwargs): + # make sure results from APIs is not stored + self.add_header("Cache-Control", "no-store") + + sub_path = kwargs.pop("sub_path", None) or "" + method, kw = self._route_sub_path(http_method, sub_path) + kw.update(kwargs) + res = method(self, **kw) + if inspect.isawaitable(res): + await res + + _handle_http_method.__name__ = http_method.lower() + return _handle_http_method + + get = _make_handle_http_method("get") + put = _make_handle_http_method("put") + post = _make_handle_http_method("post") + patch = _make_handle_http_method("patch") + delete = _make_handle_http_method("delete") + + del _make_handle_http_method + + +class MarsWebAPIClientMixin: + @property + def _client(self): + try: + return self._client_obj + except AttributeError: + self._client_obj = httpclient.AsyncHTTPClient() + return self._client_obj + + @property + def request_rewriter(self) -> Callable: + return getattr(self, "_request_rewriter", None) + + @request_rewriter.setter + def request_rewriter(self, value: Callable): + self._request_rewriter = value + + async def _request_url(self, method, path, **kwargs): + self._running_loop = asyncio.get_running_loop() + + if "data" in kwargs: + kwargs["body"] = kwargs.pop("data") + + if "params" in kwargs: + params = kwargs.pop("params") + for k, v in params.items(): + if isinstance(v, (list, tuple, set)): + params[k] = ",".join(str(i) for i in v) + url_params = urllib.parse.urlencode(params) + path_connector = "?" if "?" 
not in path else "&" + path += path_connector + url_params + + try: + request = HTTPRequest(path, method=method, **kwargs) + if self.request_rewriter: + request = self.request_rewriter(request) + res = await self._client.fetch(request, raise_error=False) + except HTTPTimeoutError as ex: + raise TimeoutError(str(ex)) from None + + if res.code < 400: + return res + else: + exc, tb = None, None + try: + exc, tb = deserialize_serializable(res.body) + except: # noqa: E722 # nosec # pylint: disable=bare-except + pass + + if exc is None: + raise res.error + else: + raise exc.with_traceback(tb) diff --git a/python/xorbits/_mars/services/web/handlers.py b/python/xorbits/_mars/services/web/handlers.py new file mode 100644 index 000000000..cc1fc8c05 --- /dev/null +++ b/python/xorbits/_mars/services/web/handlers.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from tornado import web + +from .core import MarsRequestHandler + + +class IndexHandler(MarsRequestHandler): + def _get_index_page(self): + try: + return self._index_page + except AttributeError: + index_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "index.html" + ) + with open(index_file, "r") as file_obj: + self._index_page = file_obj.read() + return self._index_page + + def get(self): + self.write(self._get_index_page()) + + +handlers = {"/": IndexHandler} + +static_handlers = { + r"[^\?\&]*/static/(.*)": ( + web.StaticFileHandler, + {"path": os.path.join(os.path.dirname(__file__), "static")}, + ) +} diff --git a/python/xorbits/_mars/services/web/index.html b/python/xorbits/_mars/services/web/index.html new file mode 100644 index 000000000..29c1af7cb --- /dev/null +++ b/python/xorbits/_mars/services/web/index.html @@ -0,0 +1,18 @@ + + + + Mars UI + + + + + + +
+ + diff --git a/python/xorbits/_mars/services/web/supervisor.py b/python/xorbits/_mars/services/web/supervisor.py new file mode 100644 index 000000000..3d5405032 --- /dev/null +++ b/python/xorbits/_mars/services/web/supervisor.py @@ -0,0 +1,118 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import logging +import os + +from tornado import web + +from ... import oscar as mo +from ...utils import get_next_port +from ..core import AbstractService + +logger = logging.getLogger(__name__) + + +class WebActor(mo.Actor): + def __init__(self, config): + super().__init__() + self._config = config + self._web_server = None + self._web_app = None + + extra_mod_names = self._config.get("extra_discovery_modules") or [] + web_handlers = self._config.get("web_handlers", {}) + for mod_name in extra_mod_names: + try: + web_mod = importlib.import_module(mod_name) + web_handlers.update(getattr(web_mod, "web_handlers", {})) + except ImportError: # pragma: no cover + pass + + async def __post_create__(self): + from .handlers import handlers, static_handlers + + supervisor_addr = self.address + + host = self._config.get("host") or "0.0.0.0" + port = self._config.get("port") or get_next_port() + self._web_address = f"http://{host}:{port}" + handlers.update(self._config.get("web_handlers", {})) + web_handlers = [] + for p, h in handlers.items(): + web_handlers.append((p, h, {"supervisor_addr": supervisor_addr})) + web_handlers.extend([(*[p], *v) for p, v in static_handlers.items()]) + + retrial = 5 + while retrial: + try: + if port is None: + port = get_next_port() + + # For debugging tornado, use debug=True to enable hot deploy + self._web_app = web.Application(web_handlers) + self._web_server = self._web_app.listen(port, host) + logger.info("Mars Web started at %s:%d", host, port) + break + except OSError: # pragma: no cover + if port is not None: + raise + retrial -= 1 + if retrial == 0: + raise + + async def __pre_destroy__(self): + if self._web_server is not None: + self._web_server.stop() + + def get_web_address(self): + web_address = self._web_address + if os.name == "nt": + web_address = web_address.replace("0.0.0.0", "127.0.0.1") + return web_address + + +class WebSupervisorService(AbstractService): + """ + Web service on supervisor. 
+ + Service Configuration + --------------------- + { + "web": { + "host": "", + "port": "", + "web_handlers": [ + , + ], + "extra_discovery_modules": [ + "path.to.modules", + ] + } + } + """ + + async def start(self): + await mo.create_actor( + WebActor, + config=self._config.get("web", {}), + uid=WebActor.default_uid(), + address=self._address, + ) + + async def stop(self): + await mo.destroy_actor( + mo.create_actor_ref(uid=WebActor.default_uid(), address=self._address) + ) diff --git a/python/xorbits/_mars/services/web/tests/__init__.py b/python/xorbits/_mars/services/web/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/services/web/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/services/web/tests/extra_handler.py b/python/xorbits/_mars/services/web/tests/extra_handler.py new file mode 100644 index 000000000..09450983e --- /dev/null +++ b/python/xorbits/_mars/services/web/tests/extra_handler.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import MarsRequestHandler + + +class ExtraTestHandler(MarsRequestHandler): + def get(self): + self.write("Test") + + +web_handlers = {"/api/extra_test": ExtraTestHandler} diff --git a/python/xorbits/_mars/services/web/tests/test_core.py b/python/xorbits/_mars/services/web/tests/test_core.py new file mode 100644 index 000000000..dfc029a13 --- /dev/null +++ b/python/xorbits/_mars/services/web/tests/test_core.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import sys + +import pytest +from tornado import httpclient + +from .... import oscar as mo +from ....utils import get_next_port +from .. 
import MarsServiceWebAPIHandler, MarsWebAPIClientMixin, WebActor, web_api +from ..api.web import MarsApiEntryHandler + + +class TestAPIHandler(MarsServiceWebAPIHandler): + __test__ = False + _root_pattern = "/api/test/(?P[^/]+)" + _call_counter = 0 + + @web_api("", method="get") + def get_method_root(self, test_id): + self.write(f"get_root_value_{test_id}") + + @web_api("", method="post") + def post_method_root(self, test_id): + self.write(f"post_root_value_{test_id}") + + @web_api("subtest/(?P[^/]+)", method="get") + def get_method_sub_patt(self, test_id, subtest_id): + self.write(f"get_sub_value_{test_id}_{subtest_id}") + + @web_api("subtest/(?P[^/]+)", method="get", arg_filter={"action": "a1"}) + async def get_method_sub_patt_match_arg1(self, test_id, subtest_id): + self.write(f"get_sub_value_{test_id}_{subtest_id}_action1") + + @web_api("subtest/(?P[^/]+)", method="get", arg_filter={"action": "a2"}) + async def get_method_sub_patt_match_arg2(self, test_id, subtest_id): + self.write(f"get_sub_value_{test_id}_{subtest_id}_action2") + + @web_api("subtest_error", method="get") + def get_with_error(self, test_id): + raise ValueError + + @web_api("subtest_delay", method="get") + async def get_with_timeout(self, test_id): + await asyncio.sleep(100) + raise ValueError(test_id) + + @web_api("subtest_delay_cache", method="get", cache_blocking=True) + async def get_with_blocking_cache(self, test_id): + await asyncio.sleep(1) + type(self)._call_counter += 1 + self.write(test_id) + + +@pytest.fixture +async def actor_pool(): + start_method = ( + os.environ.get("POOL_START_METHOD", "forkserver") + if sys.platform != "win32" + else None + ) + pool = await mo.create_actor_pool( + "127.0.0.1", n_process=0, subprocess_start_method=start_method + ) + async with pool: + web_config = { + "host": "127.0.0.1", + "port": get_next_port(), + "web_handlers": { + "/api": MarsApiEntryHandler, + TestAPIHandler.get_root_pattern(): TestAPIHandler, + }, + "extra_discovery_modules": ["mars.services.web.tests.extra_handler"], + } + await mo.create_actor(WebActor, web_config, address=pool.external_address) + yield pool, web_config["port"] + + +class SimpleWebClient(MarsWebAPIClientMixin): + async def fetch(self, path, method="GET", **kwargs): + return await self._request_url(method, path, **kwargs) + + +@pytest.mark.asyncio +async def test_web_api(actor_pool): + _pool, web_port = actor_pool + recorded_urls = [] + + def url_recorder(request): + recorded_urls.append(request.url) + return request + + client = SimpleWebClient() + client.request_rewriter = url_recorder + + res = await client.fetch(f"http://localhost:{web_port}/") + assert res.body.decode() + + res = await client.fetch(f"http://localhost:{web_port}/api") + assert res.body.decode() + + res = await client.fetch(f"http://localhost:{web_port}/api/test/test_id") + assert res.body.decode() == "get_root_value_test_id" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id", method="POST", data=b"" + ) + assert res.body.decode() == "post_root_value_test_id" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest/sub_tid" + ) + assert res.body.decode() == "get_sub_value_test_id_sub_tid" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest/sub_tid?action=a1" + ) + assert res.body.decode() == "get_sub_value_test_id_sub_tid_action1" + + res = await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest/sub_tid?action=a2" + ) + assert res.body.decode() == 
"get_sub_value_test_id_sub_tid_action2" + + with pytest.raises(httpclient.HTTPError) as excinfo: + await client.fetch(f"http://localhost:{web_port}/api/test/test_id/non_exist") + assert excinfo.value.code == 404 + + with pytest.raises(ValueError): + await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest_error" + ) + + # test multiple request into long immutable requests + req_uri = f"http://localhost:{web_port}/api/test/test_id/subtest_delay_cache" + tasks = [asyncio.create_task(client.fetch(req_uri)) for _ in range(2)] + await asyncio.sleep(0.5) + assert TestAPIHandler._call_counter == 0 + assert len(TestAPIHandler._uri_to_futures) == 1 + + await asyncio.gather(*tasks) + assert TestAPIHandler._call_counter == 1 + assert len(TestAPIHandler._uri_to_futures) == 0 + + with pytest.raises(TimeoutError): + await client.fetch( + f"http://localhost:{web_port}/api/test/test_id/subtest_delay", + request_timeout=0.5, + ) + + res = await client.fetch(f"http://localhost:{web_port}/api/extra_test") + assert "Test" in res.body.decode() + + assert len(recorded_urls) > 0 diff --git a/python/xorbits/_mars/services/web/worker.py b/python/xorbits/_mars/services/web/worker.py new file mode 100644 index 000000000..77ff70458 --- /dev/null +++ b/python/xorbits/_mars/services/web/worker.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import EmptyService + + +class WebWorkerService(EmptyService): + pass diff --git a/python/xorbits/_mars/session.py b/python/xorbits/_mars/session.py new file mode 100644 index 000000000..f9b28b7f9 --- /dev/null +++ b/python/xorbits/_mars/session.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .deploy.oscar.session import execute, fetch, fetch_log, new_session, stop_server + +__all__ = [ + "new_session", + "execute", + "fetch", + "fetch_log", + "stop_server", +] diff --git a/python/xorbits/_mars/storage/__init__.py b/python/xorbits/_mars/storage/__init__.py new file mode 100644 index 000000000..db448c859 --- /dev/null +++ b/python/xorbits/_mars/storage/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import StorageLevel, get_storage_backend +from .cuda import CudaStorage +from .filesystem import FileSystemStorage +from .ray import RayStorage +from .shared_memory import SharedMemoryStorage + +try: + # require vineyard, pyarrow + from .vineyard import VineyardStorage +except ImportError: + pass +try: + # require pyarrow + from .plasma import PlasmaStorage +except ImportError: + pass diff --git a/python/xorbits/_mars/storage/base.py b/python/xorbits/_mars/storage/base.py new file mode 100644 index 000000000..e4fb75889 --- /dev/null +++ b/python/xorbits/_mars/storage/base.py @@ -0,0 +1,293 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List, Tuple, Type, Union + +from ..utils import dataslots +from .core import StorageFileObject + +_storage_backends = dict() + + +def register_storage_backend(backend: Type["StorageBackend"]): + _storage_backends[backend.name] = backend + return backend + + +def get_storage_backend(backend_name) -> Type["StorageBackend"]: + return _storage_backends[backend_name] + + +_ComparableLevel = Union[int, "StorageLevel"] + + +class StorageLevel(Enum): + GPU = 1 << 0 + MEMORY = 1 << 1 + DISK = 1 << 2 + REMOTE = 1 << 3 + + def __and__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value & other_value + + __rand__ = __and__ + + def __or__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value | other_value + + __ror__ = __or__ + + def __lt__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value < other_value + + def __gt__(self, other: _ComparableLevel): + other_value = getattr(other, "value", other) + return self.value > other_value + + def spill_level(self): + if self == StorageLevel.GPU: + return StorageLevel.MEMORY + elif self == StorageLevel.MEMORY: + return StorageLevel.DISK + else: # pragma: no cover + raise ValueError(f"Level {self} doesn't have spill level") + + @staticmethod + def from_str(s: str): + level_mapping = StorageLevel.__members__ + level_strings = [ss.strip() for ss in s.upper().split("|")] + levels = [] + for ls in level_strings: + if ls not in level_mapping: # pragma: no cover + raise ValueError(f"Unknown level {ls}") + levels.append(level_mapping[ls]) + return functools.reduce(operator.or_, levels) + + +@dataslots +@dataclass +class ObjectInfo: + size: int = None + device: int = None + object_id: 
Any = None + + +class StorageBackend(ABC): + name = None + is_seekable = True + + @classmethod + @abstractmethod + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + """ + Setup environments, for example, start plasma store for plasma backend. + + Parameters + ---------- + kwargs : kwargs + Kwargs for setup. + + Returns + ------- + Tuple of two dicts + Dicts for initialization and teardown. + """ + + @staticmethod + async def teardown(**kwargs): + """ + Clean up the environments. + + Parameters + ---------- + kwargs : kwargs + Parameters for clean up. + """ + + @property + def size(self) -> Union[int, None]: + """ + The total size of storage. + + Returns + ------- + Size: int + Total size of storage. + """ + return None + + @property + @abstractmethod + def level(self) -> StorageLevel: + """ + Level of current storage backend. + + Returns + ------- + Level: StorageLevel + storage level. + """ + + @property + def backend_info(self) -> dict: + """ + Get the customized backend info of this storage backend. + + Returns + ------- + info: dict + Customized storage backend info dict. + """ + return {"name": self.name} + + @abstractmethod + async def get(self, object_id, **kwargs) -> object: + """ + Get object by key. For some backends, `columns` or `slice` can pass to get part of data. + + Parameters + ---------- + object_id : object id + Object id to get. + + kwargs: + Additional keyword arguments + + Returns + ------- + Python object + """ + + @abstractmethod + async def put(self, obj, importance: int = 0) -> ObjectInfo: + """ + Put object into storage with object_id. + + Parameters + ---------- + obj : python object + Object to put. + + importance: int + The priority to spill when storage is full + + Returns + ------- + ObjectInfo + object information including size, raw_size, device + """ + + @abstractmethod + async def delete(self, object_id): + """ + Delete object from storage by object_id. + + Parameters + ---------- + object_id + object id + """ + + @abstractmethod + async def object_info(self, object_id) -> ObjectInfo: + """ + Get information about stored object. + + Parameters + ---------- + object_id + object id + + Returns + ------- + ObjectInfo + Object info including size, device and etc. + """ + + @abstractmethod + async def open_writer(self, size=None) -> StorageFileObject: + """ + Return a file-like object for writing. + + Parameters + ---------- + size: int + Maximum size in bytes + + Returns + ------- + fileobj: StorageFileObject + """ + + @abstractmethod + async def open_reader(self, object_id) -> StorageFileObject: + """ + Return a file-like object for reading. + + Parameters + ---------- + object_id + Object id + + Returns + ------- + fileobj: StorageFileObject + """ + + async def list(self) -> List: + """ + List all stored objects in storage. + + Returns + ------- + List of objects + """ + + async def fetch(self, object_id): + """ + Fetch object to current worker. + + Parameters + ---------- + object_id + Object id. + """ + + async def pin(self, object_id): + """ + Pin the data to prevent the data being released or spilled. + + Parameters + ---------- + object_id + object id + """ + + async def unpin(self, object_id): + """ + Unpin the data, allow storage to release the data. 
+ + Parameters + ---------- + object_id + object id + """ diff --git a/python/xorbits/_mars/storage/core.py b/python/xorbits/_mars/storage/core.py new file mode 100644 index 000000000..89055c4ce --- /dev/null +++ b/python/xorbits/_mars/storage/core.py @@ -0,0 +1,155 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +from abc import ABC, abstractmethod +from concurrent.futures import Executor +from typing import Any, Optional, Union + +from ..lib.aio import AioFileObject + + +class StorageFileObject(AioFileObject): + def __init__( + self, + file: Any, + object_id: Any, + loop: asyncio.BaseEventLoop = None, + executor: Executor = None, + ): + self._object_id = object_id + super().__init__(file, loop=loop, executor=executor) + + @property + def object_id(self): + return self._object_id + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await super().__aexit__(exc_type, exc_val, exc_tb) + if self._executor: + self._executor.shutdown(wait=False) + + +class BufferWrappedFileObject(ABC): + def __init__(self, object_id: Any, mode: str, size: Optional[int] = None): + # check arguments + assert mode in ("w", "r"), 'mode must be "w" or "r"' + if mode == "w" and size is None: # pragma: no cover + raise ValueError("size must be provided to write") + + self._object_id = object_id + self._size = size + self._mode = mode + + self._offset = 0 + self._initialized = False + self._closed = False + + self._mv = None + self._buffer = None + + @abstractmethod + def _read_init(self): + """ + Initialization for read purpose. + """ + + @abstractmethod + def _write_init(self): + """ + Initialization for write purpose. 
+ """ + + @property + def object_id(self): + return self._object_id + + @property + def buffer(self): + return self._buffer + + @property + def mode(self): + return self._mode + + def read(self, size=-1): + if not self._initialized: + self._read_init() + self._initialized = True + + offset = self._offset + size = self._size if size < 0 else size + end = min(self._size, offset + size) + result = self._mv[offset:end] + self._offset = end + return result + + def write(self, content: Union[bytes, memoryview]): + if not self._initialized: + self._write_init() + self._initialized = True + + offset = self._offset + content_length = getattr(content, "nbytes", len(content)) + new_offset = offset + content_length + self._mv[offset:new_offset] = content + self._offset = new_offset + + def seek(self, offset: int, whence: int = os.SEEK_SET): + if not self._initialized: + self._read_init() + self._initialized = True + + if whence == os.SEEK_END: + new_offset = self._size + offset + elif whence == os.SEEK_CUR: + new_offset = self._offset + offset + else: + assert whence == os.SEEK_SET + new_offset = offset + if new_offset < 0 or new_offset >= self._size: + raise ValueError( + f"File offset should be limited to (0, {self._size}), " + f"now is {new_offset}" + ) + self._offset = new_offset + return self._offset + + def tell(self): + return self._offset + + @abstractmethod + def _read_close(self): + """ + Close for read. + """ + + @abstractmethod + def _write_close(self): + """ + Close for write. + """ + + def close(self): + if self._closed: + return + + self._closed = True + if self._mode == "w": + self._write_close() + else: + self._read_close() + self._mv = None + self._buffer = None diff --git a/python/xorbits/_mars/storage/cuda.py b/python/xorbits/_mars/storage/cuda.py new file mode 100644 index 000000000..274aed44d --- /dev/null +++ b/python/xorbits/_mars/storage/cuda.py @@ -0,0 +1,309 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import ctypes +import pickle +import uuid +from io import BytesIO +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas as pd + +from ..serialization import deserialize, serialize +from ..utils import implements, lazy_import +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import StorageFileObject + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") + + +_id_to_buffers = dict() + + +class CudaFileObject: + def __init__(self, mode: str, object_id: str, size: int = None): + self._mode = mode + self._object_id = object_id + self._size = size + self._closed = False + self._buffers = None + self._headers = None + self._offset = None + # for read + self._has_read_headers = None + # for write + self._has_write_headers = None + self._cur_buffer_index = None + if "r" in mode: + assert object_id is not None + self._initialize_read() + elif "w" in mode: + self._initialize_write() + + @property + def object_id(self): + return self._object_id + + @property + def mode(self): + return self._mode + + def _initialize_read(self): + from cudf.core.buffer import Buffer + from cupy.cuda.memory import UnownedMemory + + self._offset = 0 + self._has_read_headers = False + self._buffers = [] + (metas, serialized), buffers = _id_to_buffers[self._object_id] + self._headers = headers = (metas.copy(), serialized) + buffer_types = [] + for buf in buffers: + if isinstance(buf, cupy.ndarray): + ptr, size = buf.data.ptr, buf.size + self._buffers.append(UnownedMemory(ptr, size, Buffer(ptr, size=size))) + buffer_types.append(["cuda", size]) + elif isinstance(buf, Buffer): + ptr, size = buf.ptr, buf.size + if size == 0: + # empty buffer cannot construct a UnownedMemory + self._buffers.append(None) + else: + self._buffers.append(UnownedMemory(ptr, size, Buffer(ptr, size))) + buffer_types.append(["cuda", size]) + else: + size = getattr(buf, "size", len(buf)) + self._buffers.append(buf) + buffer_types.append(["memory", size]) + headers[0]["buffer_types"] = buffer_types + + def _initialize_write(self): + self._had_write_headers = False + self._cur_buffer_index = 0 + self._buffers = [] + self._offset = 0 + + def read(self, size: int): + # we read cuda_header first and then read cuda buffers one by one, + # the return value's size is not exactly the specified size. 
+ from cudf.core.buffer import Buffer + from cupy.cuda import MemoryPointer + from cupy.cuda.memory import UnownedMemory + + if not self._has_read_headers: + self._has_read_headers = True + return pickle.dumps(self._headers) + if len(self._buffers) == 0: + return "" + cur_buf = self._buffers[0] + # current buf read to end + if cur_buf is None: + # empty cuda buffer + content = Buffer.empty(0) + self._offset = 0 + self._buffers.pop(0) + return content + elif size >= cur_buf.size - self._offset: + if isinstance(cur_buf, UnownedMemory): + cupy_pointer = MemoryPointer(cur_buf, self._offset) + content = Buffer(cupy_pointer.ptr, size=cur_buf.size - self._offset) + else: + content = cur_buf[self._offset : self._offset + size] + self._offset = 0 + self._buffers.pop(0) + return content + else: + if isinstance(cur_buf, UnownedMemory): + cupy_pointer = MemoryPointer(cur_buf, self._offset) + self._offset += size + return Buffer(cupy_pointer.ptr, size=size) + else: + self._offset += size + return cur_buf[self._offset, self._offset + size] + + def write(self, content): + from cupy.cuda import MemoryPointer + from cupy.cuda.memory import UnownedMemory + from rmm import DeviceBuffer + + if not self._has_write_headers: + self._headers = headers = pickle.loads(content) + buffer_types = headers[0]["buffer_types"] + for buffer_type, size in buffer_types: + if buffer_type == "cuda": + self._buffers.append(DeviceBuffer(size=size)) + else: + self._buffers.append(BytesIO()) + self._has_write_headers = True + return + + cur_buf = self._buffers[self._cur_buffer_index] + cur_buf_size = self._headers[0]["buffer_types"][self._cur_buffer_index][1] + if isinstance(cur_buf, DeviceBuffer): + cur_cupy_memory = UnownedMemory(cur_buf.ptr, cur_buf.size, cur_buf) + cupy_pointer = MemoryPointer(cur_cupy_memory, self._offset) + + if isinstance(content, bytes): + content_length = len(content) + source_mem = np.frombuffer(content, dtype="uint8").ctypes.data_as( + ctypes.c_void_p + ) + else: + source_mem = MemoryPointer( + UnownedMemory(content.ptr, content.size, content), 0 + ) + content_length = source_mem.mem.size + cupy_pointer.copy_from(source_mem, content_length) + else: + content_length = len(content) + cur_buf.write(content) + if content_length + self._offset >= cur_buf_size: + if isinstance(cur_buf, BytesIO): + self._buffers[self._cur_buffer_index] = cur_buf.getvalue() + self._cur_buffer_index += 1 + self._offset = 0 + else: + self._offset += content_length + + def _read_close(self): + self._offset = None + self._cuda_buffers = None + self._cuda_header = None + self._has_read_headers = None + + def _write_close(self): + headers = self._headers + headers[0].pop("buffer_types") + # hold cuda buffers + + _id_to_buffers[self._object_id] = headers, self._buffers + + self._has_write_headers = None + self._cur_buffer_index = None + self._cuda_buffers = None + self._cuda_header = None + self._offset = None + + def close(self): + if self._closed: + return + self._closed = True + if self._mode == "w": + self._write_close() + else: + self._read_close() + + +@register_storage_backend +class CudaStorage(StorageBackend): + name = "cuda" + is_seekable = False + + def __init__(self, size=None): + self._size = size + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + size = kwargs.pop("size", None) + if kwargs: # pragma: no cover + raise TypeError(f'CudaStorage got unexpected config: {",".join(kwargs)}') + + return dict(size=size), dict() + + @staticmethod + 
@implements(StorageBackend.teardown) + async def teardown(**kwargs): + pass + + @property + @implements(StorageBackend.level) + def level(self): + return StorageLevel.GPU + + @property + @implements(StorageBackend.size) + def size(self) -> Union[int, None]: + return self._size + + @staticmethod + def _to_cuda(obj): # pragma: no cover + if isinstance(obj, np.ndarray): + return cupy.asarray(obj) + elif isinstance(obj, pd.DataFrame): + return cudf.DataFrame.from_pandas(obj) + elif isinstance(obj, pd.Series): + return cudf.Series.from_pandas(obj) + return obj + + @implements(StorageBackend.get) + async def get(self, object_id: str, **kwargs) -> object: + from cudf.core.buffer import Buffer as CPBuffer + from rmm import DeviceBuffer + + headers, buffers = _id_to_buffers[object_id] + new_buffers = [] + for buf in buffers: + if isinstance(buf, cupy.ndarray): + new_buffers.append(DeviceBuffer(ptr=buf.data.ptr, size=buf.size)) + elif isinstance(buf, CPBuffer): + new_buffers.append(DeviceBuffer(ptr=buf.ptr, size=buf.size)) + else: + new_buffers.append(buf) + return deserialize(headers, new_buffers) + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + from cudf.core.buffer import Buffer as CPBuffer + + string_id = str(uuid.uuid4()) + headers, buffers = serialize(obj) + size = sum( + buf.size for buf in buffers if isinstance(buf, (cupy.ndarray, CPBuffer)) + ) + _id_to_buffers[string_id] = headers, buffers + return ObjectInfo(size=size, object_id=string_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id: str): + if object_id in _id_to_buffers: + del _id_to_buffers[object_id] + + @implements(StorageBackend.object_info) + async def object_info(self, object_id: str) -> ObjectInfo: + from cudf.core.buffer import Buffer as CPBuffer + + size = sum( + buf.size + for buf in _id_to_buffers[object_id][1] + if isinstance(buf, (cupy.ndarray, CPBuffer)) + ) + return ObjectInfo(size=size, object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + object_id = str(uuid.uuid4()) + cuda_writer = CudaFileObject(object_id=object_id, mode="w", size=size) + return StorageFileObject(cuda_writer, object_id=object_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + cuda_reader = CudaFileObject(mode="r", object_id=object_id) + return StorageFileObject(cuda_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: # pragma: no cover + raise NotImplementedError("Cuda storage doesn't support `list` method.") diff --git a/python/xorbits/_mars/storage/errors.py b/python/xorbits/_mars/storage/errors.py new file mode 100644 index 000000000..2fc160d7b --- /dev/null +++ b/python/xorbits/_mars/storage/errors.py @@ -0,0 +1,19 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
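A side note on the StorageLevel enum from storage/base.py above: levels are bit flags, so they can be parsed from strings, combined, and tested with the bitwise helpers defined there. A small illustration (assuming the package imports cleanly in your environment):

    from xorbits._mars.storage import StorageLevel

    combined = StorageLevel.from_str("memory | disk")   # bitwise OR of the two flags
    assert combined & StorageLevel.MEMORY               # membership test via __and__/__rand__
    assert combined & StorageLevel.DISK
    assert not combined & StorageLevel.GPU
    assert StorageLevel.MEMORY.spill_level() is StorageLevel.DISK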
+ +from ..core.base import MarsError + + +class DataNotExist(MarsError): + pass diff --git a/python/xorbits/_mars/storage/filesystem.py b/python/xorbits/_mars/storage/filesystem.py new file mode 100644 index 000000000..1b801ce7f --- /dev/null +++ b/python/xorbits/_mars/storage/filesystem.py @@ -0,0 +1,149 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid +from typing import Dict, List, Optional, Tuple + +from ..lib.aio import AioFilesystem +from ..lib.filesystem import FileSystem, get_fs +from ..serialization import AioDeserializer, AioSerializer +from ..utils import implements, mod_hash +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import StorageFileObject + + +@register_storage_backend +class FileSystemStorage(StorageBackend): + name = "filesystem" + + def __init__( + self, fs: FileSystem, root_dirs: List[str], level: StorageLevel, size: int + ): + self._fs = AioFilesystem(fs) + self._root_dirs = root_dirs + self._level = level + self._size = size + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + root_dirs = kwargs.pop("root_dirs") + level = kwargs.pop("level") + size = kwargs.pop("size", None) + fs = kwargs.pop("fs", None) + if kwargs: # pragma: no cover + raise TypeError( + f'FileSystemStorage got unexpected config: {",".join(kwargs)}' + ) + + if isinstance(root_dirs, str): + root_dirs = root_dirs.split(":") + if isinstance(level, str): + level = StorageLevel.from_str(level) + + if fs is None: + fs = get_fs(root_dirs[0]) + + for d in root_dirs: + if not fs.exists(d): + fs.mkdir(d) + params = dict(fs=fs, root_dirs=root_dirs, level=level, size=size) + return params, params + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + fs = kwargs.get("fs") + root_dirs = kwargs.get("root_dirs") + for d in root_dirs: + fs.delete(d, recursive=True) + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return self._level + + @property + @implements(StorageBackend.size) + def size(self) -> Optional[int]: + return self._size + + def _generate_path(self): + file_name = str(uuid.uuid4()) + selected_index = mod_hash(file_name, len(self._root_dirs)) + selected_dir = self._root_dirs[selected_index] + return os.path.join(selected_dir, file_name) + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + file = await self._fs.open(object_id, "rb") + async with file as f: + deserializer = AioDeserializer(f) + return await deserializer.run() + + @implements(StorageBackend.put) + async def put(self, obj, importance: int = 0) -> ObjectInfo: + serializer = AioSerializer(obj) + buffers = await serializer.run() + buffer_size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + + path = self._generate_path() + file = await 
self._fs.open(path, "wb") + async with file as f: + for buffer in buffers: + await f.write(buffer) + + return ObjectInfo(size=buffer_size, object_id=path) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + await self._fs.delete(object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + file_list = [] + for d in self._root_dirs: + file_list.extend(list(await self._fs.ls(d))) + return file_list + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + stat = await self._fs.stat(object_id) + return ObjectInfo(size=stat["size"], object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + path = self._generate_path() + file = await self._fs.open(path, "wb") + return StorageFileObject(file, file.name) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + file = await self._fs.open(object_id, "rb") + return StorageFileObject(file, file.name) + + +@register_storage_backend +class DiskStorage(FileSystemStorage): + name = "disk" + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + kwargs["level"] = StorageLevel.DISK + return await super().setup(**kwargs) diff --git a/python/xorbits/_mars/storage/plasma.py b/python/xorbits/_mars/storage/plasma.py new file mode 100644 index 000000000..bd1667361 --- /dev/null +++ b/python/xorbits/_mars/storage/plasma.py @@ -0,0 +1,287 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
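FileSystemStorage above follows the setup/put/get/teardown life cycle shared by all backends registered via register_storage_backend. A hedged end-to-end sketch using a temporary local directory; names and values other than the backend interface itself are illustrative:

    import asyncio
    import tempfile

    from xorbits._mars.storage import StorageLevel, get_storage_backend

    async def filesystem_roundtrip(obj):
        backend_cls = get_storage_backend("filesystem")
        root_dir = tempfile.mkdtemp(prefix="mars-storage-demo-")
        init_params, teardown_params = await backend_cls.setup(
            root_dirs=root_dir, level=StorageLevel.DISK
        )
        storage = backend_cls(**init_params)
        try:
            info = await storage.put(obj)        # serialize and write to a generated path
            return await storage.get(info.object_id)
        finally:
            await backend_cls.teardown(**teardown_params)

    # e.g. asyncio.run(filesystem_roundtrip({"answer": 42}))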
+ +import asyncio +import sys +from dataclasses import dataclass +from functools import lru_cache +from typing import Any, Dict, List, Optional, Tuple + +import psutil +import pyarrow as pa + +from ..resource import virtual_memory +from ..serialization import AioDeserializer, AioSerializer +from ..utils import calc_size_by_str, dataslots, implements, lazy_import +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject +from .errors import DataNotExist + +plasma = lazy_import("pyarrow.plasma", rename="plasma") +if sys.platform.startswith("win"): + plasma = None + +PAGE_SIZE = 64 * 1024 + + +class PlasmaFileObject(BufferWrappedFileObject): + def __init__( + self, + plasma_client: "plasma.PlasmaClient", + object_id: Any, + mode: str, + size: Optional[int] = None, + ): + self._plasma_client = plasma_client + self._file = None + super().__init__(object_id, mode, size=size) + + @property + def buffer(self): + return getattr(self, "_buffer", None) + + def _write_init(self): + self._buffer = buf = self._plasma_client.create(self._object_id, self._size) + file = self._file = pa.FixedSizeBufferWriter(buf) + file.set_memcopy_threads(6) + + def _read_init(self): + self._buffer = buf = self._plasma_client.get_buffers([self._object_id])[0] + self._mv = memoryview(buf) + self._size = len(buf) + + def write(self, content: bytes): + if not self._initialized: + self._write_init() + self._initialized = True + + return self._file.write(content) + + def _write_close(self): + try: + self._plasma_client.seal(self._object_id) + except plasma.PlasmaObjectNotFound: + pass + self._file = None + + def _read_close(self): + pass + + +class PlasmaStorageFileObject(StorageFileObject): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._buffer = None + + async def close(self): + self._buffer = self._file.buffer + await super().close() + + +@dataslots +@dataclass +class PlasmaObjectInfo(ObjectInfo): + buffer: memoryview = None + plasma_socket: str = None + + @classmethod + @lru_cache(5) + def _get_plasma_client(cls, socket): + return plasma.connect(socket) + + def __getstate__(self): + return self.size, self.device, self.object_id, self.plasma_socket + + def __setstate__(self, state): + self.size, self.device, self.object_id, self.plasma_socket = state + client = self._get_plasma_client(self.plasma_socket) + [self.buffer] = client.get_buffers([self.object_id]) + + +def get_actual_capacity(plasma_client: "plasma.PlasmaClient") -> int: + """ + Get actual capacity of plasma store + + Parameters + ---------- + plasma_client: PlasmaClient + Plasma client. 
+ + Returns + ------- + size: int + Actual storage size in bytes + """ + store_limit = plasma_client.store_capacity() + + left_size = store_limit + alloc_fraction = 1 + while True: + allocate_size = int(left_size * alloc_fraction / PAGE_SIZE) * PAGE_SIZE + try: + obj_id = plasma.ObjectID.from_random() + buf = [plasma_client.create(obj_id, allocate_size)] + plasma_client.seal(obj_id) + del buf[:] + break + except plasma.PlasmaStoreFull: # pragma: no cover + alloc_fraction *= 0.99 + finally: + plasma_client.evict(allocate_size) + return allocate_size + + +@register_storage_backend +class PlasmaStorage(StorageBackend): + name = "plasma" + + def __init__( + self, + plasma_socket: str = None, + plasma_directory: str = None, + capacity: int = None, + check_dir_size: bool = True, + ): + self._plasma_socket = plasma_socket + self._client = plasma.connect(plasma_socket) + self._plasma_directory = plasma_directory + self._capacity = capacity + self._check_dir_size = check_dir_size + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + loop = asyncio.get_running_loop() + store_memory = kwargs.pop("store_memory") + plasma_directory = kwargs.pop("plasma_directory", None) + check_dir_size = kwargs.pop("check_dir_size", True) + + if kwargs: + raise TypeError(f'PlasmaStorage got unexpected config: {",".join(kwargs)}') + + store_memory = int( + calc_size_by_str(store_memory, virtual_memory().total) * 0.95 + ) + plasma_store = plasma.start_plasma_store( + store_memory, plasma_directory=plasma_directory + ) + plasma_socket = (await loop.run_in_executor(None, plasma_store.__enter__))[0] + init_params = dict( + plasma_socket=plasma_socket, + plasma_directory=plasma_directory, + check_dir_size=check_dir_size, + ) + client = plasma.connect(plasma_socket) + actual_capacity = await loop.run_in_executor(None, get_actual_capacity, client) + init_params["capacity"] = actual_capacity + teardown_params = dict(plasma_store=plasma_store) + return init_params, teardown_params + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + plasma_store = kwargs.get("plasma_store") + plasma_store.__exit__(None, None, None) + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return StorageLevel.MEMORY + + @property + @implements(StorageBackend.size) + def size(self) -> Optional[int]: + return self._capacity + + def _check_plasma_limit(self, size: int): + used_size = psutil.disk_usage(self._plasma_directory).used + total = psutil.disk_usage(self._plasma_directory).total + if used_size + size > total * 0.95: # pragma: no cover + raise plasma.PlasmaStoreFull + + def _generate_object_id(self): + while True: + new_id = plasma.ObjectID.from_random() + if not self._client.contains(new_id): + return new_id + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + if not self._client.contains(object_id): # pragma: no cover + raise DataNotExist(f"Data {object_id} not exists") + + plasma_file = PlasmaFileObject(self._client, object_id, mode="r") + + async with StorageFileObject(plasma_file, object_id) as f: + deserializer = AioDeserializer(f) + return await deserializer.run() + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + object_id = self._generate_object_id() + + serializer = AioSerializer(obj) + buffers = await serializer.run() + 
buffer_size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + + plasma_file = PlasmaFileObject( + self._client, object_id, mode="w", size=buffer_size + ) + async with StorageFileObject(plasma_file, object_id) as f: + for buffer in buffers: + await f.write(buffer) + + return PlasmaObjectInfo( + size=buffer_size, + object_id=object_id, + buffer=plasma_file.buffer, + plasma_socket=self._plasma_socket, + ) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + self._client.delete([object_id]) + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + buf = self._client.get_buffers([object_id])[0] + return PlasmaObjectInfo( + size=buf.size, + object_id=object_id, + buffer=buf, + plasma_socket=self._plasma_socket, + ) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + if size is None: # pragma: no cover + raise ValueError("size must be provided for plasma backend") + + new_id = self._generate_object_id() + plasma_writer = PlasmaFileObject(self._client, new_id, size=size, mode="w") + return PlasmaStorageFileObject(plasma_writer, object_id=new_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + if not self._client.contains(object_id): # pragma: no cover + raise DataNotExist(f"Data {object_id} not exists") + plasma_reader = PlasmaFileObject(self._client, object_id, mode="r") + return PlasmaStorageFileObject(plasma_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + return list(self._client.list()) diff --git a/python/xorbits/_mars/storage/ray.py b/python/xorbits/_mars/storage/ray.py new file mode 100644 index 000000000..46b4c4db4 --- /dev/null +++ b/python/xorbits/_mars/storage/ray.py @@ -0,0 +1,248 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Dict, List, Tuple + +from ..lib import sparse +from ..metrics import Metrics, Percentile, record_time_cost_percentile +from ..oscar.debug import debug_async_timeout +from ..utils import implements, lazy_import, register_ray_serializer +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject + +ray = lazy_import("ray") + + +# TODO(fyrestone): make the SparseMatrix pickleable. 
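+# As the helper pair below suggests, a SparseMatrix travels through Ray as the
+# pair [shape, spmatrix] and is rebuilt from those two pieces on the receiving
+# side, since the object itself cannot be pickled directly yet.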
+ + +def _mars_sparse_matrix_serializer(value): + return [value.shape, value.spmatrix] + + +def _mars_sparse_matrix_deserializer(obj) -> sparse.SparseNDArray: + shape, spmatrix = obj + return sparse.matrix.SparseMatrix(spmatrix, shape=shape) + + +def _register_sparse_matrix_serializer(): + # register a custom serializer for Mars SparseMatrix + register_ray_serializer( + sparse.matrix.SparseMatrix, + serializer=_mars_sparse_matrix_serializer, + deserializer=_mars_sparse_matrix_deserializer, + ) + + +class RayFileLikeObject: + def __init__(self): + self._buffers = [] + self._size = 0 + + def write(self, content: bytes): + self._buffers.append(content) + self._size += len(content) + + def readinto(self, buffer): + read_bytes = 0 + for b in self._buffers: + read_pos = read_bytes + len(b) + buffer[read_bytes:read_pos] = b + read_bytes = read_pos + return read_bytes + + def close(self): + self._buffers.clear() + self._size = 0 + + def tell(self): + return self._size + + +class RayFileObject(BufferWrappedFileObject): + def __init__(self, object_id: Any, mode: str): + super().__init__(object_id, mode, size=0) + + def _write_init(self): + self._buffer = RayFileLikeObject() + + def _read_init(self): + self._buffer = ray.get(self._object_id) + self._mv = memoryview(self._buffer) + self._size = len(self._buffer) + + def write(self, content: bytes): + if not self._initialized: + self._write_init() + self._initialized = True + + return self._buffer.write(content) + + def _write_close(self): + worker = ray.worker.global_worker + metadata = ray.ray_constants.OBJECT_METADATA_TYPE_RAW + args = [metadata, self._buffer.tell(), self._buffer, self._object_id] + try: + worker.core_worker.put_file_like_object(*args) + except TypeError: + args.append(None) # owner_address for ray >= 1.3.0 + worker.core_worker.put_file_like_object(*args) + + def _read_close(self): + pass + + +_support_specify_owner = None + + +def support_specify_owner(): + global _support_specify_owner + if _support_specify_owner is None: + sig = inspect.signature(ray.put) + _support_specify_owner = "_owner" in sig.parameters + return _support_specify_owner + + +@register_storage_backend +class RayStorage(StorageBackend): + name = "ray" + is_seekable = False + + def __init__(self, *args, **kwargs): + self._owner_address = kwargs.get("owner") + self._owner = None # A ray actor which will own the objects put by workers. 
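+        # Each entry below is a (percentile type, gauge callback, report count)
+        # triple consumed by record_time_cost_percentile(): get/put latencies
+        # are accumulated and the P99/P95/P90 values are reported once every
+        # 1000 recorded calls.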
+ self._storage_get_metrics = [ + ( + Percentile.PercentileType.P99, + Metrics.gauge( + "mars.storage.ray.get_cost_time_p99_seconds", + "P99 time consuming in seconds to get object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P95, + Metrics.gauge( + "mars.storage.ray.get_cost_time_p95_seconds", + "P95 time consuming in seconds to get object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P90, + Metrics.gauge( + "mars.storage.ray.get_cost_time_p90_seconds", + "P90 time consuming in seconds to get object, every 1000 times report once.", + ).record, + 1000, + ), + ] + + self._storage_put_metrics = [ + ( + Percentile.PercentileType.P99, + Metrics.gauge( + "mars.storage.ray.put_cost_time_p99_seconds", + "P99 time consuming in seconds to put object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P95, + Metrics.gauge( + "mars.storage.ray.put_cost_time_p95_seconds", + "P95 time consuming in seconds to put object, every 1000 times report once.", + ).record, + 1000, + ), + ( + Percentile.PercentileType.P90, + Metrics.gauge( + "mars.storage.ray.put_cost_time_p90_seconds", + "P90 time consuming in seconds to put object, every 1000 times report once.", + ).record, + 1000, + ), + ] + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + _register_sparse_matrix_serializer() + return kwargs, dict() + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + pass + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + # TODO(fyrestone): return StorageLevel.MEMORY & StorageLevel.DISK + # if object spilling is available. + return StorageLevel.MEMORY | StorageLevel.REMOTE + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + with debug_async_timeout( + "ray_object_retrieval_timeout", + "Storage get object timeout, ObjectRef: %s", + object_id, + ): + with record_time_cost_percentile(self._storage_get_metrics): + return await object_id + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + with record_time_cost_percentile(self._storage_put_metrics): + if support_specify_owner() and self._owner_address: + if not self._owner: + self._owner = ray.get_actor(self._owner_address) + object_id = ray.put(obj, _owner=self._owner) + else: + object_id = ray.put(obj) + # We can't get the serialized bytes length from ray.put + return ObjectInfo(object_id=object_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + ray.internal.free(object_id) + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + # The performance of obtaining the object size is poor. 
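+        # So, as in put(), the returned ObjectInfo deliberately leaves the
+        # size unset.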
+ return ObjectInfo(object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + new_id = ray.ObjectRef.from_random() + ray_writer = RayFileObject(new_id, mode="w") + return StorageFileObject(ray_writer, object_id=new_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + ray_reader = RayFileObject(object_id, mode="r") + return StorageFileObject(ray_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + raise NotImplementedError("Ray storage does not support list") + + @implements(StorageBackend.fetch) + async def fetch(self, object_id): + pass diff --git a/python/xorbits/_mars/storage/shared_memory.py b/python/xorbits/_mars/storage/shared_memory.py new file mode 100644 index 000000000..24bf8fe0e --- /dev/null +++ b/python/xorbits/_mars/storage/shared_memory.py @@ -0,0 +1,224 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import random +import struct +import sys +from dataclasses import dataclass +from string import ascii_letters, digits +from typing import Any, Dict, List, Optional, Tuple + +try: + if sys.version_info[:2] >= (3, 8): + # builtin package for Python 3.8+ + from multiprocessing.shared_memory import SharedMemory + else: + # backport package for Python 3.7- + from shared_memory import SharedMemory + + class SharedMemoryForRead(SharedMemory): + def __del__(self): + # close fd only + fd = self._fd + if os.name != "nt" and fd >= 0: + os.close(fd) + +except ImportError: # pragma: no cover + # allow shared_memory package to be absent + SharedMemory = SharedMemoryForRead = None + +from ..serialization import AioDeserializer, AioSerializer +from ..utils import dataslots, implements +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject + +_is_windows: bool = sys.platform.startswith("win") +_qword_pack = struct.Struct(" Tuple[Dict, Dict]: + if kwargs: # pragma: no cover + raise TypeError( + f'SharedMemoryStorage got unexpected config: {",".join(kwargs)}' + ) + + return dict(), dict() + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + object_ids = kwargs.get("object_ids") or () + for object_id in object_ids: + try: + shm = SharedMemory(name=object_id) + shm.unlink() + await asyncio.sleep(0) + except FileNotFoundError: + pass + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return StorageLevel.MEMORY + + @classmethod + def _generate_object_id(cls): + return "".join(random.choice(ascii_letters + digits) for _ in range(30)) + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + shm_file = SharedMemoryFileObject(object_id, mode="r") + + async with 
StorageFileObject(shm_file, object_id) as f: + deserializer = AioDeserializer(f) + return await deserializer.run() + + @implements(StorageBackend.put) + async def put(self, obj, importance=0) -> ObjectInfo: + object_id = self._generate_object_id() + + serializer = AioSerializer(obj) + buffers = await serializer.run() + buffer_size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + + shm_file = SharedMemoryFileObject(object_id, mode="w", size=buffer_size) + async with StorageFileObject(shm_file, object_id) as f: + for buffer in buffers: + await f.write(buffer) + + self._object_ids.add(object_id) + if _is_windows: + return WinShmObjectInfo( + size=buffer_size, object_id=object_id, shm=shm_file.shm + ) + else: + return ObjectInfo(size=buffer_size, object_id=object_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + try: + shm = SharedMemory(name=object_id) + shm.unlink() + shm.close() + except FileNotFoundError: + if sys.platform == "win32": + # skip file not found error for windows + pass + else: # pragma: no cover + raise + try: + self._object_ids.remove(object_id) + except KeyError: # pragma: no cover + return + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + shm_file = SharedMemoryFileObject(object_id, mode="r") + + async with ShmStorageFileObject(shm_file, object_id) as f: + deserializer = AioDeserializer(f) + size = await deserializer.get_size() + if not _is_windows: + return ObjectInfo(size=size, object_id=object_id) + else: + return WinShmObjectInfo(size=size, object_id=object_id, shm=shm_file) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + if size is None: # pragma: no cover + raise ValueError("size must be provided for shared memory backend") + + new_id = self._generate_object_id() + shm_file = SharedMemoryFileObject(new_id, size=size, mode="w") + return ShmStorageFileObject(shm_file, object_id=new_id) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + shm_file = SharedMemoryFileObject(object_id, mode="r") + return ShmStorageFileObject(shm_file, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: # pragma: no cover + raise NotImplementedError("Shared memory storage does not support list") diff --git a/python/xorbits/_mars/storage/tests/__init__.py b/python/xorbits/_mars/storage/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/storage/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/storage/tests/test_base.py b/python/xorbits/_mars/storage/tests/test_base.py new file mode 100644 index 000000000..b6fca7dcd --- /dev/null +++ b/python/xorbits/_mars/storage/tests/test_base.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import StorageLevel + + +def test_storage_level(): + s = "memory" + assert StorageLevel.MEMORY == StorageLevel.from_str(s) + + s = "disk | memory" + assert StorageLevel.DISK | StorageLevel.MEMORY == StorageLevel.from_str(s) + + s = " MEMORY|REMOTE " + assert StorageLevel.MEMORY | StorageLevel.REMOTE == StorageLevel.from_str(s) diff --git a/python/xorbits/_mars/storage/tests/test_libs.py b/python/xorbits/_mars/storage/tests/test_libs.py new file mode 100644 index 000000000..047cd3397 --- /dev/null +++ b/python/xorbits/_mars/storage/tests/test_libs.py @@ -0,0 +1,330 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pkgutil +import sys +import tempfile + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse as sps + +from ...lib.filesystem import LocalFileSystem +from ...lib.sparse import SparseMatrix, SparseNDArray +from ...serialization import AioDeserializer, AioSerializer +from ...tests.core import require_cudf, require_cupy, require_ray +from ..base import StorageLevel +from ..cuda import CudaStorage +from ..filesystem import DiskStorage +from ..plasma import PlasmaStorage +from ..ray import RayStorage +from ..shared_memory import SharedMemoryStorage +from ..vineyard import VineyardStorage + +try: + import vineyard +except ImportError: + vineyard = None +try: + import ray +except ImportError: + ray = None + +require_lib = lambda x: x +params = [ + "filesystem", + "shared_memory", +] +if ( + not sys.platform.startswith("win") + and pkgutil.find_loader("pyarrow.plasma") is not None +): + params.append("plasma") +if vineyard is not None: + params.append("vineyard") +if ray is not None: + params.append("ray") + require_lib = require_ray + + +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +@pytest.fixture(params=params) +async def storage_context(ray_start_regular, request): + if request.param == "filesystem": + tempdir = tempfile.mkdtemp() + params, teardown_params = await DiskStorage.setup( + fs=LocalFileSystem(), root_dirs=[tempdir] + ) + storage = DiskStorage(**params) + assert storage.level == StorageLevel.DISK + + yield storage + + await storage.teardown(**teardown_params) + elif request.param == "plasma": + plasma_storage_size = 10 * 1024 * 1024 + if sys.platform == "darwin": + plasma_dir = "/tmp" + else: + plasma_dir = "/dev/shm" + params, teardown_params = await PlasmaStorage.setup( + 
store_memory=plasma_storage_size, + plasma_directory=plasma_dir, + check_dir_size=False, + ) + storage = PlasmaStorage(**params) + assert storage.level == StorageLevel.MEMORY + + yield storage + + await PlasmaStorage.teardown(**teardown_params) + elif request.param == "vineyard": + vineyard_size = "256M" + params, teardown_params = await VineyardStorage.setup( + vineyard_size=vineyard_size + ) + storage = VineyardStorage(**params) + assert storage.level == StorageLevel.MEMORY + + yield storage + + await VineyardStorage.teardown(**teardown_params) + elif request.param == "shared_memory": + params, teardown_params = await SharedMemoryStorage.setup() + storage = SharedMemoryStorage(**params) + assert storage.level == StorageLevel.MEMORY + + yield storage + + teardown_params["object_ids"] = storage._object_ids + await SharedMemoryStorage.teardown(**teardown_params) + elif request.param == "ray": + params, teardown_params = await RayStorage.setup() + storage = RayStorage(**params) + assert storage.level == StorageLevel.MEMORY | StorageLevel.REMOTE + + yield storage + + await RayStorage.teardown(**teardown_params) + + +def test_storage_level(): + level = StorageLevel.DISK | StorageLevel.MEMORY + assert level == StorageLevel.DISK.value | StorageLevel.MEMORY.value + + assert (StorageLevel.DISK | StorageLevel.MEMORY) & StorageLevel.DISK + assert not (StorageLevel.DISK | StorageLevel.MEMORY) & StorageLevel.GPU + + assert StorageLevel.GPU < StorageLevel.MEMORY < StorageLevel.DISK + assert StorageLevel.DISK > StorageLevel.MEMORY > StorageLevel.GPU + + +@pytest.mark.asyncio +@require_lib +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +async def test_base_operations(ray_start_regular, storage_context): + storage = storage_context + + data1 = np.random.rand(10, 10) + put_info1 = await storage.put(data1) + get_data1 = await storage.get(put_info1.object_id) + np.testing.assert_array_equal(data1, get_data1) + + info1 = await storage.object_info(put_info1.object_id) + # FIXME: remove os check when size issue fixed + assert info1.size == put_info1.size + + data2 = pd.DataFrame( + { + "col1": np.arange(10), + "col2": [f"str{i}" for i in range(10)], + "col3": np.random.rand(10), + }, + ) + put_info2 = await storage.put(data2) + get_data2 = await storage.get(put_info2.object_id) + pd.testing.assert_frame_equal(data2, get_data2) + + info2 = await storage.object_info(put_info2.object_id) + # FIXME: remove os check when size issue fixed + assert info2.size == put_info2.size + + # FIXME: remove when list functionality is ready for vineyard. 
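+    # Backends whose list() is not implemented (vineyard, shared memory, Ray)
+    # skip the listing/deletion checks below.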
+ if not isinstance(storage, (VineyardStorage, SharedMemoryStorage, RayStorage)): + num = len(await storage.list()) + assert num == 2 + await storage.delete(info2.object_id) + + # test SparseMatrix + s1 = sps.csr_matrix([[1, 0, 1], [0, 0, 1]]) + s = SparseNDArray(s1) + put_info3 = await storage.put(s) + get_data3 = await storage.get(put_info3.object_id) + assert isinstance(get_data3, SparseMatrix) + np.testing.assert_array_equal(get_data3.toarray(), s1.A) + np.testing.assert_array_equal(get_data3.todense(), s1.A) + + +@pytest.mark.asyncio +@require_lib +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +async def test_reader_and_writer(ray_start_regular, storage_context): + storage = storage_context + + if isinstance(storage, VineyardStorage): + pytest.skip( + "open_{reader,writer} in vineyard doesn't use the DEFAULT_SERIALIZATION" + ) + + # test writer and reader + t = np.random.random(10) + buffers = await AioSerializer(t).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + async with await storage.open_writer(size=size) as writer: + for buf in buffers: + await writer.write(buf) + + async with await storage.open_reader(writer.object_id) as reader: + r = await AioDeserializer(reader).run() + + np.testing.assert_array_equal(t, r) + + # test writer and reader with seek offset + t = np.random.random(10) + buffers = await AioSerializer(t).run() + size = sum(getattr(buf, "nbytes", len(buf)) for buf in buffers) + async with await storage.open_writer(size=20 + size) as writer: + await writer.write(b" " * 10) + for buf in buffers: + await writer.write(buf) + await writer.write(b" " * 10) + + async with await storage.open_reader(writer.object_id) as reader: + with pytest.raises((OSError, ValueError)): + await reader.seek(-1) + + assert 5 == await reader.seek(5) + assert 10 == await reader.seek(5, os.SEEK_CUR) + assert 10 == await reader.seek(-10 - size, os.SEEK_END) + assert 10 == await reader.tell() + r = await AioDeserializer(reader).run() + + np.testing.assert_array_equal(t, r) + + +@pytest.mark.asyncio +@require_lib +@pytest.mark.parametrize( + "ray_start_regular", [{"enable": ray is not None}], indirect=True +) +async def test_reader_and_writer_vineyard(ray_start_regular, storage_context): + storage = storage_context + + if not isinstance(storage, VineyardStorage): + pytest.skip( + "open_{reader,writer} in vineyard doesn't use the DEFAULT_SERIALIZATION" + ) + + # test writer and reader + t = np.random.random(10) + tinfo = await storage.put(t) + + # testing the roundtrip of `open_{reader,writer}`. + + buffers = [] + async with await storage.open_reader(tinfo.object_id) as reader: + while True: + buf = await reader.read() + if buf: + buffers.append(buf) + else: + break + + writer_object_id = None + async with await storage.open_writer() as writer: + for buf in buffers: + await writer.write(buf) + + # The `object_id` of `StorageFileObject` returned by `open_writer` in vineyard + # storage only available after `close` and before `__exit__` of `AioFileObject`. + # + # As `StorageFileObject.object_id` is only used for testing here, I think its + # fine to have such a hack. 
+ await writer.close() + writer_object_id = writer._file._object_id + + t2 = await storage.get(writer_object_id) + np.testing.assert_array_equal(t, t2) + + +@require_cupy +@require_cudf +@pytest.mark.asyncio +async def test_cuda_backend(): + import cudf + import cupy + + params, teardown_params = await CudaStorage.setup() + storage = CudaStorage(**params) + assert storage.level == StorageLevel.GPU + + data1 = cupy.asarray(np.random.rand(10, 10)) + put_info1 = await storage.put(data1) + get_data1 = await storage.get(put_info1.object_id) + cupy.testing.assert_array_equal(data1, get_data1) + + info1 = await storage.object_info(put_info1.object_id) + assert info1.size == put_info1.size + + data2 = cudf.DataFrame( + pd.DataFrame( + { + "col1": np.arange(10), + "col2": [f"str{i}" for i in range(10)], + "col3": np.random.rand(10), + }, + ) + ) + put_info2 = await storage.put(data2) + get_data2 = await storage.get(put_info2.object_id) + cudf.testing.assert_frame_equal(data2, get_data2) + + info2 = await storage.object_info(put_info2.object_id) + assert info2.size == put_info2.size + + await CudaStorage.teardown(**teardown_params) + + # test writer and reader + read_chunk = 100 + writer = await storage.open_writer(put_info1.size) + async with await storage.open_reader(put_info1.object_id) as reader: + while True: + content = await reader.read(read_chunk) + if content: + await writer.write(content) + else: + break + writer._file._write_close() + write_data = await storage.get(writer._file._object_id) + cupy.testing.assert_array_equal(write_data, get_data1) + + await storage.delete(put_info1.object_id) diff --git a/python/xorbits/_mars/storage/vineyard.py b/python/xorbits/_mars/storage/vineyard.py new file mode 100644 index 000000000..e90eafb8c --- /dev/null +++ b/python/xorbits/_mars/storage/vineyard.py @@ -0,0 +1,220 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
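+# Vineyard-backed in-memory storage: setup() connects to an existing vineyardd
+# through the given socket or starts a local instance, put()/get() go directly
+# through the vineyard client, and open_reader()/open_writer() stream data via
+# vineyard's pickled reader/writer (see VineyardFileObject below). A custom
+# builder/resolver pair is registered so Mars sparse matrices are stored
+# natively.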
+ +import asyncio +import logging +import sys +from io import UnsupportedOperation +from typing import Dict, List, Optional, Tuple + +from ..lib import sparse +from ..resource import virtual_memory +from ..utils import calc_size_by_str, implements, lazy_import +from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend +from .core import BufferWrappedFileObject, StorageFileObject + +vineyard = lazy_import("vineyard") +vy_data_pickle = lazy_import("vineyard.data.pickle", rename="vy_data_pickle") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") +pyarrow = lazy_import("pyarrow") + +if sys.platform.startswith("win"): + vineyard = vy_data_pickle = vy_data_utils = None + +logger = logging.getLogger(__name__) + +# Setup support for mars datatypes on vineyard + + +def mars_sparse_matrix_builder(client, value, builder, **kw): + meta = vineyard.ObjectMeta() + meta["typename"] = "vineyard::SparseMatrix<%s>" % value.dtype.name + meta["shape_"] = vy_data_utils.to_json(value.shape) + meta.add_member("spmatrix", builder.run(client, value.spmatrix, **kw)) + return client.create_metadata(meta) + + +def mars_sparse_matrix_resolver(obj, resolver) -> sparse.SparseNDArray: + meta = obj.meta + shape = vy_data_utils.from_json(meta["shape_"]) + spmatrix = resolver.run(obj.member("spmatrix")) + return sparse.matrix.SparseMatrix(spmatrix, shape=shape) + + +def _register_vineyard_matrices(): + vineyard.core.default_builder_context.register( + sparse.matrix.SparseMatrix, mars_sparse_matrix_builder + ) + vineyard.core.default_resolver_context.register( + "vineyard::SparseMatrix", mars_sparse_matrix_resolver + ) + + +class VineyardFileObject(BufferWrappedFileObject): + def __init__( + self, vineyard_client, object_id, mode: str, size: Optional[int] = None + ): + self._client = vineyard_client + self._file = None + + self._reader = None + self._writer = None + + if size is None: + size = -1 # unknown estimated size. 
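+            # A negative size only marks it as unknown; for reads the actual
+            # size is filled in from the reader's store_size in _read_init().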
+ + super().__init__(object_id, mode, size=size) + + def _read_init(self): + self._reader = vy_data_pickle.PickledReader(self._client.get(self._object_id)) + self._size = self._reader.store_size + + def _write_init(self): + self._writer = vy_data_pickle.PickledWriter(self._size) + + @property + def buffer(self): + raise UnsupportedOperation( + "VineyardFileObject doesn't support the direct 'buffer' property" + ) + + def read(self, size=-1): + if not self._initialized: + self._read_init() + self._initialized = True + return self._reader.read(size) + + def write(self, content: bytes): + if not self._initialized: + self._write_init() + self._initialized = True + return self._writer.write(content) + + def _read_close(self): + self._reader = None + + def _write_close(self): + self._writer.close() + self._object_id = self._client.put(self._writer.value) + self._writer = None + + +@register_storage_backend +class VineyardStorage(StorageBackend): + name = "vineyard" + is_seekable = False + + def __init__(self, vineyard_size: int, vineyard_socket: str = None): + _register_vineyard_matrices() + + self._size = vineyard_size + self._vineyard_socket = vineyard_socket + self._client = vineyard.connect(vineyard_socket) + + @classmethod + @implements(StorageBackend.setup) + async def setup(cls, **kwargs) -> Tuple[Dict, Dict]: + loop = asyncio.get_running_loop() + etcd_endpoints = kwargs.pop("etcd_endpoints", "127.0.0.1:2379") + etcd_prefix = kwargs.pop("etcd_prefix", "vineyard") + vineyard_size = kwargs.pop("vineyard_size", "1Gi") + vineyard_socket = kwargs.pop("vineyard_socket", None) + vineyardd_path = kwargs.pop("vineyardd_path", None) + + if kwargs: + raise TypeError( + f'VineyardStorage got unexpected config: {",".join(kwargs)}' + ) + + vineyard_size = calc_size_by_str(vineyard_size, virtual_memory().total) + if vineyard_socket is not None: # pragma: no cover + vineyard_store = None + else: + vineyard_store = vineyard.deploy.local.start_vineyardd( + etcd_endpoints, + etcd_prefix, + vineyardd_path, + vineyard_size, + vineyard_socket, + rpc=False, + ) + vineyard_socket = ( + await loop.run_in_executor(None, vineyard_store.__enter__) + )[1] + init_params = dict(vineyard_size=vineyard_size, vineyard_socket=vineyard_socket) + teardown_params = dict(vineyard_store=vineyard_store) + return init_params, teardown_params + + @staticmethod + @implements(StorageBackend.teardown) + async def teardown(**kwargs): + vineyard_store = kwargs.get("vineyard_store") + if vineyard_store is not None: + vineyard_store.__exit__(None, None, None) + + @property + @implements(StorageBackend.level) + def level(self) -> StorageLevel: + return StorageLevel.MEMORY + + @property + @implements(StorageBackend.size) + def size(self) -> Optional[int]: + return self._size + + @property + @implements(StorageBackend.backend_info) + def backend_info(self): + return { + "name": self.name, + "socket": self._vineyard_socket, + "instance_id": self._client.instance_id, + } + + @implements(StorageBackend.get) + async def get(self, object_id, **kwargs) -> object: + if kwargs: # pragma: no cover + raise NotImplementedError(f'Got unsupported args: {",".join(kwargs)}') + + return self._client.get(object_id) + + @implements(StorageBackend.put) + async def put(self, obj, importance: int = 0) -> ObjectInfo: + object_id = self._client.put(obj) + size = self._client.get_meta(object_id).nbytes + return ObjectInfo(size=size, object_id=object_id) + + @implements(StorageBackend.delete) + async def delete(self, object_id): + self._client.delete([object_id], 
deep=True) + + @implements(StorageBackend.object_info) + async def object_info(self, object_id) -> ObjectInfo: + size = self._client.get_meta(object_id).nbytes + return ObjectInfo(size=size, object_id=object_id) + + @implements(StorageBackend.open_writer) + async def open_writer(self, size=None) -> StorageFileObject: + vineyard_writer = VineyardFileObject(self._client, None, size=size, mode="w") + return StorageFileObject(vineyard_writer, object_id=None) + + @implements(StorageBackend.open_reader) + async def open_reader(self, object_id) -> StorageFileObject: + vineyard_reader = VineyardFileObject(self._client, object_id, mode="r") + return StorageFileObject(vineyard_reader, object_id=object_id) + + @implements(StorageBackend.list) + async def list(self) -> List: + # FIXME: vineyard's list_objects not equal to plasma + raise NotImplementedError diff --git a/python/xorbits/_mars/supervisor.py b/python/xorbits/_mars/supervisor.py new file mode 100644 index 000000000..183516a6c --- /dev/null +++ b/python/xorbits/_mars/supervisor.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# shortcut to support +# python -m mars.supervisor + +from .deploy.oscar.supervisor import main +from .utils import ensure_coverage + +if __name__ == "__main__": + ensure_coverage() + main() diff --git a/python/xorbits/_mars/tensor/__init__.py b/python/xorbits/_mars/tensor/__init__.py new file mode 100644 index 000000000..64c5e0539 --- /dev/null +++ b/python/xorbits/_mars/tensor/__init__.py @@ -0,0 +1,380 @@ +# isort: skip_file +# Copyright 1999-2021 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
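+# This module assembles a NumPy-like namespace for tensors: creation routines,
+# element-wise arithmetic, reductions, linalg, random, fft and the NumPy scalar
+# types are all re-exported here. Typical usage, as in the docstrings
+# throughout this package, looks roughly like:
+#
+#     import mars.tensor as mt
+#
+#     t = mt.ones((3, 3)) + mt.arange(3)
+#     mt.sum(t).execute()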
+ + +from .datasource import ( + tensor, + array, + asarray, + ascontiguousarray, + asfortranarray, + scalar, + empty, + empty_like, + ones, + ones_like, + zeros, + zeros_like, + full, + full_like, + arange, + diag, + diagflat, + eye, + identity, + linspace, + meshgrid, + indices, + tril, + triu, + fromtiledb, + fromtiledb as from_tiledb, + from_dataframe, + fromhdf5, + fromhdf5 as from_hdf5, + fromzarr, + fromzarr as from_zarr, + fromvineyard, + fromvineyard as from_vineyard, +) +from .datastore import ( + totiledb, + totiledb as to_tiledb, + tohdf5, + tohdf5 as to_hdf5, + tozarr, + tozarr as to_zarr, + tovineyard, + tovineyard as to_vineyard, +) # pylint: disable=reimported +from .base import ( + result_type, + ndim, + copyto, + transpose, + where, + broadcast_to, + broadcast_arrays, + expand_dims, + rollaxis, + swapaxes, + moveaxis, + ravel, + atleast_1d, + atleast_2d, + atleast_3d, + argwhere, + array_split, + split, + hsplit, + vsplit, + dsplit, + roll, + squeeze, + diff, + ediff1d, + flip, + flipud, + fliplr, + repeat, + tile, + isin, + searchsorted, + unique, + sort, + argsort, + partition, + argpartition, + topk, + argtopk, + copy, + trapz, + shape, + insert, + delete, + in1d, + setdiff1d, +) +from .arithmetic import ( + add, + subtract, + multiply, + divide, + truediv as true_divide, + floordiv as floor_divide, + mod, + power, + float_power, + fmod, + sqrt, + around, + round_, + round_ as round, + logaddexp, + logaddexp2, + negative, + positive, + absolute, + fabs, + absolute as abs, + rint, + sign, + degrees, + radians, + conj, + conjugate, + exp, + exp2, + log, + log2, + log10, + expm1, + log1p, + square, + cbrt, + reciprocal, + equal, + not_equal, + less, + less_equal, + greater, + greater_equal, + sin, + cos, + tan, + arcsin, + arccos, + arctan, + arctan2, + hypot, + sinh, + cosh, + tanh, + arcsinh, + arccosh, + arctanh, + deg2rad, + rad2deg, + bitand as bitwise_and, + bitor as bitwise_or, + bitxor as bitwise_xor, + invert, + invert as bitwise_not, + lshift as left_shift, + rshift as right_shift, + logical_and, + logical_or, + logical_xor, + logical_not, + maximum, + minimum, + floor, + ceil, + trunc, + remainder, + fmax, + fmin, + isfinite, + isinf, + isnan, + signbit, + copysign, + nextafter, + spacing, + clip, + isclose, + ldexp, + frexp, + modf, + angle, + isreal, + iscomplex, + real, + imag, + fix, + i0, + sinc, + nan_to_num, + tree_add, + tree_multiply, +) +from .statistics import ( + average, + bincount, + cov, + corrcoef, + digitize, + ptp, + histogram_bin_edges, + histogram, + median, + quantile, + percentile, +) +from .linalg.tensordot import tensordot +from .linalg.dot import dot +from .linalg.inner import inner, innerproduct +from .linalg.vdot import vdot +from .linalg.matmul import matmul +from .reduction import ( + sum, + nansum, + prod, + prod as product, + nanprod, + max, + max as amax, + nanmax, + min, + min as amin, + nanmin, + all, + any, + mean, + nanmean, + argmax, + nanargmax, + argmin, + nanargmin, + cumsum, + cumprod, + var, + std, + nanvar, + nanstd, + nancumsum, + nancumprod, + count_nonzero, + allclose, + array_equal, +) +from .reshape import reshape +from .merge import ( + concatenate, + stack, + hstack, + vstack, + dstack, + column_stack, + union1d, + block, + append, +) +from .indexing import ( + take, + compress, + extract, + choose, + unravel_index, + nonzero, + flatnonzero, + fill_diagonal, +) +from .rechunk import rechunk +from .einsum import einsum +from .images import imread + +# noinspection PyUnresolvedReferences +from .lib.index_tricks 
import mgrid, ogrid, ndindex, r_, c_ + +from . import random +from . import fft +from . import linalg +from . import lib +from . import special +from . import stats + +# types +from .core import Tensor + +# noinspection PyUnresolvedReferences +from ..core import ExecutableTuple + +# noinspection PyUnresolvedReferences +from numpy import ( + newaxis, + AxisError, + inf, + Inf, + NINF, + nan, + NAN, + NaN, + pi, + e, + errstate, + geterr, + seterr, +) + +# import numpy types +# noinspection PyUnresolvedReferences +from numpy import ( + dtype, + number, + inexact, + floating, + complexfloating, + integer, + signedinteger, + unsignedinteger, + character, + generic, + flexible, + int_ as int, + bool_ as bool, + float_ as float, + cfloat, + bytes_, + unicode_, + void, + object_ as object, + intc, + intp, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + uint, + float16, + float32, + float64, + double, + complex64, + complex128, + datetime64, + timedelta64, +) + +# noinspection PyUnresolvedReferences +from numpy import finfo + +# register fuse op and fetch op +from .fuse import TensorFuseChunk, TensorCpFuseChunk, TensorNeFuseChunk +from .fetch import TensorFetch, TensorFetchShuffle +from . import ufunc + +del ( + TensorFuseChunk, + TensorCpFuseChunk, + TensorNeFuseChunk, + TensorFetch, + TensorFetchShuffle, + ufunc, +) diff --git a/python/xorbits/_mars/tensor/arithmetic/__init__.py b/python/xorbits/_mars/tensor/arithmetic/__init__.py new file mode 100644 index 000000000..5f05cc287 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/__init__.py @@ -0,0 +1,313 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
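+# Importing this package re-exports every element-wise operator and, via the
+# _install() call further down, wires the Python operator protocol (__add__,
+# __mul__, comparisons, bitwise ops, ...) onto the tensor types so that
+# expressions like `a + b` dispatch to the corresponding tensor ops.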
+ +from ...core import is_build_mode +from .abs import TensorAbs, abs +from .absolute import TensorAbsolute, absolute +from .add import TensorAdd, TensorTreeAdd, add, tree_add +from .angle import TensorAngle, angle +from .arccos import TensorArccos, arccos +from .arccosh import TensorArccosh, arccosh +from .arcsin import TensorArcsin, arcsin +from .arcsinh import TensorArcsinh, arcsinh +from .arctan import TensorArctan, arctan +from .arctan2 import TensorArctan2, arctan2 +from .arctanh import TensorArctanh, arctanh +from .around import TensorAround +from .around import around +from .around import around as round_ +from .bitand import TensorBitand, bitand +from .bitor import TensorBitor, bitor +from .bitxor import TensorBitxor, bitxor +from .cbrt import TensorCbrt, cbrt +from .ceil import TensorCeil, ceil +from .clip import TensorClip, clip +from .conj import TensorConj +from .conj import conj +from .conj import conj as conjugate +from .copysign import TensorCopysign, copysign +from .cos import TensorCos, cos +from .cosh import TensorCosh, cosh +from .deg2rad import TensorDeg2rad, deg2rad +from .degrees import TensorDegrees, degrees +from .divide import TensorDivide, divide +from .equal import TensorEqual, equal +from .exp import TensorExp, exp +from .exp2 import TensorExp2, exp2 +from .expm1 import TensorExpm1, expm1 +from .fabs import TensorFabs, fabs +from .fix import TensorFix, fix +from .float_power import TensorFloatPower, float_power +from .floor import TensorFloor, floor +from .floordiv import TensorFloorDiv, floordiv +from .fmax import TensorFMax, fmax +from .fmin import TensorFMin, fmin +from .fmod import TensorFMod, fmod +from .frexp import TensorFrexp, frexp +from .greater import TensorGreaterThan, greater +from .greater_equal import TensorGreaterEqual, greater_equal +from .hypot import TensorHypot, hypot +from .i0 import TensorI0, i0 +from .imag import TensorImag, imag +from .invert import TensorInvert, invert +from .isclose import TensorIsclose, isclose +from .iscomplex import TensorIsComplex, iscomplex +from .isfinite import TensorIsFinite, isfinite +from .isinf import TensorIsInf, isinf +from .isnan import TensorIsNan, isnan +from .isreal import TensorIsReal, isreal +from .ldexp import TensorLdexp, ldexp +from .less import TensorLessThan, less +from .less_equal import TensorLessEqual, less_equal +from .log import TensorLog, log +from .log1p import TensorLog1p, log1p +from .log2 import TensorLog2, log2 +from .log10 import TensorLog10, log10 +from .logaddexp import TensorLogAddExp, logaddexp +from .logaddexp2 import TensorLogAddExp2, logaddexp2 +from .logical_and import TensorAnd, logical_and +from .logical_not import TensorNot, logical_not +from .logical_or import TensorOr, logical_or +from .logical_xor import TensorXor, logical_xor +from .lshift import TensorLshift, lshift +from .maximum import TensorMaximum, maximum +from .minimum import TensorMinimum, minimum +from .mod import TensorMod +from .mod import mod +from .mod import mod as remainder +from .modf import TensorModf, modf +from .multiply import TensorMultiply, TensorTreeMultiply, multiply, tree_multiply +from .nan_to_num import TensorNanToNum, nan_to_num +from .negative import TensorNegative, negative +from .nextafter import TensorNextafter, nextafter +from .not_equal import TensorNotEqual, not_equal +from .positive import TensorPositive, positive +from .power import TensorPower, power +from .rad2deg import TensorRad2deg, rad2deg +from .radians import TensorRadians, radians +from .real import TensorReal, real +from 
.reciprocal import TensorReciprocal, reciprocal +from .rint import TensorRint, rint +from .rshift import TensorRshift, rshift +from .setimag import TensorSetImag +from .setreal import TensorSetReal +from .sign import TensorSign, sign +from .signbit import TensorSignbit, signbit +from .sin import TensorSin, sin +from .sinc import TensorSinc, sinc +from .sinh import TensorSinh, sinh +from .spacing import TensorSpacing, spacing +from .sqrt import TensorSqrt, sqrt +from .square import TensorSquare, square +from .subtract import TensorSubtract, subtract +from .tan import TensorTan, tan +from .tanh import TensorTanh, tanh +from .truediv import TensorTrueDiv, truediv +from .trunc import TensorTrunc, trunc + + +def _wrap_iop(func): + def inner(self, *args, **kwargs): + kwargs["out"] = self + return func(self, *args, **kwargs) + + return inner + + +def _install(): + from ..core import TENSOR_TYPE, Tensor, TensorData + from ..datasource import tensor as astensor + from .add import add, radd + from .bitand import bitand, rbitand + from .bitor import bitor, rbitor + from .bitxor import bitxor, rbitxor + from .divide import divide, rdivide + from .floordiv import floordiv, rfloordiv + from .lshift import lshift, rlshift + from .mod import mod, rmod + from .multiply import multiply, rmultiply + from .power import power, rpower + from .rshift import rrshift, rshift + from .subtract import rsubtract, subtract + from .truediv import rtruediv, truediv + + def _wrap_equal(func): + def eq(x1, x2, **kwargs): + if is_build_mode(): + return astensor(x1)._equals(x2) + return func(x1, x2, **kwargs) + + return eq + + for cls in TENSOR_TYPE: + setattr(cls, "__add__", add) + setattr(cls, "__iadd__", _wrap_iop(add)) + setattr(cls, "__radd__", radd) + setattr(cls, "__sub__", subtract) + setattr(cls, "__isub__", _wrap_iop(subtract)) + setattr(cls, "__rsub__", rsubtract) + setattr(cls, "__mul__", multiply) + setattr(cls, "__imul__", _wrap_iop(multiply)) + setattr(cls, "__rmul__", rmultiply) + setattr(cls, "__div__", divide) + setattr(cls, "__idiv__", _wrap_iop(divide)) + setattr(cls, "__rdiv__", rdivide) + setattr(cls, "__truediv__", truediv) + setattr(cls, "__itruediv__", _wrap_iop(truediv)) + setattr(cls, "__rtruediv__", rtruediv) + setattr(cls, "__floordiv__", floordiv) + setattr(cls, "__ifloordiv__", _wrap_iop(floordiv)) + setattr(cls, "__rfloordiv__", rfloordiv) + setattr(cls, "__pow__", power) + setattr(cls, "__ipow__", _wrap_iop(power)) + setattr(cls, "__rpow__", rpower) + setattr(cls, "__mod__", mod) + setattr(cls, "__imod__", _wrap_iop(mod)) + setattr(cls, "__rmod__", rmod) + setattr(cls, "__lshift__", lshift) + setattr(cls, "__ilshift__", _wrap_iop(lshift)) + setattr(cls, "__rlshift__", rlshift) + setattr(cls, "__rshift__", rshift) + setattr(cls, "__irshift__", _wrap_iop(rshift)) + setattr(cls, "__rrshift__", rrshift) + + setattr(cls, "__eq__", _wrap_equal(equal)) + setattr(cls, "__ne__", not_equal) + setattr(cls, "__lt__", less) + setattr(cls, "__le__", less_equal) + setattr(cls, "__gt__", greater) + setattr(cls, "__ge__", greater_equal) + setattr(cls, "__and__", bitand) + setattr(cls, "__iand__", _wrap_iop(bitand)) + setattr(cls, "__rand__", rbitand) + setattr(cls, "__or__", bitor) + setattr(cls, "__ior__", _wrap_iop(bitor)) + setattr(cls, "__ror__", rbitor) + setattr(cls, "__xor__", bitxor) + setattr(cls, "__ixor__", _wrap_iop(bitxor)) + setattr(cls, "__rxor__", rbitxor) + + setattr(cls, "__neg__", negative) + setattr(cls, "__pos__", positive) + setattr(cls, "__abs__", abs) + setattr(cls, "__invert__", 
invert) + + setattr(Tensor, "round", round_) + setattr(Tensor, "conj", conj) + setattr(Tensor, "conjugate", conjugate) + setattr(TensorData, "round", round_) + setattr(TensorData, "conj", conj) + setattr(TensorData, "conjugate", conjugate) + + +_install() +del _install + + +BIN_UFUNC = { + add, + subtract, + multiply, + divide, + truediv, + floordiv, + power, + mod, + fmod, + logaddexp, + logaddexp2, + equal, + not_equal, + less, + less_equal, + greater, + greater_equal, + arctan2, + hypot, + bitand, + bitor, + bitxor, + lshift, + rshift, + logical_and, + logical_or, + logical_xor, + maximum, + minimum, + float_power, + remainder, + fmax, + fmin, + copysign, + nextafter, + ldexp, +} + +UNARY_UFUNC = { + square, + arcsinh, + rint, + sign, + conj, + tan, + absolute, + deg2rad, + log, + fabs, + exp2, + invert, + negative, + sqrt, + arctan, + positive, + cbrt, + log10, + sin, + rad2deg, + log2, + arcsin, + expm1, + arctanh, + cosh, + sinh, + cos, + reciprocal, + tanh, + log1p, + exp, + arccos, + arccosh, + around, + logical_not, + conjugate, + isfinite, + isinf, + isnan, + signbit, + spacing, + floor, + ceil, + trunc, + degrees, + radians, + angle, + isreal, + iscomplex, + real, + imag, + fix, + i0, + sinc, + nan_to_num, +} diff --git a/python/xorbits/_mars/tensor/arithmetic/abs.py b/python/xorbits/_mars/tensor/arithmetic/abs.py new file mode 100644 index 000000000..616ff3c29 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/abs.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorAbs(TensorUnaryOp): + _op_type_ = OperandDef.ABS + _func_name = "abs" + + +@infer_dtype(np.abs) +def abs(x, out=None, where=None, **kwargs): + r""" + Calculate the absolute value element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + absolute : Tensor + An tensor containing the absolute value of + each element in `x`. For complex input, ``a + ib``, the + absolute value is :math:`\sqrt{ a^2 + b^2 }`. 
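+
+    See Also
+    --------
+    absolute
+        An equivalent function computing the same element-wise result.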
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([-1.2, 1.2]) + >>> mt.absolute(x).execute() + array([ 1.2, 1.2]) + >>> mt.absolute(1.2 + 1j).execute() + 1.5620499351813308 + """ + op = TensorAbs(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/absolute.py b/python/xorbits/_mars/tensor/arithmetic/absolute.py new file mode 100644 index 000000000..6c72132a7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/absolute.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorAbsolute(TensorUnaryOp): + _op_type_ = OperandDef.ABSOLUTE + _func_name = "absolute" + + +@infer_dtype(np.absolute) +def absolute(x, out=None, where=None, **kwargs): + r""" + Calculate the absolute value element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + absolute : Tensor + An tensor containing the absolute value of + each element in `x`. For complex input, ``a + ib``, the + absolute value is :math:`\sqrt{ a^2 + b^2 }`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([-1.2, 1.2]) + >>> mt.absolute(x).execute() + array([ 1.2, 1.2]) + >>> mt.absolute(1.2 + 1j).execute() + 1.5620499351813308 + """ + op = TensorAbsolute(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/add.py b/python/xorbits/_mars/tensor/arithmetic/add.py new file mode 100644 index 000000000..9c6665532 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/add.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import reduce + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ..array_utils import as_same_device, device +from ..datasource import scalar +from ..utils import infer_dtype +from .core import TensorBinOp, TensorMultiOp +from .utils import TreeReductionBuilder, arithmetic_operand, tree_op_estimate_size + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorAdd(TensorBinOp): + _op_type_ = OperandDef.ADD + _func_name = "add" + + +@infer_dtype(np.add) +def add(x1, x2, out=None, where=None, **kwargs): + """ + Add arguments element-wise. + + Parameters + ---------- + x1, x2 : array_like + The tensors to be added. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + add : Tensor or scalar + The sum of `x1` and `x2`, element-wise. Returns a scalar if + both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to `x1` + `x2` in terms of tensor broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.add(1.0, 4.0).execute() + 5.0 + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.add(x1, x2).execute() + array([[ 0., 2., 4.], + [ 3., 5., 7.], + [ 6., 8., 10.]]) + """ + op = TensorAdd(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.add, reverse=True) +def radd(x1, x2, **kwargs): + op = TensorAdd(**kwargs) + return op.rcall(x1, x2) + + +class TensorTreeAdd(TensorMultiOp): + _op_type_ = OperandDef.TREE_ADD + _func_name = "add" + + ignore_empty_input = BoolField("ignore_empty_input", default=False) + + @classmethod + def _is_sparse(cls, *args): + if args and all(hasattr(x, "issparse") and x.issparse() for x in args): + return True + return False + + @classmethod + def execute(cls, ctx, op: "TensorTreeAdd"): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + if op.ignore_empty_input: + inputs = [inp for inp in inputs if not hasattr(inp, "size") or inp.size > 0] + + with device(device_id): + ctx[op.outputs[0].key] = reduce(xp.add, inputs) + + @classmethod + def estimate_size(cls, ctx, op): + tree_op_estimate_size(ctx, op) + + +@infer_dtype(lambda *args: reduce(np.add, args)) +def tree_add(*args, combine_size=None, **kwargs): + class MultiplyBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + op = TensorTreeAdd(args=inputs, **kwargs) + return op(*inputs) + + args = [scalar(a) if np.isscalar(a) else a for a in args] + return MultiplyBuilder(combine_size).build(args) diff --git a/python/xorbits/_mars/tensor/arithmetic/angle.py b/python/xorbits/_mars/tensor/arithmetic/angle.py new file mode 100644 index 000000000..d8dc729d4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/angle.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ..array_utils import as_same_device, device +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(init=False, sparse_mode="unary") +class TensorAngle(TensorUnaryOp): + _op_type_ = OperandDef.ANGLE + _func_name = "angle" + + _deg = BoolField("deg") + + @property + def deg(self): + return self._deg + + def __init__( + self, deg=None, casting="same_kind", err=None, dtype=None, sparse=False, **kw + ): + err = err if err is not None else np.geterr() + super().__init__( + _deg=deg, _casting=casting, _err=err, dtype=dtype, sparse=sparse, **kw + ) + + @classmethod + def execute(cls, ctx, op): + (z,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.angle(z, deg=op.deg) + + +@infer_dtype(np.angle) +def angle(z, deg=False, **kwargs): + """ + Return the angle of the complex argument. + + Parameters + ---------- + z : array_like + A complex number or sequence of complex numbers. + deg : bool, optional + Return angle in degrees if True, radians if False (default). + + Returns + ------- + angle : Tensor or scalar + The counterclockwise angle from the positive real axis on + the complex plane, with dtype as numpy.float64. + + See Also + -------- + arctan2 + absolute + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.angle([1.0, 1.0j, 1+1j]).execute() # in radians + array([ 0. , 1.57079633, 0.78539816]) + >>> mt.angle(1+1j, deg=True).execute() # in degrees + 45.0 + + """ + op = TensorAngle(deg=deg, **kwargs) + return op(z) diff --git a/python/xorbits/_mars/tensor/arithmetic/arccos.py b/python/xorbits/_mars/tensor/arithmetic/arccos.py new file mode 100644 index 000000000..cd3ad8527 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arccos.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArccos(TensorUnaryOp): + _op_type_ = OperandDef.ARCCOS + + _func_name = "arccos" + + +@infer_dtype(np.arccos) +def arccos(x, out=None, where=None, **kwargs): + """ + Trigonometric inverse cosine, element-wise. + + The inverse of `cos` so that, if ``y = cos(x)``, then ``x = arccos(y)``. 
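TensorAngle above forwards the deg flag straight to xp.angle at execution time, so its behaviour matches the NumPy ufunc it names. A quick NumPy check of those semantics (plain NumPy only, no Mars session involved):

    import numpy as np

    z = np.array([1.0, 1.0j, 1 + 1j])
    print(np.angle(z))                 # radians: [0.        1.57079633 0.78539816]
    print(np.angle(1 + 1j, deg=True))  # degrees: 45.0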
+ + Parameters + ---------- + x : array_like + `x`-coordinate on the unit circle. + For real arguments, the domain is [-1, 1]. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + angle : Tensor + The angle of the ray intersecting the unit circle at the given + `x`-coordinate in radians [0, pi]. If `x` is a scalar then a + scalar is returned, otherwise an array of the same shape as `x` + is returned. + + See Also + -------- + cos, arctan, arcsin + + Notes + ----- + `arccos` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `cos(z) = x`. The convention is to return + the angle `z` whose real part lies in `[0, pi]`. + + For real-valued input data types, `arccos` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arccos` is a complex analytic function that + has branch cuts `[-inf, -1]` and `[1, inf]` and is continuous from + above on the former and from below on the latter. + + The inverse `cos` is also known as `acos` or cos^-1. + + References + ---------- + M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 79. http://www.math.sfu.ca/~cbm/aands/ + + Examples + -------- + We expect the arccos of 1 to be 0, and of -1 to be pi: + >>> import mars.tensor as mt + + >>> mt.arccos([1, -1]).execute() + array([ 0. , 3.14159265]) + + Plot arccos: + + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-1, 1, num=100) + >>> plt.plot(x.execute(), mt.arccos(x).execute()) + >>> plt.axis('tight') + >>> plt.show() + """ + op = TensorArccos(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arccosh.py b/python/xorbits/_mars/tensor/arithmetic/arccosh.py new file mode 100644 index 000000000..572c89992 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arccosh.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArccosh(TensorUnaryOp): + _op_type_ = OperandDef.ARCCOSH + _func_name = "arccosh" + + +@infer_dtype(np.arccosh) +def arccosh(x, out=None, where=None, **kwargs): + """ + Inverse hyperbolic cosine, element-wise. 
+ + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + arccosh : Tensor + Array of the same shape as `x`. + + See Also + -------- + + cosh, arcsinh, sinh, arctanh, tanh + + Notes + ----- + `arccosh` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `cosh(z) = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi, pi]` and the real part in + ``[0, inf]``. + + For real-valued input data types, `arccosh` always returns real output. + For each value that cannot be expressed as a real number or infinity, it + yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arccosh` is a complex analytical function that + has a branch cut `[-inf, 1]` and is continuous from above on it. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Inverse hyperbolic function", + http://en.wikipedia.org/wiki/Arccosh + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arccosh([mt.e, 10.0]).execute() + array([ 1.65745445, 2.99322285]) + >>> mt.arccosh(1).execute() + 0.0 + """ + op = TensorArccosh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arcsin.py b/python/xorbits/_mars/tensor/arithmetic/arcsin.py new file mode 100644 index 000000000..8b05fd304 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arcsin.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArcsin(TensorUnaryOp): + _op_type_ = OperandDef.ARCSIN + _func_name = "arcsin" + + +@infer_dtype(np.arcsin) +def arcsin(x, out=None, where=None, **kwargs): + """ + Inverse sine, element-wise. + + Parameters + ---------- + x : array_like + `y`-coordinate on the unit circle. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + angle : Tensor + The inverse sine of each element in `x`, in radians and in the + closed interval ``[-pi/2, pi/2]``. If `x` is a scalar, a scalar + is returned, otherwise a tensor. + + See Also + -------- + sin, cos, arccos, tan, arctan, arctan2, emath.arcsin + + Notes + ----- + `arcsin` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that :math:`sin(z) = x`. The convention is to + return the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, *arcsin* always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arcsin` is a complex analytic function that + has, by convention, the branch cuts [-inf, -1] and [1, inf] and is + continuous from above on the former and from below on the latter. + + The inverse sine is also known as `asin` or sin^{-1}. + + References + ---------- + Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, + 10th printing, New York: Dover, 1964, pp. 79ff. + http://www.math.sfu.ca/~cbm/aands/ + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.arcsin(1).execute() # pi/2 + 1.5707963267948966 + >>> mt.arcsin(-1).execute() # -pi/2 + -1.5707963267948966 + >>> mt.arcsin(0).execute() + 0.0 + """ + op = TensorArcsin(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arcsinh.py b/python/xorbits/_mars/tensor/arithmetic/arcsinh.py new file mode 100644 index 000000000..1472c37f4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arcsinh.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArcsinh(TensorUnaryOp): + _op_type_ = OperandDef.ARCSINH + _func_name = "arcsinh" + + +@infer_dtype(np.arcsinh) +def arcsinh(x, out=None, where=None, **kwargs): + """ + Inverse hyperbolic sine element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
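As the arccos and arcsin notes above state, both functions are real-valued only on [-1, 1]; outside that interval the underlying NumPy ufuncs (and hence these operands) yield nan and raise the invalid floating-point flag. A small NumPy illustration of that domain behaviour:

    import numpy as np

    x = np.array([-1.0, 0.0, 1.0])
    print(np.arcsin(x))        # [-1.57079633  0.          1.57079633]
    print(np.arccos(x))        # [ 3.14159265  1.57079633  0.        ]

    with np.errstate(invalid="ignore"):
        print(np.arcsin(1.5))  # nan: 1.5 lies outside the real domain [-1, 1]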
+ **kwargs + + Returns + ------- + out : Tensor + Tensor of of the same shape as `x`. + + Notes + ----- + `arcsinh` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `sinh(z) = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi/2, pi/2]`. + + For real-valued input data types, `arcsinh` always returns real output. + For each value that cannot be expressed as a real number or infinity, it + returns ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arccos` is a complex analytical function that + has branch cuts `[1j, infj]` and `[-1j, -infj]` and is continuous from + the right on the former and from the left on the latter. + + The inverse hyperbolic sine is also known as `asinh` or ``sinh^-1``. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Inverse hyperbolic function", + http://en.wikipedia.org/wiki/Arcsinh + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arcsinh(mt.array([mt.e, 10.0])).execute() + array([ 1.72538256, 2.99822295]) + """ + op = TensorArcsinh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arctan.py b/python/xorbits/_mars/tensor/arithmetic/arctan.py new file mode 100644 index 000000000..1610a3880 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arctan.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArctan(TensorUnaryOp): + _op_type_ = OperandDef.ARCTAN + _func_name = "arctan" + + +@infer_dtype(np.arctan) +def arctan(x, out=None, where=None, **kwargs): + """ + Trigonometric inverse tangent, element-wise. + + The inverse of tan, so that if ``y = tan(x)`` then ``x = arctan(y)``. + + Parameters + ---------- + x : array_like + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Out has the same shape as `x`. Its real part is in + ``[-pi/2, pi/2]`` (``arctan(+/-inf)`` returns ``+/-pi/2``). + It is a scalar if `x` is a scalar. + + See Also + -------- + arctan2 : The "four quadrant" arctan of the angle formed by (`x`, `y`) + and the positive `x`-axis. + angle : Argument of complex values. 
+ + Notes + ----- + `arctan` is a multi-valued function: for each `x` there are infinitely + many numbers `z` such that tan(`z`) = `x`. The convention is to return + the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, `arctan` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arctan` is a complex analytic function that + has [`1j, infj`] and [`-1j, -infj`] as branch cuts, and is continuous + from the left on the former and from the right on the latter. + + The inverse tangent is also known as `atan` or tan^{-1}. + + References + ---------- + Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, + 10th printing, New York: Dover, 1964, pp. 79. + http://www.math.sfu.ca/~cbm/aands/ + + Examples + -------- + We expect the arctan of 0 to be 0, and of 1 to be pi/4: + >>> import mars.tensor as mt + + >>> mt.arctan([0, 1]).execute() + array([ 0. , 0.78539816]) + + >>> mt.pi/4 + 0.78539816339744828 + + Plot arctan: + + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-10, 10) + >>> plt.plot(x.execute(), mt.arctan(x).execute()) + >>> plt.axis('tight') + >>> plt.show() + """ + op = TensorArctan(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arctan2.py b/python/xorbits/_mars/tensor/arithmetic/arctan2.py new file mode 100644 index 000000000..089e0cf3d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arctan2.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorArctan2(TensorBinOp): + _op_type_ = OperandDef.ARCTAN2 + _func_name = "arctan2" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + # if x1 is sparse, will be sparse always + return True + elif np.isscalar(x1) and x1 == 0: + # x1 == 0, return sparse if x2 is + return x2.issparse() if hasattr(x2, "issparse") else False + return False + + +@infer_dtype(np.arctan2) +def arctan2(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise arc tangent of ``x1/x2`` choosing the quadrant correctly. + + The quadrant (i.e., branch) is chosen so that ``arctan2(x1, x2)`` is + the signed angle in radians between the ray ending at the origin and + passing through the point (1,0), and the ray ending at the origin and + passing through the point (`x2`, `x1`). (Note the role reversal: the + "`y`-coordinate" is the first function parameter, the "`x`-coordinate" + is the second.) By IEEE convention, this function is defined for + `x2` = +/-0 and for either or both of `x1` and `x2` = +/-inf (see + Notes for specific values). 
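The quadrant remark above is the practical reason arctan2 exists: plain arctan(y/x) cannot tell (1, 1) apart from (-1, -1), while arctan2 keeps the quadrant. A NumPy sketch of the difference, with the (y, x) argument order noted in the docstring:

    import numpy as np

    # Both points have y/x == 1, so plain arctan collapses them ...
    print(np.arctan(1 / 1), np.arctan(-1 / -1))  # 0.7853981... 0.7853981...
    # ... while arctan2 keeps the quadrant information.
    print(np.arctan2(1, 1))                      # pi/4   (first quadrant)
    print(np.arctan2(-1, -1))                    # -3*pi/4 (third quadrant)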
+ + This function is not defined for complex-valued arguments; for the + so-called argument of complex values, use `angle`. + + Parameters + ---------- + x1 : array_like, real-valued + `y`-coordinates. + x2 : array_like, real-valued + `x`-coordinates. `x2` must be broadcastable to match the shape of + `x1` or vice versa. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + angle : Tensor + Array of angles in radians, in the range ``[-pi, pi]``. + + See Also + -------- + arctan, tan, angle + + Notes + ----- + *arctan2* is identical to the `atan2` function of the underlying + C library. The following special values are defined in the C + standard: [1]_ + + ====== ====== ================ + `x1` `x2` `arctan2(x1,x2)` + ====== ====== ================ + +/- 0 +0 +/- 0 + +/- 0 -0 +/- pi + > 0 +/-inf +0 / +pi + < 0 +/-inf -0 / -pi + +/-inf +inf +/- (pi/4) + +/-inf -inf +/- (3*pi/4) + ====== ====== ================ + + Note that +0 and -0 are distinct floating point numbers, as are +inf + and -inf. + + References + ---------- + .. [1] ISO/IEC standard 9899:1999, "Programming language C." + + Examples + -------- + Consider four points in different quadrants: + >>> import mars.tensor as mt + + >>> x = mt.array([-1, +1, +1, -1]) + >>> y = mt.array([-1, -1, +1, +1]) + >>> (mt.arctan2(y, x) * 180 / mt.pi).execute() + array([-135., -45., 45., 135.]) + + Note the order of the parameters. `arctan2` is defined also when `x2` = 0 + and at several other special points, obtaining values in + the range ``[-pi, pi]``: + + >>> mt.arctan2([1., -1.], [0., 0.]).execute() + array([ 1.57079633, -1.57079633]) + >>> mt.arctan2([0., 0., mt.inf], [+0., -0., mt.inf]).execute() + array([ 0. , 3.14159265, 0.78539816]) + """ + op = TensorArctan2(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/arctanh.py b/python/xorbits/_mars/tensor/arithmetic/arctanh.py new file mode 100644 index 000000000..90aefb8ab --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/arctanh.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorArctanh(TensorUnaryOp): + _op_type_ = OperandDef.ARCTANH + _func_name = "arctanh" + + +@infer_dtype(np.arctanh) +def arctanh(x, out=None, where=None, **kwargs): + """ + Inverse hyperbolic tangent element-wise. 
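A short NumPy illustration of the summary line above: arctanh inverts tanh on the open interval (-1, 1) and grows without bound toward the endpoints. This is only a sanity check of the ufunc the operand delegates to.

    import numpy as np

    x = np.array([-2.0, 0.0, 2.0])
    print(np.arctanh(np.tanh(x)))  # [-2.  0.  2.]  (round trip on (-1, 1))
    print(np.arctanh(0.999999))    # ~7.25, diverging as the argument approaches 1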
+ + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Array of the same shape as `x`. + + Notes + ----- + `arctanh` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that `tanh(z) = x`. The convention is to return + the `z` whose imaginary part lies in `[-pi/2, pi/2]`. + + For real-valued input data types, `arctanh` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `arctanh` is a complex analytical function + that has branch cuts `[-1, -inf]` and `[1, inf]` and is continuous from + above on the former and from below on the latter. + + The inverse hyperbolic tangent is also known as `atanh` or ``tanh^-1``. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Inverse hyperbolic function", + http://en.wikipedia.org/wiki/Arctanh + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arctanh([0, -0.5]).execute() + array([ 0. , -0.54930614]) + """ + op = TensorArctanh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/around.py b/python/xorbits/_mars/tensor/arithmetic/around.py new file mode 100644 index 000000000..58c896cb9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/around.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(init=False, sparse_mode="unary") +class TensorAround(TensorUnaryOp): + _op_type_ = OperandDef.AROUND + + _decimals = Int32Field("decimals") + _func_name = "around" + + @property + def decimals(self): + return self._decimals + + def __init__( + self, + decimals=None, + casting="same_kind", + err=None, + dtype=None, + sparse=False, + **kw + ): + err = err if err is not None else np.geterr() + super().__init__( + _decimals=decimals, + _casting=casting, + _err=err, + dtype=dtype, + sparse=sparse, + **kw + ) + + @property + def ufunc_extra_params(self): + return {"decimals": self._decimals} + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.around(a, decimals=op.decimals) + + +def around(a, decimals=0, out=None): + """ + Evenly round to the given number of decimals. + + Parameters + ---------- + a : array_like + Input data. + decimals : int, optional + Number of decimal places to round to (default: 0). If + decimals is negative, it specifies the number of positions to + the left of the decimal point. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output, but the type of the output + values will be cast if necessary. + + Returns + ------- + rounded_array : Tensor + An tensor of the same type as `a`, containing the rounded values. + Unless `out` was specified, a new tensor is created. A reference to + the result is returned. + + The real and imaginary parts of complex numbers are rounded + separately. The result of rounding a float is a float. + + See Also + -------- + Tensor.round : equivalent method + + ceil, fix, floor, rint, trunc + + + Notes + ----- + For values exactly halfway between rounded decimal values, NumPy + rounds to the nearest even value. Thus 1.5 and 2.5 round to 2.0, + -0.5 and 0.5 round to 0.0, etc. Results may also be surprising due + to the inexact representation of decimal fractions in the IEEE + floating point standard [1]_ and errors introduced when scaling + by powers of ten. + + References + ---------- + .. [1] "Lecture Notes on the Status of IEEE 754", William Kahan, + http://www.cs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF + .. 
[2] "How Futile are Mindless Assessments of + Roundoff in Floating-Point Computation?", William Kahan, + http://www.cs.berkeley.edu/~wkahan/Mindless.pdf + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.around([0.37, 1.64]).execute() + array([ 0., 2.]) + >>> mt.around([0.37, 1.64], decimals=1).execute() + array([ 0.4, 1.6]) + >>> mt.around([.5, 1.5, 2.5, 3.5, 4.5]).execute() # rounds to nearest even value + array([ 0., 2., 2., 4., 4.]) + >>> mt.around([1,2,3,11], decimals=1).execute() # tensor of ints is returned + array([ 1, 2, 3, 11]) + >>> mt.around([1,2,3,11], decimals=-1).execute() + array([ 0, 0, 0, 10]) + + """ + dtype = astensor(a).dtype + op = TensorAround(decimals=decimals, dtype=dtype) + return op(a, out=out) + + +round_ = around diff --git a/python/xorbits/_mars/tensor/arithmetic/bitand.py b/python/xorbits/_mars/tensor/arithmetic/bitand.py new file mode 100644 index 000000000..6ada55559 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/bitand.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorBitand(TensorBinOp): + _op_type_ = OperandDef.BITAND + _func_name = "bitwise_and" + + +@infer_dtype(np.bitwise_and) +def bitand(x1, x2, out=None, where=None, **kwargs): + """ + Compute the bit-wise AND of two tensors element-wise. + + Computes the bit-wise AND of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``&``. + + Parameters + ---------- + x1, x2 : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + logical_and + bitwise_or + bitwise_xor + + Examples + -------- + The number 13 is represented by ``00001101``. Likewise, 17 is + represented by ``00010001``. 
The bit-wise AND of 13 and 17 is + therefore ``000000001``, or 1: + + >>> import mars.tensor as mt + + >>> mt.bitwise_and(13, 17).execute() + 1 + + >>> mt.bitwise_and(14, 13).execute() + 12 + >>> mt.bitwise_and([14,3], 13).execute() + array([12, 1]) + + >>> mt.bitwise_and([11,7], [4,25]).execute() + array([0, 1]) + >>> mt.bitwise_and(mt.array([2,5,255]), mt.array([3,14,16])).execute() + array([ 2, 4, 16]) + >>> mt.bitwise_and([True, True], [False, True]).execute() + array([False, True]) + """ + op = TensorBitand(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.bitwise_and, reverse=True) +def rbitand(x1, x2, **kwargs): + op = TensorBitand(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/bitor.py b/python/xorbits/_mars/tensor/arithmetic/bitor.py new file mode 100644 index 000000000..aeacb7f65 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/bitor.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorBitor(TensorBinOp): + _op_type_ = OperandDef.BITOR + _func_name = "bitwise_or" + + +@infer_dtype(np.bitwise_or) +def bitor(x1, x2, out=None, where=None, **kwargs): + """ + Compute the bit-wise OR of two tensors element-wise. + + Computes the bit-wise OR of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``|``. + + Parameters + ---------- + x1, x2 : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + logical_or + bitwise_and + bitwise_xor + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + The number 13 has the binaray representation ``00001101``. Likewise, + 16 is represented by ``00010000``. 
The bit-wise OR of 13 and 16 is + then ``000111011``, or 29: + + >>> import mars.tensor as mt + + >>> mt.bitwise_or(13, 16).execute() + 29 + + >>> mt.bitwise_or(32, 2).execute() + 34 + >>> mt.bitwise_or([33, 4], 1).execute() + array([33, 5]) + >>> mt.bitwise_or([33, 4], [1, 2]).execute() + array([33, 6]) + + >>> mt.bitwise_or(mt.array([2, 5, 255]), mt.array([4, 4, 4])).execute() + array([ 6, 5, 255]) + >>> (mt.array([2, 5, 255]) | mt.array([4, 4, 4])).execute() + array([ 6, 5, 255]) + >>> mt.bitwise_or(mt.array([2, 5, 255, 2147483647], dtype=mt.int32), + ... mt.array([4, 4, 4, 2147483647], dtype=mt.int32)).execute() + array([ 6, 5, 255, 2147483647]) + >>> mt.bitwise_or([True, True], [False, True]).execute() + array([ True, True]) + """ + op = TensorBitor(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.bitwise_or, reverse=True) +def rbitor(x1, x2, **kwargs): + op = TensorBitor(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/bitxor.py b/python/xorbits/_mars/tensor/arithmetic/bitxor.py new file mode 100644 index 000000000..f840890bf --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/bitxor.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorBitxor(TensorBinOp): + _op_type_ = OperandDef.BITXOR + _func_name = "bitwise_xor" + + +@infer_dtype(np.bitwise_xor) +def bitxor(x1, x2, out=None, where=None, **kwargs): + """ + Compute the bit-wise XOR of two arrays element-wise. + + Computes the bit-wise XOR of the underlying binary representation of + the integers in the input arrays. This ufunc implements the C/Python + operator ``^``. + + Parameters + ---------- + x1, x2 : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + logical_xor + bitwise_and + bitwise_or + binary_repr : + Return the binary representation of the input number as a string. + + Examples + -------- + The number 13 is represented by ``00001101``. Likewise, 17 is + represented by ``00010001``. 
The bit-wise XOR of 13 and 17 is + therefore ``00011100``, or 28: + + >>> import mars.tensor as mt + + >>> mt.bitwise_xor(13, 17).execute() + 28 + + >>> mt.bitwise_xor(31, 5).execute() + 26 + >>> mt.bitwise_xor([31,3], 5).execute() + array([26, 6]) + + >>> mt.bitwise_xor([31,3], [5,6]).execute() + array([26, 5]) + >>> mt.bitwise_xor([True, True], [False, True]).execute() + array([ True, False]) + """ + op = TensorBitxor(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.bitwise_xor, reverse=True) +def rbitxor(x1, x2, **kwargs): + op = TensorBitxor(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/cbrt.py b/python/xorbits/_mars/tensor/arithmetic/cbrt.py new file mode 100644 index 000000000..a865bfc47 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/cbrt.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCbrt(TensorUnaryOp): + _op_type_ = OperandDef.CBRT + _func_name = "cbrt" + + +@infer_dtype(np.cbrt) +def cbrt(x, out=None, where=None, **kwargs): + """ + Return the cube-root of an tensor, element-wise. + + Parameters + ---------- + x : array_like + The values whose cube-roots are required. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + An tensor of the same shape as `x`, containing the cube + cube-root of each element in `x`. + If `out` was provided, `y` is a reference to it. + + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.cbrt([1,8,27]).execute() + array([ 1., 2., 3.]) + """ + op = TensorCbrt(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/ceil.py b/python/xorbits/_mars/tensor/arithmetic/ceil.py new file mode 100644 index 000000000..218300103 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/ceil.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
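The three bitwise operands above (bitand, bitor, bitxor) correspond to the &, | and ^ operators, exactly as the NumPy ufuncs they name do. A compact NumPy check of that equivalence:

    import numpy as np

    a = np.array([13, 14, 31], dtype=np.int32)
    b = np.array([17, 13, 5], dtype=np.int32)

    assert np.array_equal(np.bitwise_and(a, b), a & b)
    assert np.array_equal(np.bitwise_or(a, b), a | b)
    assert np.array_equal(np.bitwise_xor(a, b), a ^ b)
    print(a & b, a | b, a ^ b)  # [ 1 12  5] [29 15 31] [28  3 26]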
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCeil(TensorUnaryOp): + _op_type_ = OperandDef.CEIL + _func_name = "ceil" + + +@infer_dtype(np.ceil) +def ceil(x, out=None, where=None, **kwargs): + r""" + Return the ceiling of the input, element-wise. + + The ceil of the scalar `x` is the smallest integer `i`, such that + `i >= x`. It is often denoted as :math:`\lceil x \rceil`. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The ceiling of each element in `x`, with `float` dtype. + + See Also + -------- + floor, trunc, rint + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.ceil(a).execute() + array([-1., -1., -0., 1., 2., 2., 2.]) + """ + op = TensorCeil(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/clip.py b/python/xorbits/_mars/tensor/arithmetic/clip.py new file mode 100644 index 000000000..4d5f6a75b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/clip.py @@ -0,0 +1,205 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Number + +import numpy as np + +from ... 
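ceil above is one of four closely related rounding ufuncs; floor, trunc and rint (listed in its See Also) differ mainly in how negative fractions and ties are handled. A NumPy comparison on a sample similar to the one in the ceil docstring:

    import numpy as np

    a = np.array([-1.7, -0.2, 0.2, 1.5, 2.0])
    print(np.ceil(a))   # [-1. -0.  1.  2.  2.]
    print(np.floor(a))  # [-2. -1.  0.  1.  2.]
    print(np.trunc(a))  # [-1. -0.  0.  1.  2.]
    print(np.rint(a))   # [-2. -0.  0.  2.  2.]  (ties round to even)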
import opcodes as OperandDef +from ...core import ENTITY_TYPE +from ...serialization.serializables import AnyField, KeyField +from ..array_utils import as_same_device, device +from ..core import Tensor +from ..datasource import tensor as astensor +from ..utils import broadcast_shape +from .core import TensorElementWise, TensorOperand, filter_inputs + + +class TensorClip(TensorOperand, TensorElementWise): + _op_type_ = OperandDef.CLIP + + _a = KeyField("a") + _a_min = AnyField("a_min") + _a_max = AnyField("a_max") + _out = KeyField("out") + + def __init__(self, a=None, a_min=None, a_max=None, out=None, **kw): + super().__init__(_a=a, _a_min=a_min, _a_max=a_max, _out=out, **kw) + + @property + def a(self): + return self._a + + @property + def a_min(self): + return self._a_min + + @property + def a_max(self): + return self._a_max + + @property + def out(self): + return getattr(self, "_out", None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._a = next(inputs_iter) + if isinstance(self._a_min, ENTITY_TYPE): + self._a_min = next(inputs_iter) + if isinstance(self._a_max, ENTITY_TYPE): + self._a_max = next(inputs_iter) + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + + def __call__(self, a, a_min, a_max, out=None): + a = astensor(a) + tensors = [a] + sparse = a.issparse() + + if isinstance(a_min, Number): + if a_min > 0: + sparse = False + a_min_dtype = np.array(a_min).dtype + elif a_min is not None: + a_min = astensor(a_min) + tensors.append(a_min) + if not a_min.issparse(): + sparse = False + a_min_dtype = a_min.dtype + else: + a_min_dtype = None + self._a_min = a_min + + if isinstance(a_max, Number): + if a_max < 0: + sparse = False + a_max_dtype = np.array(a_max).dtype + elif a_max is not None: + a_max = astensor(a_max) + tensors.append(a_max) + if not a_max.issparse(): + sparse = False + a_max_dtype = a_max.dtype + else: + a_max_dtype = None + self._a_max = a_max + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + dtypes = [dt for dt in [a.dtype, a_min_dtype, a_max_dtype] if dt is not None] + dtype = np.result_type(*dtypes) + # check broadcast + shape = broadcast_shape(*[t.shape for t in tensors]) + + setattr(self, "sparse", sparse) + inputs = filter_inputs([a, a_min, a_max, out]) + t = self.new_tensor(inputs, shape) + + if out is None: + setattr(self, "dtype", dtype) + return t + + # if `out` is specified, use out's dtype and shape + out_shape, out_dtype = out.shape, out.dtype + + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + inputs_iter = iter(inputs) + a = next(inputs_iter) + a_min = ( + next(inputs_iter) if isinstance(op.a_min, type(op.outputs[0])) else op.a_min + ) + a_max = ( + next(inputs_iter) if isinstance(op.a_max, type(op.outputs[0])) else op.a_max + ) + out = next(inputs_iter).copy() if op.out is not None else None + + with device(device_id): + kw = {} + if out is not None: + kw["out"] = out + ctx[op.outputs[0].key] = xp.clip(a, a_min, a_max, **kw) + + +def clip(a, a_min, a_max, out=None): + """ + Clip (limit) the values in a tensor. + + Given an interval, values outside the interval are clipped to + the interval edges. 
For example, if an interval of ``[0, 1]`` + is specified, values smaller than 0 become 0, and values larger + than 1 become 1. + + Parameters + ---------- + a : array_like + Tensor containing elements to clip. + a_min : scalar or array_like or `None` + Minimum value. If `None`, clipping is not performed on lower + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. + a_max : scalar or array_like or `None` + Maximum value. If `None`, clipping is not performed on upper + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. If `a_min` or `a_max` are array_like, then the three + arrays will be broadcasted to match their shapes. + out : Tensor, optional + The results will be placed in this tensor. It may be the input + array for in-place clipping. `out` must be of the right shape + to hold the output. Its type is preserved. + + Returns + ------- + clipped_array : Tensor + An tensor with the elements of `a`, but where values + < `a_min` are replaced with `a_min`, and those > `a_max` + with `a_max`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(10) + >>> mt.clip(a, 1, 8).execute() + array([1, 1, 2, 3, 4, 5, 6, 7, 8, 8]) + >>> a.execute() + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> mt.clip(a, 3, 6, out=a).execute() + array([3, 3, 3, 3, 4, 5, 6, 6, 6, 6]) + >>> a = mt.arange(10) + >>> a.execute() + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> mt.clip(a, [3, 4, 1, 1, 1, 4, 4, 4, 4, 4], 8).execute() + array([3, 4, 2, 3, 4, 5, 6, 7, 8, 8]) + + """ + op = TensorClip(a=a, a_min=a_min, a_max=a_max, out=out) + return op(a, a_min, a_max, out=out) diff --git a/python/xorbits/_mars/tensor/arithmetic/conj.py b/python/xorbits/_mars/tensor/arithmetic/conj.py new file mode 100644 index 000000000..4fd183a6b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/conj.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorConj(TensorUnaryOp): + _op_type_ = OperandDef.CONJ + _func_name = "conj" + + +@infer_dtype(np.conj) +def conj(x, out=None, where=None, **kwargs): + """ + Return the complex conjugate, element-wise. + + The complex conjugate of a complex number is obtained by changing the + sign of its imaginary part. + + Parameters + ---------- + x : array_like + Input value. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y : Tensor + The complex conjugate of `x`, with same dtype as `y`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.conjugate(1+2j).execute() + (1-2j) + + >>> x = mt.eye(2) + 1j * mt.eye(2) + >>> mt.conjugate(x).execute() + array([[ 1.-1.j, 0.-0.j], + [ 0.-0.j, 1.-1.j]]) + """ + op = TensorConj(**kwargs) + return op(x, out=out, where=where) + + +conjugate = conj diff --git a/python/xorbits/_mars/tensor/arithmetic/copysign.py b/python/xorbits/_mars/tensor/arithmetic/copysign.py new file mode 100644 index 000000000..db369ba2d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/copysign.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorCopysign(TensorBinOp): + _op_type_ = OperandDef.COPYSIGN + _func_name = "copysign" + + +@infer_dtype(np.copysign) +def copysign(x1, x2, out=None, where=None, **kwargs): + """ + Change the sign of x1 to that of x2, element-wise. + + If both arguments are arrays or sequences, they have to be of the same + length. If `x2` is a scalar, its sign will be copied to all elements of + `x1`. + + Parameters + ---------- + x1 : array_like + Values to change the sign of. + x2 : array_like + The sign of `x2` is copied to `x1`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + The values of `x1` with the sign of `x2`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.copysign(1.3, -1).execute() + -1.3 + >>> (1/mt.copysign(0, 1)).execute() + inf + >>> (1/mt.copysign(0, -1)).execute() + -inf + + >>> mt.copysign([-1, 0, 1], -1.1).execute() + array([-1., -0., -1.]) + >>> mt.copysign([-1, 0, 1], mt.arange(3)-1).execute() + array([-1., 0., 1.]) + """ + op = TensorCopysign(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/core.py b/python/xorbits/_mars/tensor/arithmetic/core.py new file mode 100644 index 000000000..a5180316a --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/core.py @@ -0,0 +1,788 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
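copysign above propagates the sign bit itself, which matters for signed zeros and is what distinguishes it from multiplying by a sign. A small NumPy illustration of the underlying ufunc's behaviour:

    import numpy as np

    print(np.copysign(3.0, -0.0))               # -3.0: the sign bit of -0.0 is copied
    print(np.copysign([-1.0, 0.0, 1.0], -1.1))  # [-1. -0. -1.]
    print(np.copysign(np.arange(3.0) - 1, 1.0)) # [1. 0. 1.]: magnitudes kept, sign replaced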
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ...core import ExecutableTuple +from ...serialization.serializables import ( + AnyField, + DictField, + FieldTypes, + KeyField, + ListField, + StringField, +) +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, convert_order, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import ( + broadcast_shape, + check_order, + check_out_param, + filter_inputs, + unify_chunks, +) + + +class TensorElementWise(TensorOperandMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + if len(op.inputs) > 1: + if has_unknown_shape(*op.inputs): + yield + inputs = yield from unify_chunks( + *[(input, list(range(input.ndim))[::-1]) for input in op.inputs] + ) + + chunk_shapes = [t.chunk_shape for t in inputs] + out_chunk_shape = broadcast_shape(*chunk_shapes) + + out_chunks = [list() for _ in op.outputs] + nsplits = [[np.nan] * shape for shape in out_chunk_shape] + get_index = lambda idx, t: tuple( + 0 if t.nsplits[i] == (1,) else ix for i, ix in enumerate(idx) + ) + for out_index in itertools.product(*(map(range, out_chunk_shape))): + in_chunks = [ + t.cix[get_index(out_index[-t.ndim :], t)] + if t.ndim != 0 + else t.chunks[0] + for t in inputs + ] + chunk_op = op.copy().reset_key() + chunk_shape = broadcast_shape(*(c.shape for c in in_chunks)) + chunks = chunk_op.new_chunks( + in_chunks, + shape=chunk_shape, + index=out_index, + kws=[ + {"side": str(i), "order": o.order, "dtype": o.dtype} + for i, o in enumerate(op.outputs) + ], + ) + for i, out_chunk in enumerate(chunks): + out_chunks[i].append(out_chunk) + for i, idx, s in zip(itertools.count(0), out_index, chunks[0].shape): + nsplits[i][idx] = s + + new_op = op.copy().reset_key() + kws = [] + for out_chunk, o in zip(out_chunks, op.outputs): + params = o.params.copy() + params["chunks"] = out_chunk + params["nsplits"] = nsplits + kws.append(params) + return new_op.new_tensors(list(inputs), kws=kws, output_limit=len(op.outputs)) + + +class TensorElementWiseWithInputs(TensorElementWise): + def _set_sparse(self, inputs): + raise NotImplementedError + + def _new_tileables(self, inputs, kws=None, **kw): + self._set_sparse(inputs) + return super()._new_tileables(inputs, kws=kws, **kw) + + def _new_chunks(self, inputs, kws=None, **kw): + self._set_sparse(inputs) + return super()._new_chunks(inputs, kws=kws, **kw) + + +def _handle_out_dtype(val, dtype): + if val.dtype != dtype: + return val.astype(dtype) + return val + + +class TensorBinOpMixin(TensorElementWiseWithInputs): + __slots__ = () + + def check_inputs(self, inputs): + if len(inputs) > 4: + raise ValueError( + f"Binary operand's inputs should less than or equal 4, got {len(inputs)}" + ) + + @classmethod + def _get_func(cls, xp): + func_name = getattr(cls, "_func_name") + return getattr(xp, func_name) + + @classmethod + def _execute_gpu(cls, op, xp, lhs, rhs, **kw): + if kw.get("out") is not None: + kw["out"] = xp.asarray(kw["out"]) + r = cls._get_func(xp)(lhs, rhs, **kw) + return convert_order(r, 
op.outputs[0].order.value) + + @classmethod + def _execute_cpu(cls, op, xp, lhs, rhs, **kw): + kw["order"] = op.order + if kw.get("out") is not None: + kw["out"] = np.asarray(kw["out"]) + return cls._get_func(xp)(lhs, rhs, **kw) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} if op.out is not None else {} + + inputs_iter = iter(inputs) + lhs = op.lhs if np.isscalar(op.lhs) else next(inputs_iter) + rhs = op.rhs if np.isscalar(op.rhs) else next(inputs_iter) + if op.out is not None: + kw["out"] = next(inputs_iter).copy() + if op.where is not None: + kw["where"] = next(inputs_iter) + + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, lhs, rhs, **kw) + else: + ret = cls._execute_cpu(op, xp, lhs, rhs, **kw) + ctx[op.outputs[0].key] = _handle_out_dtype(ret, op.dtype) + + +class TensorBinOp(TensorOperand, TensorBinOpMixin): + _lhs = AnyField("lhs") + _rhs = AnyField("rhs") + _out = KeyField("out") + _where = KeyField("where") + _casting = StringField("casting") + _order = StringField("order") + _err = DictField("err", FieldTypes.string, FieldTypes.string) + + def __init__(self, lhs=None, rhs=None, out=None, where=None, order=None, **kwargs): + super().__init__( + _lhs=lhs, _rhs=rhs, _out=out, _where=where, _order=order, **kwargs + ) + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def lhs(self): + return self._lhs + + @property + def rhs(self): + return self._rhs + + @property + def out(self): + return getattr(self, "_out", None) + + @property + def where(self): + return getattr(self, "_where", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + @property + def err(self): + return getattr(self, "_err", dict()) + + @classmethod + def _is_sparse(cls, x1, x2): + return False + + def _set_sparse(self, inputs): + inputs_iter = iter(inputs) + x1 = self._lhs if np.isscalar(self._lhs) else next(inputs_iter) + x2 = self._rhs if np.isscalar(self._rhs) else next(inputs_iter) + setattr(self, "sparse", self._is_sparse(x1, x2)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + self._lhs = self._lhs if np.isscalar(self._lhs) else next(inputs_iter) + self._rhs = self._rhs if np.isscalar(self._rhs) else next(inputs_iter) + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, x1, x2, out, where): + x1 = x1 if np.isscalar(x1) else astensor(x1) + x2 = x2 if np.isscalar(x2) else astensor(x2) + self._lhs = x1 + self._rhs = x2 + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + if where is True: + where = None + if where is not None: + where = astensor(where) + self._where = where + + return x1, x2, out, where + + def _calc_order(self, x1, x2, out): + if out is not None: + return out.order + + if self._order in "KA": + orders = [] + if not np.isscalar(x1): + orders.append(x1.order) + if not np.isscalar(x2): + orders.append(x2.order) + + if len(orders) == 0: + return TensorOrder.C_ORDER + elif any(order == TensorOrder.C_ORDER for order in orders): + return TensorOrder.C_ORDER + else: + return 
TensorOrder.F_ORDER + elif self._order == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + @property + def ufunc_extra_params(self): + return dict() + + def _call_tensor_ufunc(self, x1, x2, out=None, where=None): + if hasattr(x1, "__tensor_ufunc__") or hasattr(x2, "__tensor_ufunc__"): + ufunc = ( + x1.__tensor_ufunc__ + if hasattr(x1, "__tensor_ufunc__") + else x2.__tensor_ufunc__ + ) + ret = ufunc(type(self), [x1, x2], out, where, **self.ufunc_extra_params) + if ret is NotImplemented: + return + return ret + + def _call(self, x1, x2, out=None, where=None): + # check tensor ufunc, if x1 or x2 is not a tensor, e.g. Mars DataFrame + # which implements tensor ufunc, will delegate the computation + # to it if possible + ret = self._call_tensor_ufunc(x1, x2, out=out, where=where) + if ret is not None: + return ret + + x1, x2, out, where = self._process_inputs(x1, x2, out, where) + # check broadcast + x1_shape = () if np.isscalar(x1) else x1.shape + x2_shape = () if np.isscalar(x2) else x2.shape + shape = broadcast_shape(x1_shape, x2_shape) + order = self._calc_order(x1, x2, out) + + inputs = filter_inputs([x1, x2, out, where]) + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, getattr(self, "_casting")) + out_shape, out_dtype = out.shape, out.dtype + + # if `out` is specified, use out's dtype and shape + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape, order=order) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + def __call__(self, x1, x2, out=None, where=None): + return self._call(x1, x2, out=out, where=where) + + def rcall(self, x1, x2, out=None, where=None): + return self._call(x2, x1, out=out, where=where) + + +class TensorUnaryOpMixin(TensorElementWiseWithInputs): + __slots__ = () + + def check_inputs(self, inputs): + if len(inputs) > 3: + raise ValueError( + f"Binary operand's inputs should less than or equal 3, got {len(inputs)}" + ) + + @classmethod + def _get_func(cls, xp): + func_name = getattr(cls, "_func_name") + return getattr(xp, func_name) + + @classmethod + def _execute_gpu(cls, op, xp, inp, **kw): + r = cls._get_func(xp)(inp, **kw) + return convert_order(r, op.outputs[0].order.value) + + @classmethod + def _execute_cpu(cls, op, xp, inp, **kw): + if op.order != "K": + kw["order"] = op.order + return cls._get_func(xp)(inp, **kw) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} if op.out else {} + + if op.out and op.where: + inputs, kw["out"], kw["where"] = ( + inputs[:-2], + inputs[-2].copy(), + inputs[-1], + ) + elif op.out: + inputs, kw["out"] = inputs[:-1], inputs[-1].copy() + elif op.where: + inputs, kw["where"] = inputs[:-1], inputs[-1] + + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, inputs[0], **kw) + else: + ret = cls._execute_cpu(op, xp, inputs[0], **kw) + ctx[op.outputs[0].key] = _handle_out_dtype(ret, op.dtype) + + +class TensorUnaryOp(TensorOperand, TensorUnaryOpMixin): + _input = KeyField("input") + _out = KeyField("out") + _where = KeyField("where") + _casting = StringField("casting") + _order = StringField("order") + _err = DictField("err", FieldTypes.string, FieldTypes.string) + + def __init__(self, out=None, where=None, order=None, **kwargs): + super().__init__(_out=out, _where=where, _order=order, **kwargs) + if self._order is None: + 
self._order = "K" + check_order(self._order) + + @property + def input(self): + return self._input + + @property + def out(self): + return getattr(self, "_out", None) + + @property + def where(self): + return getattr(self, "_where", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + @property + def err(self): + return getattr(self, "_err", dict()) + + @classmethod + def _is_sparse(cls, x): + if hasattr(x, "issparse") and x.issparse(): + return True + else: + return False + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + self._input = next(inputs_iter) + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, x, out, where): + x = astensor(x) + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + if where is True: + where = None + if where is not None: + where = astensor(where) + self._where = where + + return x, out, where + + def _set_sparse(self, inputs): + setattr(self, "sparse", self._is_sparse(inputs[0])) + + def _calc_order(self, x, out): + if out is not None: + return out.order + + if self._order in "KA": + return x.order + elif self._order == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + @property + def ufunc_extra_params(self): + return dict() + + def _call_tensor_ufunc(self, x, out=None, where=None): + if hasattr(x, "__tensor_ufunc__"): + ret = x.__tensor_ufunc__( + type(self), [x], out, where, **self.ufunc_extra_params + ) + if ret is NotImplemented: + return + return ret + + def _call(self, x, out=None, where=None): + # check tensor ufunc, if x is not a tensor, e.g. 
Mars DataFrame + # which implements tensor ufunc, will delegate the computation + # to it if possible + ret = self._call_tensor_ufunc(x, out=out, where=where) + if ret is not None: + return ret + + x, out, where = self._process_inputs(x, out, where) + shape = x.shape + order = self._calc_order(x, out) + + inputs = filter_inputs([x, out, where]) + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, getattr(self, "_casting")) + out_shape, out_dtype = out.shape, out.dtype + + # if `out` is specified, use out's dtype and shape + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape, order=order) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + def __call__(self, x, out=None, where=None): + return self._call(x, out=out, where=where) + + +class TensorOutBinOp(TensorOperand, TensorElementWiseWithInputs): + _input = KeyField("input") + _out1 = KeyField("out1") + _out2 = KeyField("out2") + _where = KeyField("where") + _order = StringField("order") + _casting = StringField("casting") + + def __init__(self, out1=None, out2=None, where=None, order=None, **kwargs): + super().__init__(_out1=out1, _out2=out2, _where=where, _order=order, **kwargs) + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def output_limit(self): + return 2 + + @property + def input(self): + return self._input + + @property + def out1(self): + return getattr(self, "_out1", None) + + @property + def out2(self): + return getattr(self, "_out2", None) + + @property + def where(self): + return getattr(self, "_where", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + self._input = next(inputs_iter) + if getattr(self, "_out1", None) is not None: + self._out1 = next(inputs_iter) + if getattr(self, "_out2", None) is not None: + self._out2 = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, x, out1, out2, where): + x = astensor(x) + + if out1 is not None: + if isinstance(out1, Tensor): + self._out1 = out1 + else: + raise TypeError( + f"out1 should be Tensor object, got {type(out1)} instead" + ) + if out2 is not None: + if isinstance(out2, Tensor): + self._out2 = out2 + else: + raise TypeError( + f"out2 should be Tensor object, got {type(out2)} instead" + ) + if where is True: + where = None + if where is not None: + where = astensor(where) + self._where = where + + return x, out1, out2, where + + @classmethod + def _is_sparse(cls, x): + return False + + def _set_sparse(self, inputs): + setattr(self, "sparse", self._is_sparse(inputs[0])) + + @property + def _fun(self): + raise NotImplementedError + + def _calc_order(self, x, out): + if out is not None: + return out.order + + if self._order in "KA": + return x.order + elif self._order == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + def _call(self, x, out1=None, out2=None, out=None, where=None): + dtype = [r.dtype for r in self._fun(np.empty(1, dtype=x.dtype))] + + out = out or (None, None) + out1 = out1 or out[0] + out2 = out2 or out[1] + x, out1, out2, where = self._process_inputs(x, out1, out2, where) + shape = x.shape + order1 = self._calc_order(x, out1) + order2 = self._calc_order(x, out2) + + inputs = filter_inputs([x, out1, out2, where]) + t1, t2 
= self.new_tensors( + inputs, + shape, + kws=[ + {"order": order1, "dtype": dtype[0], "side": "left"}, + {"order": order2, "dtype": dtype[1], "side": "right"}, + ], + ) + + if out1 is None and out2 is None: + return ExecutableTuple([t1, t2]) + + if out1 is not None: + check_out_param(out1, t1, getattr(self, "_casting")) + out1_shape, out1_dtype = out1.shape, out1.dtype + else: + out1_shape, out1_dtype = t1.shape, t1.dtype + if out2 is not None: + check_out_param(out2, t2, getattr(self, "_casting")) + out2_shape, out2_dtype = out2.shape, out2.dtype + else: + out2_shape, out2_dtype = t2.shape, t2.dtype + # if `out` is specified, use out's dtype and shape + if t1.shape != out1_shape or t2.shape != out2_shape: + t1, t2 = self.new_tensor( + inputs, + [out1_shape, out2_shape], + kws=[ + {"order": order1, "dtype": out1_dtype}, + {"order": order2, "dtype": out2_dtype}, + ], + ) + + if out1 is not None: + out1.data = t1.data + else: + out1 = t1 + if out2 is not None: + out2.data = t2.data + else: + out2 = t2 + return ExecutableTuple([out1, out2]) + + def __call__(self, x, out1=None, out2=None, out=None, where=None): + return self._call(x, out1=out1, out2=out2, out=out, where=where) + + +class TensorMultiOp(TensorElementWiseWithInputs, TensorOperand): + _args = ListField("args") + _out = KeyField("out") + _where = KeyField("where") + _casting = StringField("casting") + _order = StringField("order") + _err = DictField("err", FieldTypes.string, FieldTypes.string) + + def __init__( + self, + args=None, + out=None, + where=None, + casting=None, + order=None, + err=None, + **kwargs, + ): + super().__init__( + _args=args, + _out=out, + _where=where, + _order=order, + _casting=casting, + _er=err, + **kwargs, + ) + if self._casting is None: + self._casting = "same_kind" + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def args(self): + return getattr(self, "_args", None) + + @property + def out(self): + return getattr(self, "_out", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def casting(self): + return getattr(self, "_casting", None) + + @property + def err(self): + return getattr(self, "_err", dict()) + + @classmethod + def _is_sparse(cls, *args): + return False + + def _set_sparse(self, inputs): + inputs_iter = iter(inputs or ()) + args = list(self._args) + for idx in range(len(self._args)): + if not np.isscalar(self._args[idx]): + args[idx] = next(inputs_iter) + setattr(self, "sparse", self._is_sparse(*args)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(inputs or ()) + + args = list(self._args) + for idx in range(len(args)): + if not np.isscalar(args[idx]): + args[idx] = next(inputs_iter) + self._args = args + + if getattr(self, "_out", None) is not None: + self._out = next(inputs_iter) + if getattr(self, "_where", None) is not None: + self._where = next(inputs_iter) + + def _process_inputs(self, *args, out=None): + self._args = [a if np.isscalar(a) else astensor(a) for a in args] + + if out is not None: + if isinstance(out, Tensor): + self._out = out + else: + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + return args + (out,) + + def __call__(self, *args, out=None): + proc_inputs_results = self._process_inputs(*args, out=out) + args = proc_inputs_results[:-1] + (out,) = proc_inputs_results[-1:] + # check broadcast + shapes = [() if np.isscalar(a) else a.shape for a in self._args] + shape = broadcast_shape(*shapes) + order = out.order if out is 
not None else None + + inputs = filter_inputs(list(args) + [out]) + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, getattr(self, "_casting")) + out_shape, out_dtype = out.shape, out.dtype + + # if `out` is specified, use out's dtype and shape + if t.shape != out_shape: + t = self.new_tensor(inputs, out_shape, order=order) + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out diff --git a/python/xorbits/_mars/tensor/arithmetic/cos.py b/python/xorbits/_mars/tensor/arithmetic/cos.py new file mode 100644 index 000000000..afc6f60f2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/cos.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCos(TensorUnaryOp): + _op_type_ = OperandDef.COS + _func_name = "cos" + + +@infer_dtype(np.cos) +def cos(x, out=None, where=None, **kwargs): + """ + Cosine element-wise. + + Parameters + ---------- + x : array_like + Input tensor in radians. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding cosine values. + + Notes + ----- + If `out` is provided, the function writes the result into it, + and returns a reference to `out`. (See Examples) + + References + ---------- + M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions. + New York, NY: Dover, 1972. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.cos(mt.array([0, mt.pi/2, mt.pi])).execute() + array([ 1.00000000e+00, 6.12303177e-17, -1.00000000e+00]) + >>> + >>> # Example of providing the optional output parameter + >>> out1 = mt.empty(1) + >>> out2 = mt.cos([0.1], out1) + >>> out2 is out1 + True + >>> + >>> # Example of ValueError due to provision of shape mis-matched `out` + >>> mt.cos(mt.zeros((3,3)),mt.zeros((2,2))) + Traceback (most recent call last): + File "", line 1, in + ValueError: operands could not be broadcast together with shapes (3,3) (2,2) + """ + op = TensorCos(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/cosh.py b/python/xorbits/_mars/tensor/arithmetic/cosh.py new file mode 100644 index 000000000..d34d40150 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/cosh.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. 
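+# The TensorBinOp / TensorUnaryOp machinery in core.py above resolves the output
+# shape with broadcast_shape() and, when out= / where= are supplied, forwards
+# them to the underlying NumPy ufunc chunk by chunk (see TensorBinOpMixin.execute
+# and TensorBinOp._call).  A rough sketch of the user-visible behaviour, using
+# only functions that appear in this package:
+#
+#     import mars.tensor as mt
+#     a = mt.ones((3, 1))
+#     b = mt.ones(4)
+#     mt.copysign(a, b).shape       # (3, 4) -- NumPy-style broadcasting
+#     out = mt.zeros((3, 4))
+#     mt.copysign(a, b, out=out)    # result is written into `out`, which is returned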
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorCosh(TensorUnaryOp): + _op_type_ = OperandDef.COSH + _func_name = "cosh" + + +@infer_dtype(np.cosh) +def cosh(x, out=None, where=None, **kwargs): + """ + Hyperbolic cosine, element-wise. + + Equivalent to ``1/2 * (mt.exp(x) + mt.exp(-x))`` and ``mt.cos(1j*x)``. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Output array of same shape as `x`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.cosh(0).execute() + 1.0 + + The hyperbolic cosine describes the shape of a hanging cable: + + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-4, 4, 1000) + >>> plt.plot(x.execute(), mt.cosh(x).execute()) + >>> plt.show() + """ + op = TensorCosh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/deg2rad.py b/python/xorbits/_mars/tensor/arithmetic/deg2rad.py new file mode 100644 index 000000000..90ffd8136 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/deg2rad.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorDeg2rad(TensorUnaryOp): + _op_type_ = OperandDef.DEG2RAD + _func_name = "deg2rad" + + +@infer_dtype(np.deg2rad) +def deg2rad(x, out=None, where=None, **kwargs): + """ + Convert angles from degrees to radians. + + Parameters + ---------- + x : array_like + Angles in degrees. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. 
If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding angle in radians. + + See Also + -------- + rad2deg : Convert angles from radians to degrees. + unwrap : Remove large jumps in angle by wrapping. + + Notes + ----- + ``deg2rad(x)`` is ``x * pi / 180``. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.deg2rad(180).execute() + 3.1415926535897931 + """ + op = TensorDeg2rad(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/degrees.py b/python/xorbits/_mars/tensor/arithmetic/degrees.py new file mode 100644 index 000000000..6b022fb1f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/degrees.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorDegrees(TensorUnaryOp): + _op_type_ = OperandDef.DEGREES + _func_name = "degrees" + + +@infer_dtype(np.degrees) +def degrees(x, out=None, where=None, **kwargs): + """ + Convert angles from radians to degrees. + + Parameters + ---------- + x : array_like + Input tensor in radians. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor of floats + The corresponding degree values; if `out` was supplied this is a + reference to it. 
+ + See Also + -------- + rad2deg : equivalent function + + Examples + -------- + Convert a radian array to degrees + + >>> import mars.tensor as mt + + >>> rad = mt.arange(12.)*mt.pi/6 + >>> mt.degrees(rad).execute() + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., + 270., 300., 330.]) + + >>> out = mt.zeros((rad.shape)) + >>> r = mt.degrees(out) + >>> mt.all(r == out).execute() + True + """ + op = TensorDegrees(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/divide.py b/python/xorbits/_mars/tensor/arithmetic/divide.py new file mode 100644 index 000000000..d166d13bb --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/divide.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorDivide(TensorBinOp): + _op_type_ = OperandDef.DIV + _func_name = "divide" + + @classmethod + def _is_sparse(cls, x1, x2): + if not np.isscalar(x1) and not np.isscalar(x2): + return False + if hasattr(x1, "issparse") and x1.issparse(): + if x2 != 0: + return True + else: + raise ZeroDivisionError("float division by zero") + + +@infer_dtype(np.divide) +def divide(x1, x2, out=None, where=None, **kwargs): + """ + Divide arguments element-wise. + + Parameters + ---------- + x1 : array_like + Dividend tensor. + x2 : array_like + Divisor tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + The quotient `x1/x2`, element-wise. Returns a scalar if both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to `x1` / `x2` in terms of array-broadcasting. + + Behavior on division by zero can be changed using `seterr`. + + In Python 2, when both `x1` and `x2` are of an integer type, `divide` will behave like `floor_divide`. + In Python 3, it behaves like `true_divide`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.divide(2.0, 4.0).execute() + 0.5 + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.divide(x1, x2).execute() + array([[ NaN, 1. , 1. ], + [ Inf, 4. , 2.5], + [ Inf, 7. , 4. 
]]) + Note the behavior with integer types (Python 2 only): + >>> mt.divide(2, 4).execute() + 0 + >>> mt.divide(2, 4.).execute() + 0.5 + Division by zero always yields zero in integer arithmetic (again, Python 2 only), + and does not raise an exception or a warning: + >>> mt.divide(mt.array([0, 1], dtype=int), mt.array([0, 0], dtype=int)).execute() + array([0, 0]) + Division by zero can, however, be caught using seterr: + >>> old_err_state = mt.seterr(divide='raise') + >>> mt.divide(1, 0).execute() + Traceback (most recent call last): + ... + FloatingPointError: divide by zero encountered in divide + >>> ignored_states = mt.seterr(**old_err_state) + >>> mt.divide(1, 0).execute() + 0 + """ + op = TensorDivide(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.divide, reverse=True) +def rdivide(x1, x2, **kwargs): + op = TensorDivide(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/equal.py b/python/xorbits/_mars/tensor/arithmetic/equal.py new file mode 100644 index 000000000..a77fcb37f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/equal.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorEqual(TensorBinOp): + _op_type_ = OperandDef.EQ + _func_name = "equal" + + +@inject_dtype(np.bool_) +def equal(x1, x2, out=None, where=None, **kwargs): + """ + Return (x1 == x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors of the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. + + Returns + ------- + out : Tensor or bool + Output tensor of bools, or a single bool if x1 and x2 are scalars. + + See Also + -------- + not_equal, greater_equal, less_equal, greater, less + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.equal([0, 1, 3], mt.arange(3)).execute() + array([ True, True, False]) + + What is compared are values, not types. 
So an int (1) and a tensor of + length one can evaluate as True: + + >>> mt.equal(1, mt.ones(1)) + array([ True]) + """ + + op = TensorEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/exp.py b/python/xorbits/_mars/tensor/arithmetic/exp.py new file mode 100644 index 000000000..e722339e2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/exp.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorExp(TensorUnaryOp): + _op_type_ = OperandDef.EXP + _func_name = "exp" + + +@infer_dtype(np.exp) +def exp(x, out=None, where=None, **kwargs): + r""" + Calculate the exponential of all elements in the input tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. + + Returns + ------- + out : Tensor + Output tensor, element-wise exponential of `x`. + + See Also + -------- + expm1 : Calculate ``exp(x) - 1`` for all elements in the array. + exp2 : Calculate ``2**x`` for all elements in the array. + + Notes + ----- + The irrational number ``e`` is also known as Euler's number. It is + approximately 2.718281, and is the base of the natural logarithm, + ``ln`` (this means that, if :math:`x = \ln y = \log_e y`, + then :math:`e^x = y`. For real input, ``exp(x)`` is always positive. + + For complex arguments, ``x = a + ib``, we can write + :math:`e^x = e^a e^{ib}`. The first term, :math:`e^a`, is already + known (it is the real argument, described above). The second term, + :math:`e^{ib}`, is :math:`\cos b + i \sin b`, a function with + magnitude 1 and a periodic phase. + + References + ---------- + .. [1] Wikipedia, "Exponential function", + http://en.wikipedia.org/wiki/Exponential_function + .. [2] M. Abramovitz and I. A. Stegun, "Handbook of Mathematical Functions + with Formulas, Graphs, and Mathematical Tables," Dover, 1964, p. 
69, + http://www.math.sfu.ca/~cbm/aands/page_69.htm + + Examples + -------- + Plot the magnitude and phase of ``exp(x)`` in the complex plane: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> x = mt.linspace(-2*mt.pi, 2*mt.pi, 100) + >>> xx = x + 1j * x[:, mt.newaxis] # a + ib over complex plane + >>> out = mt.exp(xx) + + >>> plt.subplot(121) + >>> plt.imshow(mt.abs(out).execute(), + ... extent=[-2*mt.pi, 2*mt.pi, -2*mt.pi, 2*mt.pi], cmap='gray') + >>> plt.title('Magnitude of exp(x)') + + >>> plt.subplot(122) + >>> plt.imshow(mt.angle(out).execute(), + ... extent=[-2*mt.pi, 2*mt.pi, -2*mt.pi, 2*mt.pi], cmap='hsv') + >>> plt.title('Phase (angle) of exp(x)') + >>> plt.show() + """ + op = TensorExp(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/exp2.py b/python/xorbits/_mars/tensor/arithmetic/exp2.py new file mode 100644 index 000000000..75a4bb5e3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/exp2.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorExp2(TensorUnaryOp): + _op_type_ = OperandDef.EXP2 + _func_name = "exp2" + + +@infer_dtype(np.exp2) +def exp2(x, out=None, where=None, **kwargs): + """ + Calculate `2**p` for all `p` in the input tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Element-wise 2 to the power `x`. + + See Also + -------- + power + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.exp2([2, 3]).execute() + array([ 4., 8.]) + """ + op = TensorExp2(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/expm1.py b/python/xorbits/_mars/tensor/arithmetic/expm1.py new file mode 100644 index 000000000..fa8594c70 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/expm1.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
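+# Note on output dtypes: most wrappers in this package derive the result dtype
+# from the corresponding NumPy ufunc via @infer_dtype (np.exp, np.exp2, ... as
+# above), roughly by evaluating the ufunc on a tiny dummy input, while a few
+# force a fixed dtype via @inject_dtype (e.g. `equal` always yields bool).
+# Illustration in plain NumPy terms:
+#
+#     import numpy as np
+#     np.exp(np.int64(1)).dtype    # float64 -- what @infer_dtype(np.exp) picks up
+#     np.equal(1, 1).dtype         # bool    -- what @inject_dtype(np.bool_) enforces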
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorExpm1(TensorUnaryOp): + _op_type_ = OperandDef.EXPM1 + _func_name = "expm1" + + +@infer_dtype(np.expm1) +def expm1(x, out=None, where=None, **kwargs): + """ + Calculate ``exp(x) - 1`` for all elements in the tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Element-wise exponential minus one: ``out = exp(x) - 1``. + + See Also + -------- + log1p : ``log(1 + x)``, the inverse of expm1. + + + Notes + ----- + This function provides greater precision than ``exp(x) - 1`` + for small values of ``x``. + + Examples + -------- + The true value of ``exp(1e-10) - 1`` is ``1.00000000005e-10`` to + about 32 significant digits. This example shows the superiority of + expm1 in this case. + + >>> import mars.tensor as mt + + >>> mt.expm1(1e-10).execute() + 1.00000000005e-10 + >>> (mt.exp(1e-10) - 1).execute() + 1.000000082740371e-10 + """ + op = TensorExpm1(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fabs.py b/python/xorbits/_mars/tensor/arithmetic/fabs.py new file mode 100644 index 000000000..5e2f22018 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fabs.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorFabs(TensorUnaryOp): + _op_type_ = OperandDef.FABS + _func_name = "fabs" + + +@infer_dtype(np.fabs) +def fabs(x, out=None, where=None, **kwargs): + """ + Compute the absolute values element-wise. + + This function returns the absolute values (positive magnitude) of the + data in `x`. Complex values are not handled, use `absolute` to find the + absolute values of complex data. 
+ + Parameters + ---------- + x : array_like + The tensor of numbers for which the absolute values are required. If + `x` is a scalar, the result `y` will also be a scalar. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The absolute values of `x`, the returned values are always floats. + + See Also + -------- + absolute : Absolute values including `complex` types. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fabs(-1).execute() + 1.0 + >>> mt.fabs([-1.2, 1.2]).execute() + array([ 1.2, 1.2]) + """ + op = TensorFabs(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fix.py b/python/xorbits/_mars/tensor/arithmetic/fix.py new file mode 100644 index 000000000..c702b012b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fix.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorFix(TensorUnaryOp): + _op_type_ = OperandDef.FIX + _func_name = "fix" + + +@infer_dtype(np.fix) +def fix(x, out=None, **kwargs): + """ + Round to nearest integer towards zero. + + Round a tensor of floats element-wise to nearest integer towards zero. + The rounded values are returned as floats. + + Parameters + ---------- + x : array_like + An tensor of floats to be rounded + out : Tensor, optional + Output tensor + + Returns + ------- + out : Tensor of floats + The array of rounded numbers + + See Also + -------- + trunc, floor, ceil + around : Round to given number of decimals + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fix(3.14).execute() + 3.0 + >>> mt.fix(3).execute() + 3.0 + >>> mt.fix([2.1, 2.9, -2.1, -2.9]).execute() + array([ 2., 2., -2., -2.]) + + """ + op = TensorFix(**kwargs) + return op(x, out=out) diff --git a/python/xorbits/_mars/tensor/arithmetic/float_power.py b/python/xorbits/_mars/tensor/arithmetic/float_power.py new file mode 100644 index 000000000..02f8c4655 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/float_power.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
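+# As the fabs docstring above notes, fabs always returns floats and does not
+# accept complex input; `absolute` is the complex-capable variant.  A short
+# illustration in plain NumPy terms:
+#
+#     import numpy as np
+#     np.fabs([-3, 4])      # array([3., 4.])  -- integer input comes back as floats
+#     np.abs(3 - 4j)        # 5.0              -- absolute handles complex values
+#     # np.fabs(3 - 4j)     # would raise TypeError: fabs is real-only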
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFloatPower(TensorBinOp): + _op_type_ = OperandDef.FLOAT_POWER + _func_name = "float_power" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@inject_dtype(np.float64) +def float_power(x1, x2, out=None, where=None, **kwargs): + """ + First tensor elements raised to powers from second array, element-wise. + + Raise each base in `x1` to the positionally-corresponding power in `x2`. + `x1` and `x2` must be broadcastable to the same shape. This differs from + the power function in that integers, float16, and float32 are promoted to + floats with a minimum precision of float64 so that the result is always + inexact. The intent is that the function will return a usable result for + negative powers and seldom overflow for positive powers. + + Parameters + ---------- + x1 : array_like + The bases. + x2 : array_like + The exponents. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The bases in `x1` raised to the exponents in `x2`. + + See Also + -------- + power : power function that preserves type + + Examples + -------- + Cube each element in a list. + + >>> import mars.tensor as mt + + >>> x1 = range(6) + >>> x1 + [0, 1, 2, 3, 4, 5] + >>> mt.float_power(x1, 3).execute() + array([ 0., 1., 8., 27., 64., 125.]) + + Raise the bases to different exponents. + + >>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0] + >>> mt.float_power(x1, x2).execute() + array([ 0., 1., 8., 27., 16., 5.]) + + The effect of broadcasting. + + >>> x2 = mt.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]]) + >>> x2.execute() + array([[1, 2, 3, 3, 2, 1], + [1, 2, 3, 3, 2, 1]]) + >>> mt.float_power(x1, x2).execute() + array([[ 0., 1., 8., 27., 16., 5.], + [ 0., 1., 8., 27., 16., 5.]]) + """ + op = TensorFloatPower(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/floor.py b/python/xorbits/_mars/tensor/arithmetic/floor.py new file mode 100644 index 000000000..3b402ab98 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/floor.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
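+# float_power above differs from `power` in that its inputs are promoted to at
+# least float64, so integer bases with negative integer exponents still produce
+# a usable result.  Roughly, in NumPy terms:
+#
+#     import numpy as np
+#     np.float_power(2, -2)    # 0.25, dtype float64
+#     # np.power(2, -2)        # raises: integers to negative integer powers are not allowed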
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorFloor(TensorUnaryOp): + _op_type_ = OperandDef.FLOOR + _func_name = "floor" + + +@infer_dtype(np.floor) +def floor(x, out=None, where=None, **kwargs): + r""" + Return the floor of the input, element-wise. + + The floor of the scalar `x` is the largest integer `i`, such that + `i <= x`. It is often denoted as :math:`\lfloor x \rfloor`. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The floor of each element in `x`. + + See Also + -------- + ceil, trunc, rint + + Notes + ----- + Some spreadsheet programs calculate the "floor-towards-zero", in other + words ``floor(-2.5) == -2``. NumPy instead uses the definition of + `floor` where `floor(-2.5) == -3`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.floor(a).execute() + array([-2., -2., -1., 0., 1., 1., 2.]) + """ + op = TensorFloor(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/floordiv.py b/python/xorbits/_mars/tensor/arithmetic/floordiv.py new file mode 100644 index 000000000..6b680ea5a --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/floordiv.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFloorDiv(TensorBinOp): + _op_type_ = OperandDef.FLOORDIV + _func_name = "floor_divide" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + if x2 != 0: + return True + else: + raise ZeroDivisionError("float division by zero") + return False + + +@infer_dtype(np.floor_divide) +def floordiv(x1, x2, out=None, where=None, **kwargs): + """ + Return the largest integer smaller or equal to the division of the inputs. + It is equivalent to the Python ``//`` operator and pairs with the + Python ``%`` (`remainder`), function so that ``b = a % b + b * (a // b)`` + up to roundoff. + + Parameters + ---------- + x1 : array_like + Numerator. + x2 : array_like + Denominator. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + y = floor(`x1`/`x2`) + + + See Also + -------- + remainder : Remainder complementary to floor_divide. + divmod : Simultaneous floor division and remainder. + divide : Standard division. + floor : Round a number to the nearest integer toward minus infinity. + ceil : Round a number to the nearest integer toward infinity. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.floor_divide(7,3).execute() + 2 + >>> mt.floor_divide([1., 2., 3., 4.], 2.5).execute() + array([ 0., 0., 1., 1.]) + """ + op = TensorFloorDiv(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.floor_divide, reverse=True) +def rfloordiv(x1, x2, **kwargs): + op = TensorFloorDiv(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/fmax.py b/python/xorbits/_mars/tensor/arithmetic/fmax.py new file mode 100644 index 000000000..1e41af8d6 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fmax.py @@ -0,0 +1,103 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFMax(TensorBinOp): + _op_type_ = OperandDef.FMAX + _func_name = "fmax" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 <= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 <= 0: + return True + return False + + +@infer_dtype(np.fmax) +def fmax(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise maximum of array elements. + + Compare two tensors and returns a new tensor containing the element-wise + maxima. If one of the elements being compared is a NaN, then the + non-nan element is returned. If both elements are NaNs then the first + is returned. The latter distinction is important for complex NaNs, + which are defined as at least one of the real or imaginary parts being + a NaN. The net effect is that NaNs are ignored when possible. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The maximum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + fmin : + Element-wise minimum of two tensors, ignores NaNs. + maximum : + Element-wise maximum of two tensors, propagates NaNs. + amax : + The maximum value of an tensor along a given axis, propagates NaNs. + nanmax : + The maximum value of an tensor along a given axis, ignores NaNs. + + minimum, amin, nanmin + + Notes + ----- + The fmax is equivalent to ``mt.where(x1 >= x2, x1, x2)`` when neither + x1 nor x2 are NaNs, but it is faster and does proper broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fmax([2, 3, 4], [1, 5, 2]).execute() + array([ 2., 5., 4.]) + + >>> mt.fmax(mt.eye(2), [0.5, 2]).execute() + array([[ 1. , 2. ], + [ 0.5, 2. ]]) + + >>> mt.fmax([mt.nan, 0, mt.nan],[0, mt.nan, mt.nan]).execute() + array([ 0., 0., NaN]) + """ + op = TensorFMax(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fmin.py b/python/xorbits/_mars/tensor/arithmetic/fmin.py new file mode 100644 index 000000000..715124cf5 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fmin.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
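+# floor_divide above pairs with remainder so that, up to roundoff,
+# a == (a % b) + b * (a // b); the r* helpers (rfloordiv, rdivide) are reflected
+# forms that simply swap the operand order via TensorBinOp.rcall.  A small
+# numeric check of the identity in plain NumPy terms:
+#
+#     import numpy as np
+#     a, b = 7.0, 2.5
+#     np.floor_divide(a, b) * b + np.remainder(a, b)   # 7.0, i.e. equal to a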
+ +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorFMin(TensorBinOp): + _op_type_ = OperandDef.FMIN + _func_name = "fmin" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 >= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 >= 0: + return True + return False + + +@infer_dtype(np.fmin) +def fmin(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise minimum of array elements. + + Compare two tensors and returns a new tensor containing the element-wise + minima. If one of the elements being compared is a NaN, then the + non-nan element is returned. If both elements are NaNs then the first + is returned. The latter distinction is important for complex NaNs, + which are defined as at least one of the real or imaginary parts being + a NaN. The net effect is that NaNs are ignored when possible. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The minimum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + fmax : + Element-wise maximum of two tensors, ignores NaNs. + minimum : + Element-wise minimum of two tensors, propagates NaNs. + amin : + The minimum value of a tensor along a given axis, propagates NaNs. + nanmin : + The minimum value of a tensor along a given axis, ignores NaNs. + + maximum, amax, nanmax + + Notes + ----- + + The fmin is equivalent to ``mt.where(x1 <= x2, x1, x2)`` when neither + x1 nor x2 are NaNs, but it is faster and does proper broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fmin([2, 3, 4], [1, 5, 2]).execute() + array([1, 3, 2]) + + >>> mt.fmin(mt.eye(2), [0.5, 2]).execute() + array([[ 0.5, 0. ], + [ 0. , 1. ]]) + + >>> mt.fmin([mt.nan, 0, mt.nan],[0, mt.nan, mt.nan]).execute() + array([ 0., 0., NaN]) + """ + op = TensorFMin(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/fmod.py b/python/xorbits/_mars/tensor/arithmetic/fmod.py new file mode 100644 index 000000000..b06494115 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/fmod.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorFMod(TensorBinOp): + _op_type_ = OperandDef.FMOD + _func_name = "fmod" + + +@infer_dtype(np.fmod) +def fmod(x1, x2, out=None, where=None, **kwargs): + """ + Return the element-wise remainder of division. + + This is the NumPy implementation of the C library function fmod; the + remainder has the same sign as the dividend `x1`. It is equivalent to + the Matlab(TM) ``rem`` function and should not be confused with the + Python modulus operator ``x1 % x2``. + + Parameters + ---------- + x1 : array_like + Dividend. + x2 : array_like + Divisor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs <ufuncs.kwargs>`. + + Returns + ------- + y : Tensor_like + The remainder of the division of `x1` by `x2`. + + See Also + -------- + remainder : Equivalent to the Python ``%`` operator. + divide + + Notes + ----- + The result of the modulo operation for negative dividend and divisors + is bound by conventions. For `fmod`, the sign of the result is the sign of + the dividend, while for `remainder` the sign of the result is the sign + of the divisor. The `fmod` function is equivalent to the Matlab(TM) + ``rem`` function. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fmod([-3, -2, -1, 1, 2, 3], 2).execute() + array([-1, 0, -1, 1, 0, 1]) + >>> mt.remainder([-3, -2, -1, 1, 2, 3], 2).execute() + array([1, 0, 1, 1, 0, 1]) + + >>> mt.fmod([5, 3], [2, 2.]).execute() + array([ 1., 1.]) + >>> a = mt.arange(-3, 3).reshape(3, 2) + >>> a.execute() + array([[-3, -2], + [-1, 0], + [ 1, 2]]) + >>> mt.fmod(a, [2,2]).execute() + array([[-1, 0], + [-1, 0], + [ 1, 0]]) + """ + op = TensorFMod(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/frexp.py b/python/xorbits/_mars/tensor/arithmetic/frexp.py new file mode 100644 index 000000000..5aa07ae1a --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/frexp.py @@ -0,0 +1,128 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...
import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorOutBinOp + + +class TensorFrexp(TensorOutBinOp): + _op_type_ = OperandDef.FREXP + _func_name = "frexp" + + def __init__(self, casting="same_kind", dtype=None, sparse=False, **kw): + super().__init__(_casting=casting, dtype=dtype, sparse=sparse, **kw) + + @property + def _fun(self): + return np.frexp + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} + + inputs_iter = iter(inputs) + input = next(inputs_iter) + if op.out1 is not None: + out1 = next(inputs_iter) + else: + out1 = None + if op.out2 is not None: + out2 = next(inputs_iter) + else: + out2 = None + if op.where is not None: + where = kw["where"] = next(inputs_iter) + else: + where = None + kw["order"] = op.order + + # out1 and out2 are immutable because they come from + # the shared memory. + mantissa, exponent = xp.frexp(input) + if where is not None: + mantissa, exponent = ( + xp.where(where, mantissa, out1), + xp.where(where, exponent, out2), + ) + + for c, res in zip(op.outputs, (mantissa, exponent)): + ctx[c.key] = res + + +def frexp(x, out1=None, out2=None, out=None, where=None, **kwargs): + """ + Decompose the elements of x into mantissa and twos exponent. + + Returns (`mantissa`, `exponent`), where ``x = mantissa * 2**exponent``. + The mantissa lies in the open interval (-1, 1), while the twos + exponent is a signed integer. + + Parameters + ---------- + x : array_like + Tensor of numbers to be decomposed. + out1 : Tensor, optional + Output tensor for the mantissa. Must have the same shape as `x`. + out2 : Tensor, optional + Output tensor for the exponent. Must have the same shape as `x`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + (mantissa, exponent) : tuple of tensors, (float, int) + `mantissa` is a float array with values between -1 and 1. + `exponent` is an int array which represents the exponent of 2. + + See Also + -------- + ldexp : Compute ``y = x1 * 2**x2``, the inverse of `frexp`. + + Notes + ----- + Complex dtypes are not supported, they will raise a TypeError. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(9) + >>> y1, y2 = mt.frexp(x) + + >>> y1_result, y2_result = mt.ExecutableTuple([y1, y2]).execute() + >>> y1_result + array([ 0. , 0.5 , 0.5 , 0.75 , 0.5 , 0.625, 0.75 , 0.875, + 0.5 ]) + >>> y2_result + array([0, 1, 2, 2, 3, 3, 3, 3, 4]) + >>> (y1 * 2**y2).execute() + array([ 0., 1., 2., 3., 4., 5., 6., 7., 8.]) + """ + op = TensorFrexp(**kwargs) + return op(x, out1=out1, out2=out2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/greater.py b/python/xorbits/_mars/tensor/arithmetic/greater.py new file mode 100644 index 000000000..6f448e3a4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/greater.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorGreaterThan(TensorBinOp): + _op_type_ = OperandDef.GT + _func_name = "greater" + + +@inject_dtype(np.bool_) +def greater(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 > x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or Tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + + See Also + -------- + greater_equal, less, less_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.greater([4,2],[2,2]).execute() + array([ True, False]) + + If the inputs are ndarrays, then np.greater is equivalent to '>'. + + >>> a = mt.array([4,2]) + >>> b = mt.array([2,2]) + >>> (a > b).execute() + array([ True, False]) + """ + op = TensorGreaterThan(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/greater_equal.py b/python/xorbits/_mars/tensor/arithmetic/greater_equal.py new file mode 100644 index 000000000..17b7c60af --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/greater_equal.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorGreaterEqual(TensorBinOp): + _op_type_ = OperandDef.GE + _func_name = "greater_equal" + + +@inject_dtype(np.bool_) +def greater_equal(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 >= x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or Tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + See Also + -------- + greater, less, less_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.greater_equal([4, 2, 1], [2, 2, 2]).execute() + array([ True, True, False]) + """ + op = TensorGreaterEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/hypot.py b/python/xorbits/_mars/tensor/arithmetic/hypot.py new file mode 100644 index 000000000..7ee1ccec0 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/hypot.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorHypot(TensorBinOp): + _op_type_ = OperandDef.HYPOT + _func_name = "hypot" + + +@infer_dtype(np.hypot) +def hypot(x1, x2, out=None, where=None, **kwargs): + """ + Given the "legs" of a right triangle, return its hypotenuse. + + Equivalent to ``sqrt(x1**2 + x2**2)``, element-wise. If `x1` or + `x2` is scalar_like (i.e., unambiguously cast-able to a scalar type), + it is broadcast for use with each element of the other argument. + (See Examples) + + Parameters + ---------- + x1, x2 : array_like + Leg of the triangle(s). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + z : Tensor + The hypotenuse of the triangle(s). + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.hypot(3*mt.ones((3, 3)), 4*mt.ones((3, 3))).execute() + array([[ 5., 5., 5.], + [ 5., 5., 5.], + [ 5., 5., 5.]]) + + Example showing broadcast of scalar_like argument: + + >>> mt.hypot(3*mt.ones((3, 3)), [4]).execute() + array([[ 5., 5., 5.], + [ 5., 5., 5.], + [ 5., 5., 5.]]) + """ + op = TensorHypot(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/i0.py b/python/xorbits/_mars/tensor/arithmetic/i0.py new file mode 100644 index 000000000..27064e45b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/i0.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import get_array_module, is_sparse_module +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorI0(TensorUnaryOp): + _op_type_ = OperandDef.I0 + _func_name = "i0" + + @classmethod + def execute(cls, ctx, op): + x = ctx[op.inputs[0].key] + xp = get_array_module(x) + res = xp.i0(x) + if not is_sparse_module(xp): + res = res.reshape(op.outputs[0].shape) + ctx[op.outputs[0].key] = res + + +@infer_dtype(np.i0) +def i0(x, **kwargs): + """ + Modified Bessel function of the first kind, order 0. + + Usually denoted :math:`I_0`. This function does broadcast, but will *not* + "up-cast" int dtype arguments unless accompanied by at least one float or + complex dtype argument (see Raises below). + + Parameters + ---------- + x : array_like, dtype float or complex + Argument of the Bessel function. + + Returns + ------- + out : Tensor, shape = x.shape, dtype = x.dtype + The modified Bessel function evaluated at each of the elements of `x`. + + Raises + ------ + TypeError: array cannot be safely cast to required type + If argument consists exclusively of int dtypes. + + See Also + -------- + scipy.special.iv, scipy.special.ive + + Notes + ----- + We use the algorithm published by Clenshaw [1]_ and referenced by + Abramowitz and Stegun [2]_, for which the function domain is + partitioned into the two intervals [0,8] and (8,inf), and Chebyshev + polynomial expansions are employed in each interval. Relative error on + the domain [0,30] using IEEE arithmetic is documented [3]_ as having a + peak of 5.8e-16 with an rms of 1.4e-16 (n = 30000). + + References + ---------- + .. [1] C. W. Clenshaw, "Chebyshev series for mathematical functions", in + *National Physical Laboratory Mathematical Tables*, vol. 5, London: + Her Majesty's Stationery Office, 1962. + .. [2] M. Abramowitz and I. A. 
Stegun, *Handbook of Mathematical + Functions*, 10th printing, New York: Dover, 1964, pp. 379. + http://www.math.sfu.ca/~cbm/aands/page_379.htm + .. [3] http://kobesearch.cpan.org/htdocs/Math-Cephes/Math/Cephes.html + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.i0([0.]).execute() + array([1.]) + >>> mt.i0([0., 1. + 2j]).execute() + array([ 1.00000000+0.j , 0.18785373+0.64616944j]) + + """ + op = TensorI0(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/imag.py b/python/xorbits/_mars/tensor/arithmetic/imag.py new file mode 100644 index 000000000..5fd30e3d7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/imag.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorImag(TensorUnaryOp): + _op_type_ = OperandDef.IMAG + _func_name = "imag" + + +@infer_dtype(np.imag) +def imag(val, **kwargs): + """ + Return the imaginary part of the complex argument. + + Parameters + ---------- + val : array_like + Input tensor. + + Returns + ------- + out : Tensor or scalar + The imaginary component of the complex argument. If `val` is real, + the type of `val` is used for the output. If `val` has complex + elements, the returned type is float. + + See Also + -------- + real, angle, real_if_close + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1+2j, 3+4j, 5+6j]) + >>> a.imag.execute() + array([ 2., 4., 6.]) + >>> a.imag = mt.array([8, 10, 12]) + >>> a.execute() + array([ 1. +8.j, 3.+10.j, 5.+12.j]) + >>> mt.imag(1 + 1j).execute() + 1.0 + + """ + op = TensorImag(**kwargs) + return op(val) diff --git a/python/xorbits/_mars/tensor/arithmetic/invert.py b/python/xorbits/_mars/tensor/arithmetic/invert.py new file mode 100644 index 000000000..33c7fca18 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/invert.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorInvert(TensorUnaryOp): + _op_type_ = OperandDef.INVERT + _func_name = "invert" + + +@infer_dtype(np.invert) +def invert(x, out=None, where=None, **kwargs): + """ + Compute bit-wise inversion, or bit-wise NOT, element-wise. + + Computes the bit-wise NOT of the underlying binary representation of + the integers in the input tensors. This ufunc implements the C/Python + operator ``~``. + + For signed integer inputs, the two's complement is returned. In a + two's-complement system negative numbers are represented by the two's + complement of the absolute value. This is the most common method of + representing signed integers on computers [1]_. A N-bit + two's-complement system can represent every integer in the range + :math:`-2^{N-1}` to :math:`+2^{N-1}-1`. + + Parameters + ---------- + x : array_like + Only integer and boolean types are handled. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + Result. + + See Also + -------- + bitwise_and, bitwise_or, bitwise_xor + logical_not + + Notes + ----- + `bitwise_not` is an alias for `invert`: + + >>> import mars.tensor as mt + + >>> mt.bitwise_not is mt.invert + True + + References + ---------- + .. [1] Wikipedia, "Two's complement", + http://en.wikipedia.org/wiki/Two's_complement + + Examples + -------- + We've seen that 13 is represented by ``00001101``. + The invert or bit-wise NOT of 13 is then: + + >>> mt.invert(mt.array([13], dtype=mt.uint8)).execute() + array([242], dtype=uint8) + + The result depends on the bit-width: + + >>> mt.invert(mt.array([13], dtype=mt.uint16)).execute() + array([65522], dtype=uint16) + + When using signed integer types the result is the two's complement of + the result for the unsigned type: + + >>> mt.invert(mt.array([13], dtype=mt.int8)).execute() + array([-14], dtype=int8) + + Booleans are accepted as well: + + >>> mt.invert(mt.array([True, False])).execute() + array([False, True]) + """ + op = TensorInvert(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isclose.py b/python/xorbits/_mars/tensor/arithmetic/isclose.py new file mode 100644 index 000000000..c20446214 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isclose.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import BoolField, Float64Field +from ..array_utils import as_same_device, device +from .core import TensorBinOp + + +class TensorIsclose(TensorBinOp): + _op_type_ = OperandDef.ISCLOSE + + _rtol = Float64Field("rtol") + _atol = Float64Field("atol") + _equal_nan = BoolField("equal_nan") + + def __init__( + self, + rtol=None, + atol=None, + equal_nan=None, + casting="same_kind", + err=None, + sparse=False, + **kw + ): + err = err if err is not None else np.geterr() + super().__init__( + _rtol=rtol, + _atol=atol, + _equal_nan=equal_nan, + _casting=casting, + _err=err, + sparse=sparse, + **kw + ) + + @property + def rtol(self): + return self._rtol + + @property + def atol(self): + return self._atol + + @property + def equal_nan(self): + return self._equal_nan + + @classmethod + def _is_sparse(cls, x1, x2): + if ( + hasattr(x1, "issparse") + and x1.issparse() + and np.isscalar(x2) + and not np.isclose(x2, 0) + ): + return True + if ( + hasattr(x2, "issparse") + and x2.issparse() + and np.isscalar(x1) + and not np.isclose(x1, 0) + ): + return True + return False + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + a = op.lhs if np.isscalar(op.lhs) else inputs[0] + b = op.rhs if np.isscalar(op.rhs) else inputs[-1] + + ctx[op.outputs[0].key] = xp.isclose( + a, b, atol=op.atol, rtol=op.rtol, equal_nan=op.equal_nan + ) + + +def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): + """ + Returns a boolean tensor where two tensors are element-wise equal within a + tolerance. + + The tolerance values are positive, typically very small numbers. The + relative difference (`rtol` * abs(`b`)) and the absolute difference + `atol` are added together to compare against the absolute difference + between `a` and `b`. + + Parameters + ---------- + a, b : array_like + Input tensors to compare. + rtol : float + The relative tolerance parameter (see Notes). + atol : float + The absolute tolerance parameter (see Notes). + equal_nan : bool + Whether to compare NaN's as equal. If True, NaN's in `a` will be + considered equal to NaN's in `b` in the output tensor. + + Returns + ------- + y : array_like + Returns a boolean tensor of where `a` and `b` are equal within the + given tolerance. If both `a` and `b` are scalars, returns a single + boolean value. + + See Also + -------- + allclose + + Notes + ----- + + For finite values, isclose uses the following equation to test whether + two floating point values are equivalent. + + absolute(`a` - `b`) <= (`atol` + `rtol` * absolute(`b`)) + + The above equation is not symmetric in `a` and `b`, so that + `isclose(a, b)` might be different from `isclose(b, a)` in + some rare cases. 
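As an illustration of this asymmetry (an editorial addition, not part of the original patch): with ``atol=0`` the tolerance is ``rtol`` times the magnitude of the *second* argument only, so swapping the arguments can flip the result. The numbers below are chosen by the editor so that the difference falls between ``rtol * |a|`` and ``rtol * |b|``; plain NumPy is used so the check does not depend on a running mars session (``mt.isclose`` mirrors ``np.isclose``).

import numpy as np

# |10 - 10.51| = 0.51 lies between rtol*|10| = 0.5 and rtol*|10.51| = 0.5255,
# so the outcome depends on which argument plays the role of the reference `b`.
print(np.isclose(10.0, 10.51, rtol=0.05, atol=0.0))   # True  (0.51 <= 0.05 * 10.51)
print(np.isclose(10.51, 10.0, rtol=0.05, atol=0.0))   # False (0.51 >  0.05 * 10.0)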
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isclose([1e10,1e-7], [1.00001e10,1e-8]).execute() + array([True, False]) + >>> mt.isclose([1e10,1e-8], [1.00001e10,1e-9]).execute() + array([True, True]) + >>> mt.isclose([1e10,1e-8], [1.0001e10,1e-9]).execute() + array([False, True]) + >>> mt.isclose([1.0, mt.nan], [1.0, mt.nan]).execute() + array([True, False]) + >>> mt.isclose([1.0, mt.nan], [1.0, mt.nan], equal_nan=True).execute() + array([True, True]) + """ + op = TensorIsclose(rtol=rtol, atol=atol, equal_nan=equal_nan, dtype=np.dtype(bool)) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/arithmetic/iscomplex.py b/python/xorbits/_mars/tensor/arithmetic/iscomplex.py new file mode 100644 index 000000000..79c21c4ab --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/iscomplex.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsComplex(TensorUnaryOp): + _op_type_ = OperandDef.ISCOMPLEX + _func_name = "iscomplex" + + +@inject_dtype(np.bool_) +def iscomplex(x, **kwargs): + """ + Returns a bool tensor, where True if input element is complex. + + What is tested is whether the input has a non-zero imaginary part, not if + the input type is complex. + + Parameters + ---------- + x : array_like + Input tensor. + + Returns + ------- + out : Tensor of bools + Output tensor. + + See Also + -------- + isreal + iscomplexobj : Return True if x is a complex type or an array of complex + numbers. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.iscomplex([1+1j, 1+0j, 4.5, 3, 2, 2j]).execute() + array([ True, False, False, False, False, True]) + + """ + op = TensorIsComplex(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/isfinite.py b/python/xorbits/_mars/tensor/arithmetic/isfinite.py new file mode 100644 index 000000000..c1a6372b5 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isfinite.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsFinite(TensorUnaryOp): + _op_type_ = OperandDef.ISFINITE + _func_name = "isfinite" + + +@inject_dtype(np.bool_) +def isfinite(x, out=None, where=None, **kwargs): + """ + Test element-wise for finiteness (not infinity or not Not a Number). + + The result is returned as a boolean tensor. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor, bool + For scalar input, the result is a new boolean with value True + if the input is finite; otherwise the value is False (input is + either positive infinity, negative infinity or Not a Number). + + For array input, the result is a boolean array with the same + dimensions as the input and the values are True if the + corresponding element of the input is finite; otherwise the values + are False (element is either positive infinity, negative infinity + or Not a Number). + + See Also + -------- + isinf, isneginf, isposinf, isnan + + Notes + ----- + Not a Number, positive infinity and negative infinity are considered + to be non-finite. + + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Also that positive infinity is not equivalent to negative infinity. But + infinity is equivalent to positive infinity. Errors result if the + second argument is also supplied when `x` is a scalar input, or if + first and second arguments have different shapes. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isfinite(1).execute() + True + >>> mt.isfinite(0).execute() + True + >>> mt.isfinite(mt.nan).execute() + False + >>> mt.isfinite(mt.inf).execute() + False + >>> mt.isfinite(mt.NINF).execute() + False + >>> mt.isfinite([mt.log(-1.).execute(),1.,mt.log(0).execute()]).execute() + array([False, True, False]) + + >>> x = mt.array([-mt.inf, 0., mt.inf]) + >>> y = mt.array([2, 2, 2]) + >>> mt.isfinite(x, y).execute() + array([0, 1, 0]) + >>> y.execute() + array([0, 1, 0]) + """ + op = TensorIsFinite(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isinf.py b/python/xorbits/_mars/tensor/arithmetic/isinf.py new file mode 100644 index 000000000..05da391b9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isinf.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsInf(TensorUnaryOp): + _op_type_ = OperandDef.ISINF + _func_name = "isinf" + + +@inject_dtype(np.bool_) +def isinf(x, out=None, where=None, **kwargs): + """ + Test element-wise for positive or negative infinity. + + Returns a boolean array of the same shape as `x`, True where ``x == + +/-inf``, otherwise False. + + Parameters + ---------- + x : array_like + Input values + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : bool (scalar) or boolean Tensor + For scalar input, the result is a new boolean with value True if + the input is positive or negative infinity; otherwise the value is + False. + + For tensor input, the result is a boolean tensor with the same shape + as the input and the values are True where the corresponding + element of the input is positive or negative infinity; elsewhere + the values are False. If a second argument was supplied the result + is stored there. If the type of that array is a numeric type the + result is represented as zeros and ones, if the type is boolean + then as False and True, respectively. The return value `y` is then + a reference to that tensor. + + See Also + -------- + isneginf, isposinf, isnan, isfinite + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). + + Errors result if the second argument is supplied when the first + argument is a scalar, or if the first and second arguments have + different shapes. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isinf(mt.inf).execute() + True + >>> mt.isinf(mt.nan).execute() + False + >>> mt.isinf(mt.NINF).execute() + True + >>> mt.isinf([mt.inf, -mt.inf, 1.0, mt.nan]).execute() + array([ True, True, False, False]) + + >>> x = mt.array([-mt.inf, 0., mt.inf]) + >>> y = mt.array([2, 2, 2]) + >>> mt.isinf(x, y).execute() + array([1, 0, 1]) + >>> y.execute() + array([1, 0, 1]) + """ + op = TensorIsInf(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isnan.py b/python/xorbits/_mars/tensor/arithmetic/isnan.py new file mode 100644 index 000000000..595f372e3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isnan.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsNan(TensorUnaryOp): + _op_type_ = OperandDef.ISNAN + _func_name = "isnan" + + +@inject_dtype(np.bool_) +def isnan(x, out=None, where=None, **kwargs): + """ + Test element-wise for NaN and return result as a boolean tensor. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or bool + For scalar input, the result is a new boolean with value True if + the input is NaN; otherwise the value is False. + + For array input, the result is a boolean tensor of the same + dimensions as the input and the values are True if the + corresponding element of the input is NaN; otherwise the values are + False. + + See Also + -------- + isinf, isneginf, isposinf, isfinite, isnat + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isnan(mt.nan).execute() + True + >>> mt.isnan(mt.inf).execute() + False + >>> mt.isnan([mt.log(-1.).execute(),1.,mt.log(0).execute()]).execute() + array([ True, False, False]) + """ + op = TensorIsNan(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/isreal.py b/python/xorbits/_mars/tensor/arithmetic/isreal.py new file mode 100644 index 000000000..03abe9f1d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/isreal.py @@ -0,0 +1,61 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorIsReal(TensorUnaryOp): + _op_type_ = OperandDef.ISREAL + _func_name = "isreal" + + +@inject_dtype(np.bool_) +def isreal(x, **kwargs): + """ + Returns a bool tensor, where True if input element is real. + + If element has complex type with zero complex part, the return value + for that element is True. + + Parameters + ---------- + x : array_like + Input tensor. + + Returns + ------- + out : Tensor, bool + Boolean tensor of same shape as `x`. 
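As an editorial aside (not part of the original patch): NumPy realises ``isreal`` as a test that the imaginary component is zero, which is a convenient way to read the description above; plain NumPy is used here rather than the mars API so the check stands on its own.

import numpy as np

x = np.array([1 + 1j, 1 + 0j, 4.5, 3, 2, 2j])
# isreal marks the elements whose imaginary part is exactly zero,
# so it agrees with an explicit comparison against imag(x).
print(np.isreal(x))        # [False  True  True  True  True False]
print(np.imag(x) == 0)     # identical result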
+ + See Also + -------- + iscomplex + isrealobj : Return True if x is not a complex type. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.isreal([1+1j, 1+0j, 4.5, 3, 2, 2j]).execute() + array([False, True, True, True, True, False]) + + """ + op = TensorIsReal(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/ldexp.py b/python/xorbits/_mars/tensor/arithmetic/ldexp.py new file mode 100644 index 000000000..213f92a6d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/ldexp.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorLdexp(TensorBinOp): + _op_type_ = OperandDef.LDEXP + _func_name = "ldexp" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@infer_dtype(np.ldexp) +def ldexp(x1, x2, out=None, where=None, **kwargs): + """ + Returns x1 * 2**x2, element-wise. + + The mantissas `x1` and twos exponents `x2` are used to construct + floating point numbers ``x1 * 2**x2``. + + Parameters + ---------- + x1 : array_like + Tensor of multipliers. + x2 : array_like, int + Tensor of twos exponents. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The result of ``x1 * 2**x2``. + + See Also + -------- + frexp : Return (y1, y2) from ``x = y1 * 2**y2``, inverse to `ldexp`. + + Notes + ----- + Complex dtypes are not supported, they will raise a TypeError. + + `ldexp` is useful as the inverse of `frexp`, if used by itself it is + more clear to simply use the expression ``x1 * 2**x2``. 
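A small editorial check, not part of the original patch, of the integer-exponent requirement mentioned in the Notes above and enforced by the wrapper code further below through ``np.can_cast``; plain NumPy is used so the behaviour can be reproduced without a mars runtime.

import numpy as np

# A float exponent cannot be safely cast to an integer, so ldexp rejects it.
print(np.can_cast(np.float64, np.int64, casting="safe"))  # False
print(np.ldexp(5, np.arange(4)))                          # [ 5. 10. 20. 40.]
try:
    np.ldexp(5.0, 1.5)
except TypeError as exc:
    print("TypeError:", exc)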
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.ldexp(5, mt.arange(4)).execute() + array([ 5., 10., 20., 40.], dtype=float32) + + >>> x = mt.arange(6) + >>> mt.ldexp(*mt.frexp(x)).execute() + array([ 0., 1., 2., 3., 4., 5.]) + """ + x2_dtype = astensor(x2).dtype + casting = kwargs.get("casting", "safe") + if not np.can_cast(x2_dtype, np.int64, casting=casting): + raise TypeError( + "ufunc 'ldexp' not supported for the input types, " + "and the inputs could not be safely coerced to any supported types " + f"according to the casting rule ''{casting}''" + ) + + op = TensorLdexp(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/less.py b/python/xorbits/_mars/tensor/arithmetic/less.py new file mode 100644 index 000000000..9f4e89cfb --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/less.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorLessThan(TensorBinOp): + _op_type_ = OperandDef.LT + _func_name = "less" + + +@inject_dtype(np.bool_) +def less(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 < x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or Tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + See Also + -------- + greater, less_equal, greater_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.less([1, 2], [2, 2]).execute() + array([ True, False]) + """ + op = TensorLessThan(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/less_equal.py b/python/xorbits/_mars/tensor/arithmetic/less_equal.py new file mode 100644 index 000000000..5de0f1052 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/less_equal.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorLessEqual(TensorBinOp): + _op_type_ = OperandDef.LE + _func_name = "less_equal" + + +@inject_dtype(np.bool_) +def less_equal(x1, x2, out=None, where=None, **kwargs): + """ + Return the truth value of (x1 <= x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. If ``x1.shape != x2.shape``, they must be + broadcastable to a common shape (which may be the shape of one or + the other). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : bool or tensor of bool + Array of bools, or a single bool if `x1` and `x2` are scalars. + + See Also + -------- + greater, less, greater_equal, equal, not_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.less_equal([4, 2, 1], [2, 2, 2]).execute() + array([False, True, True]) + """ + op = TensorLessEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log.py b/python/xorbits/_mars/tensor/arithmetic/log.py new file mode 100644 index 000000000..ccedd2e5e --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog(TensorUnaryOp): + _op_type_ = OperandDef.LOG + _func_name = "log" + + +@infer_dtype(np.log) +def log(x, out=None, where=None, **kwargs): + """ + Natural logarithm, element-wise. + + The natural logarithm `log` is the inverse of the exponential function, + so that `log(exp(x)) = x`. The natural logarithm is logarithm in base + `e`. + + Parameters + ---------- + x : array_like + Input value. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored.
If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The natural logarithm of `x`, element-wise. + + See Also + -------- + log10, log2, log1p + + Notes + ----- + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `exp(z) = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log` always returns real output. For + each value that cannot be expressed as a real number or infinity, it + yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `log` is a complex analytical function that + has a branch cut `[-inf, 0]` and is continuous from above on it. `log` + handles the floating-point negative zero as an infinitesimal negative + number, conforming to the C99 standard. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.log([1, mt.e, mt.e**2, 0]).execute() + array([ 0., 1., 2., -Inf]) + """ + op = TensorLog(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log10.py b/python/xorbits/_mars/tensor/arithmetic/log10.py new file mode 100644 index 000000000..87d7efbba --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log10.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog10(TensorUnaryOp): + _op_type_ = OperandDef.LOG10 + _func_name = "log10" + + +@infer_dtype(np.log10) +def log10(x, out=None, where=None, **kwargs): + """ + Return the base 10 logarithm of the input tensor, element-wise. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y : Tensor + The logarithm to the base 10 of `x`, element-wise. NaNs are + returned where x is negative. + + Notes + ----- + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `10**z = x`. The convention is to return the + `z` whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log10` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `log10` is a complex analytical function that + has a branch cut `[-inf, 0]` and is continuous from above on it. + `log10` handles the floating-point negative zero as an infinitesimal + negative number, conforming to the C99 standard. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.log10([1e-15, -3.]).execute() + array([-15., NaN]) + """ + op = TensorLog10(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log1p.py b/python/xorbits/_mars/tensor/arithmetic/log1p.py new file mode 100644 index 000000000..8850a9ea3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log1p.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog1p(TensorUnaryOp): + _op_type_ = OperandDef.LOG1P + _func_name = "log1p" + + +@infer_dtype(np.log1p) +def log1p(x, out=None, where=None, **kwargs): + """ + Return the natural logarithm of one plus the input tensor, element-wise. + + Calculates ``log(1 + x)``. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + Natural logarithm of `1 + x`, element-wise. + + See Also + -------- + expm1 : ``exp(x) - 1``, the inverse of `log1p`. + + Notes + ----- + For real-valued input, `log1p` is accurate also for `x` so small + that `1 + x == 1` in floating-point accuracy. 
+ + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `exp(z) = 1 + x`. The convention is to return + the `z` whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log1p` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, `log1p` is a complex analytical function that + has a branch cut `[-inf, -1]` and is continuous from above on it. + `log1p` handles the floating-point negative zero as an infinitesimal + negative number, conforming to the C99 standard. + + References + ---------- + .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", + 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.log1p(1e-99).execute() + 1e-99 + >>> mt.log(1 + 1e-99).execute() + 0.0 + """ + op = TensorLog1p(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/log2.py b/python/xorbits/_mars/tensor/arithmetic/log2.py new file mode 100644 index 000000000..5c8ae4dbd --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/log2.py @@ -0,0 +1,83 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorLog2(TensorUnaryOp): + _op_type_ = OperandDef.LOG2 + _func_name = "log2" + + +@infer_dtype(np.log2) +def log2(x, out=None, where=None, **kwargs): + """ + Base-2 logarithm of `x`. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + Base-2 logarithm of `x`. + + See Also + -------- + log, log10, log1p + + Logarithm is a multivalued function: for each `x` there is an infinite + number of `z` such that `2**z = x`. The convention is to return the `z` + whose imaginary part lies in `[-pi, pi]`. + + For real-valued input data types, `log2` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. 
+ + For complex-valued input, `log2` is a complex analytical function that + has a branch cut `[-inf, 0]` and is continuous from above on it. `log2` + handles the floating-point negative zero as an infinitesimal negative + number, conforming to the C99 standard. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([0, 1, 2, 2**4]) + >>> mt.log2(x).execute() + array([-Inf, 0., 1., 4.]) + + >>> xi = mt.array([0+1.j, 1, 2+0.j, 4.j]) + >>> mt.log2(xi).execute() + array([ 0.+2.26618007j, 0.+0.j , 1.+0.j , 2.+2.26618007j]) + """ + op = TensorLog2(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logaddexp.py b/python/xorbits/_mars/tensor/arithmetic/logaddexp.py new file mode 100644 index 000000000..9b894b4c6 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logaddexp.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorLogAddExp(TensorBinOp): + _op_type_ = OperandDef.LOGADDEXP + _func_name = "logaddexp" + + +@infer_dtype(np.logaddexp) +def logaddexp(x1, x2, out=None, where=None, **kwargs): + """ + Logarithm of the sum of exponentiations of the inputs. + + Calculates ``log(exp(x1) + exp(x2))``. This function is useful in + statistics where the calculated probabilities of events may be so small + as to exceed the range of normal floating point numbers. In such cases + the logarithm of the calculated probability is stored. This function + allows adding probabilities stored in such a fashion. + + Parameters + ---------- + x1, x2 : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. + + Returns + ------- + result : Tensor + Logarithm of ``exp(x1) + exp(x2)``. + + See Also + -------- + logaddexp2: Logarithm of the sum of exponentiations of inputs in base 2. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> prob1 = mt.log(1e-50) + >>> prob2 = mt.log(2.5e-50) + >>> prob12 = mt.logaddexp(prob1, prob2) + >>> prob12.execute() + -113.87649168120691 + >>> mt.exp(prob12).execute() + 3.5000000000000057e-50 + """ + op = TensorLogAddExp(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logaddexp2.py b/python/xorbits/_mars/tensor/arithmetic/logaddexp2.py new file mode 100644 index 000000000..af41dd8b7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logaddexp2.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorLogAddExp2(TensorBinOp): + _op_type_ = OperandDef.LOGADDEXP2 + _func_name = "logaddexp2" + + +@infer_dtype(np.logaddexp2) +def logaddexp2(x1, x2, out=None, where=None, **kwargs): + """ + Logarithm of the sum of exponentiations of the inputs in base-2. + + Calculates ``log2(2**x1 + 2**x2)``. This function is useful in machine + learning when the calculated probabilities of events may be so small as + to exceed the range of normal floating point numbers. In such cases + the base-2 logarithm of the calculated probability can be used instead. + This function allows adding probabilities stored in such a fashion. + + Parameters + ---------- + x1, x2 : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + result : Tensor + Base-2 logarithm of ``2**x1 + 2**x2``. + + See Also + -------- + logaddexp: Logarithm of the sum of exponentiations of the inputs. + + Examples + -------- + >>> import mars.tensor as mt + + >>> prob1 = mt.log2(1e-50) + >>> prob2 = mt.log2(2.5e-50) + >>> prob12 = mt.logaddexp2(prob1, prob2) + >>> prob1.execute(), prob2.execute(), prob12.execute() + (-166.09640474436813, -164.77447664948076, -164.28904982231052) + >>> (2**prob12).execute() + 3.4999999999999914e-50 + """ + op = TensorLogAddExp2(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_and.py b/python/xorbits/_mars/tensor/arithmetic/logical_and.py new file mode 100644 index 000000000..9fe64d5b3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_and.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorAnd(TensorBinOp): + _op_type_ = OperandDef.AND + _func_name = "logical_and" + + +@infer_dtype(np.logical_and) +def logical_and(x1, x2, out=None, where=None, **kwargs): + """ + Compute the truth value of x1 AND x2 element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. `x1` and `x2` must be of the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or bool + Boolean result with the same shape as `x1` and `x2` of the logical + AND operation on corresponding elements of `x1` and `x2`. + + See Also + -------- + logical_or, logical_not, logical_xor + bitwise_and + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_and(True, False).execute() + False + >>> mt.logical_and([True, False], [False, False]).execute() + array([False, False]) + + >>> x = mt.arange(5) + >>> mt.logical_and(x>1, x<4).execute() + array([False, False, True, True, False]) + """ + op = TensorAnd(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.logical_and, reverse=True) +def rlogical_and(x1, x2, **kwargs): + op = TensorAnd(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_not.py b/python/xorbits/_mars/tensor/arithmetic/logical_not.py new file mode 100644 index 000000000..4df067cb4 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_not.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorNot(TensorUnaryOp): + _op_type_ = OperandDef.NOT + _func_name = "logical_not" + + +@infer_dtype(np.logical_not) +def logical_not(x, out=None, where=None, **kwargs): + """ + Compute the truth value of NOT x element-wise. + + Parameters + ---------- + x : array_like + Logical NOT is applied to the elements of `x`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : bool or Tensor of bool + Boolean result with the same shape as `x` of the NOT operation + on elements of `x`. + + See Also + -------- + logical_and, logical_or, logical_xor + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_not(3).execute() + False + >>> mt.logical_not([True, False, 0, 1]).execute() + array([False, True, True, False]) + + >>> x = mt.arange(5) + >>> mt.logical_not(x<3).execute() + array([False, False, False, True, True]) + """ + op = TensorNot(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_or.py b/python/xorbits/_mars/tensor/arithmetic/logical_or.py new file mode 100644 index 000000000..f7a31dd02 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_or.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorOr(TensorBinOp): + _op_type_ = OperandDef.OR + _func_name = "logical_or" + + +@infer_dtype(np.logical_or) +def logical_or(x1, x2, out=None, where=None, **kwargs): + """ + Compute the truth value of x1 OR x2 element-wise. + + Parameters + ---------- + x1, x2 : array_like + Logical OR is applied to the elements of `x1` and `x2`. + They have to be of the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y : Tensor or bool + Boolean result with the same shape as `x1` and `x2` of the logical + OR operation on elements of `x1` and `x2`. + + See Also + -------- + logical_and, logical_not, logical_xor + bitwise_or + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_or(True, False).execute() + True + >>> mt.logical_or([True, False], [False, False]).execute() + array([ True, False]) + + >>> x = mt.arange(5) + >>> mt.logical_or(x < 1, x > 3).execute() + array([ True, False, False, False, True]) + """ + op = TensorOr(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.logical_or, reverse=True) +def rlogical_or(x1, x2, **kwargs): + op = TensorOr(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/logical_xor.py b/python/xorbits/_mars/tensor/arithmetic/logical_xor.py new file mode 100644 index 000000000..f3327570f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/logical_xor.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorXor(TensorBinOp): + _op_type_ = OperandDef.XOR + _func_name = "logical_xor" + + +@infer_dtype(np.logical_xor) +def logical_xor(x1, x2, out=None, where=None, **kwargs): + """ + Compute the truth value of x1 XOR x2, element-wise. + + Parameters + ---------- + x1, x2 : array_like + Logical XOR is applied to the elements of `x1` and `x2`. They must + be broadcastable to the same shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : bool or Tensor of bool + Boolean result of the logical XOR operation applied to the elements + of `x1` and `x2`; the shape is determined by whether or not + broadcasting of one or both arrays was required. 
+ + See Also + -------- + logical_and, logical_or, logical_not, bitwise_xor + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.logical_xor(True, False).execute() + True + >>> mt.logical_xor([True, True, False, False], [True, False, True, False]).execute() + array([False, True, True, False]) + + >>> x = mt.arange(5) + >>> mt.logical_xor(x < 1, x > 3).execute() + array([ True, False, False, False, True]) + + Simple example showing support of broadcasting + + >>> mt.logical_xor(0, mt.eye(2)).execute() + array([[ True, False], + [False, True]]) + """ + op = TensorXor(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.logical_xor, reverse=True) +def rlogical_xor(x1, x2, **kwargs): + op = TensorXor(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/lshift.py b/python/xorbits/_mars/tensor/arithmetic/lshift.py new file mode 100644 index 000000000..f9c29f1f2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/lshift.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorLshift(TensorBinOp): + _op_type_ = OperandDef.LSHIFT + _func_name = "left_shift" + + +@infer_dtype(np.left_shift) +def lshift(x1, x2, out=None, where=None, **kwargs): + """ + Shift the bits of an integer to the left. + + Bits are shifted to the left by appending `x2` 0s at the right of `x1`. + Since the internal representation of numbers is in binary format, this + operation is equivalent to multiplying `x1` by ``2**x2``. + + Parameters + ---------- + x1 : array_like of integer type + Input values. + x2 : array_like of integer type + Number of zeros to append to `x1`. Has to be non-negative. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : tensor of integer type + Return `x1` with bits shifted `x2` times to the left. + + See Also + -------- + right_shift : Shift the bits of an integer to the right. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.left_shift(5, 2).execute() + 20 + + >>> mt.left_shift(5, [1,2,3]).execute() + array([10, 20, 40]) + """ + op = TensorLshift(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.left_shift, reverse=True) +def rlshift(x1, x2, **kwargs): + op = TensorLshift(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/maximum.py b/python/xorbits/_mars/tensor/arithmetic/maximum.py new file mode 100644 index 000000000..af3a02721 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/maximum.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorMaximum(TensorBinOp): + _op_type_ = OperandDef.MAXIMUM + _func_name = "maximum" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 <= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 <= 0: + return True + return False + + +@infer_dtype(np.maximum) +def maximum(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise maximum of tensor elements. + + Compare two tensors and returns a new array containing the element-wise + maxima. If one of the elements being compared is a NaN, then that + element is returned. If both elements are NaNs then the first is + returned. The latter distinction is important for complex NaNs, which + are defined as at least one of the real or imaginary parts being a NaN. + The net effect is that NaNs are propagated. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape, or shapes that can be broadcast to a single shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : ndarray or scalar + The maximum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + minimum : + Element-wise minimum of two tensors, propagates NaNs. + fmax : + Element-wise maximum of two tensors, ignores NaNs. + amax : + The maximum value of a tensor along a given axis, propagates NaNs. + nanmax : + The maximum value of a tensor along a given axis, ignores NaNs. 
+ + fmin, amin, nanmin + + Notes + ----- + The maximum is equivalent to ``mt.where(x1 >= x2, x1, x2)`` when + neither x1 nor x2 are nans, but it is faster and does proper + broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.maximum([2, 3, 4], [1, 5, 2]).execute() + array([2, 5, 4]) + + >>> mt.maximum(mt.eye(2), [0.5, 2]).execute() # broadcasting + array([[ 1. , 2. ], + [ 0.5, 2. ]]) + + >>> mt.maximum([mt.nan, 0, mt.nan], [0, mt.nan, mt.nan]).execute() + array([ NaN, NaN, NaN]) + >>> mt.maximum(mt.Inf, 1).execute() + inf + """ + op = TensorMaximum(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/minimum.py b/python/xorbits/_mars/tensor/arithmetic/minimum.py new file mode 100644 index 000000000..9db2b5c32 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/minimum.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorMinimum(TensorBinOp): + _op_type_ = OperandDef.MINIMUM + _func_name = "minimum" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse() and np.isscalar(x2) and x2 >= 0: + return True + if hasattr(x2, "issparse") and x2.issparse() and np.isscalar(x1) and x1 >= 0: + return True + return False + + +@infer_dtype(np.minimum) +def minimum(x1, x2, out=None, where=None, **kwargs): + """ + Element-wise minimum of tensor elements. + + Compare two tensors and returns a new tensor containing the element-wise + minima. If one of the elements being compared is a NaN, then that + element is returned. If both elements are NaNs then the first is + returned. The latter distinction is important for complex NaNs, which + are defined as at least one of the real or imaginary parts being a NaN. + The net effect is that NaNs are propagated. + + Parameters + ---------- + x1, x2 : array_like + The tensors holding the elements to be compared. They must have + the same shape, or shapes that can be broadcast to a single shape. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The minimum of `x1` and `x2`, element-wise. Returns scalar if + both `x1` and `x2` are scalars. + + See Also + -------- + maximum : + Element-wise maximum of two tensors, propagates NaNs. + fmin : + Element-wise minimum of two tensors, ignores NaNs. 
+ amin : + The minimum value of a tensor along a given axis, propagates NaNs. + nanmin : + The minimum value of a tensor along a given axis, ignores NaNs. + + fmax, amax, nanmax + + Notes + ----- + The minimum is equivalent to ``mt.where(x1 <= x2, x1, x2)`` when + neither x1 nor x2 are NaNs, but it is faster and does proper + broadcasting. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.minimum([2, 3, 4], [1, 5, 2]).execute() + array([1, 3, 2]) + + >>> mt.minimum(mt.eye(2), [0.5, 2]).execute() # broadcasting + array([[ 0.5, 0. ], + [ 0. , 1. ]]) + + >>> mt.minimum([mt.nan, 0, mt.nan],[0, mt.nan, mt.nan]).execute() + array([ NaN, NaN, NaN]) + >>> mt.minimum(-mt.Inf, 1).execute() + -inf + """ + op = TensorMinimum(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/mod.py b/python/xorbits/_mars/tensor/arithmetic/mod.py new file mode 100644 index 000000000..e598084ea --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/mod.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorMod(TensorBinOp): + _op_type_ = OperandDef.MOD + _func_name = "mod" + + +@infer_dtype(np.mod) +def mod(x1, x2, out=None, where=None, **kwargs): + """ + Return element-wise remainder of division. + + Computes the remainder complementary to the `floor_divide` function. It is + equivalent to the Python modulus operator ``x1 % x2`` and has the same sign + as the divisor `x2`. The MATLAB function equivalent to ``np.remainder`` + is ``mod``. + + .. warning:: + + This should not be confused with: + + * Python 3.7's `math.remainder` and C's ``remainder``, which + compute the IEEE remainder, which is the complement to + ``round(x1 / x2)``. + * The MATLAB ``rem`` function and/or the C ``%`` operator, which is the + complement to ``int(x1 / x2)``. + + Parameters + ---------- + x1 : array_like + Dividend array. + x2 : array_like + Divisor array. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The element-wise remainder of the quotient ``floor_divide(x1, x2)``. + Returns a scalar if both `x1` and `x2` are scalars. + + See Also + -------- + floor_divide : Equivalent of Python ``//`` operator. + divmod : Simultaneous floor division and remainder. + fmod : Equivalent of the MATLAB ``rem`` function.
+ divide, floor + + Notes + ----- + Returns 0 when `x2` is 0 and both `x1` and `x2` are (tensors of) + integers. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.remainder([4, 7], [2, 3]).execute() + array([0, 1]) + >>> mt.remainder(mt.arange(7), 5).execute() + array([0, 1, 2, 3, 4, 0, 1]) + """ + op = TensorMod(**kwargs) + return op(x1, x2, out=out, where=where) + + +remainder = mod + + +@infer_dtype(np.mod, reverse=True) +def rmod(x1, x2, **kwargs): + op = TensorMod(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/modf.py b/python/xorbits/_mars/tensor/arithmetic/modf.py new file mode 100644 index 000000000..c2ca31b63 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/modf.py @@ -0,0 +1,123 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorOutBinOp + + +class TensorModf(TensorOutBinOp): + _op_type_ = OperandDef.MODF + + def __init__(self, casting="same_kind", dtype=None, sparse=False, **kw): + super().__init__(_casting=casting, dtype=dtype, sparse=sparse, **kw) + + @property + def _fun(self): + return np.modf + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} + + inputs_iter = iter(inputs) + input = next(inputs_iter) + if op.out1 is not None: + out1 = next(inputs_iter) + else: + out1 = None + if op.out2 is not None: + out2 = next(inputs_iter) + else: + out2 = None + if op.where is not None: + where = kw["where"] = next(inputs_iter) + else: + where = None + kw["order"] = op.order + + try: + args = [input] + if out1 is not None: + args.append(out1.copy()) + if out2 is not None: + args.append(out2.copy()) + y1, y2 = xp.modf(*args, **kw) + except TypeError: + if where is None: + raise + y1, y2 = xp.modf(input) + y1, y2 = xp.where(where, y1, out1), xp.where(where, y2, out2) + + for c, res in zip(op.outputs, (y1, y2)): + ctx[c.key] = res + + +def modf(x, out1=None, out2=None, out=None, where=None, **kwargs): + """ + Return the fractional and integral parts of a tensor, element-wise. + + The fractional and integral parts are negative if the given number is + negative. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. 
+ **kwargs + + Returns + ------- + y1 : Tensor + Fractional part of `x`. + y2 : Tensor + Integral part of `x`. + + Notes + ----- + For integer input the return values are floats. + + See Also + -------- + divmod : ``divmod(x, 1)`` is equivalent to ``modf`` with the return values + switched, except it always has a positive remainder. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.modf([0, 3.5]).execute() + (array([ 0. , 0.5]), array([ 0., 3.])) + >>> mt.modf(-0.5).execute() + (-0.5, -0) + """ + x = astensor(x) + op = TensorModf(**kwargs) + return op(x, out1=out1, out2=out2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/multiply.py b/python/xorbits/_mars/tensor/arithmetic/multiply.py new file mode 100644 index 000000000..cc214f556 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/multiply.py @@ -0,0 +1,130 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import reduce + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ..array_utils import as_same_device, device +from ..datasource import scalar +from ..utils import infer_dtype +from .core import TensorBinOp, TensorMultiOp +from .utils import TreeReductionBuilder, arithmetic_operand, tree_op_estimate_size + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorMultiply(TensorBinOp): + _op_type_ = OperandDef.MUL + _func_name = "multiply" + + +@infer_dtype(np.multiply) +def multiply(x1, x2, out=None, where=None, **kwargs): + """ + Multiply arguments element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input arrays to be multiplied. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The product of `x1` and `x2`, element-wise. Returns a scalar if + both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to `x1` * `x2` in terms of array broadcasting. 
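The ``TensorModf.execute`` method above handles ``where=`` with a mask-and-blend fallback: if the backend's ``modf`` rejects the extra output arguments, it computes the full element-wise result and then merges it into the provided buffers via ``xp.where``. A minimal NumPy-only sketch of that fallback; the buffer and mask names below are illustrative, not taken from the diff.

import numpy as np

# Sketch of the masking fallback: compute modf for every element, then keep
# the old buffer values wherever the `where` mask is False. Names below are
# illustrative only.
x = np.array([0.0, 3.5, -0.5, 2.25])
out_frac = np.full_like(x, -1.0)             # pre-existing output buffers
out_int = np.full_like(x, -1.0)
mask = np.array([True, True, False, True])   # the `where` mask

frac, integral = np.modf(x)                  # full element-wise result
out_frac = np.where(mask, frac, out_frac)    # masked-off slots keep old values
out_int = np.where(mask, integral, out_int)

print(out_frac)  # [ 0.    0.5  -1.    0.25]
print(out_int)   # [ 0.  3. -1.  2.]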
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.multiply(2.0, 4.0).execute() + 8.0 + + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.multiply(x1, x2).execute() + array([[ 0., 1., 4.], + [ 0., 4., 10.], + [ 0., 7., 16.]]) + """ + op = TensorMultiply(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.multiply, reverse=True) +def rmultiply(x1, x2, **kwargs): + op = TensorMultiply(**kwargs) + return op.rcall(x1, x2) + + +class TensorTreeMultiply(TensorMultiOp): + _op_type_ = OperandDef.TREE_MULTIPLY + _func_name = "multiply" + + ignore_empty_input = BoolField("ignore_empty_input", default=False) + + def __init__(self, sparse=False, **kw): + super().__init__(sparse=sparse, **kw) + + @classmethod + def _is_sparse(cls, *args): + if not args or all(np.isscalar(x) for x in args): + return False + if all( + np.isscalar(x) or (hasattr(x, "issparse") and x.issparse()) for x in args + ): + return True + return False + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + if op.ignore_empty_input: + inputs = [inp for inp in inputs if not hasattr(inp, "size") or inp.size > 0] + + with device(device_id): + ctx[op.outputs[0].key] = reduce(xp.multiply, inputs) + + @classmethod + def estimate_size(cls, ctx, op): + tree_op_estimate_size(ctx, op) + + +@infer_dtype(lambda *args: reduce(np.multiply, args)) +def tree_multiply(*args, combine_size=None, **kwargs): + class MultiplyBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + op = TensorTreeMultiply(args=inputs, **kwargs) + return op(*inputs) + + args = [scalar(a) if np.isscalar(a) else a for a in args] + return MultiplyBuilder(combine_size).build(args) diff --git a/python/xorbits/_mars/tensor/arithmetic/nan_to_num.py b/python/xorbits/_mars/tensor/arithmetic/nan_to_num.py new file mode 100644 index 000000000..67d844f7b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/nan_to_num.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..core import Tensor +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorNanToNum(TensorUnaryOp): + _op_type_ = OperandDef.NAN_TO_NUM + _func_name = "nan_to_num" + + +@infer_dtype(np.nan_to_num) +def nan_to_num(x, copy=True, **kwargs): + """ + Replace nan with zero and inf with large finite numbers. + + If `x` is inexact, NaN is replaced by zero, and infinity and -infinity + replaced by the respectively largest and most negative finite floating + point values representable by ``x.dtype``. + + For complex dtypes, the above is applied to each of the real and + imaginary components of `x` separately. + + If `x` is not inexact, then no replacements are made. 
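``tree_multiply`` above carries no docstring; it multiplies many operands through ``TreeReductionBuilder`` in rounds of ``combine_size`` inputs rather than as one long sequential chain, which keeps the generated operand graph shallow. A rough, framework-free sketch of that reduction pattern; the ``tree_reduce`` helper is hypothetical and shown only for illustration.

from functools import reduce
import operator

def tree_reduce(values, combine_size=4, combine=operator.mul):
    # Fold groups of `combine_size` values per round until a single value
    # remains, mirroring the shallow multiply tree built by tree_multiply.
    while len(values) > 1:
        values = [
            reduce(combine, values[i : i + combine_size])
            for i in range(0, len(values), combine_size)
        ]
    return values[0]

# 1 * 2 * ... * 8 == 40320, reduced here in three rounds of pairwise products.
assert tree_reduce(list(range(1, 9)), combine_size=2) == 40320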
+ + Parameters + ---------- + x : array_like + Input data. + copy : bool, optional + Whether to create a copy of `x` (True) or to replace values + in-place (False). The in-place operation only occurs if + casting to an array does not require a copy. + Default is True. + + Returns + ------- + out : Tensor + `x`, with the non-finite values replaced. If `copy` is False, this may + be `x` itself. + + See Also + -------- + isinf : Shows which elements are positive or negative infinity. + isneginf : Shows which elements are negative infinity. + isposinf : Shows which elements are positive infinity. + isnan : Shows which elements are Not a Number (NaN). + isfinite : Shows which elements are finite (not NaN, not infinity) + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([mt.inf, -mt.inf, mt.nan, -128, 128]) + >>> mt.nan_to_num(x).execute() + array([ 1.79769313e+308, -1.79769313e+308, 0.00000000e+000, + -1.28000000e+002, 1.28000000e+002]) + >>> y = mt.array([complex(mt.inf, mt.nan), mt.nan, complex(mt.nan, mt.inf)]) + >>> mt.nan_to_num(y).execute() + array([ 1.79769313e+308 +0.00000000e+000j, + 0.00000000e+000 +0.00000000e+000j, + 0.00000000e+000 +1.79769313e+308j]) + """ + op = TensorNanToNum(**kwargs) + ret = op(x) + + if copy: + return ret + + # set back, make sure x is a Tensor + if not isinstance(x, Tensor): + raise ValueError(f"`x` must be a Tensor, got {type(x)} instead") + x.data = ret.data + return x diff --git a/python/xorbits/_mars/tensor/arithmetic/negative.py b/python/xorbits/_mars/tensor/arithmetic/negative.py new file mode 100644 index 000000000..e7d495fa7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/negative.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorNegative(TensorUnaryOp): + _op_type_ = OperandDef.NEGATIVE + _func_name = "negative" + + +@infer_dtype(np.negative) +def negative(x, out=None, where=None, **kwargs): + """ + Numerical negative, element-wise. + + Parameters + ---------- + x : array_like or scalar + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + For other keyword-only arguments, see the + :ref:`ufunc docs `. 
+ + Returns + ------- + y : Tensor or scalar + Returned array or scalar: `y = -x`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.negative([1.,-1.]).execute() + array([-1., 1.]) + """ + op = TensorNegative(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/nextafter.py b/python/xorbits/_mars/tensor/arithmetic/nextafter.py new file mode 100644 index 000000000..2b4ffbfcf --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/nextafter.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorNextafter(TensorBinOp): + _op_type_ = OperandDef.NEXTAFTER + _func_name = "nextafter" + + +@infer_dtype(np.nextafter) +def nextafter(x1, x2, out=None, where=None, **kwargs): + """ + Return the next floating-point value after x1 towards x2, element-wise. + + Parameters + ---------- + x1 : array_like + Values to find the next representable value of. + x2 : array_like + The direction where to look for the next representable value of `x1`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + The next representable values of `x1` in the direction of `x2`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> eps = mt.finfo(mt.float64).eps + >>> (mt.nextafter(1, 2) == eps + 1).execute() + True + >>> (mt.nextafter([1, 2], [2, 1]) == [eps + 1, 2 - eps]).execute() + array([ True, True]) + """ + op = TensorNextafter(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/not_equal.py b/python/xorbits/_mars/tensor/arithmetic/not_equal.py new file mode 100644 index 000000000..0c2150bf3 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/not_equal.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorNotEqual(TensorBinOp): + _op_type_ = OperandDef.NE + _func_name = "not_equal" + + +@inject_dtype(np.bool_) +def not_equal(x1, x2, out=None, where=None, **kwargs): + """ + Return (x1 != x2) element-wise. + + Parameters + ---------- + x1, x2 : array_like + Input tensors. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + not_equal : tensor bool, scalar bool + For each element in `x1, x2`, return True if `x1` is not equal + to `x2` and False otherwise. + + + See Also + -------- + equal, greater, greater_equal, less, less_equal + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.not_equal([1.,2.], [1., 3.]).execute() + array([False, True]) + >>> mt.not_equal([1, 2], [[1, 3],[1, 4]]).execute() + array([[False, True], + [False, True]]) + """ + op = TensorNotEqual(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/positive.py b/python/xorbits/_mars/tensor/arithmetic/positive.py new file mode 100644 index 000000000..39c077337 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/positive.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorPositive(TensorUnaryOp): + _op_type_ = OperandDef.POSITIVE + _func_name = "positive" + + +@infer_dtype(np.positive) +def positive(x, out=None, where=None, **kwargs): + """ + Numerical positive, element-wise. + + Parameters + ---------- + x : array_like or scalar + Input tensor. + + Returns + ------- + y : Tensor or scalar + Returned array or scalar: `y = +x`. + """ + op = TensorPositive(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/power.py b/python/xorbits/_mars/tensor/arithmetic/power.py new file mode 100644 index 000000000..0a611c801 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/power.py @@ -0,0 +1,104 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorPower(TensorBinOp): + _op_type_ = OperandDef.POW + _func_name = "power" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@infer_dtype(np.power) +def power(x1, x2, out=None, where=None, **kwargs): + r""" + First tensor elements raised to powers from second tensor, element-wise. + + Raise each base in `x1` to the positionally-corresponding power in + `x2`. `x1` and `x2` must be broadcastable to the same shape. Note that an + integer type raised to a negative integer power will raise a ValueError. + + Parameters + ---------- + x1 : array_like + The bases. + x2 : array_like + The exponents. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The bases in `x1` raised to the exponents in `x2`. + + See Also + -------- + float_power : power function that promotes integers to float + + Examples + -------- + Cube each element in a list. + + >>> import mars.tensor as mt + + >>> x1 = range(6) + >>> x1 + [0, 1, 2, 3, 4, 5] + >>> mt.power(x1, 3).execute() + array([ 0, 1, 8, 27, 64, 125]) + + Raise the bases to different exponents. + + >>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0] + >>> mt.power(x1, x2).execute() + array([ 0., 1., 8., 27., 16., 5.]) + + The effect of broadcasting. + + >>> x2 = mt.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]]) + >>> x2.execute() + array([[1, 2, 3, 3, 2, 1], + [1, 2, 3, 3, 2, 1]]) + >>> mt.power(x1, x2).execute() + array([[ 0, 1, 8, 27, 16, 5], + [ 0, 1, 8, 27, 16, 5]]) + """ + op = TensorPower(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.power, reverse=True) +def rpower(x1, x2, **kwargs): + op = TensorPower(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/rad2deg.py b/python/xorbits/_mars/tensor/arithmetic/rad2deg.py new file mode 100644 index 000000000..34b5067a2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/rad2deg.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorRad2deg(TensorUnaryOp): + _op_type_ = OperandDef.RAD2DEG + _func_name = "rad2deg" + + +@infer_dtype(np.rad2deg) +def rad2deg(x, out=None, where=None, **kwargs): + """ + Convert angles from radians to degrees. + + Parameters + ---------- + x : array_like + Angle in radians. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding angle in degrees. + + See Also + -------- + deg2rad : Convert angles from degrees to radians. + + Notes + ----- + rad2deg(x) is ``180 * x / pi``. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.rad2deg(mt.pi/2).execute() + 90.0 + """ + op = TensorRad2deg(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/radians.py b/python/xorbits/_mars/tensor/arithmetic/radians.py new file mode 100644 index 000000000..2aa943f1f --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/radians.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorRadians(TensorUnaryOp): + _op_type_ = OperandDef.RADIANS + _func_name = "radians" + + +@infer_dtype(np.radians) +def radians(x, out=None, where=None, **kwargs): + """ + Convert angles from degrees to radians. + + Parameters + ---------- + x : array_like + Input tensor in degrees. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding radian values. + + See Also + -------- + deg2rad : equivalent function + + Examples + -------- + Convert a degree array to radians + + >>> import mars.tensor as mt + + >>> deg = mt.arange(12.) * 30. + >>> mt.radians(deg).execute() + array([ 0. , 0.52359878, 1.04719755, 1.57079633, 2.0943951 , + 2.61799388, 3.14159265, 3.66519143, 4.1887902 , 4.71238898, + 5.23598776, 5.75958653]) + + >>> out = mt.zeros((deg.shape)) + >>> ret = mt.radians(deg, out) + >>> ret is out + True + """ + op = TensorRadians(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/real.py b/python/xorbits/_mars/tensor/arithmetic/real.py new file mode 100644 index 000000000..32b651d77 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/real.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorReal(TensorUnaryOp): + _op_type_ = OperandDef.REAL + _func_name = "real" + + +@infer_dtype(np.real) +def real(val, **kwargs): + """ + Return the real part of the complex argument. + + Parameters + ---------- + val : array_like + Input tensor. + + Returns + ------- + out : Tensor or scalar + The real component of the complex argument. If `val` is real, the type + of `val` is used for the output. If `val` has complex elements, the + returned type is float. + + See Also + -------- + real_if_close, imag, angle + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1+2j, 3+4j, 5+6j]) + >>> a.real.execute() + array([ 1., 3., 5.]) + >>> a.real = 9 + >>> a.execute() + array([ 9.+2.j, 9.+4.j, 9.+6.j]) + >>> a.real = mt.array([9, 8, 7]) + >>> a.execute() + array([ 9.+2.j, 8.+4.j, 7.+6.j]) + >>> mt.real(1 + 1j).execute() + 1.0 + + """ + op = TensorReal(**kwargs) + return op(val) diff --git a/python/xorbits/_mars/tensor/arithmetic/reciprocal.py b/python/xorbits/_mars/tensor/arithmetic/reciprocal.py new file mode 100644 index 000000000..cb3bb901b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/reciprocal.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorReciprocal(TensorUnaryOp): + _op_type_ = OperandDef.RECIPROCAL + _func_name = "reciprocal" + + +@infer_dtype(np.reciprocal) +def reciprocal(x, out=None, where=None, **kwargs): + """ + Return the reciprocal of the argument, element-wise. + + Calculates ``1/x``. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + Return tensor. + + Notes + ----- + .. note:: + This function is not designed to work with integers. + + For integer arguments with absolute value larger than 1 the result is + always zero because of the way Python handles integer division. For + integer zero the result is an overflow. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.reciprocal(2.).execute() + 0.5 + >>> mt.reciprocal([1, 2., 3.33]).execute() + array([ 1. , 0.5 , 0.3003003]) + """ + op = TensorReciprocal(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/rint.py b/python/xorbits/_mars/tensor/arithmetic/rint.py new file mode 100644 index 000000000..c15408e2b --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/rint.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorRint(TensorUnaryOp): + _op_type_ = OperandDef.RINT + _func_name = "rint" + + +@infer_dtype(np.rint) +def rint(x, out=None, where=None, **kwargs): + """ + Round elements of the tensor to the nearest integer. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. 
+ where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor or scalar + Output array is same shape and type as `x`. + + See Also + -------- + ceil, floor, trunc + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.rint(a).execute() + array([-2., -2., -0., 0., 2., 2., 2.]) + """ + op = TensorRint(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/rshift.py b/python/xorbits/_mars/tensor/arithmetic/rshift.py new file mode 100644 index 000000000..93aef63e2 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/rshift.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_or") +class TensorRshift(TensorBinOp): + _op_type_ = OperandDef.RSHIFT + _func_name = "right_shift" + + +@infer_dtype(np.right_shift) +def rshift(x1, x2, out=None, where=None, **kwargs): + """ + Shift the bits of an integer to the right. + + Bits are shifted to the right `x2`. Because the internal + representation of numbers is in binary format, this operation is + equivalent to dividing `x1` by ``2**x2``. + + Parameters + ---------- + x1 : array_like, int + Input values. + x2 : array_like, int + Number of bits to remove at the right of `x1`. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor, int + Return `x1` with bits shifted `x2` times to the right. + + See Also + -------- + left_shift : Shift the bits of an integer to the left. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.right_shift(10, 1).execute() + 5 + + >>> mt.right_shift(10, [1,2,3]).execute() + array([5, 2, 1]) + """ + op = TensorRshift(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.right_shift, reverse=True) +def rrshift(x1, x2, **kwargs): + op = TensorRshift(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/setimag.py b/python/xorbits/_mars/tensor/arithmetic/setimag.py new file mode 100644 index 000000000..f3aad4bf9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/setimag.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorSetImag(TensorBinOp): + _op_type_ = OperandDef.SET_IMAG + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + if len(inputs) == 1: + val, imag = inputs[0], op.rhs + else: + assert len(inputs) == 2 + val, imag = inputs + + with device(device_id): + val = val.copy() + val.imag = imag + + ctx[op.outputs[0].key] = val + + +def set_imag(val, imag): + op = TensorSetImag(dtype=val.dtype) + return op(val, imag) diff --git a/python/xorbits/_mars/tensor/arithmetic/setreal.py b/python/xorbits/_mars/tensor/arithmetic/setreal.py new file mode 100644 index 000000000..76cbe52c8 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/setreal.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorSetReal(TensorBinOp): + _op_type_ = OperandDef.SET_REAL + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + if len(inputs) == 1: + val, real = inputs[0], op.rhs + else: + assert len(inputs) == 2 + val, real = inputs + + with device(device_id): + val = val.copy() + val.real = real + + ctx[op.outputs[0].key] = val + + +def set_real(val, real): + op = TensorSetReal(dtype=val.dtype) + return op(val, real) diff --git a/python/xorbits/_mars/tensor/arithmetic/sign.py b/python/xorbits/_mars/tensor/arithmetic/sign.py new file mode 100644 index 000000000..d988337ba --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sign.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSign(TensorUnaryOp): + _op_type_ = OperandDef.SIGN + _func_name = "sign" + + +@infer_dtype(np.sign) +def sign(x, out=None, where=None, **kwargs): + r""" + Returns an element-wise indication of the sign of a number. + + The `sign` function returns ``-1 if x < 0, 0 if x==0, 1 if x > 0``. nan + is returned for nan inputs. + + For complex inputs, the `sign` function returns + ``sign(x.real) + 0j if x.real != 0 else sign(x.imag) + 0j``. + + complex(nan, 0) is returned for complex nan inputs. + + Parameters + ---------- + x : array_like + Input values. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The sign of `x`. + + Notes + ----- + There is more than one definition of sign in common use for complex + numbers. The definition used here is equivalent to :math:`x/\sqrt{x*x}` + which is different from a common alternative, :math:`x/|x|`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.sign([-5., 4.5]).execute() + array([-1., 1.]) + >>> mt.sign(0).execute() + 0 + >>> mt.sign(5-2j).execute() + (1+0j) + """ + op = TensorSign(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/signbit.py b/python/xorbits/_mars/tensor/arithmetic/signbit.py new file mode 100644 index 000000000..8e576f614 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/signbit.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import inject_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSignbit(TensorUnaryOp): + _op_type_ = OperandDef.SIGNBIT + _func_name = "signbit" + + +@inject_dtype(np.bool_) +def signbit(x, out=None, where=None, **kwargs): + """ + Returns element-wise True where signbit is set (less than zero). 
+ + Parameters + ---------- + x : array_like + The input value(s). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + result : Tensor of bool + Output tensor, or reference to `out` if that was supplied. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.signbit(-1.2).execute() + True + >>> mt.signbit(mt.array([1, -2.3, 2.1])).execute() + array([False, True, False]) + """ + op = TensorSignbit(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/sin.py b/python/xorbits/_mars/tensor/arithmetic/sin.py new file mode 100644 index 000000000..b022af5ed --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sin.py @@ -0,0 +1,96 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSin(TensorUnaryOp): + _op_type_ = OperandDef.SIN + _func_name = "sin" + + +@infer_dtype(np.sin) +def sin(x, out=None, where=None, **kwargs): + r""" + Trigonometric sine, element-wise. + + Parameters + ---------- + x : array_like + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : array_like + The sine of each element of x. + + See Also + -------- + arcsin, sinh, cos + + Notes + ----- + The sine is one of the fundamental functions of trigonometry (the + mathematical study of triangles). Consider a circle of radius 1 + centered on the origin. A ray comes in from the :math:`+x` axis, makes + an angle at the origin (measured counter-clockwise from that axis), and + departs from the origin. The :math:`y` coordinate of the outgoing + ray's intersection with the unit circle is the sine of that angle. It + ranges from -1 for :math:`x=3\pi / 2` to +1 for :math:`\pi / 2.` The + function has zeroes where the angle is a multiple of :math:`\pi`. 
+    Sines of angles between :math:`\pi` and :math:`2\pi` are negative.
+    The numerous properties of the sine and related functions are included
+    in any standard trigonometry text.
+
+    Examples
+    --------
+    Print sine of one angle:
+
+    >>> import mars.tensor as mt
+
+    >>> mt.sin(mt.pi/2.).execute()
+    1.0
+
+    Print sines of an array of angles given in degrees:
+
+    >>> mt.sin(mt.array((0., 30., 45., 60., 90.)) * mt.pi / 180. ).execute()
+    array([ 0. , 0.5 , 0.70710678, 0.8660254 , 1. ])
+
+    Plot the sine function:
+
+    >>> import matplotlib.pylab as plt
+    >>> x = mt.linspace(-mt.pi, mt.pi, 201)
+    >>> plt.plot(x.execute(), mt.sin(x).execute())
+    >>> plt.xlabel('Angle [rad]')
+    >>> plt.ylabel('sin(x)')
+    >>> plt.axis('tight')
+    >>> plt.show()
+    """
+    op = TensorSin(**kwargs)
+    return op(x, out=out, where=where)
diff --git a/python/xorbits/_mars/tensor/arithmetic/sinc.py b/python/xorbits/_mars/tensor/arithmetic/sinc.py
new file mode 100644
index 000000000..a1e12b510
--- /dev/null
+++ b/python/xorbits/_mars/tensor/arithmetic/sinc.py
@@ -0,0 +1,100 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ..utils import infer_dtype
+from .core import TensorUnaryOp
+from .utils import arithmetic_operand
+
+
+@arithmetic_operand(sparse_mode="unary")
+class TensorSinc(TensorUnaryOp):
+    _op_type_ = OperandDef.SINC
+    _func_name = "sinc"
+
+
+@infer_dtype(np.sinc)
+def sinc(x, **kwargs):
+    r"""
+    Return the sinc function.
+
+    The sinc function is :math:`\\sin(\\pi x)/(\\pi x)`.
+
+    Parameters
+    ----------
+    x : Tensor
+        Tensor (possibly multi-dimensional) of values for which to
+        calculate ``sinc(x)``.
+
+    Returns
+    -------
+    out : Tensor
+        ``sinc(x)``, which has the same shape as the input.
+
+    Notes
+    -----
+    ``sinc(0)`` is the limit value 1.
+
+    The name sinc is short for "sine cardinal" or "sinus cardinalis".
+
+    The sinc function is used in various signal processing applications,
+    including in anti-aliasing, in the construction of a Lanczos resampling
+    filter, and in interpolation.
+
+    For bandlimited interpolation of discrete-time signals, the ideal
+    interpolation kernel is proportional to the sinc function.
+
+    References
+    ----------
+    .. [1] Weisstein, Eric W. "Sinc Function." From MathWorld--A Wolfram Web
+           Resource. http://mathworld.wolfram.com/SincFunction.html
+    ..
[2] Wikipedia, "Sinc function", + http://en.wikipedia.org/wiki/Sinc_function + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.linspace(-4, 4, 41) + >>> mt.sinc(x).execute() + array([ -3.89804309e-17, -4.92362781e-02, -8.40918587e-02, + -8.90384387e-02, -5.84680802e-02, 3.89804309e-17, + 6.68206631e-02, 1.16434881e-01, 1.26137788e-01, + 8.50444803e-02, -3.89804309e-17, -1.03943254e-01, + -1.89206682e-01, -2.16236208e-01, -1.55914881e-01, + 3.89804309e-17, 2.33872321e-01, 5.04551152e-01, + 7.56826729e-01, 9.35489284e-01, 1.00000000e+00, + 9.35489284e-01, 7.56826729e-01, 5.04551152e-01, + 2.33872321e-01, 3.89804309e-17, -1.55914881e-01, + -2.16236208e-01, -1.89206682e-01, -1.03943254e-01, + -3.89804309e-17, 8.50444803e-02, 1.26137788e-01, + 1.16434881e-01, 6.68206631e-02, 3.89804309e-17, + -5.84680802e-02, -8.90384387e-02, -8.40918587e-02, + -4.92362781e-02, -3.89804309e-17]) + + >>> import matplotlib.pyplot as plt + >>> plt.plot(x.execute(), np.sinc(x).execute()) + [] + >>> plt.title("Sinc Function") + + >>> plt.ylabel("Amplitude") + + >>> plt.xlabel("X") + + >>> plt.show() + """ + op = TensorSinc(**kwargs) + return op(x) diff --git a/python/xorbits/_mars/tensor/arithmetic/sinh.py b/python/xorbits/_mars/tensor/arithmetic/sinh.py new file mode 100644 index 000000000..f11f2a542 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sinh.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSinh(TensorUnaryOp): + _op_type_ = OperandDef.SINH + _func_name = "sinh" + + +@infer_dtype(np.sinh) +def sinh(x, out=None, where=None, **kwargs): + """ + Hyperbolic sine, element-wise. + + Equivalent to ``1/2 * (mt.exp(x) - mt.exp(-x))`` or + ``-1j * mt.sin(1j*x)``. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding hyperbolic sine values. + + Notes + ----- + If `out` is provided, the function writes the result into it, + and returns a reference to `out`. (See Examples) + + References + ---------- + M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions. + New York, NY: Dover, 1972, pg. 83. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.sinh(0).execute() + 0.0 + >>> mt.sinh(mt.pi*1j/2).execute() + 1j + >>> mt.sinh(mt.pi*1j).execute() # (exact value is 0) + 1.2246063538223773e-016j + >>> # Discrepancy due to vagaries of floating point arithmetic. + + >>> # Example of providing the optional output parameter + >>> out1 = mt.zeros(1) + >>> out2 = mt.sinh([0.1], out1) + >>> out2 is out1 + True + + >>> # Example of ValueError due to provision of shape mis-matched `out` + >>> mt.sinh(mt.zeros((3,3)),mt.zeros((2,2))).execute() + Traceback (most recent call last): + ... + ValueError: operands could not be broadcast together with shapes (3,3) (2,2) + """ + op = TensorSinh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/spacing.py b/python/xorbits/_mars/tensor/arithmetic/spacing.py new file mode 100644 index 000000000..a3c1c97ab --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/spacing.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="always_false") +class TensorSpacing(TensorUnaryOp): + _op_type_ = OperandDef.SPACING + _func_name = "spacing" + + +@infer_dtype(np.spacing) +def spacing(x, out=None, where=None, **kwargs): + """ + Return the distance between x and the nearest adjacent number. + + Parameters + ---------- + x : array_like + Values to find the spacing of. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : array_like + The spacing of values of `x1`. + + Notes + ----- + It can be considered as a generalization of EPS: + ``spacing(mt.float64(1)) == mt.finfo(mt.float64).eps``, and there + should not be any representable number between ``x + spacing(x)`` and + x for any finite x. + + Spacing of +- inf and NaN is NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> (mt.spacing(1) == mt.finfo(mt.float64).eps).execute() + True + """ + op = TensorSpacing(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/sqrt.py b/python/xorbits/_mars/tensor/arithmetic/sqrt.py new file mode 100644 index 000000000..b5b9ee785 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/sqrt.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ..utils import infer_dtype
+from .core import TensorUnaryOp
+from .utils import arithmetic_operand
+
+
+@arithmetic_operand(sparse_mode="unary")
+class TensorSqrt(TensorUnaryOp):
+    _op_type_ = OperandDef.SQRT
+    _func_name = "sqrt"
+
+
+@infer_dtype(np.sqrt)
+def sqrt(x, out=None, where=None, **kwargs):
+    """
+    Return the positive square-root of a tensor, element-wise.
+
+    Parameters
+    ----------
+    x : array_like
+        The values whose square-roots are required.
+    out : Tensor, None, or tuple of Tensor and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated tensor is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+    where : array_like, optional
+        Values of True indicate to calculate the ufunc at that position, values
+        of False indicate to leave the value in the output alone.
+    **kwargs
+
+    Returns
+    -------
+    y : Tensor
+        A tensor of the same shape as `x`, containing the positive
+        square-root of each element in `x`. If any element in `x` is
+        complex, a complex tensor is returned (and the square-roots of
+        negative reals are calculated). If all of the elements in `x`
+        are real, so is `y`, with negative elements returning ``nan``.
+        If `out` was provided, `y` is a reference to it.
+
+    Notes
+    -----
+    *sqrt* has--consistent with common convention--as its branch cut the
+    real "interval" [`-inf`, 0), and is continuous from above on it.
+    A branch cut is a curve in the complex plane across which a given
+    complex function fails to be continuous.
+
+    Examples
+    --------
+    >>> import mars.tensor as mt
+
+    >>> mt.sqrt([1,4,9]).execute()
+    array([ 1., 2., 3.])
+
+    >>> mt.sqrt([4, -1, -3+4J]).execute()
+    array([ 2.+0.j, 0.+1.j, 1.+2.j])
+
+    >>> mt.sqrt([4, -1, mt.inf]).execute()
+    array([ 2., NaN, Inf])
+    """
+    op = TensorSqrt(**kwargs)
+    return op(x, out=out, where=where)
diff --git a/python/xorbits/_mars/tensor/arithmetic/square.py b/python/xorbits/_mars/tensor/arithmetic/square.py
new file mode 100644
index 000000000..37c6418b0
--- /dev/null
+++ b/python/xorbits/_mars/tensor/arithmetic/square.py
@@ -0,0 +1,67 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...
import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorSquare(TensorUnaryOp): + _op_type_ = OperandDef.SQUARE + _func_name = "square" + + +@infer_dtype(np.square) +def square(x, out=None, where=None, **kwargs): + """ + Return the element-wise square of the input. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Element-wise `x*x`, of the same shape and dtype as `x`. + Returns scalar if `x` is a scalar. + + See Also + -------- + sqrt + power + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.square([-1j, 1]).execute() + array([-1.-0.j, 1.+0.j]) + """ + op = TensorSquare(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/subtract.py b/python/xorbits/_mars/tensor/arithmetic/subtract.py new file mode 100644 index 000000000..ebc39eeb6 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/subtract.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="binary_and") +class TensorSubtract(TensorBinOp): + _op_type_ = OperandDef.SUB + _func_name = "subtract" + + +@infer_dtype(np.subtract) +def subtract(x1, x2, out=None, where=None, **kwargs): + """ + Subtract arguments, element-wise. + + Parameters + ---------- + x1, x2 : array_like + The tensors to be subtracted from each other. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The difference of `x1` and `x2`, element-wise. Returns a scalar if + both `x1` and `x2` are scalars. + + Notes + ----- + Equivalent to ``x1 - x2`` in terms of tensor broadcasting. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.subtract(1.0, 4.0).execute() + -3.0 + + >>> x1 = mt.arange(9.0).reshape((3, 3)) + >>> x2 = mt.arange(3.0) + >>> mt.subtract(x1, x2).execute() + array([[ 0., 0., 0.], + [ 3., 3., 3.], + [ 6., 6., 6.]]) + """ + op = TensorSubtract(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.subtract, reverse=True) +def rsubtract(x1, x2, **kwargs): + op = TensorSubtract(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/tan.py b/python/xorbits/_mars/tensor/arithmetic/tan.py new file mode 100644 index 000000000..ef9495ba7 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tan.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorTan(TensorUnaryOp): + _op_type_ = OperandDef.TAN + _func_name = "tan" + + +@infer_dtype(np.tan) +def tan(x, out=None, where=None, **kwargs): + """ + Compute tangent element-wise. + + Equivalent to ``mt.sin(x)/mt.cos(x)`` element-wise. + + Parameters + ---------- + x : array_like + Input tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor + The corresponding tangent values. + + Notes + ----- + If `out` is provided, the function writes the result into it, + and returns a reference to `out`. (See Examples) + + References + ---------- + M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions. + New York, NY: Dover, 1972. 
+
+    Examples
+    --------
+    >>> from math import pi
+    >>> import mars.tensor as mt
+    >>> mt.tan(mt.array([-pi,pi/2,pi])).execute()
+    array([ 1.22460635e-16, 1.63317787e+16, -1.22460635e-16])
+    >>>
+    >>> # Example of providing the optional output parameter illustrating
+    >>> # that what is returned is a reference to said parameter
+    >>> out1 = mt.zeros(1)
+    >>> out2 = mt.tan([0.1], out1)
+    >>> out2 is out1
+    True
+    >>>
+    >>> # Example of ValueError due to provision of shape mis-matched `out`
+    >>> mt.tan(mt.zeros((3,3)),mt.zeros((2,2)))
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    ValueError: invalid return array shape
+    """
+    op = TensorTan(**kwargs)
+    return op(x, out=out, where=where)
diff --git a/python/xorbits/_mars/tensor/arithmetic/tanh.py b/python/xorbits/_mars/tensor/arithmetic/tanh.py
new file mode 100644
index 000000000..ad21f6b94
--- /dev/null
+++ b/python/xorbits/_mars/tensor/arithmetic/tanh.py
@@ -0,0 +1,90 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ..utils import infer_dtype
+from .core import TensorUnaryOp
+from .utils import arithmetic_operand
+
+
+@arithmetic_operand(sparse_mode="unary")
+class TensorTanh(TensorUnaryOp):
+    _op_type_ = OperandDef.TANH
+    _func_name = "tanh"
+
+
+@infer_dtype(np.tanh)
+def tanh(x, out=None, where=None, **kwargs):
+    """
+    Compute hyperbolic tangent element-wise.
+
+    Equivalent to ``mt.sinh(x)/mt.cosh(x)`` or ``-1j * mt.tan(1j*x)``.
+
+    Parameters
+    ----------
+    x : array_like
+        Input tensor.
+    out : Tensor, None, or tuple of Tensor and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated tensor is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+    where : array_like, optional
+        Values of True indicate to calculate the ufunc at that position, values
+        of False indicate to leave the value in the output alone.
+    **kwargs
+
+    Returns
+    -------
+    y : Tensor
+        The corresponding hyperbolic tangent values.
+
+    Notes
+    -----
+    If `out` is provided, the function writes the result into it,
+    and returns a reference to `out`. (See Examples)
+
+    References
+    ----------
+    .. [1] M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.
+           New York, NY: Dover, 1972, pg. 83.
+           http://www.math.sfu.ca/~cbm/aands/
+
+    .. [2] Wikipedia, "Hyperbolic function",
+           http://en.wikipedia.org/wiki/Hyperbolic_function
+
+    Examples
+    --------
+    >>> import mars.tensor as mt
+
+    >>> mt.tanh((0, mt.pi*1j, mt.pi*1j/2)).execute()
+    array([ 0. +0.00000000e+00j, 0. -1.22460635e-16j, 0.
+1.63317787e+16j]) + + >>> # Example of providing the optional output parameter illustrating + >>> # that what is returned is a reference to said parameter + >>> out1 = mt.zeros(1) + >>> out2 = mt.tanh([0.1], out1) + >>> out2 is out1 + True + + >>> # Example of ValueError due to provision of shape mis-matched `out` + >>> mt.tanh(mt.zeros((3,3)),mt.zeros((2,2))) + Traceback (most recent call last): + ... + ValueError: operands could not be broadcast together with shapes (3,3) (2,2) + """ + op = TensorTanh(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/tests/__init__.py b/python/xorbits/_mars/tensor/arithmetic/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic.py b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic.py new file mode 100644 index 000000000..7ebaf45a8 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic.py @@ -0,0 +1,640 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....core import enter_mode, tile +from ...core import SparseTensor, Tensor +from ...datasource import array, empty, ones, tensor +from ...fetch import TensorFetch +from ...linalg import matmul +from .. 
import ( + TensorAdd, + TensorGreaterThan, + TensorIsclose, + TensorLog, + TensorSubtract, + TensorTreeAdd, + TensorTreeMultiply, + add, + around, + cos, + frexp, + isclose, + isfinite, + log, + negative, + subtract, + tree_add, + tree_multiply, + truediv, +) + + +def test_add(): + t1 = ones((3, 4), chunk_size=2) + t2 = ones(4, chunk_size=2) + t3 = t1 + t2 + k1 = t3.key + assert t3.op.gpu is None + t1, t2, t3 = tile(t1, t2, t3) + assert t3.key != k1 + assert t3.shape == (3, 4) + assert len(t3.chunks) == 4 + assert t3.chunks[0].inputs == [t1.chunks[0].data, t2.chunks[0].data] + assert t3.chunks[1].inputs == [t1.chunks[1].data, t2.chunks[1].data] + assert t3.chunks[2].inputs == [t1.chunks[2].data, t2.chunks[0].data] + assert t3.chunks[3].inputs == [t1.chunks[3].data, t2.chunks[1].data] + assert t3.op.dtype == np.dtype("f8") + assert t3.chunks[0].op.dtype == np.dtype("f8") + + t1 = ones((3, 4), chunk_size=2) + t4 = t1 + 1 + t1, t4 = tile(t1, t4) + assert t4.shape == (3, 4) + assert len(t3.chunks) == 4 + assert t4.chunks[0].inputs == [t1.chunks[0].data] + assert t4.chunks[0].op.rhs == 1 + assert t4.chunks[1].inputs == [t1.chunks[1].data] + assert t4.chunks[1].op.rhs == 1 + assert t4.chunks[2].inputs == [t1.chunks[2].data] + assert t4.chunks[2].op.rhs == 1 + assert t4.chunks[3].inputs == [t1.chunks[3].data] + assert t4.chunks[3].op.rhs == 1 + + t5 = add([1, 2, 3, 4], 1) + tile(t5) + assert t4.chunks[0].inputs == [t1.chunks[0].data] + + t2 = ones(4, chunk_size=2) + t6 = ones((3, 4), chunk_size=2, gpu=True) + t7 = ones(4, chunk_size=2, gpu=True) + t8 = t6 + t7 + t9 = t6 + t2 + assert t8.op.gpu is True + t8, t9 = tile(t8, t9) + assert t8.chunks[0].op.gpu is True + assert t9.op.gpu is None + assert t9.chunks[0].op.gpu is None + + # sparse tests + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = t1 + 1 + assert t.op.gpu is None + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t = t1 + 0 + assert t.issparse() is True + assert type(t) is SparseTensor + + t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse() + + t = t1 + t2 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t3 = tensor([1, 1, 1], chunk_size=2) + t = t1 + t3 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + +def test_add_order(): + raw_a = np.random.rand(4, 2) + raw_b = np.asfortranarray(np.random.rand(4, 2)) + t1 = tensor(raw_a) + t2 = tensor(raw_b) + out = tensor(raw_b) + + # C + scalar + assert (t1 + 1).flags["C_CONTIGUOUS"] == (raw_a + 1).flags["C_CONTIGUOUS"] + assert (t1 + 1).flags["F_CONTIGUOUS"] == (raw_a + 1).flags["F_CONTIGUOUS"] + # C + C + assert (t1 + t1).flags["C_CONTIGUOUS"] == (raw_a + raw_a).flags["C_CONTIGUOUS"] + assert (t1 + t1).flags["F_CONTIGUOUS"] == (raw_a + raw_a).flags["F_CONTIGUOUS"] + # F + scalar + assert (t2 + 1).flags["C_CONTIGUOUS"] == (raw_b + 1).flags["C_CONTIGUOUS"] + assert (t2 + 1).flags["F_CONTIGUOUS"] == (raw_b + 1).flags["F_CONTIGUOUS"] + # F + F + assert (t2 + t2).flags["C_CONTIGUOUS"] == (raw_b + raw_b).flags["C_CONTIGUOUS"] + assert (t2 + t2).flags["F_CONTIGUOUS"] == (raw_b + raw_b).flags["F_CONTIGUOUS"] + # C + F + assert (t1 + t2).flags["C_CONTIGUOUS"] == (raw_a + raw_b).flags["C_CONTIGUOUS"] + assert (t1 + t2).flags["F_CONTIGUOUS"] == (raw_a + raw_b).flags["F_CONTIGUOUS"] + # C + C + out + assert ( + add(t1, t1, out=out).flags["C_CONTIGUOUS"] + == np.add(raw_a, raw_a, 
out=np.empty((4, 2), order="F")).flags["C_CONTIGUOUS"] + ) + assert ( + add(t1, t1, out=out).flags["F_CONTIGUOUS"] + == np.add(raw_a, raw_a, out=np.empty((4, 2), order="F")).flags["F_CONTIGUOUS"] + ) + + with pytest.raises(TypeError): + add(t1, 1, order="B") + + +def test_multiply(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = t1 * 10 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse() + + t = t1 * t2 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t3 = tensor([1, 1, 1], chunk_size=2) + t = t1 * t3 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + +def test_divide(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = t1 / 10 + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + t2 = tensor([[1, 0, 0]], chunk_size=2).tosparse() + + t = t1 / t2 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + t3 = tensor([1, 1, 1], chunk_size=2) + t = t1 / t3 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + t = t3 / t1 + assert t.issparse() is False + assert type(t) is Tensor + + t = tile(t) + assert t.chunks[0].op.sparse is False + + +def test_datatime_arith(): + t1 = array([np.datetime64("2005-02-02"), np.datetime64("2005-02-03")]) + t2 = t1 + np.timedelta64(1) + + assert isinstance(t2.op, TensorAdd) + + t3 = t1 - np.datetime64("2005-02-02") + + assert isinstance(t3.op, TensorSubtract) + assert ( + t3.dtype + == ( + np.array(["2005-02-02", "2005-02-03"], dtype=np.datetime64) + - np.datetime64("2005-02-02") + ).dtype + ) + + t1 = array([np.datetime64("2005-02-02"), np.datetime64("2005-02-03")]) + subtract(t1, np.datetime64("2005-02-02"), out=empty(t1.shape, dtype=t3.dtype)) + + t1 = array([np.datetime64("2005-02-02"), np.datetime64("2005-02-03")]) + add(t1, np.timedelta64(1, "D"), out=t1) + + +def test_add_with_out(): + t1 = ones((3, 4), chunk_size=2) + t2 = ones(4, chunk_size=2) + + t3 = add(t1, t2, out=t1) + + assert isinstance(t1.op, TensorAdd) + assert t1.op.out.key == t1.op.lhs.key + assert t3 is t1 + assert t3.shape == (3, 4) + assert t3.op.lhs.extra_params.raw_chunk_size == 2 + assert t3.op.rhs is t2.data + assert t3.key != t3.op.lhs.key + + t1, t3 = tile(t1, t3) + + assert isinstance(t1.chunks[0].op, TensorAdd) + assert t1.chunks[0].op.out.key == t1.chunks[0].op.lhs.key + + with pytest.raises(TypeError): + add(t1, t2, out=1) + + with pytest.raises(ValueError): + add(t1, t2, out=t2) + + with pytest.raises(TypeError): + truediv(t1, t2, out=t1.astype("i8")) + + t1 = ones((3, 4), chunk_size=2, dtype=float) + t2 = ones(4, chunk_size=2, dtype=int) + + t3 = add(t2, 1, out=t1) + assert t3.shape == (3, 4) + assert t3.dtype == np.float64 + + +def test_dtype_from_out(): + x = array([-np.inf, 0.0, np.inf]) + y = array([2, 2, 2]) + + t3 = isfinite(x, y) + assert t3.dtype == y.dtype + + +def test_log_without_where(): + t1 = ones((3, 4), chunk_size=2) + + t2 = log(t1, out=t1) + + assert isinstance(t2.op, TensorLog) + assert t1.op.out.key == t1.op.input.key + assert t2 is t1 + assert t2.op.input.extra_params.raw_chunk_size == 2 + assert t2.key != t2.op.input.key + + t3 = empty((3, 4), chunk_size=2) + t4 = 
log(t1, out=t3, where=t1 > 0) + assert isinstance(t4.op, TensorLog) + assert t4 is t3 + assert t2.op.input.extra_params.raw_chunk_size == 2 + assert t2.key != t2.op.input.key + + +def test_copy_add(): + t1 = ones((3, 4), chunk_size=2) + t2 = ones(4, chunk_size=2) + t3 = t1 + t2 + t3 = tile(t3) + + c = t3.chunks[0] + inputs = ( + c.op.lhs, + TensorFetch().new_chunk( + c.op.rhs.inputs, + shape=c.op.rhs.shape, + index=c.op.rhs.index, + _key=c.op.rhs.key, + ), + ) + new_c = c.op.copy().reset_key().new_chunk(inputs, shape=c.shape, _key="new_key") + assert new_c.key == "new_key" + assert new_c.inputs[1] is new_c.op.rhs + assert isinstance(new_c.inputs[1].op, TensorFetch) + + +def test_compare(): + t1 = ones(4, chunk_size=2) * 2 + t2 = ones(4, chunk_size=2) + t3 = t1 > t2 + t3 = tile(t3) + assert len(t3.chunks) == 2 + assert isinstance(t3.op, TensorGreaterThan) + + +def test_unify_chunk_add(): + t1 = ones(4, chunk_size=2) + t2 = ones(1, chunk_size=1) + + t3 = t1 + t2 + t1, t2, t3 = tile(t1, t2, t3) + + assert len(t3.chunks) == 2 + assert t3.chunks[0].inputs[0] == t1.chunks[0].data + assert t3.chunks[0].inputs[1] == t2.chunks[0].data + assert t3.chunks[1].inputs[0] == t1.chunks[1].data + assert t3.chunks[1].inputs[1] == t2.chunks[0].data + + +def test_frexp(): + t1 = ones((3, 4, 5), chunk_size=2) + t2 = empty((3, 4, 5), dtype=np.float_, chunk_size=2) + op_type = type(t1.op) + + o1, o2 = frexp(t1) + + assert o1.op is o2.op + assert o1.dtype != o2.dtype + + o1, o2 = frexp(t1, t1) + + assert o1 is t1 + assert o1.inputs[0] is not t1 + assert isinstance(o1.inputs[0].op, op_type) + assert o2.inputs[0] is not t1 + + o1, o2 = frexp(t1, t2, where=t1 > 0) + + op_type = type(t2.op) + assert o1 is t2 + assert o1.inputs[0] is not t1 + assert isinstance(o1.inputs[0].op, op_type) + assert o2.inputs[0] is not t1 + + +def test_frexp_order(): + raw1 = np.asfortranarray(np.random.rand(2, 4)) + t = tensor(raw1) + o1 = tensor(np.random.rand(2, 4)) + + o1, o2 = frexp(t, out1=o1) + + assert ( + o1.flags["C_CONTIGUOUS"] + == np.frexp(raw1, np.empty((2, 4)))[0].flags["C_CONTIGUOUS"] + ) + assert ( + o1.flags["F_CONTIGUOUS"] + == np.frexp(raw1, np.empty((2, 4)))[0].flags["F_CONTIGUOUS"] + ) + assert o2.flags["C_CONTIGUOUS"] == np.frexp(raw1)[1].flags["C_CONTIGUOUS"] + assert o2.flags["F_CONTIGUOUS"] == np.frexp(raw1)[1].flags["F_CONTIGUOUS"] + + +def test_dtype(): + t1 = ones((2, 3), dtype="f4", chunk_size=2) + + t = truediv(t1, 2, dtype="f8") + + assert t.dtype == np.float64 + + with pytest.raises(TypeError): + truediv(t1, 2, dtype="i4") + + +def test_negative(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = negative(t1) + assert t.op.gpu is None + assert t.issparse() is True + assert type(t) is SparseTensor + + t = tile(t) + assert t.chunks[0].op.sparse is True + + +def test_negative_order(): + raw1 = np.random.rand(4, 2) + raw2 = np.asfortranarray(np.random.rand(4, 2)) + t1 = tensor(raw1) + t2 = tensor(raw2) + t3 = tensor(raw1) + t4 = tensor(raw2) + + # C + assert negative(t1).flags["C_CONTIGUOUS"] == np.negative(raw1).flags["C_CONTIGUOUS"] + assert negative(t1).flags["F_CONTIGUOUS"] == np.negative(raw1).flags["F_CONTIGUOUS"] + # F + assert negative(t2).flags["C_CONTIGUOUS"] == np.negative(raw2).flags["C_CONTIGUOUS"] + assert negative(t2).flags["F_CONTIGUOUS"] == np.negative(raw2).flags["F_CONTIGUOUS"] + # C + out + assert ( + negative(t1, out=t4).flags["C_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="F")).flags["C_CONTIGUOUS"] + ) + assert ( + negative(t1, 
out=t4).flags["F_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="F")).flags["F_CONTIGUOUS"] + ) + # F + out + assert ( + negative(t2, out=t3).flags["C_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="C")).flags["C_CONTIGUOUS"] + ) + assert ( + negative(t2, out=t3).flags["F_CONTIGUOUS"] + == np.negative(raw1, out=np.empty((4, 2), order="C")).flags["F_CONTIGUOUS"] + ) + + with pytest.raises(TypeError): + negative(t1, order="B") + + +def test_cos(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + + t = cos(t1) + assert t.issparse() is True + assert type(t) is SparseTensor + + +def test_around(): + t1 = ones((2, 3), dtype="f4", chunk_size=2) + + t = around(t1, decimals=3) + + assert t.issparse() is False + assert t.op.decimals == 3 + + t = tile(t) + + assert t.chunks[0].op.decimals == 3 + + +def test_isclose(): + t1 = ones((2, 3), dtype="f4", chunk_size=2) + + atol = 1e-4 + rtol = 1e-5 + equal_nan = True + + t = isclose(t1, 2, atol=atol, rtol=rtol, equal_nan=equal_nan) + + assert isinstance(t.op, TensorIsclose) + assert t.op.atol == atol + assert t.op.rtol == rtol + assert t.op.equal_nan == equal_nan + + t = tile(t) + + assert isinstance(t.chunks[0].op, TensorIsclose) + assert t.chunks[0].op.atol == atol + assert t.chunks[0].op.rtol == rtol + assert t.chunks[0].op.equal_nan == equal_nan + + t1 = ones((2, 3), dtype="f4", chunk_size=2) + t2 = ones((2, 3), dtype="f4", chunk_size=2) + + atol = 1e-4 + rtol = 1e-5 + equal_nan = True + + t = isclose(t1, t2, atol=atol, rtol=rtol, equal_nan=equal_nan) + + assert isinstance(t.op, TensorIsclose) + assert t.op.atol == atol + assert t.op.rtol == rtol + assert t.op.equal_nan == equal_nan + + t = tile(t) + + assert isinstance(t.chunks[0].op, TensorIsclose) + assert t.chunks[0].op.atol == atol + assert t.chunks[0].op.rtol == rtol + assert t.chunks[0].op.equal_nan == equal_nan + + +def test_matmul(): + a_data = [[1, 0], [0, 1]] + b_data = [[4, 1], [2, 2]] + + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = matmul(a, b) + + assert t.shape == (2, 2) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + b_data = [1, 2] + b = tensor(b_data, chunk_size=1) + + t = matmul(a, b) + + assert t.shape == (2,) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + t = matmul(b, a) + + assert t.shape == (2,) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + a_data = np.arange(2 * 2 * 4).reshape((2, 2, 4)) + b_data = np.arange(2 * 2 * 4).reshape((2, 4, 2)) + + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = matmul(a, b) + + assert t.shape == (2, 2, 2) + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + t = matmul(tensor([2j, 3j], chunk_size=1), tensor([2j, 3j], chunk_size=1)) + + assert t.shape == () + t = tile(t) + assert t.shape == tuple(sum(s) for s in t.nsplits) + + with pytest.raises(ValueError): + matmul([1, 2], 3) + + with pytest.raises(ValueError): + matmul(np.random.randn(2, 3, 4), np.random.randn(3, 4, 3)) + + t = matmul( + tensor(np.random.randn(2, 3, 4), chunk_size=2), + tensor(np.random.randn(3, 1, 4, 3), chunk_size=3), + ) + assert t.shape == (3, 2, 3, 3) + + v = ones((100, 100), chunk_size=10) + tv = matmul(v, v) + assert tv.shape == (100, 100) + tv = tile(tv) + assert tv.shape == tuple(sum(s) for s in tv.nsplits) + + +def test_tree_arithmetic(): + raws = [np.random.rand(10, 10) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + t = tree_add(*tensors, 
combine_size=4) + assert isinstance(t.op, TensorTreeAdd) + assert t.issparse() is False + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + t = tree_multiply(*tensors, combine_size=4) + assert isinstance(t.op, TensorTreeMultiply) + assert t.issparse() is False + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + raws = [sps.random(5, 9, density=0.1) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + t = tree_add(*tensors, combine_size=4) + assert isinstance(t.op, TensorTreeAdd) + assert t.issparse() is True + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + t = tree_multiply(*tensors, combine_size=4) + assert isinstance(t.op, TensorTreeMultiply) + assert t.issparse() is True + assert len(t.inputs) == 3 + assert len(t.inputs[0].inputs) == 4 + assert len(t.inputs[-1].inputs) == 2 + + +def test_get_set_real(): + a_data = np.array([1 + 2j, 3 + 4j, 5 + 6j]) + a = tensor(a_data, chunk_size=2) + + with pytest.raises(ValueError): + a.real = [2, 4] + + +def test_build_mode(): + t1 = ones((2, 3), chunk_size=2) + assert t1 == 2 + + with enter_mode(build=True): + assert t1 != 2 diff --git a/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic_execution.py b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic_execution.py new file mode 100644 index 000000000..a9d6580d5 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/tests/test_arithmetic_execution.py @@ -0,0 +1,795 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import operator + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....config import option_context +from ....session import execute, fetch +from ....tests.core import require_cupy +from ....utils import ignore_warning +from ...datasource import arange, ones, tensor, zeros +from .. 
import ( + add, + arctan2, + clip, + cos, + frexp, + isclose, + modf, + tree_add, + tree_multiply, + truediv, +) + + +def _nan_equal(a, b): + try: + np.testing.assert_equal(a, b) + except AssertionError: + return False + return True + + +def _get_func(op): + if isinstance(op, str): + return getattr(np, op) + return op + + +def _get_sparse_func(op): + from ....lib.sparse.core import issparse + + if isinstance(op, str): + op = getattr(np, op) + + def func(*args): + new_args = [] + for arg in args: + if issparse(arg): + new_args.append(arg.toarray()) + else: + new_args.append(arg) + + return op(*new_args) + + return func + + +def toarray(x): + if hasattr(x, "toarray"): + return x.toarray() + return x + + +def test_base_execution(setup): + arr = ones((10, 8), chunk_size=2) + arr2 = arr + 1 + + res = arr2.execute().fetch() + + np.testing.assert_array_equal(res, np.ones((10, 8)) + 1) + + data = np.random.random((10, 8, 3)) + arr = tensor(data, chunk_size=2) + arr2 = arr + 1 + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, data + 1) + + +def test_base_order_execution(setup): + raw = np.asfortranarray(np.random.rand(5, 6)) + arr = tensor(raw, chunk_size=3) + + res = (arr + 1).execute().fetch() + np.testing.assert_array_equal(res, raw + 1) + assert res.flags["C_CONTIGUOUS"] is False + assert res.flags["F_CONTIGUOUS"] is True + + res2 = add(arr, 1, order="C").execute().fetch() + np.testing.assert_array_equal(res2, np.add(raw, 1, order="C")) + assert res2.flags["C_CONTIGUOUS"] is True + assert res2.flags["F_CONTIGUOUS"] is False + + +def test_ufunc_execution(setup): + from .. import ( + BIN_UFUNC, + UNARY_UFUNC, + arccosh, + bitand, + bitor, + bitxor, + fmod, + invert, + ldexp, + lshift, + mod, + rshift, + ) + + _sp_unary_ufunc = {arccosh, invert} + _sp_bin_ufunc = {mod, fmod, bitand, bitor, bitxor, lshift, rshift, ldexp} + + data1 = np.random.random((5, 6, 2)) + data2 = np.random.random((5, 6, 2)) + rand = np.random.random() + arr1 = tensor(data1, chunk_size=3) + arr2 = tensor(data2, chunk_size=3) + + _new_unary_ufunc = UNARY_UFUNC - _sp_unary_ufunc + for func in _new_unary_ufunc: + res_tensor = func(arr1) + assert res_tensor.dtype is not None + res = res_tensor.execute().fetch() + expected = _get_func(res_tensor.op._func_name)(data1) + np.testing.assert_array_almost_equal(res, expected) + + _new_bin_ufunc = BIN_UFUNC - _sp_bin_ufunc + for func in _new_bin_ufunc: + res_tensor1 = func(arr1, arr2) + assert res_tensor1.dtype is not None + res_tensor2 = func(arr1, rand) + assert res_tensor2.dtype is not None + res_tensor3 = func(rand, arr1) + assert res_tensor3.dtype is not None + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + + expected1 = _get_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_func(res_tensor1.op._func_name)(rand, data1) + + np.testing.assert_array_almost_equal(res1, expected1) + np.testing.assert_array_almost_equal(res2, expected2) + np.testing.assert_array_almost_equal(res3, expected3) + + data1 = np.random.randint(2, 10, size=(10, 10, 10)) + data2 = np.random.randint(2, 10, size=(10, 10, 10)) + rand = np.random.randint(1, 10) + arr1 = tensor(data1, chunk_size=6) + arr2 = tensor(data2, chunk_size=6) + + for func in _sp_unary_ufunc: + res_tensor = func(arr1) + assert res_tensor.dtype is not None + res = res_tensor.execute().fetch() + expected = _get_func(res_tensor.op._func_name)(data1) + 
np.testing.assert_array_almost_equal(res, expected) + + for func in _sp_bin_ufunc: + res_tensor1 = func(arr1, arr2) + assert res_tensor1.dtype is not None + res_tensor2 = func(arr1, rand) + assert res_tensor2.dtype is not None + res_tensor3 = func(rand, arr1) + assert res_tensor3.dtype is not None + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + + expected1 = _get_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_func(res_tensor1.op._func_name)(rand, data1) + + np.testing.assert_array_almost_equal(res1, expected1) + np.testing.assert_array_almost_equal(res2, expected2) + np.testing.assert_array_almost_equal(res3, expected3) + + +def test_sparse_ufunc_execution(setup): + from .. import add, arccosh, mod, square + + _normal_unary_ufunc = [square] + _normal_bin_ufunc = [add] + _sp_unary_ufunc = [arccosh] + _sp_bin_ufunc = [mod] + + data1 = sps.random(5, 9, density=0.1) + data2 = sps.random(5, 9, density=0.2) + rand = np.random.random() + arr1 = tensor(data1, chunk_size=3) + arr2 = tensor(data2, chunk_size=3) + + for func in _normal_unary_ufunc: + res_tensor = func(arr1) + res = res_tensor.execute().fetch() + expected = _get_sparse_func(res_tensor.op._func_name)(data1) + _nan_equal(toarray(res[0]), expected) + + for func in _normal_bin_ufunc: + res_tensor1 = func(arr1, arr2) + res_tensor2 = func(arr1, rand) + res_tensor3 = func(rand, arr1) + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + + expected1 = _get_sparse_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_sparse_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_sparse_func(res_tensor1.op._func_name)(rand, data1) + + _nan_equal(toarray(res1[0]), expected1) + _nan_equal(toarray(res2[0]), expected2) + _nan_equal(toarray(res3[0]), expected3) + + data1 = np.random.randint(2, 10, size=(10, 10)) + data2 = np.random.randint(2, 10, size=(10, 10)) + rand = np.random.randint(1, 10) + arr1 = tensor(data1, chunk_size=3).tosparse() + arr2 = tensor(data2, chunk_size=3).tosparse() + + for func in _sp_unary_ufunc: + res_tensor = func(arr1) + res = res_tensor.execute().fetch() + expected = _get_sparse_func(res_tensor.op._func_name)(data1) + _nan_equal(toarray(res[0]), expected) + + for func in _sp_bin_ufunc: + res_tensor1 = func(arr1, arr2) + res_tensor2 = func(arr1, rand) + res_tensor3 = func(rand, arr1) + + res1 = res_tensor1.execute().fetch() + res2 = res_tensor2.execute().fetch() + res3 = res_tensor3.execute().fetch() + expected1 = _get_sparse_func(res_tensor1.op._func_name)(data1, data2) + expected2 = _get_sparse_func(res_tensor1.op._func_name)(data1, rand) + expected3 = _get_sparse_func(res_tensor1.op._func_name)(rand, data1) + + _nan_equal(toarray(res1[0]), expected1) + _nan_equal(toarray(res2[0]), expected2) + _nan_equal(toarray(res3[0]), expected3) + + +def test_add_with_out_execution(setup): + data1 = np.random.random((5, 9, 4)) + data2 = np.random.random((9, 4)) + + arr1 = tensor(data1.copy(), chunk_size=3) + arr2 = tensor(data2.copy(), chunk_size=3) + + add(arr1, arr2, out=arr1) + res = arr1.execute().fetch() + np.testing.assert_array_equal(res, data1 + data2) + + arr1 = tensor(data1.copy(), chunk_size=3) + arr2 = tensor(data2.copy(), chunk_size=3) + + arr3 = add(arr1, arr2, out=arr1.astype("i4"), casting="unsafe") + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, (data1 + 
data2).astype("i4")) + + arr1 = tensor(data1.copy(), chunk_size=3) + arr2 = tensor(data2.copy(), chunk_size=3) + + arr3 = truediv(arr1, arr2, out=arr1, where=arr2 > 0.5) + res = arr3.execute().fetch() + np.testing.assert_array_equal( + res, np.true_divide(data1, data2, out=data1.copy(), where=data2 > 0.5) + ) + + arr1 = tensor(data1.copy(), chunk_size=4) + arr2 = tensor(data2.copy(), chunk_size=4) + + arr3 = add(arr1, arr2, where=arr1 > 0.5) + res = arr3.execute().fetch() + expected = np.add(data1, data2, where=data1 > 0.5) + np.testing.assert_array_equal(res[data1 > 0.5], expected[data1 > 0.5]) + + arr1 = tensor(data1.copy(), chunk_size=4) + + arr3 = add(arr1, 1, where=arr1 > 0.5) + res = arr3.execute().fetch() + expected = np.add(data1, 1, where=data1 > 0.5) + np.testing.assert_array_equal(res[data1 > 0.5], expected[data1 > 0.5]) + + arr1 = tensor(data2.copy(), chunk_size=3) + + arr3 = add(arr1[:5, :], 1, out=arr1[-5:, :]) + res = arr3.execute().fetch() + expected = np.add(data2[:5, :], 1) + np.testing.assert_array_equal(res, expected) + + +def test_arctan2_execution(setup): + x = tensor(1) # scalar + y = arctan2(x, x) + + assert y.issparse() is False + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(1, 1)) + + y = arctan2(0, x) + + assert y.issparse() is False + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(0, 1)) + + raw1 = np.array([[0, 1, 2]]) + raw2 = sps.csr_matrix([[0, 1, 0]]) + y = arctan2(raw1, raw2) + + assert y.issparse() is False + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(raw1, raw2.A)) + + y = arctan2(raw2, raw2) + + assert y.issparse() is True + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(raw2.A, raw2.A)) + + y = arctan2(0, raw2) + + assert y.issparse() is True + result = y.execute().fetch() + np.testing.assert_equal(result, np.arctan2(0, raw2.A)) + + +@pytest.mark.ray_dag +def test_frexp_execution(setup): + data1 = np.random.RandomState(0).randint(0, 100, (5, 9, 6)) + + arr1 = tensor(data1.copy(), chunk_size=4) + + o1, o2 = frexp(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.frexp(data1)) + np.testing.assert_array_almost_equal(res, expected) + + arr1 = tensor(data1.copy(), chunk_size=4) + o1 = zeros(data1.shape, chunk_size=4) + o2 = zeros(data1.shape, dtype="i8", chunk_size=4) + frexp(arr1, o1, o2) + res1, res2 = fetch(*execute(o1, o2)) + + res = res1 * 2**res2 + np.testing.assert_array_almost_equal(res, data1, decimal=3) + + data1 = sps.random(5, 9, density=0.1) + + arr1 = tensor(data1.copy(), chunk_size=4) + + o1, o2 = frexp(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.frexp(data1.toarray())) + np.testing.assert_equal(res.toarray(), expected) + + x = np.arange(9) + a = np.zeros(9) + b = np.zeros(9) + mx = arange(9) + ma = zeros(9) + mb = zeros(9) + res = frexp(mx, ma, mb, where=mx > 5).execute() + expected = np.frexp(x, a, b, where=x > 5) + np.testing.assert_equal(res[0], expected[0]) + np.testing.assert_equal(res[1], expected[1]) + + +def test_frexp_order_execution(setup): + data1 = np.random.RandomState(0).random((5, 9)) + t = tensor(data1, chunk_size=3) + + o1, o2 = frexp(t, order="F") + res1, res2 = execute(o1, o2) + expected1, expected2 = np.frexp(data1, order="F") + np.testing.assert_allclose(res1, expected1) + assert res1.flags["F_CONTIGUOUS"] is True + assert res1.flags["C_CONTIGUOUS"] is False + np.testing.assert_allclose(res2, expected2) + assert res2.flags["F_CONTIGUOUS"] is True + assert 
res2.flags["C_CONTIGUOUS"] is False + + +def test_modf_execution(setup): + data1 = np.random.random((5, 9)) + + arr1 = tensor(data1.copy(), chunk_size=3) + + o1, o2 = modf(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf(data1)) + np.testing.assert_array_almost_equal(res, expected) + + o1, o2 = modf([0, 3.5]) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf([0, 3.5])) + np.testing.assert_array_almost_equal(res, expected) + + arr1 = tensor(data1.copy(), chunk_size=3) + o1 = zeros(data1.shape, chunk_size=3) + o2 = zeros(data1.shape, chunk_size=3) + modf(arr1, o1, o2) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf(data1)) + np.testing.assert_array_almost_equal(res, expected) + + data1 = sps.random(5, 9, density=0.1) + + arr1 = tensor(data1.copy(), chunk_size=3) + + o1, o2 = modf(arr1) + o = o1 + o2 + + res = o.execute().fetch() + expected = sum(np.modf(data1.toarray())) + np.testing.assert_equal(res.toarray(), expected) + + +def test_modf_order_execution(setup): + data1 = np.random.random((5, 9)) + t = tensor(data1, chunk_size=3) + + o1, o2 = modf(t, order="F") + res1, res2 = execute(o1, o2) + expected1, expected2 = np.modf(data1, order="F") + np.testing.assert_allclose(res1, expected1) + assert res1.flags["F_CONTIGUOUS"] is True + assert res1.flags["C_CONTIGUOUS"] is False + np.testing.assert_allclose(res2, expected2) + assert res2.flags["F_CONTIGUOUS"] is True + assert res2.flags["C_CONTIGUOUS"] is False + + +def test_clip_execution(setup): + a_data = np.arange(10) + + a = tensor(a_data.copy(), chunk_size=3) + + b = clip(a, 1, 8) + + res = b.execute().fetch() + expected = np.clip(a_data, 1, 8) + np.testing.assert_array_equal(res, expected) + + a = tensor(a_data.copy(), chunk_size=3) + clip(a, 3, 6, out=a) + + res = a.execute().fetch() + expected = np.clip(a_data, 3, 6) + np.testing.assert_array_equal(res, expected) + + a = tensor(a_data.copy(), chunk_size=3) + a_min_data = np.random.randint(1, 10, size=(10,)) + a_max_data = np.random.randint(1, 10, size=(10,)) + a_min = tensor(a_min_data) + a_max = tensor(a_max_data) + clip(a, a_min, a_max, out=a) + + res = a.execute().fetch() + expected = np.clip(a_data, a_min_data, a_max_data) + np.testing.assert_array_equal(res, expected) + + with option_context() as options: + options.chunk_size = 3 + + a = tensor(a_data.copy(), chunk_size=3) + b = clip(a, [3, 4, 1, 1, 1, 4, 4, 4, 4, 4], 8) + + res = b.execute().fetch() + expected = np.clip(a_data, [3, 4, 1, 1, 1, 4, 4, 4, 4, 4], 8) + np.testing.assert_array_equal(res, expected) + + # test sparse clip + a_data = sps.csr_matrix([[0, 2, 8], [0, 0, -1]]) + a = tensor(a_data, chunk_size=3) + b_data = sps.csr_matrix([[0, 3, 0], [1, 0, -2]]) + + c = clip(a, b_data, 4) + + res = c.execute().fetch() + expected = np.clip(a_data.toarray(), b_data.toarray(), 4) + np.testing.assert_array_equal(res, expected) + + +def test_clip_order_execution(setup): + a_data = np.asfortranarray(np.random.rand(4, 8)) + + a = tensor(a_data, chunk_size=3) + + b = clip(a, 0.2, 0.8) + + res = b.execute().fetch() + expected = np.clip(a_data, 0.2, 0.8) + + np.testing.assert_allclose(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +def test_around_execution(setup): + data = np.random.randn(10, 20) + x = tensor(data, chunk_size=3) + + t = x.round(2) + + res = t.execute().fetch() + expected = np.around(data, decimals=2) + + np.testing.assert_allclose(res, expected) + + data = sps.random(10, 20, density=0.2) + x = 
tensor(data, chunk_size=3) + + t = x.round(2) + + res = t.execute().fetch() + expected = np.around(data.toarray(), decimals=2) + + np.testing.assert_allclose(res.toarray(), expected) + + +def test_around_order_execution(setup): + data = np.asfortranarray(np.random.rand(10, 20)) + x = tensor(data, chunk_size=3) + + t = x.round(2) + + res = t.execute().fetch() + expected = np.around(data, decimals=2) + + np.testing.assert_allclose(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +def test_cos_order_execution(setup): + data = np.asfortranarray(np.random.rand(3, 5)) + x = tensor(data, chunk_size=2) + + t = cos(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.cos(data)) + assert res.flags["C_CONTIGUOUS"] is False + assert res.flags["F_CONTIGUOUS"] is True + + t2 = cos(x, order="C") + + res2 = t2.execute().fetch() + np.testing.assert_allclose(res2, np.cos(data, order="C")) + assert res2.flags["C_CONTIGUOUS"] is True + assert res2.flags["F_CONTIGUOUS"] is False + + +def test_is_close_execution(setup): + data = np.array([1.05, 1.0, 1.01, np.nan]) + data2 = np.array([1.04, 1.0, 1.03, np.nan]) + + x = tensor(data, chunk_size=2) + y = tensor(data2, chunk_size=3) + + z = isclose(x, y, atol=0.01) + + res = z.execute().fetch() + expected = np.isclose(data, data2, atol=0.01) + np.testing.assert_equal(res, expected) + + z = isclose(x, y, atol=0.01, equal_nan=True) + + res = z.execute().fetch() + expected = np.isclose(data, data2, atol=0.01, equal_nan=True) + np.testing.assert_equal(res, expected) + + # test tensor with scalar + z = isclose(x, 1.0, atol=0.01) + res = z.execute().fetch() + expected = np.isclose(data, 1.0, atol=0.01) + np.testing.assert_equal(res, expected) + z = isclose(1.0, y, atol=0.01) + res = z.execute().fetch() + expected = np.isclose(1.0, data2, atol=0.01) + np.testing.assert_equal(res, expected) + z = isclose(1.0, 2.0, atol=0.01) + res = z.execute().fetch() + expected = np.isclose(1.0, 2.0, atol=0.01) + np.testing.assert_equal(res, expected) + + # test sparse + data = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan])) + data2 = sps.csr_matrix(np.array([0, 1.0, 1.03, np.nan])) + + x = tensor(data, chunk_size=2) + y = tensor(data2, chunk_size=3) + + z = isclose(x, y, atol=0.01) + + res = z.execute().fetch() + expected = np.isclose(data.toarray(), data2.toarray(), atol=0.01) + np.testing.assert_equal(res, expected) + + z = isclose(x, y, atol=0.01, equal_nan=True) + + res = z.execute().fetch() + expected = np.isclose(data.toarray(), data2.toarray(), atol=0.01, equal_nan=True) + np.testing.assert_equal(res, expected) + + +@ignore_warning +def test_dtype_execution(setup): + a = ones((10, 20), dtype="f4", chunk_size=5) + + c = truediv(a, 2, dtype="f8") + + res = c.execute().fetch() + assert res.dtype == np.float64 + + c = truediv(a, 0, dtype="f8") + res = c.execute().fetch() + assert np.isinf(res[0, 0]) + + with pytest.raises(FloatingPointError): + with np.errstate(divide="raise"): + c = truediv(a, 0, dtype="f8") + _ = c.execute().fetch() # noqa: F841 + + +def test_set_get_real_execution(setup): + a_data = np.array([1 + 2j, 3 + 4j, 5 + 6j]) + a = tensor(a_data, chunk_size=2) + + res = a.real.execute().fetch() + expected = a_data.real + + np.testing.assert_equal(res, expected) + + a.real = 9 + + res = a.execute().fetch() + expected = a_data.copy() + expected.real = 9 + + np.testing.assert_equal(res, expected) + + a.real = np.array([9, 8, 7]) + + res = a.execute().fetch() + expected = a_data.copy() + expected.real = 
np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + # test sparse + a_data = np.array([[1 + 2j, 3 + 4j, 0], [0, 0, 0]]) + a = tensor(sps.csr_matrix(a_data)) + + res = a.real.execute().fetch().toarray() + expected = a_data.real + + np.testing.assert_equal(res, expected) + + a.real = 9 + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.real = 9 + + np.testing.assert_equal(res, expected) + + a.real = np.array([9, 8, 7]) + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.real = np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + +def test_set_get_imag_execution(setup): + a_data = np.array([1 + 2j, 3 + 4j, 5 + 6j]) + a = tensor(a_data, chunk_size=2) + + res = a.imag.execute().fetch() + expected = a_data.imag + + np.testing.assert_equal(res, expected) + + a.imag = 9 + + res = a.execute().fetch() + expected = a_data.copy() + expected.imag = 9 + + np.testing.assert_equal(res, expected) + + a.imag = np.array([9, 8, 7]) + + res = a.execute().fetch() + expected = a_data.copy() + expected.imag = np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + # test sparse + a_data = np.array([[1 + 2j, 3 + 4j, 0], [0, 0, 0]]) + a = tensor(sps.csr_matrix(a_data)) + + res = a.imag.execute().fetch().toarray() + expected = a_data.imag + + np.testing.assert_equal(res, expected) + + a.imag = 9 + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.imag = 9 + + np.testing.assert_equal(res, expected) + + a.imag = np.array([9, 8, 7]) + + res = a.execute().fetch().toarray() + expected = a_data.copy() + expected.imag = np.array([9, 8, 7]) + + np.testing.assert_equal(res, expected) + + +def test_tree_arithmetic_execution(setup): + raws = [np.random.rand(10, 10) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + res = tree_add(*tensors, 1.0).execute().fetch() + np.testing.assert_array_almost_equal( + res, 1.0 + functools.reduce(operator.add, raws) + ) + + res = tree_multiply(*tensors, 2.0).execute().fetch() + np.testing.assert_array_almost_equal( + res, 2.0 * functools.reduce(operator.mul, raws) + ) + + raws = [sps.random(5, 9, density=0.1) for _ in range(10)] + tensors = [tensor(a, chunk_size=3) for a in raws] + + res = tree_add(*tensors).execute().fetch() + np.testing.assert_array_almost_equal( + res.toarray(), functools.reduce(operator.add, raws).toarray() + ) + + +@require_cupy +def test_cupy_execution(setup_gpu): + a_data = np.random.rand(10, 10) + b_data = np.random.rand(10, 10) + + a = tensor(a_data, gpu=True, chunk_size=3) + b = tensor(b_data, gpu=True, chunk_size=3) + res_binary = (a + b).execute().fetch() + np.testing.assert_array_equal(res_binary.get(), (a_data + b_data)) + + res_unary = cos(a).execute().fetch() + np.testing.assert_array_almost_equal(res_unary.get(), np.cos(a_data)) diff --git a/python/xorbits/_mars/tensor/arithmetic/truediv.py b/python/xorbits/_mars/tensor/arithmetic/truediv.py new file mode 100644 index 000000000..ff7358a73 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/truediv.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorBinOp +from .utils import arithmetic_operand + + +@arithmetic_operand +class TensorTrueDiv(TensorBinOp): + _op_type_ = OperandDef.TRUEDIV + _func_name = "true_divide" + + @classmethod + def _is_sparse(cls, x1, x2): + if not np.isscalar(x1) and not np.isscalar(x2): + return False + if hasattr(x1, "issparse") and x1.issparse(): + if x2 != 0: + return True + else: + raise ZeroDivisionError("float division by zero") + return False + + +@infer_dtype(np.true_divide) +def truediv(x1, x2, out=None, where=None, **kwargs): + """ + Returns a true division of the inputs, element-wise. + + Instead of the Python traditional 'floor division', this returns a true + division. True division adjusts the output type to present the best + answer, regardless of input types. + + Parameters + ---------- + x1 : array_like + Dividend tensor. + x2 : array_like + Divisor tensor. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + out : Tensor + Result is scalar if both inputs are scalar, tensor otherwise. + + Notes + ----- + The floor division operator ``//`` was added in Python 2.2 making + ``//`` and ``/`` equivalent operators. The default floor division + operation of ``/`` can be replaced by true division with ``from + __future__ import division``. + + In Python 3.0, ``//`` is the floor division operator and ``/`` the + true division operator. The ``true_divide(x1, x2)`` function is + equivalent to true division in Python. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(5) + >>> mt.true_divide(x, 4).execute() + array([ 0. , 0.25, 0.5 , 0.75, 1. ]) + + # for python 2 + >>> (x/4).execute() + array([0, 0, 0, 0, 1]) + >>> (x//4).execute() + array([0, 0, 0, 0, 1]) + """ + op = TensorTrueDiv(**kwargs) + return op(x1, x2, out=out, where=where) + + +@infer_dtype(np.true_divide, reverse=True) +def rtruediv(x1, x2, **kwargs): + op = TensorTrueDiv(**kwargs) + return op.rcall(x1, x2) diff --git a/python/xorbits/_mars/tensor/arithmetic/trunc.py b/python/xorbits/_mars/tensor/arithmetic/trunc.py new file mode 100644 index 000000000..bf2aae77d --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/trunc.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
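[Editor's aside on the truediv.py hunk above, not part of the patch.] The sparsity rule in `TensorTrueDiv._is_sparse` (and the expectations in `test_divide` earlier in the test file) follow from plain scalar arithmetic: dividing a sparse tensor by a non-zero scalar keeps implicit zeros at zero, while dividing by another tensor would turn them into `0/x` or `x/0` entries, so the result densifies. A minimal NumPy/SciPy sketch of the same behaviour, for illustration only:

import numpy as np
import scipy.sparse as sps

s = sps.csr_matrix([[0, 1, 0], [1, 0, 0]])

# dividing by a non-zero scalar preserves sparsity: 0 / 10 == 0
assert sps.issparse(s / 10)

# dividing by a dense operand densifies the result (implicit zeros become 0/x)
dense = s / np.array([1.0, 1.0, 1.0])
assert not sps.issparse(dense)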
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import infer_dtype +from .core import TensorUnaryOp +from .utils import arithmetic_operand + + +@arithmetic_operand(sparse_mode="unary") +class TensorTrunc(TensorUnaryOp): + _op_type_ = OperandDef.TRUNC + _func_name = "trunc" + + +@infer_dtype(np.trunc) +def trunc(x, out=None, where=None, **kwargs): + """ + Return the truncated value of the input, element-wise. + + The truncated value of the scalar `x` is the nearest integer `i` which + is closer to zero than `x` is. In short, the fractional part of the + signed number `x` is discarded. + + Parameters + ---------- + x : array_like + Input data. + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + y : Tensor or scalar + The truncated value of each element in `x`. + + See Also + -------- + ceil, floor, rint + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> mt.trunc(a).execute() + array([-1., -1., -0., 0., 1., 1., 2.]) + """ + op = TensorTrunc(**kwargs) + return op(x, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/arithmetic/utils.py b/python/xorbits/_mars/tensor/arithmetic/utils.py new file mode 100644 index 000000000..0c53e5f52 --- /dev/null +++ b/python/xorbits/_mars/tensor/arithmetic/utils.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
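[Editor's aside, not part of the patch.] The `out`/`where` semantics described in the trunc and truediv docstrings above (and exercised by the `where=` tests earlier) mirror NumPy ufunc behaviour: positions where the mask is True are computed, positions where it is False keep whatever value `out` already held. A plain-NumPy illustration:

import numpy as np

x = np.array([-1.7, -0.2, 1.5, 2.9])
out = np.full_like(x, 99.0)

# masked-out positions keep out's prior value (99.0)
np.trunc(x, out=out, where=x > 0)
# out is now [99., 99., 1., 2.]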
+ + +import numpy as np + +from ...utils import TreeReductionBuilder + + +def arithmetic_operand(cls=None, init=True, sparse_mode=None): + def _decorator(cls): + def __init__(self, casting="same_kind", err=None, **kw): + err = err if err is not None else np.geterr() + super(cls, self).__init__(_casting=casting, _err=err, **kw) + + def _is_sparse_binary_and_const(x1, x2): + if all(np.isscalar(x) for x in [x1, x2]): + return False + if all( + np.isscalar(x) or (hasattr(x, "issparse") and x.issparse()) + for x in [x1, x2] + ): + return True + return False + + def _is_sparse_binary_or_const(x1, x2): + if (hasattr(x1, "issparse") and x1.issparse()) or ( + hasattr(x2, "issparse") and x2.issparse() + ): + return True + return False + + _is_sparse_dict = dict( + always_false=lambda *_: False, + unary=lambda x: x.issparse(), + binary_and=_is_sparse_binary_and_const, + binary_or=_is_sparse_binary_or_const, + ) + for v in _is_sparse_dict.values(): + v.__name__ = "_is_sparse" + + if init: + cls.__init__ = __init__ + + if sparse_mode in _is_sparse_dict: + cls._is_sparse = staticmethod(_is_sparse_dict[sparse_mode]) + elif sparse_mode is not None: # pragma: no cover + raise ValueError(f"Unsupported sparse mode: {sparse_mode}") + + return cls + + if cls is not None: + return _decorator(cls) + else: + return _decorator + + +def chunk_tree_add(dtype, chunks, idx, shape, sparse=False, combine_size=None): + """ + Generate tree add plan. + + Assume combine size as 4, given a input chunks with size 8, + we will generate tree add plan like: + + op op op op op op op op + | | | | + -------- -------- + tree_add tree_add + | | + ------------- + tree_add + + :param dtype: data type for tree added chunk + :param chunks: input chunks + :param idx: index of result chunk + :param shape: shape of result chunk + :param sparse: return value is sparse or dense + :param combine_size: combine size + :return: result chunk + """ + + class ChunkAddBuilder(TreeReductionBuilder): + def _build_reduction(self, inputs, final=False): + from .add import TensorTreeAdd + + op = TensorTreeAdd(args=inputs, dtype=dtype, sparse=sparse) + if not final: + return op.new_chunk(inputs, shape=shape) + else: + return op.new_chunk( + inputs, shape=shape, index=idx, order=chunks[0].order + ) + + return ChunkAddBuilder(combine_size).build(chunks) + + +def tree_op_estimate_size(ctx, op): + chunk = op.outputs[0] + if not chunk.is_sparse(): + max_inputs = max(ctx[inp.key][0] for inp in op.inputs) + calc_size = chunk_size = chunk.nbytes + if np.isnan(calc_size): + chunk_size = calc_size = max_inputs + else: + sum_inputs = sum(ctx[inp.key][0] for inp in op.inputs) + calc_size = sum_inputs + chunk_size = min( + sum_inputs, + chunk.nbytes + + np.dtype(np.int64).itemsize * np.prod(chunk.shape) * chunk.ndim, + ) + if np.isnan(chunk_size): + chunk_size = sum_inputs + ctx[chunk.key] = (chunk_size, calc_size) diff --git a/python/xorbits/_mars/tensor/array_utils.py b/python/xorbits/_mars/tensor/array_utils.py new file mode 100644 index 000000000..2b08368b8 --- /dev/null +++ b/python/xorbits/_mars/tensor/array_utils.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
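[Editor's aside on the utils.py hunk above, not part of the patch.] The reduction plan that `chunk_tree_add` describes (group at most `combine_size` chunks per `TensorTreeAdd`, then combine the partial results level by level until one chunk remains) can be mimicked in a few lines of plain Python. This is an illustrative sketch only; `tree_reduce` is a hypothetical helper, not part of Mars or of this patch:

import functools
import operator

def tree_reduce(items, combine_size, combine=operator.add):
    """Reduce `items` level by level, combining at most `combine_size` per step."""
    while len(items) > 1:
        items = [
            functools.reduce(combine, items[i : i + combine_size])
            for i in range(0, len(items), combine_size)
        ]
    return items[0]

# 8 inputs with combine_size=4 -> two partial sums, then one final sum,
# mirroring the plan drawn in the chunk_tree_add docstring above.
assert tree_reduce(list(range(8)), combine_size=4) == sum(range(8))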
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from contextlib import contextmanager + +import numpy as np + +from ..lib import sparse +from ..lib.sparse.core import get_dense_module, issparse +from ..utils import lazy_import + +cp = lazy_import("cupy", rename="cp") + + +def is_array(x): + if isinstance(x, np.ndarray): + return True + elif isinstance(x, (sparse.SparseMatrix, sparse.SparseVector)): + return True + elif cp: # pragma: no cover + return isinstance(x, cp.ndarray) + else: + return False + + +def is_cupy(x): + if cp and isinstance(x, cp.ndarray): # pragma: no cover + return True + else: + return False + + +def get_array_module(x, nosparse=False): + if issparse(x): + if nosparse: + return get_dense_module(x) + return sparse + if cp: + return cp.get_array_module(x) + return np + + +def array_module(gpu): + if gpu: + if cp is None: + raise ImportError("Execute on GPU requires for `cupy` library") + return cp + + return np + + +def _get(x): + m = get_array_module(x) + + if m is np: + return x + if m is sparse: + return x if not hasattr(x, "get") else x.get() + return x.get() + + +def move_to_device(x, device_id): + if hasattr(x, "device") and x.device.id == device_id: + return x + + assert device_id >= 0 + + if issparse(x) and device_id > 0: + raise NotImplementedError + + # for dense array, we currently copy from gpu to memory and then copy back to destination device + # to avoid kernel panic + with cp.cuda.Device(device_id): + return cp.asarray(cp.asnumpy(x)) # remove `cp.asnumpy` call to do directly copy + + +def convert_order(x, order): + xp = get_array_module(x) + if xp.isfortran(x) != (order == "F"): + x = xp.array(x, order=order) + return x + + +def _most_nbytes_device(device_nbytes): + device_to_nbytes = defaultdict(lambda: 0) + for device, nbytes in device_nbytes: + device_to_nbytes[device] += nbytes + return max(device_to_nbytes, key=lambda i: device_to_nbytes[i]) + + +def _is_array_writeable(a): + if hasattr(a, "flags") and hasattr(a.flags, "writeable"): + return a.flags.writeable + # writeable as default + return True + + +def as_same_device(inputs, device=None, ret_extra=False, copy_if_not_writeable=False): + input_tensors = [ + i for i in inputs if hasattr(i, "ndim") and i.ndim > 0 + ] # filter scalar + has_sparse = any(issparse(i) for i in inputs) + + if device is None: + try: + device = _most_nbytes_device( + (i.device.id if hasattr(i, "device") else -1, i.nbytes) + for i in input_tensors + ) + except ValueError: + device = -1 + + if device == -1: + outputs = [_get(i) for i in inputs] + else: + outputs = [move_to_device(i, device) for i in inputs] + + if copy_if_not_writeable: + new_outputs = [] + for out in outputs: + if not _is_array_writeable(out): + new_outputs.append(out.copy()) + elif isinstance(out, (sparse.SparseMatrix, sparse.SparseVector)): + if ( + not _is_array_writeable(out.data) + or not _is_array_writeable(out.indices) + or not _is_array_writeable(out.indptr) + ): + new_outputs.append(type(out)(out.spmatrix.copy(), shape=out.shape)) + else: + new_outputs.append(out) + else: + new_outputs.append(out) + outputs = new_outputs + + if not ret_extra: + 
return outputs + + if has_sparse: + m = sparse + else: + if len(input_tensors) > 0: + m = get_array_module(input_tensors[0]) + else: + m = np + return outputs, device, m + + +def as_np_array(x): + xp = get_array_module(x) + return x if xp == np else x.get() + + +def is_sparse_module(xp): + return xp is sparse + + +@contextmanager +def device(device_id): + if device_id is None or device_id < 0: + yield + else: # pragma: no cover + with cp.cuda.Device(device_id) as dev: + yield dev + + +def create_array(op): + xp = array_module(op.gpu) + + def inner(func, *args, **kwargs): + with device(op.device): + return getattr(xp, func)(*args, **kwargs) + + return inner diff --git a/python/xorbits/_mars/tensor/base/__init__.py b/python/xorbits/_mars/tensor/base/__init__.py new file mode 100644 index 000000000..9358a7c67 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/__init__.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .argpartition import argpartition +from .argsort import argsort +from .argtopk import argtopk +from .argwhere import TensorArgwhere, argwhere +from .array_split import array_split +from .astype import TensorAstype +from .atleast_1d import atleast_1d +from .atleast_2d import atleast_2d +from .atleast_3d import atleast_3d +from .broadcast_arrays import broadcast_arrays +from .broadcast_to import TensorBroadcastTo, broadcast_to +from .copy import copy +from .copyto import TensorCopyTo, copyto +from .delete import delete +from .diff import diff +from .dsplit import dsplit +from .ediff1d import ediff1d +from .expand_dims import expand_dims +from .flatten import flatten +from .flip import flip +from .fliplr import fliplr +from .flipud import flipud +from .hsplit import hsplit +from .in1d import in1d +from .insert import insert +from .isin import TensorIsIn, isin +from .map_chunk import TensorMapChunk, map_chunk +from .moveaxis import moveaxis +from .ndim import ndim +from .partition import partition +from .ravel import ravel +from .rebalance import rebalance +from .repeat import TensorRepeat, repeat +from .result_type import result_type +from .roll import roll +from .rollaxis import rollaxis +from .searchsorted import TensorSearchsorted, searchsorted +from .setdiff1d import setdiff1d +from .shape import shape +from .sort import sort +from .split import TensorSplit, split +from .squeeze import TensorSqueeze, squeeze +from .swapaxes import TensorSwapAxes, swapaxes +from .tile import tile +from .to_cpu import to_cpu +from .to_gpu import to_gpu +from .topk import topk +from .transpose import TensorTranspose, transpose +from .trapz import trapz +from .unique import unique +from .vsplit import vsplit +from .where import TensorWhere, where + + +def _install(): + from ..core import Tensor, TensorData + from .astype import _astype + + for cls in (Tensor, TensorData): + setattr(cls, "astype", _astype) + setattr(cls, "swapaxes", swapaxes) + setattr(cls, "squeeze", squeeze) + setattr(cls, "repeat", repeat) + setattr(cls, "ravel", ravel) + 
setattr(cls, "flatten", flatten) + setattr(cls, "to_gpu", to_gpu) + setattr(cls, "to_cpu", to_cpu) + setattr(cls, "rebalance", rebalance) + setattr(cls, "map_chunk", map_chunk) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/base/argpartition.py b/python/xorbits/_mars/tensor/base/argpartition.py new file mode 100644 index 000000000..fe7424a05 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/argpartition.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .partition import TensorPartition, _validate_partition_arguments + + +def argpartition(a, kth, axis=-1, kind="introselect", order=None, **kw): + """ + Perform an indirect partition along the given axis using the + algorithm specified by the `kind` keyword. It returns an array of + indices of the same shape as `a` that index data along the given + axis in partitioned order. + + .. versionadded:: 1.8.0 + + Parameters + ---------- + a : array_like + Tensor to sort. + kth : int or sequence of ints + Element index to partition by. The k-th element will be in its + final sorted position and all smaller elements will be moved + before it and all larger elements behind it. The order all + elements in the partitions is undefined. If provided with a + sequence of k-th it will partition all of them into their sorted + position at once. + axis : int or None, optional + Axis along which to sort. The default is -1 (the last axis). If + None, the flattened tensor is used. + kind : {'introselect'}, optional + Selection algorithm. Default is 'introselect' + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument + specifies which fields to compare first, second, etc. A single + field can be specified as a string, and not all fields need be + specified, but unspecified fields will still be used, in the + order in which they come up in the dtype, to break ties. + + Returns + ------- + index_tensor : Tensor, int + Tensor of indices that partition `a` along the specified axis. + If `a` is one-dimensional, ``a[index_tensor]`` yields a partitioned `a`. + More generally, ``np.take_along_axis(a, index_tensor, axis=a)`` always + yields the partitioned `a`, irrespective of dimensionality. + + See Also + -------- + partition : Describes partition algorithms used. + Tensor.partition : Inplace partition. + argsort : Full indirect sort + + Notes + ----- + See `partition` for notes on the different selection algorithms. 
+ + Examples + -------- + One dimensional tensor: + + >>> import mars.tensor as mt + >>> x = mt.array([3, 4, 2, 1]) + >>> x[mt.argpartition(x, 3)].execute() + array([2, 1, 3, 4]) + >>> x[mt.argpartition(x, (1, 3))].execute() + array([1, 2, 3, 4]) + + >>> x = [3, 4, 2, 1] + >>> mt.array(x)[mt.argpartition(x, 3)].execute() + array([2, 1, 3, 4]) + + """ + a, kth, axis, kind, order, need_align = _validate_partition_arguments( + a, kth, axis, kind, order, kw + ) + op = TensorPartition( + kth=kth, + axis=axis, + kind=kind, + order=order, + need_align=need_align, + return_value=False, + return_indices=True, + dtype=a.dtype, + gpu=a.op.gpu, + ) + return op(a, kth) diff --git a/python/xorbits/_mars/tensor/base/argsort.py b/python/xorbits/_mars/tensor/base/argsort.py new file mode 100644 index 000000000..e83850b42 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/argsort.py @@ -0,0 +1,136 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .sort import TensorSort, _validate_sort_arguments + + +def argsort(a, axis=-1, kind=None, parallel_kind=None, psrs_kinds=None, order=None): + """ + Returns the indices that would sort a tensor. + + Perform an indirect sort along the given axis using the algorithm specified + by the `kind` keyword. It returns a tensor of indices of the same shape as + `a` that index data along the given axis in sorted order. + + Parameters + ---------- + a : array_like + Tensor to sort. + axis : int or None, optional + Axis along which to sort. The default is -1 (the last axis). If None, + the flattened tensor is used. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. The default is 'quicksort'. Note that both 'stable' + and 'mergesort' use timsort under the covers and, in general, the + actual implementation will vary with data type. The 'mergesort' option + is retained for backwards compatibility. + + .. versionchanged:: 1.15.0. + The 'stable' option was added. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument specifies + which fields to compare first, second, etc. A single field can + be specified as a string, and not all fields need be specified, + but unspecified fields will still be used, in the order in which + they come up in the dtype, to break ties. + + Returns + ------- + index_tensor : Tensor, int + Tensor of indices that sort `a` along the specified `axis`. + If `a` is one-dimensional, ``a[index_tensor]`` yields a sorted `a`. + More generally, ``np.take_along_axis(a, index_tensor, axis=axis)`` + always yields the sorted `a`, irrespective of dimensionality. + + See Also + -------- + sort : Describes sorting algorithms used. + lexsort : Indirect stable sort with multiple keys. + Tensor.sort : Inplace sort. + argpartition : Indirect partial sort. + + Notes + ----- + See `sort` for notes on the different sorting algorithms. 
+
+    Examples
+    --------
+    One dimensional tensor:
+
+    >>> import mars.tensor as mt
+    >>> x = mt.array([3, 1, 2])
+    >>> mt.argsort(x).execute()
+    array([1, 2, 0])
+
+    Two-dimensional tensor:
+
+    >>> x = mt.array([[0, 3], [2, 2]])
+    >>> x.execute()
+    array([[0, 3],
+           [2, 2]])
+
+    >>> ind = mt.argsort(x, axis=0)  # sorts along first axis (down)
+    >>> ind.execute()
+    array([[0, 1],
+           [1, 0]])
+    #>>> mt.take_along_axis(x, ind, axis=0).execute()  # same as np.sort(x, axis=0)
+    #array([[0, 2],
+    #       [2, 3]])
+
+    >>> ind = mt.argsort(x, axis=1)  # sorts along last axis (across)
+    >>> ind.execute()
+    array([[0, 1],
+           [0, 1]])
+    #>>> mt.take_along_axis(x, ind, axis=1).execute()  # same as np.sort(x, axis=1)
+    #array([[0, 3],
+    #       [2, 2]])
+
+    Indices of the sorted elements of a N-dimensional array:
+
+    >>> ind = mt.unravel_index(mt.argsort(x, axis=None), x.shape)
+    >>> ind.execute()
+    (array([0, 1, 1, 0]), array([0, 0, 1, 1]))
+    >>> x[ind].execute()  # same as np.sort(x, axis=None)
+    array([0, 2, 2, 3])
+
+    Sorting with keys:
+
+    >>> x = mt.array([(1, 0), (0, 1)], dtype=[('x', '<i4'), ('y', '<i4')])
+    >>> x.execute()
+    array([(1, 0), (0, 1)],
+          dtype=[('x', '<i4'), ('y', '<i4')])
+
+    >>> mt.argsort(x, order=('x','y')).execute()
+    array([1, 0])
+
+    >>> mt.argsort(x, order=('y','x')).execute()
+    array([0, 1])
+
+    """
+    a, axis, kind, parallel_kind, psrs_kinds, order = _validate_sort_arguments(
+        a, axis, kind, parallel_kind, psrs_kinds, order
+    )
+
+    op = TensorSort(
+        axis=axis,
+        kind=kind,
+        parallel_kind=parallel_kind,
+        order=order,
+        psrs_kinds=psrs_kinds,
+        return_value=False,
+        return_indices=True,
+        dtype=a.dtype,
+        gpu=a.op.gpu,
+    )
+    return op(a)
diff --git a/python/xorbits/_mars/tensor/base/argtopk.py b/python/xorbits/_mars/tensor/base/argtopk.py
new file mode 100644
index 000000000..1ef0af2ac
--- /dev/null
+++ b/python/xorbits/_mars/tensor/base/argtopk.py
@@ -0,0 +1,53 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...core.operand import OperandStage
+from .topk import TensorTopk, _validate_topk_arguments
+
+
+def argtopk(
+    a,
+    k,
+    axis=-1,
+    largest=True,
+    sorted=True,
+    order=None,
+    parallel_kind="auto",
+    psrs_kinds=None,
+):
+    (
+        a,
+        k,
+        axis,
+        largest,
+        sorted,
+        order,
+        parallel_kind,
+        psrs_kinds,
+    ) = _validate_topk_arguments(
+        a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds
+    )
+    op = TensorTopk(
+        k=k,
+        axis=axis,
+        largest=largest,
+        sorted=sorted,
+        parallel_kind=parallel_kind,
+        psrs_kinds=psrs_kinds,
+        dtype=a.dtype,
+        return_value=False,
+        return_indices=True,
+        stage=OperandStage.agg,
+    )
+    return op(a)
diff --git a/python/xorbits/_mars/tensor/base/argwhere.py b/python/xorbits/_mars/tensor/base/argwhere.py
new file mode 100644
index 000000000..ce9b03474
--- /dev/null
+++ b/python/xorbits/_mars/tensor/base/argwhere.py
@@ -0,0 +1,126 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import KeyField +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .ravel import ravel + + +class TensorArgwhere(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.ARGWHERE + + _input = KeyField("input") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + shape = (np.nan, a.ndim) + return self.new_tensor([a], shape) + + @classmethod + def tile(cls, op): + from ..datasource import arange + from ..indexing import unravel_index + from ..reshape.reshape import TensorReshape + + in_tensor = op.input + out_tensor = op.outputs[0] + + if has_unknown_shape(in_tensor): + yield + + flattened = yield from recursive_tile(ravel(in_tensor)) + indices = arange(flattened.size, dtype=np.intp, chunks=flattened.nsplits) + indices = indices[flattened] + dim_indices = unravel_index(indices, in_tensor.shape) + dim_indices = yield from recursive_tile(*dim_indices) + + out_chunk_shape = dim_indices[0].chunk_shape + (in_tensor.ndim,) + nsplits = dim_indices[0].nsplits + ((1,) * in_tensor.ndim,) + out_chunks = [] + for out_index in itertools.product(*(map(range, out_chunk_shape))): + dim_ind_chunk = dim_indices[out_index[1]].chunks[out_index[0]] + chunk_shape = dim_ind_chunk.shape + (1,) + chunk_op = TensorReshape(newshape=(-1, 1), dtype=dim_ind_chunk.dtype) + out_chunk = chunk_op.new_chunk( + [dim_ind_chunk], + shape=chunk_shape, + index=out_index, + order=out_tensor.order, + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + +def argwhere(a): + """ + Find the indices of tensor elements that are non-zero, grouped by element. + + Parameters + ---------- + a : array_like + Input data. + + Returns + ------- + index_tensor : Tensor + Indices of elements that are non-zero. Indices are grouped by element. + + See Also + -------- + where, nonzero + + Notes + ----- + ``mt.argwhere(a)`` is the same as ``mt.transpose(mt.nonzero(a))``. + + The output of ``argwhere`` is not suitable for indexing tensors. + For this purpose use ``nonzero(a)`` instead. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(6).reshape(2,3) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.argwhere(x>1).execute() + array([[0, 2], + [1, 0], + [1, 1], + [1, 2]]) + + """ + a = astensor(a).astype(bool, order="A") + op = TensorArgwhere(dtype=np.dtype(np.intp)) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/array_split.py b/python/xorbits/_mars/tensor/base/array_split.py new file mode 100644 index 000000000..2f1095eb4 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/array_split.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .split import _split + + +def array_split(a, indices_or_sections, axis=0): + """ + Split a tensor into multiple sub-tensors. + + Please refer to the ``split`` documentation. The only difference + between these functions is that ``array_split`` allows + `indices_or_sections` to be an integer that does *not* equally + divide the axis. For a tensor of length l that should be split + into n sections, it returns l % n sub-arrays of size l//n + 1 + and the rest of size l//n. + + See Also + -------- + split : Split tensor into multiple sub-tensors of equal size. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(8.0) + >>> mt.array_split(x, 3).execute() + [array([ 0., 1., 2.]), array([ 3., 4., 5.]), array([ 6., 7.])] + + >>> x = mt.arange(7.0) + >>> mt.array_split(x, 3).execute() + [array([ 0., 1., 2.]), array([ 3., 4.]), array([ 5., 6.])] + + """ + return _split(a, indices_or_sections, axis=axis) diff --git a/python/xorbits/_mars/tensor/base/astype.py b/python/xorbits/_mars/tensor/base/astype.py new file mode 100644 index 000000000..0f1b32faf --- /dev/null +++ b/python/xorbits/_mars/tensor/base/astype.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField, StringField +from ...utils import get_dtype +from ..array_utils import as_same_device, device +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import get_order + + +class TensorAstype(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.ASTYPE + + _input = KeyField("input") + _order = StringField("order") + _casting = StringField("casting") + + def __init__(self, dtype=None, order=None, casting=None, sparse=False, **kw): + super().__init__( + dtype=dtype, _order=order, _casting=casting, sparse=sparse, **kw + ) + + @property + def order(self): + return self._order + + @property + def casting(self): + return self._casting + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, tensor, order=None): + return self.new_tensor([tensor], tensor.shape, order=order) + + @classmethod + def tile(cls, op): + in_tensor = op.input + out_tensor = op.outputs[0] + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=out_tensor.order + ) + out_chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + nsplits=in_tensor.nsplits, + chunks=out_chunks, + kws=[out_tensor.params], + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if op.sparse: + ctx[chunk.key] = x.astype(op.dtype) + else: + if xp is np: + ctx[chunk.key] = x.astype( + op.dtype, order=op.order, casting=op.casting + ) + else: # pragma: no cover + # cupy does not support casting + ctx[chunk.key] = x.astype(op.dtype, order=op.order) + + +def _astype(tensor, dtype, order="K", casting="unsafe", copy=True): + """ + Copy of the tensor, cast to a specified type. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Defaults to 'unsafe' + for backwards compatibility. + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. + order : {'C', 'F', 'A', 'K'}, optional + Controls the memory layout order of the result. + 'C' means C order, 'F' means Fortran order, 'A' + means 'F' order if all the arrays are Fortran contiguous, + 'C' order otherwise, and 'K' means as close to the + order the array elements appear in memory as possible. + Default is 'K'. + copy : bool, optional + By default, astype always returns a newly allocated array. If this + is set to false, and the `dtype`, `order`, and `subok` + requirements are satisfied, the input array is returned instead + of a copy. + + Returns + ------- + arr_t : Tensor + Unless `copy` is False and the other conditions for returning the input + array are satisfied (see description for `copy` input parameter), `arr_t` + is a new tensor of the same shape as the input array, with dtype, order + given by `dtype`, `order`. 
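The `casting` modes described above follow NumPy's rules; a small NumPy-only sketch of what 'safe' versus 'unsafe' permits (the dtypes are chosen only for illustration, nothing here is specific to this operand):

import numpy as np

assert np.can_cast(np.int32, np.float64, casting="safe")      # value-preserving widening is allowed
assert not np.can_cast(np.float64, np.int32, casting="safe")  # would lose information, so rejected
assert np.can_cast(np.float64, np.int32, casting="unsafe")    # 'unsafe' permits any conversion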
+ + Notes + ----- + astype method returns an error if the string + dtype to cast to is not long enough in 'safe' casting mode to hold the max + value of integer/float array that is being casted. Previously the casting + was allowed even if the result was truncated. + + Raises + ------ + ComplexWarning + When casting from complex to float or int. To avoid this, + one should use ``a.real.astype(t)``. + + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.array([1, 2, 2.5]) + >>> x.execute() + array([ 1. , 2. , 2.5]) + + >>> x.astype(int).execute() + array([1, 2, 2]) + """ + dtype = get_dtype(dtype) + tensor_order = get_order(order, tensor.order) + + if tensor.dtype == dtype and tensor.order == tensor_order: + return tensor if not copy else tensor.copy(order=order) + elif not np.can_cast(tensor.dtype, dtype, casting=casting): + raise TypeError( + f"Cannot cast array from {tensor.dtype!r} to {dtype!r} " + f"according to the rule {casting}" + ) + + op = TensorAstype( + dtype=dtype, order=order, casting=casting, sparse=tensor.issparse() + ) + return op(tensor, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/base/atleast_1d.py b/python/xorbits/_mars/tensor/base/atleast_1d.py new file mode 100644 index 000000000..8e26feace --- /dev/null +++ b/python/xorbits/_mars/tensor/base/atleast_1d.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor + + +def atleast_1d(*tensors): + """ + Convert inputs to tensors with at least one dimension. + + Scalar inputs are converted to 1-dimensional tensors, whilst + higher-dimensional inputs are preserved. + + Parameters + ---------- + tensors1, tensors2, ... : array_like + One or more input tensors. + + Returns + ------- + ret : Tensor + An tensor, or list of tensors, each with ``a.ndim >= 1``. + Copies are made only if necessary. + + See Also + -------- + atleast_2d, atleast_3d + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.atleast_1d(1.0).execute() + array([ 1.]) + + >>> x = mt.arange(9.0).reshape(3,3) + >>> mt.atleast_1d(x).execute() + array([[ 0., 1., 2.], + [ 3., 4., 5.], + [ 6., 7., 8.]]) + >>> mt.atleast_1d(x) is x + True + + >>> mt.atleast_1d(1, [3, 4]).execute() + [array([1]), array([3, 4])] + + """ + new_tensors = [] + for x in tensors: + x = astensor(x) + if x.ndim == 0: + x = x[np.newaxis] + + new_tensors.append(x) + + if len(new_tensors) == 1: + return new_tensors[0] + return ExecutableTuple(new_tensors) diff --git a/python/xorbits/_mars/tensor/base/atleast_2d.py b/python/xorbits/_mars/tensor/base/atleast_2d.py new file mode 100644 index 000000000..f9eac5622 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/atleast_2d.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor + + +def atleast_2d(*tensors): + """ + View inputs as tensors with at least two dimensions. + + Parameters + ---------- + tensors1, tensors2, ... : array_like + One or more array-like sequences. Non-tensor inputs are converted + to tensors. Tensors that already have two or more dimensions are + preserved. + + Returns + ------- + res, res2, ... : Tensor + A tensor, or list of tensors, each with ``a.ndim >= 2``. + Copies are avoided where possible, and views with two or more + dimensions are returned. + + See Also + -------- + atleast_1d, atleast_3d + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.atleast_2d(3.0).execute() + array([[ 3.]]) + + >>> x = mt.arange(3.0) + >>> mt.atleast_2d(x).execute() + array([[ 0., 1., 2.]]) + + >>> mt.atleast_2d(1, [1, 2], [[1, 2]]).execute() + [array([[1]]), array([[1, 2]]), array([[1, 2]])] + + """ + new_tensors = [] + for x in tensors: + x = astensor(x) + if x.ndim == 0: + x = x[np.newaxis, np.newaxis] + elif x.ndim == 1: + x = x[np.newaxis, :] + + new_tensors.append(x) + + if len(new_tensors) == 1: + return new_tensors[0] + return ExecutableTuple(new_tensors) diff --git a/python/xorbits/_mars/tensor/base/atleast_3d.py b/python/xorbits/_mars/tensor/base/atleast_3d.py new file mode 100644 index 000000000..540236a38 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/atleast_3d.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor + + +def atleast_3d(*tensors): + """ + View inputs as tensors with at least three dimensions. + + Parameters + ---------- + tensors1, tensors2, ... : array_like + One or more tensor-like sequences. Non-tensor inputs are converted to + tensors. Tensors that already have three or more dimensions are + preserved. + + Returns + ------- + res1, res2, ... : Tensor + A tensor, or list of tensors, each with ``a.ndim >= 3``. Copies are + avoided where possible, and views with three or more dimensions are + returned. For example, a 1-D tensor of shape ``(N,)`` becomes a view + of shape ``(1, N, 1)``, and a 2-D tensor of shape ``(M, N)`` becomes a + view of shape ``(M, N, 1)``. 
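The shape promotion described above mirrors NumPy's `atleast_3d`; a minimal NumPy sketch (shapes chosen only for illustration):

import numpy as np

assert np.atleast_3d(5.0).shape == (1, 1, 1)               # a scalar gains three axes
assert np.atleast_3d(np.ones(4)).shape == (1, 4, 1)        # (N,)   -> (1, N, 1)
assert np.atleast_3d(np.ones((2, 3))).shape == (2, 3, 1)   # (M, N) -> (M, N, 1)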
+ + See Also + -------- + atleast_1d, atleast_2d + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.atleast_3d(3.0).execute() + array([[[ 3.]]]) + + >>> x = mt.arange(3.0) + >>> mt.atleast_3d(x).shape + (1, 3, 1) + + >>> x = mt.arange(12.0).reshape(4,3) + >>> mt.atleast_3d(x).shape + (4, 3, 1) + + >>> for arr in mt.atleast_3d([1, 2], [[1, 2]], [[[1, 2]]]).execute(): + ... print(arr, arr.shape) + ... + [[[1] + [2]]] (1, 2, 1) + [[[1] + [2]]] (1, 2, 1) + [[[1 2]]] (1, 1, 2) + + """ + new_tensors = [] + for x in tensors: + x = astensor(x) + if x.ndim == 0: + x = x[np.newaxis, np.newaxis, np.newaxis] + elif x.ndim == 1: + x = x[np.newaxis, :, np.newaxis] + elif x.ndim == 2: + x = x[:, :, None] + + new_tensors.append(x) + + if len(new_tensors) == 1: + return new_tensors[0] + return ExecutableTuple(new_tensors) diff --git a/python/xorbits/_mars/tensor/base/broadcast_arrays.py b/python/xorbits/_mars/tensor/base/broadcast_arrays.py new file mode 100644 index 000000000..3d96d14d7 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/broadcast_arrays.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import ExecutableTuple +from ..datasource import tensor as astensor +from ..utils import broadcast_shape +from .broadcast_to import broadcast_to + + +def broadcast_arrays(*args, **kwargs): + """ + Broadcast any number of arrays against each other. + + Parameters + ---------- + `*args` : array_likes + The tensors to broadcast. + + Returns + ------- + broadcasted : list of tensors + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1,2,3]]) + >>> y = mt.array([[1],[2],[3]]) + >>> mt.broadcast_arrays(x, y).execute() + [array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]), array([[1, 1, 1], + [2, 2, 2], + [3, 3, 3]])] + + """ + if kwargs: + raise TypeError( + "broadcast_arrays() got an unexpected keyword " + f"argument {next(iter(kwargs.keys()))!r}" + ) + + args = [astensor(arg) for arg in args] + + shape = broadcast_shape(*[arg.shape for arg in args]) + return ExecutableTuple([broadcast_to(a, shape) for a in args]) diff --git a/python/xorbits/_mars/tensor/base/broadcast_to.py b/python/xorbits/_mars/tensor/base/broadcast_to.py new file mode 100644 index 000000000..12f892f01 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/broadcast_to.py @@ -0,0 +1,151 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField, TupleField +from ..array_utils import device, get_array_module +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorBroadcastTo(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.BROADCAST_TO + + _input = KeyField("input") + _shape = TupleField("shape") + + def __init__(self, shape=None, **kw): + super().__init__(_shape=shape, **kw) + + @property + def shape(self): + return self._shape + + def __call__(self, tensor, shape): + return self.new_tensor([tensor], shape) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + in_tensor = op.inputs[0] + shape = op.shape + new_dim = tensor.ndim - in_tensor.ndim + + out_chunks = [] + for c in in_tensor.chunks: + chunk_shape = shape[:new_dim] + tuple( + s if in_tensor.shape[idx] != 1 else shape[new_dim + idx] + for idx, s in enumerate(c.shape) + ) + chunk_idx = (0,) * new_dim + c.index + chunk_op = op.copy().reset_key() + chunk_op._shape = chunk_shape + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + [in_tensor], + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + xp = get_array_module(ctx[op.input.key]) + input_data = ctx[op.input.key] + device_id = input_data.device.id if hasattr(input_data, "device") else -1 + + with device(device_id): + shape = op.shape + if any(np.isnan(s) for s in shape): + shape = list(shape) + new_dim = len(shape) - input_data.ndim + for i in range(input_data.ndim): + if np.isnan(shape[i + new_dim]): + shape[i + new_dim] = input_data.shape[i] + ctx[op.outputs[0].key] = xp.broadcast_to(input_data, shape) + + +def broadcast_to(tensor, shape): + """Broadcast an tensor to a new shape. + + Parameters + ---------- + tensor : array_like + The tensor to broadcast. + shape : tuple + The shape of the desired array. + + Returns + ------- + broadcast : Tensor + + Raises + ------ + ValueError + If the tensor is not compatible with the new shape according to Mars's + broadcasting rules. 
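The broadcasting rule referred to above is the usual trailing-dimension rule: size-1 axes may stretch and new leading axes may be prepended, while any other mismatch is rejected. A NumPy-only sketch (the example shapes are arbitrary):

import numpy as np

assert np.broadcast_to(np.ones((3, 1)), (2, 3, 4)).shape == (2, 3, 4)  # size-1 axis stretches, leading axis prepends
try:
    np.broadcast_to(np.ones((3, 2)), (3, 4))  # trailing dims must match or be 1
except ValueError as err:
    print("rejected:", err)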
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([1, 2, 3]) + >>> mt.broadcast_to(x, (3, 3)).execute() + array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + """ + from ..core import Tensor + + tensor = tensor if isinstance(tensor, Tensor) else astensor(tensor) + shape = tuple(shape) if isinstance(shape, (list, tuple)) else (shape,) + + if any(np.isnan(s) for s in tensor.shape): + raise ValueError( + "input tensor has unknown shape, need to call `.execute()` first" + ) + + if tensor.shape == shape: + return tensor + + new_ndim = len(shape) - tensor.ndim + if new_ndim < 0: + raise ValueError( + "input operand has more dimensions than allowed by the axis remapping" + ) + if any(o != n for o, n in zip(tensor.shape, shape[new_ndim:]) if o != 1): + raise ValueError( + "operands could not be broadcast together " + f"with remapped shapes [original->remapped]: {tensor.shape} " + f"and requested shape {shape}" + ) + + op = TensorBroadcastTo(shape, dtype=tensor.dtype, sparse=tensor.issparse()) + return op(tensor, shape) diff --git a/python/xorbits/_mars/tensor/base/copy.py b/python/xorbits/_mars/tensor/base/copy.py new file mode 100644 index 000000000..3ae544c7a --- /dev/null +++ b/python/xorbits/_mars/tensor/base/copy.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def copy(a, order="K"): + """ + Return a tensor copy of the given object. + + Parameters + ---------- + a : array_like + Input data. + order : {'C', 'F', 'A', 'K'}, optional + Controls the memory layout of the copy. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, + 'C' otherwise. 'K' means match the layout of `a` as closely + as possible. (Note that this function and :meth:`ndarray.copy` are very + similar, but have different default values for their order= + arguments.) + + Returns + ------- + arr : Tensor + Tensor interpretation of `a`. + + Notes + ----- + This is equivalent to: + + >>> import mars.tensor as mt + + >>> mt.array(a, copy=True) #doctest: +SKIP + + Examples + -------- + Create an array x, with a reference y and a copy z: + + >>> x = mt.array([1, 2, 3]) + >>> y = x + >>> z = mt.copy(x) + + Note that, when we modify x, y changes, but not z: + + >>> x[0] = 10 + >>> (x[0] == y[0]).execute() + True + >>> (x[0] == z[0]).execute() + False + + """ + from ..datasource import array + + return array(a, order=order, copy=True) diff --git a/python/xorbits/_mars/tensor/base/copyto.py b/python/xorbits/_mars/tensor/base/copyto.py new file mode 100644 index 000000000..4b7b88dcf --- /dev/null +++ b/python/xorbits/_mars/tensor/base/copyto.py @@ -0,0 +1,211 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import KeyField, StringField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, unify_chunks +from .broadcast_to import broadcast_to + + +class TensorCopyTo(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.COPYTO + + _src = KeyField("src") + _dst = KeyField("dest") + _casting = StringField("casting") + _where = KeyField("where") + + def __init__(self, casting=None, **kw): + super().__init__(_casting=casting, **kw) + + @property + def src(self): + return self._src + + @property + def dst(self): + return self._dst + + @property + def casting(self): + return self._casting + + @property + def where(self): + return self._where + + def check_inputs(self, inputs): + if not 2 <= len(inputs) <= 3: + raise ValueError("inputs' length must be 2 or 3") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + + self._src = self._inputs[0] + self._dst = self._inputs[1] + if len(self._inputs) > 2: + self._where = self._inputs[2] + + @staticmethod + def _extract_inputs(inputs): + if len(inputs) == 2: + (src, dst), where = inputs, None + else: + src, dst, where = inputs + if where is True: + where = None + else: + where = astensor(where) + + return src, dst, where + + def __call__(self, *inputs): + from ..core import Tensor + + src, dst, where = self._extract_inputs(inputs) + + if not isinstance(dst, Tensor): + raise TypeError("dst has to be a Tensor") + + self.dtype = dst.dtype + self.gpu = dst.op.gpu + self.sparse = dst.issparse() + + if not np.can_cast(src.dtype, dst.dtype, casting=self.casting): + raise TypeError( + f"Cannot cast array from {src.dtype!r} to {dst.dtype!r} " + f"according to the rule {self.casting!s}" + ) + + try: + broadcast_to(src, dst.shape) + except ValueError: + raise ValueError( + "could not broadcast input array " + f"from shape {src.shape!r} into shape {dst.shape!r}" + ) + if where: + try: + broadcast_to(where, dst.shape) + except ValueError: + raise ValueError( + "could not broadcast where mask " + f"from shape {src.shape!r} into shape {dst.shape!r}" + ) + + inps = [src, dst] + if where is not None: + inps.append(where) + ret = self.new_tensor(inps, dst.shape, order=dst.order) + dst.data = ret.data + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + inputs = yield from unify_chunks( + *[(input, list(range(input.ndim))[::-1]) for input in op.inputs] + ) + output = op.outputs[0] + + chunk_shapes = [ + t.chunk_shape if hasattr(t, "chunk_shape") else t for t in inputs + ] + out_chunk_shape = broadcast_shape(*chunk_shapes) + + out_chunks = [] + nsplits = [[np.nan] * shape for shape in out_chunk_shape] + get_index = lambda idx, t: tuple( + 0 if t.nsplits[i] == (1,) else ix for i, ix in enumerate(idx) + ) + for out_idx in itertools.product(*(map(range, out_chunk_shape))): + in_chunks = [ + t.cix[get_index(out_idx[-t.ndim :], t)] if 
t.ndim != 0 else t.chunks[0] + for t in inputs + ] + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + in_chunks, + shape=in_chunks[1].shape, + order=output.order, + index=out_idx, + ) + ) + out_chunks.append(out_chunk) + for i, idx, s in zip(itertools.count(0), out_idx, out_chunk.shape): + nsplits[i][idx] = s + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + dst = inputs[1].copy() + src = inputs[0] + where = inputs[2] if len(inputs) > 2 else True + + xp.copyto(dst, src, casting=op.casting, where=where) + ctx[op.outputs[0].key] = dst + + +def copyto(dst, src, casting="same_kind", where=True): + """ + Copies values from one array to another, broadcasting as necessary. + + Raises a TypeError if the `casting` rule is violated, and if + `where` is provided, it selects which elements to copy. + + Parameters + ---------- + dst : Tensor + The tensor into which values are copied. + src : array_like + The tensor from which values are copied. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur when copying. + + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. + where : array_like of bool, optional + A boolean tensor which is broadcasted to match the dimensions + of `dst`, and selects elements to copy from `src` to `dst` + wherever it contains the value True. + """ + op = TensorCopyTo(casting=casting) + return op(src, dst, where) diff --git a/python/xorbits/_mars/tensor/base/core.py b/python/xorbits/_mars/tensor/base/core.py new file mode 100644 index 000000000..b44dc8dde --- /dev/null +++ b/python/xorbits/_mars/tensor/base/core.py @@ -0,0 +1,47 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
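At chunk level the execute method above delegates to the array module's ``copyto`` (NumPy or CuPy); its masked-copy behaviour, exposed through the ``where`` argument, looks like this (a NumPy-only sketch with made-up values):

import numpy as np

dst = np.zeros(4)
src = np.array([1.0, 2.0, 3.0, 4.0])
np.copyto(dst, src, where=np.array([True, False, True, False]))
print(dst)  # [1. 0. 3. 0.] -- only positions where the mask is True are overwritten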
+ +from ...serialization.serializables import KeyField +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorDeviceConversionBase(TensorOperand, TensorOperandMixin): + _input = KeyField("input") + + @property + def input(self): + return self._input + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + + def __call__(self, tensor): + return self.new_tensor( + [tensor], shape=tensor.shape, dtype=tensor.dtype, order=tensor.order + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + for c in op.input.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk([c], **c.params) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + out = op.outputs[0] + return new_op.new_tensors( + op.inputs, nsplits=op.input.nsplits, chunks=out_chunks, **out.params + ) diff --git a/python/xorbits/_mars/tensor/base/delete.py b/python/xorbits/_mars/tensor/base/delete.py new file mode 100644 index 000000000..87db6b8dd --- /dev/null +++ b/python/xorbits/_mars/tensor/base/delete.py @@ -0,0 +1,236 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, Int32Field, Int64Field, KeyField +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import calc_object_length, filter_inputs, slice_split, validate_axis + + +class TensorDelete(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.DELETE + + _index_obj = AnyField("index_obj") + _axis = Int32Field("axis") + _input = KeyField("input") + + # for chunk + _offset_on_axis = Int64Field("offset_on_axis") + + def __init__(self, index_obj=None, axis=None, offset_on_axis=None, **kw): + super().__init__( + _index_obj=index_obj, _axis=axis, _offset_on_axis=offset_on_axis, **kw + ) + + @property + def index_obj(self): + return self._index_obj + + @property + def axis(self): + return self._axis + + @property + def offset_on_axis(self): + return self._offset_on_axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(self._inputs) > 1: + self._index_obj = self._inputs[1] + + @classmethod + def tile(cls, op: "TensorDelete"): + inp = op.input + index_obj = op.index_obj + axis = op.axis + if axis is None: + inp = yield from recursive_tile(inp.flatten()) + axis = 0 + if has_unknown_shape(inp): + yield + + if isinstance(index_obj, int): + index_obj = [index_obj] + + if isinstance(index_obj, ENTITY_TYPE): + index_obj = yield from recursive_tile(index_obj.rechunk(index_obj.shape)) + offsets = np.cumsum([0] + list(inp.nsplits[axis])) + out_chunks = [] + for c in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = index_obj.chunks[0] + chunk_op._offset_on_axis = int(offsets[c.index[axis]]) + shape = tuple(np.nan if j 
== axis else s for j, s in enumerate(c.shape)) + out_chunks.append( + chunk_op.new_chunk( + [c, index_obj.chunks[0]], shape=shape, index=c.index + ) + ) + nsplits_on_axis = (np.nan,) * len(inp.nsplits[axis]) + else: + nsplits_on_axis = [None for _ in inp.nsplits[axis]] + out_chunks = [] + # index_obj is list, tuple, slice or array like + if isinstance(index_obj, slice): + slc_splits = slice_split(index_obj, inp.nsplits[axis]) + for c in inp.chunks: + if c.index[axis] in slc_splits: + chunk_op = op.copy().reset_key() + chunk_slc = slc_splits[c.index[axis]] + shape = tuple( + s - calc_object_length(chunk_slc, s) if j == axis else s + for j, s in enumerate(c.shape) + ) + chunk_op._index_obj = chunk_slc + out_chunks.append( + chunk_op.new_chunk([c], shape=shape, index=c.index) + ) + nsplits_on_axis[c.index[axis]] = shape[axis] + else: + out_chunks.append(c) + nsplits_on_axis[c.index[axis]] = c.shape[axis] + else: + index_obj = np.array(index_obj) + cum_splits = np.cumsum([0] + list(inp.nsplits[axis])) + chunk_indexes = defaultdict(list) + for int_idx in index_obj: + in_idx = cum_splits.searchsorted(int_idx, side="right") - 1 + chunk_indexes[in_idx].append(int_idx - cum_splits[in_idx]) + + for c in inp.chunks: + idx_on_axis = c.index[axis] + if idx_on_axis in chunk_indexes: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = chunk_indexes[idx_on_axis] + shape = tuple( + s - len(chunk_indexes[idx_on_axis]) if j == axis else s + for j, s in enumerate(c.shape) + ) + out_chunks.append( + chunk_op.new_chunk([c], shape=shape, index=c.index) + ) + nsplits_on_axis[c.index[axis]] = shape[axis] + else: + out_chunks.append(c) + nsplits_on_axis[c.index[axis]] = c.shape[axis] + + nsplits = tuple( + s if i != axis else tuple(nsplits_on_axis) + for i, s in enumerate(inp.nsplits) + ) + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.input.key] + index_obj = ( + ctx[op.index_obj.key] if hasattr(op.index_obj, "key") else op.index_obj + ) + if op.offset_on_axis is None: + ctx[op.outputs[0].key] = np.delete(inp, index_obj, axis=op.axis) + else: + index_obj = np.array(index_obj) + part_index = [ + idx - op.offset_on_axis + for idx in index_obj + if ( + (idx >= op.offset_on_axis) + and idx < (op.offset_on_axis + inp.shape[op.axis or 0]) + ) + ] + + ctx[op.outputs[0].key] = np.delete(inp, part_index, axis=op.axis) + + def __call__(self, arr, obj, shape): + return self.new_tensor(filter_inputs([arr, obj]), shape=shape, order=arr.order) + + +def delete(arr, obj, axis=None): + """ + Return a new array with sub-arrays along an axis deleted. For a one + dimensional array, this returns those entries not returned by + `arr[obj]`. + + Parameters + ---------- + arr : array_like + Input array. + obj : slice, int or array of ints + Indicate indices of sub-arrays to remove along the specified axis. + axis : int, optional + The axis along which to delete the subarray defined by `obj`. + If `axis` is None, `obj` is applied to the flattened array. + + Returns + ------- + out : mars.tensor + A copy of `arr` with the elements specified by `obj` removed. Note + that `delete` does not occur in-place. If `axis` is None, `out` is + a flattened array. 
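The index-to-chunk mapping used in the tile method above reduces to a cumulative-offset lookup with ``searchsorted``; a NumPy sketch of the idea (``nsplits`` and the indices below are hypothetical, not taken from the operand):

import numpy as np

nsplits = (3, 3, 4)                     # assumed chunk sizes along the delete axis
offsets = np.cumsum((0,) + nsplits)     # array([0, 3, 6, 10])
for global_idx in (1, 5, 7):
    chunk = int(offsets.searchsorted(global_idx, side="right")) - 1  # owning chunk
    local = global_idx - offsets[chunk]                              # index within that chunk
    print(chunk, local)                 # -> 0 1, then 1 2, then 2 1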
+ + Examples + -------- + >>> import mars.tensor as mt + >>> arr = mt.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]]) + >>> arr.execute() + array([[ 1, 2, 3, 4], + [ 5, 6, 7, 8], + [ 9, 10, 11, 12]]) + >>> mt.delete(arr, 1, 0).execute() + array([[ 1, 2, 3, 4], + [ 9, 10, 11, 12]]) + >>> mt.delete(arr, np.s_[::2], 1).execute() + array([[ 2, 4], + [ 6, 8], + [10, 12]]) + >>> mt.delete(arr, [1,3,5], None).execute() + array([ 1, 3, 5, 7, 8, 9, 10, 11, 12]) + """ + arr = astensor(arr) + if getattr(obj, "ndim", 0) > 1: # pragma: no cover + raise ValueError( + "index array argument obj to insert must be one dimensional or scalar" + ) + + if axis is None: + # if axis is None, array will be flattened + arr_size = arr.size + idx_length = calc_object_length(obj, size=arr_size) + shape = (arr_size - idx_length,) + else: + validate_axis(arr.ndim, axis) + idx_length = calc_object_length(obj, size=arr.shape[axis]) + shape = tuple( + s - idx_length if i == axis else s for i, s in enumerate(arr.shape) + ) + + op = TensorDelete(index_obj=obj, axis=axis, dtype=arr.dtype) + return op(arr, obj, shape) diff --git a/python/xorbits/_mars/tensor/base/diff.py b/python/xorbits/_mars/tensor/base/diff.py new file mode 100644 index 000000000..e8626cb6b --- /dev/null +++ b/python/xorbits/_mars/tensor/base/diff.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import recursive_tile +from ...serialization.serializables import Int32Field, Int64Field +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import validate_axis + + +class TensorDiff(TensorOperand, TensorOperandMixin): + n = Int64Field("n") + axis = Int32Field("axis") + + def __call__(self, a): + shape = list(a.shape) + shape[self.axis] -= self.n + shape = tuple(shape) + return self.new_tensor([a], shape, dtype=a.dtype, order=a.order) + + @classmethod + def tile(cls, op: "TensorDiff"): + axis = op.axis + n = op.n + a = astensor(op.inputs[0]) + + slc1 = (slice(None),) * axis + (slice(1, None),) + slc2 = (slice(None),) * axis + (slice(-1),) + + for _ in range(n): + l = yield from recursive_tile(a[slc1]) + r = (yield from recursive_tile(a[slc2])).rechunk(l.nsplits) + a = yield from recursive_tile(l - r) + + return [a] + + +def diff(a, n=1, axis=-1): + """ + Calculate the n-th discrete difference along the given axis. + + The first difference is given by ``out[n] = a[n+1] - a[n]`` along + the given axis, higher differences are calculated by using `diff` + recursively. + + Parameters + ---------- + a : array_like + Input tensor + n : int, optional + The number of times values are differenced. If zero, the input + is returned as-is. + axis : int, optional + The axis along which the difference is taken, default is the + last axis. + + Returns + ------- + diff : Tensor + The n-th differences. The shape of the output is the same as `a` + except along `axis` where the dimension is smaller by `n`.
The + type of the output is the same as the type of the difference + between any two elements of `a`. This is the same as the type of + `a` in most cases. A notable exception is `datetime64`, which + results in a `timedelta64` output tensor. + + See Also + -------- + gradient, ediff1d, cumsum + + Notes + ----- + Type is preserved for boolean tensors, so the result will contain + `False` when consecutive elements are the same and `True` when they + differ. + + For unsigned integer tensors, the results will also be unsigned. This + should not be surprising, as the result is consistent with + calculating the difference directly: + + >>> import mars.tensor as mt + + >>> u8_arr = mt.array([1, 0], dtype=mt.uint8) + >>> mt.diff(u8_arr).execute() + array([255], dtype=uint8) + >>> (u8_arr[1,...] - u8_arr[0,...]).execute() + 255 + + If this is not desirable, then the array should be cast to a larger + integer type first: + + >>> i16_arr = u8_arr.astype(mt.int16) + >>> mt.diff(i16_arr).execute() + array([-1], dtype=int16) + + Examples + -------- + >>> x = mt.array([1, 2, 4, 7, 0]) + >>> mt.diff(x).execute() + array([ 1, 2, 3, -7]) + >>> mt.diff(x, n=2).execute() + array([ 1, 1, -10]) + + >>> x = mt.array([[1, 3, 6, 10], [0, 5, 6, 8]]) + >>> mt.diff(x).execute() + array([[2, 3, 4], + [5, 1, 2]]) + >>> mt.diff(x, axis=0).execute() + array([[-1, 2, 0, -2]]) + + >>> x = mt.arange('1066-10-13', '1066-10-16', dtype=mt.datetime64) + >>> mt.diff(x).execute() + array([1, 1], dtype='timedelta64[D]') + + """ + a = astensor(a) + n = int(n) + + axis = validate_axis(a.ndim, axis) + op = TensorDiff(axis=axis, n=n) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/dsplit.py b/python/xorbits/_mars/tensor/base/dsplit.py new file mode 100644 index 000000000..90ebca324 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/dsplit.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .split import split + + +def dsplit(a, indices_or_sections): + """ + Split tensor into multiple sub-tensors along the 3rd axis (depth). + + Please refer to the `split` documentation. `dsplit` is equivalent + to `split` with ``axis=2``, the array is always split along the third + axis provided the tensor dimension is greater than or equal to 3. + + See Also + -------- + split : Split a tensor into multiple sub-arrays of equal size. 
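Since ``dsplit`` is simply ``split`` along ``axis=2``, the equivalence is easy to check with NumPy (a minimal sketch; the array values are illustrative):

import numpy as np

x = np.arange(16.0).reshape(2, 2, 4)
a, b = np.dsplit(x, 2)                           # same result as np.split(x, 2, axis=2)
print(a.shape, b.shape)                          # (2, 2, 2) (2, 2, 2)
assert np.array_equal(a, np.split(x, 2, axis=2)[0])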
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(16.0).reshape(2, 2, 4) + >>> x.execute() + array([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]]]) + >>> mt.dsplit(x, 2).execute() + [array([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [ 12., 13.]]]), + array([[[ 2., 3.], + [ 6., 7.]], + [[ 10., 11.], + [ 14., 15.]]])] + >>> mt.dsplit(x, mt.array([3, 6])).execute() + [array([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [ 12., 13., 14.]]]), + array([[[ 3.], + [ 7.]], + [[ 11.], + [ 15.]]]), + array([], dtype=float64)] + + """ + ary = a + a = astensor(a) + + if a.ndim < 3: + raise ValueError("dsplit only works on tensors of 3 or more dimensions") + return split(ary, indices_or_sections, 2) diff --git a/python/xorbits/_mars/tensor/base/ediff1d.py b/python/xorbits/_mars/tensor/base/ediff1d.py new file mode 100644 index 000000000..53742668c --- /dev/null +++ b/python/xorbits/_mars/tensor/base/ediff1d.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .ravel import ravel + + +def ediff1d(a, to_end=None, to_begin=None): + """ + The differences between consecutive elements of a tensor. + + Parameters + ---------- + a : array_like + If necessary, will be flattened before the differences are taken. + to_end : array_like, optional + Number(s) to append at the end of the returned differences. + to_begin : array_like, optional + Number(s) to prepend at the beginning of the returned differences. + + Returns + ------- + ediff1d : Tensor + The differences. Loosely, this is ``a.flat[1:] - a.flat[:-1]``. + + See Also + -------- + diff, gradient + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([1, 2, 4, 7, 0]) + >>> mt.ediff1d(x).execute() + array([ 1, 2, 3, -7]) + + >>> mt.ediff1d(x, to_begin=-99, to_end=mt.array([88, 99])).execute() + array([-99, 1, 2, 3, -7, 88, 99]) + + The returned tensor is always 1D. + + >>> y = [[1, 2, 4], [1, 6, 24]] + >>> mt.ediff1d(y).execute() + array([ 1, 2, -3, 5, 18]) + + """ + from ..merge import concatenate + + a = astensor(a) + a = ravel(a) + + t = a[1:] - a[:-1] + if to_begin is None and to_end is None: + return t + + to_concat = [t] + if to_begin is not None: + to_concat.insert(0, ravel(astensor(to_begin))) + if to_end is not None: + to_concat.append(ravel(astensor(to_end))) + + return concatenate(to_concat) diff --git a/python/xorbits/_mars/tensor/base/expand_dims.py b/python/xorbits/_mars/tensor/base/expand_dims.py new file mode 100644 index 000000000..ff3450b86 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/expand_dims.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..datasource import tensor as astensor + + +def expand_dims(a, axis): + """ + Expand the shape of a tensor. + + Insert a new axis that will appear at the `axis` position in the expanded + array shape. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int + Position in the expanded axes where the new axis is placed. + + Returns + ------- + res : Tensor + Output tensor. The number of dimensions is one greater than that of + the input tensor. + + See Also + -------- + squeeze : The inverse operation, removing singleton dimensions + reshape : Insert, remove, and combine dimensions, and resize existing ones + doc.indexing, atleast_1d, atleast_2d, atleast_3d + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([1,2]) + >>> x.shape + (2,) + + The following is equivalent to ``x[mt.newaxis,:]`` or ``x[mt.newaxis]``: + + >>> y = mt.expand_dims(x, axis=0) + >>> y.execute() + array([[1, 2]]) + >>> y.shape + (1, 2) + + >>> y = mt.expand_dims(x, axis=1) # Equivalent to x[:,mt.newaxis] + >>> y.execute() + array([[1], + [2]]) + >>> y.shape + (2, 1) + + Note that some examples may use ``None`` instead of ``np.newaxis``. These + are the same objects: + + >>> mt.newaxis is None + True + + """ + a = astensor(a) + + if axis > a.ndim or axis < -a.ndim - 1: + raise np.AxisError( + f"Axis must be between -{a.ndim + 1} and {a.ndim}, got {axis}" + ) + + axis = axis if axis >= 0 else axis + a.ndim + 1 + indexes = (slice(None),) * axis + (np.newaxis,) + (slice(None),) * (a.ndim - axis) + return a[indexes] diff --git a/python/xorbits/_mars/tensor/base/flatten.py b/python/xorbits/_mars/tensor/base/flatten.py new file mode 100644 index 000000000..857f32732 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/flatten.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..utils import get_order + + +def flatten(a, order="C"): + """ + Return a copy of the tensor collapsed into one dimension. + + Parameters + ---------- + order : {'C', 'F', 'A', 'K'}, optional + 'C' means to flatten in row-major (C-style) order. + 'F' means to flatten in column-major (Fortran- + style) order. 'A' means to flatten in column-major + order if `a` is Fortran *contiguous* in memory, + row-major order otherwise. 'K' means to flatten + `a` in the order the elements occur in memory. + The default is 'C'. + + Returns + ------- + y : Tensor + A copy of the input tensor, flattened to one dimension. + + See Also + -------- + ravel : Return a flattened tensor. 
+ flat : A 1-D flat iterator over the tensor. + + Examples + -------- + + >>> import mars.tensor as mt + + >>> a = mt.array([[1,2], [3,4]]) + >>> a.flatten().execute() + array([1, 2, 3, 4]) + """ + + from ..reshape.reshape import TensorReshape, calc_shape + + if a.ndim == 1: + return a + + new_shape = np.nan if any(np.isnan(s) for s in a.shape) else calc_shape(a.size, -1) + tensor_order = get_order(order, a.order) + op = TensorReshape(new_shape, dtype=a.dtype, create_view=False) + return op(a, order=tensor_order, out_shape=new_shape) diff --git a/python/xorbits/_mars/tensor/base/flip.py b/python/xorbits/_mars/tensor/base/flip.py new file mode 100644 index 000000000..a77d2949c --- /dev/null +++ b/python/xorbits/_mars/tensor/base/flip.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ..datasource import tensor as astensor + + +def flip(m, axis): + """ + Reverse the order of elements in a tensor along the given axis. + + The shape of the array is preserved, but the elements are reordered. + + Parameters + ---------- + m : array_like + Input tensor. + axis : integer + Axis in tensor, which entries are reversed. + + + Returns + ------- + out : array_like + A view of `m` with the entries of axis reversed. Since a view is + returned, this operation is done in constant time. + + See Also + -------- + flipud : Flip a tensor vertically (axis=0). + fliplr : Flip a tensor horizontally (axis=1). + + Notes + ----- + flip(m, 0) is equivalent to flipud(m). + flip(m, 1) is equivalent to fliplr(m). + flip(m, n) corresponds to ``m[...,::-1,...]`` with ``::-1`` at position n. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.arange(8).reshape((2,2,2)) + >>> A.execute() + array([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + + >>> mt.flip(A, 0).execute() + array([[[4, 5], + [6, 7]], + + [[0, 1], + [2, 3]]]) + + >>> mt.flip(A, 1).execute() + array([[[2, 3], + [0, 1]], + + [[6, 7], + [4, 5]]]) + + >>> A = mt.random.randn(3,4,5) + >>> mt.all(mt.flip(A,2) == A[:,:,::-1,...]).execute() + True + """ + m = astensor(m) + + sl = [slice(None)] * m.ndim + try: + sl[axis] = slice(None, None, -1) + except IndexError: + raise ValueError( + "axis=%i is invalid for the %i-dimensional input tensor" % (axis, m.ndim) + ) + + return m[tuple(sl)] diff --git a/python/xorbits/_mars/tensor/base/fliplr.py b/python/xorbits/_mars/tensor/base/fliplr.py new file mode 100644 index 000000000..825a38d17 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/fliplr.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .flip import flip + + +def fliplr(m): + """ + Flip tensor in the left/right direction. + + Flip the entries in each row in the left/right direction. + Columns are preserved, but appear in a different order than before. + + Parameters + ---------- + m : array_like + Input tensor, must be at least 2-D. + + Returns + ------- + f : Tensor + A view of `m` with the columns reversed. Since a view + is returned, this operation is :math:`\\mathcal O(1)`. + + See Also + -------- + flipud : Flip array in the up/down direction. + rot90 : Rotate array counterclockwise. + + Notes + ----- + Equivalent to m[:,::-1]. Requires the tensor to be at least 2-D. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.diag([1.,2.,3.]) + >>> A.execute() + array([[ 1., 0., 0.], + [ 0., 2., 0.], + [ 0., 0., 3.]]) + >>> mt.fliplr(A).execute() + array([[ 0., 0., 1.], + [ 0., 2., 0.], + [ 3., 0., 0.]]) + + >>> A = mt.random.randn(2,3,5) + >>> mt.all(mt.fliplr(A) == A[:,::-1,...]).execute() + True + + """ + return flip(m, 1) diff --git a/python/xorbits/_mars/tensor/base/flipud.py b/python/xorbits/_mars/tensor/base/flipud.py new file mode 100644 index 000000000..4b8bea2ee --- /dev/null +++ b/python/xorbits/_mars/tensor/base/flipud.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .flip import flip + + +def flipud(m): + """ + Flip tensor in the up/down direction. + + Flip the entries in each column in the up/down direction. + Rows are preserved, but appear in a different order than before. + + Parameters + ---------- + m : array_like + Input tensor. + + Returns + ------- + out : array_like + A view of `m` with the rows reversed. Since a view is + returned, this operation is :math:`\\mathcal O(1)`. + + See Also + -------- + fliplr : Flip tensor in the left/right direction. + rot90 : Rotate tensor counterclockwise. + + Notes + ----- + Equivalent to ``m[::-1,...]``. + Does not require the tensor to be two-dimensional. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.diag([1.0, 2, 3]) + >>> A.execute() + array([[ 1., 0., 0.], + [ 0., 2., 0.], + [ 0., 0., 3.]]) + >>> mt.flipud(A).execute() + array([[ 0., 0., 3.], + [ 0., 2., 0.], + [ 1., 0., 0.]]) + + >>> A = mt.random.randn(2,3,5) + >>> mt.all(mt.flipud(A) == A[::-1,...]).execute() + True + + >>> mt.flipud([1,2]).execute() + array([2, 1]) + + """ + return flip(m, 0) diff --git a/python/xorbits/_mars/tensor/base/hsplit.py b/python/xorbits/_mars/tensor/base/hsplit.py new file mode 100644 index 000000000..bfffbc0c6 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/hsplit.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .split import split + + +def hsplit(a, indices_or_sections): + """ + Split a tensor into multiple sub-tensors horizontally (column-wise). + + Please refer to the `split` documentation. `hsplit` is equivalent + to `split` with ``axis=1``, the tensor is always split along the second + axis regardless of the tensor dimension. + + See Also + -------- + split : Split an array into multiple sub-arrays of equal size. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(16.0).reshape(4, 4) + >>> x.execute() + array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]]) + >>> mt.hsplit(x, 2).execute() + [array([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [ 12., 13.]]), + array([[ 2., 3.], + [ 6., 7.], + [ 10., 11.], + [ 14., 15.]])] + >>> mt.hsplit(x, mt.array([3, 6])).execute() + [array([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [ 12., 13., 14.]]), + array([[ 3.], + [ 7.], + [ 11.], + [ 15.]]), + array([], dtype=float64)] + + With a higher dimensional array the split is still along the second axis. + + >>> x = mt.arange(8.0).reshape(2, 2, 2) + >>> x.execute() + array([[[ 0., 1.], + [ 2., 3.]], + [[ 4., 5.], + [ 6., 7.]]]) + >>> mt.hsplit(x, 2) + [array([[[ 0., 1.]], + [[ 4., 5.]]]), + array([[[ 2., 3.]], + [[ 6., 7.]]])] + + """ + ary = a + a = astensor(a) + + if a.ndim == 0: + raise ValueError("hsplit only works on tensors of 1 or more dimensions") + if a.ndim > 1: + return split(ary, indices_or_sections, 1) + else: + return split(ary, indices_or_sections, 0) diff --git a/python/xorbits/_mars/tensor/base/in1d.py b/python/xorbits/_mars/tensor/base/in1d.py new file mode 100644 index 000000000..cf01961be --- /dev/null +++ b/python/xorbits/_mars/tensor/base/in1d.py @@ -0,0 +1,94 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import numpy as np + +from ...typing import TileableType +from .. import asarray + + +def in1d( + ar1: Union[TileableType, np.ndarray], + ar2: Union[TileableType, np.ndarray, list], + assume_unique: bool = False, + invert: bool = False, +): + """ + Test whether each element of a 1-D tensor is also present in a second tensor. + + Returns a boolean tensor the same length as `ar1` that is True + where an element of `ar1` is in `ar2` and False otherwise. + + We recommend using :func:`isin` instead of `in1d` for new code. + + Parameters + ---------- + ar1 : (M,) Tensor + Input tensor. + ar2 : array_like + The values against which to test each value of `ar1`. 
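As the code above shows, `hsplit` simply forwards to `split` with ``axis=1`` (or ``axis=0`` for 1-D input). A small NumPy sketch of the same dispatch rule, kept separate from the Mars operands:

```python
import numpy as np

def hsplit_like(a, indices_or_sections):
    """Split column-wise: axis 1 for ndim > 1, axis 0 for 1-D input."""
    a = np.asarray(a)
    if a.ndim == 0:
        raise ValueError("hsplit only works on tensors of 1 or more dimensions")
    axis = 1 if a.ndim > 1 else 0
    return np.split(a, indices_or_sections, axis=axis)

x = np.arange(16.0).reshape(4, 4)
left, right = hsplit_like(x, 2)
assert left.shape == (4, 2) and right.shape == (4, 2)
```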
+ assume_unique : bool, optional + If True, the input tensors are both assumed to be unique, which + can speed up the calculation. Default is False. + invert : bool, optional + If True, the values in the returned tensor are inverted (that is, + False where an element of `ar1` is in `ar2` and True otherwise). + Default is False. ``np.in1d(a, b, invert=True)`` is equivalent + to (but is faster than) ``np.invert(in1d(a, b))``. + + Returns + ------- + in1d : (M,) Tensor, bool + The values `ar1[in1d]` are in `ar2`. + + See Also + -------- + isin : Version of this function that preserves the + shape of ar1. + numpy.lib.arraysetops : Module with a number of other functions for + performing set operations on arrays. + + Notes + ----- + `in1d` can be considered as an element-wise function version of the + python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is roughly + equivalent to ``mt.array([item in b for item in a])``. + However, this idea fails if `ar2` is a set, or similar (non-sequence) + container: As ``ar2`` is converted to a tensor, in those cases + ``asarray(ar2)`` is an object tensor rather than the expected tensor of + contained values. + + Examples + -------- + >>> import mars.tensor as mt + >>> test = mt.array([0, 1, 2, 5, 0]) + >>> states = [0, 2] + >>> mask = mt.in1d(test, states) + >>> mask.execute() + array([ True, False, True, False, True]) + >>> test[mask].execute() + array([0, 2, 0]) + >>> mask = mt.in1d(test, states, invert=True) + >>> mask.execute() + array([False, True, False, True, False]) + >>> test[mask].execute() + array([1, 5]) + """ + from .isin import isin + + ar1 = asarray(ar1).ravel() + ar2 = asarray(ar2).ravel() + return isin(ar1, ar2, assume_unique=assume_unique, invert=invert) diff --git a/python/xorbits/_mars/tensor/base/insert.py b/python/xorbits/_mars/tensor/base/insert.py new file mode 100644 index 000000000..57dc75c31 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/insert.py @@ -0,0 +1,377 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
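`in1d` above is a thin wrapper that ravels both inputs and delegates to `isin`. The NumPy equivalent of that reduction, matching the docstring example:

```python
import numpy as np

test = np.array([0, 1, 2, 5, 0])
states = [0, 2]

# in1d(a, b) is isin applied to the flattened inputs
mask = np.isin(test.ravel(), np.asarray(states).ravel())
assert mask.tolist() == [True, False, True, False, True]

# invert=True flips the membership test
assert np.isin(test, states, invert=True).tolist() == [False, True, False, True, False]
```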
import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, Int32Field, KeyField, TupleField +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import calc_object_length, filter_inputs, validate_axis + + +class TensorInsert(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.INSERT + + _index_obj = AnyField("index_obj") + _values = AnyField("values") + _axis = Int32Field("axis") + _input = KeyField("input") + + # for chunk + _range_on_axis = TupleField("range_on_axis") + + def __init__( + self, index_obj=None, values=None, axis=None, range_on_axis=None, **kw + ): + super().__init__( + _index_obj=index_obj, + _values=values, + _axis=axis, + _range_on_axis=range_on_axis, + **kw + ) + + @property + def index_obj(self): + return self._index_obj + + @property + def values(self): + return self._values + + @property + def axis(self): + return self._axis + + @property + def range_on_axis(self): + return self._range_on_axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + if isinstance(self._index_obj, ENTITY_TYPE): + self._index_obj = next(inputs_iter) + if isinstance(self._values, ENTITY_TYPE): + self._values = next(inputs_iter) + + @classmethod + def tile(cls, op: "TensorInsert"): + inp = op.inputs[0] + axis = op.axis + if axis is None: + inp = yield from recursive_tile(inp.flatten()) + axis = 0 + else: + new_splits = [s if i == axis else sum(s) for i, s in enumerate(inp.nsplits)] + inp = yield from recursive_tile(inp.rechunk(new_splits)) + + if has_unknown_shape(inp): + yield + + index_obj = op.index_obj + values = op.values + if isinstance(values, ENTITY_TYPE): + # if values is Mars type, we rechunk it into one chunk and + # all insert chunks depend on it + values = yield from recursive_tile(values.rechunk(values.shape)) + + nsplits_on_axis = [] + if isinstance(index_obj, int): + splits = inp.nsplits[axis] + cum_splits = np.cumsum([0] + list(splits)) + # add 1 for last split + cum_splits[-1] = cum_splits[-1] + 1 + in_idx = cum_splits.searchsorted(index_obj, side="right") - 1 + out_chunks = [] + for chunk in inp.chunks: + if chunk.index[axis] == in_idx: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = index_obj - cum_splits[in_idx] + if isinstance(values, ENTITY_TYPE): + chunk_values = values.chunks[0] + else: + chunk_values = values + inputs = filter_inputs([chunk, chunk_values]) + shape = tuple( + s + calc_object_length(index_obj) if i == axis else s + for i, s in enumerate(chunk.shape) + ) + out_chunks.append( + chunk_op.new_chunk(inputs, shape=shape, index=chunk.index) + ) + nsplits_on_axis.append(shape[axis]) + else: + out_chunks.append(chunk) + nsplits_on_axis.append(chunk.shape[axis]) + elif isinstance(index_obj, ENTITY_TYPE): + index_obj = yield from recursive_tile(index_obj.rechunk(index_obj.shape)) + offset = 0 + out_chunks = [] + for chunk in inp.chunks: + chunk_op = op.copy().reset_key() + chunk_op._index_obj = index_obj.chunks[0] + if isinstance(values, ENTITY_TYPE): + chunk_values = values.chunks[0] + else: + chunk_values = values + chunk_op._values = chunk_values + if chunk.index[axis] + 1 == len(inp.nsplits[axis]): + # the last chunk on axis + chunk_op._range_on_axis = (offset, offset + chunk.shape[axis] + 1) + else: + chunk_op._range_on_axis = (offset, offset + chunk.shape[axis]) + shape = tuple( + np.nan if j == axis else s 
for j, s in enumerate(chunk.shape) + ) + inputs = filter_inputs([chunk, index_obj.chunks[0], chunk_values]) + out_chunks.append( + chunk_op.new_chunk(inputs, shape=shape, index=chunk.index) + ) + offset += chunk.shape[axis] + nsplits_on_axis.append(np.nan) + else: + # index object is slice or sequence of ints + if isinstance(index_obj, slice): + index_obj = range( + index_obj.start or 0, index_obj.stop, index_obj.step or 1 + ) + splits = inp.nsplits[axis] + cum_splits = np.cumsum([0] + list(splits)) + # add 1 for last split + cum_splits[-1] = cum_splits[-1] + 1 + chunk_idx_params = [[[], []] for _ in splits] + for i, int_idx in enumerate(index_obj): + in_idx = cum_splits.searchsorted(int_idx, side="right") - 1 + chunk_idx_params[in_idx][0].append(int_idx - cum_splits[in_idx]) + chunk_idx_params[in_idx][1].append(i) + + out_chunks = [] + offset = 0 + for chunk in inp.chunks: + idx_on_axis = chunk.index[axis] + if len(chunk_idx_params[idx_on_axis][0]) > 0: + chunk_op = op.copy().reset_key() + chunk_index_obj = chunk_idx_params[idx_on_axis][0] + shape = tuple( + s + len(chunk_index_obj) if j == axis else s + for j, s in enumerate(chunk.shape) + ) + if isinstance(values, int): + chunk_op._index_obj = chunk_index_obj + out_chunks.append( + chunk_op.new_chunk([chunk], shape=shape, index=chunk.index) + ) + elif isinstance(values, ENTITY_TYPE): + chunk_op._values = values.chunks[0] + if chunk.index[axis] + 1 == len(inp.nsplits[axis]): + chunk_op._range_on_axis = ( + offset, + offset + chunk.shape[axis] + 1, + ) + else: + chunk_op._range_on_axis = ( + offset, + offset + chunk.shape[axis], + ) + out_chunks.append( + chunk_op.new_chunk( + [chunk, values.chunks[0]], + shape=shape, + index=chunk.index, + ) + ) + offset += chunk.shape[axis] + else: + chunk_op._index_obj = chunk_index_obj + values = np.asarray(values) + to_shape = [ + calc_object_length(index_obj, chunk.shape[axis]) + ] + [s for j, s in enumerate(inp.shape) if j != axis] + if all(j == k for j, k in zip(to_shape, values.shape)): + chunk_values = np.asarray(values)[ + chunk_idx_params[idx_on_axis][1] + ] + chunk_op._values = chunk_values + out_chunks.append( + chunk_op.new_chunk( + [chunk], shape=shape, index=chunk.index + ) + ) + else: + out_chunks.append( + chunk_op.new_chunk( + [chunk], shape=shape, index=chunk.index + ) + ) + + nsplits_on_axis.append(shape[axis]) + else: + out_chunks.append(chunk) + nsplits_on_axis.append(chunk.shape[axis]) + + nsplits = tuple( + s if i != axis else tuple(nsplits_on_axis) + for i, s in enumerate(inp.nsplits) + ) + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op: "TensorInsert"): + inp = ctx[op.input.key] + index_obj = ( + ctx[op.index_obj.key] if hasattr(op.index_obj, "key") else op.index_obj + ) + values = ctx[op.values.key] if hasattr(op.values, "key") else op.values + if op.range_on_axis is None: + ctx[op.outputs[0].key] = np.insert(inp, index_obj, values, axis=op.axis) + else: + if isinstance(index_obj, slice): + index_obj = np.arange( + index_obj.step or 0, index_obj.stop, index_obj.step or 1 + ) + else: + index_obj = np.array(index_obj) + values = np.asarray(values) + + part_index = [ + i + for i, idx in enumerate(index_obj) + if ((idx >= op.range_on_axis[0]) and idx < op.range_on_axis[1]) + ] + if ( + (values.ndim > 0) + and len(index_obj) == len(values) + and (values[0].ndim > 0 or inp.ndim == 1) + ): + ctx[op.outputs[0].key] = 
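The tiling logic above relies on one piece of bookkeeping in both the scalar and the sequence branches: build cumulative chunk offsets along the insert axis, then use ``searchsorted`` to find which chunk a global index falls into and what its chunk-local position is (the last offset is bumped by one so an index equal to the total length still lands in the last chunk). A standalone NumPy sketch of that mapping, with hypothetical chunk sizes:

```python
import numpy as np

splits = (3, 3, 2)                       # hypothetical chunk sizes along the axis (total length 8)
cum_splits = np.cumsum([0] + list(splits))
cum_splits[-1] += 1                      # allow inserting at position == total length

def locate(global_idx):
    """Return (chunk index, index local to that chunk) for a global insert position."""
    chunk = int(cum_splits.searchsorted(global_idx, side="right")) - 1
    return chunk, global_idx - cum_splits[chunk]

assert locate(0) == (0, 0)   # start of chunk 0
assert locate(3) == (1, 0)   # first position of chunk 1
assert locate(8) == (2, 2)   # "append" position still lands in the last chunk
```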
np.insert( + inp, + index_obj[part_index] - op.range_on_axis[0], + values[part_index], + axis=op.axis, + ) + else: + ctx[op.outputs[0].key] = np.insert( + inp, + index_obj[part_index] - op.range_on_axis[0], + values, + axis=op.axis, + ) + + def __call__(self, arr, obj, values, shape): + return self.new_tensor( + filter_inputs([arr, obj, values]), shape=shape, order=arr.order + ) + + +def insert(arr, obj, values, axis=None): + """ + Insert values along the given axis before the given indices. + + Parameters + ---------- + arr : array like + Input array. + obj : int, slice or sequence of ints + Object that defines the index or indices before which `values` is + inserted. + values : array_like + Values to insert into `arr`. If the type of `values` is different + from that of `arr`, `values` is converted to the type of `arr`. + `values` should be shaped so that ``arr[...,obj,...] = values`` + is legal. + axis : int, optional + Axis along which to insert `values`. If `axis` is None then `arr` + is flattened first. + Returns + ------- + out : ndarray + A copy of `arr` with `values` inserted. Note that `insert` + does not occur in-place: a new array is returned. If + `axis` is None, `out` is a flattened array. + See Also + -------- + append : Append elements at the end of an array. + concatenate : Join a sequence of arrays along an existing axis. + delete : Delete elements from an array. + Notes + ----- + Note that for higher dimensional inserts `obj=0` behaves very different + from `obj=[0]` just like `arr[:,0,:] = values` is different from + `arr[:,[0],:] = values`. + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[1, 1], [2, 2], [3, 3]]) + >>> a.execute() + array([[1, 1], + [2, 2], + [3, 3]]) + >>> mt.insert(a, 1, 5).execute() + array([1, 5, 1, ..., 2, 3, 3]) + >>> mt.insert(a, 1, 5, axis=1).execute() + array([[1, 5, 1], + [2, 5, 2], + [3, 5, 3]]) + Difference between sequence and scalars: + >>> mt.insert(a, [1], [[1],[2],[3]], axis=1).execute() + array([[1, 1, 1], + [2, 2, 2], + [3, 3, 3]]) + >>> b = a.flatten() + >>> b.execute() + array([1, 1, 2, 2, 3, 3]) + >>> mt.insert(b, [2, 2], [5, 6]).execute() + array([1, 1, 5, ..., 2, 3, 3]) + >>> mt.insert(b, slice(2, 4), [5, 6]).execute() + array([1, 1, 5, ..., 2, 3, 3]) + >>> mt.insert(b, [2, 2], [7.13, False]).execute() # type casting + array([1, 1, 7, ..., 2, 3, 3]) + >>> x = mt.arange(8).reshape(2, 4) + >>> idx = (1, 3) + >>> mt.insert(x, idx, 999, axis=1).execute() + array([[ 0, 999, 1, 2, 999, 3], + [ 4, 999, 5, 6, 999, 7]]) + """ + arr = astensor(arr) + if getattr(obj, "ndim", 0) > 1: # pragma: no cover + raise ValueError( + "index array argument obj to insert must be one dimensional or scalar" + ) + + if axis is None: + # if axis is None, array will be flatten + arr_size = arr.size + idx_length = calc_object_length(obj, size=arr_size) + shape = (arr_size + idx_length,) + else: + validate_axis(arr.ndim, axis) + idx_length = calc_object_length(obj, size=arr.shape[axis]) + shape = tuple( + s + idx_length if i == axis else s for i, s in enumerate(arr.shape) + ) + + op = TensorInsert(index_obj=obj, values=values, axis=axis, dtype=arr.dtype) + return op(arr, obj, values, shape) diff --git a/python/xorbits/_mars/tensor/base/isin.py b/python/xorbits/_mars/tensor/base/isin.py new file mode 100644 index 000000000..95fe76e0d --- /dev/null +++ b/python/xorbits/_mars/tensor/base/isin.py @@ -0,0 +1,199 @@ +# Copyright 2022-2023 XProbe Inc. 
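`insert` computes the output shape up front: the length along the insert axis grows by the number of positions that `obj` describes (a scalar contributes one, a slice or sequence its length). `calc_object_length` is defined elsewhere in this package, so the helper below is only an illustrative stand-in for that rule:

```python
import numpy as np

def object_length(obj, size):
    """Illustrative stand-in: how many positions does `obj` describe for an axis of length `size`?"""
    if isinstance(obj, slice):
        return len(range(*obj.indices(size)))
    return np.size(obj)            # scalar -> 1, sequence -> its length

arr_shape, axis = (3, 2), 1
for obj, expected in [(1, (3, 3)), ([1, 1], (3, 4)), (slice(0, 2), (3, 4))]:
    grown = object_length(obj, arr_shape[axis])
    out_shape = tuple(s + grown if i == axis else s for i, s in enumerate(arr_shape))
    assert out_shape == expected
```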
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import BoolField +from ...typing import TileableType +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorIsIn(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.ISIN + + assume_unique = BoolField("assume_unique") + invert = BoolField("invert") + + def __call__(self, element, test_elements): + self.dtype = np.dtype(bool) + return self.new_tensor( + [element, test_elements], shape=element.shape, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + from ..merge.stack import TensorStack + from ..reduction import TensorAll, TensorAny + + ar1, ar2 = op.inputs + invert = op.invert + out = op.outputs[0] + + out_chunks = [] + for ar1_chunk in ar1.chunks: + to_concat_chunks = [] + for ar2_chunk in ar2.chunks: + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [ar1_chunk, ar2_chunk], + dtype=out.dtype, + shape=ar1_chunk.shape, + order=out.order, + index=ar1_chunk.index, + ) + to_concat_chunks.append(out_chunk) + if len(to_concat_chunks) == 1: + out_chunks.append(to_concat_chunks[0]) + else: + # concat chunks + concat_op = TensorStack(axis=0) + shape = (len(to_concat_chunks),) + ar1_chunk.shape + concat_chunk = concat_op.new_chunk( + to_concat_chunks, shape=shape, dtype=out.dtype, order=out.order + ) + if not invert: + chunk_op = TensorAny(axis=(0,), dtype=out.dtype) + out_chunk = chunk_op.new_chunk( + [concat_chunk], + shape=ar1_chunk.shape, + dtype=out.dtype, + order=out.order, + index=ar1_chunk.index, + ) + else: + chunk_op = TensorAll(axis=(0,), dtype=out.dtype) + out_chunk = chunk_op.new_chunk( + [concat_chunk], + shape=ar1_chunk.shape, + dtype=out.dtype, + order=out.order, + index=ar1_chunk.index, + ) + out_chunks.append(out_chunk) + + params = out.params.copy() + params["nsplits"] = ar1.nsplits + params["chunks"] = out_chunks + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + (element, test_elements), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.isin( + element, test_elements, assume_unique=op.assume_unique, invert=op.invert + ) + + +def isin( + element: Union[TileableType, np.ndarray], + test_elements: Union[TileableType, np.ndarray, list], + assume_unique: bool = False, + invert: bool = False, +): + """ + Calculates `element in test_elements`, broadcasting over `element` only. + Returns a boolean array of the same shape as `element` that is True + where an element of `element` is in `test_elements` and False otherwise. + + Parameters + ---------- + element : array_like + Input tensor. 
+ test_elements : array_like + The values against which to test each value of `element`. + This argument is flattened if it is a tensor or array_like. + See notes for behavior with non-array-like parameters. + assume_unique : bool, optional + If True, the input tensors are both assumed to be unique, which + can speed up the calculation. Default is False. + invert : bool, optional + If True, the values in the returned tensor are inverted, as if + calculating `element not in test_elements`. Default is False. + ``mt.isin(a, b, invert=True)`` is equivalent to (but faster + than) ``mt.invert(mt.isin(a, b))``. + + Returns + ------- + isin : Tensor, bool + Has the same shape as `element`. The values `element[isin]` + are in `test_elements`. + + See Also + -------- + in1d : Flattened version of this function. + + Notes + ----- + + `isin` is an element-wise function version of the python keyword `in`. + ``isin(a, b)`` is roughly equivalent to + ``mt.array([item in b for item in a])`` if `a` and `b` are 1-D sequences. + + `element` and `test_elements` are converted to tensors if they are not + already. If `test_elements` is a set (or other non-sequence collection) + it will be converted to an object tensor with one element, rather than a + tensor of the values contained in `test_elements`. This is a consequence + of the `tensor` constructor's way of handling non-sequence collections. + Converting the set to a list usually gives the desired behavior. + + Examples + -------- + >>> import mars.tensor as mt + + >>> element = 2*mt.arange(4).reshape((2, 2)) + >>> element.execute() + array([[0, 2], + [4, 6]]) + >>> test_elements = [1, 2, 4, 8] + >>> mask = mt.isin(element, test_elements) + >>> mask.execute() + array([[ False, True], + [ True, False]]) + >>> element[mask].execute() + array([2, 4]) + >>> mask = mt.isin(element, test_elements, invert=True) + >>> mask.execute() + array([[ True, False], + [ False, True]]) + >>> element[mask] + array([0, 6]) + + Because of how `array` handles sets, the following does not + work as expected: + + >>> test_set = {1, 2, 4, 8} + >>> mt.isin(element, test_set).execute() + array([[ False, False], + [ False, False]]) + + Casting the set to a list gives the expected result: + + >>> mt.isin(element, list(test_set)).execute() + array([[ False, True], + [ True, False]]) + """ + element, test_elements = astensor(element), astensor(test_elements).ravel() + op = TensorIsIn(assume_unique=assume_unique, invert=invert) + return op(element, test_elements) diff --git a/python/xorbits/_mars/tensor/base/map_chunk.py b/python/xorbits/_mars/tensor/base/map_chunk.py new file mode 100644 index 000000000..0f884c326 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/map_chunk.py @@ -0,0 +1,221 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
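The `tile` method above computes `isin` chunk by chunk: every chunk of `element` is tested against every chunk of `test_elements`, the per-pair boolean results are stacked, and the stack is reduced with `any` (or with `all` when ``invert=True``, since an element is absent overall only if it is absent from every test chunk). A NumPy sketch of that reduction under an assumed 1-D chunking of the test values:

```python
import numpy as np

element = np.array([0, 2, 4, 6])
test_chunks = [np.array([1, 2]), np.array([4, 8])]   # assumed chunking of test_elements

# plain isin: present in *any* test chunk
stacked = np.stack([np.isin(element, c) for c in test_chunks])
assert np.array_equal(stacked.any(axis=0),
                      np.isin(element, np.concatenate(test_chunks)))

# invert=True: absent from *all* test chunks
stacked_inv = np.stack([np.isin(element, c, invert=True) for c in test_chunks])
assert np.array_equal(stacked_inv.all(axis=0),
                      np.isin(element, np.concatenate(test_chunks), invert=True))
```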
import opcodes +from ...core import CHUNK_TYPE, ENTITY_TYPE, recursive_tile +from ...core.custom_log import redirect_custom_log +from ...serialization.serializables import ( + BoolField, + DictField, + FunctionField, + TupleField, +) +from ...utils import ( + enter_current_session, + find_objects, + has_unknown_shape, + quiet_stdio, + replace_objects, +) +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorMapChunk(TensorOperand, TensorOperandMixin): + _op_type_ = opcodes.MAP_CHUNK + + _func = FunctionField("func") + _elementwise = BoolField("elementwise") + _args = TupleField("args") + _kwargs = DictField("kwargs") + _with_chunk_index = BoolField("with_chunk_index") + + def __init__( + self, + func=None, + args=None, + kwargs=None, + elementwise=None, + with_chunk_index=None, + **kw + ): + super().__init__( + _func=func, + _args=args, + _kwargs=kwargs, + _elementwise=elementwise, + _with_chunk_index=with_chunk_index, + **kw + ) + + @property + def func(self): + return self._func + + @property + def elementwise(self): + return self._elementwise + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return self._kwargs + + @property + def with_chunk_index(self): + return self._with_chunk_index + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + old_inputs = find_objects(self._args, ENTITY_TYPE) + find_objects( + self._kwargs, ENTITY_TYPE + ) + mapping = {o: n for o, n in zip(old_inputs, self._inputs[1:])} + self._args = replace_objects(self._args, mapping) + self._kwargs = replace_objects(self._kwargs, mapping) + + def __call__(self, t, dtype=None, shape=None): + if dtype is None: + try: + kwargs = self.kwargs or dict() + if self.with_chunk_index: + kwargs["chunk_index"] = (0,) * t.ndim + with np.errstate(all="ignore"), quiet_stdio(): + mock_result = self.func( + np.random.rand(2, 2).astype(t.dtype), + *(self.args or ()), + **kwargs + ) + except: + raise TypeError("Cannot estimate output type of map_chunk call") + dtype = mock_result.dtype + + if shape is not None: + new_shape = shape + else: + new_shape = t.shape if self.elementwise else (np.nan,) * t.ndim + inputs = ( + [t] + + find_objects(self.args, ENTITY_TYPE) + + find_objects(self.kwargs, ENTITY_TYPE) + ) + return self.new_tensor(inputs, dtype=dtype, shape=new_shape) + + @classmethod + def tile(cls, op: "TensorMapChunk"): + inp = op.inputs[0] + out = op.outputs[0] + + new_inputs = [op.inputs[0]] + if has_unknown_shape(*op.inputs[1:]): + yield + for other_inp in op.inputs[1:]: + other_inp = yield from recursive_tile(other_inp.rechunk(other_inp.shape)) + new_inputs.append(other_inp) + + chunks = [] + for c in inp.chunks: + params = c.params + params["dtype"] = out.dtype + if not op.elementwise: + params["shape"] = (np.nan,) * out.ndim + params["index"] = params["index"][: out.ndim] + + new_op = op.copy().reset_key() + new_op.tileable_op_key = out.key + chunk_inputs = [c] + for other_inp in new_inputs[1:]: + chunk_inputs.append(other_inp.chunks[0]) + chunks.append(new_op.new_chunk(chunk_inputs, **params)) + + new_op = op.copy().reset_key() + params = out.params + nsplits = inp.nsplits[: out.ndim] + if not op.elementwise: + nsplits = tuple((np.nan,) * len(sp) for sp in nsplits) + return new_op.new_tileables(op.inputs, chunks=chunks, nsplits=nsplits, **params) + + @classmethod + @redirect_custom_log + @enter_current_session + def execute(cls, ctx, op: "TensorMapChunk"): + in_data = ctx[op.inputs[0].key] + out_chunk = op.outputs[0] + + args = op.args or tuple() + kwargs 
= op.kwargs or dict() + if op.with_chunk_index: + kwargs["chunk_index"] = out_chunk.index + + chunks = find_objects(args, CHUNK_TYPE) + find_objects(kwargs, CHUNK_TYPE) + mapping = {chunk: ctx[chunk.key] for chunk in chunks} + args = replace_objects(args, mapping) + kwargs = replace_objects(kwargs, mapping) + + ctx[op.outputs[0].key] = op.func(in_data, *args, **kwargs) + + +def map_chunk(t, func, args=(), **kwargs): + """ + Apply function to each chunk. + + Parameters + ---------- + func : function + Function to apply to each chunk. + args : tuple + Positional arguments to pass to func in addition to the array. + **kwargs + Additional keyword arguments to pass as keywords arguments to func. + + Returns + ------- + Tensor + Result of applying ``func`` to each chunk of the Tensor. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[4, 9]] * 3) + >>> a.execute() + array([[4, 9], + [4, 9], + [4, 9]]) + + Output dtype will be auto inferred. + + >>> a.map_chunk(lambda c: c * 0.5).execute() + array([[2. , 4.5], + [2. , 4.5], + [2. , 4.5]]) + + You can specify ``dtype`` by yourself if auto infer failed. + """ + elementwise = kwargs.pop("elementwise", None) + dtype = np.dtype(kwargs.pop("dtype")) if "dtype" in kwargs else None + shape = kwargs.pop("shape", None) + with_chunk_index = kwargs.pop("with_chunk_index", False) + + op = TensorMapChunk( + func=func, + args=args, + kwargs=kwargs, + elementwise=elementwise, + with_chunk_index=with_chunk_index, + ) + return op(t, dtype=dtype, shape=shape) diff --git a/python/xorbits/_mars/tensor/base/moveaxis.py b/python/xorbits/_mars/tensor/base/moveaxis.py new file mode 100644 index 000000000..9caa0654e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/moveaxis.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numpy.core.numeric import normalize_axis_tuple + +from ..datasource import tensor as astensor +from .transpose import transpose + + +def moveaxis(a, source, destination): + """ + Move axes of a tensor to new positions. + + Other axes remain in their original order. + + Parameters + ---------- + a : Tensor + The tensor whose axes should be reordered. + source : int or sequence of int + Original positions of the axes to move. These must be unique. + destination : int or sequence of int + Destination positions for each of the original axes. These must also be + unique. + + Returns + ------- + result : Tensor + Array with moved axes. This tensor is a view of the input tensor. + + See Also + -------- + transpose: Permute the dimensions of an array. + swapaxes: Interchange two axes of an array. 
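When the caller does not pass ``dtype``, `TensorMapChunk.__call__` above infers the output dtype by running the user function once on a tiny random array of the input dtype. A stripped-down sketch of that probing step, omitting the stdout-silencing and chunk-index plumbing used in the real operand:

```python
import numpy as np

def infer_map_chunk_dtype(func, input_dtype, *args, **kwargs):
    """Probe `func` on a small mock chunk to guess the output dtype."""
    mock = np.random.rand(2, 2).astype(input_dtype)
    with np.errstate(all="ignore"):
        try:
            return np.asarray(func(mock, *args, **kwargs)).dtype
        except Exception as exc:        # mirror the "cannot infer" failure mode
            raise TypeError("Cannot estimate output type of map_chunk call") from exc

assert infer_map_chunk_dtype(lambda c: c * 0.5, np.dtype(int)) == np.dtype(float)
assert infer_map_chunk_dtype(lambda c: c > 0, np.dtype(float)) == np.dtype(bool)
```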
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.zeros((3, 4, 5)) + >>> mt.moveaxis(x, 0, -1).shape + (4, 5, 3) + >>> mt.moveaxis(x, -1, 0).shape + (5, 3, 4), + + These all achieve the same result: + + >>> mt.transpose(x).shape + (5, 4, 3) + >>> mt.swapaxes(x, 0, -1).shape + (5, 4, 3) + >>> mt.moveaxis(x, [0, 1], [-1, -2]).shape + (5, 4, 3) + >>> mt.moveaxis(x, [0, 1, 2], [-1, -2, -3]).shape + (5, 4, 3) + + """ + a = astensor(a) + + source = normalize_axis_tuple(source, a.ndim, "source") + destination = normalize_axis_tuple(destination, a.ndim, "destination") + if len(source) != len(destination): + raise ValueError( + "`source` and `destination` arguments must have " + "the same number of elements" + ) + + order = [n for n in range(a.ndim) if n not in source] + + for dest, src in sorted(zip(destination, source)): + order.insert(dest, src) + + return transpose(a, order) diff --git a/python/xorbits/_mars/tensor/base/ndim.py b/python/xorbits/_mars/tensor/base/ndim.py new file mode 100644 index 000000000..cb7a8c9f1 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/ndim.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def ndim(a): + """ + Return the number of dimensions of a tensor. + + Parameters + ---------- + a : array_like + Input tebsir. If it is not already a tensor, a conversion is + attempted. + + Returns + ------- + number_of_dimensions : int + The number of dimensions in `a`. Scalars are zero-dimensional. + + See Also + -------- + ndarray.ndim : equivalent method + shape : dimensions of tensor + Tensor.shape : dimensions of tensor + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.ndim([[1,2,3],[4,5,6]]) + 2 + >>> mt.ndim(mt.array([[1,2,3],[4,5,6]])) + 2 + >>> mt.ndim(1) + 0 + + """ + from ..datasource import asarray + + try: + return a.ndim + except AttributeError: + return asarray(a).ndim diff --git a/python/xorbits/_mars/tensor/base/partition.py b/python/xorbits/_mars/tensor/base/partition.py new file mode 100644 index 000000000..c57f832a2 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/partition.py @@ -0,0 +1,774 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
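`moveaxis` above builds the permutation for `transpose` in two steps: keep every axis that is not being moved, then re-insert each moved axis at its destination, processing the pairs in destination order so earlier insertions do not displace later ones. A plain-Python sketch of just that ordering logic:

```python
def moveaxis_order(ndim, source, destination):
    """Compute the permutation that `transpose` receives (axes assumed already normalized)."""
    order = [n for n in range(ndim) if n not in source]   # axes that stay put, in order
    for dest, src in sorted(zip(destination, source)):    # re-insert moved axes at their targets
        order.insert(dest, src)
    return order

# moving axis 0 to the end of a 3-D tensor: shape (3, 4, 5) -> (4, 5, 3)
assert moveaxis_order(3, [0], [2]) == [1, 2, 0]
# moving axes 0 and 1 to the last two positions: shape (3, 4, 5) -> (5, 4, 3)
assert moveaxis_order(3, [0, 1], [2, 1]) == [2, 1, 0]
```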
import opcodes as OperandDef +from ...core import ENTITY_TYPE, ExecutableTuple, recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + KeyField, + ListField, + StringField, +) +from ...utils import flatten, has_unknown_shape, stack_back +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorShuffleProxy +from ..utils import validate_axis, validate_order +from .psrs import TensorPSRSOperandMixin + + +class ParallelPartitionMixin(TensorPSRSOperandMixin): + @classmethod + def calc_paritions_info(cls, op, kth, size, sort_info_chunks): + # stage5, collect sort infos and calculate partition info for each partitions + if isinstance(kth, TENSOR_TYPE): + kth = kth.chunks[0] + is_kth_input = True + else: + is_kth_input = False + calc_op = CalcPartitionsInfo( + kth=kth, size=size, dtype=np.dtype(np.int32), gpu=op.gpu + ) + kws = [] + for i, sort_info_chunk in enumerate(sort_info_chunks): + kws.append( + { + "shape": sort_info_chunk.shape + (len(kth),), + "order": sort_info_chunk.order, + "index": sort_info_chunk.index, + "pos": i, + } + ) + inputs = list(sort_info_chunks) + if is_kth_input: + inputs.insert(0, kth) + return calc_op.new_chunks(inputs, kws=kws, output_limit=len(kws)) + + @classmethod + def partition_on_merged( + cls, + op, + need_align, + partition_merged_chunks, + partition_indices_chunks, + partition_info_chunks, + ): + # Stage 6: partition on each partitions + return_value, return_indices = op.return_value, op.return_indices + partitioned_chunks, partitioned_indices_chunks = [], [] + for i, partition_merged_chunk, partition_info_chunk in zip( + itertools.count(), partition_merged_chunks, partition_info_chunks + ): + partition_op = PartitionMerged( + return_value=return_value, + return_indices=return_indices, + order=op.order, + kind=op.kind, + need_align=need_align, + dtype=partition_merged_chunk.dtype, + gpu=op.gpu, + ) + chunk_inputs = [] + kws = [] + if return_value: + chunk_inputs.append(partition_merged_chunk) + kws.append( + { + "shape": partition_merged_chunk.shape, + "order": partition_merged_chunk.order, + "index": partition_merged_chunk.index, + "dtype": partition_merged_chunk.dtype, + "type": "partitioned", + } + ) + if return_indices: + if not return_value: + # value is required even it's not returned + chunk_inputs.append(partition_merged_chunk) + chunk_inputs.append(partition_indices_chunks[i]) + kws.append( + { + "shape": partition_merged_chunk.shape, + "order": TensorOrder.C_ORDER, + "index": partition_merged_chunk.index, + "dtype": np.dtype(np.int64), + "type": "argpartition", + } + ) + chunk_inputs.append(partition_info_chunk) + partition_chunks = partition_op.new_chunks(chunk_inputs, kws=kws) + if return_value: + partitioned_chunks.append(partition_chunks[0]) + if return_indices: + partitioned_indices_chunks.append(partition_chunks[-1]) + + return partitioned_chunks, partitioned_indices_chunks + + +class TensorPartition(TensorOperand, ParallelPartitionMixin): + _op_type_ = OperandDef.PARTITION + + _input = KeyField("input") + _kth = AnyField("kth") + _axis = Int32Field("axis") + _kind = StringField("kind") + _order = ListField("order", FieldTypes.string) + _need_align = BoolField("need_align") + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + + def __init__( + self, + kth=None, + axis=None, + kind=None, + 
order=None, + need_align=None, + return_value=None, + return_indices=None, + dtype=None, + gpu=None, + **kw, + ): + super().__init__( + _kth=kth, + _axis=axis, + _kind=kind, + _order=order, + _need_align=need_align, + _return_value=return_value, + _return_indices=return_indices, + dtype=dtype, + gpu=gpu, + **kw, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) > 1: + self._kth = self._inputs[1] + + @property + def psrs_kinds(self): + # to keep compatibility with PSRS + # remember when merging data in PSRSShuffle(reduce), + # we don't need sort, thus set psrs_kinds[2] to None + return ["quicksort", "mergesort", None] + + @property + def need_align(self): + return self._need_align + + @property + def input(self): + return self._input + + @property + def kth(self): + return self._kth + + @property + def axis(self): + return self._axis + + @property + def kind(self): + return self._kind + + @property + def order(self): + return self._order + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def output_limit(self): + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + def __call__(self, a, kth): + inputs = [a] + if isinstance(kth, TENSOR_TYPE): + inputs.append(kth) + kws = [] + if self._return_value: + kws.append( + { + "shape": a.shape, + "order": a.order, + "type": "sorted", + "dtype": a.dtype, + } + ) + if self._return_indices: + kws.append( + { + "shape": a.shape, + "order": TensorOrder.C_ORDER, + "type": "argsort", + "dtype": np.dtype(np.int64), + } + ) + ret = self.new_tensors(inputs, kws=kws) + if len(kws) == 1: + return ret[0] + return ExecutableTuple(ret) + + @classmethod + def _tile_psrs(cls, op, kth): + """ + Approach here would be almost like PSRSSorter, but there are definitely some differences + Main processes are listed below: + Stage 1, local sort and regular samples collected + State 2, gather and merge samples, choose and broadcast p-1 pivots + Stage 3, Local data is partitioned + Stage 4: all *ith* classes are gathered and merged, sizes should be calculated as well + Stage 5: collect sizes from partitions, calculate how to partition given kth + Stage 6: partition on each partitions + Stage 7: align if axis is given, and more than 1 dimension + """ + out_tensor = op.outputs[0] + return_value, return_indices = op.return_value, op.return_indices + # preprocess, to make sure chunk shape on axis are approximately same + in_tensor, axis_chunk_shape, out_idxes, need_align = yield from cls.preprocess( + op + ) + axis_offsets = [0] + np.cumsum(in_tensor.nsplits[op.axis]).tolist()[:-1] + + out_chunks, out_indices_chunks = [], [] + for out_idx in out_idxes: + # stage 1: local sort and regular samples collected + ( + sorted_chunks, + indices_chunks, + sampled_chunks, + ) = cls.local_sort_and_regular_sample( + op, in_tensor, axis_chunk_shape, axis_offsets, out_idx + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_chunk = cls.concat_and_pivot( + op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ) + + # stage 3: Local data is partitioned + partition_chunks = cls.partition_local_data( + op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ) + + proxy_chunk = TensorShuffleProxy(dtype=partition_chunks[0].dtype).new_chunk( + partition_chunks, shape=() + ) + + # stage 4: all *ith* classes are gathered and merged, + # note 
that we don't need sort here, op.psrs_kinds[2] is None + # force need_align=True to get sort info + ( + partition_merged_chunks, + partition_indices_chunks, + sort_info_chunks, + ) = cls.partition_merge_data(op, True, True, partition_chunks, proxy_chunk) + + # stage5, collect sort infos and calculate partition info for each partitions + partition_info_chunks = cls.calc_paritions_info( + op, kth, in_tensor.shape[op.axis], sort_info_chunks + ) + + # Stage 6: partition on each partitions + partitioned_chunks, partitioned_indices_chunks = cls.partition_on_merged( + op, + need_align, + partition_merged_chunks, + partition_indices_chunks, + partition_info_chunks, + ) + + if not need_align: + if return_value: + out_chunks.extend(partitioned_chunks) + if return_indices: + out_indices_chunks.extend(partitioned_indices_chunks) + else: + ( + align_reduce_chunks, + align_reduce_indices_chunks, + ) = cls.align_partitions_data( + op, + out_idx, + in_tensor, + partitioned_chunks, + partitioned_indices_chunks, + sort_info_chunks, + ) + if return_value: + out_chunks.extend(align_reduce_chunks) + if return_indices: + out_indices_chunks.extend(align_reduce_indices_chunks) + + new_op = op.copy() + nsplits = list(in_tensor.nsplits) + if not need_align: + nsplits[op.axis] = (np.nan,) * axis_chunk_shape + kws = [] + if return_value: + kws.append( + { + "shape": out_tensor.shape, + "order": out_tensor.order, + "chunks": out_chunks, + "nsplits": tuple(nsplits), + "dtype": out_tensor.dtype, + "type": "partitioned", + } + ) + if return_indices: + kws.append( + { + "shape": out_tensor.shape, + "order": TensorOrder.C_ORDER, + "chunks": out_indices_chunks, + "nsplits": tuple(nsplits), + "dtype": np.dtype(np.int64), + "type": "argpartition", + } + ) + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + in_tensor = op.input + if np.isnan(in_tensor.shape[op.axis]): + yield + + kth = op.kth + if isinstance(kth, TENSOR_TYPE): + # if `kth` is a tensor, make sure no unknown shape + if has_unknown_shape(kth): + yield + kth = yield from recursive_tile(kth.rechunk(kth.shape)) + + return_value, return_indices = op.return_value, op.return_indices + if in_tensor.chunk_shape[op.axis] == 1: + out_chunks, out_indices_chunks = [], [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + kws = [] + if return_value: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": chunk.order, + "dtype": chunk.dtype, + "type": "partitioned", + } + ) + if return_indices: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argpartition", + } + ) + chunk_inputs = [chunk] + if isinstance(kth, TENSOR_TYPE): + chunk_inputs.append(kth.chunks[0]) + chunks = chunk_op.new_chunks(chunk_inputs, kws=kws) + if return_value: + out_chunks.append(chunks[0]) + if return_indices: + out_indices_chunks.append(chunks[-1]) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + if return_value: + kws[0]["nsplits"] = in_tensor.nsplits + kws[0]["chunks"] = out_chunks + if return_indices: + kws[-1]["nsplits"] = in_tensor.nsplits + kws[-1]["chunks"] = out_indices_chunks + return new_op.new_tensors([in_tensor], kws=kws) + else: + return (yield from cls._tile_psrs(op, kth)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + if len(inputs) == 2: + kth = inputs[1] + else: + kth = 
op.kth + return_value, return_indices = op.return_value, op.return_indices + + with device(device_id): + kw = {} + if op.kind is not None: + kw["kind"] = op.kind + if op.order is not None: + kw["order"] = op.order + + if return_indices: + if not return_value: + ctx[op.outputs[0].key] = xp.argpartition(a, kth, axis=op.axis, **kw) + else: + argparts = ctx[op.outputs[1].key] = xp.argpartition( + a, kth, axis=op.axis, **kw + ) + ctx[op.outputs[0].key] = xp.take_along_axis(a, argparts, op.axis) + else: + ctx[op.outputs[0].key] = xp.partition(a, kth, axis=op.axis, **kw) + + +class CalcPartitionsInfo(TensorOperand, TensorPSRSOperandMixin): + _op_type_ = OperandDef.CALC_PARTITIONS_INFO + + _kth = AnyField("kth") + _size = Int32Field("size") + + def __init__(self, kth=None, size=None, dtype=None, gpu=None, **kw): + super().__init__(_kth=kth, _size=size, dtype=dtype, gpu=gpu, **kw) + + @property + def kth(self): + return self._kth + + @property + def size(self): + return self._size + + @property + def output_limit(self): + return np.inf + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self._kth, ENTITY_TYPE): + self._kth = self._inputs[0] + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if isinstance(op.kth, TENSOR_CHUNK_TYPE): + kth = inputs[0] + sort_infos = inputs[1:] + # make kth all positive + kth = _validate_kth_value(kth, op.size) + else: + kth = op.kth + sort_infos = inputs + + sort_info_shape = sort_infos[0].shape + # create arrays filled with -1, -1 means do nothing about partition + partition_infos = [ + xp.full(sort_info_shape + (len(kth),), -1) for _ in sort_infos + ] + concat_sort_info = xp.stack([sort_info.ravel() for sort_info in sort_infos]) + cumsum_sort_info = xp.cumsum(concat_sort_info, axis=0) + + for j in range(cumsum_sort_info.shape[1]): + idx = xp.unravel_index(j, sort_infos[0].shape) + sizes = cumsum_sort_info[:, j] + to_partition_chunk_idxes = xp.searchsorted(sizes, kth, side="right") + for i, to_partition_chunk_idx in enumerate(to_partition_chunk_idxes): + partition_idx = tuple(idx) + (i,) + k = kth[i] + # if to partition on chunk 0, just set to kth + # else kth - {size of previous chunks} + chunk_k = ( + k + if to_partition_chunk_idx == 0 + else k - sizes[to_partition_chunk_idx - 1] + ) + partition_infos[to_partition_chunk_idx][partition_idx] = chunk_k + + for out, partition_info in zip(op.outputs, partition_infos): + ctx[out.key] = partition_info + + +class PartitionMerged(TensorOperand, TensorPSRSOperandMixin): + _op_type_ = OperandDef.PARTITION_MERGED + + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + _order = ListField("order", FieldTypes.string) + _kind = StringField("kind") + _need_align = BoolField("need_align") + + def __init__( + self, + return_value=None, + return_indices=None, + order=None, + kind=None, + need_align=None, + **kw, + ): + super().__init__( + _return_value=return_value, + _return_indices=return_indices, + _order=order, + _kind=kind, + _need_align=need_align, + **kw, + ) + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def order(self): + return self._order + + @property + def kind(self): + return self._kind + + @property + def need_align(self): + return self._need_align + + @property + def output_limit(self): + return 
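`CalcPartitionsInfo.execute` above answers one question per requested ``kth``: after the shuffle, which merged partition holds the k-th element overall, and what is the partition-local k? It does so with a cumulative sum of the per-partition sizes followed by ``searchsorted``. A NumPy sketch of that lookup for a single sort lane, with hypothetical sizes:

```python
import numpy as np

sizes = np.array([4, 3, 5])               # hypothetical lengths of the merged partitions
kth = np.array([2, 6, 11])                # global positions to partition on

cumsum = np.cumsum(sizes)                  # [4, 7, 12]
owners = np.searchsorted(cumsum, kth, side="right")   # which partition owns each kth
local_kth = np.where(owners == 0, kth, kth - cumsum[owners - 1])

assert owners.tolist() == [0, 1, 2]        # kth=2 -> partition 0, 6 -> 1, 11 -> 2
assert local_kth.tolist() == [2, 2, 4]     # positions inside the owning partition
```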
int(bool(self._return_value)) + int(bool(self._return_indices)) + + @classmethod + def execute(cls, ctx, op): + return_value, return_indices = op.return_value, op.return_indices + + raw_inputs = [ctx[inp.key] for inp in op.inputs] + flatten_inputs = flatten(raw_inputs) + inputs, device_id, xp = as_same_device( + flatten_inputs, device=op.device, ret_extra=True + ) + inputs = stack_back(inputs, raw_inputs) + partition_info = inputs[-1] + merged_data, merged_indices = None, None + if return_value: + merged_data = inputs[0] + if return_indices: + # if return indices, value should be returned + assert len(inputs) == 3 + if not return_value: + merged_data = inputs[0] + merged_indices = inputs[1] + + outs, out_indices = [], [] + with device(device_id): + kw = {} + if op.kind is not None: + kw["kind"] = op.kind + if op.order is not None: + kw["order"] = op.order + + ravel_partition_info = partition_info.reshape(-1, partition_info.shape[-1]) + for i, merged_vec, kth in zip( + itertools.count(), merged_data, ravel_partition_info + ): + kth = kth[kth > -1] + if kth.size == 0: + if return_value: + outs.append(merged_vec) + if return_indices: + out_indices.append(merged_indices[i]) + else: + if return_indices: + argparts = xp.argpartition(merged_vec, kth, **kw) + if return_value: + outs.append(xp.take(merged_vec, argparts)) + out_indices.append(xp.take(merged_indices[i], argparts)) + else: + outs.append(xp.partition(merged_vec, kth, **kw)) + + if not op.need_align: + assert len(outs or out_indices) == 1 + i = 0 + if return_value: + ctx[op.outputs[0].key] = outs[0] + i += 1 + if return_indices: + ctx[op.outputs[i].key] = out_indices[0] + else: + i = 0 + if return_value: + ctx[op.outputs[0].key] = tuple(outs) + i += 1 + if return_indices: + ctx[op.outputs[i].key] = tuple(out_indices) + + +def _check_kth_dtype(dtype): + if not np.issubdtype(dtype, np.integer): + raise TypeError("Partition index must be integer") + + +def _validate_kth_value(kth, size): + kth = np.where(kth < 0, kth + size, kth) + if np.any((kth < 0) | (kth >= size)): + invalid_kth = next(k for k in kth if k < 0 or k >= size) + raise ValueError(f"kth(={invalid_kth}) out of bounds ({size})") + return kth + + +def _validate_partition_arguments(a, kth, axis, kind, order, kw): + a = astensor(a) + if axis is None: + a = a.flatten() + axis = 0 + else: + axis = validate_axis(a.ndim, axis) + if isinstance(kth, ENTITY_TYPE): + kth = astensor(kth) + _check_kth_dtype(kth.dtype) + else: + kth = np.atleast_1d(kth) + kth = _validate_kth_value(kth, a.shape[axis]) + if kth.ndim > 1: + raise ValueError("object too deep for desired array") + if kind != "introselect": + raise ValueError(f"{kind} is an unrecognized kind of select") + # if a is structure type and order is not None + order = validate_order(a.dtype, order) + need_align = kw.pop("need_align", None) + if len(kw) > 0: + raise TypeError( + f"partition() got an unexpected keyword argument '{next(iter(kw))}'" + ) + + return a, kth, axis, kind, order, need_align + + +def partition(a, kth, axis=-1, kind="introselect", order=None, **kw): + r""" + Return a partitioned copy of a tensor. + + Creates a copy of the tensor with its elements rearranged in such a + way that the value of the element in k-th position is in the + position it would be in a sorted tensor. All elements smaller than + the k-th element are moved before this element and all equal or + greater are moved behind it. The ordering of the elements in the two + partitions is undefined. 
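`_validate_kth_value` above normalizes the requested indices before any tiling happens: negative ``kth`` values wrap around the axis length, and anything still out of range raises. The equivalent standalone check:

```python
import numpy as np

def normalize_kth(kth, size):
    """Wrap negative kth values and reject out-of-range ones, as the validator above does."""
    kth = np.atleast_1d(kth)
    kth = np.where(kth < 0, kth + size, kth)
    if np.any((kth < 0) | (kth >= size)):
        bad = next(int(k) for k in kth if k < 0 or k >= size)
        raise ValueError(f"kth(={bad}) out of bounds ({size})")
    return kth

assert normalize_kth([-1, 2], size=6).tolist() == [5, 2]
try:
    normalize_kth(7, size=6)
except ValueError as e:
    assert "out of bounds" in str(e)
```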
+ + Parameters + ---------- + a : array_like + Tensor to be sorted. + kth : int or sequence of ints + Element index to partition by. The k-th value of the element + will be in its final sorted position and all smaller elements + will be moved before it and all equal or greater elements behind + it. The order of all elements in the partitions is undefined. If + provided with a sequence of k-th it will partition all elements + indexed by k-th of them into their sorted position at once. + axis : int or None, optional + Axis along which to sort. If None, the tensor is flattened before + sorting. The default is -1, which sorts along the last axis. + kind : {'introselect'}, optional + Selection algorithm. Default is 'introselect'. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument + specifies which fields to compare first, second, etc. A single + field can be specified as a string. Not all fields need be + specified, but unspecified fields will still be used, in the + order in which they come up in the dtype, to break ties. + + Returns + ------- + partitioned_tensor : Tensor + Tensor of the same type and shape as `a`. + + See Also + -------- + Tensor.partition : Method to sort a tensor in-place. + argpartition : Indirect partition. + sort : Full sorting + + Notes + ----- + The various selection algorithms are characterized by their average + speed, worst case performance, work space size, and whether they are + stable. A stable sort keeps items with the same key in the same + relative order. The available algorithms have the following + properties: + + ================= ======= ============= ============ ======= + kind speed worst case work space stable + ================= ======= ============= ============ ======= + 'introselect' 1 O(n) 0 no + ================= ======= ============= ============ ======= + + All the partition algorithms make temporary copies of the data when + partitioning along any but the last axis. Consequently, + partitioning along the last axis is faster and uses less space than + partitioning along any other axis. + + The sort order for complex numbers is lexicographic. If both the + real and imaginary parts are non-nan then the order is determined by + the real parts except when they are equal, in which case the order + is determined by the imaginary parts. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([3, 4, 2, 1]) + >>> mt.partition(a, 3).execute() + array([2, 1, 3, 4]) + + >>> mt.partition(a, (1, 3)).execute() + array([1, 2, 3, 4]) + """ + return_indices = kw.pop("return_index", False) + a, kth, axis, kind, order, need_align = _validate_partition_arguments( + a, kth, axis, kind, order, kw + ) + op = TensorPartition( + kth=kth, + axis=axis, + kind=kind, + order=order, + need_align=need_align, + return_value=True, + return_indices=return_indices, + dtype=a.dtype, + gpu=a.op.gpu, + ) + return op(a, kth) diff --git a/python/xorbits/_mars/tensor/base/psrs.py b/python/xorbits/_mars/tensor/base/psrs.py new file mode 100644 index 000000000..ff8c712fc --- /dev/null +++ b/python/xorbits/_mars/tensor/base/psrs.py @@ -0,0 +1,993 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
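The public ``partition`` wrapper also accepts a ``return_index`` keyword; when it is set, the execute path shown earlier obtains indices with ``argpartition`` and recovers the values with ``take_along_axis``. The same pairing in plain NumPy, checking only the invariants that a partition guarantees:

```python
import numpy as np

a = np.array([[3, 4, 2, 1],
              [9, 7, 8, 6]])
kth, axis = 2, -1

idx = np.argpartition(a, kth, axis=axis)        # indices that partition each row
vals = np.take_along_axis(a, idx, axis=axis)    # partitioned values recovered from the indices

# the kth column holds each row's kth-smallest element ...
assert vals[:, kth].tolist() == [3, 8]
# ... with nothing larger before it and nothing smaller after it
assert (vals[:, :kth] <= vals[:, [kth]]).all()
assert (vals[:, kth + 1:] >= vals[:, [kth]]).all()
```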
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from functools import partial + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + Int32Field, + ListField, + StringField, +) +from ...utils import flatten, stack_back +from ..array_utils import as_same_device, cp, device +from ..core import TensorOrder +from ..operands import ( + TensorMapReduceOperand, + TensorOperand, + TensorOperandMixin, + TensorShuffleProxy, +) + + +class PSRSOperandMixin: + @classmethod + def preprocess(cls, op, in_data=None): + if in_data is None: + in_data = op.inputs[0] + axis_shape = in_data.shape[op.axis] + axis_chunk_shape = in_data.chunk_shape[op.axis] + + # rechunk to ensure all chunks on axis have rough same size + has_unknown_shape = False + for ns in in_data.nsplits: + if any(np.isnan(s) for s in ns): + has_unknown_shape = True + break + + if not has_unknown_shape: + axis_chunk_shape = min(axis_chunk_shape, int(np.sqrt(axis_shape))) + if np.isnan(axis_shape) or any( + np.isnan(s) for s in in_data.nsplits[op.axis] + ): + yield + chunk_size = int(axis_shape / axis_chunk_shape) + chunk_sizes = [chunk_size for _ in range(int(axis_shape // chunk_size))] + if axis_shape % chunk_size > 0: + chunk_sizes[-1] += axis_shape % chunk_size + in_data = yield from recursive_tile( + in_data.rechunk({op.axis: tuple(chunk_sizes)}) + ) + axis_chunk_shape = in_data.chunk_shape[op.axis] + + left_chunk_shape = ( + in_data.chunk_shape[: op.axis] + in_data.chunk_shape[op.axis + 1 :] + ) + if len(left_chunk_shape) > 0: + out_idxes = itertools.product(*(range(s) for s in left_chunk_shape)) + else: + out_idxes = [()] + # if the size except axis has more than 1, the sorted values on each one may be different + # another shuffle would be required to make sure each axis except to sort + # has elements with identical size + extra_shape = [s for i, s in enumerate(in_data.shape) if i != op.axis] + if getattr(op, "need_align", None) is None: + need_align = bool(np.prod(extra_shape, dtype=int) != 1) + else: + need_align = op.need_align + + return in_data, axis_chunk_shape, out_idxes, need_align + + @classmethod + def local_sort_and_regular_sample( + cls, op, in_data, axis_chunk_shape, axis_offsets, out_idx + ): + raise NotImplementedError + + @classmethod + def concat_and_pivot( + cls, op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ): + raise NotImplementedError + + @classmethod + def partition_local_data( + cls, op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ): + raise NotImplementedError + + @classmethod + def partition_merge_data( + cls, op, need_align, return_value, partition_chunks, proxy_chunk + ): + raise NotImplementedError + + @classmethod + def align_partitions_data( + cls, + op, + out_idx, + in_data, + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ): + raise NotImplementedError + + +class TensorPSRSOperandMixin(TensorOperandMixin, PSRSOperandMixin): + @classmethod + def local_sort_and_regular_sample( + cls, op, 
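`preprocess` above first normalizes the chunking along the sort axis: when shapes are known it caps the number of chunks at roughly the square root of the axis length and rechunks to near-equal pieces, folding any remainder into the last chunk. A small sketch of that size calculation with hypothetical numbers:

```python
import numpy as np

def psrs_chunk_sizes(axis_shape, axis_chunk_shape):
    """Cap the chunk count at ~sqrt(n) and split the axis into near-equal pieces."""
    axis_chunk_shape = min(axis_chunk_shape, int(np.sqrt(axis_shape)))
    chunk_size = int(axis_shape / axis_chunk_shape)
    chunk_sizes = [chunk_size] * (axis_shape // chunk_size)
    if axis_shape % chunk_size > 0:
        chunk_sizes[-1] += axis_shape % chunk_size   # remainder goes to the last chunk
    return chunk_sizes

assert psrs_chunk_sizes(100, 20) == [10] * 10        # capped at sqrt(100) = 10 chunks
assert sum(psrs_chunk_sizes(103, 20)) == 103          # remainder folded into the last chunk
```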
in_data, axis_chunk_shape, axis_offsets, out_idx + ): + # stage 1: local sort and regular samples collected + sorted_chunks, indices_chunks, sampled_chunks = [], [], [] + sampled_dtype = ( + np.dtype([(o, in_data.dtype[o]) for o in op.order]) + if op.order is not None + else in_data.dtype + ) + for i in range(axis_chunk_shape): + idx = list(out_idx) + idx.insert(op.axis, i) + in_chunk = in_data.cix[tuple(idx)] + kind = None if op.psrs_kinds is None else op.psrs_kinds[0] + chunk_op = PSRSSortRegularSample( + axis=op.axis, + order=op.order, + kind=kind, + return_indices=op.return_indices, + n_partition=axis_chunk_shape, + axis_offset=axis_offsets[i], + gpu=op.gpu, + ) + kws = [] + sort_shape = in_chunk.shape + kws.append( + { + "shape": sort_shape, + "order": in_chunk.order, + "dtype": in_chunk.dtype, + "index": in_chunk.index, + "type": "sorted", + } + ) + if op.return_indices: + kws.append( + { + "shape": sort_shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": in_chunk.index, + "type": "argsort", + } + ) + sampled_shape = (axis_chunk_shape,) + kws.append( + { + "shape": sampled_shape, + "order": in_chunk.order, + "dtype": sampled_dtype, + "index": (i,), + "type": "regular_sampled", + } + ) + chunks = chunk_op.new_chunks([in_chunk], kws=kws, output_limit=len(kws)) + if len(chunks) == 2: + sort_chunk, sampled_chunk = chunks + sorted_chunks.append(sort_chunk) + sampled_chunks.append(sampled_chunk) + else: + sort_chunk, indices_chunk, sampled_chunk = chunks + sorted_chunks.append(sort_chunk) + indices_chunks.append(indices_chunk) + sampled_chunks.append(sampled_chunk) + + return sorted_chunks, indices_chunks, sampled_chunks + + @classmethod + def concat_and_pivot( + cls, op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ): + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_op = PSRSConcatPivot( + axis=op.axis, + order=op.order, + kind=None if op.psrs_kinds is None else op.psrs_kinds[1], + dtype=sampled_chunks[0].dtype, + gpu=op.gpu, + ) + concat_pivot_shape = ( + sorted_chunks[0].shape[: op.axis] + + (axis_chunk_shape - 1,) + + sorted_chunks[0].shape[op.axis + 1 :] + ) + concat_pivot_index = out_idx[: op.axis] + (0,) + out_idx[op.axis :] + concat_pivot_chunk = concat_pivot_op.new_chunk( + sampled_chunks, shape=concat_pivot_shape, index=concat_pivot_index + ) + return concat_pivot_chunk + + @classmethod + def partition_local_data( + cls, op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ): + # stage 3: Local data is partitioned + return_value = op.return_value + return_indices = op.return_indices + if return_indices: + # if return indices and psrs_kind[2] is not None + # value has to be output + map_return_value = True + else: + map_return_value = return_value + partition_chunks = [] + length = len(sorted_chunks or indices_chunks) + for i in range(length): + chunk_inputs = [] + if sorted_chunks: + chunk_inputs.append(sorted_chunks[i]) + if indices_chunks: + chunk_inputs.append(indices_chunks[i]) + chunk_inputs.append(concat_pivot_chunk) + partition_shuffle_map = PSRSShuffle( + return_value=map_return_value, + return_indices=return_indices, + stage=OperandStage.map, + axis=op.axis, + n_partition=axis_chunk_shape, + input_sorted=op.psrs_kinds[0] is not None, + order=op.order, + dtype=chunk_inputs[0].dtype, + gpu=chunk_inputs[0].op.gpu, + ) + partition_chunk = partition_shuffle_map.new_chunk( + chunk_inputs, + shape=chunk_inputs[0].shape, + index=chunk_inputs[0].index, + 
order=chunk_inputs[0].order, + ) + partition_chunks.append(partition_chunk) + return partition_chunks + + @classmethod + def partition_merge_data( + cls, op, need_align, return_value, partition_chunks, proxy_chunk + ): + # stage 4: all *ith* classes are gathered and merged + return_value = return_value if return_value is not None else op.return_value + return_indices = op.return_indices + partition_sort_chunks, partition_indices_chunks, sort_info_chunks = [], [], [] + for i, partition_chunk in enumerate(partition_chunks): + kind = None if op.psrs_kinds is None else op.psrs_kinds[2] + partition_shuffle_reduce = PSRSShuffle( + return_value=return_value, + return_indices=return_indices, + stage=OperandStage.reduce, + axis=op.axis, + order=op.order, + kind=kind, + reducer_index=(i,), + n_reducers=len(partition_chunks), + dtype=partition_chunk.dtype, + gpu=partition_chunk.op.gpu, + need_align=need_align, + ) + kws = [] + chunk_shape = list(partition_chunk.shape) + chunk_shape[op.axis] = np.nan + if return_value: + kws.append( + { + "shape": tuple(chunk_shape), + "order": partition_chunk.order, + "index": partition_chunk.index, + "dtype": partition_chunk.dtype, + "type": "sorted", + } + ) + if return_indices: + kws.append( + { + "shape": tuple(chunk_shape), + "order": TensorOrder.C_ORDER, + "index": partition_chunk.index, + "dtype": np.dtype(np.int64), + "type": "argsort", + } + ) + if need_align: + s = list(chunk_shape) + s.pop(op.axis) + kws.append( + { + "shape": tuple(s), + "order": TensorOrder.C_ORDER, + "index": partition_chunk.index, + "dtype": np.dtype(np.int32), + "type": "sort_info", + } + ) + cs = partition_shuffle_reduce.new_chunks([proxy_chunk], kws=kws) + i = 0 + if return_value: + partition_sort_chunks.append(cs[0]) + i += 1 + if return_indices: + partition_indices_chunks.append(cs[i]) + if need_align: + sort_info_chunks.append(cs[-1]) + + return partition_sort_chunks, partition_indices_chunks, sort_info_chunks + + @classmethod + def align_partitions_data( + cls, + op, + out_idx, + in_tensor, + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ): + return_value, return_indices = op.return_value, op.return_indices + align_map_chunks = [] + length = len(partition_sort_chunks or partition_indices_chunks) + for i in range(length): + chunk_inputs = [] + if return_value: + chunk_inputs.append(partition_sort_chunks[i]) + if return_indices: + chunk_inputs.append(partition_indices_chunks[i]) + chunk_inputs.extend(sort_info_chunks) + align_map_op = PSRSAlign( + return_value=return_value, + return_indices=return_indices, + stage=OperandStage.map, + axis=op.axis, + output_sizes=list(in_tensor.nsplits[op.axis]), + dtype=chunk_inputs[0].dtype, + gpu=chunk_inputs[0].op.gpu, + ) + align_map_chunk = align_map_op.new_chunk( + chunk_inputs, + shape=chunk_inputs[0].shape, + index=chunk_inputs[0].index, + order=TensorOrder.C_ORDER, + ) + align_map_chunks.append(align_map_chunk) + proxy_chunk = TensorShuffleProxy(dtype=align_map_chunks[0].dtype).new_chunk( + align_map_chunks, shape=() + ) + align_reduce_value_chunks, align_reduce_indices_chunks = [], [] + for i, align_map_chunk in enumerate(align_map_chunks): + align_reduce_op = PSRSAlign( + return_value=return_value, + return_indices=return_indices, + stage=OperandStage.reduce, + axis=op.axis, + reducer_index=(i,), + n_reducers=len(align_map_chunks), + dtype=align_map_chunk.dtype, + gpu=align_map_chunk.op.gpu, + ) + idx = list(out_idx) + idx.insert(op.axis, i) + in_chunk = in_tensor.cix[tuple(idx)] + kws = [] + if return_value: 
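+                # the reduce chunk reuses the shape and index of the matching input
+                # chunk (in_tensor.cix), so the aligned output keeps the input
+                # tensor's original chunk layout along the sorted axis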
+                kws.append(
+                    {
+                        "shape": in_chunk.shape,
+                        "index": in_chunk.index,
+                        "order": in_chunk.order,
+                        "dtype": in_chunk.dtype,
+                        "type": "sorted",
+                    }
+                )
+            if return_indices:
+                kws.append(
+                    {
+                        "shape": in_chunk.shape,
+                        "index": in_chunk.index,
+                        "order": TensorOrder.C_ORDER,
+                        "dtype": np.dtype(np.int64),
+                        "type": "argsort",
+                    }
+                )
+            align_reduce_chunks = align_reduce_op.new_chunks([proxy_chunk], kws=kws)
+            if return_value:
+                align_reduce_value_chunks.append(align_reduce_chunks[0])
+            if return_indices:
+                align_reduce_indices_chunks.append(align_reduce_chunks[-1])
+
+        return align_reduce_value_chunks, align_reduce_indices_chunks
+
+
+def _sort(a, op, xp, axis=None, kind=None, order=None, inplace=False):
+    axis = axis if axis is not None else op.axis
+    kind = kind if kind is not None else op.kind
+    order = order if order is not None else op.order
+    if xp is np:
+        method = a.sort if inplace else partial(np.sort, a)
+        return method(axis=axis, kind=kind, order=order)
+    else:  # pragma: no cover
+        # cupy does not support structure type, so `order` must not be set here
+        assert xp is cp
+        assert order is None
+        method = a.sort if inplace else partial(cp.sort, a)
+        # cupy does not support kind, thus just ignore it
+        return method(axis=axis)
+
+
+def _argsort(a, op, xp, axis=None, kind=None, order=None):
+    axis = axis if axis is not None else op.axis
+    kind = kind if kind is not None else op.kind
+    order = order if order is not None else op.order
+    if xp is np:
+        return np.argsort(a, axis=axis, kind=kind, order=order)
+    else:  # pragma: no cover
+        # cupy does not support structure type, so `order` must not be set here
+        assert xp is cp
+        assert order is None
+        return cp.argsort(a, axis=axis)
+
+
+class PSRSSortRegularSample(TensorOperand, TensorOperandMixin):
+    _op_type_ = OperandDef.PSRS_SORT_REGULAR_SMAPLE
+
+    _axis = Int32Field("axis")
+    _order = ListField("order", FieldTypes.string)
+    _kind = StringField("kind")
+    _return_indices = BoolField("return_indices")
+    _n_partition = Int32Field("n_partition")
+    _axis_offset = AnyField("axis_offset")
+
+    def __init__(
+        self,
+        axis=None,
+        order=None,
+        kind=None,
+        return_indices=None,
+        n_partition=None,
+        axis_offset=None,
+        dtype=None,
+        gpu=None,
+        **kw
+    ):
+        super().__init__(
+            _axis=axis,
+            _order=order,
+            _kind=kind,
+            _return_indices=return_indices,
+            _n_partition=n_partition,
+            _axis_offset=axis_offset,
+            dtype=dtype,
+            gpu=gpu,
+            **kw
+        )
+
+    @property
+    def axis(self):
+        return self._axis
+
+    @property
+    def order(self):
+        return self._order
+
+    @property
+    def kind(self):
+        return self._kind
+
+    @property
+    def return_indices(self):
+        return self._return_indices
+
+    @property
+    def n_partition(self):
+        return self._n_partition
+
+    @property
+    def axis_offset(self):
+        return self._axis_offset
+
+    @property
+    def output_limit(self):
+        # return sorted tensor, indices(optional) and regular sampled tensor
+        return 2 if not self._return_indices else 3
+
+    @classmethod
+    def execute(cls, ctx, op):
+        (a,), device_id, xp = as_same_device(
+            [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True
+        )
+
+        if len(a) == 0:
+            # when chunk is empty, return the empty chunk itself
+            ctx[op.outputs[0].key] = ctx[op.outputs[-1].key] = a
+            return
+
+        with device(device_id):
+            n = op.n_partition
+            w = a.shape[op.axis] * 1.0 / (n + 1)
+            if not op.return_indices:
+                if op.kind is not None:
+                    # sort
+                    res = ctx[op.outputs[0].key] = _sort(a, op, xp)
+                else:
+                    # do not sort, prepare for sample by `xp.partition`
+                    kth = xp.linspace(
+                        max(w - 1, 0), a.shape[op.axis] - 1, num=n, endpoint=False
+                    ).astype(int)
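+                    # kth matches the regular-sample positions taken further below
+                    # (same linspace), so xp.partition already places the exact sample
+                    # values at those positions and a full sort is unnecessary here.
+                    # illustrative check with plain NumPy:
+                    #   a = np.array([9, 3, 7, 1, 5]); kth = np.array([1, 3])
+                    #   np.partition(a, kth)[kth]  ->  array([3, 7]) == np.sort(a)[kth]
+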
ctx[op.outputs[0].key] = res = xp.partition( + a, kth, axis=op.axis, order=op.order + ) + else: + if op.kind is not None: + # argsort + indices = _argsort(a, op, xp) + else: + # do not sort, use `xp.argpartition` + kth = xp.linspace( + max(w - 1, 0), a.shape[op.axis] - 1, num=n, endpoint=False + ).astype(int) + indices = xp.argpartition(a, kth, axis=op.axis, order=op.order) + ctx[op.outputs[0].key] = res = xp.take_along_axis(a, indices, op.axis) + ctx[op.outputs[1].key] = op.axis_offset + indices + + # do regular sample + if op.order is not None: + res = res[op.order] + slc = xp.linspace( + max(w - 1, 0), a.shape[op.axis] - 1, num=n, endpoint=False + ).astype(int) + slc = (slice(None),) * op.axis + (slc,) + ctx[op.outputs[-1].key] = res[slc] + + +class PSRSConcatPivot(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.PSRS_CONCAT_PIVOT + + _axis = Int32Field("axis") + _order = ListField("order", FieldTypes.string) + _kind = StringField("kind") + + def __init__(self, axis=None, order=None, kind=None, dtype=None, gpu=None, **kw): + super().__init__( + _axis=axis, _order=order, _kind=kind, dtype=dtype, gpu=gpu, **kw + ) + + @property + def axis(self): + return self._axis + + @property + def order(self): + return self._order + + @property + def kind(self): + return self._kind + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs if len(ctx[c.key]) > 0], + device=op.device, + ret_extra=True, + ) + + with device(device_id): + a = xp.concatenate(inputs, axis=op.axis) + p = len(inputs) + assert a.shape[op.axis] == p * len(op.inputs) + + if op.kind is not None: + # sort + _sort(a, op, xp, inplace=True) + else: + # prepare for sampling via `partition` + kth = xp.linspace( + p - 1, a.shape[op.axis] - 1, num=p - 1, endpoint=False + ).astype(int) + a.partition(kth, axis=op.axis) + + select = xp.linspace( + p - 1, a.shape[op.axis] - 1, num=len(op.inputs) - 1, endpoint=False + ).astype(int) + slc = (slice(None),) * op.axis + (select,) + ctx[op.outputs[0].key] = a[slc] + + +class PSRSShuffle(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PSRS_SHUFFLE + + # public + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + + # for shuffle map + _axis = Int32Field("axis") + _order = ListField("order", FieldTypes.string) + _n_partition = Int32Field("n_partition") + _input_sorted = BoolField("input_sorted") + + # for shuffle reduce + _kind = StringField("kind") + _need_align = BoolField("need_align") + + def __init__( + self, + return_value=None, + return_indices=None, + axis=None, + order=None, + n_partition=None, + input_sorted=None, + kind=None, + need_align=None, + **kw + ): + super().__init__( + _return_value=return_value, + _return_indices=return_indices, + _axis=axis, + _order=order, + _n_partition=n_partition, + _input_sorted=input_sorted, + _kind=kind, + _need_align=need_align, + **kw + ) + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def axis(self): + return self._axis + + @property + def order(self): + return self._order + + @property + def n_partition(self): + return self._n_partition + + @property + def input_sorted(self): + return self._input_sorted + + @property + def kind(self): + return self._kind + + @property + def need_align(self): + return self._need_align + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + else: 
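+            # reduce stage: one output for values and/or one for indices, plus a
+            # trailing sort_info chunk when re-alignment is needed, e.g. all three
+            # enabled -> output_limit == 3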
+ limit = int(bool(self._return_value)) + int(bool(self._return_indices)) + if self._need_align: + limit += 1 + return limit + + @classmethod + def _execute_map(cls, ctx, op): + return_value = op.return_value + return_indices = op.return_indices + + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + out = op.outputs[0] + a = inputs[0] + pivots = inputs[-1] + a_indices = None + if return_indices: + a_indices = inputs[-2] + + with device(device_id): + shape = tuple(s for i, s in enumerate(a.shape) if i != op.axis) + reduce_outputs = [ + np.empty(shape, dtype=object) for _ in range(op.n_partition) + ] + for idx in itertools.product(*(range(s) for s in shape)): + slc = list(idx) + slc.insert(op.axis, slice(None)) + slc = tuple(slc) + a_1d, pivots_1d = a[slc], pivots[slc] + a_indices_1d = a_indices[slc] if a_indices is not None else None + raw_a_1d = a_1d + if op.order is not None: + a_1d = a_1d[op.order] + if op.input_sorted: + # a is sorted already + poses = xp.searchsorted(a_1d, pivots_1d, side="right") + poses = (None,) + tuple(poses) + (None,) + for i in range(op.n_partition): + reduce_out = [] + if return_value: + values = raw_a_1d[poses[i] : poses[i + 1]] + reduce_out.append(values) + if return_indices: + indices = a_indices_1d[poses[i] : poses[i + 1]] + reduce_out.append(indices) + reduce_outputs[i][idx] = tuple(reduce_out) + else: + # a is not sorted, search every element in pivots + out_idxes = xp.searchsorted(pivots_1d, a_1d, side="right") + for i in range(op.n_partition): + cond = out_idxes == i + reduce_out = [] + if return_value: + values = raw_a_1d[cond] + reduce_out.append(values) + if return_indices: + indices = a_indices_1d[cond] + reduce_out.append(indices) + reduce_outputs[i][idx] = tuple(reduce_out) + for i in range(op.n_partition): + ctx[out.key, (i,)] = tuple(reduce_outputs[i].ravel()) + + @classmethod + def _execute_reduce(cls, ctx, op: "PSRSShuffle"): + raw_inputs = list(op.iter_mapper_data(ctx)) + # flatten inputs + flatten_inputs = flatten(raw_inputs) + inputs, device_id, xp = as_same_device( + flatten_inputs, device=op.device, ret_extra=True + ) + # organize back inputs + inputs = stack_back(inputs, raw_inputs) + + out = op.outputs[0] + extra_shape = list(out.shape) + extra_shape.pop(op.axis) + + return_value = op.return_value + return_indices = op.return_indices + + with device(device_id): + sort_res = np.empty(len(inputs[0]), dtype=object) + if extra_shape: + sort_res = sort_res.reshape(*extra_shape) + sort_info = np.empty(sort_res.shape, dtype=np.int32) + it = itertools.count(0) + for inps in zip(*inputs): + cur = itertools.count() + values, indices = None, None + ret = [] + if return_value or len(inps[0]) == 2: + i = next(cur) + values = xp.concatenate([inp[i] for inp in inps]) + if return_value: + ret.append(values) + if return_indices: + i = next(cur) + indices = xp.concatenate([inp[i] for inp in inps]) + ret.append(indices) + + if op.kind is not None: + # sort only if kind specified + if return_indices: + # if kind specified and return_indices + # values cannot be None + assert values is not None + values_indices = _argsort(values, op, xp, axis=0) + if return_value: + xp.take(values, values_indices, out=values) + xp.take(indices, values_indices, out=indices) + else: + _sort(values, op, xp, axis=0, inplace=True) + + j = next(it) + sort_res.ravel()[j] = ret + sort_info.ravel()[j] = len(ret[0]) + + if not op.need_align: + assert len(sort_res) == 1 + shape = list(extra_shape) + shape.insert(op.axis, 
len(sort_res[0])) + i = 0 + if return_value: + ctx[op.outputs[0].key] = sort_res[0][i] + i += 1 + if return_indices: + ctx[op.outputs[i].key] = sort_res[0][i] + else: + i = 0 + if return_value: + ctx[op.outputs[0].key] = tuple(r[0] for r in sort_res.ravel()) + i += 1 + if return_indices: + ctx[op.outputs[i].key] = tuple(r[i] for r in sort_res.ravel()) + ctx[op.outputs[-1].key] = sort_info + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + +class PSRSAlign(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PSRS_ALIGN + + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + _axis = Int32Field("axis") + _output_sizes = ListField("output_sizes", FieldTypes.int32) + + def __init__( + self, return_value=None, return_indices=None, axis=None, output_sizes=None, **kw + ): + super().__init__( + _return_value=return_value, + _return_indices=return_indices, + _axis=axis, + _output_sizes=output_sizes, + **kw + ) + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def axis(self): + return self._axis + + @property + def output_sizes(self): + return self._output_sizes + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + else: + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + @classmethod + def _execute_map(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + sort_res, sort_indices = None, None + i = 0 + if op.return_value: + sort_res = inputs[0] + i += 1 + if op.return_indices: + sort_indices = inputs[i] + i += 1 + sort_infos = inputs[i:] + out = op.outputs[0] + + with device(device_id): + length = len(sort_res or sort_indices) + outs = np.empty((len(op.output_sizes), length), dtype=object) + out_sizes = op.output_sizes + cum_out_sizes = (0,) + tuple(np.cumsum(out_sizes)) + for i in range(length): + sort_1d = sort_res[i] if sort_res is not None else None + indices_1d = sort_indices[i] if sort_indices is not None else None + sort_lengths = [sort_info.flat[i] for sort_info in sort_infos] + cum_sort_lengths = (0,) + tuple(np.cumsum(sort_lengths)) + j = out.index[op.axis] + start_pos = cum_sort_lengths[j] + end_pos = cum_sort_lengths[j + 1] + out_idx_start, out_idx_end = np.searchsorted( + cum_out_sizes, [start_pos, end_pos] + ) + out_idx_start = max(out_idx_start - 1, 0) + for out_idx in range(out_idx_start, out_idx_end): + out_start_pos = cum_out_sizes[out_idx] + out_end_pos = cum_out_sizes[out_idx + 1] + s = max(start_pos, out_start_pos) + size = max(min(end_pos, out_end_pos) - s, 0) + s = max(0, s - start_pos) + ret = [] + if sort_1d is not None: + ret.append(sort_1d[s : s + size]) + if indices_1d is not None: + ret.append(indices_1d[s : s + size]) + outs[out_idx, i] = tuple(ret) + + for idx in range(len(op.output_sizes)): + ret = [] + for ar in outs[idx]: + if ar is None: + item = [] + if sort_res is not None: + item.append(xp.empty((0,), dtype=out.dtype)) + if sort_indices is not None: + item.append(xp.empty((0,), dtype=np.dtype(np.int64))) + ret.append(tuple(item)) + else: + ret.append(ar) + ctx[op.outputs[0].key, (idx,)] = tuple(ret) + + @classmethod + def _execute_reduce(cls, ctx, op: "PSRSAlign"): + axis = op.axis + raw_inputs = list(op.iter_mapper_data(ctx)) + flatten_inputs = flatten(raw_inputs) + 
inputs, device_id, xp = as_same_device( + flatten_inputs, device=op.device, ret_extra=True + ) + inputs = stack_back(flatten_inputs, raw_inputs) + + out = op.outputs[0] + extra_shape = list(out.shape) + extra_shape.pop(axis) + + return_value = op.return_value + return_indices = op.return_indices + + with device(device_id): + if return_value: + values_res = xp.empty(out.shape, dtype=out.dtype) + else: + values_res = None + if return_indices: + indices_res = xp.empty(out.shape, dtype=np.dtype(np.int64)) + else: + indices_res = None + it = itertools.product( + *(range(s) for i, s in enumerate(out.shape) if i != axis) + ) + for inps in zip(*inputs): + slc = list(next(it)) + slc.insert(op.axis, slice(None)) + i = 0 + if return_value: + value_concat_1d = xp.concatenate([inp[0] for inp in inps]) + values_res[tuple(slc)] = value_concat_1d + i += 1 + if return_indices: + ind_concat_id = xp.concatenate([inp[i] for inp in inps]) + indices_res[tuple(slc)] = ind_concat_id + + i = 0 + if return_value: + ctx[op.outputs[0].key] = values_res.astype( + values_res.dtype, order=op.outputs[0].order.value + ) + i += 1 + if return_indices: + ctx[op.outputs[i].key] = indices_res.astype( + indices_res.dtype, order=op.outputs[i].order.value + ) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) diff --git a/python/xorbits/_mars/tensor/base/ravel.py b/python/xorbits/_mars/tensor/base/ravel.py new file mode 100644 index 000000000..d2e6d6130 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/ravel.py @@ -0,0 +1,90 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..datasource import tensor as astensor + + +def ravel(a, order="C"): + """Return a contiguous flattened tensor. + + A 1-D tensor, containing the elements of the input, is returned. A copy is + made only if needed. + + Parameters + ---------- + a : array_like + Input tensor. The elements in `a` are packed as a 1-D tensor. + order : {'C','F', 'A', 'K'}, optional + + The elements of `a` are read using this index order. 'C' means + to index the elements in row-major, C-style order, + with the last axis index changing fastest, back to the first + axis index changing slowest. 'F' means to index the elements + in column-major, Fortran-style order, with the + first index changing fastest, and the last index changing + slowest. Note that the 'C' and 'F' options take no account of + the memory layout of the underlying array, and only refer to + the order of axis indexing. 'A' means to read the elements in + Fortran-like index order if `a` is Fortran *contiguous* in + memory, C-like order otherwise. 'K' means to read the + elements in the order they occur in memory, except for + reversing the data when strides are negative. By default, 'C' + index order is used. + + Returns + ------- + y : array_like + If `a` is a matrix, y is a 1-D tensor, otherwise y is a tensor of + the same subtype as `a`. 
The shape of the returned array is + ``(a.size,)``. Matrices are special cased for backward + compatibility. + + See Also + -------- + Tensor.flat : 1-D iterator over an array. + Tensor.flatten : 1-D array copy of the elements of an array + in row-major order. + Tensor.reshape : Change the shape of an array without changing its data. + + Examples + -------- + It is equivalent to ``reshape(-1)``. + + >>> import mars.tensor as mt + + >>> x = mt.array([[1, 2, 3], [4, 5, 6]]) + >>> print(mt.ravel(x).execute()) + [1 2 3 4 5 6] + + >>> print(x.reshape(-1).execute()) + [1 2 3 4 5 6] + + >>> print(mt.ravel(x.T).execute()) + [1 4 2 5 3 6] + + >>> a = mt.arange(12).reshape(2,3,2).swapaxes(1,2); a.execute() + array([[[ 0, 2, 4], + [ 1, 3, 5]], + [[ 6, 8, 10], + [ 7, 9, 11]]]) + >>> a.ravel().execute() + array([ 0, 2, 4, 1, 3, 5, 6, 8, 10, 7, 9, 11]) + + """ + a = astensor(a) + if a.ndim == 0: + return a[np.newaxis] + return a.reshape(-1, order=order) diff --git a/python/xorbits/_mars/tensor/base/rebalance.py b/python/xorbits/_mars/tensor/base/rebalance.py new file mode 100644 index 000000000..c9bc519ea --- /dev/null +++ b/python/xorbits/_mars/tensor/base/rebalance.py @@ -0,0 +1,142 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes +from ...core import recursive_tile +from ...core.context import get_context +from ...serialization.serializables import Float64Field, Int64Field, KeyField +from ...tensor.datasource import tensor as astensor +from ...utils import ceildiv, has_unknown_shape +from ..operands import TensorOperand, TensorOperandMixin + + +class RebalanceMixin: + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, df_or_series): + self._output_types = df_or_series.op.output_types + return self.new_tileable([df_or_series], kws=[df_or_series.params]) + + def _get_input_object(self): + raise NotImplementedError + + @classmethod + def tile(cls, op: "RebalanceMixin"): + in_obj = op._get_input_object() + ctx = get_context() + + if ctx is None and op.factor is not None: + return [in_obj] + + if has_unknown_shape(in_obj): + yield + + size = in_obj.shape[op.axis] + if op.factor is not None: + cluster_cpu_count = ctx.get_total_n_cpu() + assert cluster_cpu_count > 0 + expect_n_chunk = int(cluster_cpu_count * op.factor) + else: + expect_n_chunk = op.num_partitions + + expect_chunk_size = max(ceildiv(size, expect_n_chunk), 1) + r = yield from recursive_tile( + in_obj.rechunk( + {op.axis: expect_chunk_size}, reassign_worker=op.reassign_worker + ) + ) + return r + + +class TensorRebalance(RebalanceMixin, TensorOperandMixin, TensorOperand): + _op_type_ = opcodes.REBALANCE + + _input = KeyField("input") + _factor = Float64Field("factor") + _axis = Int64Field("axis") + _num_partitions = Int64Field("num_partitions") + + def __init__( + self, + input=None, + factor=None, + axis=None, # pylint: disable=redefined-builtin + num_partitions=None, + output_types=None, + **kw + ): + super().__init__( + _input=input, + _factor=factor, + _axis=axis, + _num_partitions=num_partitions, + _output_types=output_types, + **kw + ) + + @property + def input(self): + return self._input + + @property + def factor(self): + return self._factor + + @property + def axis(self): + return self._axis + + @property + def num_partitions(self): + return self._num_partitions + + def _get_input_object(self): + return astensor(self.inputs[0]) + + +def rebalance(tensor, factor=None, axis=0, num_partitions=None, reassign_worker=True): + """ + Make Data more balanced across entire cluster. + + Parameters + ---------- + factor : float + Specified so that number of chunks after balance is + total CPU count of cluster * factor. + axis : int + The axis to rebalance. + num_partitions : int + Specified so the number of chunks are at most + num_partitions. + reassign_worker : bool + If True, workers will be reassigned. + + Returns + ------- + Series or DataFrame + Result of DataFrame or Series after rebalanced. + """ + if num_partitions is None: + factor = factor if factor is not None else 1.2 + + op = TensorRebalance( + input=tensor, + factor=factor, + axis=axis, + num_partitions=num_partitions, + reassign_worker=reassign_worker, + ) + return op(tensor) diff --git a/python/xorbits/_mars/tensor/base/repeat.py b/python/xorbits/_mars/tensor/base/repeat.py new file mode 100644 index 000000000..522a7921e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/repeat.py @@ -0,0 +1,227 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, Int32Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import broadcast_shape, unify_chunks +from .ravel import ravel + + +class TensorRepeat(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.REPEAT + + _input = KeyField("input") + _repeats = AnyField("repeats") + _axis = Int32Field("axis") + + def __init__(self, axis=None, dtype=None, sparse=False, **kw): + super().__init__(_axis=axis, dtype=dtype, sparse=sparse, **kw) + + @property + def repeats(self): + return self._repeats + + @property + def axis(self): + return self._axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(inputs) > 1: + self._repeats = self._inputs[1] + + def __call__(self, a, repeats): + axis = self._axis + a = astensor(a) + if axis is None: + a = ravel(a) + + ax = axis or 0 + + if not isinstance(repeats, Integral): + if not isinstance(repeats, Tensor): + repeats = np.asarray(repeats) + if repeats.size == 1: + repeats = int(repeats[0]) + size = repeats * a.shape[axis or 0] + elif a.shape[ax] == 1: + size = repeats = int(repeats.sum()) + else: + size = int(repeats.sum()) + else: + size = np.nan + if not isinstance(repeats, Integral): + if repeats.ndim != 1: + raise ValueError("repeats should be 1-d tensor") + broadcast_shape(repeats.shape, a.shape[ax : ax + 1]) + else: + size = a.shape[axis or 0] * repeats + + shape = a.shape[:ax] + (size,) + a.shape[ax + 1 :] + self.dtype = a.dtype + self.sparse = a.issparse() + + inputs = [a] + if isinstance(repeats, Tensor): + inputs.append(repeats) + else: + self._repeats = repeats + + return self.new_tensor(inputs, shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + a = op.input + repeats = op.repeats + axis = op.axis + ax = axis or 0 + out = op.outputs[0] + + if has_unknown_shape(*op.inputs): + yield + + if isinstance(repeats, TENSOR_TYPE): + a, repeats = yield from unify_chunks(a, (repeats, (ax,))) + + nsplit = a.nsplits[axis or 0] + + if isinstance(repeats, Integral): + new_nsplit = [] + for split in nsplit: + s = max(split // repeats, 1) + c = split // s + new_nsplit.extend([s] * c) + if split % s != 0: + new_nsplit.append(split % s) + + a = yield from recursive_tile(a.rechunk({ax: new_nsplit})) + + out_chunks = [] + ax_cum_count = np.cumsum((0,) + a.nsplits[ax]) + is_repeats_ndarray = isinstance(repeats, np.ndarray) + for out_idx in itertools.product(*[range(len(s)) for s in a.nsplits]): + in_chunk = a.cix[out_idx] + ax_idx = out_idx[ax] + if is_repeats_ndarray: + start = ax_cum_count[ax_idx] + stop = ax_cum_count[ax_idx + 1] + rp = repeats[start:stop] + size = int(rp.sum()) + elif not isinstance(repeats, Integral): + rp = 
repeats.cix[ax_idx,] + size = np.nan + else: + rp = repeats + size = in_chunk.shape[ax] * rp + + chunk_inputs = [in_chunk] + if isinstance(rp, TENSOR_CHUNK_TYPE): + chunk_inputs.append(rp) + + chunk_shape = in_chunk.shape[:ax] + (size,) + in_chunk.shape[ax + 1 :] + chunk_op = op.copy().reset_key() + if len(chunk_inputs) < 2: + # repeats is not chunk + chunk_op._repeats = rp + out_chunk = chunk_op.new_chunk( + chunk_inputs, shape=chunk_shape, index=out_idx, order=out.order + ) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, out.shape, order=out.order, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + a = inputs[0] + if len(inputs) > 1: + repeats = inputs[1] + else: + repeats = op.repeats + + with device(device_id): + ctx[op.outputs[0].key] = xp.repeat(a, repeats=repeats, axis=op.axis) + + +def repeat(a, repeats, axis=None): + """ + Repeat elements of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + repeats : int or tensor of ints + The number of repetitions for each element. `repeats` is broadcasted + to fit the shape of the given axis. + axis : int, optional + The axis along which to repeat values. By default, use the + flattened input tensor, and return a flat output tensor. + + Returns + ------- + repeated_tensor : Tensor + Output array which has the same shape as `a`, except along + the given axis. + + See Also + -------- + tile : Tile a tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.repeat(3, 4).execute() + array([3, 3, 3, 3]) + >>> x = mt.array([[1,2],[3,4]]) + >>> mt.repeat(x, 2).execute() + array([1, 1, 2, 2, 3, 3, 4, 4]) + >>> mt.repeat(x, 3, axis=1).execute() + array([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> mt.repeat(x, [1, 2], axis=0).execute() + array([[1, 2], + [3, 4], + [3, 4]]) + + """ + op = TensorRepeat(axis=axis) + return op(a, repeats) diff --git a/python/xorbits/_mars/tensor/base/result_type.py b/python/xorbits/_mars/tensor/base/result_type.py new file mode 100644 index 000000000..8f1fad04c --- /dev/null +++ b/python/xorbits/_mars/tensor/base/result_type.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def result_type(*tensors_and_dtypes): + """ + Returns the type that results from applying the NumPy + type promotion rules to the arguments. + + Type promotion in Mars works similarly to the rules in languages + like C++, with some slight differences. When both scalars and + arrays are used, the array's type takes precedence and the actual value + of the scalar is taken into account. 
+ + For example, calculating 3*a, where a is an array of 32-bit floats, + intuitively should result in a 32-bit float output. If the 3 is a + 32-bit integer, the NumPy rules indicate it can't convert losslessly + into a 32-bit float, so a 64-bit float should be the result type. + By examining the value of the constant, '3', we see that it fits in + an 8-bit integer, which can be cast losslessly into the 32-bit float. + + Parameters + ---------- + tensors_and_dtypes : list of tensors and dtypes + The operands of some operation whose result type is needed. + + Returns + ------- + out : dtype + The result type. + + See also + -------- + dtype, promote_types, min_scalar_type, can_cast + + Notes + ----- + The specific algorithm used is as follows. + + Categories are determined by first checking which of boolean, + integer (int/uint), or floating point (float/complex) the maximum + kind of all the arrays and the scalars are. + + If there are only scalars or the maximum category of the scalars + is higher than the maximum category of the arrays, + the data types are combined with :func:`promote_types` + to produce the return value. + + Otherwise, `min_scalar_type` is called on each array, and + the resulting data types are all combined with :func:`promote_types` + to produce the return value. + + The set of int values is not a subset of the uint values for types + with the same number of bits, something not reflected in + :func:`min_scalar_type`, but handled as a special case in `result_type`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.result_type(3, mt.arange(7, dtype='i1')) + dtype('int8') + + >>> mt.result_type('i4', 'c8') + dtype('complex128') + + >>> mt.result_type(3.0, -2) + dtype('float64') + """ + from ..core import Tensor + + arrays_and_dtypes = [ + a.dtype if isinstance(a, Tensor) else a for a in tensors_and_dtypes + ] + return np.result_type(*arrays_and_dtypes) diff --git a/python/xorbits/_mars/tensor/base/roll.py b/python/xorbits/_mars/tensor/base/roll.py new file mode 100644 index 000000000..6da3833e8 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/roll.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.' + +from collections.abc import Iterable + +import numpy as np + +from ..datasource import tensor as astensor +from ..utils import validate_axis +from .ravel import ravel + + +def roll(a, shift, axis=None): + """ + Roll tensor elements along a given axis. + + Elements that roll beyond the last position are re-introduced at + the first. + + Parameters + ---------- + a : array_like + Input tensor. + shift : int or tuple of ints + The number of places by which elements are shifted. If a tuple, + then `axis` must be a tuple of the same size, and each of the + given axes is shifted by the corresponding number. If an int + while `axis` is a tuple of ints, then the same value is used for + all given axes. + axis : int or tuple of ints, optional + Axis or axes along which elements are shifted. 
By default, the + tensor is flattened before shifting, after which the original + shape is restored. + + Returns + ------- + res : Tensor + Output tensor, with the same shape as `a`. + + See Also + -------- + rollaxis : Roll the specified axis backwards, until it lies in a + given position. + + Notes + ----- + + Supports rolling over multiple dimensions simultaneously. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(10) + >>> mt.roll(x, 2).execute() + array([8, 9, 0, 1, 2, 3, 4, 5, 6, 7]) + + >>> x2 = mt.reshape(x, (2,5)) + >>> x2.execute() + array([[0, 1, 2, 3, 4], + [5, 6, 7, 8, 9]]) + >>> mt.roll(x2, 1).execute() + array([[9, 0, 1, 2, 3], + [4, 5, 6, 7, 8]]) + >>> mt.roll(x2, 1, axis=0).execute() + array([[5, 6, 7, 8, 9], + [0, 1, 2, 3, 4]]) + >>> mt.roll(x2, 1, axis=1).execute() + array([[4, 0, 1, 2, 3], + [9, 5, 6, 7, 8]]) + + """ + from ..merge import concatenate + + a = astensor(a) + raw = a + + if axis is None: + a = ravel(a) + axis = 0 + + if not isinstance(shift, Iterable): + shift = (shift,) + else: + shift = tuple(shift) + if not isinstance(axis, Iterable): + axis = (axis,) + else: + axis = tuple(axis) + + for ax in axis: + validate_axis(a.ndim, ax) + broadcasted = np.broadcast(shift, axis) + if broadcasted.ndim > 1: + raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") + + shifts = {ax: 0 for ax in range(a.ndim)} + for s, ax in broadcasted: + shifts[ax] += s + + for ax, s in shifts.items(): + if s == 0: + continue + + s = -s + s %= a.shape[ax] + + slc1 = (slice(None),) * ax + (slice(s, None),) + slc2 = (slice(None),) * ax + (slice(s),) + + a = concatenate([a[slc1], a[slc2]], axis=ax) + + return a.reshape(raw.shape) diff --git a/python/xorbits/_mars/tensor/base/rollaxis.py b/python/xorbits/_mars/tensor/base/rollaxis.py new file mode 100644 index 000000000..586294d69 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/rollaxis.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..utils import validate_axis + + +def rollaxis(tensor, axis, start=0): + """ + Roll the specified axis backwards, until it lies in a given position. + + This function continues to be supported for backward compatibility, but you + should prefer `moveaxis`. + + Parameters + ---------- + a : Tensor + Input tensor. + axis : int + The axis to roll backwards. The positions of the other axes do not + change relative to one another. + start : int, optional + The axis is rolled until it lies before this position. The default, + 0, results in a "complete" roll. + + Returns + ------- + res : Tensor + a view of `a` is always returned. + + See Also + -------- + moveaxis : Move array axes to new positions. + roll : Roll the elements of an array by a number of positions along a + given axis. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.ones((3,4,5,6)) + >>> mt.rollaxis(a, 3, 1).shape + (3, 6, 4, 5) + >>> mt.rollaxis(a, 2).shape + (5, 3, 4, 6) + >>> mt.rollaxis(a, 1, 4).shape + (3, 5, 6, 4) + + """ + n = tensor.ndim + axis = validate_axis(n, axis) + if start < 0: + start += n + msg = "'%s' arg requires %d <= %s < %d, but %d was passed in" + if not (0 <= start < n + 1): + raise np.AxisError(msg % ("start", -n, "start", n + 1, start)) + if axis < start: + # it's been removed + start -= 1 + if axis == start: + return tensor + axes = list(range(0, n)) + axes.remove(axis) + axes.insert(start, axis) + return tensor.transpose(axes) diff --git a/python/xorbits/_mars/tensor/base/searchsorted.py b/python/xorbits/_mars/tensor/base/searchsorted.py new file mode 100644 index 000000000..98d323ddf --- /dev/null +++ b/python/xorbits/_mars/tensor/base/searchsorted.py @@ -0,0 +1,381 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import Any, List, Tuple, Type + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import TILEABLE_TYPE +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, Int32Field, Int64Field, StringField +from ...typing import ChunkType, TileableType +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_TYPE, TensorOrder +from ..datasource.array import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorSearchsorted(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.SEARCHSORTED + + v = AnyField("v") + side = StringField("side") + combine_size = Int32Field("combine_size") + # for chunk + offset = Int64Field("offset") + size = Int64Field("size") + n_chunk = Int64Field("n_chunk") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.v, TILEABLE_TYPE): + self.v = self._inputs[1] + + def __call__(self, a, v): + inputs = [a] + if isinstance(v, TILEABLE_TYPE): + inputs.append(v) + shape = v.shape + else: + shape = () + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op, a, v, out): + chunks = [] + if len(op.inputs) == 1: + v_chunks = [v] + else: + v_chunks = v.chunks + for v_chunk in v_chunks: + chunk_op = op.copy().reset_key() + in_chunks = [a.chunks[0]] + if len(op.inputs) == 2: + in_chunks.append(v_chunk) + v_shape = v_chunk.shape if hasattr(v_chunk, "shape") else () + chunk_idx = v_chunk.index if len(op.inputs) == 2 else (0,) + chunk = chunk_op.new_chunk( + in_chunks, shape=v_shape, index=chunk_idx, order=out.order + ) + chunks.append(chunk) + new_op = op.copy().reset_key() + nsplits = ((s,) for s in out.shape) if len(op.inputs) == 1 else v.nsplits + return new_op.new_tensors(op.inputs, out.shape, chunks=chunks, nsplits=nsplits) + + @classmethod + def _combine_chunks( + cls, + 
+        to_combine: List[ChunkType],
+        op_type: Type,
+        v: Any,
+        stage: OperandStage,
+        chunk_index: Tuple[int],
+    ):
+        from ..merge import TensorStack
+
+        dtype = np.dtype(np.intp)
+        v_shape = v.shape if hasattr(v, "shape") else ()
+        combine_op = TensorStack(axis=0, dtype=dtype)
+        combine_chunk = combine_op.new_chunk(to_combine, shape=v_shape)
+        chunk_op = op_type(dtype=dtype, axis=(0,), stage=stage)
+        return chunk_op.new_chunk(
+            [combine_chunk], shape=v_shape, index=chunk_index, order=TensorOrder.C_ORDER
+        )
+
+    @classmethod
+    def _tile_tree_reduction(
+        cls, op: "TensorSearchsorted", a: TileableType, v: Any, out: TileableType
+    ):
+        from ..indexing import TensorSlice
+        from ..merge import TensorConcatenate
+        from ..reduction import TensorMax, TensorMin
+
+        if has_unknown_shape(a):
+            yield
+
+        combine_size = op.combine_size or options.combine_size
+        n_chunk = len(a.chunks)
+        input_len = len(op.inputs)
+        v_chunks = [v] if input_len == 1 else v.chunks
+        cum_nsplits = [0] + np.cumsum(a.nsplits[0]).tolist()
+
+        input_chunks = []
+        offsets = []
+        for i in range(n_chunk):
+            offset = cum_nsplits[i]
+            cur_chunk = a.chunks[i]
+            # length of this chunk along the searched axis; boundary elements
+            # borrowed from the neighbouring chunks below are added on top
+            chunk_size = cur_chunk.shape[0]
+            chunks = []
+            if i > 0:
+                last_chunk = a.chunks[i - 1]
+                if last_chunk.shape[0] > 0:
+                    slice_chunk_op = TensorSlice(
+                        slices=[slice(-1, None)], dtype=cur_chunk.dtype
+                    )
+                    slice_chunk = slice_chunk_op.new_chunk(
+                        [last_chunk], shape=(1,), order=out.order
+                    )
+                    chunks.append(slice_chunk)
+                    chunk_size += 1
+                    offset -= 1
+            chunks.append(cur_chunk)
+            if i < n_chunk - 1:
+                next_chunk = a.chunks[i + 1]
+                if next_chunk.shape[0] > 0:
+                    slice_chunk_op = TensorSlice(
+                        slices=[slice(1)], dtype=cur_chunk.dtype
+                    )
+                    slice_chunk = slice_chunk_op.new_chunk(
+                        [next_chunk], shape=(1,), order=out.order
+                    )
+                    chunks.append(slice_chunk)
+                    chunk_size += 1
+
+            concat_op = TensorConcatenate(dtype=cur_chunk.dtype)
+            concat_chunk = concat_op.new_chunk(
+                chunks, shape=(chunk_size,), order=out.order, index=cur_chunk.index
+            )
+            input_chunks.append(concat_chunk)
+            offsets.append(offset)
+
+        out_chunks = []
+        for v_chunk in v_chunks:
+            chunks = []
+            v_shape = v_chunk.shape if hasattr(v_chunk, "shape") else ()
+            v_index = v_chunk.index if hasattr(v_chunk, "index") else (0,)
+            for inp_chunk, offset in zip(input_chunks, offsets):
+                chunk_op = op.copy().reset_key()
+                chunk_op.stage = OperandStage.map
+                chunk_op.offset = offset
+                chunk_op.n_chunk = n_chunk
+                chunk_op.size = a.shape[0]
+                chunk_inputs = [inp_chunk]
+                if input_len > 1:
+                    chunk_inputs.append(v_chunk)
+                map_chunk = chunk_op.new_chunk(
+                    chunk_inputs, shape=v_shape, index=inp_chunk.index, order=out.order
+                )
+                chunks.append(map_chunk)
+
+            op_type = TensorMax if op.side == "right" else TensorMin
+            while len(chunks) > combine_size:
+                new_chunks = []
+                it = itertools.count(0)
+                while True:
+                    j = next(it)
+                    to_combine = chunks[j * combine_size : (j + 1) * combine_size]
+                    if len(to_combine) == 0:
+                        break
+
+                    new_chunks.append(
+                        cls._combine_chunks(
+                            to_combine, op_type, v_chunk, OperandStage.combine, (j,)
+                        )
+                    )
+                chunks = new_chunks
+
+            chunk = cls._combine_chunks(
+                chunks, op_type, v_chunk, OperandStage.agg, v_index
+            )
+            out_chunks.append(chunk)
+
+        new_op = op.copy().reset_key()
+        nsplits = ((s,) for s in out.shape) if len(op.inputs) == 1 else v.nsplits
+        return new_op.new_tensors(
+            op.inputs, out.shape, chunks=out_chunks, nsplits=nsplits
+        )
+
+    @classmethod
+    def tile(cls, op):
+        a = op.inputs[0]
+        out = op.outputs[0]
+        input_len = len(op.inputs)
+        if input_len == 1:
+            v = op.v
+        else:
+            v = op.inputs[1]
+
+        if 
len(a.chunks) == 1: + return cls._tile_one_chunk(op, a, v, out) + return (yield from cls._tile_tree_reduction(op, a, v, out)) + + @classmethod + def _execute_without_stage(cls, xp, a, v, op): + return xp.searchsorted(a, v, side=op.side) + + @classmethod + def _execute_map(cls, xp: Any, a: np.ndarray, v: Any, op: "TensorSearchsorted"): + out = op.outputs[0] + i = out.index[0] + side = op.side + + raw_v = v + v = xp.atleast_1d(v) + searched = xp.searchsorted(a, v, side=op.side) + xp.add(searched, op.offset, out=searched) + a_min, a_max = a[0], a[-1] + if i == 0: + # the first chunk + if a_min == a_max: + miss = v > a_max + else: + miss = v > a_max if side == "left" else v >= a_max + elif i == op.n_chunk - 1: + # the last chunk + if a_min == a_max: + miss = v < a_min + else: + miss = v <= a_min if side == "left" else v < a_min + else: + if side == "left" and a_min < a_max: + miss = (v <= a_min) | (v > a_max) + elif a_min < a_max: + miss = (v < a_min) | (v >= a_max) + else: + assert a_min == a_max + miss = v != a_min + if side == "right": + searched[miss] = -1 + else: + searched[miss] = op.size + 1 + + return searched[0] if np.isscalar(raw_v) else searched + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + v = ctx[op.inputs[1].key] if len(op.inputs) == 2 else op.v + + data = [] + if isinstance(a, tuple): + data.extend(a) + else: + data.append(a) + if len(op.inputs) == 2: + data.append(v) + + data, device_id, xp = as_same_device(data, device=op.device, ret_extra=True) + + if isinstance(a, tuple): + a = data[:2] + else: + a = data[0] + if len(op.inputs) == 2: + v = data[-1] + + with device(device_id): + if op.stage is None: + ret = cls._execute_without_stage(xp, a, v, op) + else: + assert op.stage == OperandStage.map + ret = cls._execute_map(xp, a, v, op) + ctx[op.outputs[0].key] = ret + + +def searchsorted(a, v, side="left", sorter=None, combine_size=None): + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted tensor `a` such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `a` would be preserved. + + Assuming that `a` is sorted: + + ====== ============================ + `side` returned index `i` satisfies + ====== ============================ + left ``a[i-1] < v <= a[i]`` + right ``a[i-1] <= v < a[i]`` + ====== ============================ + + Parameters + ---------- + a : 1-D array_like + Input tensor. If `sorter` is None, then it must be sorted in + ascending order, otherwise `sorter` must be an array of indices + that sort it. + v : array_like + Values to insert into `a`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `a`). + sorter : 1-D array_like, optional + Optional tensor of integer indices that sort array a into ascending + order. They are typically the result of argsort. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + indices : tensor of ints + Array of insertion points with the same shape as `v`. + + See Also + -------- + sort : Return a sorted copy of a tensor. + histogram : Produce histogram from 1-D data. + + Notes + ----- + Binary search is used to find the required insertion points. 
+ + This function is a faster version of the builtin python `bisect.bisect_left` + (``side='left'``) and `bisect.bisect_right` (``side='right'``) functions, + which is also vectorized in the `v` argument. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.searchsorted([1,2,3,4,5], 3).execute() + 2 + >>> mt.searchsorted([1,2,3,4,5], 3, side='right').execute() + 3 + >>> mt.searchsorted([1,2,3,4,5], [-10, 10, 2, 3]).execute() + array([0, 5, 1, 2]) + + """ + + if ( + not isinstance(a, TENSOR_TYPE) + and sorter is not None + and not isinstance(sorter, TENSOR_TYPE) + ): + a = astensor(np.asarray(a)[sorter]) + else: + a = astensor(a) + if sorter is not None: + a = a[sorter] + + if a.ndim != 1: + raise ValueError("`a` should be 1-d tensor") + if a.issparse(): + # does not support sparse tensor + raise ValueError("`a` should be a dense tensor") + if side not in {"left", "right"}: + raise ValueError(f"'{side}' is an invalid value for keyword 'side'") + + if not np.isscalar(v): + v = astensor(v) + + op = TensorSearchsorted( + v=v, side=side, dtype=np.dtype(np.intp), combine_size=combine_size + ) + return op(a, v) diff --git a/python/xorbits/_mars/tensor/base/setdiff1d.py b/python/xorbits/_mars/tensor/base/setdiff1d.py new file mode 100644 index 000000000..1becec87d --- /dev/null +++ b/python/xorbits/_mars/tensor/base/setdiff1d.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def setdiff1d(ar1, ar2, assume_unique=False): + """ + Find the set difference of two tensors. + + Return the unique values in `ar1` that are not in `ar2`. + + Parameters + ---------- + ar1 : array_like + Input tensor. + ar2 : array_like + Input comparison tensor. + assume_unique : bool + If True, the input tensors are both assumed to be unique, which + can speed up the calculation. Default is False. + + Returns + ------- + setdiff1d : Tensor + 1D tensor of values in `ar1` that are not in `ar2`. The result + is sorted when `assume_unique=False`, but otherwise only sorted + if the input is sorted. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([1, 2, 3, 2, 4, 1]) + >>> b = mt.array([3, 4, 5, 6]) + >>> mt.setdiff1d(a, b).execute() + array([1, 2]) + + """ + + from ..datasource.array import asarray + from .in1d import in1d + from .unique import unique + + if assume_unique: + ar1 = asarray(ar1).ravel() + else: + ar1 = unique(ar1) + ar2 = unique(ar2) + return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)] diff --git a/python/xorbits/_mars/tensor/base/shape.py b/python/xorbits/_mars/tensor/base/shape.py new file mode 100644 index 000000000..722a6a1a9 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/shape.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import ExecutableTuple +from ...serialization.serializables import Int32Field, KeyField +from ...utils import calc_nsplits +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorGetShape(TensorOperand, TensorOperandMixin): + _op_type_ = opcodes.GET_SHAPE + + _a = KeyField("a") + _ndim = Int32Field("ndim") + + def __init__(self, pure_depends=None, a=None, ndim=None, dtype=None, **kw): + super().__init__( + dtype=dtype, _a=a, _ndim=ndim, _pure_depends=pure_depends, **kw + ) + + @property + def a(self): + return self._a + + @property + def ndim(self): + return self._ndim + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self._a is not None: + self._a = self._inputs[0] + + @property + def output_limit(self): + return self._ndim + + def __call__(self, a): + if not np.isnan(a.size): + return ExecutableTuple([astensor(s) for s in a.shape]) + + self._a = a + kws = [] + for i in range(self.output_limit): + kws.append( + { + "shape": (), + "dtype": np.dtype(np.intc), + "order": TensorOrder.C_ORDER, + "i": i, + } + ) + return ExecutableTuple(self.new_tensors([a], kws=kws)) + + @classmethod + def tile(cls, op): + a = op.a + outs = op.outputs + + yield a.chunks + + chunk_op = TensorGetShape(pure_depends=[True] * len(a.chunks), ndim=op.ndim) + chunk_kws = [] + for out in outs: + params = out.params + params["index"] = () + chunk_kws.append(params) + chunks = chunk_op.new_chunks(a.chunks, kws=chunk_kws) + + kws = [] + for c, out in zip(chunks, outs): + params = out.params + params["chunks"] = [c] + params["nsplits"] = () + kws.append(params) + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws, output_limit=op.output_limit) + + @classmethod + def execute(cls, ctx, op): + chunk_idx_to_chunk_shapes = dict((c.index, c.shape) for c in op.inputs) + nsplits = calc_nsplits(chunk_idx_to_chunk_shapes) + shape = tuple(sum(ns) for ns in nsplits) + for o, s in zip(op.outputs, shape): + ctx[o.key] = s + + +def shape(a): + """ + Return the shape of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + + Returns + ------- + shape : ExecutableTuple of tensors + The elements of the shape tuple give the lengths of the + corresponding array dimensions. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.shape(mt.eye(3)).execute() + (3, 3) + >>> mt.shape([[1, 2]]).execute() + (1, 2) + >>> mt.shape([0]).execute() + (1,) + >>> mt.shape(0).execute() + () + + >>> a = mt.array([(1, 2), (3, 4)], dtype=[('x', 'i4'), ('y', 'i4')]) + >>> mt.shape(a).execute() + (2,) + + """ + a = astensor(a) + op = TensorGetShape(ndim=a.ndim) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/sort.py b/python/xorbits/_mars/tensor/base/sort.py new file mode 100644 index 000000000..98df72138 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/sort.py @@ -0,0 +1,508 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import ( + BoolField, + FieldTypes, + Int32Field, + ListField, + StringField, +) +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorShuffleProxy +from ..utils import validate_axis, validate_order +from .psrs import TensorPSRSOperandMixin + + +class TensorSort(TensorOperand, TensorPSRSOperandMixin): + _op_type_ = OperandDef.SORT + + _axis = Int32Field("axis") + _kind = StringField("kind") + _parallel_kind = StringField("parallel_kind") + _order = ListField("order", FieldTypes.string) + _psrs_kinds = ListField("psrs_kinds", FieldTypes.string) + _need_align = BoolField("need_align") + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + + def __init__( + self, + axis=None, + kind=None, + parallel_kind=None, + order=None, + psrs_kinds=None, + need_align=None, + return_value=None, + return_indices=None, + dtype=None, + gpu=None, + **kw, + ): + super().__init__( + _axis=axis, + _kind=kind, + _parallel_kind=parallel_kind, + _order=order, + _psrs_kinds=psrs_kinds, + _need_align=need_align, + _return_value=return_value, + _return_indices=return_indices, + dtype=dtype, + gpu=gpu, + **kw, + ) + + @property + def axis(self): + return self._axis + + @property + def kind(self): + return self._kind + + @property + def parallel_kind(self): + return self._parallel_kind + + @property + def order(self): + return self._order + + @property + def psrs_kinds(self): + return self._psrs_kinds + + @property + def need_align(self): + return self._need_align + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def output_limit(self): + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + def __call__(self, a): + kws = [] + if self._return_value: + kws.append( + {"shape": a.shape, "order": a.order, "dtype": a.dtype, "type": "sorted"} + ) + if self._return_indices: + kws.append( + { + "shape": a.shape, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argsort", + } + ) + ret = self.new_tensors([a], kws=kws) + if len(kws) == 1: + return ret[0] + return ExecutableTuple(ret) + + @classmethod + def _tile_psrs(cls, op): + """ + Refer to http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + to see explanation of parallel sorting by regular sampling + """ + out_tensor = op.outputs[0] + in_tensor, axis_chunk_shape, out_idxes, need_align = yield from cls.preprocess( + op + ) + axis_offsets = [0] + np.cumsum(in_tensor.nsplits[op.axis]).tolist()[:-1] + return_value, return_indices = op.return_value, op.return_indices + + out_value_chunks, out_indices_chunks = [], [] + for out_idx in out_idxes: + # stage 1: local sort and 
regular samples collected + ( + sorted_chunks, + indices_chunks, + sampled_chunks, + ) = cls.local_sort_and_regular_sample( + op, in_tensor, axis_chunk_shape, axis_offsets, out_idx + ) + + # stage 2: gather and merge samples, choose and broadcast p-1 pivots + concat_pivot_chunk = cls.concat_and_pivot( + op, axis_chunk_shape, out_idx, sorted_chunks, sampled_chunks + ) + + # stage 3: Local data is partitioned + partition_chunks = cls.partition_local_data( + op, axis_chunk_shape, sorted_chunks, indices_chunks, concat_pivot_chunk + ) + + proxy_chunk = TensorShuffleProxy(dtype=partition_chunks[0].dtype).new_chunk( + partition_chunks, shape=() + ) + + # stage 4: all *ith* classes are gathered and merged + ( + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ) = cls.partition_merge_data( + op, need_align, None, partition_chunks, proxy_chunk + ) + + if not need_align: + if return_value: + out_value_chunks.extend(partition_sort_chunks) + if return_indices: + out_indices_chunks.extend(partition_indices_chunks) + else: + ( + align_reduce_value_chunks, + align_reduce_indices_chunks, + ) = cls.align_partitions_data( + op, + out_idx, + in_tensor, + partition_sort_chunks, + partition_indices_chunks, + sort_info_chunks, + ) + if return_value: + out_value_chunks.extend(align_reduce_value_chunks) + if return_indices: + out_indices_chunks.extend(align_reduce_indices_chunks) + + new_op = op.copy() + nsplits = list(in_tensor.nsplits) + if not need_align: + nsplits[op.axis] = (np.nan,) * axis_chunk_shape + kws = [] + if return_value: + kws.append( + { + "shape": out_tensor.shape, + "order": out_tensor.order, + "chunks": out_value_chunks, + "nsplits": nsplits, + "dtype": out_tensor.dtype, + } + ) + if return_indices: + kws.append( + { + "shape": out_tensor.shape, + "order": TensorOrder.C_ORDER, + "chunks": out_indices_chunks, + "nsplits": nsplits, + "dtype": np.dtype(np.int64), + } + ) + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + return_value, return_indices = op.return_value, op.return_indices + + if in_tensor.chunk_shape[op.axis] == 1: + out_chunks, out_indices_chunks = [], [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + kws = [] + if return_value: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": chunk.order, + "dtype": chunk.dtype, + "type": "sorted", + } + ) + if return_indices: + kws.append( + { + "shape": chunk.shape, + "index": chunk.index, + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argsort", + } + ) + chunks = chunk_op.new_chunks([chunk], kws=kws) + if return_value: + out_chunks.append(chunks[0]) + if return_indices: + out_indices_chunks.append(chunks[-1]) + + new_op = op.copy() + kws = [out.params for out in op.outputs] + if return_value: + kws[0]["nsplits"] = in_tensor.nsplits + kws[0]["chunks"] = out_chunks + if return_indices: + kws[-1]["nsplits"] = in_tensor.nsplits + kws[-1]["chunks"] = out_indices_chunks + return new_op.new_tensors([in_tensor], kws=kws) + else: + # use parallel sorting by regular sampling + return (yield from cls._tile_psrs(op)) + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + return_value, return_indices = op.return_value, op.return_indices + + with device(device_id): + kw = {} + if op.kind is not None: + kw["kind"] = op.kind + if op.order is not None: + kw["order"] = op.order + + if 
return_indices: + if not return_value: + ctx[op.outputs[0].key] = xp.argsort(a, axis=op.axis, **kw) + else: + indices = ctx[op.outputs[1].key] = xp.argsort(a, axis=op.axis, **kw) + ctx[op.outputs[0].key] = xp.take_along_axis(a, indices, op.axis) + else: + ctx[op.outputs[0].key] = xp.sort(a, axis=op.axis, **kw) + + +_AVAILABLE_KINDS = {"QUICKSORT", "MERGESORT", "HEAPSORT", "STABLE"} + + +def _validate_sort_psrs_kinds(psrs_kinds): + if psrs_kinds is not None: + if isinstance(psrs_kinds, (list, tuple)): + psrs_kinds = list(psrs_kinds) + if len(psrs_kinds) != 3: + raise ValueError("psrs_kinds should have 3 elements") + for i, psrs_kind in enumerate(psrs_kinds): + if psrs_kind is None: + if i < 2: + continue + else: + raise ValueError( + "3rd element of psrs_kinds should be specified" + ) + upper_psrs_kind = psrs_kind.upper() + if upper_psrs_kind not in _AVAILABLE_KINDS: + raise ValueError( + f"{psrs_kind} is an unrecognized kind in psrs_kinds" + ) + else: + raise TypeError("psrs_kinds should be list or tuple") + else: + psrs_kinds = ["quicksort", "mergesort", "mergesort"] + return psrs_kinds + + +def _validate_sort_arguments(a, axis, kind, parallel_kind, psrs_kinds, order): + a = astensor(a) + if axis is None: + a = a.flatten() + axis = 0 + else: + axis = validate_axis(a.ndim, axis) + if kind is not None: + raw_kind = kind + kind = kind.upper() + if kind not in _AVAILABLE_KINDS: + # check kind + raise ValueError(f"{raw_kind} is an unrecognized kind of sort") + if parallel_kind is not None: + raw_parallel_kind = parallel_kind + parallel_kind = parallel_kind.upper() + if parallel_kind not in {"PSRS"}: + raise ValueError( + f"{raw_parallel_kind} is an unrecognized kind of parallel sort" + ) + + order = validate_order(a.dtype, order) + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + return a, axis, kind, parallel_kind, psrs_kinds, order + + +def sort( + a, + axis=-1, + kind=None, + parallel_kind=None, + psrs_kinds=None, + order=None, + return_index=False, + **kw, +): + r""" + Return a sorted copy of a tensor. + + Parameters + ---------- + a : array_like + Tensor to be sorted. + axis : int or None, optional + Axis along which to sort. If None, the tensor is flattened before + sorting. The default is -1, which sorts along the last axis. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. The default is 'quicksort'. Note that both 'stable' + and 'mergesort' use timsort or radix sort under the covers and, in general, + the actual implementation will vary with data type. The 'mergesort' option + is retained for backwards compatibility. + Note that this argument would not take effect if `a` has more than + 1 chunk on the sorting axis. + parallel_kind: {'PSRS'}, optional + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + psrs_kinds: list with 3 elements, optional + Sorting algorithms during PSRS algorithm. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument specifies + which fields to compare first, second, etc. A single field can + be specified as a string, and not all fields need be specified, + but unspecified fields will still be used, in the order in which + they come up in the dtype, to break ties. + return_index: bool + Return indices as well if True. + + Returns + ------- + sorted_tensor : Tensor + Tensor of the same type and shape as `a`. + + See Also + -------- + Tensor.sort : Method to sort a tensor in-place. 
+ argsort : Indirect sort. + lexsort : Indirect stable sort on multiple keys. + searchsorted : Find elements in a sorted tensor. + partition : Partial sort. + + Notes + ----- + The various sorting algorithms are characterized by their average speed, + worst case performance, work space size, and whether they are stable. A + stable sort keeps items with the same key in the same relative + order. The four algorithms implemented in NumPy have the following + properties: + + =========== ======= ============= ============ ======== + kind speed worst case work space stable + =========== ======= ============= ============ ======== + 'quicksort' 1 O(n^2) 0 no + 'heapsort' 3 O(n*log(n)) 0 no + 'mergesort' 2 O(n*log(n)) ~n/2 yes + 'timsort' 2 O(n*log(n)) ~n/2 yes + =========== ======= ============= ============ ======== + + .. note:: The datatype determines which of 'mergesort' or 'timsort' + is actually used, even if 'mergesort' is specified. User selection + at a finer scale is not currently available. + + All the sort algorithms make temporary copies of the data when + sorting along any but the last axis. Consequently, sorting along + the last axis is faster and uses less space than sorting along + any other axis. + + The sort order for complex numbers is lexicographic. If both the real + and imaginary parts are non-nan then the order is determined by the + real parts except when they are equal, in which case the order is + determined by the imaginary parts. + + quicksort has been changed to an introsort which will switch + to heapsort when it does not make enough progress. This makes its + worst case O(n*log(n)). + + 'stable' automatically chooses the best stable sorting algorithm + for the data type being sorted. It, along with 'mergesort', is + currently mapped to timsort or radix sort depending on the + data type. API forward compatibility currently limits the + ability to select the implementation and it is hardwired for the different + data types. + + Timsort is added for better performance on already or nearly + sorted data. On random data timsort is almost identical to + mergesort. It is now used for stable sort while quicksort is still the + default sort if none is chosen. For details of timsort, refer to + `CPython listsort.txt <https://github.com/python/cpython/blob/main/Objects/listsort.txt>`_. + 'mergesort' and 'stable' are mapped to radix sort for integer data types. Radix sort is an + O(n) sort instead of O(n log n). + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[1,4],[3,1]]) + >>> mt.sort(a).execute() # sort along the last axis + array([[1, 4], + [1, 3]]) + >>> mt.sort(a, axis=None).execute() # sort the flattened tensor + array([1, 1, 3, 4]) + >>> mt.sort(a, axis=0).execute() # sort along the first axis + array([[1, 1], + [3, 4]]) + + Use the `order` keyword to specify a field to use when sorting a + structured array: + + >>> dtype = [('name', 'S10'), ('height', float), ('age', int)] + >>> values = [('Arthur', 1.8, 41), ('Lancelot', 1.9, 38), + ... 
('Galahad', 1.7, 38)] + >>> a = mt.array(values, dtype=dtype) # create a structured tensor + >>> mt.sort(a, order='height').execute() # doctest: +SKIP + array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41), + ('Lancelot', 1.8999999999999999, 38)], + dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')]) + + >>> mt.sort(a, order=['age', 'height']).execute() # doctest: +SKIP + array([('Galahad', 1.7, 38), ('Lancelot', 1.8999999999999999, 38), + ('Arthur', 1.8, 41)], + dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')]) + + """ + need_align = kw.pop("need_align", None) + if len(kw) > 0: + raise TypeError(f"sort() got an unexpected keyword argument '{next(iter(kw))}'") + a, axis, kind, parallel_kind, psrs_kinds, order = _validate_sort_arguments( + a, axis, kind, parallel_kind, psrs_kinds, order + ) + op = TensorSort( + axis=axis, + kind=kind, + parallel_kind=parallel_kind, + order=order, + psrs_kinds=psrs_kinds, + need_align=need_align, + return_value=True, + return_indices=return_index, + dtype=a.dtype, + gpu=a.op.gpu, + ) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/split.py b/python/xorbits/_mars/tensor/base/split.py new file mode 100644 index 000000000..f6aac57d7 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/split.py @@ -0,0 +1,218 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import ExecutableTuple, recursive_tile +from ...lib.sparse.core import get_array_module +from ...serialization.serializables import AnyField, Int32Field, KeyField +from ..core import Tensor +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import calc_sliced_size + + +class TensorSplit(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.ARRAY_SPLIT + + _input = KeyField("input") + _indices_or_sections = AnyField("indices_or_sections") + _axis = Int32Field("axis") + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @property + def indices_or_sections(self): + return self._indices_or_sections + + @property + def axis(self): + return getattr(self, "_axis", 0) + + @property + def output_limit(self): + return float("inf") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) > 1: + self._indices_or_sections = self._inputs[1] + + def __call__(self, a, indices_or_sections, is_split=False): + axis = self._axis + size = a.shape[axis] + if np.isnan(size): + raise ValueError( + "cannot split array with unknown shape, " + "call `.execute()` on input tensor first" + ) + + if ( + isinstance(indices_or_sections, Tensor) + and hasattr(indices_or_sections.op, "data") + and indices_or_sections.op.data is not None + ): + indices_or_sections = indices_or_sections.op.data + + try: + indices_or_sections = int(indices_or_sections) + if is_split: + if size % indices_or_sections: + raise ValueError( + "tensor split does not result in an equal division" + ) + nparts = indices_or_sections + nsplit = (size // indices_or_sections,) * nparts + else: + nparts = indices_or_sections + if size % indices_or_sections == 0: + nsplit = (size // indices_or_sections,) * nparts + else: + nsplit = (size // indices_or_sections + 1,) * ( + size % indices_or_sections + ) + (size // indices_or_sections,) * ( + size - size % indices_or_sections + ) + except TypeError: + if isinstance(indices_or_sections, Tensor): + nparts = indices_or_sections.shape[0] + 1 + nsplit = (np.nan,) * nparts + else: + ind = indices_or_sections = get_array_module( + indices_or_sections + ).asarray(indices_or_sections) + if indices_or_sections.ndim != 1 or not np.issubdtype( + indices_or_sections.dtype, np.integer + ): + raise TypeError("slice indices must be integers or None") + nparts = indices_or_sections.shape[0] + 1 + get = lambda i: None if i < 0 or i >= len(ind) else ind[i] + nsplit = [ + calc_sliced_size(size, slice(get(j - 1), get(j))) + for j in range(nparts) + ] + + inputs = [a] + if isinstance(indices_or_sections, Tensor): + inputs.append(indices_or_sections) + else: + self._indices_or_sections = indices_or_sections + + kws = [ + { + "i": i, + "shape": a.shape[:axis] + (nsplit[i],) + a.shape[axis + 1 :], + "order": a.order, + } + for i in range(nparts) + ] + return ExecutableTuple(self.new_tensors(inputs, kws=kws, output_limit=nparts)) + + @classmethod + def tile(cls, op): + in_tensor = op.input + splits = op.outputs + axis = op.axis + + acc_shapes = np.cumsum([s.shape[axis] for s in splits]) + out_kws = [dict() for _ in splits] + for i, split in enumerate(splits): + slc = slice(0 if i == 0 else acc_shapes[i - 1], acc_shapes[i]) + new_s = yield from recursive_tile(in_tensor[(slice(None),) * axis + (slc,)]) + out_kws[i]["chunks"] = new_s.chunks + out_kws[i]["nsplits"] = new_s.nsplits + out_kws[i]["shape"] = split.shape + 
out_kws[i]["order"] = op.outputs[i].order + + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=out_kws, output_limit=len(out_kws)) + + +def _split(a, indices_or_sections, axis=0, is_split=False): + op = TensorSplit(axis=axis, dtype=a.dtype) + return op(a, indices_or_sections, is_split=is_split) + + +def split(ary, indices_or_sections, axis=0): + """ + Split a tensor into multiple sub-tensors. + + Parameters + ---------- + ary : Tensor + Tensor to be divided into sub-tensors. + indices_or_sections : int or 1-D tensor + If `indices_or_sections` is an integer, N, the array will be divided + into N equal tensors along `axis`. If such a split is not possible, + an error is raised. + + If `indices_or_sections` is a 1-D tensor of sorted integers, the entries + indicate where along `axis` the array is split. For example, + ``[2, 3]`` would, for ``axis=0``, result in + + - ary[:2] + - ary[2:3] + - ary[3:] + + If an index exceeds the dimension of the tensor along `axis`, + an empty sub-tensor is returned correspondingly. + axis : int, optional + The axis along which to split, default is 0. + + Returns + ------- + sub-tensors : list of Tensors + A list of sub-tensors. + + Raises + ------ + ValueError + If `indices_or_sections` is given as an integer, but + a split does not result in equal division. + + See Also + -------- + array_split : Split a tensor into multiple sub-tensors of equal or + near-equal size. Does not raise an exception if + an equal division cannot be made. + hsplit : Split into multiple sub-arrays horizontally (column-wise). + vsplit : Split tensor into multiple sub-tensors vertically (row wise). + dsplit : Split tensor into multiple sub-tensors along the 3rd axis (depth). + concatenate : Join a sequence of tensors along an existing axis. + stack : Join a sequence of tensors along a new axis. + hstack : Stack tensors in sequence horizontally (column wise). + vstack : Stack tensors in sequence vertically (row wise). + dstack : Stack tensors in sequence depth wise (along third dimension). + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(9.0) + >>> mt.split(x, 3).execute() + [array([ 0., 1., 2.]), array([ 3., 4., 5.]), array([ 6., 7., 8.])] + + >>> x = mt.arange(8.0) + >>> mt.split(x, [3, 5, 6, 10]).execute() + [array([ 0., 1., 2.]), + array([ 3., 4.]), + array([ 5.]), + array([ 6., 7.]), + array([], dtype=float64)] + + """ + return _split(astensor(ary), indices_or_sections, axis=axis, is_split=True) diff --git a/python/xorbits/_mars/tensor/base/squeeze.py b/python/xorbits/_mars/tensor/base/squeeze.py new file mode 100644 index 000000000..3bdf1476e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/squeeze.py @@ -0,0 +1,163 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, TupleField +from ..array_utils import as_same_device, device +from ..operands import TensorHasInput, TensorOperandMixin + + +def _get_squeeze_shape(shape, axis): + if axis is not None: + if isinstance(axis, Iterable): + axis = tuple(axis) + else: + axis = (axis,) + + for ax in axis: + if shape[ax] != 1: + raise ValueError( + "cannot select an axis to squeeze out " + "which has size not equal to one" + ) + shape = tuple(s for i, s in enumerate(shape) if i not in axis) + else: + axis = tuple(i for i, s in enumerate(shape) if s == 1) + shape = tuple(s for s in shape if s != 1) + + return shape, axis + + +class TensorSqueeze(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SQUEEZE + + _input = KeyField("input") + _axis = TupleField("axis", FieldTypes.int32) + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, create_view=True, **kw) + + def on_output_modify(self, new_output): + slcs = [slice(None)] * new_output.ndim + for axis in self._axis: + slcs.insert(axis, None) + return new_output[slcs] + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input, self.outputs[0].shape) + + @property + def axis(self): + return self._axis + + def __call__(self, a, shape): + return self.new_tensor([a], shape, order=a.order) + + @classmethod + def tile(cls, op): + in_tensor = op.input + out_tensor = op.outputs[0] + axis_set = set(op.axis) + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_shape = _get_squeeze_shape(c.shape, op.axis)[0] + chunk_idx = tuple(idx for i, idx in enumerate(c.index) if i not in axis_set) + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=out_tensor.order + ) + out_chunks.append(out_chunk) + nsplits = [ + nsplit for i, nsplit in enumerate(in_tensor.nsplits) if i not in axis_set + ] + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.squeeze(a, axis=op.axis) + + +def squeeze(a, axis=None): + """ + Remove single-dimensional entries from the shape of a tensor. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Selects a subset of the single-dimensional entries in the + shape. If an axis is selected with shape entry greater than + one, an error is raised. + + Returns + ------- + squeezed : Tensor + The input tensor, but with all or a subset of the + dimensions of length 1 removed. This is always `a` itself + or a view into `a`. + + Raises + ------ + ValueError + If `axis` is not `None`, and an axis being squeezed is not of length 1 + + See Also + -------- + expand_dims : The inverse operation, adding singleton dimensions + reshape : Insert, remove, and combine dimensions, and resize existing ones + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[[0], [1], [2]]]) + >>> x.shape + (1, 3, 1) + >>> mt.squeeze(x).shape + (3,) + >>> mt.squeeze(x, axis=0).shape + (3, 1) + >>> mt.squeeze(x, axis=1).shape + Traceback (most recent call last): + ... 
+ ValueError: cannot select an axis to squeeze out which has size not equal to one + >>> mt.squeeze(x, axis=2).shape + (1, 3) + + """ + shape, axis = _get_squeeze_shape(a.shape, axis) + + if 1 not in a.shape: + return a + + op = TensorSqueeze(axis=axis, dtype=a.dtype, sparse=a.issparse()) + return op(a, shape) diff --git a/python/xorbits/_mars/tensor/base/swapaxes.py b/python/xorbits/_mars/tensor/base/swapaxes.py new file mode 100644 index 000000000..1e3178732 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/swapaxes.py @@ -0,0 +1,163 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field, KeyField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import reverse_order, validate_axis + + +def _swap(it, axis1, axis2): + new_it = list(it) + new_it[axis1], new_it[axis2] = it[axis2], it[axis1] + + return tuple(new_it) + + +class TensorSwapAxes(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SWAPAXES + + _input = KeyField("input") + _axis1 = Int32Field("axis1") + _axis2 = Int32Field("axis2") + + def __init__(self, axis1=None, axis2=None, **kw): + super().__init__(_axis1=axis1, _axis2=axis2, create_view=True, **kw) + + @property + def axis1(self): + return self._axis1 + + @property + def axis2(self): + return self._axis2 + + def __call__(self, a): + axis1, axis2 = self._axis1, self._axis2 + if (axis1 == 0 and axis2 == a.ndim - 1) or (axis1 == a.ndim - 1 and axis2 == 0): + tensor_order = reverse_order(a.order) + else: + tensor_order = TensorOrder.C_ORDER + shape = _swap(a.shape, self.axis1, self.axis2) + return self.new_tensor([a], shape, order=tensor_order) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def on_output_modify(self, new_output): + op = TensorSwapAxes( + axis1=self._axis2, + axis2=self._axis1, + dtype=new_output.dtype, + sparse=new_output.issparse(), + ) + return op(new_output) + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input) + + @classmethod + def tile(cls, op): + axis1, axis2 = op.axis1, op.axis2 + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + out_chunks = [] + for c in in_tensor.chunks: + chunk_shape = _swap(c.shape, axis1, axis2) + chunk_idx = _swap(c.index, axis1, axis2) + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=out_tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = _swap(in_tensor.nsplits, axis1, axis2) + return new_op.new_tensors( + [in_tensor], + out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, 
ret_extra=True + ) + + axis1, axis2 = op.axis1, op.axis2 + with device(device_id): + ctx[op.outputs[0].key] = xp.swapaxes(x, axis1, axis2) + + +def swapaxes(a, axis1, axis2): + """ + Interchange two axes of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + axis1 : int + First axis. + axis2 : int + Second axis. + + Returns + ------- + a_swapped : Tensor + If `a` is a Tensor, then a view of `a` is + returned; otherwise a new tensor is created. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1,2,3]]) + >>> mt.swapaxes(x,0,1).execute() + array([[1], + [2], + [3]]) + + >>> x = mt.array([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x.execute() + array([[[0, 1], + [2, 3]], + [[4, 5], + [6, 7]]]) + + >>> mt.swapaxes(x,0,2).execute() + array([[[0, 4], + [2, 6]], + [[1, 5], + [3, 7]]]) + + """ + a = astensor(a) + axis1 = validate_axis(a.ndim, axis1) + axis2 = validate_axis(a.ndim, axis2) + + if axis1 == axis2: + return a + + op = TensorSwapAxes(axis1, axis2, dtype=a.dtype, sparse=a.issparse()) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/tests/__init__.py b/python/xorbits/_mars/tensor/base/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/base/tests/test_base.py b/python/xorbits/_mars/tensor/base/tests/test_base.py new file mode 100644 index 000000000..f41f595ae --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tests/test_base.py @@ -0,0 +1,797 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...datasource import arange, ones, tensor +from .. 
import ( + TensorCopyTo, + argwhere, + array_split, + atleast_1d, + atleast_2d, + atleast_3d, + broadcast_to, + copyto, + isin, + moveaxis, + partition, + ravel, + repeat, + result_type, + searchsorted, + sort, + split, + squeeze, + to_cpu, + to_gpu, + topk, + transpose, + unique, + where, +) + + +def test_dir(): + a = tensor([0, 1, 2], chunk_size=2) + tensor_dir = dir(a) + for attr in dir(a.data): + assert attr in tensor_dir + + +def test_copyto(): + a = ones((10, 20), chunk_size=3) + b = ones(10, chunk_size=4) + + with pytest.raises(ValueError): + copyto(a, b) + + tp = type(a.op) + b = ones(20, chunk_size=4) + copyto(a, b) + + assert isinstance(a.op, TensorCopyTo) + assert a.inputs[0] is b.data + assert isinstance(a.inputs[1].op, tp) + + a = tile(a) + + assert isinstance(a.chunks[0].op, TensorCopyTo) + assert len(a.chunks[0].inputs) == 2 + + a = ones((10, 20), chunk_size=3, dtype="i4") + b = ones(20, chunk_size=4, dtype="f8") + + with pytest.raises(TypeError): + copyto(a, b) + + b = ones(20, chunk_size=4, dtype="i4") + copyto(a, b, where=b > 0) + + assert a.op.where is not None + + a = tile(a) + + assert isinstance(a.chunks[0].op, TensorCopyTo) + assert len(a.chunks[0].inputs) == 3 + + with pytest.raises(ValueError): + copyto(a, a, where=np.ones(30, dtype="?")) + + +def test_astype(): + arr = ones((10, 20, 30), chunk_size=3) + + arr2 = arr.astype(np.int32) + arr2 = tile(arr2) + + assert arr2.shape == (10, 20, 30) + assert np.issubdtype(arr2.dtype, np.int32) is True + assert arr2.op.casting == "unsafe" + + with pytest.raises(TypeError): + arr.astype(np.int32, casting="safe") + + arr3 = arr.astype(arr.dtype, order="F") + assert arr3.flags["F_CONTIGUOUS"] is True + assert arr3.flags["C_CONTIGUOUS"] is False + + arr3 = tile(arr3) + + assert arr3.chunks[0].order.value == "F" + + +def test_transpose(): + arr = ones((10, 20, 30), chunk_size=[4, 3, 5]) + + arr2 = transpose(arr) + arr2 = tile(arr2) + + assert arr2.shape == (30, 20, 10) + assert len(arr2.chunks) == 126 + assert arr2.chunks[0].shape == (5, 3, 4) + assert arr2.chunks[-1].shape == (5, 2, 2) + + with pytest.raises(ValueError): + transpose(arr, axes=(1, 0)) + + arr3 = transpose(arr, (-2, 2, 0)) + arr3 = tile(arr3) + + assert arr3.shape == (20, 30, 10) + assert len(arr3.chunks) == 126 + assert arr3.chunks[0].shape == (3, 5, 4) + assert arr3.chunks[-1].shape == (2, 5, 2) + + arr4 = arr.transpose(-2, 2, 0) + arr4 = tile(arr4) + + assert arr4.shape == (20, 30, 10) + assert len(arr4.chunks) == 126 + assert arr4.chunks[0].shape == (3, 5, 4) + assert arr4.chunks[-1].shape == (2, 5, 2) + + arr5 = arr.T + arr5 = tile(arr5) + + assert arr5.shape == (30, 20, 10) + assert len(arr5.chunks) == 126 + assert arr5.chunks[0].shape == (5, 3, 4) + assert arr5.chunks[-1].shape == (5, 2, 2) + + +def test_swapaxes(): + arr = ones((10, 20, 30), chunk_size=[4, 3, 5]) + arr2 = arr.swapaxes(0, 1) + arr, arr2 = tile(arr, arr2) + + assert arr2.shape == (20, 10, 30) + assert len(arr.chunks) == len(arr2.chunks) + + +def test_broadcast_to(): + arr = ones((10, 5), chunk_size=2) + arr2 = broadcast_to(arr, (20, 10, 5)) + arr, arr2 = tile(arr, arr2) + + assert arr2.shape == (20, 10, 5) + assert len(arr2.chunks) == len(arr.chunks) + assert arr2.chunks[0].shape == (20, 2, 2) + + arr = ones((10, 5, 1), chunk_size=2) + arr3 = broadcast_to(arr, (5, 10, 5, 6)) + arr, arr3 = tile(arr, arr3) + + assert arr3.shape == (5, 10, 5, 6) + assert len(arr3.chunks) == len(arr.chunks) + assert arr3.nsplits == ((5,), (2, 2, 2, 2, 2), (2, 2, 1), (6,)) + assert arr3.chunks[0].shape == (5, 2, 
2, 6) + + arr = ones((10, 1), chunk_size=2) + arr4 = broadcast_to(arr, (20, 10, 5)) + arr, arr4 = tile(arr, arr4) + + assert arr4.shape == (20, 10, 5) + assert len(arr4.chunks) == len(arr.chunks) + assert arr4.chunks[0].shape == (20, 2, 5) + + with pytest.raises(ValueError): + broadcast_to(arr, (10,)) + + with pytest.raises(ValueError): + broadcast_to(arr, (5, 1)) + + arr = ones((4, 5), chunk_size=2) + with pytest.raises((ValueError)): + broadcast_to(arr[arr < 2], (3, 20)) + + +def test_where(): + cond = tensor([[True, False], [False, True]], chunk_size=1) + x = tensor([1, 2], chunk_size=1) + y = tensor([3, 4], chunk_size=1) + + arr = where(cond, x, y) + arr = tile(arr) + + assert len(arr.chunks) == 4 + np.testing.assert_equal(arr.chunks[0].inputs[0].op.data, [[True]]) + np.testing.assert_equal(arr.chunks[0].inputs[1].op.data, [1]) + np.testing.assert_equal(arr.chunks[0].inputs[2].op.data, [3]) + np.testing.assert_equal(arr.chunks[1].inputs[0].op.data, [[False]]) + np.testing.assert_equal(arr.chunks[1].inputs[1].op.data, [2]) + np.testing.assert_equal(arr.chunks[1].inputs[2].op.data, [4]) + np.testing.assert_equal(arr.chunks[2].inputs[0].op.data, [[False]]) + np.testing.assert_equal(arr.chunks[2].inputs[1].op.data, [1]) + np.testing.assert_equal(arr.chunks[2].inputs[2].op.data, [3]) + np.testing.assert_equal(arr.chunks[3].inputs[0].op.data, [[True]]) + np.testing.assert_equal(arr.chunks[3].inputs[1].op.data, [2]) + np.testing.assert_equal(arr.chunks[3].inputs[2].op.data, [4]) + + with pytest.raises(ValueError): + where(cond, x) + + x = arange(9.0).reshape(3, 3) + y = where(x < 5, x, -1) + + assert y.dtype == np.float64 + + +def test_argwhere(): + cond = tensor([[True, False], [False, True]], chunk_size=1) + indices = argwhere(cond) + + assert np.isnan(indices.shape[0]) + assert indices.shape[1] == 2 + + indices = tile(indices) + + assert indices.nsplits[1] == (1, 1) + + +def test_argwhere_order(): + data = np.asfortranarray([[True, False], [False, True]]) + cond = tensor(data, chunk_size=1) + indices = argwhere(cond) + + assert indices.flags["F_CONTIGUOUS"] is True + assert indices.flags["C_CONTIGUOUS"] is False + + indices = tile(indices) + + assert indices.chunks[0].order.value == "F" + + +def test_array_split(): + a = arange(8, chunk_size=2) + + splits = array_split(a, 3) + assert len(splits) == 3 + assert [s.shape[0] for s in splits] == [3, 3, 2] + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 2),) + assert splits[2].nsplits == ((2,),) + + a = arange(7, chunk_size=2) + + splits = array_split(a, 3) + assert len(splits) == 3 + assert [s.shape[0] for s in splits] == [3, 2, 2] + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 1),) + assert splits[2].nsplits == ((1, 1),) + + +def test_split(): + a = arange(9, chunk_size=2) + + splits = split(a, 3) + assert len(splits) == 3 + assert all(s.shape == (3,) for s in splits) is True + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 2),) + assert splits[2].nsplits == ((2, 1),) + + a = arange(8, chunk_size=2) + + splits = split(a, [3, 5, 6, 10]) + assert len(splits) == 5 + assert splits[0].shape == (3,) + assert splits[1].shape == (2,) + assert splits[2].shape == (1,) + assert splits[3].shape == (2,) + assert splits[4].shape == (0,) + + splits = tile(*splits) + assert splits[0].nsplits == ((2, 1),) + assert splits[1].nsplits == ((1, 1),) + assert splits[2].nsplits == ((1,),) + assert 
splits[3].nsplits == ((2,),) + assert splits[4].nsplits == ((0,),) + + a = tensor(np.asfortranarray(np.random.rand(9, 10)), chunk_size=4) + splits = split(a, 3) + assert splits[0].flags["F_CONTIGUOUS"] is True + assert splits[0].flags["C_CONTIGUOUS"] is False + assert splits[1].flags["F_CONTIGUOUS"] is True + assert splits[0].flags["C_CONTIGUOUS"] is False + assert splits[2].flags["F_CONTIGUOUS"] is True + assert splits[0].flags["C_CONTIGUOUS"] is False + + for a in ((1, 1, 1, 2, 2, 3), [1, 1, 1, 2, 2, 3]): + splits = split(a, (3, 5)) + assert len(splits) == 3 + + +def test_squeeze(): + data = np.array([[[0], [1], [2]]]) + x = tensor(data) + + t = squeeze(x) + assert t.shape == (3,) + assert t.dtype is not None + + t = squeeze(x, axis=0) + assert t.shape == (3, 1) + + with pytest.raises(ValueError): + squeeze(x, axis=1) + + t = squeeze(x, axis=2) + assert t.shape == (1, 3) + + +def test_result_type(): + x = tensor([2, 3], dtype="i4") + y = 3 + z = np.array([3, 4], dtype="f4") + + r = result_type(x, y, z) + e = np.result_type(x.dtype, y, z) + assert r == e + + +def test_repeat(): + a = arange(10, chunk_size=2).reshape(2, 5) + + t = repeat(a, 3) + assert t.shape == (30,) + + t = repeat(a, 3, axis=0) + assert t.shape == (6, 5) + + t = repeat(a, 3, axis=1) + assert t.shape == (2, 15) + + t = repeat(a, [3], axis=1) + assert t.shape == (2, 15) + + t = repeat(a, [3, 4], axis=0) + assert t.shape == (7, 5) + + with pytest.raises(ValueError): + repeat(a, [3, 4], axis=1) + + a = tensor(np.random.randn(10), chunk_size=5) + + t = repeat(a, 3) + t = tile(t) + assert sum(t.nsplits[0]) == 30 + + a = tensor(np.random.randn(100), chunk_size=10) + + t = repeat(a, 3) + t = tile(t) + assert sum(t.nsplits[0]) == 300 + + a = tensor(np.random.randn(4)) + b = tensor((4,)) + + t = repeat(a, b) + + t = tile(t) + assert np.isnan(t.nsplits[0]) + + +def test_isin(): + element = 2 * arange(4, chunk_size=1).reshape(2, 2) + test_elements = [1, 2, 4, 8] + + mask = isin(element, test_elements) + assert mask.shape == (2, 2) + assert mask.dtype == np.bool_ + + mask, element = tile(mask, element) + + assert len(mask.chunks) == len(element.chunks) + assert len(mask.op.inputs[1].chunks) == 1 + assert mask.chunks[0].inputs[0] is element.chunks[0].data + + element = 2 * arange(4, chunk_size=1).reshape(2, 2) + test_elements = tensor([1, 2, 4, 8], chunk_size=2) + + mask = isin(element, test_elements, invert=True) + assert mask.shape == (2, 2) + assert mask.dtype == np.bool_ + + +def test_create_view(): + arr = ones((10, 20, 30), chunk_size=[4, 3, 5]) + arr2 = transpose(arr) + assert arr2.op.create_view is True + + arr3 = transpose(arr) + assert arr3.op.create_view is True + + arr4 = arr.swapaxes(0, 1) + assert arr4.op.create_view is True + + arr5 = moveaxis(arr, 1, 0) + assert arr5.op.create_view is True + + arr6 = atleast_1d(1) + assert arr6.op.create_view is True + + arr7 = atleast_2d([1, 1]) + assert arr7.op.create_view is True + + arr8 = atleast_3d([1, 1]) + assert arr8.op.create_view is True + + arr9 = arr[:3, [1, 2, 3]] + # no view cuz of fancy indexing + assert arr9.op.create_view is False + + arr9[0][0][0] = 100 + assert arr9.op.create_view is False + + arr10 = arr[:3, None, :5] + assert arr10.op.create_view is True + + arr10[0][0][0] = 100 + assert arr10.op.create_view is False + + data = np.array([[[0], [1], [2]]]) + x = tensor(data) + + t = squeeze(x) + assert t.op.create_view is True + + y = x.reshape(3) + assert y.op.create_view is True + + +def test_ravel(): + arr = ones((10, 5), chunk_size=2) + flat_arr = ravel(arr) + 
assert flat_arr.shape == (50,) + + +def test_searchsorted(): + raw = np.sort(np.random.randint(100, size=(16,))) + arr = tensor(raw, chunk_size=3).cumsum() + + t1 = searchsorted(arr, 10) + + assert t1.shape == () + assert ( + t1.flags["C_CONTIGUOUS"] + == np.searchsorted(raw.cumsum(), 10).flags["C_CONTIGUOUS"] + ) + assert ( + t1.flags["F_CONTIGUOUS"] + == np.searchsorted(raw.cumsum(), 10).flags["F_CONTIGUOUS"] + ) + + t1 = tile(t1) + + assert t1.nsplits == () + assert len(t1.chunks) == 1 + assert t1.chunks[0].op.stage == OperandStage.agg + + with pytest.raises(ValueError): + searchsorted(np.random.randint(10, size=(14, 14)), 1) + + with pytest.raises(ValueError): + searchsorted(arr, 10, side="both") + + with pytest.raises(ValueError): + searchsorted(arr.tosparse(), 10) + + raw2 = np.asfortranarray(np.sort(np.random.randint(100, size=(16,)))) + arr = tensor(raw2, chunk_size=3) + to_search = np.asfortranarray([[1, 2], [3, 4]]) + + t1 = searchsorted(arr, to_search) + expected = np.searchsorted(raw2, to_search) + + assert t1.shape == to_search.shape + assert t1.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert t1.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_to_gpu(): + x = tensor(np.random.rand(10, 10), chunk_size=3) + + gx = to_gpu(x) + + assert gx.dtype == x.dtype + assert gx.order == x.order + assert gx.op.gpu is True + + gx, x = tile(gx, x) + + assert gx.chunks[0].dtype == x.chunks[0].dtype + assert gx.chunks[0].order == x.chunks[0].order + assert gx.chunks[0].op.gpu is True + + +def test_to_cpu(): + x = tensor(np.random.rand(10, 10), chunk_size=3, gpu=True) + + cx = to_cpu(x) + + assert cx.dtype == x.dtype + assert cx.order == x.order + assert cx.op.gpu is False + + cx, x = tile(cx, x) + + assert cx.chunks[0].dtype == x.chunks[0].dtype + assert cx.chunks[0].order == x.chunks[0].order + assert cx.chunks[0].op.gpu is False + + +def test_unique(): + x = unique(np.int64(1)) + + assert len(x.shape) == 1 + assert np.isnan(x.shape[0]) + assert x.dtype == np.dtype(np.int64) + + x = tile(x) + + assert len(x.chunks) == 1 + assert len(x.chunks[0].shape) == 1 + assert np.isnan(x.chunks[0].shape[0]) + assert x.chunks[0].dtype == np.dtype(np.int64) + + x, indices = unique(0.1, return_index=True) + + assert len(x.shape) == 1 + assert np.isnan(x.shape[0]) + assert x.dtype == np.dtype(np.float64) + assert len(indices.shape) == 1 + assert np.isnan(indices.shape[0]) + assert indices.dtype == np.dtype(np.intp) + + x, indices = tile(x, indices) + + assert len(x.chunks) == 1 + assert len(x.chunks[0].shape) == 1 + assert np.isnan(x.chunks[0].shape[0]) + assert x.chunks[0].dtype == np.dtype(np.float64) + assert len(indices.chunks) == 1 + assert len(indices.chunks[0].shape) == 1 + assert np.isnan(indices.chunks[0].shape[0]) + assert indices.chunks[0].dtype == np.dtype(np.intp) + + with pytest.raises(np.AxisError): + unique(0.1, axis=1) + + raw = np.random.randint(10, size=(10), dtype=np.int64) + a = tensor(raw, chunk_size=4) + + x = unique(a, aggregate_size=2) + + assert len(x.shape) == len(raw.shape) + assert np.isnan(x.shape[0]) + assert x.dtype == np.dtype(np.int64) + + x = tile(x) + + assert len(x.chunks) == 2 + assert x.nsplits == ((np.nan, np.nan),) + for i in range(2): + assert x.chunks[i].shape == (np.nan,) + assert x.chunks[i].dtype == raw.dtype + + raw = np.random.randint(10, size=(10, 20), dtype=np.int64) + a = tensor(raw, chunk_size=(4, 6)) + + x, indices, inverse, counts = unique( + a, + axis=1, + aggregate_size=2, + return_index=True, + return_inverse=True, + 
return_counts=True, + ) + + assert x.shape == (10, np.nan) + assert x.dtype == np.dtype(np.int64) + assert indices.shape == (np.nan,) + assert indices.dtype == np.dtype(np.intp) + assert inverse.shape == (20,) + assert inverse.dtype == np.dtype(np.intp) + assert counts.shape == (np.nan,) + assert counts.dtype == np.dtype(np.int_) + + x, indices, inverse, counts = tile(x, indices, inverse, counts) + + assert len(x.chunks) == 2 + assert x.nsplits == ((10,), (np.nan, np.nan)) + for i in range(2): + assert x.chunks[i].shape == (10, np.nan) + assert x.chunks[i].dtype == raw.dtype + assert x.chunks[i].index == (0, i) + + assert len(indices.chunks) == 2 + assert indices.nsplits == ((np.nan, np.nan),) + for i in range(2): + assert indices.chunks[i].shape == (np.nan,) + assert indices.chunks[i].dtype == raw.dtype + assert indices.chunks[i].index == (i,) + + assert len(inverse.chunks) == 4 + assert inverse.nsplits == ((6, 6, 6, 2),) + for i in range(4): + assert inverse.chunks[i].shape == ((6, 6, 6, 2)[i],) + assert inverse.chunks[i].dtype == np.dtype(np.int64) + assert inverse.chunks[i].index == (i,) + + assert len(counts.chunks) == 2 + assert counts.nsplits == ((np.nan, np.nan),) + for i in range(2): + assert counts.chunks[i].shape == (np.nan,) + assert counts.chunks[i].dtype == np.dtype(np.int_) + assert counts.chunks[i].index == (i,) + + +def test_sort(): + a = tensor(np.random.rand(10, 10), chunk_size=(5, 10)) + + sa = sort(a) + assert type(sa.op).__name__ == "TensorSort" + + sa = tile(sa) + + assert len(sa.chunks) == 2 + for c in sa.chunks: + assert type(c.op).__name__ == "TensorSort" + assert type(c.inputs[0].op).__name__ == "ArrayDataSource" + + a = tensor(np.random.rand(100), chunk_size=(10)) + + sa = sort(a) + assert type(sa.op).__name__ == "TensorSort" + + sa = tile(sa) + + for c in sa.chunks: + assert type(c.op).__name__ == "PSRSShuffle" + assert c.op.stage == OperandStage.reduce + assert c.shape == (np.nan,) + + a = tensor( + np.empty((10, 10), dtype=[("id", np.int32), ("size", np.int64)]), + chunk_size=(10, 5), + ) + sa = sort(a) + assert sa.op.order == ["id", "size"] + + with pytest.raises(ValueError): + sort(a, order=["unknown_field"]) + + with pytest.raises(np.AxisError): + sort(np.random.rand(100), axis=1) + + with pytest.raises(ValueError): + sort(np.random.rand(100), kind="non_valid_kind") + + with pytest.raises(ValueError): + sort(np.random.rand(100), parallel_kind="non_valid_parallel_kind") + + with pytest.raises(TypeError): + sort(np.random.rand(100), psrs_kinds="non_valid_psrs_kinds") + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=["quicksort"] * 2) + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=["non_valid_kind"] * 3) + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=[None, None, None]) + + with pytest.raises(ValueError): + sort(np.random.rand(100), psrs_kinds=["quicksort", "mergesort", None]) + + +def test_partition(): + a = tensor(np.random.rand(10, 10), chunk_size=(5, 10)) + + pa = partition(a, [4, 9]) + assert type(pa.op).__name__ == "TensorPartition" + + pa = tile(pa) + + assert len(pa.chunks) == 2 + for c in pa.chunks: + assert type(c.op).__name__ == "TensorPartition" + assert type(c.inputs[0].op).__name__ == "ArrayDataSource" + + a = tensor(np.random.rand(100), chunk_size=(10)) + + pa = partition(a, 4) + assert type(pa.op).__name__ == "TensorPartition" + + pa = tile(pa) + + for c in pa.chunks: + assert type(c.op).__name__ == "PartitionMerged" + assert c.shape == (np.nan,) + + a = tensor( 
+ np.empty((10, 10), dtype=[("id", np.int32), ("size", np.int64)]), + chunk_size=(10, 5), + ) + pa = partition(a, 3) + assert pa.op.order == ["id", "size"] + + with pytest.raises(ValueError): + partition(a, 4, order=["unknown_field"]) + + with pytest.raises(np.AxisError): + partition(np.random.rand(100), 4, axis=1) + + with pytest.raises(ValueError): + partition(np.random.rand(100), 4, kind="non_valid_kind") + + with pytest.raises(ValueError): + partition(np.random.rand(10), 10) + + with pytest.raises(TypeError): + partition(np.random.rand(10), tensor([1.0, 2.0])) + + with pytest.raises(ValueError): + partition(np.random.rand(10), tensor([[1, 2]])) + + with pytest.raises(ValueError): + partition(np.random.rand(10), [-11, 2]) + + +def test_topk(): + raw = np.random.rand(20) + a = tensor(raw, chunk_size=10) + + t = topk(a, 2) + t = tile(t) + assert t.op.parallel_kind == "tree" + + t = topk(a, 3) + t = tile(t) + assert t.op.parallel_kind == "psrs" + + t = topk(sort(a), 3) + t = tile(t) + # k is less than 100 + assert t.op.parallel_kind == "tree" + + with pytest.raises(ValueError): + topk(a, 3, parallel_kind="unknown") + + +def test_map_chunk(): + raw = np.random.rand(20) + a = tensor(raw, chunk_size=10) + + mapped = tile(a.map_chunk(lambda x: x * 0.5)) + assert np.issubdtype(mapped.dtype, np.floating) is True + assert mapped.shape == (np.nan,) + assert len(mapped.chunks) == 2 + + mapped = tile(a.map_chunk(lambda x: x * 0.5, elementwise=True)) + assert np.issubdtype(mapped.dtype, np.floating) is True + assert mapped.shape == (20,) + assert len(mapped.chunks) == 2 diff --git a/python/xorbits/_mars/tensor/base/tests/test_base_execution.py b/python/xorbits/_mars/tensor/base/tests/test_base_execution.py new file mode 100644 index 000000000..78092cf63 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tests/test_base_execution.py @@ -0,0 +1,1989 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest +import scipy.sparse as sps + +from .... import dataframe as md +from .... import execute, fetch +from .... import tensor as mt +from ....tests.core import require_cupy +from ...datasource import arange, ones, tensor, zeros +from .. 
import ( + argpartition, + argsort, + argtopk, + argwhere, + array_split, + atleast_1d, + atleast_2d, + atleast_3d, + broadcast_arrays, + broadcast_to, + copyto, + diff, + dsplit, + ediff1d, + expand_dims, + flip, + fliplr, + flipud, + hsplit, + isin, + moveaxis, + partition, + repeat, + roll, + rollaxis, + searchsorted, + shape, + sort, + split, + squeeze, + swapaxes, + tile, + to_cpu, + to_gpu, + topk, + transpose, + trapz, + unique, + vsplit, + where, +) + + +def test_rechunk_execution(setup): + raw = np.random.RandomState(0).random((11, 8)) + arr = tensor(raw, chunk_size=3) + arr2 = arr.rechunk(4) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw) + + +def test_copyto_execution(setup): + a = ones((2, 3), chunk_size=1) + b = tensor([3, -1, 3], chunk_size=2) + + copyto(a, b, where=b > 1) + + res = a.execute().fetch() + expected = np.array([[3, 1, 3], [3, 1, 3]]) + + np.testing.assert_equal(res, expected) + + a = ones((2, 3), chunk_size=1) + b = tensor(np.asfortranarray(np.random.rand(2, 3)), chunk_size=2) + + copyto(b, a) + + res = b.execute().fetch() + expected = np.asfortranarray(np.ones((2, 3))) + + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +@pytest.mark.ray_dag +def test_astype_execution(setup): + raw = np.random.random((10, 5)) + arr = tensor(raw, chunk_size=3) + arr2 = arr.astype("i8") + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.astype("i8")) + + raw = sps.random(10, 5, density=0.2) + arr = tensor(raw, chunk_size=3) + arr2 = arr.astype("i8") + + res = arr2.execute().fetch() + assert np.array_equal(res.toarray(), raw.astype("i8").toarray()) is True + + raw = np.asfortranarray(np.random.random((10, 5))) + arr = tensor(raw, chunk_size=3) + arr2 = arr.astype("i8", order="C") + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.astype("i8")) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + +def test_transpose_execution(setup): + raw = np.random.random((11, 8, 5)) + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.T) + + arr3 = transpose(arr, axes=(-2, -1, -3)) + + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, raw.transpose(1, 2, 0)) + + raw = sps.random(11, 8) + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr) + + assert arr2.issparse() is True + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res.toarray(), raw.T.toarray()) + + # test order + raw = np.asfortranarray(np.random.random((11, 8, 5))) + + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr) + + res = arr2.execute().fetch() + expected = np.transpose(raw).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr = tensor(raw, chunk_size=3) + arr2 = transpose(arr, (1, 2, 0)) + + res = arr2.execute().fetch() + expected = np.transpose(raw, (1, 2, 0)).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + df = md.DataFrame(mt.random.rand(10, 5, chunk_size=5)) + df = df[df[0] < 1] + # generate tensor with unknown shape + t = df.to_tensor() + t2 = transpose(t) + + res = t2.execute().fetch() + assert 
res.shape == (5, 10) + + +def test_swapaxes_execution(setup): + raw = np.random.random((11, 8, 5)) + arr = swapaxes(raw, 2, 0) + + res = arr.execute().fetch() + np.testing.assert_array_equal(res, raw.swapaxes(2, 0)) + + raw = np.random.random((11, 8, 5)) + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(2, 0) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw.swapaxes(2, 0)) + + raw = sps.random(11, 8, density=0.2) + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(1, 0) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res.toarray(), raw.toarray().swapaxes(1, 0)) + + # test order + raw = np.asfortranarray(np.random.rand(11, 8, 5)) + + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(2, 0) + + res = arr2.execute().fetch() + expected = raw.swapaxes(2, 0).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(0, 2) + + res = arr2.execute().fetch() + expected = raw.swapaxes(0, 2).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr = tensor(raw, chunk_size=3) + arr2 = arr.swapaxes(1, 0) + + res = arr2.execute().fetch() + expected = raw.swapaxes(1, 0).copy(order="A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_moveaxis_execution(setup): + x = zeros((3, 4, 5), chunk_size=2) + + t = moveaxis(x, 0, -1) + + res = t.execute().fetch() + assert res.shape == (4, 5, 3) + + t = moveaxis(x, -1, 0) + + res = t.execute().fetch() + assert res.shape == (5, 3, 4) + + t = moveaxis(x, [0, 1], [-1, -2]) + + res = t.execute().fetch() + assert res.shape == (5, 4, 3) + + t = moveaxis(x, [0, 1, 2], [-1, -2, -3]) + + res = t.execute().fetch() + assert res.shape == (5, 4, 3) + + +def test_broadcast_to_execution(setup): + raw = np.random.random((10, 5, 1)) + arr = tensor(raw, chunk_size=2) + arr2 = broadcast_to(arr, (5, 10, 5, 6)) + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, np.broadcast_to(raw, (5, 10, 5, 6))) + + # test chunk with unknown shape + arr1 = mt.random.rand(3, 4, chunk_size=2) + arr2 = mt.random.permutation(arr1) + arr3 = broadcast_to(arr2, (2, 3, 4)) + + res = arr3.execute().fetch() + assert res.shape == (2, 3, 4) + + +def test_broadcast_arrays_executions(setup): + x_data = [[1, 2, 3]] + x = tensor(x_data, chunk_size=1) + y_data = [[1], [2], [3]] + y = tensor(y_data, chunk_size=2) + + a = broadcast_arrays(x, y) + + res = [arr.execute().fetch() for arr in a] + expected = np.broadcast_arrays(x_data, y_data) + + for r, e in zip(res, expected): + np.testing.assert_equal(r, e) + + +def test_where_execution(setup): + raw_cond = np.random.randint(0, 2, size=(4, 4), dtype="?") + raw_x = np.random.rand(4, 1) + raw_y = np.random.rand(4, 4) + + cond, x, y = ( + tensor(raw_cond, chunk_size=2), + tensor(raw_x, chunk_size=2), + tensor(raw_y, chunk_size=2), + ) + + arr = where(cond, x, y) + res = arr.execute().fetch() + assert np.array_equal(res, np.where(raw_cond, raw_x, raw_y)) is True + + raw_cond = sps.csr_matrix(np.random.randint(0, 2, size=(4, 4), dtype="?")) + raw_x = sps.random(4, 1, density=0.1) + raw_y = sps.random(4, 4, 
density=0.1) + + cond, x, y = ( + tensor(raw_cond, chunk_size=2), + tensor(raw_x, chunk_size=2), + tensor(raw_y, chunk_size=2), + ) + + arr = where(cond, x, y) + res = arr.execute().fetch() + assert ( + np.array_equal( + res.toarray(), + np.where(raw_cond.toarray(), raw_x.toarray(), raw_y.toarray()), + ) + is True + ) + + # GH 2009 + raw_x = np.arange(9.0).reshape(3, 3) + x = arange(9.0).reshape(3, 3) + arr = where(x < 5, 2, -1) + res = arr.execute().fetch() + np.testing.assert_array_equal(res, np.where(raw_x < 5, 2, -1)) + + +@pytest.mark.ray_dag +def test_reshape_execution(setup): + raw_data = np.random.rand(5, 10, 30) + x = tensor(raw_data, chunk_size=8) + + y = x.reshape(-1, 30) + + res = y.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(-1, 30)) + + y2 = x.reshape(10, -1) + + res = y2.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(10, -1)) + + y3 = x.reshape(-1) + + res = y3.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(-1)) + + y4 = x.ravel() + + res = y4.execute().fetch() + np.testing.assert_array_equal(res, raw_data.ravel()) + + raw_data = np.random.rand(6, 20, 4) + x = tensor(raw_data, chunk_size=5) + + y = x.reshape(-1, 4, 5, 2, 2) + + res = y.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(-1, 4, 5, 2, 2)) + + y2 = x.reshape(120, 2, 2) + + res = y2.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(120, 2, 2)) + + y3 = x.reshape(12, 5, 8) + + res = y3.execute().fetch() + np.testing.assert_array_equal(res, raw_data.reshape(12, 5, 8)) + + y4 = x.reshape(12, 5, 8) + y4.op.extra_params["_reshape_with_shuffle"] = True + + # size_res = self.executor.execute_tensor(y4, mock=True) + res = y4.execute().fetch() + # assert res[0].nbytes == sum(v[0] for v in size_res) + assert np.array_equal(res, raw_data.reshape(12, 5, 8)) is True + + y5 = x.ravel(order="F") + + res = y5.execute().fetch() + expected = raw_data.ravel(order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_expand_dims_execution(setup): + raw_data = np.random.rand(10, 20, 30) + x = tensor(raw_data, chunk_size=6) + + y = expand_dims(x, 1) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, 1)) is True + + y = expand_dims(x, 0) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, 0)) is True + + y = expand_dims(x, 3) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, 3)) is True + + y = expand_dims(x, -1) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, -1)) is True + + y = expand_dims(x, -4) + + res = y.execute().fetch() + assert np.array_equal(res, np.expand_dims(raw_data, -4)) is True + + with pytest.raises(np.AxisError): + expand_dims(x, -5) + + with pytest.raises(np.AxisError): + expand_dims(x, 4) + + +def test_rollaxis_execution(setup): + x = ones((3, 4, 5, 6), chunk_size=1) + y = rollaxis(x, 3, 1) + + res = y.execute().fetch() + np.testing.assert_array_equal(res, np.rollaxis(np.ones((3, 4, 5, 6)), 3, 1)) + + +def test_atleast1d_execution(setup): + x = 1 + y = ones(3, chunk_size=2) + z = ones((3, 4), chunk_size=2) + + t = atleast_1d(x, y, z) + + res = [i.execute().fetch() for i in t] + + np.testing.assert_array_equal(res[0], np.array([1])) + np.testing.assert_array_equal(res[1], np.ones(3)) + 
np.testing.assert_array_equal(res[2], np.ones((3, 4))) + + +def test_atleast2d_execution(setup): + x = 1 + y = ones(3, chunk_size=2) + z = ones((3, 4), chunk_size=2) + + t = atleast_2d(x, y, z) + + res = [i.execute().fetch() for i in t] + + np.testing.assert_array_equal(res[0], np.array([[1]])) + np.testing.assert_array_equal(res[1], np.atleast_2d(np.ones(3))) + assert np.array_equal(res[2], np.ones((3, 4))) is True + + +def test_atleast3d_execution(setup): + x = 1 + y = ones(3, chunk_size=2) + z = ones((3, 4), chunk_size=2) + + t = atleast_3d(x, y, z) + + res = [i.execute().fetch() for i in t] + + np.testing.assert_array_equal(res[0], np.atleast_3d(x)) + np.testing.assert_array_equal(res[1], np.atleast_3d(np.ones(3))) + np.testing.assert_array_equal(res[2], np.atleast_3d(np.ones((3, 4)))) + + +def test_argwhere_execution(setup): + x = arange(6, chunk_size=2).reshape(2, 3) + t = argwhere(x > 1) + + res = t.execute().fetch() + expected = np.argwhere(np.arange(6).reshape(2, 3) > 1) + + np.testing.assert_array_equal(res, expected) + + data = np.asfortranarray(np.random.rand(10, 20)) + x = tensor(data, chunk_size=10) + + t = argwhere(x > 0.5) + + res = t.execute().fetch() + expected = np.argwhere(data > 0.5) + + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + +def test_array_split_execution(setup): + x = arange(48, chunk_size=3).reshape(2, 3, 8) + ss = array_split(x, 3, axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.array_split(np.arange(48).reshape(2, 3, 8), 3, axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + ss = array_split(x, [3, 5, 6, 10], axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.array_split(np.arange(48).reshape(2, 3, 8), [3, 5, 6, 10], axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + +def test_split_execution(setup): + for a in ((1, 1, 1, 2, 2, 3), [1, 1, 1, 2, 2, 3]): + splits = split(a, (3, 5)) + assert len(splits) == 3 + splits0 = splits[0].execute().fetch() + np.testing.assert_array_equal(splits0, (1, 1, 1)) + splits1 = splits[1].execute().fetch() + np.testing.assert_array_equal(splits1, (2, 2)) + splits2 = splits[2].execute().fetch() + np.testing.assert_array_equal(splits2, (3,)) + + x = arange(48, chunk_size=3).reshape(2, 3, 8) + ss = split(x, 4, axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.split(np.arange(48).reshape(2, 3, 8), 4, axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + ss = split(x, [3, 5, 6, 10], axis=2) + + res = [i.execute().fetch() for i in ss] + expected = np.split(np.arange(48).reshape(2, 3, 8), [3, 5, 6, 10], axis=2) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + # hsplit + x = arange(120, chunk_size=3).reshape(2, 12, 5) + ss = hsplit(x, 4) + + res = [i.execute().fetch() for i in ss] + expected = np.hsplit(np.arange(120).reshape(2, 12, 5), 4) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + # vsplit + x = arange(48, chunk_size=3).reshape(8, 3, 2) + ss = vsplit(x, 4) + + res = [i.execute().fetch() for i in ss] + expected = np.vsplit(np.arange(48).reshape(8, 3, 2), 4) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + # dsplit + x = arange(48, 
chunk_size=3).reshape(2, 3, 8) + ss = dsplit(x, 4) + + res = [i.execute().fetch() for i in ss] + expected = np.dsplit(np.arange(48).reshape(2, 3, 8), 4) + assert len(res) == len(expected) + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + x_data = sps.random(12, 8, density=0.1) + x = tensor(x_data, chunk_size=3) + ss = split(x, 4, axis=0) + + res = [i.execute().fetch() for i in ss] + expected = np.split(x_data.toarray(), 4, axis=0) + assert len(res) == len(expected) + [np.testing.assert_equal(r.toarray(), e) for r, e in zip(res, expected)] + + +def test_roll_execution(setup): + x = arange(10, chunk_size=2) + + t = roll(x, 2) + + res = t.execute().fetch() + expected = np.roll(np.arange(10), 2) + np.testing.assert_equal(res, expected) + + x2 = x.reshape(2, 5) + + t = roll(x2, 1) + + res = t.execute().fetch() + expected = np.roll(np.arange(10).reshape(2, 5), 1) + np.testing.assert_equal(res, expected) + + t = roll(x2, 1, axis=0) + + res = t.execute().fetch() + expected = np.roll(np.arange(10).reshape(2, 5), 1, axis=0) + np.testing.assert_equal(res, expected) + + t = roll(x2, 1, axis=1) + + res = t.execute().fetch() + expected = np.roll(np.arange(10).reshape(2, 5), 1, axis=1) + np.testing.assert_equal(res, expected) + + +def test_squeeze_execution(setup): + data = np.array([[[0], [1], [2]]]) + x = tensor(data, chunk_size=1) + + t = squeeze(x) + + res = t.execute().fetch() + expected = np.squeeze(data) + np.testing.assert_equal(res, expected) + + t = squeeze(x, axis=2) + + res = t.execute().fetch() + expected = np.squeeze(data, axis=2) + np.testing.assert_equal(res, expected) + + +def test_diff_execution(setup): + data = np.array([1, 2, 4, 7, 0]) + x = tensor(data, chunk_size=2) + + t = diff(x) + + res = t.execute().fetch() + expected = np.diff(data) + np.testing.assert_equal(res, expected) + + t = diff(x, n=2) + + res = t.execute().fetch() + expected = np.diff(data, n=2) + np.testing.assert_equal(res, expected) + + data = np.array([[1, 3, 6, 10], [0, 5, 6, 8]]) + x = tensor(data, chunk_size=2) + + t = diff(x) + + res = t.execute().fetch() + expected = np.diff(data) + np.testing.assert_equal(res, expected) + + t = diff(x, axis=0) + + res = t.execute().fetch() + expected = np.diff(data, axis=0) + np.testing.assert_equal(res, expected) + + x = mt.arange("1066-10-13", "1066-10-16", dtype=mt.datetime64) + t = diff(x) + + res = t.execute().fetch() + expected = np.diff(np.arange("1066-10-13", "1066-10-16", dtype=np.datetime64)) + np.testing.assert_equal(res, expected) + + +def test_ediff1d(setup): + data = np.array([1, 2, 4, 7, 0]) + x = tensor(data, chunk_size=2) + + t = ediff1d(x) + + res = t.execute().fetch() + expected = np.ediff1d(data) + np.testing.assert_equal(res, expected) + + to_begin = tensor(-99, chunk_size=2) + to_end = tensor([88, 99], chunk_size=2) + t = ediff1d(x, to_begin=to_begin, to_end=to_end) + + res = t.execute().fetch() + expected = np.ediff1d(data, to_begin=-99, to_end=np.array([88, 99])) + np.testing.assert_equal(res, expected) + + data = [[1, 2, 4], [1, 6, 24]] + + t = ediff1d(tensor(data, chunk_size=2)) + + res = t.execute().fetch() + expected = np.ediff1d(data) + np.testing.assert_equal(res, expected) + + +def test_flip_execution(setup): + a = arange(8, chunk_size=2).reshape((2, 2, 2)) + + t = flip(a, 0) + + res = t.execute().fetch() + expected = np.flip(np.arange(8).reshape(2, 2, 2), 0) + np.testing.assert_equal(res, expected) + + t = flip(a, 1) + + res = t.execute().fetch() + expected = np.flip(np.arange(8).reshape(2, 2, 2), 1) + 
np.testing.assert_equal(res, expected) + + t = flipud(a) + + res = t.execute().fetch() + expected = np.flipud(np.arange(8).reshape(2, 2, 2)) + np.testing.assert_equal(res, expected) + + t = fliplr(a) + + res = t.execute().fetch() + expected = np.fliplr(np.arange(8).reshape(2, 2, 2)) + np.testing.assert_equal(res, expected) + + +def test_repeat_execution(setup): + a = repeat(3, 4) + + res = a.execute().fetch() + expected = np.repeat(3, 4) + np.testing.assert_equal(res, expected) + + x_data = np.random.randn(20, 30) + x = tensor(x_data, chunk_size=(12, 16)) + + t = repeat(x, 2) + + res = t.execute().fetch() + expected = np.repeat(x_data, 2) + np.testing.assert_equal(res, expected) + + t = repeat(x, 3, axis=1) + + res = t.execute().fetch() + expected = np.repeat(x_data, 3, axis=1) + np.testing.assert_equal(res, expected) + + t = repeat(x, np.arange(20), axis=0) + + res = t.execute().fetch() + expected = np.repeat(x_data, np.arange(20), axis=0) + np.testing.assert_equal(res, expected) + + t = repeat(x, arange(20, chunk_size=10), axis=0) + + res = t.execute().fetch() + expected = np.repeat(x_data, np.arange(20), axis=0) + np.testing.assert_equal(res, expected) + + x_data = sps.random(20, 30, density=0.1) + x = tensor(x_data, chunk_size=(12, 16)) + + t = repeat(x, 2, axis=1) + + res = t.execute().fetch() + expected = np.repeat(x_data.toarray(), 2, axis=1) + np.testing.assert_equal(res.toarray(), expected) + + +def test_tile_execution(setup): + a_data = np.array([0, 1, 2]) + a = tensor(a_data, chunk_size=2) + + t = tile(a, 2) + + res = t.execute().fetch() + expected = np.tile(a_data, 2) + np.testing.assert_equal(res, expected) + + t = tile(a, (2, 2)) + + res = t.execute().fetch() + expected = np.tile(a_data, (2, 2)) + np.testing.assert_equal(res, expected) + + t = tile(a, (2, 1, 2)) + + res = t.execute().fetch() + expected = np.tile(a_data, (2, 1, 2)) + np.testing.assert_equal(res, expected) + + b_data = np.array([[1, 2], [3, 4]]) + b = tensor(b_data, chunk_size=1) + + t = tile(b, 2) + + res = t.execute().fetch() + expected = np.tile(b_data, 2) + np.testing.assert_equal(res, expected) + + t = tile(b, (2, 1)) + + res = t.execute().fetch() + expected = np.tile(b_data, (2, 1)) + np.testing.assert_equal(res, expected) + + c_data = np.array([1, 2, 3, 4]) + c = tensor(c_data, chunk_size=3) + + t = tile(c, (4, 1)) + + res = t.execute().fetch() + expected = np.tile(c_data, (4, 1)) + np.testing.assert_equal(res, expected) + + +@pytest.mark.ray_dag +def test_isin_execution(setup): + element = 2 * arange(4, chunk_size=1).reshape((2, 2)) + test_elements = [1, 2, 4, 8] + + mask = isin(element, test_elements) + + res = mask.execute().fetch() + expected = np.isin(2 * np.arange(4).reshape((2, 2)), test_elements) + np.testing.assert_equal(res, expected) + + res = element[mask].execute().fetch() + expected = np.array([2, 4]) + np.testing.assert_equal(res, expected) + + mask = isin(element, test_elements, invert=True) + + res = mask.execute().fetch() + expected = np.isin(2 * np.arange(4).reshape((2, 2)), test_elements, invert=True) + np.testing.assert_equal(res, expected) + + res = element[mask].execute().fetch() + expected = np.array([0, 6]) + np.testing.assert_equal(res, expected) + + test_set = {1, 2, 4, 8} + mask = isin(element, test_set) + + res = mask.execute().fetch() + expected = np.isin(2 * np.arange(4).reshape((2, 2)), test_set) + np.testing.assert_equal(res, expected) + + +def test_ravel_execution(setup): + arr = ones((10, 5), chunk_size=2) + flat_arr = mt.ravel(arr) + + res = flat_arr.execute().fetch() 
+ assert len(res) == 50 + np.testing.assert_equal(res, np.ones(50)) + + +def test_searchsorted_execution(setup): + raw = np.sort(np.random.randint(100, size=(16,))) + + # test different chunk_size, 3 will have combine, 6 will skip combine + for chunk_size in (3, 8): + arr = tensor(raw, chunk_size=chunk_size) + + # test scalar, with value in the middle + t1 = searchsorted(arr, 20) + + res = t1.execute().fetch() + expected = np.searchsorted(raw, 20) + np.testing.assert_array_equal(res, expected) + + # test scalar, with value larger than 100 + t2 = searchsorted(arr, 200) + + res = t2.execute().fetch() + expected = np.searchsorted(raw, 200) + np.testing.assert_array_equal(res, expected) + + # test scalar, side left, with value exact in the middle of the array + t3 = searchsorted(arr, raw[10], side="left") + + res = t3.execute().fetch() + expected = np.searchsorted(raw, raw[10], side="left") + np.testing.assert_array_equal(res, expected) + + # test scalar, side right, with value exact in the middle of the array + t4 = searchsorted(arr, raw[10], side="right") + + res = t4.execute().fetch() + expected = np.searchsorted(raw, raw[10], side="right") + np.testing.assert_array_equal(res, expected) + + # test scalar, side left, with value exact in the end of the array + t5 = searchsorted(arr, raw[15], side="left") + + res = t5.execute().fetch() + expected = np.searchsorted(raw, raw[15], side="left") + np.testing.assert_array_equal(res, expected) + + # test scalar, side right, with value exact in the end of the array + t6 = searchsorted(arr, raw[15], side="right") + + res = t6.execute().fetch() + expected = np.searchsorted(raw, raw[15], side="right") + np.testing.assert_array_equal(res, expected) + + # test scalar, side left, with value exact in the start of the array + t7 = searchsorted(arr, raw[0], side="left") + + res = t7.execute().fetch() + expected = np.searchsorted(raw, raw[0], side="left") + np.testing.assert_array_equal(res, expected) + + # test scalar, side right, with value exact in the start of the array + t8 = searchsorted(arr, raw[0], side="right") + + res = t8.execute().fetch() + expected = np.searchsorted(raw, raw[0], side="right") + np.testing.assert_array_equal(res, expected) + + raw2 = np.random.randint(100, size=(3, 4)) + + # test tensor, side left + t9 = searchsorted(arr, tensor(raw2, chunk_size=2), side="left") + + res = t9.execute().fetch() + expected = np.searchsorted(raw, raw2, side="left") + np.testing.assert_array_equal(res, expected) + + # test tensor, side right + t10 = searchsorted(arr, tensor(raw2, chunk_size=2), side="right") + + res = t10.execute().fetch() + expected = np.searchsorted(raw, raw2, side="right") + np.testing.assert_array_equal(res, expected) + + # test one chunk + arr = tensor(raw, chunk_size=16) + + # test scalar, tensor to search has 1 chunk + t11 = searchsorted(arr, 20) + res = t11.execute().fetch() + expected = np.searchsorted(raw, 20) + np.testing.assert_array_equal(res, expected) + + # test tensor with 1 chunk, tensor to search has 1 chunk + t12 = searchsorted(arr, tensor(raw2, chunk_size=4)) + + res = t12.execute().fetch() + expected = np.searchsorted(raw, raw2) + np.testing.assert_array_equal(res, expected) + + # test tensor with more than 1 chunk, tensor to search has 1 chunk + t13 = searchsorted(arr, tensor(raw2, chunk_size=2)) + + res = t13.execute().fetch() + expected = np.searchsorted(raw, raw2) + np.testing.assert_array_equal(res, expected) + + # test sorter + raw3 = np.random.randint(100, size=(16,)) + arr = tensor(raw3, chunk_size=3) + order 
= np.argsort(raw3) + order_arr = tensor(order, chunk_size=4) + + t14 = searchsorted(arr, 20, sorter=order_arr) + + res = t14.execute().fetch() + expected = np.searchsorted(raw3, 20, sorter=order) + np.testing.assert_array_equal(res, expected) + + # all data same + raw4 = np.ones(8) + arr = tensor(raw4, chunk_size=2) + + for val in (0, 1, 2): + for side in ("left", "right"): + t15 = searchsorted(arr, val, side=side) + + res = t15.execute().fetch() + expected = np.searchsorted(raw4, val, side=side) + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.ray_dag +def test_unique_execution(setup): + rs = np.random.RandomState(0) + raw = rs.randint(10, size=(10,)) + + for chunk_size in (10, 3): + x = tensor(raw, chunk_size=chunk_size) + + y = unique(x) + + res = y.execute().fetch() + expected = np.unique(raw) + np.testing.assert_array_equal(res, expected) + + y, indices = unique(x, return_index=True) + + res = fetch(execute(y, indices)) + expected = np.unique(raw, return_index=True) + assert len(res) == 2 + assert len(expected) == 2 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + + y, inverse = unique(x, return_inverse=True) + + res = fetch(*execute(y, inverse)) + expected = np.unique(raw, return_inverse=True) + assert len(res) == 2 + assert len(expected) == 2 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + + y, counts = unique(x, return_counts=True) + + res = fetch(*execute(y, counts)) + expected = np.unique(raw, return_counts=True) + assert len(res) == 2 + assert len(expected) == 2 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + + y, indices, inverse, counts = unique( + x, return_index=True, return_inverse=True, return_counts=True + ) + + res = fetch(*execute(y, indices, inverse, counts)) + expected = np.unique( + raw, return_index=True, return_inverse=True, return_counts=True + ) + assert len(res) == 4 + assert len(expected) == 4 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + np.testing.assert_array_equal(res[2], expected[2]) + np.testing.assert_array_equal(res[3], expected[3]) + + y, indices, counts = unique(x, return_index=True, return_counts=True) + + res = fetch(*execute(y, indices, counts)) + expected = np.unique(raw, return_index=True, return_counts=True) + assert len(res) == 3 + assert len(expected) == 3 + np.testing.assert_array_equal(res[0], expected[0]) + np.testing.assert_array_equal(res[1], expected[1]) + np.testing.assert_array_equal(res[2], expected[2]) + + raw2 = rs.randint(10, size=(4, 5, 6)) + x2 = tensor(raw2, chunk_size=chunk_size) + + y2 = unique(x2) + + res = y2.execute().fetch() + expected = np.unique(raw2) + np.testing.assert_array_equal(res, expected) + + y2 = unique(x2, axis=1) + + res = y2.execute().fetch() + expected = np.unique(raw2, axis=1) + np.testing.assert_array_equal(res, expected) + + y2 = unique(x2, axis=2) + + res = y2.execute().fetch() + expected = np.unique(raw2, axis=2) + np.testing.assert_array_equal(res, expected) + + raw = rs.randint(10, size=(10, 20)) + raw[:, 0] = raw[:, 11] = rs.randint(10, size=(10,)) + x = tensor(raw, chunk_size=2) + y, ind, inv, counts = unique( + x, + aggregate_size=3, + axis=1, + return_index=True, + return_inverse=True, + return_counts=True, + ) + + res_unique, res_ind, res_inv, res_counts = fetch(*execute(y, ind, inv, counts)) + exp_unique, exp_ind, exp_counts = np.unique( + 
raw, axis=1, return_index=True, return_counts=True + ) + raw_res_unique = res_unique + res_unique_df = pd.DataFrame(res_unique) + res_unique_ind = np.asarray( + res_unique_df.sort_values(list(range(res_unique.shape[0])), axis=1).columns + ) + res_unique = res_unique[:, res_unique_ind] + res_ind = res_ind[res_unique_ind] + res_counts = res_counts[res_unique_ind] + + np.testing.assert_array_equal(res_unique, exp_unique) + np.testing.assert_array_equal(res_ind, exp_ind) + np.testing.assert_array_equal(raw_res_unique[:, res_inv], raw) + np.testing.assert_array_equal(res_counts, exp_counts) + + x = (mt.random.RandomState(0).rand(1000, chunk_size=20) > 0.5).astype(np.int32) + y = unique(x) + res = np.sort(y.execute().fetch()) + np.testing.assert_array_equal(res, np.array([0, 1])) + + # test sparse + sparse_raw = sps.random(10, 3, density=0.1, format="csr", random_state=rs) + x = tensor(sparse_raw, chunk_size=2) + y = unique(x) + res = np.sort(y.execute().fetch()) + np.testing.assert_array_equal(res, np.unique(sparse_raw.data)) + + # test empty + x = tensor([]) + y = unique(x) + res = y.execute().fetch() + np.testing.assert_array_equal(res, np.unique([])) + + x = tensor([[]]) + y = unique(x) + res = y.execute().fetch() + np.testing.assert_array_equal(res, np.unique([[]])) + + +@require_cupy +def test_to_gpu_execution(setup_gpu): + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=3) + + gx = to_gpu(x) + + res = gx.execute().fetch() + np.testing.assert_array_equal(res.get(), raw) + + +@require_cupy +def test_to_cpu_execution(setup_gpu): + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=3, gpu=True) + + cx = to_cpu(x) + + res = cx.execute().fetch() + np.testing.assert_array_equal(res, raw) + + +@pytest.mark.ray_dag +def test_sort_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # 1-d chunk + raw = np.random.rand(100) + x = tensor(raw, chunk_size=20) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # test force need_align=True + sx = sort(x) + sx.op._need_align = True + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # test psrs_kinds + sx = sort(x, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # structured dtype + raw = np.empty(100, dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=100, dtype=np.int32) + raw["size"] = np.random.randint(1000, size=100, dtype=np.int64) + x = tensor(raw, chunk_size=10) + + sx = sort(x, order=["size", "id"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size", "id"])) + + # test psrs_kinds with structured dtype + sx = sort(x, order=["size", "id"], psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size", "id"])) + + # test flatten case + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=(5, 10)) + + sx = sort(x, axis=None) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=None)) + + # test multi-dimension + raw = np.random.rand(10, 100) + x = tensor(raw, chunk_size=(5, 40)) + + sx = sort(x, psrs_kinds=["quicksort"] * 3) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, 
psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + raw = np.random.rand(10, 99) + x = tensor(raw, chunk_size=(5, 20)) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + # test 3-d + raw = np.random.rand(20, 25, 28) + x = tensor(raw, chunk_size=(10, 15, 14)) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, axis=0) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0)) + + sx = sort(x, axis=0, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0)) + + sx = sort(x, axis=1) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=1)) + + sx = sort(x, axis=1, psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=1)) + + # test multi-dimension with structured type + raw = np.empty((10, 100), dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=(10, 100), dtype=np.int32) + raw["size"] = np.random.randint(1000, size=(10, 100), dtype=np.int64) + x = tensor(raw, chunk_size=(7, 30)) + + sx = sort(x) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw)) + + sx = sort(x, order=["size", "id"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size", "id"])) + + sx = sort(x, order=["size"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, order=["size"])) + + sx = sort(x, axis=0, order=["size", "id"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0, order=["size", "id"])) + + sx = sort(x, axis=0, order=["size", "id"], psrs_kinds=[None, None, "quicksort"]) + + res = sx.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=0, order=["size", "id"])) + + # test inplace sort + raw = np.random.rand(10, 12) + a = tensor(raw, chunk_size=(5, 4)) + a.sort(axis=1) + + res = a.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw, axis=1)) + + a.sort(axis=0) + + res = a.execute().fetch() + np.testing.assert_array_equal(res, np.sort(np.sort(raw, axis=1), axis=0)) + + # test with empty chunk + raw = np.random.rand(20, 10) + raw[:, :8] = 1 + a = tensor(raw, chunk_size=5) + filtered = a[a < 1] + filtered.sort() + + res = filtered.execute().fetch() + np.testing.assert_array_equal(res, np.sort(raw[raw < 1])) + + +@pytest.mark.ray_dag +def test_sort_indices_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + r = sort(x, return_index=True) + + sr, si = r.execute().fetch() + np.testing.assert_array_equal(sr, np.take_along_axis(raw, si, axis=-1)) + + x = tensor(raw, chunk_size=(22, 4)) + + r = sort(x, return_index=True) + + sr, si = r.execute().fetch() + np.testing.assert_array_equal(sr, np.take_along_axis(raw, si, axis=-1)) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + r = sort(x, axis=0, return_index=True) + + sr, si = r.execute().fetch() + np.testing.assert_array_equal(sr, raw[si]) + + +@pytest.mark.ray_dag +def test_argsort(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = 
tensor(raw, chunk_size=10) + + xa = argsort(x) + + r = xa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw), np.take_along_axis(raw, r, axis=-1)) + + x = tensor(raw, chunk_size=(22, 4)) + + xa = argsort(x) + + r = xa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw), np.take_along_axis(raw, r, axis=-1)) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + xa = argsort(x, axis=0) + + r = xa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0), raw[r]) + + +@pytest.mark.ray_dag +def test_partition_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + px = partition(x, [1, 8]) + + res = px.execute().fetch() + np.testing.assert_array_equal(res, np.partition(raw, [1, 8])) + + # 1-d chunk + raw = np.random.rand(100) + x = tensor(raw, chunk_size=20) + + kth = np.random.RandomState(0).randint(-100, 100, size=(10,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw, kth)[kth]) + + # structured dtype + raw = np.empty(100, dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=100, dtype=np.int32) + raw["size"] = np.random.randint(1000, size=100, dtype=np.int64) + x = tensor(raw, chunk_size=20) + + px = partition(x, kth, order=["size", "id"]) + + res = px.execute().fetch() + np.testing.assert_array_equal( + res[kth], np.partition(raw, kth, order=["size", "id"])[kth] + ) + + # test flatten case + raw = np.random.rand(10, 10) + x = tensor(raw, chunk_size=5) + + px = partition(x, kth, axis=None) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw, kth, axis=None)[kth]) + + # test multi-dimension + raw = np.random.rand(10, 100) + x = tensor(raw, chunk_size=(5, 20)) + + kth = np.random.RandomState(0).randint(-10, 10, size=(3,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth)[:, kth]) + + raw = np.random.rand(10, 99) + x = tensor(raw, chunk_size=(5, 20)) + + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth)[:, kth]) + + # test 3-d + raw = np.random.rand(20, 25, 28) + x = tensor(raw, chunk_size=(10, 15, 14)) + + kth = np.random.RandomState(0).randint(-28, 28, size=(3,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, :, kth], np.partition(raw, kth)[:, :, kth]) + + kth = np.random.RandomState(0).randint(-20, 20, size=(3,)) + px = partition(x, kth, axis=0) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw, kth, axis=0)[kth]) + + kth = np.random.RandomState(0).randint(-25, 25, size=(3,)) + px = partition(x, kth, axis=1) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth, axis=1)[:, kth]) + + # test multi-dimension with structured type + raw = np.empty((10, 100), dtype=[("id", np.int32), ("size", np.int64)]) + raw["id"] = np.random.randint(1000, size=(10, 100), dtype=np.int32) + raw["size"] = np.random.randint(1000, size=(10, 100), dtype=np.int64) + x = tensor(raw, chunk_size=(7, 30)) + + kth = np.random.RandomState(0).randint(-100, 100, size=(10,)) + px = partition(x, kth) + + res = px.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth)[:, kth]) + + px = partition(x, kth, order=["size", "id"]) + + res = px.execute().fetch() + 
np.testing.assert_array_equal( + res[:, kth], np.partition(raw, kth, order=["size", "id"])[:, kth] + ) + + px = partition(x, kth, order=["size"]) + + res = px.execute().fetch() + np.testing.assert_array_equal( + res[:, kth], np.partition(raw, kth, order=["size"])[:, kth] + ) + + kth = np.random.RandomState(0).randint(-10, 10, size=(5,)) + px = partition(x, kth, axis=0, order=["size", "id"]) + + res = px.execute().fetch() + np.testing.assert_array_equal( + res[kth], np.partition(raw, kth, axis=0, order=["size", "id"])[kth] + ) + + raw = np.random.rand(10, 12) + a = tensor(raw, chunk_size=(5, 4)) + kth = np.random.RandomState(0).randint(-12, 12, size=(2,)) + a.partition(kth, axis=1) + + res = a.execute().fetch() + np.testing.assert_array_equal(res[:, kth], np.partition(raw, kth, axis=1)[:, kth]) + + kth = np.random.RandomState(0).randint(-10, 10, size=(2,)) + a.partition(kth, axis=0) + + raw_base = res + res = a.execute().fetch() + np.testing.assert_array_equal(res[kth], np.partition(raw_base, kth, axis=0)[kth]) + + # test kth which is tensor + raw = np.random.rand(10, 12) + a = tensor(raw, chunk_size=(3, 5)) + kth = (mt.random.rand(5) * 24 - 12).astype(int) + + px = partition(a, kth) + sx = sort(a) + + res = px.execute().fetch() + kth_res = kth.execute().fetch() + sort_res = sx.execute().fetch() + np.testing.assert_array_equal(res[:, kth_res], sort_res[:, kth_res]) + + a = tensor(raw, chunk_size=(10, 12)) + kth = (mt.random.rand(5) * 24 - 12).astype(int) + + px = partition(a, kth) + sx = sort(a) + + res = px.execute().fetch() + kth_res = kth.execute().fetch() + sort_res = sx.execute().fetch() + np.testing.assert_array_equal(res[:, kth_res], sort_res[:, kth_res]) + + +@pytest.mark.ray_dag +def test_partition_indices_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=10) + + kth = [2, 5, 9] + r = partition(x, kth, return_index=True) + + pr, pi = r.execute().fetch() + np.testing.assert_array_equal(pr, np.take_along_axis(raw, pi, axis=-1)) + np.testing.assert_array_equal(np.sort(raw)[:, kth], pr[:, kth]) + + x = tensor(raw, chunk_size=(22, 4)) + + r = partition(x, kth, return_index=True) + + pr, pi = r.execute().fetch() + np.testing.assert_array_equal(pr, np.take_along_axis(raw, pi, axis=-1)) + np.testing.assert_array_equal(np.sort(raw)[:, kth], pr[:, kth]) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + r = partition(x, kth, axis=0, return_index=True) + + pr, pi = r.execute().fetch() + np.testing.assert_array_equal(pr, np.take_along_axis(raw, pi, axis=-1)) + np.testing.assert_array_equal(np.sort(raw)[kth], pr[kth]) + + +@pytest.mark.ray_dag +def test_argpartition_execution(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=10) + + kth = [6, 3, 8] + pa = argpartition(x, kth) + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, kth], np.take_along_axis(raw, r, axis=-1)[:, kth] + ) + + x = tensor(raw, chunk_size=(22, 4)) + + pa = argpartition(x, kth) + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, kth], np.take_along_axis(raw, r, axis=-1)[:, kth] + ) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + pa = argpartition(x, kth, axis=0) + + r = pa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0)[kth], raw[r][kth]) + + +def _topk_slow(a, k, axis, largest, order): + if axis is None: + a = a.flatten() + axis = 0 + a = np.sort(a, axis=axis, order=order) + if largest: + a = 
a[(slice(None),) * axis + (slice(None, None, -1),)] + return a[(slice(None),) * axis + (slice(k),)] + + +def _handle_result(result, axis, largest, order): + result = np.sort(result, axis=axis, order=order) + if largest: + ax = axis if axis is not None else 0 + result = result[(slice(None),) * ax + (slice(None, None, -1),)] + return result + + +@pytest.mark.parametrize("chunk_size", [7, 4]) +@pytest.mark.parametrize("axis", [0, 1, 2, None]) +@pytest.mark.parametrize("largest", [True, False]) +@pytest.mark.parametrize("to_sort", [True, False]) +@pytest.mark.parametrize("parallel_kind", ["tree", "psrs"]) +def test_topk_execution(setup, chunk_size, axis, largest, to_sort, parallel_kind): + raw1, order1 = np.random.rand(5, 6, 7), None + raw2 = np.empty((5, 6, 7), dtype=[("a", np.int32), ("b", np.float64)]) + raw2["a"] = np.random.randint(1000, size=(5, 6, 7), dtype=np.int32) + raw2["b"] = np.random.rand(5, 6, 7) + order2 = ["b", "a"] + + for raw, order in [(raw1, order1), (raw2, order2)]: + a = tensor(raw, chunk_size=chunk_size) + size = raw.shape[axis] if axis is not None else raw.size + for k in [2, size - 2, size, size + 2]: + r = topk( + a, + k, + axis=axis, + largest=largest, + sorted=to_sort, + order=order, + parallel_kind=parallel_kind, + ) + + result = r.execute().fetch() + + if not to_sort: + result = _handle_result(result, axis, largest, order) + expected = _topk_slow(raw, k, axis, largest, order) + np.testing.assert_array_equal(result, expected) + + r = topk( + a, + k, + axis=axis, + largest=largest, + sorted=to_sort, + order=order, + parallel_kind=parallel_kind, + return_index=True, + ) + + ta, ti = r.execute().fetch() + raw2 = raw + if axis is None: + raw2 = raw.flatten() + np.testing.assert_array_equal(ta, np.take_along_axis(raw2, ti, axis)) + if not to_sort: + ta = _handle_result(ta, axis, largest, order) + np.testing.assert_array_equal(ta, expected) + + +def test_argtopk(setup): + # only 1 chunk when axis = -1 + raw = np.random.rand(100, 10) + x = tensor(raw, chunk_size=20) + + pa = argtopk(x, 3, parallel_kind="tree") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + pa = argtopk(x, 3, parallel_kind="psrs") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + x = tensor(raw, chunk_size=(22, 4)) + + pa = argtopk(x, 3, parallel_kind="tree") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + pa = argtopk(x, 3, parallel_kind="psrs") + + r = pa.execute().fetch() + np.testing.assert_array_equal( + np.sort(raw)[:, -1:-4:-1], np.take_along_axis(raw, r, axis=-1) + ) + + raw = np.random.rand(100) + + x = tensor(raw, chunk_size=23) + + pa = argtopk(x, 3, axis=0, parallel_kind="tree") + + r = pa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0)[-1:-4:-1], raw[r]) + + pa = argtopk(x, 3, axis=0, parallel_kind="psrs") + + r = pa.execute().fetch() + np.testing.assert_array_equal(np.sort(raw, axis=0)[-1:-4:-1], raw[r]) + + +def test_copy(setup): + x = tensor([1, 2, 3]) + y = mt.copy(x) + z = x + + x[0] = 10 + y_res = y.execute().fetch() + np.testing.assert_array_equal(y_res, np.array([1, 2, 3])) + + z_res = z.execute().fetch() + np.testing.assert_array_equal(z_res, np.array([10, 2, 3])) + + +def test_trapz_execution(setup): + raws = [np.random.rand(10), np.random.rand(10, 3)] + + for raw in raws: + for chunk_size in (4, 10): + 
for dx in (1.0, 2.0): + t = tensor(raw, chunk_size=chunk_size) + r = trapz(t, dx=dx) + + result = r.execute().fetch() + expected = np.trapz(raw, dx=dx) + np.testing.assert_almost_equal( + result, + expected, + err_msg=f"failed when raw={raw}, " + f"chunk_size={chunk_size}, dx={dx}", + ) + + # test x not None + raw_ys = [np.random.rand(10), np.random.rand(10, 3)] + raw_xs = [np.random.rand(10), np.random.rand(10, 3)] + + for raw_y, raw_x in zip(raw_ys, raw_xs): + ys = [tensor(raw_y, chunk_size=5), tensor(raw_y, chunk_size=10)] + x = tensor(raw_x, chunk_size=4) + + for y in ys: + r = trapz(y, x=x) + + result = r.execute().fetch() + expected = np.trapz(raw_y, x=raw_x) + np.testing.assert_almost_equal(result, expected) + + +@pytest.mark.ray_dag +def test_shape(setup): + raw = np.random.RandomState(0).rand(4, 3) + x = mt.tensor(raw, chunk_size=2) + + s = shape(x) + + result = s.execute().fetch() + assert result == [4, 3] + + s = shape(x[x > 0.5]) + + result = s.execute().fetch() + expected = np.shape(raw[raw > 0.5]) + assert result == expected + + s = shape(0) + + result = s.execute().fetch() + expected = np.shape(0) + assert result == expected + + +@pytest.mark.ray_dag +def test_rebalance_execution(setup): + session = setup + + raw = np.random.rand(10, 3) + x = mt.tensor(raw) + + r = x.rebalance(num_partitions=3) + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + assert len(session._session._tileable_to_fetch[r.data].chunks) == 3 + + r = x.rebalance(factor=1.5) + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + + r = x.rebalance() + result = r.execute().fetch() + np.testing.assert_array_equal(result, raw) + assert len(session._session._tileable_to_fetch[r.data].chunks) == 2 + + +def test_map_chunk_execution(setup): + raw = np.random.rand(20) + a = tensor(raw, chunk_size=10) + + r = a.map_chunk(lambda x: x * 0.5) + results = r.execute().fetch() + np.testing.assert_array_equal(raw * 0.5, results) + + r = a.map_chunk(lambda x: x * 0.5, elementwise=True) + results = r.execute().fetch() + np.testing.assert_array_equal(raw * 0.5, results) + + r = a.map_chunk( + lambda x, chunk_index: x * 0.5 + chunk_index[0], with_chunk_index=True + ) + results = r.execute().fetch() + np.testing.assert_array_equal(raw * 0.5 + np.arange(0, 20) // 10, results) + + +def test_insert_execution(setup): + raw = np.random.randint(0, 100, size=(20, 10)) + a = tensor(raw, chunk_size=6) + + r1 = mt.insert(a, 1, 5) + result = r1.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 1, 5), result) + + r2 = mt.insert(a, [3, 50, 10], 10) + result = r2.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [3, 50, 10], 10), result) + + r3 = mt.insert(a, [2, 3, 4], [5, 6, 7]) + result = r3.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [2, 3, 4], [5, 6, 7]), result) + + # specify axis + r4 = mt.insert(a, 5, 4, axis=0) + result = r4.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 5, 4, axis=0), result) + + r5 = mt.insert(a, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1) + result = r5.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1), result + ) + + r6 = mt.insert(a, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0) + result = r6.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0), result + ) + + # test mt.tensor as values + r5 = mt.insert(a, [1, 2, 6], mt.arange(20).reshape((20, 1)), 
axis=1) + result = r5.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1), result + ) + + r6 = mt.insert(a, [1, 16, 10], mt.arange(30).reshape((3, 10)), axis=0) + result = r6.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0), result + ) + + r7 = mt.insert(a, [20, 30, 50], mt.tensor([5, 6, 7])) + result = r7.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [20, 30, 50], [5, 6, 7]), result) + + # test mt.tensor as index + r8 = mt.insert(a, mt.tensor([1, 2, 6]), mt.arange(20).reshape((20, 1)), axis=1) + result = r8.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 2, 6], np.arange(20).reshape((20, 1)), axis=1), result + ) + + r9 = mt.insert(a, mt.tensor([1, 16, 10]), mt.arange(30).reshape((3, 10)), axis=0) + result = r9.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, [1, 16, 10], np.arange(30).reshape((3, 10)), axis=0), result + ) + + r10 = mt.insert(a, mt.tensor([20, 30, 50]), mt.tensor([5, 6, 7])) + result = r10.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [20, 30, 50], [5, 6, 7]), result) + + r11 = mt.insert(a, slice(0, 10), mt.arange(10), axis=0) + result = r11.execute().fetch() + np.testing.assert_array_equal( + np.insert(raw, slice(0, 10), np.arange(10), axis=0), result + ) + + r12 = mt.insert(a, 10, 5, axis=1) + result = r12.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 10, 5, axis=1), result) + + r13 = mt.insert(a, [2, 10], 5, axis=1) + result = r13.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [2, 10], 5, axis=1), result) + + r14 = mt.insert(a, mt.tensor([2, 20]), 5, axis=0) + result = r14.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, [2, 20], 5, axis=0), result) + + r15 = mt.insert(a, 7, mt.arange(20), axis=1) + result = r15.execute().fetch() + np.testing.assert_array_equal(np.insert(raw, 7, mt.arange(20), axis=1), result) + + +def test_delete_execution(setup): + raw = np.random.randint(0, 100, size=(20, 10)) + a = tensor(raw, chunk_size=6) + + r1 = mt.delete(a, 1) + result = r1.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, 1), result) + + r2 = mt.delete(a, [3, 50, 10]) + result = r2.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [3, 50, 10]), result) + + # specify axis + r4 = mt.delete(a, 5, axis=0) + result = r4.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, 5, axis=0), result) + + r5 = mt.delete(a, [1, 2, 6], axis=1) + result = r5.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [1, 2, 6], axis=1), result) + + r6 = mt.delete(a, mt.tensor([1, 2, 6, 8], chunk_size=3), axis=1) + result = r6.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [1, 2, 6, 8], axis=1), result) + + r7 = mt.delete(a, slice(0, 10), axis=0) + result = r7.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, slice(0, 10), axis=0), result) + + r8 = mt.delete(a, mt.tensor([10, 20, 6, 80])) + result = r8.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, [10, 20, 6, 80]), result) + + r9 = mt.delete(a, 9, axis=1) + result = r9.execute().fetch() + np.testing.assert_array_equal(np.delete(raw, 9, axis=1), result) + + +@pytest.mark.parametrize("chunk_size", [3, 5]) +@pytest.mark.parametrize("invert", [True, False]) +def test_in1d_execute(setup, chunk_size, invert): + rs = np.random.RandomState(0) + raw1 = rs.randint(10, size=10) + ar1 = 
mt.tensor(raw1, chunk_size=5) + raw2 = np.arange(5) + ar2 = mt.tensor(raw2, chunk_size=chunk_size) + ar = mt.in1d(ar1, ar2, invert=invert) + result = ar.execute().fetch() + expected = np.in1d(raw1, raw2, invert=invert) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize("chunk_size", [3, 5]) +def test_setdiff1d_execute(setup, chunk_size): + rs = np.random.RandomState(0) + raw1 = rs.randint(10, size=10) + ar1 = mt.tensor(raw1, chunk_size=5) + raw2 = np.arange(5) + ar2 = mt.tensor(raw2, chunk_size=chunk_size) + ar = mt.setdiff1d(ar1, ar2) + result = ar.execute().fetch() + expected = np.setdiff1d(raw1, raw2) + np.testing.assert_array_equal(result, expected) + + # rs.shuffle shuffles in place and returns None, so build the unique sample first + raw3 = rs.choice(np.arange(100), 10, replace=False) + rs.shuffle(raw3) + ar3 = mt.tensor(raw3, chunk_size=5) + ar = mt.setdiff1d(ar3, ar2, assume_unique=True) + result = ar.execute().fetch() + expected = np.setdiff1d(raw3, raw2, assume_unique=True) + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/tensor/base/tile.py b/python/xorbits/_mars/tensor/base/tile.py new file mode 100644 index 000000000..76b7174b1 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/tile.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def tile(A, reps): + """ + Construct a tensor by repeating A the number of times given by reps. + + If `reps` has length ``d``, the result will have dimension of + ``max(d, A.ndim)``. + + If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new + axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication, + or shape (1, 1, 3) for 3-D replication. If this is not the desired + behavior, promote `A` to d-dimensions manually before calling this + function. + + If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. + Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as + (1, 1, 2, 2). + + Note : Although tile may be used for broadcasting, it is strongly + recommended to use Mars' broadcasting operations and functions. + + Parameters + ---------- + A : array_like + The input tensor. + reps : array_like + The number of repetitions of `A` along each axis. + + Returns + ------- + c : Tensor + The tiled output tensor. + + See Also + -------- + repeat : Repeat elements of a tensor.
+ broadcast_to : Broadcast a tensor to a new shape + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([0, 1, 2]) + >>> mt.tile(a, 2).execute() + array([0, 1, 2, 0, 1, 2]) + >>> mt.tile(a, (2, 2)).execute() + array([[0, 1, 2, 0, 1, 2], + [0, 1, 2, 0, 1, 2]]) + >>> mt.tile(a, (2, 1, 2)).execute() + array([[[0, 1, 2, 0, 1, 2]], + [[0, 1, 2, 0, 1, 2]]]) + + >>> b = mt.array([[1, 2], [3, 4]]) + >>> mt.tile(b, 2).execute() + array([[1, 2, 1, 2], + [3, 4, 3, 4]]) + >>> mt.tile(b, (2, 1)).execute() + array([[1, 2], + [3, 4], + [1, 2], + [3, 4]]) + + >>> c = mt.array([1,2,3,4]) + >>> mt.tile(c,(4,1)).execute() + array([[1, 2, 3, 4], + [1, 2, 3, 4], + [1, 2, 3, 4], + [1, 2, 3, 4]]) + """ + from ..merge import concatenate + + try: + tup = tuple(reps) + except TypeError: + tup = (reps,) + + d = len(tup) + if A.ndim < d: + A = A[tuple(np.newaxis for _ in range(d - A.ndim))] + elif A.ndim > d: + tup = (1,) * (A.ndim - d) + tup + + a = A + for axis, rep in enumerate(tup): + if rep == 0: + slc = (slice(None),) * axis + (slice(0),) + a = a[slc] + elif rep < 0: + raise ValueError("negative dimensions are not allowed") + elif rep > 1: + a = concatenate([a] * rep, axis=axis) + + return a diff --git a/python/xorbits/_mars/tensor/base/to_cpu.py b/python/xorbits/_mars/tensor/base/to_cpu.py new file mode 100644 index 000000000..56b5138e8 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/to_cpu.py @@ -0,0 +1,40 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorDeviceConversionBase + + +class TensorToCPU(TensorDeviceConversionBase): + _op_type_ = OperandDef.TO_CPU + + def __init__(self, dtype=None, gpu=None, sparse=None, **kw): + super().__init__(dtype=dtype, gpu=gpu, sparse=sparse, **kw) + if self.gpu or self.gpu is None: + self.gpu = False + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = ctx[op.input.key].get() + + +def to_cpu(x): + x = astensor(x) + + if x.op.gpu is False: + return x + + op = TensorToCPU(dtype=x.dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/base/to_gpu.py b/python/xorbits/_mars/tensor/base/to_gpu.py new file mode 100644 index 000000000..44677191d --- /dev/null +++ b/python/xorbits/_mars/tensor/base/to_gpu.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ..array_utils import move_to_device +from ..datasource import tensor as astensor +from .core import TensorDeviceConversionBase + + +class TensorToGPU(TensorDeviceConversionBase): + _op_type_ = OperandDef.TO_GPU + + def __init__(self, dtype=None, gpu=None, sparse=None, **kw): + super().__init__(dtype=dtype, gpu=gpu, sparse=sparse, **kw) + if not self.gpu: + self.gpu = True + + @classmethod + def execute(cls, ctx, op): + device = op.device or 0 + ctx[op.outputs[0].key] = move_to_device(ctx[op.input.key], device) + + +def to_gpu(x): + x = astensor(x) + + if x.op.gpu: + return x + + op = TensorToGPU(dtype=x.dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/base/topk.py b/python/xorbits/_mars/tensor/base/topk.py new file mode 100644 index 000000000..e8792b6e8 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/topk.py @@ -0,0 +1,597 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ExecutableTuple, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import ( + BoolField, + FieldTypes, + Int32Field, + Int64Field, + KeyField, + ListField, + StringField, +) +from ...utils import ceildiv, flatten +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import validate_axis, validate_order +from .sort import _validate_sort_psrs_kinds + + +class TensorTopk(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TOPK + + _input = KeyField("input") + _k = Int64Field("k") + _axis = Int32Field("axis") + _largest = BoolField("largest") + _sorted = BoolField("sorted") + _order = ListField("order", FieldTypes.string) + _parallel_kind = StringField("parallel_kind") + _psrs_kinds = ListField("psrs_kinds", FieldTypes.string) + _return_value = BoolField("return_value") + _return_indices = BoolField("return_indices") + _axis_offset = Int64Field( + "axis_offset", + on_serialize=lambda x: -1 if x is not None and np.isnan(x) else x, + on_deserialize=lambda x: np.nan if x == -1 else x, + ) + + def __init__( + self, + k=None, + axis=None, + largest=None, + sorted=None, + order=None, + parallel_kind=None, + psrs_kinds=None, + return_value=None, + return_indices=None, + axis_offset=None, + **kw + ): + super().__init__( + _k=k, + _axis=axis, + _largest=largest, + _sorted=sorted, + _parallel_kind=parallel_kind, + _psrs_kinds=psrs_kinds, + _return_value=return_value, + _return_indices=return_indices, + _order=order, + _axis_offset=axis_offset, + **kw + ) + + @property + def input(self): + return self._input + + @property + def k(self): + return self._k + + @property + def axis(self): + return self._axis + + @property + def largest(self): + return self._largest + + @property + def sorted(self): + return self._sorted + + 
@property + def order(self): + return self._order + + @property + def parallel_kind(self): + return self._parallel_kind + + @property + def psrs_kinds(self): + return self._psrs_kinds + + @property + def return_value(self): + return self._return_value + + @property + def return_indices(self): + return self._return_indices + + @property + def axis_offset(self): + return self._axis_offset + + @property + def output_limit(self): + if self.stage != OperandStage.agg: + return 1 + else: + return int(bool(self._return_value)) + int(bool(self._return_indices)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + shape = list(a.shape) + shape[self._axis] = min(a.shape[self._axis], self._k) + kws = [] + if self._return_value: + kws.append( + { + "shape": tuple(shape), + "order": a.order, + "dtype": a.dtype, + "type": "topk", + } + ) + if self._return_indices: + kws.append( + { + "shape": tuple(shape), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "type": "argtopk", + } + ) + ret = self.new_tensors([a], kws=kws) + if len(kws) == 1: + return ret[0] + return ExecutableTuple(ret) + + @classmethod + def _tile_one_chunk(cls, op): + return_value, return_indices = op.return_value, op.return_indices + out = op.outputs[0] + chunk_op = op.copy().reset_key() + kws = [] + if return_value: + kws.append( + { + "shape": out.shape, + "order": out.order, + "index": (0,) * out.ndim, + "dtype": out.dtype, + "type": "topk", + } + ) + if return_indices: + kws.append( + { + "shape": out.shape, + "order": TensorOrder.C_ORDER, + "index": (0,) * out.ndim, + "dtype": np.dtype(np.int64), + "type": "argtopk", + } + ) + chunks = chunk_op.new_chunks([op.input.chunks[0]], kws=kws) + kws = [out.params for out in op.outputs] + nsplits = tuple((s,) for s in out.shape) + if return_value: + kws[0]["nsplits"] = nsplits + kws[0]["chunks"] = [chunks[0]] + if return_indices: + kws[-1]["nsplits"] = nsplits + kws[-1]["chunks"] = [chunks[1]] + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def _tile_via_psrs(cls, op): + from .sort import TensorSort + + return_value = op.return_value + return_indices = op.return_indices + + # just sort, force need_align=True + psrs_kinds = op.psrs_kinds or ["quicksort", "mergesort", "mergesort"] + sort_op = TensorSort( + axis=op.axis, + order=op.order, + psrs_kinds=psrs_kinds, + need_align=True, + return_value=return_value, + return_indices=return_indices, + ) + ret = sort_op(op.input) + + if not isinstance(ret, tuple): + ret = (ret,) + + base_slcs = (slice(None),) * op.axis + if op.largest: + ret = [r[base_slcs + (slice(-1, -op.k - 1, -1),)] for r in ret] + else: + ret = [r[base_slcs + (slice(op.k),)] for r in ret] + + ret = yield from recursive_tile(ret) + new_op = op.copy() + kws = [o.params for o in op.outputs] + if return_value: + kws[0]["nsplits"] = ret[0].nsplits + kws[0]["chunks"] = ret[0].chunks + if return_indices: + kws[-1]["nsplits"] = ret[-1].nsplits + kws[-1]["chunks"] = ret[-1].chunks + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def _gen_topk_chunk( + cls, input_chunk, op, is_terminate_node, axis_offset=None, chunk_index=None + ): + chunk_op = op.copy().reset_key() + if axis_offset is not None: + chunk_op._axis_offset = axis_offset + if not is_terminate_node: + # no need to sort if not the terminated node + chunk_op._sorted = False + shape = list(input_chunk.shape) + shape[op.axis] = min(op.k, input_chunk.shape[op.axis]) + if not 
is_terminate_node: + # whenever return_indices, value is required + chunk_op._return_value = True + if axis_offset is not None: + chunk_op.stage = OperandStage.map + else: + chunk_op.stage = OperandStage.combine + return chunk_op.new_chunk( + [input_chunk], + shape=tuple(shape), + order=input_chunk.order, + index=chunk_index, + ) + else: + chunk_op.stage = OperandStage.agg + kws = [] + if op.return_value: + kws.append( + { + "shape": tuple(shape), + "order": input_chunk.order, + "dtype": input_chunk.dtype, + "index": chunk_index, + "type": "topk", + } + ) + if op.return_indices: + kws.append( + { + "shape": tuple(shape), + "order": TensorOrder.C_ORDER, + "dtype": np.dtype(np.int64), + "index": chunk_index, + "type": "argtopk", + } + ) + return chunk_op.new_chunks([input_chunk], kws=kws) + + @classmethod + def _merge_chunks(cls, input_chunks, axis): + from ..merge import TensorConcatenate + + if len(input_chunks) == 1: + return input_chunks[0] + + shape = list(input_chunks[0].shape) + shape[axis] = sum(c.shape[axis] for c in input_chunks) + + merge_op = TensorConcatenate(axis=axis, dtype=input_chunks[0].dtype) + return merge_op.new_chunk( + input_chunks, shape=tuple(shape), order=input_chunks[0].order + ) + + @classmethod + def _tile_via_tree(cls, op): + a = op.input + axis = op.axis + return_value, return_indices = op.return_value, op.return_indices + combine_size = options.combine_size + axis_offsets = [0] + np.cumsum(a.nsplits[axis]).tolist()[:-1] + + out_chunks, indices_chunks = [], [] + for other_idx in itertools.product( + *(range(s) for i, s in enumerate(a.chunk_shape) if i != axis) + ): + merge_chunks = [] + for j in range(a.chunk_shape[axis]): + idx = list(other_idx) + idx.insert(axis, j) + input_chunk = a.cix[tuple(idx)] + merge_chunks.append( + cls._gen_topk_chunk( + input_chunk, op, False, axis_offset=axis_offsets[j] + ) + ) + while len(merge_chunks) > combine_size: + new_size = ceildiv(len(merge_chunks), combine_size) + new_merge_chunks = [] + for i in range(new_size): + to_merge_chunks = merge_chunks[ + i * combine_size : (i + 1) * combine_size + ] + merge_chunk = cls._merge_chunks(to_merge_chunks, axis) + topk_chunk = cls._gen_topk_chunk(merge_chunk, op, False) + new_merge_chunks.append(topk_chunk) + merge_chunks = new_merge_chunks + + merge_chunk = cls._merge_chunks(merge_chunks, axis) + chunk_index = list(other_idx) + chunk_index.insert(axis, 0) + chunks = cls._gen_topk_chunk( + merge_chunk, op, True, chunk_index=tuple(chunk_index) + ) + if return_value: + out_chunks.append(chunks[0]) + if return_indices: + indices_chunks.append(chunks[-1]) + + new_op = op.copy() + nsplits = list(a.nsplits) + nsplits[axis] = (min(a.shape[axis], op.k),) + kws = [out.params for out in op.outputs] + if return_value: + kws[0]["nsplits"] = nsplits + kws[0]["chunks"] = out_chunks + if return_indices: + kws[-1]["nsplits"] = nsplits + kws[-1]["chunks"] = indices_chunks + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + a = op.input + combine_size = options.combine_size + k = op.k + axis = op.axis + + if len(a.chunks) == 1: + return cls._tile_one_chunk(op) + + parallel_kind = op.parallel_kind.lower() + + if parallel_kind == "auto": + nsplit = a.nsplits[axis] + max_chunk_size = max(nsplit) + if np.isnan(max_chunk_size): + # has unknown chunk shape and k > 100 just choose 'psrs' + parallel_kind = "psrs" if k > 100 else "tree" + else: + if combine_size * k <= max_chunk_size: + # each chunk will have k elements on specified axis, + # if combined chunk which generated 
in the tree reduction + # is less than max chunk size, parallel kind `tree` will be adopted + parallel_kind = "tree" + else: + parallel_kind = "psrs" + + if parallel_kind == "tree": + op._parallel_kind = "tree" + return cls._tile_via_tree(op) + else: + assert parallel_kind == "psrs" + op._parallel_kind = "psrs" + return (yield from cls._tile_via_psrs(op)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + flatten([ctx[inp.key] for inp in op.inputs]), + device=op.device, + ret_extra=True, + ) + if len(inputs) == 2: + a, indices = inputs + else: + a, indices = inputs[0], None + + k = op.k + axis = op.axis + to_sort = op.sorted + largest = op.largest + return_value = op.return_value + return_indices = op.return_indices + axis_offset = op.axis_offset + + with device(device_id): + av, ap = _topk_helper( + xp, + a, + k, + axis=axis, + largest=largest, + sorted=to_sort, + order=op.order, + indices=indices, + axis_offset=axis_offset, + return_value=return_value, + return_indices=return_indices, + ) + if op.stage != OperandStage.agg: + out = [av] + if op.return_indices: + out.append(ap) + ctx[op.outputs[0].key] = tuple(out) + else: + if op.return_value: + ctx[op.outputs[0].key] = av + if op.return_indices: + ctx[op.outputs[-1].key] = ap + + +def _gen_indices(shape, axis, xp): + ap = xp.swapaxes(xp.empty(shape, dtype=np.int64), axis, -1) + ap[...] = xp.arange(shape[axis]).reshape((1,) * (ap.ndim - 1) + (-1,)) + return xp.swapaxes(ap, -1, axis) + + +def _topk_helper( + xp, + a, + k, + axis=-1, + largest=True, + sorted=True, + order=None, + indices=None, + axis_offset=None, + return_value=True, + return_indices=False, +): + size = a.shape[axis] + base_slc = (slice(None),) * axis + kw = {} + if order is not None: + kw["order"] = order + + ap = None + if return_indices: + # do partition + if largest: + if k < size: + length = size - k + ap = xp.argpartition(a, length, axis=axis, **kw)[ + base_slc + (slice(-k, None),) + ] + av = xp.take_along_axis(a, ap, axis) + if indices is not None: + ap = xp.take_along_axis(indices, ap, axis) + else: + av = a + if indices is not None: + ap = indices + else: + ap = _gen_indices(a.shape, axis, xp) + if sorted: + # sort then reverse + ags = xp.argsort(av, axis=axis, **kw)[ + base_slc + (slice(None, None, -1),) + ] + ap = xp.take_along_axis(ap, ags, axis) + av = xp.take_along_axis(av, ags, axis) + else: + if k < size: + ap = xp.argpartition(a, k, axis=axis, **kw)[base_slc + (slice(k),)] + av = xp.take_along_axis(a, ap, axis) + if indices is not None: + ap = xp.take_along_axis(indices, ap, axis) + else: + av = a + if indices is not None: + ap = indices + else: + ap = _gen_indices(a.shape, axis, xp) + if sorted: + ags = xp.argsort(av, axis=axis, **kw) + ap = xp.take_along_axis(ap, ags, axis) + av = xp.take_along_axis(av, ags, axis) + if axis_offset: + ap = ap + axis_offset + else: + assert return_value + if largest: + if k < size: + length = size - k + av = xp.partition(a, length, axis=axis, **kw)[ + base_slc + (slice(-k, None),) + ] + else: + av = a + if sorted: + # sort then reverse + av = xp.sort(av, axis=axis, **kw)[base_slc + (slice(None, None, -1),)] + else: + if k < size: + av = xp.partition(a, k, axis=axis, **kw)[base_slc + (slice(k),)] + else: + av = a + if sorted: + av = xp.sort(av, axis=axis, **kw) + + return av, ap + + +def _validate_topk_arguments( + a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds +): + a = astensor(a) + if axis is None: + a = a.flatten() + axis = 0 + else: + axis = validate_axis(a.ndim, 
axis) + # if a is structure type and order is not None + order = validate_order(a.dtype, order) + if parallel_kind.lower() not in {"auto", "tree", "psrs"}: + raise ValueError("`parallel_kind` could only be `auto`, `tree`, or `psrs`") + # if psrs is chosen, sort will be used, + # psrs_kinds will be passed into it, so use the validation logic in sort + psrs_kinds = _validate_sort_psrs_kinds(psrs_kinds) + return a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds + + +def topk( + a, + k, + axis=-1, + largest=True, + sorted=True, + order=None, + parallel_kind="auto", + psrs_kinds=None, + return_index=False, +): + ( + a, + k, + axis, + largest, + sorted, + order, + parallel_kind, + psrs_kinds, + ) = _validate_topk_arguments( + a, k, axis, largest, sorted, order, parallel_kind, psrs_kinds + ) + op = TensorTopk( + k=k, + axis=axis, + largest=largest, + sorted=sorted, + order=order, + parallel_kind=parallel_kind, + psrs_kinds=psrs_kinds, + dtype=a.dtype, + return_value=True, + return_indices=return_index, + stage=OperandStage.agg, + ) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/transpose.py b/python/xorbits/_mars/tensor/base/transpose.py new file mode 100644 index 000000000..24028ac5e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/transpose.py @@ -0,0 +1,168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, ListField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import reverse_order + + +def _reorder(x, axes): + if x is None: + return + return type(x)(x[ax] for ax in axes) + + +class TensorTranspose(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.TRANSPOSE + + _input = KeyField("input") + _axes = ListField("axes", FieldTypes.int32) + + def __init__(self, axes=None, **kw): + super().__init__( + _axes=axes, + # transpose will create a view + create_view=True, + **kw + ) + + @property + def axes(self): + return getattr(self, "_axes", None) + + def __call__(self, a): + shape = tuple( + s if np.isnan(s) else int(s) for s in _reorder(a.shape, self._axes) + ) + if self._axes == list(reversed(range(a.ndim))): + # order reversed + tensor_order = reverse_order(a.order) + else: + tensor_order = TensorOrder.C_ORDER + return self.new_tensor([a], shape, order=tensor_order) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def on_output_modify(self, new_output): + op = self.copy().reset_key() + return op(new_output) + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + out_chunks = [] + for c in op.inputs[0].chunks: + chunk_op = op.copy().reset_key() + chunk_shape = tuple( + s if np.isnan(s) else int(s) for s in _reorder(c.shape, op.axes) + ) + chunk_idx = _reorder(c.index, op.axes) + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = _reorder(op.inputs[0].nsplits, op.axes) + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + order=tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axes = op.axes + with device(device_id): + ctx[op.outputs[0].key] = xp.transpose(x, axes or None) + + +def transpose(a, axes=None): + """ + Permute the dimensions of a tensor. + + Parameters + ---------- + a : array_like + Input tensor. + axes : list of ints, optional + By default, reverse the dimensions, otherwise permute the axes + according to the values given. + + Returns + ------- + p : Tensor + `a` with its axes permuted. A view is returned whenever + possible. + + See Also + -------- + moveaxis + argsort + + Notes + ----- + Use `transpose(a, argsort(axes))` to invert the transposition of tensors + when using the `axes` keyword argument. + + Transposing a 1-D array returns an unchanged view of the original tensor. 
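The note just above, that ``transpose(a, argsort(axes))`` inverts a transposition, can be checked directly with NumPy. The sketch below assumes only NumPy and mirrors the permutation that the view-based TensorTranspose above applies chunk by chunk; it is an illustration, not part of the patch.

import numpy as np

a = np.arange(24).reshape(2, 3, 4)
axes = (2, 0, 1)

t = np.transpose(a, axes)                      # shape becomes (4, 2, 3)
# argsort(axes) is the inverse permutation, so transposing again restores `a`
restored = np.transpose(t, np.argsort(axes))

assert restored.shape == a.shape
assert np.array_equal(restored, a)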
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(4).reshape((2,2)) + >>> x.execute() + array([[0, 1], + [2, 3]]) + + >>> mt.transpose(x).execute() + array([[0, 2], + [1, 3]]) + + >>> x = mt.ones((1, 2, 3)) + >>> mt.transpose(x, (1, 0, 2)).shape + (2, 1, 3) + + """ + a = astensor(a) + if axes: + if len(axes) != a.ndim: + raise ValueError("axes don't match tensor") + + if not axes: + axes = list(range(a.ndim))[::-1] + else: + axes = list(axes) + op = TensorTranspose(axes, dtype=a.dtype, sparse=a.issparse()) + return op(a) diff --git a/python/xorbits/_mars/tensor/base/trapz.py b/python/xorbits/_mars/tensor/base/trapz.py new file mode 100644 index 000000000..7e37e651e --- /dev/null +++ b/python/xorbits/_mars/tensor/base/trapz.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes +from ...core import recursive_tile +from ...serialization.serializables import Float64Field, Int8Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import validate_axis + + +class TensorTrapz(TensorOperand, TensorOperandMixin): + _op_type_ = opcodes.TRAPZ + + _y = KeyField("y") + _x = KeyField("x") + _dx = Float64Field("dx") + _axis = Int8Field("axis") + + def __init__(self, y=None, x=None, dx=None, axis=None, **kw): + super().__init__(_y=y, _x=x, _dx=dx, _axis=axis, **kw) + + @property + def y(self): + return self._y + + @property + def x(self): + return self._x + + @property + def dx(self): + return self._dx + + @property + def axis(self): + return self._axis + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._y = self._inputs[0] + if self._x is not None: + self._x = self._inputs[-1] + + def __call__(self, y, x=None): + inputs = [y] + order = y.order + if x is not None: + x = astensor(x) + inputs.append(x) + if x.order == TensorOrder.C_ORDER: + order = TensorOrder.C_ORDER + + shape = tuple(s for ax, s in enumerate(y.shape) if ax != self._axis) + dtype = np.trapz(np.empty(1, dtype=y.dtype)).dtype + return self.new_tensor(inputs, shape=shape, dtype=dtype, order=order) + + @classmethod + def tile(cls, op: "TensorTrapz"): + from .diff import diff + + y = astensor(op.y) + x = op.x + axis = op.axis + + if x is not None: + x = astensor(x) + # rechunk x to make x.nsplits == y.nsplits + if has_unknown_shape(x, y): + yield + x = yield from recursive_tile(x.rechunk(y.nsplits)) + + if len(y.chunks) == 1: + return cls._tile_one_chunk(op, y, x) + + if x is None: + d = op.dx + else: + if x.ndim == 1: + d = diff(x) + # reshape to correct shape + shape = [1] * y.ndim + shape[axis] = d.shape[0] + d = d.reshape(shape) + else: + d = diff(x, axis=axis) + nd = y.ndim + slice1 = [slice(None)] * nd + slice2 = [slice(None)] * nd + slice1[axis] = slice(1, None) + slice2[axis] = slice(None, -1) + ret 
= (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis) + return [(yield from recursive_tile(ret))] + + @classmethod + def _tile_one_chunk(cls, op, y, x): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + inputs = [y.chunks[0]] + if x is not None: + inputs.append(x.chunks[0]) + chunk = chunk_op.new_chunk( + inputs, shape=out.shape, order=out.order, index=(0,) * out.ndim + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + nsplits=tuple((s,) for s in out.shape), + chunks=[chunk], + ) + + @classmethod + def execute(cls, ctx, op: "TensorTrapz"): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + y = inputs[0] + if len(inputs) > 1: + x = inputs[-1] + else: + x = None + + with device(device_id): + ctx[op.outputs[0].key] = xp.trapz(y, x=x, dx=op.dx, axis=op.axis) + + +def trapz(y, x=None, dx=1.0, axis=-1): + """ + Integrate along the given axis using the composite trapezoidal rule. + + Integrate `y` (`x`) along given axis. + + Parameters + ---------- + y : array_like + Input tensor to integrate. + x : array_like, optional + The sample points corresponding to the `y` values. If `x` is None, + the sample points are assumed to be evenly spaced `dx` apart. The + default is None. + dx : scalar, optional + The spacing between sample points when `x` is None. The default is 1. + axis : int, optional + The axis along which to integrate. + + Returns + ------- + trapz : float + Definite integral as approximated by trapezoidal rule. + + See Also + -------- + sum, cumsum + + Notes + ----- + Image [2]_ illustrates trapezoidal rule -- y-axis locations of points + will be taken from `y` tensor, by default x-axis distances between + points will be 1.0, alternatively they can be provided with `x` tensor + or with `dx` scalar. Return value will be equal to combined area under + the red lines. + + + References + ---------- + .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule + + .. [2] Illustration image: + https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.trapz([1,2,3]).execute() + 4.0 + >>> mt.trapz([1,2,3], x=[4,6,8]).execute() + 8.0 + >>> mt.trapz([1,2,3], dx=2).execute() + 8.0 + >>> a = mt.arange(6).reshape(2, 3) + >>> a.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.trapz(a, axis=0).execute() + array([1.5, 2.5, 3.5]) + >>> mt.trapz(a, axis=1).execute() + array([2., 8.]) + + """ + y = astensor(y) + axis = validate_axis(y.ndim, axis) + op = TensorTrapz(y=y, x=x, dx=dx, axis=axis) + return op(y, x=x) diff --git a/python/xorbits/_mars/tensor/base/unique.py b/python/xorbits/_mars/tensor/base/unique.py new file mode 100644 index 000000000..caf169d53 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/unique.py @@ -0,0 +1,603 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...lib import sparse +from ...lib.sparse.core import get_array_module as get_sparse_array_module +from ...serialization.serializables import BoolField, Int32Field, Int64Field +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ..utils import hash_on_axis, validate_axis + + +class TensorUnique(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.UNIQUE + + _return_index = BoolField("return_index") + _return_inverse = BoolField("return_inverse") + _return_counts = BoolField("return_counts") + _axis = Int32Field("axis") + _aggregate_size = Int32Field("aggregate_size") + + _start_pos = Int64Field("start_pos") + + def __init__( + self, + return_index=None, + return_inverse=None, + return_counts=None, + axis=None, + start_pos=None, + aggregate_size=None, + **kw + ): + super().__init__( + _return_index=return_index, + _return_inverse=return_inverse, + _return_counts=return_counts, + _axis=axis, + _start_pos=start_pos, + _aggregate_size=aggregate_size, + **kw + ) + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + return ( + 1 + + bool(self._return_index) + + bool(self._return_inverse) + + bool(self._return_counts) + ) + + @property + def return_index(self): + return self._return_index + + @property + def return_inverse(self): + return self._return_inverse + + @property + def return_counts(self): + return self._return_counts + + @property + def axis(self): + return self._axis + + @property + def aggregate_size(self): + return self._aggregate_size + + @property + def start_pos(self): + return self._start_pos + + @classmethod + def _gen_kws(cls, op, input_obj, chunk=False, chunk_index=None): + kws = [] + + # unique tensor + shape = list(input_obj.shape) + shape[op.axis] = np.nan + kw = {"shape": tuple(shape), "dtype": input_obj.dtype, "gpu": input_obj.op.gpu} + if chunk: + idx = [0] * len(shape) + idx[op.axis] = chunk_index or 0 + kw["index"] = tuple(idx) + kws.append(kw) + + # unique indices tensor + if op.return_index: + kw = { + "shape": (np.nan,), + "dtype": np.dtype(np.intp), + "gpu": input_obj.op.gpu, + "type": "indices", + } + if chunk: + kw["index"] = (chunk_index or 0,) + kws.append(kw) + + # unique inverse tensor + if op.return_inverse: + kw = { + "shape": (input_obj.shape[op.axis],), + "dtype": np.dtype(np.intp), + "gpu": input_obj.op.gpu, + "type": "inverse", + } + if chunk: + kw["index"] = (chunk_index or 0,) + kws.append(kw) + + # unique counts tensor + if op.return_counts: + kw = { + "shape": (np.nan,), + "dtype": np.dtype(np.int_), + "gpu": input_obj.op.gpu, + "type": "counts", + } + if chunk: + kw["index"] = (chunk_index or 0,) + kws.append(kw) + + return kws + + def __call__(self, ar): + from .atleast_1d import atleast_1d + + ar = atleast_1d(ar) + if self.axis is None: + if ar.ndim > 1: + ar = ar.flatten() + self._axis = 0 + else: + self._axis = validate_axis(ar.ndim, self._axis) + + kws = self._gen_kws(self, ar) + tensors = self.new_tensors([ar], kws=kws, order=TensorOrder.C_ORDER) + if len(tensors) == 1: + return tensors[0] + return tensors + + @classmethod + def _tile_one_chunk(cls, op): + outs = op.outputs + ins = op.inputs + + chunk_op = op.copy().reset_key() + in_chunk = ins[0].chunks[0] + kws = cls._gen_kws(chunk_op, 
in_chunk, chunk=True) + out_chunks = chunk_op.new_chunks([in_chunk], kws=kws, order=outs[0].order) + new_op = op.copy() + kws = [out.params.copy() for out in outs] + for kw, out_chunk in zip(kws, out_chunks): + kw["chunks"] = [out_chunk] + kw["nsplits"] = tuple((s,) for s in out_chunk.shape) + return new_op.new_tensors(ins, kws=kws, order=outs[0].order) + + @classmethod + def _tile_via_shuffle(cls, op): + # rechunk the axes except the axis to do unique into 1 chunk + inp = op.inputs[0] + if has_unknown_shape(inp): + yield + + if inp.ndim > 1: + new_chunk_size = dict() + for axis in range(inp.ndim): + if axis == op.axis: + continue + if np.isnan(inp.shape[axis]): + yield + new_chunk_size[axis] = inp.shape[axis] + if has_unknown_shape(inp): + yield + inp = yield from recursive_tile(inp.rechunk(new_chunk_size)) + + aggregate_size = op.aggregate_size + if aggregate_size is None: + aggregate_size = max(inp.chunk_shape[op.axis] // options.combine_size, 1) + + unique_on_chunk_sizes = inp.nsplits[op.axis] + start_poses = np.cumsum((0,) + unique_on_chunk_sizes).tolist()[:-1] + map_chunks = [] + for c in inp.chunks: + map_op = TensorUnique( + stage=OperandStage.map, + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + axis=op.axis, + aggregate_size=aggregate_size, + start_pos=start_poses[c.index[op.axis]], + dtype=inp.dtype, + ) + shape = list(c.shape) + shape[op.axis] = np.nan + map_chunks.append(map_op.new_chunk([c], shape=tuple(shape), index=c.index)) + + shuffle_chunk = TensorShuffleProxy( + dtype=inp.dtype, _tensor_keys=[inp.op.key] + ).new_chunk(map_chunks, shape=()) + + reduce_chunks = [list() for _ in range(len(op.outputs))] + for i in range(aggregate_size): + reduce_op = TensorUnique( + stage=OperandStage.reduce, + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + axis=op.axis, + reducer_index=(i,), + reducer_phase="agg", + n_reducers=aggregate_size, + ) + kws = cls._gen_kws(op, inp, chunk=True, chunk_index=i) + chunks = reduce_op.new_chunks( + [shuffle_chunk], kws=kws, order=op.outputs[0].order + ) + if op.return_inverse: + inverse_idx = 2 if op.return_index else 1 + for j, chk in enumerate(chunks): + if j == inverse_idx: + chk.is_mapper = True + else: + chk.is_mapper = False + for j, c in enumerate(chunks): + reduce_chunks[j].append(c) + + if op.return_inverse: + inverse_pos = 2 if op.return_index else 1 + map_inverse_chunks = reduce_chunks[inverse_pos] + inverse_shuffle_chunk = TensorShuffleProxy( + dtype=map_inverse_chunks[0].dtype + ).new_chunk(map_inverse_chunks, shape=()) + inverse_chunks = [] + for j, cs in enumerate(unique_on_chunk_sizes): + chunk_op = TensorUnique( + stage=OperandStage.reduce, + n_reducers=len(unique_on_chunk_sizes), + dtype=map_inverse_chunks[0].dtype, + reducer_index=(j,), + reducer_phase="inverse", + ) + inverse_chunk = chunk_op.new_chunk( + [inverse_shuffle_chunk], shape=(cs,), index=(j,) + ) + inverse_chunks.append(inverse_chunk) + reduce_chunks[inverse_pos] = inverse_chunks + + kws = [out.params for out in op.outputs] + for kw, chunks in zip(kws, reduce_chunks): + kw["chunks"] = chunks + unique_nsplits = list(inp.nsplits) + unique_nsplits[op.axis] = (np.nan,) * len(reduce_chunks[0]) + kws[0]["nsplits"] = tuple(unique_nsplits) + i = 1 + if op.return_index: + kws[i]["nsplits"] = ((np.nan,) * len(reduce_chunks[i]),) + i += 1 + if op.return_inverse: + kws[i]["nsplits"] = (inp.nsplits[op.axis],) + i += 1 + if op.return_counts: + kws[i]["nsplits"] = ((np.nan,) * 
len(reduce_chunks[i]),) + + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op: "TensorUnique"): + if len(op.inputs[0].chunks) == 1: + return cls._tile_one_chunk(op) + else: + return (yield from cls._tile_via_shuffle(op)) + + @classmethod + def _execute_map(cls, ctx, op: "TensorUnique"): + (ar,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + n_reducers = op.aggregate_size + + with device(device_id): + results = xp.unique( + ar, + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + axis=op.axis, + ) + results = (results,) if not isinstance(results, tuple) else results + results_iter = iter(results) + unique_ar = next(results_iter) + indices_ar = next(results_iter) + op.start_pos if op.return_index else None + inverse_ar = next(results_iter) if op.return_inverse else None + counts_ar = next(results_iter) if op.return_counts else None + + if xp is sparse: + dense_xp = get_sparse_array_module(unique_ar) + else: + dense_xp = xp + unique_index = ( + dense_xp.arange(unique_ar.shape[op.axis]) + if inverse_ar is not None + else None + ) + if unique_ar.size > 0: + unique_reducers = dense_xp.asarray( + hash_on_axis(unique_ar, op.axis, n_reducers) + ) + else: + unique_reducers = dense_xp.empty_like(unique_ar) + ind_ar = dense_xp.arange(ar.shape[op.axis]) + + for reducer in range(n_reducers): + res = [] + cond = unique_reducers == reducer + # unique + slc = (slice(None),) * op.axis + (cond,) + res.append(unique_ar[slc]) + # indices + if indices_ar is not None: + res.append(indices_ar[cond]) + # inverse + if inverse_ar is not None: + index_selected = unique_index[cond] + inv_cond = xp.isin(inverse_ar, index_selected) + inv_selected = xp.searchsorted(index_selected, inverse_ar[inv_cond]) + ind_selected = ind_ar[inv_cond] + res.append(xp.stack([ind_selected, inv_selected])) + # counts + if counts_ar is not None: + res.append(counts_ar[cond]) + ctx[op.outputs[0].key, (reducer,)] = ( + ctx.get_current_chunk().index, + tuple(res), + ) + + @classmethod + def _execute_agg_reduce(cls, ctx, op: "TensorUnique"): + input_indexes, input_data = zip(*list(op.iter_mapper_data(ctx))) + + inputs = list(zip(*input_data)) + flatten, device_id, xp = as_same_device( + list(itertools.chain(*inputs)), device=op.device, ret_extra=True + ) + n_ret = len(inputs[0]) + inputs = [flatten[i * n_ret : (i + 1) * n_ret] for i in range(len(inputs))] + + inputs_iter = iter(inputs) + unique_arrays = next(inputs_iter) + indices_arrays = next(inputs_iter) if op.return_index else None + inverse_arrays = next(inputs_iter) if op.return_inverse else None + counts_arrays = next(inputs_iter) if op.return_counts else None + + with device(device_id): + ar = xp.concatenate(unique_arrays, axis=op.axis) + result_return_inverse = op.return_inverse or op.return_counts + axis = op.axis + if ar.size == 0 or ar.shape[axis] == 0: + # empty array on the axis + results = [xp.empty(ar.shape)] + i = 1 + for it in (op.return_index, op.return_inverse, op.return_counts): + if it: + results.append(xp.empty([], dtype=op.outputs[i].dtype)) + i += 1 + results = tuple(results) + else: + results = xp.unique( + ar, + return_index=op.return_index, + return_inverse=result_return_inverse, + axis=axis, + ) + results = (results,) if not isinstance(results, tuple) else results + results_iter = iter(results) + outputs_iter = iter(op.outputs) + # unique array + ctx[next(outputs_iter).key] = next(results_iter) + + if 
op.output_limit == 1: + return + + # calc indices + if op.return_index: + ctx[next(outputs_iter).key] = xp.concatenate(indices_arrays)[ + next(results_iter) + ] + # calc inverse + try: + inverse_result = next(results_iter) + if op.return_inverse: + unique_sizes = tuple(ua.shape[op.axis] for ua in unique_arrays) + cum_unique_sizes = np.cumsum((0,) + unique_sizes) + indices_out_key = next(outputs_iter).key + for i, inverse_array in enumerate(inverse_arrays): + p = inverse_result[ + cum_unique_sizes[i] : cum_unique_sizes[i + 1] + ] + r = xp.empty(inverse_array.shape, dtype=inverse_array.dtype) + if inverse_array.size > 0: + r[0] = inverse_array[0] + r[1] = p[inverse_array[1]] + # return unique length and + ctx[indices_out_key, (input_indexes[i][op.axis],)] = ( + results[0].shape[op.axis], + r, + ) + # calc counts + if op.return_counts: + result_counts = xp.zeros(results[0].shape[op.axis], dtype=int) + t = np.stack([inverse_result, np.concatenate(counts_arrays)]) + + def acc(a): + i, v = a + result_counts[i] += v + + np.apply_along_axis(acc, 0, t) + ctx[next(outputs_iter).key] = xp.asarray(result_counts) + except StopIteration: + pass + + @classmethod + def _execute_inverse_reduce(cls, ctx, op: "TensorUnique"): + out = op.outputs[0] + inputs = list(op.iter_mapper_data(ctx)) + unique_sizes = [inp[0] for inp in inputs] + cum_unique_sizes = np.cumsum([0] + unique_sizes) + invs, device_id, xp = as_same_device( + [inp[1] for inp in inputs], device=op.device, ret_extra=True + ) + with device(device_id): + ret = xp.empty(out.shape, dtype=out.dtype) + for i, inv in enumerate(invs): + ret[inv[0]] = cum_unique_sizes[i] + inv[1] + ctx[out.key] = ret + + @classmethod + def execute(cls, ctx, op: "TensorUnique"): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + if op.reducer_phase == "agg": + cls._execute_agg_reduce(ctx, op) + else: + assert op.reducer_phase == "inverse" + cls._execute_inverse_reduce(ctx, op) + else: + (ar,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = dict( + return_index=op.return_index, + return_inverse=op.return_inverse, + return_counts=op.return_counts, + ) + if ar.dtype != object and sum(ar.shape) > 0: + # axis cannot pass when dtype is object or array size is 0 + kw["axis"] = op.axis + results = xp.unique(ar, **kw) + outs = op.outputs + if len(outs) == 1: + ctx[outs[0].key] = results + return + + assert len(outs) == len(results) + for out, result in zip(outs, results): + ctx[out.key] = result + + +def unique( + ar, + return_index=False, + return_inverse=False, + return_counts=False, + axis=None, + aggregate_size=None, +): + """ + Find the unique elements of a tensor. + + Returns the sorted unique elements of a tensor. There are three optional + outputs in addition to the unique elements: + + * the indices of the input tensor that give the unique values + * the indices of the unique tensor that reconstruct the input tensor + * the number of times each unique value comes up in the input tensor + + Parameters + ---------- + ar : array_like + Input tensor. Unless `axis` is specified, this will be flattened if it + is not already 1-D. + return_index : bool, optional + If True, also return the indices of `ar` (along the specified axis, + if provided, or in the flattened tensor) that result in the unique tensor. 
+ return_inverse : bool, optional + If True, also return the indices of the unique tensor (for the specified + axis, if provided) that can be used to reconstruct `ar`. + return_counts : bool, optional + If True, also return the number of times each unique item appears + in `ar`. + axis : int or None, optional + The axis to operate on. If None, `ar` will be flattened. If an integer, + the subarrays indexed by the given axis will be flattened and treated + as the elements of a 1-D tensor with the dimension of the given axis, + see the notes for more details. Object tensors or structured tensors + that contain objects are not supported if the `axis` kwarg is used. The + default is None. + aggregate_size: int or None, optional + How many chunks will be after unique, default as #input.chunks / options.combine_size + + Returns + ------- + unique : Tensor + The sorted unique values. + unique_indices : Tensor, optional + The indices of the first occurrences of the unique values in the + original tensor. Only provided if `return_index` is True. + unique_inverse : Tensor, optional + The indices to reconstruct the original tensor from the + unique tensor. Only provided if `return_inverse` is True. + unique_counts : Tensor, optional + The number of times each of the unique values comes up in the + original tensor. Only provided if `return_counts` is True. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.unique([1, 1, 2, 2, 3, 3]).execute() + array([1, 2, 3]) + >>> a = mt.array([[1, 1], [2, 3]]) + >>> mt.unique(a).execute() + array([1, 2, 3]) + + Return the unique rows of a 2D tensor + + >>> a = mt.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]]) + >>> mt.unique(a, axis=0).execute() + array([[1, 0, 0], [2, 3, 4]]) + + Return the indices of the original tensor that give the unique values: + + >>> a = mt.array(['a', 'b', 'b', 'c', 'a']) + >>> u, indices = mt.unique(a, return_index=True) + >>> u.execute() + array(['a', 'b', 'c'], + dtype='|S1') + >>> indices.execute() + array([0, 1, 3]) + >>> a[indices].execute() + array(['a', 'b', 'c'], + dtype='|S1') + + Reconstruct the input array from the unique values: + + >>> a = mt.array([1, 2, 6, 4, 2, 3, 2]) + >>> u, indices = mt.unique(a, return_inverse=True) + >>> u.execute() + array([1, 2, 3, 4, 6]) + >>> indices.execute() + array([0, 1, 4, 3, 1, 2, 1]) + >>> u[indices].execute() + array([1, 2, 6, 4, 2, 3, 2]) + """ + op = TensorUnique( + return_index=return_index, + return_inverse=return_inverse, + return_counts=return_counts, + axis=axis, + aggregate_size=aggregate_size, + ) + return op(ar) diff --git a/python/xorbits/_mars/tensor/base/vsplit.py b/python/xorbits/_mars/tensor/base/vsplit.py new file mode 100644 index 000000000..328af84aa --- /dev/null +++ b/python/xorbits/_mars/tensor/base/vsplit.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
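Before moving on to vsplit, it may help to make explicit the invariant that the map and aggregation stages of the distributed unique implementation above rely on: taking the unique of the concatenated per-chunk unique values gives the same result as taking the unique of the whole tensor. A minimal NumPy-only sketch of that property (the chunking is hypothetical, no Mars operands are involved):

import numpy as np

# two hypothetical chunks of one logical 1-D tensor
chunk_a = np.array([3, 1, 2, 3, 1])
chunk_b = np.array([2, 5, 3, 5])

# map stage: each chunk computes its local unique values
local_uniques = [np.unique(chunk_a), np.unique(chunk_b)]

# agg stage: the unique of the concatenated local results equals the
# unique of the full data, so combining partial results loses nothing
merged = np.unique(np.concatenate(local_uniques))
full = np.unique(np.concatenate([chunk_a, chunk_b]))
assert np.array_equal(merged, full)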
+ + +from ..datasource import tensor as astensor +from .split import split + + +def vsplit(a, indices_or_sections): + """ + Split a tensor into multiple sub-tensors vertically (row-wise). + + Please refer to the ``split`` documentation. ``vsplit`` is equivalent + to ``split`` with `axis=0` (default), the tensor is always split along the + first axis regardless of the tensor dimension. + + See Also + -------- + split : Split a tensor into multiple sub-tensors of equal size. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(16.0).reshape(4, 4) + >>> x.execute() + array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]]) + >>> mt.vsplit(x, 2).execute() + [array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]]), + array([[ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]])] + >>> mt.vsplit(x, mt.array([3, 6])).execute() + [array([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + array([[ 12., 13., 14., 15.]]), + array([], dtype=float64)] + + With a higher dimensional tensor the split is still along the first axis. + + >>> x = mt.arange(8.0).reshape(2, 2, 2) + >>> x.execute() + array([[[ 0., 1.], + [ 2., 3.]], + [[ 4., 5.], + [ 6., 7.]]]) + >>> mt.vsplit(x, 2).execute() + [array([[[ 0., 1.], + [ 2., 3.]]]), + array([[[ 4., 5.], + [ 6., 7.]]])] + + """ + ary = a + a = astensor(a) + + if a.ndim < 2: + raise ValueError("vsplit only works on tensors of 2 or more dimensions") + return split(ary, indices_or_sections, 0) diff --git a/python/xorbits/_mars/tensor/base/where.py b/python/xorbits/_mars/tensor/base/where.py new file mode 100644 index 000000000..be32b6619 --- /dev/null +++ b/python/xorbits/_mars/tensor/base/where.py @@ -0,0 +1,193 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_TYPE +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, unify_chunks +from .broadcast_to import broadcast_to + + +class TensorWhere(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.WHERE + + _condition = KeyField("condition") + _x = KeyField("x") + _y = KeyField("y") + + @property + def condition(self): + return self._condition + + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._condition = self._inputs[0] + self._x = self._inputs[1] + self._y = self._inputs[2] + + def __call__(self, condition, x, y, shape=None): + shape = shape or broadcast_shape(condition.shape, x.shape, y.shape) + return self.new_tensor([condition, x, y], shape) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + inputs = yield from unify_chunks( + *[(input, list(range(input.ndim))[::-1]) for input in op.inputs] + ) + chunk_shapes = [ + t.chunk_shape if isinstance(t, TENSOR_TYPE) else t for t in inputs + ] + out_chunk_shape = broadcast_shape(*chunk_shapes) + output = op.outputs[0] + + out_chunks = [] + nsplits = [[np.nan] * shape for shape in out_chunk_shape] + get_index = lambda idx, t: tuple( + 0 if t.nsplits[i] == (1,) else ix for i, ix in enumerate(idx) + ) + for out_index in itertools.product(*(map(range, out_chunk_shape))): + in_chunks = [ + t.cix[get_index(out_index[-t.ndim :], t)] + if t.ndim != 0 + else t.chunks[0] + for t in inputs + ] + chunk_shape = broadcast_shape(*(c.shape for c in in_chunks)) + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + in_chunks, shape=chunk_shape, index=out_index, order=output.order + ) + ) + out_chunks.append(out_chunk) + for i, idx, s in zip(itertools.count(0), out_index, out_chunk.shape): + nsplits[i][idx] = s + + new_op = op.copy() + return new_op.new_tensors( + inputs, output.shape, order=output.order, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + (cond, x, y), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.where(cond, x, y) + + +def where(condition, x=None, y=None): + """ + Return elements, either from `x` or `y`, depending on `condition`. + + If only `condition` is given, return ``condition.nonzero()``. + + Parameters + ---------- + condition : array_like, bool + When True, yield `x`, otherwise yield `y`. + x, y : array_like, optional + Values from which to choose. `x`, `y` and `condition` need to be + broadcastable to some shape. + + Returns + ------- + out : Tensor or tuple of Tensors + If both `x` and `y` are specified, the output tensor contains + elements of `x` where `condition` is True, and elements from + `y` elsewhere. + + If only `condition` is given, return the tuple + ``condition.nonzero()``, the indices where `condition` is True. + + See Also + -------- + nonzero, choose + + Notes + ----- + If `x` and `y` are given and input arrays are 1-D, `where` is + equivalent to:: + + [xv if c else yv for (c,xv,yv) in zip(condition,x,y)] + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.where([[True, False], [True, True]], + ... [[1, 2], [3, 4]], + ... 
[[9, 8], [7, 6]]).execute() + array([[1, 8], + [3, 4]]) + + >>> mt.where([[0, 1], [1, 0]]).execute() + (array([0, 1]), array([1, 0])) + + >>> x = mt.arange(9.).reshape(3, 3) + >>> mt.where( x > 5 ).execute() + (array([2, 2, 2]), array([0, 1, 2])) + >>> mt.where(x < 5, x, -1).execute() # Note: broadcasting. + array([[ 0., 1., 2.], + [ 3., 4., -1.], + [-1., -1., -1.]]) + + Find the indices of elements of `x` that are in `goodvalues`. + + >>> goodvalues = [3, 4, 7] + >>> ix = mt.isin(x, goodvalues) + >>> ix.execute() + array([[False, False, False], + [ True, True, False], + [False, True, False]]) + >>> mt.where(ix).execute() + (array([1, 1, 2]), array([0, 1, 1])) + """ + if (x is None) != (y is None): + raise ValueError("either both or neither of x and y should be given") + + if x is None and y is None: + return astensor(condition).nonzero() + + x, y = astensor(x), astensor(y) + dtype = np.result_type(x.dtype, y.dtype) + shape = broadcast_shape(x.shape, y.shape) + + if np.isscalar(condition): + return broadcast_to(x if condition else y, shape).astype(dtype) + else: + condition = astensor(condition) + op = TensorWhere(dtype=dtype) + return op(condition, x, y, shape=shape) diff --git a/python/xorbits/_mars/tensor/core.py b/python/xorbits/_mars/tensor/core.py new file mode 100644 index 000000000..c50d052dd --- /dev/null +++ b/python/xorbits/_mars/tensor/core.py @@ -0,0 +1,727 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
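Before the tensor core module, a short NumPy-only sketch of the broadcasting contract that ``where`` above reproduces chunk by chunk, including broadcasting a scalar operand; this is an illustration of the semantics, not of the Mars operand itself.

import numpy as np

cond = np.array([[True, False], [False, True]])
x = np.array([[1, 2], [3, 4]])
y = 0                                   # scalar, broadcast against cond and x

# all three operands are broadcast to a common shape before selection
out = np.where(cond, x, y)
assert np.array_equal(out, np.array([[1, 0], [0, 4]]))

# the 1-D list-comprehension equivalent quoted in the docstring above
c = np.array([True, False, True])
xv = np.array([1, 2, 3])
yv = np.array([9, 8, 7])
expected = np.array([a if flag else b for flag, a, b in zip(c, xv, yv)])
assert np.array_equal(np.where(c, xv, yv), expected)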
+ +import logging +from collections.abc import Iterable +from enum import Enum +from operator import attrgetter +from typing import Any, Dict + +import numpy as np + +from ..core import ( + Chunk, + ChunkData, + HasShapeTileable, + HasShapeTileableData, + OutputType, + _ExecuteAndFetchMixin, + is_build_mode, + register_output_types, +) +from ..core.entity.utils import refresh_tileable_shape +from ..serialization.serializables import ( + AnyField, + DataTypeField, + FieldTypes, + ListField, + ReferenceField, + Serializable, + StringField, + TupleField, +) +from ..utils import on_deserialize_shape, on_serialize_shape +from .utils import fetch_corner_data, get_chunk_slices + +logger = logging.getLogger(__name__) + + +class TensorOrder(Enum): + # C order + C_ORDER = "C" + # Fortran order + F_ORDER = "F" + + +class TensorChunkData(ChunkData): + __slots__ = () + _no_copy_attrs_ = ChunkData._no_copy_attrs_ | {"dtype"} + type_name = "Tensor" + + # required fields + _shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + _order = ReferenceField("order", TensorOrder) + # optional fields + _dtype = DataTypeField("dtype") + + def __init__(self, op=None, index=None, shape=None, dtype=None, order=None, **kw): + if isinstance(order, str): + order = getattr(TensorOrder, order) + super().__init__( + _op=op, _index=index, _shape=shape, _dtype=dtype, _order=order, **kw + ) + if self.order is None and self.op is not None: + if len(self.inputs) == 0: + self._order = TensorOrder.C_ORDER + elif all( + hasattr(inp, "order") and inp.order == TensorOrder.F_ORDER + for inp in self.inputs + ): + self._order = TensorOrder.F_ORDER + else: + self._order = TensorOrder.C_ORDER + + @property + def params(self) -> Dict[str, Any]: + # params return the properties which useful to rebuild a new chunk + return { + "shape": self.shape, + "dtype": self.dtype, + "order": self.order, + "index": self.index, + } + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + params.pop("index", None) # index not needed to update + new_shape = params.pop("shape", None) + if new_shape is not None: + self._shape = new_shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + order = params.pop("order", None) + if order is not None: + self._order = order + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + @classmethod + def get_params_from_data(cls, data: np.ndarray) -> Dict[str, Any]: + from .array_utils import is_cupy + + if not is_cupy(data): + data = np.asarray(data) + order = ( + TensorOrder.C_ORDER if data.flags["C_CONTIGUOUS"] else TensorOrder.F_ORDER + ) + return {"shape": data.shape, "dtype": data.dtype, "order": order} + + def __len__(self): + try: + return self.shape[0] + except IndexError: + if is_build_mode(): + return 0 + raise TypeError("len() of unsized object") + + @property + def shape(self): + return getattr(self, "_shape", None) + + @property + def ndim(self): + return len(self.shape) + + @property + def size(self): + return np.prod(self.shape).item() + + @property + def dtype(self): + return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def nbytes(self): + return np.prod(self.shape) * self.dtype.itemsize + + +class TensorChunk(Chunk): + __slots__ = () + _allow_data_type_ = (TensorChunkData,) + type_name = "Tensor" + + def __len__(self): 
+ return len(self._data) + + +class TensorData(HasShapeTileableData, _ExecuteAndFetchMixin): + __slots__ = () + type_name = "Tensor" + + # required fields + _order = StringField( + "order", on_serialize=attrgetter("value"), on_deserialize=TensorOrder + ) + # optional fields + _dtype = DataTypeField("dtype") + _chunks = ListField( + "chunks", + FieldTypes.reference(TensorChunkData), + on_serialize=lambda x: [it.data for it in x] if x is not None else x, + on_deserialize=lambda x: [TensorChunk(it) for it in x] if x is not None else x, + ) + + def __init__( + self, + op=None, + shape=None, + dtype=None, + order=None, + nsplits=None, + chunks=None, + **kw, + ): + if isinstance(order, str): + order = getattr(TensorOrder, order) + super().__init__( + _op=op, + _shape=shape, + _dtype=dtype, + _order=order, + _nsplits=nsplits, + _chunks=chunks, + **kw, + ) + if self.order is None and self.op is not None: + if len(self.inputs) == 0: + self._order = TensorOrder.C_ORDER + elif all( + hasattr(inp, "order") and inp.order == TensorOrder.F_ORDER + for inp in self.inputs + ): + self._order = TensorOrder.F_ORDER + else: + self._order = TensorOrder.C_ORDER + + def _to_str(self, representation=False): + if is_build_mode() or len(self._executed_sessions) == 0: + # in build mode, or not executed, just return representation + if representation: + return f"Tensor " + else: + return f"Tensor(op={type(self._op).__name__}, shape={self._shape})" + else: + print_options = np.get_printoptions() + threshold = print_options["threshold"] + + corner_data = fetch_corner_data(self, session=self._executed_sessions[-1]) + # if less than default threshold, just set it as default, + # if not, set to corner_data.size - 1 make sure ... exists in repr + threshold = threshold if self.size <= threshold else corner_data.size - 1 + with np.printoptions(threshold=threshold): + corner_str = repr(corner_data) if representation else str(corner_data) + return corner_str + + def __str__(self): + return self._to_str(representation=False) + + def __repr__(self): + return self._to_str(representation=True) + + @property + def params(self): + # params return the properties which useful to rebuild a new tileable object + return {"shape": self.shape, "dtype": self.dtype, "order": self.order} + + @params.setter + def params(self, new_params: Dict[str, Any]): + params = new_params.copy() + shape = params.pop("shape", None) + if shape is not None: + self._shape = shape + dtype = params.pop("dtype", None) + if dtype is not None: + self._dtype = dtype + order = params.pop("order", None) + if order is not None: + self._order = order + if params: # pragma: no cover + raise TypeError(f"Unknown params: {list(params)}") + + def refresh_params(self): + refresh_tileable_shape(self) + if self._dtype is None: + self._dtype = self.chunks[0].dtype + + @property + def flags(self): + c_order = True if self.ndim <= 1 else self.order == TensorOrder.C_ORDER + f_order = True if self.ndim <= 1 else self.order == TensorOrder.F_ORDER + return {"C_CONTIGUOUS": c_order, "F_CONTIGUOUS": f_order} + + @property + def real(self): + from .arithmetic import real + + return real(self) + + @property + def imag(self): + from .arithmetic import imag + + return imag(self) + + @property + def dtype(self): + return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None) + + @property + def order(self): + return getattr(self, "_order", None) + + @property + def nbytes(self): + return np.prod(self.shape) * self.dtype.itemsize + + def get_chunk_slices(self, idx): + return 
get_chunk_slices(self.nsplits, idx) + + def is_scalar(self): + return self.ndim == 0 + + isscalar = is_scalar + + def tosparse(self, missing=None): + if self.issparse(): + return self + + from .datasource import fromdense + + return fromdense(self, missing=missing) + + def todense(self, fill_value=None): + if not self.issparse(): + return self + + from .datasource import fromsparse + + return fromsparse(self, fill_value=fill_value) + + def transpose(self, *axes): + from .base import transpose + + if len(axes) == 1 and isinstance(axes[0], Iterable): + axes = axes[0] + + return transpose(self, axes) + + @property + def T(self): + return self.transpose() + + def reshape(self, shape, *shapes, **kw): + from .reshape import reshape + + order = kw.pop("order", "C") + if kw: + raise TypeError( + f"'{next(iter(kw))}' is an invalid keyword argument for this function" + ) + + if isinstance(shape, Iterable): + shape = tuple(shape) + else: + shape = (shape,) + shape += shapes + + return reshape(self, shape, order=order) + + def totiledb(self, uri, ctx=None, key=None, timestamp=None): + from .datastore import totiledb + + return totiledb(uri, self, ctx=ctx, key=key, timestamp=timestamp) + + @staticmethod + def from_dataframe(in_df): + from .datasource import from_dataframe + + return from_dataframe(in_df) + + def to_dataframe(self, *args, **kwargs): + from ..dataframe.datasource.from_tensor import dataframe_from_tensor + + return dataframe_from_tensor(self, *args, **kwargs) + + @property + def flat(self): + return flatiter(self) + + def to_numpy(self, session=None, **kw): + return self._execute_and_fetch(session=session, **kw) + + +class Tensor(HasShapeTileable): + __slots__ = () + _allow_data_type_ = (TensorData,) + type_name = "Tensor" + + def __len__(self): + return len(self._data) + + @property + def shape(self): + return self._data.shape + + @shape.setter + def shape(self, new_shape): + self._data = self._data.reshape(new_shape).data + + def _update_shape(self, new_shape): + self._data._update_shape(new_shape) + + @property + def real(self): + return self.data.real + + @real.setter + def real(self, new_real): + from .arithmetic.setreal import set_real + + self._data = set_real(self._data, new_real).data + + @property + def imag(self): + return self.data.imag + + @imag.setter + def imag(self, new_imag): + from .arithmetic.setimag import set_imag + + self._data = set_imag(self._data, new_imag).data + + def __array__(self, dtype=None): + return np.asarray(self.to_numpy(), dtype=dtype) + + def __array_function__(self, func, types, args, kwargs): + from .. import tensor as module + + for submodule in func.__module__.split(".")[1:]: + try: + module = getattr(module, submodule) + except AttributeError: + return NotImplemented + if not hasattr(module, func.__name__): + return NotImplemented + mars_func = getattr(module, func.__name__) + if mars_func is func: + # avoid Numpy func + return NotImplemented + return mars_func(*args, **kwargs) + + def view(self): + return self._view() + + @property + def ndim(self): + """ + Number of array dimensions. + + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.array([1, 2, 3]) + >>> x.ndim + 1 + >>> y = mt.zeros((2, 3, 4)) + >>> y.ndim + 3 + """ + return super().ndim + + def transpose(self, *axes): + """ + Returns a view of the tensor with axes transposed. + + For a 1-D tensor, this has no effect. (To change between column and + row vectors, first cast the 1-D tensor into a matrix object.) + For a 2-D tensor, this is the usual matrix transpose. 
+ For an n-D tensor, if axes are given, their order indicates how the + axes are permuted (see Examples). If axes are not provided and + ``a.shape = (i[0], i[1], ... i[n-2], i[n-1])``, then + ``a.transpose().shape = (i[n-1], i[n-2], ... i[1], i[0])``. + + Parameters + ---------- + axes : None, tuple of ints, or `n` ints + + * None or no argument: reverses the order of the axes. + + * tuple of ints: `i` in the `j`-th place in the tuple means `a`'s + `i`-th axis becomes `a.transpose()`'s `j`-th axis. + + * `n` ints: same as an n-tuple of the same ints (this form is + intended simply as a "convenience" alternative to the tuple form) + + Returns + ------- + out : Tensor + View of `a`, with axes suitably permuted. + + See Also + -------- + Tensor.T : Tensor property returning the tensor transposed. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> a.execute() + array([[1, 2], + [3, 4]]) + >>> a.transpose().execute() + array([[1, 3], + [2, 4]]) + >>> a.transpose((1, 0)) + array([[1, 3], + [2, 4]]) + >>> a.transpose(1, 0).execute() + array([[1, 3], + [2, 4]]) + """ + return self._data.transpose(*axes) + + @property + def T(self): + """ + Same as self.transpose(), except that self is returned if + self.ndim < 2. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1.,2.],[3.,4.]]) + >>> x.execute() + array([[ 1., 2.], + [ 3., 4.]]) + >>> x.T.execute() + array([[ 1., 3.], + [ 2., 4.]]) + >>> x = mt.array([1.,2.,3.,4.]) + >>> x.execute() + array([ 1., 2., 3., 4.]) + >>> x.T.execute() + array([ 1., 2., 3., 4.]) + """ + return self._data.T + + def totiledb(self, uri, ctx=None, key=None, timestamp=None): + return self._data.totiledb(uri, ctx=ctx, key=key, timestamp=timestamp) + + def copy(self, order="C"): + return super().copy().astype(self.dtype, order=order, copy=False) + + def sort(self, axis=-1, kind=None, parallel_kind=None, psrs_kinds=None, order=None): + """ + Sort a tensor, in-place. + + Parameters + ---------- + axis : int, optional + Axis along which to sort. Default is -1, which means sort along the + last axis. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. Default is 'quicksort'. + parallel_kind: {'PSRS'}, optional + Parallel sorting algorithm, for the details, refer to: + http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html + psrs_kinds: list with 3 elements, optional + Sorting algorithms during PSRS algorithm. + order : str or list of str, optional + When `a` is a tensor with fields defined, this argument specifies + which fields to compare first, second, etc. A single field can + be specified as a string, and not all fields need be specified, + but unspecified fields will still be used, in the order in which + they come up in the dtype, to break ties. + + See Also + -------- + numpy.sort : Return a sorted copy of a tensor. + argsort : Indirect sort. + lexsort : Indirect stable sort on multiple keys. + searchsorted : Find elements in sorted tensor. + partition: Partial sort. + + Notes + ----- + See ``sort`` for notes on the different sorting algorithms. 
+
+        Examples
+        --------
+        >>> import mars.tensor as mt
+        >>> a = mt.array([[1,4], [3,1]])
+        >>> a.sort(axis=1)
+        >>> a.execute()
+        array([[1, 4],
+               [1, 3]])
+        >>> a.sort(axis=0)
+        >>> a.execute()
+        array([[1, 3],
+               [1, 4]])
+
+        Use the `order` keyword to specify a field to use when sorting a
+        structured tensor:
+
+        >>> a = mt.array([('a', 2), ('c', 1)], dtype=[('x', 'S1'), ('y', int)])
+        >>> a.sort(order='y')
+        >>> a.execute()
+        array([('c', 1), ('a', 2)],
+              dtype=[('x', '|S1'), ('y', '<i8')])
+        """
+        from .base import sort
+
+        self._data = sort(
+            self,
+            axis=axis,
+            kind=kind,
+            parallel_kind=parallel_kind,
+            psrs_kinds=psrs_kinds,
+            order=order,
+        ).data
+
+    def partition(self, kth, axis=-1, kind="introselect", order=None, **kw):
+        """
+        Partially sort a tensor, in-place.
+
+        Rearranges the elements in the tensor in such a way that the value of
+        the element in `kth` position is in the position it would be in a
+        sorted tensor. All elements smaller than the `kth` element are moved
+        before this element and all equal or greater are moved behind it.
+        The ordering of the elements in the two partitions is undefined.
+
+        Parameters
+        ----------
+        kth : int or sequence of ints
+            Element index to partition by. The `kth` element value will be in
+            its final sorted position and all smaller elements will be moved
+            before it and all equal or greater elements behind it. If provided
+            with a sequence of `kth`, it will partition all elements indexed
+            by `kth` of them into their sorted position at once.
+        axis : int, optional
+            Axis along which to sort. Default is -1, which means sort along the
+            last axis.
+        kind : {'introselect'}, optional
+            Selection algorithm. Default is 'introselect'.
+        order : str or list of str, optional
+            When `a` is a tensor with fields defined, this argument specifies
+            which fields to compare first, second, etc. A single field can
+            be specified as a string, and not all fields need be specified,
+            but unspecified fields will still be used, in the order in which
+            they come up in the dtype, to break ties.
+
+        See Also
+        --------
+        numpy.partition : Return a partitioned copy of an array.
+        argpartition : Indirect partition.
+        sort : Full sort.
+
+        Notes
+        -----
+        See ``np.partition`` for notes on the different algorithms.
+
+        Examples
+        --------
+        >>> import mars.tensor as mt
+        >>> a = mt.array([3, 4, 2, 1])
+        >>> a.partition(3)
+        >>> a.execute()
+        array([2, 1, 3, 4])
+
+        >>> a.partition((1, 3))
+        >>> a.execute()
+        array([1, 2, 3, 4])
+        """
+        from .base import partition
+
+        self._data = partition(self, kth, axis=axis, kind=kind, order=order, **kw).data
+
+    @property
+    def flat(self):
+        """
+        Flat iterator object to iterate over arrays.
+
+        A `flatiter` iterator is returned by ``x.flat`` for any tensor `x`.
+        It allows iterating over the tensor as if it were a 1-D array,
+        either in a for-loop or by calling its `next` method.
+
+        Iteration is done in row-major, C-style order (the last
+        index varying the fastest). The iterator can also be indexed using
+        basic slicing or advanced indexing.
+
+        See Also
+        --------
+        Tensor.flat : Return a flat iterator over a tensor.
+        Tensor.flatten : Returns a flattened copy of a tensor.
+
+        Examples
+        --------
+        >>> import mars.tensor as mt
+
+        >>> x = mt.arange(6).reshape(2, 3)
+        >>> fl = x.flat
+
+        >>> fl[2:4].execute()
+        array([2, 3])
+        """
+        return self._data.flat
+
+    def from_dataframe(self, in_df):
+        return self._data.from_dataframe(in_df)
+
+    def to_dataframe(self, *args, **kwargs):
+        return self._data.to_dataframe(*args, **kwargs)
+
+    def to_numpy(self, session=None, **kw):
+        return self._data.to_numpy(session, **kw)
+
+
+SparseTensor = Tensor
+
+
+class flatiter(object):
+    def __init__(self, tensor):
+        # flatten creates a copy
+        self._flatten_tensor = tensor.flatten()
+        # ravel creates a view
+        self._ravel_tensor = tensor.ravel()
+
+    def __getitem__(self, item):
+        # a.flat[item] creates a copy
+        return self._flatten_tensor[item]
+
+    def __setitem__(self, key, value):
+        # a.flat[item] = value will apply changes to the original tensor
+        self._ravel_tensor[key] = value
+
+
+class Indexes(Serializable):
+    indexes = AnyField("indexes")
+
+
+TENSOR_TYPE = (Tensor, TensorData)
+TENSOR_CHUNK_TYPE = (TensorChunk, TensorChunkData)
+
+register_output_types(OutputType.tensor, TENSOR_TYPE, TENSOR_CHUNK_TYPE)
+register_output_types(OutputType.scalar, TENSOR_TYPE, TENSOR_CHUNK_TYPE)
diff --git a/python/xorbits/_mars/tensor/datasource/__init__.py b/python/xorbits/_mars/tensor/datasource/__init__.py
new file mode 100644
index 000000000..74688b13c
--- /dev/null
+++ b/python/xorbits/_mars/tensor/datasource/__init__.py
@@ -0,0 +1,45 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
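+
+# Illustrative sketch (not part of the upstream module): the constructors
+# re-exported below all build tensors lazily; calling ``.execute()``
+# materializes the result, e.g.
+#
+#     >>> import mars.tensor as mt
+#     >>> mt.arange(3).execute()
+#     array([0, 1, 2])
+#     >>> mt.eye(2, dtype=int).execute()
+#     array([[1, 0],
+#            [0, 1]])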
+ + +from .arange import TensorArange, arange +from .array import ( + ArrayDataSource, + CSRMatrixDataSource, + array, + asarray, + ascontiguousarray, + asfortranarray, + tensor, +) +from .diag import TensorDiag, diag +from .diagflat import diagflat +from .empty import TensorEmpty, TensorEmptyLike, empty, empty_like +from .eye import TensorEye, eye +from .from_dataframe import TensorFromDataFrame, from_dataframe, from_series +from .from_dense import DenseToSparse, fromdense +from .from_hdf5 import TensorHDF5DataSource, fromhdf5 +from .from_sparse import SparseToDense, fromsparse +from .from_tiledb import TensorTileDBDataSource, fromtiledb +from .from_vineyard import TensorFromVineyard, TensorFromVineyardChunk, fromvineyard +from .from_zarr import TensorFromZarr, fromzarr +from .full import TensorFull, TensorFullLike, full, full_like +from .identity import identity +from .indices import TensorIndices, indices +from .linspace import TensorLinspace, linspace +from .meshgrid import meshgrid +from .ones import TensorOnes, TensorOnesLike, ones, ones_like +from .scalar import Scalar, scalar +from .tri import TensorTril, TensorTriu, tril, triu +from .zeros import TensorZeros, TensorZerosLike, zeros, zeros_like diff --git a/python/xorbits/_mars/tensor/datasource/arange.py b/python/xorbits/_mars/tensor/datasource/arange.py new file mode 100644 index 000000000..62975ebc8 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/arange.py @@ -0,0 +1,219 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...serialization.serializables import AnyField +from ..array_utils import create_array +from ..utils import decide_chunk_sizes +from .core import TensorNoInput + + +class TensorArange(TensorNoInput): + _op_type_ = OperandDef.TENSOR_ARANGE + + _start = AnyField("start") + _stop = AnyField("stop") + _step = AnyField("step") + + def __init__(self, start=None, stop=None, step=None, dtype=None, **kw): + if dtype is not None: + dtype = np.dtype(dtype) + elif stop is not None and step is not None: + dtype = ( + np.dtype(dtype) + if dtype is not None + else np.arange(0, type(stop)(1), step).dtype + ) + super().__init__(_start=start, _stop=stop, _step=step, dtype=dtype, **kw) + + @property + def start(self): + return self._start + + @property + def stop(self): + return self._stop + + @property + def step(self): + return self._step + + def to_chunk_op(self, *args): + op = self.copy().reset_key() + start, stop, step = args + op._start = start + op._stop = stop + op._step = step + return op + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + chunk_length = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_length = decide_chunk_sizes( + tensor.shape, chunk_length, tensor.dtype.itemsize + ) + + start, stop, step = op.start, op.stop, op.step # noqa: F841 + + out_chunks = [] + n_elem = 0 + for i, cs in enumerate(chunk_length[0]): + chunk_start = start + n_elem * step + chunk_stop = start + (n_elem + cs) * step + chunk_size = max(int(np.ceil((chunk_stop - chunk_start) / step)), 0) + if chunk_size > cs: + chunk_stop -= step + chunk_shape = (cs,) + chunk_idx = (i,) + chunk_op = op.to_chunk_op(chunk_start, chunk_stop, step) + out_chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx) + n_elem += cs + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=chunk_length, + ) + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = create_array(op)( + "arange", op.start, op.stop, op.step, dtype=op.dtype + ) + + +def arange(*args, **kwargs): + """ + Return evenly spaced values within a given interval. + + Values are generated within the half-open interval ``[start, stop)`` + (in other words, the interval including `start` but excluding `stop`). + For integer arguments the function is equivalent to the Python built-in + `range `_ function, + but returns a tensor rather than a list. + + When using a non-integer step, such as 0.1, the results will often not + be consistent. It is better to use ``linspace`` for these cases. + + Parameters + ---------- + start : number, optional + Start of interval. The interval includes this value. The default + start value is 0. + stop : number + End of interval. The interval does not include this value, except + in some cases where `step` is not an integer and floating point + round-off affects the length of `out`. + step : number, optional + Spacing between values. For any output `out`, this is the distance + between two adjacent values, ``out[i+1] - out[i]``. The default + step size is 1. If `step` is specified as a position argument, + `start` must also be given. + dtype : dtype + The type of the output tensor. If `dtype` is not given, infer the data + type from the other input arguments. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + + Returns + ------- + arange : Tensor + Tensor of evenly spaced values. 
+ + For floating point arguments, the length of the result is + ``ceil((stop - start)/step)``. Because of floating point overflow, + this rule may result in the last element of `out` being greater + than `stop`. + + See Also + -------- + linspace : Evenly spaced numbers with careful handling of endpoints. + ogrid: Tensors of evenly spaced numbers in N-dimensions. + mgrid: Grid-shaped tensors of evenly spaced numbers in N-dimensions. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.arange(3).execute() + array([0, 1, 2]) + >>> mt.arange(3.0).execute() + array([ 0., 1., 2.]) + >>> mt.arange(3,7).execute() + array([3, 4, 5, 6]) + >>> mt.arange(3,7,2).execute() + array([3, 5]) + """ + kw_args = [kwargs.get("start"), kwargs.get("stop"), kwargs.get("step")] + kw_def = any(arg is not None for arg in kw_args) + dtype = None + if not kw_def: + if len(args) == 1: + start = 0 + stop = args[0] + step = 1 + elif len(args) == 2: + start = args[0] + stop = args[1] + step = 1 + elif len(args) == 3: + start, stop, step = args + elif len(args) == 4: + start, stop, step, dtype = args + dtype = np.dtype(dtype) + else: + raise TypeError("Required argument 'start' (pos 1) not found") + else: + names = "start", "stop", "step" + for i, arg in enumerate(args): + if kw_args[i] is not None: + raise TypeError( + f"Argument given by name ('{names[i]}') and position ({i})" + ) + kw_args[i] = arg + start, stop, step = kw_args + + if dtype is None: + if "dtype" in kwargs: + dtype = np.dtype(kwargs["dtype"]) + else: + dtype = np.arange(0, type(stop)(1), step).dtype + + start, stop = dtype.type(start), dtype.type(stop) + if dtype == np.datetime64 and not start: + raise ValueError( + "arange requires both a start and a stop for Mars datetime64 ranges" + ) + if dtype == np.datetime64: + span = np.array([stop - start]) + span[0] = step + step = span[0] + dtype = np.dtype(stop.dtype) + else: + step = dtype.type(step) + size = max(int(np.ceil(np.true_divide(stop - start, step))), 0) + + op = TensorArange(start, stop, step, dtype=dtype, gpu=kwargs.get("gpu", False)) + shape = (size,) + return op(shape, chunk_size=kwargs.pop("chunk_size", None)) diff --git a/python/xorbits/_mars/tensor/datasource/array.py b/python/xorbits/_mars/tensor/datasource/array.py new file mode 100644 index 000000000..0342bba82 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/array.py @@ -0,0 +1,437 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import cp, cps, get_array_module, issparse, sps +from ...serialization.serializables import ( + AnyField, + FieldTypes, + NDArrayField, + TupleField, +) +from ...utils import on_deserialize_shape, on_serialize_shape +from ..array_utils import array_module, is_array, is_cupy +from ..core import TENSOR_TYPE, Tensor, TensorData, TensorOrder +from ..utils import get_chunk_slices +from .core import TensorNoInput +from .scalar import scalar + + +class ArrayDataSource(TensorNoInput): + """ + Represents data from numpy or cupy array + """ + + _op_type_ = OperandDef.TENSOR_DATA_SOURCE + + data = NDArrayField("data") + chunk_size = AnyField("chunk_size") + + def __init__(self, data=None, dtype=None, gpu=None, **kw): + if dtype is not None: + dtype = np.dtype(dtype) + elif data is not None: + dtype = np.dtype(data.dtype) + + if gpu is None and is_cupy(data): # pragma: no cover + gpu = True + + super().__init__(data=data, dtype=dtype, gpu=gpu, **kw) + + def to_chunk_op(self, *args): + _, idx, chunk_size = args + chunk_op = self.copy().reset_key() + chunk_op.data = self.data[get_chunk_slices(chunk_size, idx)].astype( + chunk_op.dtype, order=self.outputs[0].order.value, copy=False + ) + chunk_op.chunk_size = None + + return chunk_op + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = array_module(op.gpu).asarray(op.data) + + +class CSRMatrixDataSource(TensorNoInput): + """ + Represents data from sparse array include scipy sparse or cupy sparse matrix. + """ + + _op_type_ = OperandDef.SPARSE_MATRIX_DATA_SOURCE + + indices = NDArrayField("indices") + indptr = NDArrayField("indptr") + data = NDArrayField("data") + shape = TupleField( + "shape", + FieldTypes.int64, + on_serialize=on_serialize_shape, + on_deserialize=on_deserialize_shape, + ) + chunk_size = AnyField("chunk_size") + + def __init__(self, data=None, **kw): + kw["sparse"] = True + if is_cupy(data): # pragma: no cover + kw["gpu"] = True + super().__init__(data=data, **kw) + + def to_chunk_op(self, *args): + _, idx, chunk_size = args + + xps = cps if self.gpu else sps + if len(self.shape) == 1: + shape = (1, self.shape[0]) + else: + shape = self.shape + data = xps.csr_matrix((self.data, self.indices, self.indptr), shape) + chunk_data = data[get_chunk_slices(chunk_size, idx)] + + chunk_op = self.copy().reset_key() + chunk_op.data = chunk_data.data + chunk_op.indices = chunk_data.indices + chunk_op.indptr = chunk_data.indptr + chunk_shape = chunk_data.shape[1:] if len(self.shape) == 1 else chunk_data.shape + chunk_op.shape = chunk_shape + + return chunk_op + + @classmethod + def execute(cls, ctx, op: "CSRMatrixDataSource"): + xps = cps if op.gpu else sps + chunk_shape = (1, op.shape[0]) if op.outputs[0].ndim == 1 else op.shape + ctx[op.outputs[0].key] = SparseNDArray( + xps.csr_matrix((op.data, op.indices, op.indptr), shape=chunk_shape), + shape=op.shape, + ) + + +def _from_spmatrix(spmatrix, dtype=None, chunk_size=None, gpu=None): + if gpu is None: + m = get_array_module(spmatrix) + if cp is not None and m is cp: + gpu = True + elif cp is np: + gpu = False + if dtype and spmatrix.dtype != dtype: + spmatrix = spmatrix.astype(dtype) + spmatrix = spmatrix.tocsr() + op = CSRMatrixDataSource( + indices=spmatrix.indices, + indptr=spmatrix.indptr, + data=spmatrix.data, + shape=spmatrix.shape, + dtype=spmatrix.dtype, + gpu=gpu, + chunk_size=chunk_size, + ) + return op(spmatrix.shape, chunk_size=chunk_size) + + +def tensor( + data=None, 
dtype=None, order="K", chunk_size=None, gpu=None, sparse=False +) -> Tensor: + order = order or "K" + if isinstance(data, TENSOR_TYPE): + if isinstance(data, TensorData): + data = Tensor(data) + return data.astype(dtype or data.dtype, order=order, copy=False) + elif ( + isinstance(data, (tuple, list)) + and len(data) > 0 + and all(isinstance(d, TENSOR_TYPE) for d in data) + ): + from ..merge import stack + + data = stack(data) + return data.astype(dtype or data.dtype, order=order, copy=False) + elif np.isscalar(data): + return scalar(data, dtype=dtype) + elif issparse(data): + return _from_spmatrix(data, dtype=dtype, chunk_size=chunk_size, gpu=gpu) + elif hasattr(data, "__mars_tensor__"): + return data.__mars_tensor__(dtype=dtype, order=order) + else: + m = get_array_module(data) + try: + data = m.asarray(data, dtype=dtype, order=order) + except ValueError: + arr = data.__array__(dtype=dtype) + if isinstance(arr, TENSOR_TYPE): + return arr.astype(arr.dtype, order=order, copy=False) + raise + if gpu is None: + if cp is not None and m is cp: + gpu = True + + if is_array(data): + if data.ndim == 0: + return scalar(data.item(), dtype=dtype) + tensor_order = ( + TensorOrder.C_ORDER if data.flags["C_CONTIGUOUS"] else TensorOrder.F_ORDER + ) + op = ArrayDataSource(data, dtype=dtype, gpu=gpu, chunk_size=chunk_size) + t = op(data.shape, chunk_size=chunk_size, order=tensor_order) + if sparse and not t.issparse(): + return t.tosparse() + return t + else: + raise ValueError(f"Cannot create tensor by given data: {data}") + + +def array(x, dtype=None, copy=True, order="K", ndmin=None, chunk_size=None): + """ + Create a tensor. + + Parameters + ---------- + object : array_like + An array, any object exposing the array interface, an object whose + __array__ method returns an array, or any (nested) sequence. + dtype : data-type, optional + The desired data-type for the array. If not given, then the type will + be determined as the minimum type required to hold the objects in the + sequence. This argument can only be used to 'upcast' the array. For + downcasting, use the .astype(t) method. + copy : bool, optional + If true (default), then the object is copied. Otherwise, a copy will + only be made if __array__ returns a copy, if obj is a nested sequence, + or if a copy is needed to satisfy any of the other requirements + (`dtype`, `order`, etc.). + order : {'K', 'A', 'C', 'F'}, optional + Specify the memory layout of the array. If object is not an array, the + newly created array will be in C order (row major) unless 'F' is + specified, in which case it will be in Fortran order (column major). + If object is an array the following holds. + + ===== ========= =================================================== + order no copy copy=True + ===== ========= =================================================== + 'K' unchanged F & C order preserved, otherwise most similar order + 'A' unchanged F order if input is F and not C, otherwise C order + 'C' C order C order + 'F' F order F order + ===== ========= =================================================== + + When ``copy=False`` and a copy is made for other reasons, the result is + the same as if ``copy=True``, with some exceptions for `A`, see the + Notes section. The default order is 'K'. + ndmin : int, optional + Specifies the minimum number of dimensions that the resulting + array should have. Ones will be prepended to the shape as + needed to meet this requirement. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. 
+ + Returns + ------- + out : Tensor + An tensor object satisfying the specified requirements. + + See Also + -------- + empty, empty_like, zeros, zeros_like, ones, ones_like, full, full_like + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.array([1, 2, 3]).execute() + array([1, 2, 3]) + + Upcasting: + + >>> mt.array([1, 2, 3.0]).execute() + array([ 1., 2., 3.]) + + More than one dimension: + + >>> mt.array([[1, 2], [3, 4]]).execute() + array([[1, 2], + [3, 4]]) + + Minimum dimensions 2: + + >>> mt.array([1, 2, 3], ndmin=2).execute() + array([[1, 2, 3]]) + + Type provided: + + >>> mt.array([1, 2, 3], dtype=complex).execute() + array([ 1.+0.j, 2.+0.j, 3.+0.j]) + + """ + raw_x = x + order = order or "K" + x = tensor(x, dtype=dtype, order=order, chunk_size=chunk_size) + while ndmin is not None and x.ndim < ndmin: + x = x[np.newaxis] + + if copy and x is raw_x: + x = x.copy(order=order) + elif ( + not copy + and isinstance(raw_x, TENSOR_TYPE) + and raw_x.dtype == x.dtype + and raw_x.order == x.order + and raw_x.shape == x.shape + and raw_x is not x + and hasattr(raw_x, "data") + ): + raw_x.data = x.data + + return x + + +def asarray(x, dtype=None, order=None, chunk_size=None): + """Convert the input to an array. + + Parameters + ---------- + a : array_like + Input data, in any form that can be converted to a tensor. This + includes lists, lists of tuples, tuples, tuples of tuples, tuples + of lists and tensors. + dtype : data-type, optional + By default, the data-type is inferred from the input data. + order : {'C', 'F'}, optional + Whether to use row-major (C-style) or + column-major (Fortran-style) memory representation. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. + + Returns + ------- + out : Tensor + Tensor interpretation of `a`. No copy is performed if the input + is already an ndarray with matching dtype and order. If `a` is a + subclass of ndarray, a base class ndarray is returned. + + See Also + -------- + ascontiguousarray : Convert input to a contiguous tensor. + asfortranarray : Convert input to a tensor with column-major + memory order. + + Examples + -------- + Convert a list into a tensor: + + >>> import mars.tensor as mt + + >>> a = [1, 2] + >>> mt.asarray(a).execute() + array([1, 2]) + + Existing arrays are not copied: + + >>> a = mt.array([1, 2]) + >>> mt.asarray(a) is a + True + + If `dtype` is set, array is copied only if dtype does not match: + + >>> a = mt.array([1, 2], dtype=mt.float32) + >>> mt.asarray(a, dtype=mt.float32) is a + True + >>> mt.asarray(a, dtype=mt.float64) is a + False + """ + return array(x, dtype=dtype, copy=False, order=order, chunk_size=chunk_size) + + +def ascontiguousarray(a, dtype=None, chunk_size=None): + """ + Return a contiguous tensor (ndim >= 1) in memory (C order). + + Parameters + ---------- + a : array_like + Input tensor. + dtype : str or dtype object, optional + Data-type of returned tensor. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. + + Returns + ------- + out : Tensor + Contiguous tensor of same shape and content as `a`, with type `dtype` + if specified. + + See Also + -------- + asfortranarray : Convert input to a tensor with column-major + memory order. + Tensor.flags : Information about the memory layout of the tensor. 
+ + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.arange(6).reshape(2,3) + >>> mt.ascontiguousarray(x, dtype=mt.float32) + array([[ 0., 1., 2.], + [ 3., 4., 5.]], dtype=float32) + >>> x.flags['C_CONTIGUOUS'] + True + + Note: This function returns a tensor with at least one-dimension (1-d) + so it will not preserve 0-d tensors. + + """ + + return array(a, dtype, copy=False, order="C", ndmin=1, chunk_size=chunk_size) + + +def asfortranarray(a, dtype=None, chunk_size=None): + """ + Return a tensor (ndim >= 1) laid out in Fortran order in memory. + + Parameters + ---------- + a : array_like + Input tensor. + dtype : str or dtype object, optional + By default, the data-type is inferred from the input data. + chunk_size: int, tuple, optional + Specifies chunk size for each dimension. + + Returns + ------- + out : Tensor + The input `a` in Fortran, or column-major, order. + + See Also + -------- + ascontiguousarray : Convert input to a contiguous (C order) tensor. + + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.arange(6).reshape(2,3) + >>> y = mt.asfortranarray(x) + >>> x.flags['F_CONTIGUOUS'] + False + >>> y.flags['F_CONTIGUOUS'] + True + + Note: This function returns a tensor with at least one-dimension (1-d) + so it will not preserve 0-d tensors. + + """ + return array(a, dtype, copy=False, order="F", ndmin=1, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/datasource/core.py b/python/xorbits/_mars/tensor/datasource/core.py new file mode 100644 index 000000000..00f0dab1d --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/core.py @@ -0,0 +1,205 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import itertools + +import numpy as np + +from ...config import options +from ...serialization.serializables import FieldTypes, StringField, TupleField +from ..core import TensorOrder +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_chunk_sizes, normalize_shape + + +class TensorDataSource(TensorOperand, TensorOperandMixin): + """ + Tensor data source base class, provide universal tile logic, + subclass can overwrite tile method. 
+ """ + + __slots__ = () + + def to_chunk_op(self, *args): + chunk_shape = args[0] + chunk_op = self.copy().reset_key() + chunk_op.extra_params = {"size": chunk_shape} # to make op key different + return chunk_op + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + chunk_size_idxes = (range(len(size)) for size in chunk_size) + + out_chunks = [] + for chunk_shape, chunk_idx in zip( + itertools.product(*chunk_size), itertools.product(*chunk_size_idxes) + ): + chunk_op = op.to_chunk_op(chunk_shape, chunk_idx, chunk_size) + out_chunk = chunk_op.new_chunk( + None, shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + chunks=out_chunks, + nsplits=chunk_size, + order=tensor.order, + **tensor.extra_params + ) + + +class TensorNoInput(TensorDataSource): + """ + Tensor operand with no inputs. + """ + + def check_inputs(self, inputs): + # no inputs + if inputs and len(inputs) > 0: + raise ValueError("Tensor data source has no inputs") + + def _new_chunks(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + shape = kw.get("shape", None) + self.extra_params[ + "shape" + ] = shape # set shape to make the operand key different + return super()._new_tileables(inputs, kws=kws, **kw) + + def __call__(self, shape, chunk_size=None, order=None): + shape = normalize_shape(shape) + order = TensorOrder.C_ORDER if order is None else order + return self.new_tensor(None, shape, raw_chunk_size=chunk_size, order=order) + + +class TensorHasInput(TensorDataSource): + """ + Tensor operand with a single input. 
+ """ + + @property + def input(self): + return self._input + + def check_inputs(self, inputs): + # no inputs + if len(inputs) != 1: + raise ValueError("Tensor can only have 1 input") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + output = op.outputs[0] + + out_chunks = [] + for c in op.input.chunks: + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([c], shape=c.shape, index=c.index, order=output.order) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + chunks=out_chunks, + nsplits=op.input.nsplits, + ) + + def __call__(self, a, order=None): + order = a.order if order is None else order + return self.new_tensor([a], a.shape, order=order) + + +class TensorLike(TensorHasInput): + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.dtype is None: + self.dtype = self.input.dtype + if self.gpu is None: + self.gpu = self.input.op.gpu + + # FIXME: remove when cupy supports other dtypes + if self.gpu and self.dtype not in (np.float32, np.float64): + raise NotImplementedError( + "Sparse tensor on GPU only supports float32 and float64" + ) + + +class TensorFromHDF5Like(TensorNoInput): + _filename = StringField("filename") + _group = StringField("group") + _dataset = StringField("dataset") + _axis_offsets = TupleField("axis_offsets", FieldTypes.int64) + + def __init__(self, filename=None, group=None, dataset=None, **kw): + super().__init__(_filename=filename, _group=group, _dataset=dataset, **kw) + + @property + def filename(self): + return self._filename + + @property + def group(self): + return self._group + + @property + def dataset(self): + return self._dataset + + @property + def axis_offsets(self): + return self._axis_offsets + + @property + def path(self): + return self.get_path(self.group, self.dataset) + + def to_chunk_op(self, *args): + _, chunk_index, nsplits = args + chunk_op = super().to_chunk_op(*args) + cum_offsets = [[0] + np.cumsum(ns).tolist() for ns in nsplits] + axis_offsets = [] + for axis, idx in enumerate(chunk_index): + axis_offsets.append(cum_offsets[axis][idx]) + chunk_op._axis_offsets = tuple(axis_offsets) + return chunk_op + + @staticmethod + def get_path(group, dataset): + paths = [] + if group: + paths.append(group) + paths.append(dataset) + return "/".join(paths) diff --git a/python/xorbits/_mars/tensor/datasource/diag.py b/python/xorbits/_mars/tensor/datasource/diag.py new file mode 100644 index 000000000..8362e7896 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/diag.py @@ -0,0 +1,297 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib import sparse +from ...lib.sparse import diag as sparse_diag +from ...lib.sparse.core import get_array_module, get_sparse_module, issparse +from ...serialization.serializables import Int32Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import create_array +from ..core import TENSOR_TYPE, TensorOrder +from .array import tensor +from .core import TensorHasInput +from .zeros import TensorZeros + + +def _get_diag_shape(v_shape, k): + size_0, size_1 = 0, 0 + if k > 0: + size_1 += k + elif k < 0: + size_0 -= k + size = min(v_shape[0] - size_0, v_shape[1] - size_1) + return (size,) + + +class TensorDiagBase: + __slots__ = () + + def to_chunk_op(self, *args): + op = self.copy().reset_key() + (k,) = args + op._k = k + return op + + @classmethod + def _get_nsplits(cls, op): + raise NotImplementedError + + @classmethod + def _get_chunk(cls, op, chunk_k, chunk_shape, chunk_idx): + raise NotImplementedError + + @classmethod + def tile(cls, op): + if op.inputs: + if has_unknown_shape(*op.inputs): + yield + tensor = op.outputs[0] + + # op can be TensorDiag or TensorEye + k = op.k + nsplits = op._get_nsplits(op) + + fx = lambda x, y: x - y + k + cum_size = [np.cumsum(s).tolist() for s in nsplits] + out_chunks = [] + for out_idx in itertools.product(*[range(len(s)) for s in nsplits]): + i, j = out_idx + ld_pos = cum_size[0][i] - 1, cum_size[1][j] - nsplits[1][j] + ru_pos = cum_size[0][i] - nsplits[0][i], cum_size[1][j] - 1 + + ld_fx = fx(*ld_pos) + ru_fx = fx(*ru_pos) + + chunk_shape = (nsplits[0][i], nsplits[1][j]) + if (ld_fx > 0 and ru_fx > 0) or (ld_fx < 0 and ru_fx < 0): + # does not cross, fill with zeros + chunk_op = TensorZeros( + dtype=op.dtype, + gpu=op.gpu, + sparse=op.sparse, + shape=chunk_shape, + order=tensor.order.value, + ) + chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=out_idx) + else: + lu_pos = ru_pos[0], ld_pos[1] + chunk_k = fx(*lu_pos) + chunk = op._get_chunk(op, chunk_k, chunk_shape, out_idx) + + out_chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + +class TensorDiag(TensorDiagBase, TensorHasInput): + _op_type_ = OperandDef.TENSOR_DIAG + + _input = KeyField("input") + _k = Int32Field("k") + + def __init__(self, k=None, **kw): + super().__init__(_k=k, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.dtype is None: + self._dtype = self.input.dtype + + def to_chunk_op(self, *args): + return TensorDiagBase.to_chunk_op(self, *args) + + @classmethod + def _get_nsplits(cls, op): + assert op.input.ndim == 1 + k = op.k + nsplits_1d = op.input.nsplits[0] + nsplit_0, nsplit_1 = list(nsplits_1d), list(nsplits_1d) + if k > 0: + nsplit_0.append(k) + nsplit_1.insert(0, k) + elif k < 0: + nsplit_0.insert(0, abs(k)) + nsplit_1.append(abs(k)) + return nsplit_0, nsplit_1 + + @classmethod + def _get_chunk(cls, op, chunk_k, chunk_shape, chunk_idx): + assert chunk_shape[0] == chunk_shape[1] + input_idx = chunk_idx[1] if op.k < 0 else chunk_idx[0] + input_chunk = op.inputs[0].cix[input_idx,] + op = TensorDiag(k=chunk_k, dtype=op.dtype, gpu=op.gpu, sparse=op.sparse) + return op.new_chunk([input_chunk], shape=chunk_shape, index=chunk_idx) + + def __call__(self, v, shape, chunk_size=None): + return self.new_tensor( + [v], shape, raw_chunk_size=chunk_size, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + v = op.input + k = op.k + idx = itertools.count(0) + if 
v.ndim == 2: + if has_unknown_shape(*op.inputs): + yield + chunks = [] + nsplit = [] + + fx = lambda x, y: x - y + k + in_nsplits = v.nsplits + cum_size = [np.cumsum(s).tolist() for s in in_nsplits] + for c in v.chunks: + i, j = c.index + ld_pos = cum_size[0][i] - 1, cum_size[1][j] - in_nsplits[1][j] + ru_pos = cum_size[0][i] - in_nsplits[0][i], cum_size[1][j] - 1 + + ld_fx = fx(*ld_pos) + ru_fx = fx(*ru_pos) + + if (ld_fx > 0 and ru_fx > 0) or (ld_fx < 0 and ru_fx < 0): + continue + + lu_pos = ru_pos[0], ld_pos[1] + chunk_k = fx(*lu_pos) + + chunk_shape = _get_diag_shape(c.shape, chunk_k) + chunk_idx = (next(idx),) + chunk_op = op.to_chunk_op(chunk_k) + chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=chunk_idx, order=tensor.order + ) + nsplit.append(chunk_shape[0]) + chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + order=tensor.order, + chunks=chunks, + nsplits=(tuple(nsplit),), + ) + else: + return (yield from super().tile(op)) + + @property + def k(self): + return getattr(self, "_k", 0) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.sparse: + ctx[chunk.key] = sparse.diag(ctx[op.inputs[0].key], k=op.k, gpu=op.gpu) + else: + ctx[chunk.key] = create_array(op)("diag", ctx[op.inputs[0].key], k=op.k) + + +def diag(v, k=0, sparse=None, gpu=None, chunk_size=None): + """ + Extract a diagonal or construct a diagonal tensor. + + See the more detailed documentation for ``mt.diagonal`` if you use this + function to extract a diagonal and wish to write to the resulting tensor + + Parameters + ---------- + v : array_like + If `v` is a 2-D tensor, return its `k`-th diagonal. + If `v` is a 1-D tensor, return a 2-D tensor with `v` on the `k`-th + diagonal. + k : int, optional + Diagonal in question. The default is 0. Use `k>0` for diagonals + above the main diagonal, and `k<0` for diagonals below the main + diagonal. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + out : Tensor + The extracted diagonal or constructed diagonal tensor. + + See Also + -------- + diagonal : Return specified diagonals. + diagflat : Create a 2-D array with the flattened input as a diagonal. + trace : Sum along diagonals. + triu : Upper triangle of a tensor. + tril : Lower triangle of a tensor. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(9).reshape((3,3)) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + + >>> mt.diag(x).execute() + array([0, 4, 8]) + >>> mt.diag(x, k=1).execute() + array([1, 5]) + >>> mt.diag(x, k=-1).execute() + array([3, 7]) + + >>> mt.diag(mt.diag(x)).execute() + array([[0, 0, 0], + [0, 4, 0], + [0, 0, 8]]) + + """ + if not isinstance(v, TENSOR_TYPE): + tensor_v = tensor(v) + if tensor_v.issparse(): + xps = get_sparse_module(tensor_v.data) + v = xps.csr_matrix( + (tensor_v.op.data, tensor_v.op.indices, tensor_v.op.indptr), + tensor_v.shape, + ) + diag_v = sparse_diag(v, k=k) + else: + v = tensor(v).op.data + diag_v = get_array_module(v).diag(v, k=k) + sparse = sparse if sparse is not None else issparse(v) + return tensor(diag_v, gpu=gpu, sparse=sparse, chunk_size=chunk_size) + + sparse = sparse if sparse is not None else v.issparse() + + if v.ndim == 1: + shape = (v.size + abs(k),) * 2 + elif v.ndim == 2: + shape = _get_diag_shape(v.shape, k) + else: + raise ValueError("Input must be 1- or 2-d.") + + op = TensorDiag(k, dtype=v.dtype, gpu=gpu, sparse=sparse) + return op(v, shape) diff --git a/python/xorbits/_mars/tensor/datasource/diagflat.py b/python/xorbits/_mars/tensor/datasource/diagflat.py new file mode 100644 index 000000000..b66bab858 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/diagflat.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core import Tensor +from .array import tensor as astensor +from .diag import diag + + +def diagflat(v, k=0, sparse=None, gpu=None, chunk_size=None): + """ + Create a two-dimensional tensor with the flattened input as a diagonal. + + Parameters + ---------- + v : array_like + Input data, which is flattened and set as the `k`-th + diagonal of the output. + k : int, optional + Diagonal to set; 0, the default, corresponds to the "main" diagonal, + a positive (negative) `k` giving the number of the diagonal above + (below) the main. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + out : Tensor + The 2-D output tensor. + + See Also + -------- + diag : MATLAB work-alike for 1-D and 2-D tensors. + diagonal : Return specified diagonals. + trace : Sum along diagonals. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.diagflat([[1,2], [3,4]]).execute() + array([[1, 0, 0, 0], + [0, 2, 0, 0], + [0, 0, 3, 0], + [0, 0, 0, 4]]) + + >>> mt.diagflat([1,2], 1).execute() + array([[0, 1, 0], + [0, 0, 2], + [0, 0, 0]]) + + """ + if not isinstance(v, Tensor): + v = astensor(v).op.data + return diag(v.flatten(), k=k, sparse=sparse, gpu=gpu, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/datasource/empty.py b/python/xorbits/_mars/tensor/datasource/empty.py new file mode 100644 index 000000000..fa0d3df04 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/empty.py @@ -0,0 +1,213 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import KeyField, StringField +from ..array_utils import create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorEmptyBase(object): + __slots__ = () + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._gen_rand() + + def _gen_rand(self): + if getattr(self, "_rand", None) is None: + self._obj_set("_rand", np.random.random()) + + def to_chunk_op(self, *args): + op = self.copy().reset_key() + op._rand = None + op._gen_rand() + return op + + +class TensorEmpty(TensorEmptyBase, TensorNoInput): + __slots__ = ("_rand",) + _op_type_ = OperandDef.TENSOR_EMPTY + + _order = StringField("order") + + def __init__(self, dtype=None, order=None, **kw): + dtype = np.dtype(dtype or "f8") + super().__init__(dtype=dtype, _order=order, **kw) + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + ctx[chunk.key] = create_array(op)( + "empty", chunk.shape, dtype=op.dtype, order=op.order + ) + + +def empty(shape, dtype=None, chunk_size=None, gpu=None, order="C"): + """ + Return a new tensor of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int + Shape of the empty tensor + dtype : data-type, optional + Desired output data-type. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + order : {'C', 'F'}, optional, default: 'C' + Whether to store multi-dimensional data in row-major + (C-style) or column-major (Fortran-style) order in + memory. + + Returns + ------- + out : Tensor + Tensor of uninitialized (arbitrary) data of the given shape, dtype, and + order. Object arrays will be initialized to None. + + See Also + -------- + empty_like, zeros, ones + + Notes + ----- + `empty`, unlike `zeros`, does not set the array values to zero, + and may therefore be marginally faster. 
On the other hand, it requires + the user to manually set all the values in the array, and should be + used with caution. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.empty([2, 2]).execute() + array([[ -9.74499359e+001, 6.69583040e-309], + [ 2.13182611e-314, 3.06959433e-309]]) #random + >>> mt.empty([2, 2], dtype=int).execute() + array([[-1073741821, -1067949133], + [ 496041986, 19249760]]) #random + """ + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = TensorEmpty(dtype=dtype, gpu=gpu, order=order) + return op(shape, chunk_size=chunk_size, order=tensor_order) + + +class TensorEmptyLike(TensorEmptyBase, TensorLike): + __slots__ = ("_rand",) + _op_type_ = OperandDef.TENSOR_EMPTY_LIKE + + _input = KeyField("input") + _order = StringField("order") + + def __init__(self, dtype=None, gpu=None, sparse=False, order=None, **kw): + dtype = np.dtype(dtype) if dtype is not None else None + super().__init__(dtype=dtype, gpu=gpu, _order=order, sparse=sparse, **kw) + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.issparse(): + in_data = naked(ctx[op.inputs[0].key]) + xps = get_sparse_module(in_data) + xp = get_array_module(in_data) + ctx[chunk.key] = SparseNDArray( + xps.csr_matrix( + ( + xp.empty_like(in_data.data, dtype=op.dtype), + in_data.indices, + in_data.indptr, + ), + shape=in_data.shape, + ) + ) + else: + ctx[chunk.key] = create_array(op)( + "empty_like", ctx[op.inputs[0].key], dtype=op.dtype, order=op.order + ) + + +def empty_like(a, dtype=None, gpu=None, order="K"): + """ + Return a new tensor with the same shape and type as a given tensor. + + Parameters + ---------- + a : array_like + The shape and data-type of `a` define these same attributes of the + returned tensor. + dtype : data-type, optional + Overrides the data type of the result. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + order : {'C', 'F', 'A', or 'K'}, optional + Overrides the memory layout of the result. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if ``prototype`` is Fortran + contiguous, 'C' otherwise. 'K' means match the layout of ``prototype`` + as closely as possible. + + Returns + ------- + out : Tensor + Array of uninitialized (arbitrary) data with the same + shape and type as `a`. + See Also + -------- + ones_like : Return a tensor of ones with shape and type of input. + zeros_like : Return a tensor of zeros with shape and type of input. + empty : Return a new uninitialized tensor. + ones : Return a new tensor setting values to one. + zeros : Return a new tensor setting values to zero. + Notes + ----- + This function does *not* initialize the returned tensor; to do that use + `zeros_like` or `ones_like` instead. It may be marginally faster than + the functions that do set the array values. 
+ Examples + -------- + >>> import mars.tensor as mt + >>> a = ([1,2,3], [4,5,6]) # a is array-like + >>> mt.empty_like(a).execute() + array([[-1073741821, -1073741821, 3], #ranm + [ 0, 0, -1073741821]]) + >>> a = mt.array([[1., 2., 3.],[4.,5.,6.]]) + >>> mt.empty_like(a).execute() + array([[ -2.00000715e+000, 1.48219694e-323, -2.00000572e+000],#random + [ 4.38791518e-305, -2.00000715e+000, 4.17269252e-309]]) + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + gpu = a.op.gpu if gpu is None else gpu + op = TensorEmptyLike(dtype=dtype, gpu=gpu, sparse=a.issparse(), order=order) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/eye.py b/python/xorbits/_mars/tensor/datasource/eye.py new file mode 100644 index 000000000..9c4ceda80 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/eye.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...lib import sparse +from ...serialization.serializables import Int32Field, StringField +from ..array_utils import create_array +from ..utils import decide_chunk_sizes, get_order +from .core import TensorNoInput +from .diag import TensorDiagBase + + +class TensorEye(TensorNoInput, TensorDiagBase): + _op_type_ = OperandDef.TENSOR_EYE + + _k = Int32Field("k") + _order = StringField("order") + + def __init__(self, k=None, dtype=None, order=None, **kw): + dtype = np.dtype(dtype or "f8") + super().__init__(_k=k, dtype=dtype, _order=order, **kw) + + @property + def k(self): + return getattr(self, "_k", 0) + + @property + def order(self): + return self._order + + @classmethod + def _get_nsplits(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + return decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + + @classmethod + def _get_chunk(cls, op, chunk_k, chunk_shape, chunk_idx): + chunk_op = TensorEye(k=chunk_k, dtype=op.dtype, gpu=op.gpu, sparse=op.sparse) + return chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx) + + @classmethod + def tile(cls, op): + return (yield from TensorDiagBase.tile(op)) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.sparse: + ctx[chunk.key] = sparse.eye( + chunk.shape[0], M=chunk.shape[1], k=op.k, dtype=op.dtype, gpu=op.gpu + ) + else: + ctx[chunk.key] = create_array(op)( + "eye", + chunk.shape[0], + M=chunk.shape[1], + k=op.k, + dtype=op.dtype, + order=op.order, + ) + + +def eye(N, M=None, k=0, dtype=None, sparse=False, gpu=None, chunk_size=None, order="C"): + """ + Return a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Parameters + ---------- + N : int + Number of rows in the output. + M : int, optional + Number of columns in the output. If None, defaults to `N`. 
+ k : int, optional + Index of the diagonal: 0 (the default) refers to the main diagonal, + a positive value refers to an upper diagonal, and a negative value + to a lower diagonal. + dtype : data-type, optional + Data-type of the returned tensor. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + order : {'C', 'F'}, optional + Whether the output should be stored in row-major (C-style) or + column-major (Fortran-style) order in memory. + + Returns + ------- + I : Tensor of shape (N,M) + An tensor where all elements are equal to zero, except for the `k`-th + diagonal, whose values are equal to one. + + See Also + -------- + identity : (almost) equivalent function + diag : diagonal 2-D tensor from a 1-D tensor specified by the user. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.eye(2, dtype=int).execute() + array([[1, 0], + [0, 1]]) + >>> mt.eye(3, k=1).execute() + array([[ 0., 1., 0.], + [ 0., 0., 1.], + [ 0., 0., 0.]]) + + """ + if M is None: + M = N + + shape = (N, M) + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = TensorEye(k, dtype=dtype, gpu=gpu, sparse=sparse, order=order) + return op(shape, chunk_size=chunk_size, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/from_dataframe.py b/python/xorbits/_mars/tensor/datasource/from_dataframe.py new file mode 100644 index 000000000..6500639f4 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_dataframe.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import BoolField, KeyField +from ..core import TensorOrder +from ..utils import to_numpy +from .core import TensorHasInput + + +class TensorFromDataFrame(TensorHasInput): + """represent tensor from DataFrame""" + + _op_type_ = OperandDef.TENSOR_FROM_DATAFRAME + _input = KeyField("_input") + _extract_multi_index = BoolField("extract_multi_index") + + def __init__(self, extract_multi_index=False, **kw): + super().__init__(_extract_multi_index=extract_multi_index, **kw) + + @classmethod + def execute(cls, ctx, op: "TensorFromDataFrame"): + df = ctx[op.inputs[0].key] + if op._extract_multi_index: + df = df.to_frame() + ctx[op.outputs[0].key] = to_numpy(df).astype(op.dtype, order="F") + + @classmethod + def tile(cls, op: "TensorFromDataFrame"): + output = op.outputs[0] + + out_chunks = [] + for c in op.input.chunks: + shape = ( + (c.shape[0], output.shape[1]) if op._extract_multi_index else c.shape + ) + index = (c.index[0], 0) if op._extract_multi_index else c.index + out_chunk = ( + op.copy() + .reset_key() + .new_chunk([c], shape=shape, index=index, order=output.order) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + nsplits = ( + (op.input.nsplits[0], (output.shape[1],)) + if op._extract_multi_index + else op.input.nsplits + ) + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + def __call__(self, a, order=None): + from ...dataframe.core import INDEX_TYPE, IndexValue + + if ( + self._extract_multi_index + and isinstance(a, INDEX_TYPE) + and isinstance(a.index_value.value, IndexValue.MultiIndex) + ): + order = a.order if order is None else order + return self.new_tensor( + [a], (a.shape[0], len(a.index_value.value.names)), order=order + ) + else: + self._extract_multi_index = False + + return super().__call__(a, order=order) + + +def from_dataframe(in_df, dtype=None): + from ...dataframe.utils import build_empty_df + + if dtype is None: + empty_pdf = build_empty_df(in_df.dtypes) + dtype = to_numpy(empty_pdf).dtype + op = TensorFromDataFrame(dtype=dtype, gpu=in_df.op.gpu) + return op(in_df, order=TensorOrder.F_ORDER) # return tensor with F-order always + + +def from_series(in_series, dtype=None): + op = TensorFromDataFrame(dtype=dtype or in_series.dtype, gpu=in_series.op.gpu) + return op(in_series, order=TensorOrder.F_ORDER) # return tensor with F-order always + + +def from_index(in_index, dtype=None, extract_multi_index=False): + op = TensorFromDataFrame( + dtype=dtype or in_index.dtype, + gpu=in_index.op.gpu, + extract_multi_index=extract_multi_index, + ) + return op(in_index, order=TensorOrder.F_ORDER) # return tensor with F-order always diff --git a/python/xorbits/_mars/tensor/datasource/from_dense.py b/python/xorbits/_mars/tensor/datasource/from_dense.py new file mode 100644 index 000000000..0d6bfc11f --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_dense.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable +from functools import reduce +from operator import and_ + +import numpy as np +import pandas as pd + +from ... import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import cps, naked, sps +from ...serialization.serializables import AnyField, KeyField +from .array import tensor +from .core import TensorHasInput + + +class DenseToSparse(TensorHasInput): + _op_type_ = OperandDef.DENSE_TO_SPARSE + + _input = KeyField("input") + _missing = AnyField("missing") + + def __init__(self, missing=None, **kw): + super().__init__(sparse=True, _missing=missing, **kw) + + @property + def missing(self): + return self._missing + + @staticmethod + def _get_mask(data, missing): + if isinstance(missing, Iterable): + return reduce(and_, (DenseToSparse._get_mask(data, m) for m in missing)) + elif pd.isna(missing): + return ~pd.isna(data) + else: + return data != missing + + @classmethod + def execute(cls, ctx, op): + out = op.outputs[0] + in_data = naked(ctx[op.inputs[0].key]) + missing = op.missing + shape = in_data.shape if any(np.isnan(s) for s in out.shape) else out.shape + + xps = cps if op.gpu else sps + if missing is None: + ctx[out.key] = SparseNDArray(xps.csr_matrix(in_data), shape=shape) + else: + mask = cls._get_mask(in_data, missing) + spmatrix = xps.csr_matrix((in_data[mask], mask.nonzero()), shape=shape) + ctx[out.key] = SparseNDArray(spmatrix) + + +def fromdense(a, missing=None): + a = tensor(a) + if a.issparse(): + return a + + op = DenseToSparse(dtype=a.dtype, gpu=a.op.gpu, missing=missing) + return op(a) diff --git a/python/xorbits/_mars/tensor/datasource/from_hdf5.py b/python/xorbits/_mars/tensor/datasource/from_hdf5.py new file mode 100644 index 000000000..f845cb6e4 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_hdf5.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...lib.filesystem import open_file +from .core import TensorFromHDF5Like, TensorOrder + + +class TensorHDF5DataSource(TensorFromHDF5Like): + _op_type_ = OperandDef.TENSOR_FROM_HDF5 + + @classmethod + def execute(cls, ctx, op): + import h5py + + axis_offsets = op.axis_offsets + shape = op.outputs[0].shape + + with h5py.File(open_file(op.filename), mode="r") as f: + ds = f[op.path] + data = ds[ + tuple( + slice(offset, offset + size) + for offset, size in zip(axis_offsets, shape) + ) + ] + ctx[op.outputs[0].key] = data + + +def fromhdf5(hdf5_file, group=None, dataset=None, chunk_size=None): + import h5py + + if isinstance(hdf5_file, h5py.Dataset): + filename = hdf5_file.file.filename + group = hdf5_file.parent.name + dataset = hdf5_file.name.rsplit("/", 1)[1] + chunk_size = chunk_size if chunk_size is not None else hdf5_file.chunks + shape = hdf5_file.shape + dtype = hdf5_file.dtype + elif isinstance(hdf5_file, h5py.File): + filename = hdf5_file.filename + if dataset is None: + raise ValueError("`dataset` should be provided") + try: + h5_dataset = hdf5_file[TensorHDF5DataSource.get_path(group, dataset)] + except KeyError: + raise ValueError(f"dataset({dataset}) does not exist") + chunk_size = chunk_size if chunk_size is not None else h5_dataset.chunks + shape = h5_dataset.shape + dtype = h5_dataset.dtype + elif isinstance(hdf5_file, str): + filename = hdf5_file + try: + with h5py.File(open_file(filename), mode="r") as f: + if dataset is None: + raise ValueError("`dataset` should be provided") + h5_dataset = f[TensorHDF5DataSource.get_path(group, dataset)] + + chunk_size = chunk_size if chunk_size is not None else h5_dataset.chunks + shape = h5_dataset.shape + dtype = h5_dataset.dtype + except KeyError: + raise ValueError(f"dataset({dataset}) does not exist") + else: + raise TypeError( + "`hdf5_file` passed has wrong type, " + "expect str, h5py.File or h5py.Dataset, " + f"got {type(hdf5_file)}" + ) + + op = TensorHDF5DataSource( + filename=filename, group=group, dataset=dataset, dtype=dtype + ) + return op(shape, chunk_size=chunk_size, order=TensorOrder.C_ORDER) diff --git a/python/xorbits/_mars/tensor/datasource/from_sparse.py b/python/xorbits/_mars/tensor/datasource/from_sparse.py new file mode 100644 index 000000000..5726117cf --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_sparse.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField, KeyField, StringField +from ..array_utils import as_same_device, device, get_array_module +from ..utils import get_order +from .array import tensor +from .core import TensorHasInput + + +class SparseToDense(TensorHasInput): + _op_type_ = OperandDef.SPARSE_TO_DENSE + + _input = KeyField("input") + _order = StringField("order") + _fill_value = AnyField("fill_value") + + def __init__(self, fill_value=None, order=None, **kw): + super().__init__(_fill_value=fill_value, sparse=False, _order=order, **kw) + + @property + def order(self): + return self._order + + @property + def fill_value(self): + return self._fill_value + + @classmethod + def execute(cls, ctx, op): + fill_value = op.fill_value + out = op.outputs[0] + (inp,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if fill_value is None: + ctx[out.key] = inp.toarray().astype( + out.dtype, order=op.order, copy=False + ) + else: + xp = get_array_module(xp) + spmatrix = inp.spmatrix + inds = spmatrix.nonzero() + ret = xp.full(inp.shape, fill_value, dtype=out.dtype, order=op.order) + ret[inds] = spmatrix.data + ctx[out.key] = ret + + +def fromsparse(a, order="C", fill_value=None): + a = tensor(a) + if not a.issparse(): + return a.astype(a.dtype, order=order, copy=False) + + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = SparseToDense(dtype=a.dtype, gpu=a.op.gpu, order=order, fill_value=fill_value) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/from_tiledb.py b/python/xorbits/_mars/tensor/datasource/from_tiledb.py new file mode 100644 index 000000000..751c8d18f --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_tiledb.py @@ -0,0 +1,205 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import sps +from ...serialization.serializables import ( + DictField, + FieldTypes, + Int64Field, + StringField, + TupleField, +) +from ..core import TensorOrder +from .core import TensorNoInput + + +class TensorTileDBDataSource(TensorNoInput): + _op_type_ = OperandDef.TENSOR_FROM_TILEDB + + _tiledb_config = DictField("tiledb_config") + # URI of array to open + _tiledb_uri = StringField("tiledb_uri") + # tiledb dim start + _tiledb_dim_starts = TupleField("tiledb_dim_starts", FieldTypes.int64) + # encryption key to decrypt if provided + _tiledb_key = StringField("tiledb_key") + # open array at a given timestamp if provided + _tiledb_timestamp = Int64Field("tiledb_timestamp") + _axis_offsets = TupleField("axis_offsets", FieldTypes.int64) + + def __init__( + self, + tiledb_config=None, + tiledb_uri=None, + tiledb_dim_starts=None, + tiledb_key=None, + tiledb_timstamp=None, + **kw + ): + super().__init__( + _tiledb_config=tiledb_config, + _tiledb_uri=tiledb_uri, + _tiledb_dim_starts=tiledb_dim_starts, + _tiledb_key=tiledb_key, + _tiledb_timestamp=tiledb_timstamp, + **kw + ) + + @property + def tiledb_config(self): + return self._tiledb_config + + @property + def tiledb_uri(self): + return self._tiledb_uri + + @property + def tiledb_dim_starts(self): + return self._tiledb_dim_starts + + @property + def tiledb_key(self): + return self._tiledb_key + + @property + def tiledb_timestamp(self): + return self._tiledb_timestamp + + @property + def axis_offsets(self): + return self._axis_offsets + + def to_chunk_op(self, *args): + _, chunk_idx, nsplits = args + chunk_op = super().to_chunk_op(*args) + axis_offsets = [] + for axis, idx in enumerate(chunk_idx): + axis_offsets.append(sum(nsplits[axis][:idx])) + chunk_op._axis_offsets = tuple(axis_offsets) + return chunk_op + + @classmethod + def execute(cls, ctx, op): + import tiledb + + chunk = op.outputs[0] + from ..array_utils import array_module + from ..utils import get_tiledb_ctx + + xp = array_module(op.gpu) + + axis_offsets = [ + offset + dim_start + for offset, dim_start in zip(op.axis_offsets, op.tiledb_dim_starts) + ] + tiledb_ctx = get_tiledb_ctx(op.tiledb_config) + uri = op.tiledb_uri + key = op.tiledb_key + timestamp = op.tiledb_timestamp + + slcs = [] + for axis in range(chunk.ndim): + axis_offset = axis_offsets[axis] + axis_length = chunk.shape[axis] + slcs.append(slice(axis_offset, axis_offset + axis_length)) + + if not op.sparse: + # read dense array from tiledb + with tiledb.DenseArray( + uri=uri, ctx=tiledb_ctx, key=key, timestamp=timestamp + ) as tiledb_arr: + ctx[chunk.key] = tiledb_arr[tuple(slcs)] + else: + # read sparse array from tiledb + with tiledb.SparseArray( + uri=uri, ctx=tiledb_ctx, key=key, timestamp=timestamp + ) as tiledb_arr: + if tiledb_arr.ndim > 2: + raise NotImplementedError( + "Does not support to read array with more than 2 dimensions" + ) + + data = tiledb_arr[tuple(slcs)] + coords = data["coords"] + + value = data[tiledb_arr.attr(0).name] + if tiledb_arr.ndim == 2: + # 2-d + ij = tuple( + coords[tiledb_arr.domain.dim(k).name] - axis_offsets[k] + for k in range(tiledb_arr.ndim) + ) + spmatrix = sps.coo_matrix((value, ij), shape=chunk.shape) + ctx[chunk.key] = SparseNDArray(spmatrix) + else: + # 1-d + ij = ( + xp.zeros(coords.shape), + coords[tiledb_arr.domain.dim(0).name] - axis_offsets[0], + ) + spmatrix = sps.coo_matrix((value, ij), shape=(1,) + chunk.shape) + ctx[chunk.key] = SparseNDArray(spmatrix, shape=chunk.shape) 
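# A minimal usage sketch for the TileDB data source, assuming a dense TileDB
# array already exists at a hypothetical URI (the path and the sum() call are
# illustrative only; ``fromtiledb`` below is the entry point, and the chunk
# layout follows the array's tile extents):
#
#     >>> import mars.tensor as mt
#     >>> t = mt.fromtiledb("/tmp/dense_example")   # hypothetical URI
#     >>> t.sum().execute()                         # tiles are read per chunk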
+ + +def fromtiledb(uri, ctx=None, key=None, timestamp=None, gpu=None): + import tiledb + + raw_ctx = ctx + if raw_ctx is None: + ctx = tiledb.Ctx() + + # get metadata from tiledb + try: + tiledb_arr = tiledb.DenseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp) + sparse = False + except ValueError: + # if the array is not dense, ValueError will be raised by tiledb + tiledb_arr = tiledb.SparseArray(uri=uri, ctx=ctx, key=key, timestamp=timestamp) + sparse = True + + if tiledb_arr.nattr > 1: + raise NotImplementedError( + "Does not supported TileDB array schema with more than 1 attr" + ) + tiledb_dim_starts = tuple( + tiledb_arr.domain.dim(j).domain[0].item() for j in range(tiledb_arr.ndim) + ) + if any(isinstance(s, float) for s in tiledb_dim_starts): + raise ValueError( + "Does not support TileDB array schema whose dimensions has float domain" + ) + + dtype = tiledb_arr.attr(0).dtype + tiledb_config = None if raw_ctx is None else ctx.config().dict() + tensor_order = ( + TensorOrder.C_ORDER + if tiledb_arr.schema.cell_order == "row-major" + else TensorOrder.F_ORDER + ) + op = TensorTileDBDataSource( + tiledb_config=tiledb_config, + tiledb_uri=uri, + tiledb_key=key, + tiledb_timstamp=timestamp, + tiledb_dim_starts=tiledb_dim_starts, + gpu=gpu, + sparse=sparse, + dtype=dtype, + ) + chunk_size = tuple( + int(tiledb_arr.domain.dim(i).tile) for i in range(tiledb_arr.domain.ndim) + ) + return op(tiledb_arr.shape, chunk_size=chunk_size, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/from_vineyard.py b/python/xorbits/_mars/tensor/datasource/from_vineyard.py new file mode 100644 index 000000000..535e81ed8 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_vineyard.py @@ -0,0 +1,192 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core.context import get_context +from ...serialization.serializables import Int32Field, StringField +from ...storage.base import StorageLevel +from ...utils import calc_nsplits, has_unknown_shape, lazy_import +from ..operands import TensorOperand, TensorOperandMixin +from .core import TensorNoInput + +vineyard = lazy_import("vineyard") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +def resolve_vineyard_socket(ctx, op): + if op.vineyard_socket is None: # pragma: no cover + storage_backend = ctx.get_storage_info(level=StorageLevel.MEMORY) + if storage_backend.get("name", None) == "vineyard": + return storage_backend["socket"] + else: + return op.vineyard_socket + else: + return op.vineyard_socket + + +class TensorFromVineyard(TensorNoInput): + _op_type_ = OperandDef.TENSOR_FROM_VINEYARD_META + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID in vineyard + object_id = StringField("object_id") + + # a dummy attr to make sure ops have different keys + operator_index = Int32Field("operator_index") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, object_id=object_id, **kw) + + @classmethod + def tile(cls, op): + ctx = get_context() + workers = ctx.get_worker_addresses() + + out_chunks = [] + for index, worker in enumerate(workers): + chunk_op = op.copy().reset_key() + chunk_op.expect_worker = worker + chunk_op.operator_index = index + out_chunk = chunk_op.new_chunk( + [], dtype=np.dtype(object), shape=(1,), index=(index,) + ) + out_chunks.append(out_chunk) + + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, + shape=(np.nan,), + dtype=np.dtype(object), + chunks=out_chunks, + nsplits=((np.nan,) * len(workers),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + meta = client.get_meta(vineyard.ObjectID(op.object_id)) + chunks = [] + for idx in range(meta["partitions_-size"]): + chunk_meta = meta["partitions_-%d" % idx] + if not chunk_meta.islocal: + continue + dtype = vy_data_utils.normalize_dtype( + chunk_meta["value_type_"], chunk_meta.get("value_type_meta_", None) + ) + shape = tuple(json.loads(chunk_meta["shape_"])) + chunk_index = tuple(json.loads(chunk_meta["partition_index_"])) + # chunk: (chunk_id, worker_address, dtype, shape, index) + chunks.append( + (repr(chunk_meta.id), ctx.worker_address, dtype, shape, chunk_index) + ) + + holder = np.empty((1,), dtype=object) + holder[0] = chunks + ctx[op.outputs[0].key] = np.asarray(holder) + + +class TensorFromVineyardChunk(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TENSOR_FROM_VINEYARD_CHUNK + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # ObjectID of chunk in vineyard + object_id = StringField("object_id") + + def __init__(self, vineyard_socket=None, object_id=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, object_id=object_id, **kw) + + def __call__(self, meta): + return self.new_tensor([meta], shape=(np.nan,)) + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + ctx = get_context() + + in_chunk_keys = [chunk.key for chunk in op.inputs[0].chunks] + out_chunks = [] + chunk_map = dict() + dtype = None + for chunk, infos in zip( + op.inputs[0].chunks, ctx.get_chunks_result(in_chunk_keys) 
+ ): + for info in infos[0]: # n.b. 1-element ndarray + chunk_op = op.copy().reset_key() + chunk_op.object_id = info[0] + chunk_op.expect_worker = info[1] + dtype = info[2] + shape = info[3] + chunk_index = info[4] + chunk_map[chunk_index] = info[3] + out_chunk = chunk_op.new_chunk( + [chunk], shape=shape, dtype=dtype, index=chunk_index + ) + out_chunks.append(out_chunk) + + nsplits = calc_nsplits(chunk_map) + shape = [np.sum(nsplit) for nsplit in nsplits] + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, shape=shape, dtype=dtype, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + client = vineyard.connect(socket) + ctx[op.outputs[0].key] = client.get(vineyard.ObjectID(op.object_id)) + + +def fromvineyard(tensor, vineyard_socket=None): + if vineyard is not None and isinstance(tensor, vineyard.Object): # pragma: no cover + if "vineyard::GlobalTensor" not in tensor.typename: + raise TypeError( + "The input tensor %r is not a vineyard' GlobalTensor" % tensor + ) + object_id = tensor.id + else: + object_id = tensor + if vineyard is not None and isinstance(object_id, vineyard.ObjectID): + object_id = repr(object_id) + metaop = TensorFromVineyard( + vineyard_socket=vineyard_socket, + object_id=object_id, + dtype=np.dtype("byte"), + gpu=None, + ) + meta = metaop(shape=(np.nan,), chunk_size=(np.nan,)) + op = TensorFromVineyardChunk( + vineyard_socket=vineyard_socket, object_id=object_id, gpu=None + ) + return op(meta) diff --git a/python/xorbits/_mars/tensor/datasource/from_zarr.py b/python/xorbits/_mars/tensor/datasource/from_zarr.py new file mode 100644 index 000000000..4899697f5 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/from_zarr.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ...lib.filesystem import FSMap, get_fs +from ..core import TensorOrder +from .core import TensorFromHDF5Like + + +class TensorFromZarr(TensorFromHDF5Like): + _op_type_ = OperandDef.TENSOR_FROM_ZARR + + @classmethod + def execute(cls, ctx, op): + import zarr + + axis_offsets = op.axis_offsets + shape = op.outputs[0].shape + + fs = get_fs(op.filename, None) + fs_map = FSMap(op.filename, fs) + + root = zarr.group(store=fs_map) + path = cls.get_path(op.group, op.dataset) + arr = root[path] + + data = arr[ + tuple( + slice(offset, offset + size) + for offset, size in zip(axis_offsets, shape) + ) + ] + ctx[op.outputs[0].key] = data + + +def fromzarr(path, group=None, dataset=None, chunk_size=None): + import zarr + + try: + # since v2.11.0, zarr convert mutable mappings to KVStore + from zarr.storage import KVStore as zarr_kvstore + except ImportError: # pragma: no cover + zarr_kvstore = None + + if isinstance(path, zarr.Array): + arr = path + if zarr_kvstore is None and isinstance(arr.store, FSMap): # pragma: no cover + root = arr.store.root + path, dataset = root.rsplit("/", 1) + elif zarr_kvstore and isinstance(arr.store, zarr_kvstore): + root = arr.store._mutable_mapping.root + path, dataset = root.rsplit("/", 1) + else: + path = arr.store.path + if "/" in arr.path and group is None: + group = arr.path.rsplit("/", 1)[0] + dataset = arr.basename + if not dataset: + path, dataset = path.rsplit("/", 1) + shape = arr.shape + elif isinstance(path, str): + fs = get_fs(path, None) + fs_map = FSMap(path, fs) + + if group is None and dataset is None: + arr = zarr.open(fs_map) + if isinstance(arr, zarr.Array): + return fromzarr(arr, chunk_size=chunk_size) + + g = zarr.group(store=fs_map) + arr = g[TensorFromZarr.get_path(group, dataset)] + shape = arr.shape + else: + raise TypeError( + "`path` passed has wrong type, " + "expect str, or zarr.Array" + f"got {type(path)}" + ) + + chunk_size = chunk_size if chunk_size is not None else arr.chunks + op = TensorFromZarr(filename=path, group=group, dataset=dataset, dtype=arr.dtype) + return op(shape, chunk_size=chunk_size, order=TensorOrder(arr.order)) diff --git a/python/xorbits/_mars/tensor/datasource/full.py b/python/xorbits/_mars/tensor/datasource/full.py new file mode 100644 index 000000000..65a089fda --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/full.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import AnyField, KeyField, StringField +from ..array_utils import create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorFull(TensorNoInput): + _op_type_ = OperandDef.TENSOR_FULL + + _fill_value = AnyField("fill_value") + _order = StringField("order") + + def __init__(self, fill_value=None, dtype=None, order=None, **kw): + if dtype is not None: + dtype = np.dtype(dtype) + if fill_value is not None: + fill_value = dtype.type(fill_value) + elif fill_value is not None: + dtype = np.array(fill_value).dtype + super().__init__(_fill_value=fill_value, dtype=dtype, _order=order, **kw) + + @property + def fill_value(self): + return self._fill_value + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + ctx[chunk.key] = create_array(op)( + "full", chunk.shape, op.fill_value, dtype=op.dtype, order=op.order + ) + + +def full(shape, fill_value, dtype=None, chunk_size=None, gpu=None, order="C"): + """ + Return a new tensor of given shape and type, filled with `fill_value`. + + Parameters + ---------- + shape : int or sequence of ints + Shape of the new tensor, e.g., ``(2, 3)`` or ``2``. + fill_value : scalar + Fill value. + dtype : data-type, optional + The desired data-type for the tensor The default, `None`, means + `np.array(fill_value).dtype`. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + order : {'C', 'F'}, optional + Whether to store multidimensional data in C- or Fortran-contiguous + (row- or column-wise) order in memory. + + Returns + ------- + out : Tensor + Tensor of `fill_value` with the given shape, dtype, and order. + + See Also + -------- + zeros_like : Return a tensor of zeros with shape and type of input. + ones_like : Return a tensor of ones with shape and type of input. + empty_like : Return an empty tensor with shape and type of input. + full_like : Fill a tensor with shape and type of input. + zeros : Return a new tensor setting values to zero. + ones : Return a new tensor setting values to one. + empty : Return a new uninitialized tensor. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.full((2, 2), mt.inf).execute() + array([[ inf, inf], + [ inf, inf]]) + >>> mt.full((2, 2), 10).execute() + array([[10, 10], + [10, 10]]) + + """ + v = np.asarray(fill_value) + if len(v.shape) > 0: + from ..base import broadcast_to + + return broadcast_to( + tensor(v, dtype=dtype, chunk_size=chunk_size, gpu=gpu, order=order), shape + ) + + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + op = TensorFull(fill_value, dtype=dtype, gpu=gpu, order=order) + return op(shape, chunk_size=chunk_size, order=tensor_order) + + +class TensorFullLike(TensorLike): + _op_type_ = OperandDef.TENSOR_FULL_LIKE + + _input = KeyField("input") + _fill_value = AnyField("fill_value") + _order = StringField("order") + + def __init__( + self, fill_value=None, dtype=None, gpu=None, sparse=False, order=None, **kw + ): + if dtype is not None: + dtype = np.dtype(dtype) + if fill_value is not None: + fill_value = dtype.type(fill_value) + elif fill_value is not None: + dtype = np.array(fill_value).dtype + super().__init__( + _fill_value=fill_value, + _order=order, + dtype=dtype, + gpu=gpu, + sparse=sparse, + **kw + ) + + @property + def fill_value(self): + return self._fill_value + + @property + def order(self): + return self._order + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.issparse(): + in_data = naked(ctx[op.inputs[0].key]) + xps = get_sparse_module(in_data) + xp = get_array_module(in_data) + ctx[chunk.key] = SparseNDArray( + xps.csr_matrix( + ( + xp.full_like(in_data.data, op.fill_value, dtype=op.dtype), + in_data.indices, + in_data.indptr, + ), + shape=in_data.shape, + ) + ) + else: + ctx[chunk.key] = create_array(op)( + "full_like", + ctx[op.inputs[0].key], + op.fill_value, + dtype=op.dtype, + order=op.order, + ) + + +def full_like(a, fill_value, dtype=None, gpu=None, order="K"): + """ + Return a full tensor with the same shape and type as a given tensor. + + Parameters + ---------- + a : array_like + The shape and data-type of `a` define these same attributes of + the returned tensor. + fill_value : scalar + Fill value. + dtype : data-type, optional + Overrides the data type of the result. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + order : {'C', 'F', 'A', or 'K'}, optional + Overrides the memory layout of the result. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, + 'C' otherwise. 'K' means match the layout of `a` as closely + as possible. + + Returns + ------- + out : Tensor + Tensor of `fill_value` with the same shape and type as `a`. + + See Also + -------- + empty_like : Return an empty tensor with shape and type of input. + ones_like : Return a tensor of ones with shape and type of input. + zeros_like : Return a tensor of zeros with shape and type of input. + full : Return a new tensor of given shape filled with value. 
+ + Examples + -------- + >>> import mars.tensor as mt + >>> x = mt.arange(6, dtype=int) + >>> mt.full_like(x, 1).execute() + array([1, 1, 1, 1, 1, 1]) + >>> mt.full_like(x, 0.1).execute() + array([0, 0, 0, 0, 0, 0]) + >>> mt.full_like(x, 0.1, dtype=mt.double).execute() + array([ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) + >>> mt.full_like(x, mt.nan, dtype=mt.double).execute() + array([ nan, nan, nan, nan, nan, nan]) + + >>> y = mt.arange(6, dtype=mt.double) + >>> mt.full_like(y, 0.1).execute() + array([ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) + + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + if dtype is None: + dtype = a.dtype + gpu = a.op.gpu if gpu is None else gpu + op = TensorFullLike( + fill_value=fill_value, dtype=dtype, gpu=gpu, sparse=a.issparse() + ) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/identity.py b/python/xorbits/_mars/tensor/datasource/identity.py new file mode 100644 index 000000000..c60580bb2 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/identity.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .eye import eye + + +def identity(n, dtype=None, sparse=False, gpu=None, chunk_size=None): + """ + Return the identity tensor. + + The identity tensor is a square array with ones on + the main diagonal. + + Parameters + ---------- + n : int + Number of rows (and columns) in `n` x `n` output. + dtype : data-type, optional + Data-type of the output. Defaults to ``float``. + sparse: bool, optional + Create sparse tensor if True, False as default + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunks : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + out : Tensor + `n` x `n` array with its main diagonal set to one, + and all other elements 0. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.identity(3).execute() + array([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + + """ + return eye(n, dtype=dtype, sparse=sparse, gpu=gpu, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/datasource/indices.py b/python/xorbits/_mars/tensor/datasource/indices.py new file mode 100644 index 000000000..0cb389ad9 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/indices.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, ListField +from .arange import arange +from .core import TensorNoInput +from .empty import empty +from .meshgrid import meshgrid + + +class TensorIndices(TensorNoInput): + _op_type_ = OperandDef.TENSOR_INDICES + + _dimensions = ListField("dimensions", FieldTypes.uint64) + + def __init__(self, dimensions=None, **kw): + super().__init__(_dimensions=dimensions, **kw) + + @property + def dimensions(self): + return self._dimensions + + +def indices(dimensions, dtype=int, chunk_size=None): + """ + Return a tensor representing the indices of a grid. + + Compute a tensor where the subtensors contain index values 0,1,... + varying only along the corresponding axis. + + Parameters + ---------- + dimensions : sequence of ints + The shape of the grid. + dtype : dtype, optional + Data type of the result. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + grid : Tensor + The tensor of grid indices, + ``grid.shape = (len(dimensions),) + tuple(dimensions)``. + + See Also + -------- + mgrid, meshgrid + + Notes + ----- + The output shape is obtained by prepending the number of dimensions + in front of the tuple of dimensions, i.e. if `dimensions` is a tuple + ``(r0, ..., rN-1)`` of length ``N``, the output shape is + ``(N,r0,...,rN-1)``. + + The subtensors ``grid[k]`` contains the N-D array of indices along the + ``k-th`` axis. Explicitly:: + + grid[k,i0,i1,...,iN-1] = ik + + Examples + -------- + >>> import mars.tensor as mt + + >>> grid = mt.indices((2, 3)) + >>> grid.shape + (2, 2, 3) + >>> grid[0].execute() # row indices + array([[0, 0, 0], + [1, 1, 1]]) + >>> grid[1].execute() # column indices + array([[0, 1, 2], + [0, 1, 2]]) + + The indices can be used as an index into a tensor. + + >>> x = mt.arange(20).reshape(5, 4) + >>> row, col = mt.indices((2, 3)) + >>> # x[row, col] # TODO(jisheng): accomplish this if multiple fancy indexing is supported + + Note that it would be more straightforward in the above example to + extract the required elements directly with ``x[:2, :3]``. + + """ + from ..merge import stack + + dimensions = tuple(dimensions) + dtype = np.dtype(dtype) + raw_chunk_size = chunk_size + if chunk_size is not None and isinstance(chunk_size, Iterable): + chunk_size = tuple(chunk_size) + else: + chunk_size = (chunk_size,) * len(dimensions) + + xi = [] + for ch, dim in zip(chunk_size, dimensions): + xi.append(arange(dim, dtype=dtype, chunk_size=ch)) + + grid = None + if np.prod(dimensions): + grid = meshgrid(*xi, indexing="ij") + + if grid: + grid = stack(grid) + else: + if raw_chunk_size is None: + empty_chunk_size = None + else: + empty_chunk_size = (1,) + chunk_size + grid = empty( + (len(dimensions),) + dimensions, dtype=dtype, chunk_size=empty_chunk_size + ) + + return grid diff --git a/python/xorbits/_mars/tensor/datasource/linspace.py b/python/xorbits/_mars/tensor/datasource/linspace.py new file mode 100644 index 000000000..fc8954f3b --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/linspace.py @@ -0,0 +1,219 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import ExecutableTuple +from ...serialization.serializables import AnyField, BoolField, Int64Field +from ..array_utils import create_array +from ..utils import decide_chunk_sizes +from .core import TensorNoInput + + +class TensorLinspace(TensorNoInput): + _op_type_ = OperandDef.TENSOR_LINSPACE + + _start = AnyField("start") + _stop = AnyField("stop") + _num = Int64Field("num") + _endpoint = BoolField("endpoint") + + def __init__( + self, start=None, stop=None, num=None, endpoint=None, dtype=None, **kw + ): + dtype = np.dtype(np.linspace(0, 1, 1).dtype if dtype is None else dtype) + super().__init__( + _start=start, _stop=stop, _num=num, _endpoint=endpoint, dtype=dtype, **kw + ) + + def to_chunk_op(self, *args): + start, stop, num, endpoint = args + op = self.copy().reset_key() + op._start = start + op._stop = stop + op._num = num + op._endpoint = endpoint + return op + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + + chunk_length = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_length = decide_chunk_sizes( + tensor.shape, chunk_length, tensor.dtype.itemsize + ) + + start, stop, num, endpoint = ( + tensor.op.start, + tensor.op.stop, + tensor.op.num, + tensor.op.endpoint, + ) + if num > 1: + step = float(stop - start) / (num if not endpoint else num - 1) + else: + step = 0.0 + + chunks = [] + chunk_start = start + nsplit = [] + for i, cs in enumerate(chunk_length[0]): + chunk_stop = chunk_start + (cs - 1) * step + chunk_op = op.to_chunk_op(chunk_start, chunk_stop, cs, True) + chunk_shape = (cs,) + chunk_idx = (i,) + chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx) + chunks.append(chunk) + nsplit.append(cs) + chunk_start = chunk_start + cs * step + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=chunks, nsplits=(tuple(nsplit),) + ) + + @property + def start(self): + return self._start + + @property + def stop(self): + return self._stop + + @property + def num(self): + return self._num + + @property + def endpoint(self): + return self._endpoint + + @classmethod + def execute(cls, ctx, op): + ctx[op.outputs[0].key] = create_array(op)( + "linspace", + op.start, + op.stop, + num=op.num, + endpoint=op.endpoint, + dtype=op.dtype, + ) + + +def linspace( + start, + stop, + num=50, + endpoint=True, + retstep=False, + dtype=None, + gpu=None, + chunk_size=None, +): + """ + Return evenly spaced numbers over a specified interval. + + Returns `num` evenly spaced samples, calculated over the + interval [`start`, `stop`]. + + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start : scalar + The starting value of the sequence. + stop : scalar + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of ``num + 1`` + evenly spaced samples, so that `stop` is excluded. Note that the step + size changes when `endpoint` is False. + num : int, optional + Number of samples to generate. Default is 50. 
Must be non-negative. + endpoint : bool, optional + If True, `stop` is the last sample. Otherwise, it is not included. + Default is True. + retstep : bool, optional + If True, return (`samples`, `step`), where `step` is the spacing + between samples. + dtype : dtype, optional + The type of the output tensor. If `dtype` is not given, infer the data + type from the other input arguments. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + samples : Tensor + There are `num` equally spaced samples in the closed interval + ``[start, stop]`` or the half-open interval ``[start, stop)`` + (depending on whether `endpoint` is True or False). + step : float, optional + Only returned if `retstep` is True + + Size of spacing between samples. + + + See Also + -------- + arange : Similar to `linspace`, but uses a step size (instead of the + number of samples). + logspace : Samples uniformly distributed in log space. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.linspace(2.0, 3.0, num=5).execute() + array([ 2. , 2.25, 2.5 , 2.75, 3. ]) + >>> mt.linspace(2.0, 3.0, num=5, endpoint=False).execute() + array([ 2. , 2.2, 2.4, 2.6, 2.8]) + >>> mt.linspace(2.0, 3.0, num=5, retstep=True).execute() + (array([ 2. , 2.25, 2.5 , 2.75, 3. ]), 0.25) + + Graphical illustration: + + >>> import matplotlib.pyplot as plt + >>> N = 8 + >>> y = mt.zeros(N) + >>> x1 = mt.linspace(0, 10, N, endpoint=True) + >>> x2 = mt.linspace(0, 10, N, endpoint=False) + >>> plt.plot(x1.execute(), y.execute(), 'o') + [] + >>> plt.plot(x2.execute(), y.execute() + 0.5, 'o') + [] + >>> plt.ylim([-0.5, 1]) + (-0.5, 1) + >>> plt.show() + + """ + num = int(num) + + op = TensorLinspace(start, stop, num, endpoint, dtype=dtype, gpu=gpu) + shape = (num,) + ret = op(shape, chunk_size=chunk_size) + + if not retstep: + return ret + + if num > 1: + step = float(stop - start) / (num if not endpoint else num - 1) + else: + step = np.nan + + return ExecutableTuple([ret, step]) diff --git a/python/xorbits/_mars/tensor/datasource/meshgrid.py b/python/xorbits/_mars/tensor/datasource/meshgrid.py new file mode 100644 index 000000000..bb2817518 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/meshgrid.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .array import tensor + + +def meshgrid(*xi, **kwargs): + """ + Return coordinate matrices from coordinate vectors. + + Make N-D coordinate arrays for vectorized evaluations of + N-D scalar/vector fields over N-D grids, given + one-dimensional coordinate tensors x1, x2,..., xn. + + Parameters + ---------- + x1, x2,..., xn : array_like + 1-D arrays representing the coordinates of a grid. + indexing : {'xy', 'ij'}, optional + Cartesian ('xy', default) or matrix ('ij') indexing of output. + See Notes for more details. 
+ sparse : bool, optional + If True a sparse grid is returned in order to conserve memory. + Default is False. + + Returns + ------- + X1, X2,..., XN : Tensor + For vectors `x1`, `x2`,..., 'xn' with lengths ``Ni=len(xi)`` , + return ``(N1, N2, N3,...Nn)`` shaped tensors if indexing='ij' + or ``(N2, N1, N3,...Nn)`` shaped tensors if indexing='xy' + with the elements of `xi` repeated to fill the matrix along + the first dimension for `x1`, the second for `x2` and so on. + + Notes + ----- + This function supports both indexing conventions through the indexing + keyword argument. Giving the string 'ij' returns a meshgrid with + matrix indexing, while 'xy' returns a meshgrid with Cartesian indexing. + In the 2-D case with inputs of length M and N, the outputs are of shape + (N, M) for 'xy' indexing and (M, N) for 'ij' indexing. In the 3-D case + with inputs of length M, N and P, outputs are of shape (N, M, P) for + 'xy' indexing and (M, N, P) for 'ij' indexing. The difference is + illustrated by the following code snippet:: + + xv, yv = mt.meshgrid(x, y, sparse=False, indexing='ij') + for i in range(nx): + for j in range(ny): + # treat xv[i,j], yv[i,j] + + xv, yv = mt.meshgrid(x, y, sparse=False, indexing='xy') + for i in range(nx): + for j in range(ny): + # treat xv[j,i], yv[j,i] + + In the 1-D and 0-D case, the indexing and sparse keywords have no effect. + + Examples + -------- + >>> import mars.tensor as mt + + >>> nx, ny = (3, 2) + >>> x = mt.linspace(0, 1, nx) + >>> y = mt.linspace(0, 1, ny) + >>> xv, yv = mt.meshgrid(x, y) + >>> xv.execute() + array([[ 0. , 0.5, 1. ], + [ 0. , 0.5, 1. ]]) + >>> yv.execute() + array([[ 0., 0., 0.], + [ 1., 1., 1.]]) + >>> xv, yv = mt.meshgrid(x, y, sparse=True) # make sparse output arrays + >>> xv.execute() + array([[ 0. , 0.5, 1. ]]) + >>> yv.execute() + array([[ 0.], + [ 1.]]) + + `meshgrid` is very useful to evaluate functions on a grid. + + >>> import matplotlib.pyplot as plt + >>> x = mt.arange(-5, 5, 0.1) + >>> y = mt.arange(-5, 5, 0.1) + >>> xx, yy = mt.meshgrid(x, y, sparse=True) + >>> z = mt.sin(xx**2 + yy**2) / (xx**2 + yy**2) + >>> h = plt.contourf(x,y,z) + + """ + from ..base import broadcast_to + + indexing = kwargs.pop("indexing", "xy") + sparse = kwargs.pop("sparse", False) + + if kwargs: + raise TypeError( + f"meshgrid() got an unexpected keyword argument '{list(kwargs)[0]}'" + ) + if indexing not in ("xy", "ij"): + raise ValueError("Valid values for `indexing` are 'xy' and 'ij'.") + + xi = [tensor(x) for x in xi] + xi = [a.ravel() for a in xi] + shape = [x.size for x in xi] + + if indexing == "xy" and len(xi) > 1: + xi[0], xi[1] = xi[1], xi[0] + shape[0], shape[1] = shape[1], shape[0] + + grid = [] + for i, x in enumerate(xi): + slc = [None] * len(shape) + slc[i] = slice(None) + + r = x[tuple(slc)] + + if not sparse: + r = broadcast_to(r, shape) + + grid.append(r) + + if indexing == "xy" and len(xi) > 1: + grid[0], grid[1] = grid[1], grid[0] + + return grid diff --git a/python/xorbits/_mars/tensor/datasource/ones.py b/python/xorbits/_mars/tensor/datasource/ones.py new file mode 100644 index 000000000..01c1a1b5a --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/ones.py @@ -0,0 +1,217 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import ( + AnyField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ..array_utils import convert_order, create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorOnes(TensorNoInput): + _op_type_ = OperandDef.TENSOR_ONES + + order = StringField("order") + shape = TupleField("shape", FieldTypes.int64) + chunk_size = AnyField("chunk_size") + + def __init__(self, shape=None, **kwargs): + if type(shape) is int: + shape = (shape,) + super().__init__(shape=shape, **kwargs) + + def to_chunk_op(self, *args): + chunk_op = super().to_chunk_op(*args) + chunk_op.shape = args[0] + chunk_op.chunk_size = None + return chunk_op + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + try: + ctx[chunk.key] = create_array(op)( + "ones", op.shape, dtype=op.dtype, order=op.order + ) + except TypeError: # in case that cp.ones does not have arg ``order`` + x = create_array(op)("ones", op.shape, dtype=op.dtype) + ctx[chunk.key] = convert_order(x, op.order) + + +def ones(shape, dtype=None, chunk_size=None, gpu=None, order="C"): + """ + Return a new tensor of given shape and type, filled with ones. + + Parameters + ---------- + shape : int or sequence of ints + Shape of the new tensor, e.g., ``(2, 3)`` or ``2``. + dtype : data-type, optional + The desired data-type for the tensor, e.g., `mt.int8`. Default is + `mt.float64`. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + order : {'C', 'F'}, optional, default: C + Whether to store multi-dimensional data in row-major + (C-style) or column-major (Fortran-style) order in + memory. + + Returns + ------- + out : Tensor + Tensor of ones with the given shape, dtype, and order. 
+ + See Also + -------- + zeros, ones_like + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.ones(5).execute() + array([ 1., 1., 1., 1., 1.]) + + >>> mt.ones((5,), dtype=int).execute() + array([1, 1, 1, 1, 1]) + + >>> mt.ones((2, 1)).execute() + array([[ 1.], + [ 1.]]) + + >>> s = (2,2) + >>> mt.ones(s).execute() + array([[ 1., 1.], + [ 1., 1.]]) + + """ + tensor_order = get_order( + order, + None, + available_options="CF", + err_msg="only 'C' or 'F' order is permitted", + ) + dtype = np.dtype(dtype or "f8") + op = TensorOnes( + dtype=dtype, shape=shape, chunk_size=chunk_size, gpu=gpu, order=order + ) + return op(shape, chunk_size=chunk_size, order=tensor_order) + + +class TensorOnesLike(TensorLike): + _op_type_ = OperandDef.TENSOR_ONES_LIKE + + _input = KeyField("input") + + def __init__(self, dtype=None, sparse=False, **kw): + dtype = np.dtype(dtype) if dtype is not None else None + super().__init__(dtype=dtype, sparse=sparse, **kw) + + @classmethod + def execute_sparse(cls, ctx, op): + chunk = op.outputs[0] + in_data = naked(ctx[op.input.key]) + xps = get_sparse_module(in_data) + xp = get_array_module(in_data) + ctx[chunk.key] = SparseNDArray( + xps.csr_matrix( + ( + xp.ones_like(in_data.data, dtype=chunk.op.dtype), + in_data.indices, + in_data.indptr, + ), + shape=in_data.shape, + ) + ) + + @classmethod + def execute(cls, ctx, op): + if op.sparse: + cls.execute_sparse(ctx, op) + else: + ctx[op.outputs[0].key] = create_array(op)( + "ones_like", ctx[op.inputs[0].key], dtype=op.dtype + ) + + +def ones_like(a, dtype=None, gpu=None, order="K"): + """ + Return a tensor of ones with the same shape and type as a given tensor. + + Parameters + ---------- + a : array_like + The shape and data-type of `a` define these same attributes of + the returned tensor. + dtype : data-type, optional + Overrides the data type of the result. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + order : {'C', 'F', 'A', or 'K'}, optional + Overrides the memory layout of the result. 'C' means C-order, + 'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous, + 'C' otherwise. 'K' means match the layout of `a` as closely + as possible. + + Returns + ------- + out : Tensor + Tensor of ones with the same shape and type as `a`. + + See Also + -------- + zeros_like : Return a tensor of zeros with shape and type of input. + empty_like : Return a empty tensor with shape and type of input. + zeros : Return a new tensor setting values to zero. + ones : Return a new tensor setting values to one. + empty : Return a new uninitialized tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(6) + >>> x = x.reshape((2, 3)) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.ones_like(x).execute() + array([[1, 1, 1], + [1, 1, 1]]) + + >>> y = mt.arange(3, dtype=float) + >>> y.execute() + array([ 0., 1., 2.]) + >>> mt.ones_like(y).execute() + array([ 1., 1., 1.]) + + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + gpu = a.op.gpu if gpu is None else gpu + op = TensorOnesLike(dtype=dtype, gpu=gpu, sparse=a.issparse(), order=order) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datasource/scalar.py b/python/xorbits/_mars/tensor/datasource/scalar.py new file mode 100644 index 000000000..0e303a058 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/scalar.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..array_utils import create_array +from .core import TensorNoInput + + +class Scalar(TensorNoInput): + """ + Operand represents scalar type. + """ + + _op_type_ = OperandDef.SCALAR + + _data = AnyField("data") + + def __init__(self, data=None, **kw): + super().__init__(_data=data, **kw) + + @classmethod + def tile(cls, op): + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk(None, shape=(), index=()) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=[chunk], nsplits=() + ) + + @property + def data(self): + return self._data + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if chunk.ndim != 0: + raise ValueError("Missing op for chunk") + ctx[chunk.key] = create_array(op)("asarray", op.data) + + +def scalar(data, dtype=None, gpu=None): + try: + arr = np.array(data, dtype=dtype) + op = Scalar(arr, dtype=arr.dtype, gpu=gpu) + shape = () + return op(shape) + except ValueError: + raise TypeError(f"Expect scalar, got: {data}") diff --git a/python/xorbits/_mars/tensor/datasource/tests/__init__.py b/python/xorbits/_mars/tensor/datasource/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/datasource/tests/test_datasource.py b/python/xorbits/_mars/tensor/datasource/tests/test_datasource.py new file mode 100644 index 000000000..1b51c8413 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tests/test_datasource.py @@ -0,0 +1,647 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
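# The tests below build tensors lazily and call ``tile`` to materialize chunk
# metadata before asserting on shapes, nsplits and chunk keys. A minimal
# sketch of that pattern, assuming the public ``mars.tensor`` API:
#
#     >>> import mars.tensor as mt
#     >>> from mars.core import tile
#     >>> t = tile(mt.ones((10, 10), chunk_size=5))
#     >>> t.nsplits
#     ((5, 5), (5, 5))
#     >>> len(t.chunks)
#     4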
+ +import shutil +import tempfile +from copy import copy + +import numpy as np +import pytest +import scipy.sparse as sps + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + +from .... import dataframe as md +from ....core import enter_mode, tile +from ... import arange, diag, full, linspace, ones, ones_like, tensor, tril, triu, zeros +from ...core import SparseTensor, Tensor +from .. import ( + TensorTileDBDataSource, + array, + asarray, + ascontiguousarray, + asfortranarray, + fromdense, + fromtiledb, +) +from ..array import CSRMatrixDataSource +from ..from_dataframe import from_dataframe +from ..from_dense import DenseToSparse +from ..ones import TensorOnes, TensorOnesLike +from ..tri import TensorTril, TensorTriu +from ..zeros import TensorZeros + + +def test_array(): + a = tensor([0, 1, 2], chunk_size=2) + + b = array(a) + assert a is not b + + c = asarray(a) + assert a is c + + +def test_ascontiguousarray(): + # dtype different + raw_a = np.asfortranarray(np.random.rand(2, 4)) + raw_b = np.ascontiguousarray(raw_a, dtype="f4") + + a = tensor(raw_a, chunk_size=2) + b = ascontiguousarray(a, dtype="f4") + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + # no copy + raw_a = np.random.rand(2, 4) + raw_b = np.ascontiguousarray(raw_a) + + a = tensor(raw_a, chunk_size=2) + b = ascontiguousarray(a) + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + +def test_asfortranarray(): + # dtype different + raw_a = np.random.rand(2, 4) + raw_b = np.asfortranarray(raw_a, dtype="f4") + + a = tensor(raw_a, chunk_size=2) + b = asfortranarray(a, dtype="f4") + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + # no copy + raw_a = np.asfortranarray(np.random.rand(2, 4)) + raw_b = np.asfortranarray(raw_a) + + a = tensor(raw_a, chunk_size=2) + b = asfortranarray(a) + + assert a.dtype == raw_a.dtype + assert a.flags["C_CONTIGUOUS"] == raw_a.flags["C_CONTIGUOUS"] + assert a.flags["F_CONTIGUOUS"] == raw_a.flags["F_CONTIGUOUS"] + + assert b.dtype == raw_b.dtype + assert b.flags["C_CONTIGUOUS"] == raw_b.flags["C_CONTIGUOUS"] + assert b.flags["F_CONTIGUOUS"] == raw_b.flags["F_CONTIGUOUS"] + + +def test_ones(): + tensor = ones((10, 10, 8), chunk_size=(3, 3, 5)) + tensor = tile(tensor) + assert tensor.shape == (10, 10, 8) + assert len(tensor.chunks) == 32 + + tensor = ones((10, 3), chunk_size=(4, 2)) + tensor = tile(tensor) + assert tensor.shape == (10, 3) + + chunk = tensor.cix[1, 1] + assert tensor.get_chunk_slices(chunk.index) == (slice(4, 8), slice(2, 3)) + + tensor = ones((10, 5), chunk_size=(2, 3), gpu=True) + tensor = tile(tensor) + + assert tensor.op.gpu is True + assert tensor.chunks[0].op.gpu is True + + tensor1 = ones((10, 10, 8), chunk_size=(3, 3, 5)) + 
tensor1 = tile(tensor1) + + tensor2 = ones((10, 10, 8), chunk_size=(3, 3, 5)) + tensor2 = tile(tensor2) + + assert tensor1.chunks[0].op.key == tensor2.chunks[0].op.key + assert tensor1.chunks[0].key == tensor2.chunks[0].key + assert tensor1.chunks[0].op.key != tensor1.chunks[1].op.key + assert tensor1.chunks[0].key != tensor1.chunks[1].key + + tensor = ones((2, 3, 4)) + assert len(list(tensor)) == 2 + + tensor2 = ones((2, 3, 4), chunk_size=1) + assert tensor.op.key != tensor2.op.key + assert tensor.key != tensor2.key + + tensor3 = ones((2, 3, 3)) + assert tensor.op.key != tensor3.op.key + assert tensor.key != tensor3.key + + # test create chunk op of ones manually + chunk_op1 = TensorOnes(dtype=tensor.dtype) + chunk1 = chunk_op1.new_chunk(None, shape=(3, 3), index=(0, 0)) + chunk_op2 = TensorOnes(dtype=tensor.dtype) + chunk2 = chunk_op2.new_chunk(None, shape=(3, 4), index=(0, 1)) + assert chunk1.op.key != chunk2.op.key + assert chunk1.key != chunk2.key + + tensor = ones((100, 100), chunk_size=50) + tensor = tile(tensor) + assert len({c.op.key for c in tensor.chunks}) == 1 + assert len({c.key for c in tensor.chunks}) == 1 + + +def test_zeros(): + tensor = zeros((2, 3, 4)) + assert len(list(tensor)) == 2 + assert tensor.op.gpu is None + + tensor2 = zeros((2, 3, 4), chunk_size=1) + # tensor's op key must be equal to tensor2 + assert tensor.op.key != tensor2.op.key + assert tensor.key != tensor2.key + + tensor3 = zeros((2, 3, 3)) + assert tensor.op.key != tensor3.op.key + assert tensor.key != tensor3.key + + # test create chunk op of zeros manually + chunk_op1 = TensorZeros(dtype=tensor.dtype) + chunk1 = chunk_op1.new_chunk(None, shape=(3, 3), index=(0, 0)) + chunk_op2 = TensorZeros(dtype=tensor.dtype) + chunk2 = chunk_op2.new_chunk(None, shape=(3, 4), index=(0, 1)) + assert chunk1.op.key != chunk2.op.key + assert chunk1.key != chunk2.key + + tensor = zeros((100, 100), chunk_size=50) + tensor = tile(tensor) + assert len({c.op.key for c in tensor.chunks}) == 1 + assert len({c.key for c in tensor.chunks}) == 1 + + +def test_data_source(): + from ...base.broadcast_to import TensorBroadcastTo + + data = np.random.random((10, 3)) + t = tensor(data, chunk_size=2) + assert t.op.gpu is None + t = tile(t) + assert (t.chunks[0].op.data == data[:2, :2]).all() + assert (t.chunks[1].op.data == data[:2, 2:3]).all() + assert (t.chunks[2].op.data == data[2:4, :2]).all() + assert (t.chunks[3].op.data == data[2:4, 2:3]).all() + + assert t.key == tile(tensor(data, chunk_size=2)).key + assert t.key != tile(tensor(data, chunk_size=3)).key + assert t.key != tile(tensor(np.random.random((10, 3)), chunk_size=2)).key + + t = tensor(data, chunk_size=2, gpu=True) + t = tile(t) + + assert t.op.gpu is True + assert t.chunks[0].op.gpu is True + + t = full((2, 2), 2, dtype="f4") + assert t.op.gpu is None + assert t.shape == (2, 2) + assert t.dtype == np.float32 + + t = full((2, 2), [1.0, 2.0], dtype="f4") + assert t.shape == (2, 2) + assert t.dtype == np.float32 + assert isinstance(t.op, TensorBroadcastTo) + + with pytest.raises(ValueError): + full((2, 2), [1.0, 2.0, 3.0], dtype="f4") + + +def test_ufunc(): + t = ones((3, 10), chunk_size=2) + + x = np.add(t, [[1], [2], [3]]) + assert isinstance(x, Tensor) + + y = np.sum(t, axis=1) + assert isinstance(y, Tensor) + + +def test_arange(): + t = arange(10, chunk_size=3) + + assert t.op.gpu is False + t = tile(t) + + assert t.shape == (10,) + assert t.nsplits == ((3, 3, 3, 1),) + assert t.chunks[1].op.start == 3 + assert t.chunks[1].op.stop == 6 + + t = arange(0, 10, 3, 
chunk_size=2) + t = tile(t) + + assert t.shape == (4,) + assert t.nsplits == ((2, 2),) + assert t.chunks[0].op.start == 0 + assert t.chunks[0].op.stop == 6 + assert t.chunks[0].op.step == 3 + assert t.chunks[1].op.start == 6 + assert t.chunks[1].op.stop == 12 + assert t.chunks[1].op.step == 3 + + pytest.raises(TypeError, lambda: arange(10, start=0)) + pytest.raises(TypeError, lambda: arange(0, 10, stop=0)) + pytest.raises(TypeError, lambda: arange()) + pytest.raises( + ValueError, lambda: arange("1066-10-13", dtype=np.datetime64, chunks=3) + ) + + +def test_diag(): + # test 2-d, shape[0] == shape[1], k == 0 + v = tensor(np.arange(16).reshape(4, 4), chunk_size=2) + t = diag(v) + + assert t.shape == (4,) + assert t.op.gpu is None + t = tile(t) + assert t.nsplits == ((2, 2),) + + v = tensor(np.arange(16).reshape(4, 4), chunk_size=(2, 3)) + t = diag(v) + + assert t.shape == (4,) + t = tile(t) + assert t.nsplits == ((2, 1, 1),) + + # test 1-d, k == 0 + v = tensor(np.arange(3), chunk_size=2) + t = diag(v, sparse=True) + + assert t.shape == (3, 3) + t = tile(t) + assert t.nsplits == ((2, 1), (2, 1)) + assert len([c for c in t.chunks if c.op.__class__.__name__ == "TensorDiag"]) == 2 + assert t.chunks[0].op.sparse is True + + # test 2-d, shape[0] != shape[1] + v = tensor(np.arange(24).reshape(4, 6), chunk_size=2) + t = diag(v) + + assert t.shape == np.diag(np.arange(24).reshape(4, 6)).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + v = tensor(np.arange(24).reshape(4, 6), chunk_size=2) + + t = diag(v, k=1) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=1).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + t = diag(v, k=2) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=2).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + t = diag(v, k=-1) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=-1).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + t = diag(v, k=-2) + assert t.shape == np.diag(np.arange(24).reshape(4, 6), k=-2).shape + t = tile(t) + assert tuple(sum(s) for s in t.nsplits) == t.shape + + # test tiled zeros' keys + a = arange(5, chunk_size=2) + t = diag(a) + t = tile(t) + # 1 and 2 of t.chunks is ones, they have different shapes + assert t.chunks[1].op.key != t.chunks[2].op.key + + +def test_linspace(): + a = linspace(2.0, 3.0, num=5, chunk_size=2) + + assert a.shape == (5,) + + a = tile(a) + assert a.nsplits == ((2, 2, 1),) + assert a.chunks[0].op.start == 2.0 + assert a.chunks[0].op.stop == 2.25 + assert a.chunks[1].op.start == 2.5 + assert a.chunks[1].op.stop == 2.75 + assert a.chunks[2].op.start == 3.0 + assert a.chunks[2].op.stop == 3.0 + + a = linspace(2.0, 3.0, num=5, endpoint=False, chunk_size=2) + + assert a.shape == (5,) + + a = tile(a) + assert a.nsplits == ((2, 2, 1),) + assert a.chunks[0].op.start == 2.0 + assert a.chunks[0].op.stop == 2.2 + assert a.chunks[1].op.start == 2.4 + assert a.chunks[1].op.stop == 2.6 + assert a.chunks[2].op.start == 2.8 + assert a.chunks[2].op.stop == 2.8 + + _, step = linspace(2.0, 3.0, num=5, chunk_size=2, retstep=True) + assert step == 0.25 + + +def test_triu_tril(): + a_data = np.arange(12).reshape(4, 3) + a = tensor(a_data, chunk_size=2) + + t = triu(a) + + assert t.op.gpu is None + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTriu) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorZeros) + assert isinstance(t.chunks[3].op, 
TensorTriu) + + t = triu(a, k=1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTriu) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorZeros) + assert isinstance(t.chunks[3].op, TensorZeros) + + t = triu(a, k=2) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorZeros) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorZeros) + assert isinstance(t.chunks[3].op, TensorZeros) + + t = triu(a, k=-1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTriu) + assert isinstance(t.chunks[1].op, TensorTriu) + assert isinstance(t.chunks[2].op, TensorTriu) + assert isinstance(t.chunks[3].op, TensorTriu) + + t = tril(a) + + assert t.op.gpu is None + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTril) + assert isinstance(t.chunks[1].op, TensorZeros) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorTril) + + t = tril(a, k=1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTril) + assert isinstance(t.chunks[1].op, TensorTril) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorTril) + + t = tril(a, k=-1) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorTril) + assert isinstance(t.chunks[1].op, TensorZeros) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorTril) + + t = tril(a, k=-2) + + t = tile(t) + assert len(t.chunks) == 4 + assert isinstance(t.chunks[0].op, TensorZeros) + assert isinstance(t.chunks[1].op, TensorZeros) + assert isinstance(t.chunks[2].op, TensorTril) + assert isinstance(t.chunks[3].op, TensorZeros) + + +def test_set_tensor_inputs(): + t1 = tensor([1, 2], chunk_size=2) + t2 = tensor([2, 3], chunk_size=2) + t3 = t1 + t2 + + t1c = copy(t1) + t2c = copy(t2) + + assert t1c is not t1 + assert t2c is not t2 + + assert t3.op.lhs is t1.data + assert t3.op.rhs is t2.data + assert t3.op.inputs == [t1.data, t2.data] + assert t3.inputs == [t1.data, t2.data] + + with pytest.raises(StopIteration): + t3.inputs = [] + + t1 = tensor([1, 2], chunk_size=2) + t2 = tensor([True, False], chunk_size=2) + t3 = t1[t2] + + t1c = copy(t1) + t2c = copy(t2) + t3c = copy(t3) + t3c.inputs = [t1c, t2c] + + with enter_mode(build=True): + assert t3c.op.input is t1c.data + assert t3c.op.indexes[0] is t2c.data + + +def test_from_spmatrix(): + t = tensor(sps.csr_matrix([[0, 0, 1], [1, 0, 0]], dtype="f8"), chunk_size=2) + + assert isinstance(t, SparseTensor) + assert isinstance(t.op, CSRMatrixDataSource) + assert t.issparse() is True + assert not t.op.gpu + + t = tile(t) + assert t.chunks[0].index == (0, 0) + assert isinstance(t.op, CSRMatrixDataSource) + assert not t.op.gpu + m = sps.csr_matrix([[0, 0], [1, 0]]) + assert np.array_equal(t.chunks[0].op.indices, m.indices) is True + assert np.array_equal(t.chunks[0].op.indptr, m.indptr) is True + assert np.array_equal(t.chunks[0].op.data, m.data) is True + assert np.array_equal(t.chunks[0].op.shape, m.shape) is True + + +def test_from_dense(): + t = fromdense(tensor([[0, 0, 1], [1, 0, 0]], chunk_size=2)) + + assert isinstance(t, SparseTensor) + assert isinstance(t.op, DenseToSparse) + assert t.issparse() is True + + t = tile(t) + assert t.chunks[0].index == (0, 0) + assert isinstance(t.op, DenseToSparse) + + +def test_ones_like(): + t1 = tensor([[0, 0, 1], [1, 0, 
0]], chunk_size=2).tosparse() + t = ones_like(t1, dtype="f8") + + assert isinstance(t, SparseTensor) + assert isinstance(t.op, TensorOnesLike) + assert t.issparse() is True + assert t.op.gpu is None + + t = tile(t) + assert t.chunks[0].index == (0, 0) + assert isinstance(t.op, TensorOnesLike) + assert t.chunks[0].issparse() is True + + +def test_from_array(): + x = array([1, 2, 3]) + assert x.shape == (3,) + + y = array([x, x]) + assert y.shape == (2, 3) + + z = array((x, x, x)) + assert z.shape == (3, 3) + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_from_tile_db(): + ctx = tiledb.Ctx() + + for sparse in (True, False): + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, name="i", domain=(1, 30), tile=7, dtype=np.int32), + tiledb.Dim(ctx=ctx, name="j", domain=(1, 20), tile=3, dtype=np.int32), + tiledb.Dim(ctx=ctx, name="k", domain=(1, 10), tile=4, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=sparse, + attrs=[tiledb.Attr(ctx=ctx, name="a", dtype=np.float32)], + ) + + tempdir = tempfile.mkdtemp() + try: + # create tiledb array + array_type = tiledb.DenseArray if not sparse else tiledb.SparseArray + array_type.create(tempdir, schema) + + tensor = fromtiledb(tempdir) + assert isinstance(tensor.op, TensorTileDBDataSource) + assert tensor.op.issparse() == sparse + assert tensor.shape == (30, 20, 10) + assert tensor.extra_params.raw_chunk_size == (7, 3, 4) + assert tensor.op.tiledb_config is None + assert tensor.op.tiledb_uri == tempdir + assert tensor.op.tiledb_key is None + assert tensor.op.tiledb_timestamp is None + + tensor = tile(tensor) + + assert len(tensor.chunks) == 105 + assert isinstance(tensor.chunks[0].op, TensorTileDBDataSource) + assert tensor.chunks[0].op.issparse() == sparse + assert tensor.chunks[0].shape == (7, 3, 4) + assert tensor.chunks[0].op.tiledb_config is None + assert tensor.chunks[0].op.tiledb_uri == tempdir + assert tensor.chunks[0].op.tiledb_key is None + assert tensor.chunks[0].op.tiledb_timestamp is None + assert tensor.chunks[0].op.tiledb_dim_starts == (1, 1, 1) + + # test axis_offsets of chunk op + assert tensor.chunks[0].op.axis_offsets == (0, 0, 0) + assert tensor.chunks[1].op.axis_offsets == (0, 0, 4) + assert tensor.cix[0, 2, 2].op.axis_offsets == (0, 6, 8) + assert tensor.cix[0, 6, 2].op.axis_offsets == (0, 18, 8) + assert tensor.cix[4, 6, 2].op.axis_offsets == (28, 18, 8) + + tensor2 = fromtiledb(tempdir, ctx=ctx) + assert tensor2.op.tiledb_config == ctx.config().dict() + + tensor2 = tile(tensor2) + + assert tensor2.chunks[0].op.tiledb_config == ctx.config().dict() + finally: + shutil.rmtree(tempdir) + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_dim_start_float(): + ctx = tiledb.Ctx() + + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, name="i", domain=(0.0, 6.0), tile=6, dtype=np.float64), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=True, + attrs=[tiledb.Attr(ctx=ctx, name="a", dtype=np.float32)], + ) + + tempdir = tempfile.mkdtemp() + try: + # create tiledb array + tiledb.SparseArray.create(tempdir, schema) + + with pytest.raises(ValueError): + fromtiledb(tempdir, ctx=ctx) + finally: + shutil.rmtree(tempdir) + + +def test_from_dataframe(): + mdf = md.DataFrame( + {"a": [0, 1, 2], "b": [3, 4, 5], "c": [0.1, 0.2, 0.3]}, + index=["c", "d", "e"], + chunk_size=2, + ) + tensor = from_dataframe(mdf) + assert tensor.shape == (3, 3) + assert np.float64 == tensor.dtype diff --git 
a/python/xorbits/_mars/tensor/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/tensor/datasource/tests/test_datasource_execution.py new file mode 100644 index 000000000..85769a551 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tests/test_datasource_execution.py @@ -0,0 +1,1168 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile +import time + +import numpy as np +import pandas as pd +import pytest + +try: + import scipy.sparse as sps +except ImportError: + sps = None +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None +try: + import h5py +except ImportError: # pragma: no cover + h5py = None +try: + import zarr +except ImportError: # pragma: no cover + zarr = None + +from .... import dataframe as md +from .... import tensor as mt +from ....lib.sparse import SparseNDArray +from ....tests.core import require_cupy +from ....utils import lazy_import +from ...lib import nd_grid +from .. import ( + arange, + diag, + diagflat, + empty, + empty_like, + eye, + from_dataframe, + fromhdf5, + fromtiledb, + fromzarr, + full, + full_like, + indices, + linspace, + meshgrid, + ones_like, + tensor, + tril, + triu, + zeros, + zeros_like, +) + +cupy = lazy_import("cupy") + + +@require_cupy +def test_array_gpu_execution(setup_gpu): + raw = cupy.random.rand(20, 30) + t = tensor(raw, dtype="f8", chunk_size=10) + + res = t.execute().fetch() + expected = raw.astype("f8") + cupy.testing.assert_array_equal(res, expected) + + +def test_create_sparse_execution(setup): + mat = sps.csr_matrix([[0, 0, 2], [2, 0, 0]]) + t = tensor(mat, dtype="f8", chunk_size=2) + + res = t.execute().fetch() + assert isinstance(res, SparseNDArray) + assert res.dtype == np.float64 + np.testing.assert_array_equal(res.toarray(), mat.toarray()) + + t2 = ones_like(t, dtype="f4") + + res = t2.execute().fetch() + expected = sps.csr_matrix([[0, 0, 1], [1, 0, 0]]) + assert isinstance(res, SparseNDArray) + assert res.dtype == np.float32 + np.testing.assert_array_equal(res.toarray(), expected.toarray()) + + t3 = tensor(np.array([[0, 0, 2], [2, 0, 0]]), chunk_size=2).tosparse() + + res = t3.execute().fetch() + assert isinstance(res, SparseNDArray) + assert res.dtype == np.int_ + np.testing.assert_array_equal(res.toarray(), mat.toarray()) + + # test missing argument + t4 = tensor(np.array([[0, 0, 2], [2, 0, 0]]), chunk_size=2).tosparse(missing=2) + t4 = t4 + 1 + expected = mat.toarray() + raw = expected.copy() + expected[raw == 0] += 1 + expected[raw != 0] = 0 + + res = t4.execute().fetch() + assert isinstance(res, SparseNDArray) + assert res.dtype == np.int_ + np.testing.assert_array_equal(res.toarray(), expected) + + # test missing argument that is np.nan + t5 = tensor( + np.array([[np.nan, np.nan, 2], [2, np.nan, -999]]), chunk_size=2 + ).tosparse(missing=[np.nan, -999]) + t5 = (t5 + 1).todense(fill_value=np.nan) + expected = mat.toarray().astype(float) + expected[expected != 0] += 1 + expected[expected == 
0] = np.nan + + res = t5.execute().fetch() + assert res.dtype == np.float64 + np.testing.assert_array_equal(res, expected) + + +def test_zeros_execution(setup): + t = zeros((20, 30), dtype="i8", chunk_size=10) + + res = t.execute().fetch() + np.testing.assert_array_equal(res, np.zeros((20, 30), dtype="i8")) + assert res[0].dtype == np.int64 + + t2 = zeros_like(t) + res = t2.execute().fetch() + np.testing.assert_array_equal(res, np.zeros((20, 30), dtype="i8")) + assert res[0].dtype == np.int64 + + t = zeros((20, 30), dtype="i4", chunk_size=5, sparse=True) + res = t.execute().fetch() + + assert res[0].nnz == 0 + + t = zeros((20, 30), dtype="i8", chunk_size=6, order="F") + res = t.execute().fetch() + expected = np.zeros((20, 30), dtype="i8", order="F") + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_empty_execution(setup): + t = empty((20, 30), dtype="i8", chunk_size=5) + + res = t.execute().fetch() + assert res.shape == (20, 30) + assert res.dtype == np.int64 + + t = empty((20, 30), chunk_size=10) + + res = t.execute().fetch() + assert res.shape == (20, 30) + assert res.dtype == np.float64 + + t2 = empty_like(t) + res = t2.execute().fetch() + assert res.shape == (20, 30) + assert res.dtype == np.float64 + + t = empty((20, 30), dtype="i8", chunk_size=5, order="F") + + res = t.execute().fetch() + expected = np.empty((20, 30), dtype="i8", order="F") + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_full_execution(setup): + t = full((2, 2), 1, dtype="f4", chunk_size=1) + + res = t.execute().fetch() + np.testing.assert_array_equal(res, np.full((2, 2), 1, dtype="f4")) + + t = full((2, 2), [1, 2], dtype="f8", chunk_size=1) + + res = t.execute().fetch() + np.testing.assert_array_equal(res, np.full((2, 2), [1, 2], dtype="f8")) + + t = full((2, 2), 1, dtype="f4", chunk_size=1, order="F") + + res = t.execute().fetch() + expected = np.full((2, 2), 1, dtype="f4", order="F") + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + t2 = full_like(t, 10, order="F") + + res = t2.execute().fetch() + expected = np.full((2, 2), 10, dtype="f4", order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_arange_execution(setup): + t = arange(1, 20, 3, chunk_size=2) + + res = t.execute().fetch() + assert np.array_equal(res, np.arange(1, 20, 3)) is True + + t = arange(1, 20, 0.3, chunk_size=4) + + res = t.execute().fetch() + expected = np.arange(1, 20, 0.3) + assert np.allclose(res, expected) is True + + t = arange(1.0, 1.8, 0.3, chunk_size=4) + + res = t.execute().fetch() + expected = np.arange(1.0, 1.8, 0.3) + assert np.allclose(res, expected) is True + + t = arange("1066-10-13", "1066-10-31", dtype=np.datetime64, chunk_size=3) + + res = t.execute().fetch() + expected = np.arange("1066-10-13", "1066-10-31", dtype=np.datetime64) + assert np.array_equal(res, expected) is True + + +def test_diag_execution(setup): + # 2-d 6 * 6 + a = arange(36, chunk_size=5).reshape(6, 6) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6)) + np.testing.assert_equal(res, expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = 
np.diag(np.arange(36).reshape(6, 6), k=1) + np.testing.assert_equal(res, expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6), k=3) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6), k=-2) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-5) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(6, 6), k=-5) + np.testing.assert_equal(res, expected) + + # 2-d 6 * 6 sparse, no tensor + a = sps.rand(6, 6, density=0.1) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-5) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=-5) + np.testing.assert_equal(res.toarray(), expected) + + # 2-d 6 * 6 sparse, from tensor + raw_a = sps.rand(6, 6, density=0.1) + a = tensor(raw_a, chunk_size=2) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-5) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-5) + np.testing.assert_equal(res.toarray(), expected) + + # 2-d 4 * 9 + a = arange(36, chunk_size=2).reshape(4, 9) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9)) + np.testing.assert_equal(res, expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=1) + np.testing.assert_equal(res, expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=3) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=-2) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = np.diag(np.arange(36).reshape(4, 9), k=-3) + np.testing.assert_equal(res, expected) + + # 2-d 4 * 9 sparse, no tensor + a = sps.rand(4, 9, density=0.1) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = 
np.diag(a.toarray(), k=-3) + np.testing.assert_equal(res.toarray(), expected) + + # 2-d 4 * 9 sparse, from tensor + raw_a = sps.rand(4, 9, density=0.1) + a = tensor(raw_a, chunk_size=2) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray()) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=1) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=3) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-2) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = np.diag(raw_a.toarray(), k=-3) + np.testing.assert_equal(res.toarray(), expected) + + # 1-d + a = arange(5, chunk_size=2) + + d = diag(a) + res = d.execute().fetch() + expected = np.diag(np.arange(5)) + np.testing.assert_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + d = diag(a, k=1) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=1) + np.testing.assert_equal(res, expected) + + d = diag(a, k=3) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=3) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-2) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-2) + np.testing.assert_equal(res, expected) + + d = diag(a, k=-3) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-3) + np.testing.assert_equal(res, expected) + + d = diag(a, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5)) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=1, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=2, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-2, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + d = diag(a, k=-3, sparse=True) + res = d.execute().fetch() + expected = np.diag(np.arange(5), k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + +def test_diagflat_execution(setup): + a = diagflat([[1, 2], [3, 4]], chunk_size=1) + + res = a.execute().fetch() + expected = np.diagflat([[1, 2], [3, 4]]) + np.testing.assert_equal(res, expected) + + d = tensor([[1, 2], [3, 4]], chunk_size=1) + a = diagflat(d) + + res = a.execute().fetch() + expected = np.diagflat([[1, 2], [3, 4]]) + np.testing.assert_equal(res, expected) + + a = diagflat([1, 2], 1, chunk_size=1) + + res = a.execute().fetch() + expected = np.diagflat([1, 2], 1) + np.testing.assert_equal(res, expected) + + d = tensor([[1, 2]], chunk_size=1) + a = diagflat(d, 1) + + res = a.execute().fetch() + expected = np.diagflat([1, 2], 1) + np.testing.assert_equal(res, expected) + + +def test_eye_execution(setup): + t = eye(5, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5) + np.testing.assert_equal(res, expected) + + t = eye(5, k=1, chunk_size=2) + + res = t.execute().fetch() + expected 
= np.eye(5, k=1) + np.testing.assert_equal(res, expected) + + t = eye(5, k=2, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=2) + np.testing.assert_equal(res, expected) + + t = eye(5, k=-1, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-1) + np.testing.assert_equal(res, expected) + + t = eye(5, k=-3, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-3) + np.testing.assert_equal(res, expected) + + t = eye(5, M=3, k=1, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=1) + np.testing.assert_equal(res, expected) + + t = eye(5, M=3, k=-3, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=-3) + np.testing.assert_equal(res, expected) + + t = eye(5, M=7, k=1, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=7, k=1) + np.testing.assert_equal(res, expected) + + t = eye(5, M=8, k=-3, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=8, k=-3) + np.testing.assert_equal(res, expected) + + t = eye(2, dtype=int) + + res = t.execute().fetch() + assert res.dtype == np.int_ + + # test sparse + t = eye(5, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=2, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=-1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, k=-3, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=3, k=1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=3, k=-3, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=3, k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=7, k=1, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=7, k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=8, k=-3, sparse=True, chunk_size=2) + + res = t.execute().fetch() + expected = np.eye(5, M=8, k=-3) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res.toarray(), expected) + + t = eye(5, M=9, k=-3, chunk_size=2, order="F") + + res = t.execute().fetch() + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + +def test_linspace_execution(setup): + a = linspace(2.0, 9.0, num=11, chunk_size=3) + + res = a.execute().fetch() + expected = np.linspace(2.0, 9.0, num=11) + np.testing.assert_allclose(res, expected) + + a = linspace(2.0, 9.0, num=11, endpoint=False, chunk_size=3) + + res = a.execute().fetch() + expected = np.linspace(2.0, 9.0, num=11, endpoint=False) + np.testing.assert_allclose(res, expected) + + a = linspace(2.0, 9.0, num=11, 
chunk_size=3, dtype=int) + + res = a.execute().fetch() + assert res.dtype == np.int_ + + +def test_meshgrid_execution(setup): + a = arange(5, chunk_size=2) + b = arange(6, 12, chunk_size=3) + c = arange(12, 19, chunk_size=4) + + A, B, C = meshgrid(a, b, c) + + A_res = A.execute().fetch() + A_expected = np.meshgrid(np.arange(5), np.arange(6, 12), np.arange(12, 19))[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid(np.arange(5), np.arange(6, 12), np.arange(12, 19))[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid(np.arange(5), np.arange(6, 12), np.arange(12, 19))[2] + np.testing.assert_equal(C_res, C_expected) + + A, B, C = meshgrid(a, b, c, indexing="ij") + + A_res = A.execute().fetch() + A_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij" + )[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij" + )[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij" + )[2] + np.testing.assert_equal(C_res, C_expected) + + A, B, C = meshgrid(a, b, c, sparse=True) + + A_res = A.execute().fetch() + A_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), sparse=True + )[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), sparse=True + )[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), sparse=True + )[2] + np.testing.assert_equal(C_res, C_expected) + + A, B, C = meshgrid(a, b, c, indexing="ij", sparse=True) + + A_res = A.execute().fetch() + A_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij", sparse=True + )[0] + np.testing.assert_equal(A_res, A_expected) + + B_res = B.execute().fetch() + B_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij", sparse=True + )[1] + np.testing.assert_equal(B_res, B_expected) + + C_res = C.execute().fetch() + C_expected = np.meshgrid( + np.arange(5), np.arange(6, 12), np.arange(12, 19), indexing="ij", sparse=True + )[2] + np.testing.assert_equal(C_res, C_expected) + + +def test_indices_execution(setup): + grid = indices((2, 3), chunk_size=1) + + res = grid.execute().fetch() + expected = np.indices((2, 3)) + np.testing.assert_equal(res, expected) + + res = grid[0].execute().fetch() + np.testing.assert_equal(res, expected[0]) + + res = grid[1].execute().fetch() + np.testing.assert_equal(res, expected[1]) + + +def test_triu_execution(setup): + a = arange(24, chunk_size=2).reshape(2, 3, 4) + + t = triu(a) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4)) + np.testing.assert_equal(res, expected) + + t = triu(a, k=1) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=1) + np.testing.assert_equal(res, expected) + + t = triu(a, k=2) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=2) + np.testing.assert_equal(res, expected) + + t = triu(a, k=-1) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=-1) + np.testing.assert_equal(res, expected) + + t 
= triu(a, k=-2) + + res = t.execute().fetch() + expected = np.triu(np.arange(24).reshape(2, 3, 4), k=-2) + np.testing.assert_equal(res, expected) + + # test sparse + a = arange(12, chunk_size=2).reshape(3, 4).tosparse() + + t = triu(a) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4)) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=1) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=2) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=-1) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=-1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = triu(a, k=-2) + + res = t.execute().fetch() + expected = np.triu(np.arange(12).reshape(3, 4), k=-2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + raw = np.asfortranarray(np.random.rand(10, 7)) + a = tensor(raw, chunk_size=3) + + t = triu(a, k=-2) + + res = t.execute().fetch() + expected = np.triu(raw, k=-2) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_tril_execution(setup): + a = arange(24, chunk_size=2).reshape(2, 3, 4) + + t = tril(a) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4)) + np.testing.assert_equal(res, expected) + + t = tril(a, k=1) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=1) + np.testing.assert_equal(res, expected) + + t = tril(a, k=2) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=2) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-1) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=-1) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-2) + + res = t.execute().fetch() + expected = np.tril(np.arange(24).reshape(2, 3, 4), k=-2) + np.testing.assert_equal(res, expected) + + a = arange(12, chunk_size=2).reshape(3, 4).tosparse() + + t = tril(a) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4)) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=1) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=2) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-1) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=-1) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + t = tril(a, k=-2) + + res = t.execute().fetch() + expected = np.tril(np.arange(12).reshape(3, 4), k=-2) + assert isinstance(res, SparseNDArray) + np.testing.assert_equal(res, expected) + + +def test_index_trick_execution(setup): + mgrid = nd_grid() + t = mgrid[0:5, 0:5] + + res = t.execute().fetch() + expected = np.lib.index_tricks.nd_grid()[0:5, 0:5] + np.testing.assert_equal(res, expected) + + t = mgrid[-1:1:5j] + + 
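+    # the complex step in -1:1:5j asks nd_grid for 5 evenly spaced samples
+    # from -1 to 1 (endpoint included), mirroring np.mgrid's imaginary-step
+    # semantics; the result is compared against the NumPy reference below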
res = t.execute().fetch() + expected = np.lib.index_tricks.nd_grid()[-1:1:5j] + np.testing.assert_equal(res, expected) + + ogrid = nd_grid(sparse=True) + + t = ogrid[0:5, 0:5] + + res = [o.execute().fetch() for o in t] + expected = np.lib.index_tricks.nd_grid(sparse=True)[0:5, 0:5] + [np.testing.assert_equal(r, e) for r, e in zip(res, expected)] + + +@pytest.mark.skipif(tiledb is None, reason="tiledb not installed") +def test_read_tile_db_execution(setup): + ctx = tiledb.Ctx() + + tempdir = tempfile.mkdtemp() + try: + # create TileDB dense array + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 90), tile=22, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 9), tile=8, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=False, + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.DenseArray.create(tempdir, schema) + + expected = np.random.rand(100, 91, 10) + with tiledb.DenseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + arr.write_direct(expected) + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected, result) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # create 2-d TileDB sparse array + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(0, 99), tile=30, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(2, 11), tile=8, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=True, + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.SparseArray.create(tempdir, schema) + + expected = sps.rand(100, 10, density=0.01) + with tiledb.SparseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + I, J = expected.row, expected.col + 2 + arr[I, J] = {arr.attr(0).name: expected.data} + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected.toarray(), result.toarray()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # create 1-d TileDB sparse array + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32), ctx=ctx + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=True, + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.SparseArray.create(tempdir, schema) + + expected = sps.rand(1, 100, density=0.05) + with tiledb.SparseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + arr[expected.col + 1] = expected.data + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected.toarray()[0], result.toarray()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # create TileDB dense array with column-major + dom = tiledb.Domain( + tiledb.Dim(ctx=ctx, domain=(1, 100), tile=30, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 90), tile=22, dtype=np.int32), + tiledb.Dim(ctx=ctx, domain=(0, 9), tile=8, dtype=np.int32), + ctx=ctx, + ) + schema = tiledb.ArraySchema( + ctx=ctx, + domain=dom, + sparse=False, + cell_order="F", + attrs=[tiledb.Attr(ctx=ctx, dtype=np.float64)], + ) + tiledb.DenseArray.create(tempdir, schema) + + expected = np.asfortranarray(np.random.rand(100, 91, 10)) + with tiledb.DenseArray(uri=tempdir, ctx=ctx, mode="w") as arr: + arr.write_direct(expected) + + a = fromtiledb(tempdir, ctx=ctx) + result = a.execute().fetch() + + np.testing.assert_allclose(expected, result) + assert result.flags["F_CONTIGUOUS"] is True + assert 
result.flags["C_CONTIGUOUS"] is False + finally: + shutil.rmtree(tempdir) + + +def test_from_dataframe_execution(setup): + mdf = md.DataFrame( + {"angle": [0, 3, 4], "degree": [360, 180, 360]}, + index=["circle", "triangle", "rectangle"], + ) + tensor_result = from_dataframe(mdf).execute().fetch() + tensor_expected = mt.tensor([[0, 360], [3, 180], [4, 360]]).execute().fetch() + np.testing.assert_equal(tensor_result, tensor_expected) + + # test up-casting + mdf2 = md.DataFrame({"a": [0.1, 0.2, 0.3], "b": [1, 2, 3]}) + tensor_result2 = from_dataframe(mdf2).execute().fetch() + np.testing.assert_equal(tensor_result2[0].dtype, np.dtype("float64")) + tensor_expected2 = mt.tensor([[0.1, 1.0], [0.2, 2.0], [0.3, 3.0]]).execute().fetch() + np.testing.assert_equal(tensor_result2, tensor_expected2) + + raw = [[0.1, 0.2, 0.4], [0.4, 0.7, 0.3]] + mdf3 = md.DataFrame(raw, columns=list("abc"), chunk_size=2) + tensor_result3 = from_dataframe(mdf3).execute().fetch() + np.testing.assert_array_equal(tensor_result3, np.asarray(raw)) + assert tensor_result3.flags["F_CONTIGUOUS"] is True + assert tensor_result3.flags["C_CONTIGUOUS"] is False + + # test from series + series = md.Series([1, 2, 3]) + tensor_result = series.to_tensor().execute().fetch() + np.testing.assert_array_equal(tensor_result, np.array([1, 2, 3])) + + series = md.Series(range(10), chunk_size=3) + tensor_result = series.to_tensor().execute().fetch() + np.testing.assert_array_equal(tensor_result, np.arange(10)) + + # test from index + index = md.Index(pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)])) + tensor_result = index.to_tensor(extract_multi_index=True).execute().fetch() + np.testing.assert_array_equal(tensor_result, np.arange(6).reshape((3, 2))) + + index = md.Index(pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)])) + tensor_result = index.to_tensor(extract_multi_index=False).execute().fetch() + np.testing.assert_array_equal( + tensor_result, pd.MultiIndex.from_tuples([(0, 1), (2, 3), (4, 5)]).to_series() + ) + + +@pytest.mark.skipif(h5py is None, reason="h5py not installed") +def test_read_hdf5_execution(setup): + test_array = np.random.RandomState(0).rand(20, 10) + group_name = "test_group" + dataset_name = "test_dataset" + + with pytest.raises(TypeError): + fromhdf5(object()) + + with tempfile.TemporaryDirectory() as d: + filename = os.path.join(d, f"test_read_{int(time.time())}.hdf5") + with h5py.File(filename, "w") as f: + g = f.create_group(group_name) + g.create_dataset(dataset_name, chunks=(7, 4), data=test_array) + + # test filename + r = fromhdf5(filename, group=group_name, dataset=dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert r.extra_params["raw_chunk_size"] == (7, 4) + + with pytest.raises(ValueError): + fromhdf5(filename) + + with pytest.raises(ValueError): + fromhdf5(filename, dataset="non_exist") + + with h5py.File(filename, "r") as f: + # test file + r = fromhdf5(f, group=group_name, dataset=dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + + with pytest.raises(ValueError): + fromhdf5(f) + + with pytest.raises(ValueError): + fromhdf5(f, dataset="non_exist") + + # test dataset + ds = f[f"{group_name}/{dataset_name}"] + r = fromhdf5(ds) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + + +@pytest.mark.skipif(zarr is None, reason="zarr not installed") +def test_read_zarr_execution(setup): + session = setup + + test_array = np.random.RandomState(0).rand(20, 10) + group_name 
= "test_group" + dataset_name = "test_dataset" + + with pytest.raises(TypeError): + fromzarr(object()) + + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, f"test_read_{int(time.time())}.zarr") + + group = zarr.group(path) + arr = group.array(group_name + "/" + dataset_name, test_array, chunks=(7, 4)) + + r = fromzarr(arr) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 + + arr = zarr.open_array(f"{path}/{group_name}/{dataset_name}") + r = fromzarr(arr) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 + + r = fromzarr(path, group=group_name, dataset=dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 + + r = fromzarr(path + "/" + group_name + "/" + dataset_name) + + result = r.execute().fetch() + np.testing.assert_array_equal(result, test_array) + assert len(session._session._tileable_to_fetch[r.data].chunks) > 1 diff --git a/python/xorbits/_mars/tensor/datasource/tri.py b/python/xorbits/_mars/tensor/datasource/tri.py new file mode 100644 index 000000000..c1c0ca4ed --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/tri.py @@ -0,0 +1,204 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib import sparse +from ...serialization.serializables import Int32Field, KeyField +from ...utils import has_unknown_shape +from ..array_utils import create_array +from ..core import TensorOrder +from .array import tensor as astensor +from .core import TensorHasInput +from .zeros import TensorZeros + + +class TensorTri(TensorHasInput): + def __call__(self, m, order=None): + order = TensorOrder.C_ORDER if order is None else order + return self.new_tensor([m], shape=m.shape, order=order) + + def to_chunk_op(self, *args): + (k,) = args + op = self.copy().reset_key() + op._k = k + return op + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + tensor = op.outputs[0] + + m = op.input + k = op.k + is_triu = type(op) == TensorTriu + + fx = lambda x, y: x - y + k + nsplits = m.nsplits + cum_size = [np.cumsum(s).tolist() for s in nsplits] + + out_chunks = [] + for out_idx in itertools.product(*[range(len(s)) for s in nsplits]): + i, j = out_idx[-2:] + ld_pos = cum_size[-2][i] - 1, cum_size[-1][j] - nsplits[-1][j] + ru_pos = cum_size[-2][i] - nsplits[-2][i], cum_size[-1][j] - 1 + + ld_fx = fx(*ld_pos) + ru_fx = fx(*ru_pos) + + chunk_shape = tuple(nsplits[i][idx] for i, idx in enumerate(out_idx)) + if (is_triu and ld_fx > 0 and ru_fx > 0) or ( + not is_triu and ld_fx < 0 and ru_fx < 0 + ): + # does not cross, fill with zeros + chunk_op = TensorZeros( + dtype=op.dtype, + gpu=op.gpu, + sparse=op.sparse, + shape=chunk_shape, + order=tensor.order.value, + ) + out_chunk = chunk_op.new_chunk( + None, shape=chunk_shape, index=out_idx, order=tensor.order + ) + else: + lu_pos = ru_pos[0], ld_pos[1] + chunk_k = fx(*lu_pos) + + input_chunk = m.cix[out_idx] + chunk_op = op.to_chunk_op(chunk_k) + out_chunk = chunk_op.new_chunk( + [input_chunk], shape=chunk_shape, index=out_idx, order=tensor.order + ) + + out_chunks.append(out_chunk) + + new_op = op.copy() + params = tensor.params + params["chunks"] = out_chunks + params["nsplits"] = m.nsplits + return new_op.new_tensors(op.inputs, kws=[params]) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + f = "triu" if isinstance(op, TensorTriu) else "tril" + if op.sparse: + ctx[chunk.key] = getattr(sparse, f)(ctx[op.inputs[0].key], k=op.k) + else: + ctx[chunk.key] = create_array(op)(f, ctx[op.inputs[0].key], op.k) + + +class TensorTriu(TensorTri): + _op_type_ = OperandDef.TENSOR_TRIU + + _input = KeyField("input") + _k = Int32Field("k") + + def __init__(self, k=None, **kw): + super().__init__(_k=k, **kw) + + @property + def k(self): + return self._k + + +def triu(m, k=0, gpu=None): + """ + Upper triangle of a tensor. + + Return a copy of a matrix with the elements below the `k`-th diagonal + zeroed. + + Please refer to the documentation for `tril` for further details. + + See Also + -------- + tril : lower triangle of a tensor + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], -1).execute() + array([[ 1, 2, 3], + [ 4, 5, 6], + [ 0, 8, 9], + [ 0, 0, 12]]) + + """ + m = astensor(m) + gpu = m.op.gpu if gpu is None else gpu + op = TensorTriu(k, dtype=m.dtype, sparse=m.issparse(), gpu=gpu) + return op(m) + + +class TensorTril(TensorTri): + _op_type_ = OperandDef.TENSOR_TRIL + + _input = KeyField("input") + _k = Int32Field("k") + + def __init__(self, k=None, **kw): + super().__init__(_k=k, **kw) + + @property + def k(self): + return self._k + + +def tril(m, k=0, gpu=None): + """ + Lower triangle of a tensor. 
+ + Return a copy of a tensor with elements above the `k`-th diagonal zeroed. + + Parameters + ---------- + m : array_like, shape (M, N) + Input tensor. + k : int, optional + Diagonal above which to zero elements. `k = 0` (the default) is the + main diagonal, `k < 0` is below it and `k > 0` is above. + gpu : bool, optional + Allocate the tensor on GPU if True, None as default + + Returns + ------- + tril : Tensor, shape (M, N) + Lower triangle of `m`, of same shape and data-type as `m`. + + See Also + -------- + triu : same thing, only for the upper triangle + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.tril([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], -1).execute() + array([[ 0, 0, 0], + [ 4, 0, 0], + [ 7, 8, 0], + [10, 11, 12]]) + + """ + m = astensor(m) + gpu = m.op.gpu if gpu is None else gpu + op = TensorTril(k, dtype=m.dtype, sparse=m.issparse(), gpu=gpu) + return op(m) diff --git a/python/xorbits/_mars/tensor/datasource/zeros.py b/python/xorbits/_mars/tensor/datasource/zeros.py new file mode 100644 index 000000000..250dc9824 --- /dev/null +++ b/python/xorbits/_mars/tensor/datasource/zeros.py @@ -0,0 +1,231 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... import opcodes as OperandDef +from ...lib import sparse +from ...lib.sparse.core import get_array_module, get_sparse_module, naked +from ...serialization.serializables import ( + AnyField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ..array_utils import create_array +from ..utils import get_order +from .array import tensor +from .core import TensorLike, TensorNoInput + + +class TensorZeros(TensorNoInput): + _op_type_ = OperandDef.TENSOR_ZEROS + + order = StringField("order") + shape = TupleField("shape", FieldTypes.int64) + chunk_size = AnyField("chunk_size") + + def __init__(self, shape=None, **kwargs): + if type(shape) is int: + shape = (shape,) + super().__init__(shape=shape, **kwargs) + + def to_chunk_op(self, *args): + chunk_op = super().to_chunk_op(*args) + chunk_op.shape = args[0] + chunk_op.chunk_size = None + return chunk_op + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + if op.sparse: + ctx[chunk.key] = sparse.zeros(op.shape, dtype=op.dtype, gpu=op.gpu) + else: + ctx[chunk.key] = create_array(op)( + "zeros", op.shape, dtype=op.dtype, order=op.order + ) + + +def zeros(shape, dtype=None, chunk_size=None, gpu=None, sparse=False, order="C"): + """ + Return a new tensor of given shape and type, filled with zeros. + + Parameters + ---------- + shape : int or sequence of ints + Shape of the new tensor, e.g., ``(2, 3)`` or ``2``. + dtype : data-type, optional + The desired data-type for the array, e.g., `mt.int8`. Default is + `mt.float64`. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + sparse: bool, optional + Create sparse tensor if True, False as default + order : {'C', 'F'}, optional, default: 'C' + Whether to store multi-dimensional data in row-major + (C-style) or column-major (Fortran-style) order in + memory. + + Returns + ------- + out : Tensor + Tensor of zeros with the given shape, dtype, and order. + + See Also + -------- + zeros_like : Return a tensor of zeros with shape and type of input. + ones_like : Return a tensor of ones with shape and type of input. + empty_like : Return a empty tensor with shape and type of input. + ones : Return a new tensor setting values to one. + empty : Return a new uninitialized tensor. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.zeros(5).execute() + array([ 0., 0., 0., 0., 0.]) + + >>> mt.zeros((5,), dtype=int).execute() + array([0, 0, 0, 0, 0]) + + >>> mt.zeros((2, 1)).execute() + array([[ 0.], + [ 0.]]) + + >>> s = (2,2) + >>> mt.zeros(s).execute() + array([[ 0., 0.], + [ 0., 0.]]) + + >>> mt.zeros((2,), dtype=[('x', 'i4'), ('y', 'i4')]).execute() # custom dtype + array([(0, 0), (0, 0)], + dtype=[('x', '>> import mars.tensr as mt + >>> x = mt.arange(6) + >>> x = x.reshape((2, 3)) + >>> x.execute() + array([[0, 1, 2], + [3, 4, 5]]) + + >>> mt.zeros_like(x).execute() + array([[0, 0, 0], + [0, 0, 0]]) + + >>> y = mt.arange(3, dtype=float) + >>> y.execute() + array([ 0., 1., 2.]) + + >>> mt.zeros_like(y).execute() + array([ 0., 0., 0.]) + """ + a = tensor(a) + tensor_order = get_order(order, a.order) + gpu = a.op.gpu if gpu is None else gpu + op = TensorZerosLike(dtype=dtype, gpu=gpu, sparse=a.issparse(), order=order) + return op(a, order=tensor_order) diff --git a/python/xorbits/_mars/tensor/datastore/__init__.py b/python/xorbits/_mars/tensor/datastore/__init__.py new file mode 100644 index 000000000..bdf35e360 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .to_hdf5 import TensorHDF5DataStore, tohdf5 +from .to_tiledb import TensorTileDBConsolidate, TensorTileDBDataStore, totiledb +from .to_vineyard import ( + TensorVineyardDataStoreChunk, + TensorVineyardDataStoreMeta, + tovineyard, +) +from .to_zarr import TensorToZarrDataStore, tozarr diff --git a/python/xorbits/_mars/tensor/datastore/core.py b/python/xorbits/_mars/tensor/datastore/core.py new file mode 100644 index 000000000..0bee60ff7 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/core.py @@ -0,0 +1,56 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorDataStore(TensorHasInput, TensorOperandMixin): + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = inputs[0] + + def __call__(self, a, order=None): + shape = (0,) * a.ndim + order = a.order if order is None else order + return self.new_tensor([a], shape, order=order) + + @classmethod + def _get_out_chunk(cls, op, in_chunk): + chunk_op = op.copy().reset_key() + out_chunk_shape = (0,) * in_chunk.ndim + return chunk_op.new_chunk( + [in_chunk], out_chunk_shape, index=in_chunk.index, order=op.outputs[0].order + ) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + return out_chunks + + @classmethod + def tile(cls, op): + in_tensor = op.input + + out_chunks = [] + for chunk in in_tensor.chunks: + out_chunk = cls._get_out_chunk(op, chunk) + out_chunks.append(out_chunk) + out_chunks = cls._process_out_chunks(op, out_chunks) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + chunks=out_chunks, + nsplits=((0,) for _ in range(in_tensor.ndim)), + ) diff --git a/python/xorbits/_mars/tensor/datastore/tests/__init__.py b/python/xorbits/_mars/tensor/datastore/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/datastore/tests/test_datastore.py b/python/xorbits/_mars/tensor/datastore/tests/test_datastore.py new file mode 100644 index 000000000..4430f2852 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/tests/test_datastore.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile + +import numpy as np +import pytest + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + +from ....core import tile +from ... import random +from .. 
import totiledb +from ..utils import check_tiledb_array_with_tensor, get_tiledb_schema_from_tensor + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_get_tile_db_schema(): + ctx = tiledb.Ctx() + + nsplits = ((1, 2), (3, 1), (2, 2, 1)) + a = random.rand(3, 4, 5, dtype=np.float64, chunk_size=nsplits) + schema = get_tiledb_schema_from_tensor(a, ctx, nsplits) + assert schema.ndim == 3 + assert schema.shape == (3, 4, 5) + assert [schema.domain.dim(i).tile for i in range(a.ndim)] == [2, 3, 2] + assert schema.attr(0).dtype == a.dtype + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_check_tile_db(): + ctx = tiledb.Ctx() + + tempdir = tempfile.mkdtemp() + try: + np_a = np.random.rand(2, 3) + tiledb_a = tiledb.DenseArray.from_numpy(ctx=ctx, uri=tempdir, array=np_a) + + with pytest.raises(ValueError): + # ndim not match + check_tiledb_array_with_tensor(random.rand(2, 3, 4), tiledb_a) + + with pytest.raises(ValueError): + # shape not matchn + check_tiledb_array_with_tensor(random.rand(2, 4), tiledb_a) + + with pytest.raises(ValueError): + # dtype not match + check_tiledb_array_with_tensor( + random.rand(2, 3, dtype=np.float32), tiledb_a + ) + + # legal + check_tiledb_array_with_tensor(random.rand(2, 3), tiledb_a) + finally: + shutil.rmtree(tempdir) + + +@pytest.mark.skipif(tiledb is None, reason="TileDB not installed") +def test_store_tile_db(): + ctx = tiledb.Ctx() + tempdir = tempfile.mkdtemp() + try: + t = random.rand(50, 30, chunk_size=13) + t2 = t + 1 + + saved = totiledb(tempdir, t2) + assert saved.shape == (0, 0) + assert saved.op.tiledb_config is None + assert saved.op.tiledb_uri == tempdir + + with pytest.raises(tiledb.TileDBError): + tiledb.DenseArray(ctx=ctx, uri=tempdir) + + # tiledb array is created in the tile + saved = tile(saved) + + # no error + tiledb.DenseArray(ctx=ctx, uri=tempdir) + + # TileDB consolidation + assert len(saved.chunks) == 1 + + assert saved.chunks[0].inputs[0].op.axis_offsets == (0, 0) + assert saved.chunks[0].inputs[1].op.axis_offsets == (0, 13) + assert saved.chunks[0].inputs[2].op.axis_offsets == (0, 26) # input (0, 2) + assert saved.chunks[0].inputs[5].op.axis_offsets == (13, 26) # input (1, 2) + assert saved.chunks[0].inputs[11].op.axis_offsets == (39, 26) # input (3, 2) + + with pytest.raises(ValueError): + t3 = random.rand(30, 50) + totiledb(tempdir, t3, ctx=ctx) # shape incompatible + finally: + shutil.rmtree(tempdir) diff --git a/python/xorbits/_mars/tensor/datastore/tests/test_datastore_execution.py b/python/xorbits/_mars/tensor/datastore/tests/test_datastore_execution.py new file mode 100644 index 000000000..a858e9937 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/tests/test_datastore_execution.py @@ -0,0 +1,245 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
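For orientation before the execution tests, a minimal sketch of the TileDB round trip these store operands implement might look as follows. It assumes `tiledb` is installed and that a default local session is available when `execute()` is called; the temporary-directory handling is illustrative and not part of this change.

    import tempfile

    import numpy as np
    import tiledb

    import mars.tensor as mt

    # Build a chunked tensor lazily and write it into a TileDB dense array.
    # The store tensor itself has shape (0, 0); executing it performs the write.
    tempdir = tempfile.mkdtemp()
    raw = np.random.rand(8, 4)
    t = mt.tensor(raw, chunk_size=3)
    mt.totiledb(tempdir, t).execute()

    # Read the array back with TileDB to verify the round trip.
    with tiledb.DenseArray(uri=tempdir, ctx=tiledb.Ctx()) as arr:
        np.testing.assert_allclose(arr.read_direct(), raw)

The same pattern, with `totiledb` swapped for `tohdf5`, `tozarr` or `tovineyard`, covers the other storage targets exercised below.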
+ +import os +import shutil +import tempfile +import time + +import numpy as np +import pytest +import scipy.sparse as sps + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None +try: + import h5py +except ImportError: # pragma: no cover + h5py = None +try: + import zarr + from numcodecs import Blosc, Delta, Zstd +except ImportError: # pragma: no cover + zarr = None +try: + import vineyard +except ImportError: + vineyard = None + +from ... import arange, tensor, tohdf5, totiledb, tovineyard, tozarr +from ...datasource import fromvineyard + +_exec_timeout = 120 if "CI" in os.environ else -1 + + +@pytest.mark.skipif(tiledb is None, reason="tiledb not installed") +def test_store_tiledb_execution(setup): + ctx = tiledb.Ctx() + + tempdir = tempfile.mkdtemp() + try: + # store TileDB dense array + expected = np.random.rand(8, 4, 3) + a = tensor(expected, chunk_size=(3, 3, 2)) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr: + np.testing.assert_allclose(expected, arr.read_direct()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # store tensor with 1 chunk to TileDB dense array + a = arange(12) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr: + np.testing.assert_allclose(np.arange(12), arr.read_direct()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # store 2-d TileDB sparse array + expected = sps.random(8, 7, density=0.1) + a = tensor(expected, chunk_size=(3, 5)) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr: + data = arr[:, :] + coords = data["coords"] + value = data[arr.attr(0).name] + ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim)) + result = sps.coo_matrix((value, ij), shape=arr.shape) + + np.testing.assert_allclose(expected.toarray(), result.toarray()) + finally: + shutil.rmtree(tempdir) + + tempdir = tempfile.mkdtemp() + try: + # store TileDB dense array + expected = np.asfortranarray(np.random.rand(8, 4, 3)) + a = tensor(expected, chunk_size=(3, 3, 2)) + save = totiledb(tempdir, a, ctx=ctx) + save.execute() + + with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr: + np.testing.assert_allclose(expected, arr.read_direct()) + assert arr.schema.cell_order == "col-major" + finally: + shutil.rmtree(tempdir) + + +@pytest.mark.skipif(h5py is None, reason="h5py not installed") +@pytest.mark.ray_dag +def test_store_hdf5_execution(setup): + raw = np.random.RandomState(0).rand(10, 20) + + group_name = "test_group" + dataset_name = "test_dataset" + + t1 = tensor(raw, chunk_size=20) + t2 = tensor(raw, chunk_size=9) + + with pytest.raises(TypeError): + tohdf5(object(), t2) + + with tempfile.TemporaryDirectory() as d: + filename = os.path.join(d, f"test_store_{int(time.time())}.hdf5") + + # test 1 chunk + r = tohdf5(filename, t1, group=group_name, dataset=dataset_name) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + # test filename + r = tohdf5(filename, t2, group=group_name, dataset=dataset_name) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + with pytest.raises(ValueError): + tohdf5(filename, t2) + + with h5py.File(filename, "r") as f: + # test file + r = tohdf5(f, t2, 
group=group_name, dataset=dataset_name) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + with pytest.raises(ValueError): + with h5py.File(filename, "r") as f: + tohdf5(f, t2) + + with h5py.File(filename, "r") as f: + # test dataset + ds = f[f"{group_name}/{dataset_name}"] + # test file + r = tohdf5(ds, t2) + r.execute() + + with h5py.File(filename, "r") as f: + result = np.asarray(f[f"{group_name}/{dataset_name}"]) + np.testing.assert_array_equal(result, raw) + + +@pytest.mark.skipif(zarr is None, reason="zarr not installed") +def test_store_zarr_execution(setup): + raw = np.random.RandomState(0).rand(10, 20) + + group_name = "test_group" + dataset_name = "test_dataset" + + t = tensor(raw, chunk_size=6) + + with pytest.raises(TypeError): + tozarr(object(), t) + + with tempfile.TemporaryDirectory() as d: + filename = os.path.join(d, f"test_store_{int(time.time())}.zarr") + path = f"{filename}/{group_name}/{dataset_name}" + + r = tozarr( + filename, + t, + group=group_name, + dataset=dataset_name, + compressor=Zstd(level=3), + ) + r.execute() + + arr = zarr.open(path) + np.testing.assert_array_equal(arr, raw) + assert arr.compressor == Zstd(level=3) + + r = tozarr(path, t + 2) + r.execute() + + arr = zarr.open(path) + np.testing.assert_array_equal(arr, raw + 2) + + filters = [Delta(dtype="i4")] + compressor = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + arr = zarr.open(path, compressor=compressor, filters=filters) + + r = tozarr(arr, t + 1) + r.execute() + result = zarr.open_array(path) + np.testing.assert_array_equal(result, raw + 1) + + +@pytest.mark.skipif(vineyard is None, reason="vineyard not installed") +def test_vineyard_execution(setup): + raw = np.random.RandomState(0).rand(55, 55) + + extra_config = { + "check_dtype": False, + "check_nsplits": False, + "check_shape": False, + } + + with vineyard.deploy.local.start_vineyardd() as (_, vineyard_socket, _): + a = tensor(raw, chunk_size=15) + a.execute() # n.b.: pre-execute + + b = tovineyard(a, vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0] + + c = fromvineyard(object_id, vineyard_socket=vineyard_socket) + value = c.execute(extra_config=extra_config).fetch() + np.testing.assert_allclose(value, raw) + + a = tensor(raw, chunk_size=15) # n.b.: no pre-execute + + b = tovineyard(a, vineyard_socket=vineyard_socket) + object_id = b.execute(extra_config=extra_config).fetch()[0] + + c = fromvineyard(object_id, vineyard_socket=vineyard_socket) + value = c.execute(extra_config=extra_config).fetch() + np.testing.assert_allclose(value, raw) diff --git a/python/xorbits/_mars/tensor/datastore/to_hdf5.py b/python/xorbits/_mars/tensor/datastore/to_hdf5.py new file mode 100644 index 000000000..84a295d12 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/to_hdf5.py @@ -0,0 +1,254 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
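The `to_hdf5` module that follows implements the `tohdf5` entry point exercised by the tests above. A rough, self-contained sketch of the intended round trip is shown here; it assumes `h5py` is installed and a default local session, and the file name and group/dataset names are illustrative only.

    import os
    import tempfile

    import h5py
    import numpy as np

    import mars.tensor as mt

    raw = np.random.rand(10, 20)
    t = mt.tensor(raw, chunk_size=9)

    with tempfile.TemporaryDirectory() as d:
        filename = os.path.join(d, "example.hdf5")
        # Write the chunked tensor into /test_group/test_dataset of the HDF5 file;
        # the result is a zero-shaped placeholder tensor until executed.
        mt.tohdf5(filename, t, group="test_group", dataset="test_dataset").execute()

        with h5py.File(filename, "r") as f:
            result = np.asarray(f["test_group/test_dataset"])
            np.testing.assert_array_equal(result, raw)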
+ +import threading +import time +from typing import List + +import numpy as np + +from ... import opcodes as OperandDef +from ...core.context import get_context +from ...lib.filesystem import open_file +from ...oscar import ActorNotExist +from ...serialization.serializables import ( + DictField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ...utils import has_unknown_shape +from ..datasource import tensor as astensor +from .core import TensorDataStore + + +class _HDF5Container: + def __init__(self, all_chunk_op_keys: List[str]): + self._all_chunk_op_keys = set(all_chunk_op_keys) + self._done_chunk_op_keys = set() + self._lock = threading.Lock() + + def acquire(self): + return self._lock.acquire() + + def release(self): + return self._lock.release() + + def mark_done(self, op_key: str): + self._done_chunk_op_keys.add(op_key) + + def is_done(self): + return self._done_chunk_op_keys == self._all_chunk_op_keys + + +class TensorHDF5DataStore(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_HDF5 + + _input = KeyField("input") + _filename = StringField("filename") + _group = StringField("group") + _dataset = StringField("dataset") + _dataset_kwds = DictField("dataset_kwds", key_type=FieldTypes.string) + _axis_offsets = TupleField("axis_offsets", FieldTypes.int32) + _out_shape = TupleField("out_shape", FieldTypes.int32) + _container_name = StringField("container_name") + + def __init__( + self, + filename=None, + group=None, + dataset=None, + dataset_kwds=None, + container_name=None, + **kw, + ): + super().__init__( + _filename=filename, + _group=group, + _dataset=dataset, + _dataset_kwds=dataset_kwds, + _container_name=container_name, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def filename(self): + return self._filename + + @property + def group(self): + return self._group + + @property + def dataset(self): + return self._dataset + + @property + def dataset_kwds(self): + return self._dataset_kwds + + @property + def axis_offsets(self): + return self._axis_offsets + + @property + def out_shape(self): + return self._out_shape + + @property + def container_name(self): + return self._container_name + + @property + def path(self): + paths = [] + if self._group is not None: + paths.append(self.group) + paths.append(self.dataset) + return "/".join(paths) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + in_tensor = op.input + + with open_file(op.filename, "w"): + # create file if not exist + pass + + nsplits = tuple([(0,) * len(ns) for ns in in_tensor.nsplits]) + if len(in_tensor.chunks) == 1: + in_chunk = in_tensor.chunks[0] + chunk_op = op.copy().reset_key() + chunk_op._axis_offsets = (0,) * in_chunk.ndim + chunk_op._out_shape = in_tensor.shape + out_chunk = chunk_op.new_chunk( + [in_chunk], shape=(0,) * in_chunk.ndim, index=in_chunk.index + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=(0,) * in_tensor.ndim, + nsplits=nsplits, + chunks=[out_chunk], + ) + + container_name = f"{op.key}_{int(time.time() * 1000)}" + + out_chunks = [] + acc = [[0] + np.cumsum(ns).tolist() for ns in in_tensor.nsplits] + chunk_op_keys = [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_op._out_shape = in_tensor.shape + chunk_op._container_name = container_name + chunk_op._axis_offsets = tuple( + acc[ax][i] for ax, i in enumerate(chunk.index) + ) + out_chunk = 
chunk_op.new_chunk(
+                [chunk], shape=(0,) * chunk.ndim, index=chunk.index
+            )
+            out_chunks.append(out_chunk)
+            chunk_op_keys.append(out_chunk.op.key)
+
+        ctx = get_context()
+        ctx.create_remote_object(container_name, _HDF5Container, chunk_op_keys)
+
+        new_op = op.copy()
+        return new_op.new_tensors(
+            op.inputs, shape=(0,) * in_tensor.ndim, nsplits=nsplits, chunks=out_chunks
+        )
+
+    @classmethod
+    def execute(cls, ctx, op: "TensorHDF5DataStore"):
+        import h5py
+
+        to_store = ctx[op.inputs[0].key]
+        axis_offsets = op.axis_offsets
+
+        container_name = op.container_name
+        container: _HDF5Container = None
+        if container_name:
+            container = ctx.get_remote_object(container_name)
+            container.acquire()
+        try:
+            with h5py.File(open_file(op.filename, mode="r+b"), mode="r+") as f:
+                try:
+                    ds = f[op.path]
+                except KeyError:
+                    ds = f.create_dataset(
+                        op.path,
+                        shape=op.out_shape,
+                        dtype=to_store.dtype,
+                        **op.dataset_kwds,
+                    )
+                ds[
+                    tuple(
+                        slice(offset, offset + size)
+                        for offset, size in zip(axis_offsets, to_store.shape)
+                    )
+                ] = to_store
+                ctx[op.outputs[0].key] = np.empty(
+                    (0,) * to_store.ndim, dtype=to_store.dtype
+                )
+                if container:
+                    container.mark_done(op.key)
+        finally:
+            if container:
+                try:
+                    container.release()
+                    if container.is_done():
+                        ctx.destroy_remote_object(container_name)
+                except ActorNotExist:
+                    # destroyed by other execution, just ignore
+                    return
+
+
+def tohdf5(hdf5_file, x, group=None, dataset=None, **kwds):
+    import h5py
+
+    x = astensor(x)
+    if isinstance(hdf5_file, h5py.Dataset):
+        filename = hdf5_file.file.filename
+        group = hdf5_file.parent.name
+        dataset = hdf5_file.name.rsplit("/", 1)[1]
+    elif isinstance(hdf5_file, h5py.File):
+        filename = hdf5_file.filename
+        if dataset is None:
+            raise ValueError("`dataset` should be provided")
+    elif isinstance(hdf5_file, str):
+        filename = hdf5_file
+        if dataset is None:
+            raise ValueError("`dataset` should be provided")
+    else:
+        raise TypeError(
+            "`hdf5_file` passed has wrong type, "
+            "expect str, h5py.File or h5py.Dataset, "
+            f"got {type(hdf5_file)}"
+        )
+
+    op = TensorHDF5DataStore(
+        filename=filename, group=group, dataset=dataset, dataset_kwds=kwds
+    )
+    return op(x)
diff --git a/python/xorbits/_mars/tensor/datastore/to_tiledb.py b/python/xorbits/_mars/tensor/datastore/to_tiledb.py
new file mode 100644
index 000000000..48683ef4f
--- /dev/null
+++ b/python/xorbits/_mars/tensor/datastore/to_tiledb.py
@@ -0,0 +1,263 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+try:
+    import tiledb
+except (ImportError, OSError):  # pragma: no cover
+    tiledb = None
+
+from ...
import opcodes as OperandDef +from ...lib.sparse import SparseNDArray +from ...lib.sparse.core import sps +from ...serialization.serializables import ( + DictField, + FieldTypes, + Int64Field, + KeyField, + StringField, + TupleField, +) +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import get_tiledb_ctx +from .core import TensorDataStore +from .utils import check_tiledb_array_with_tensor, get_tiledb_schema_from_tensor + + +class TensorTileDBDataStore(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_TILEDB + + _input = KeyField("input") + _tiledb_config = DictField("tiledb_config") + # URI of array to write + _tiledb_uri = StringField("tiledb_uri") + # encryption key to decrypt if provided + _tiledb_key = StringField("tiledb_key") + # open array at a given timestamp if provided + _tiledb_timestamp = Int64Field("tiledb_timestamp") + _axis_offsets = TupleField("axis_offsets", FieldTypes.int64) + + def __init__( + self, + tiledb_config=None, + tiledb_uri=None, + tiledb_key=None, + tiledb_timestamp=None, + **kw, + ): + super().__init__( + _tiledb_config=tiledb_config, + _tiledb_uri=tiledb_uri, + _tiledb_key=tiledb_key, + _tiledb_timestamp=tiledb_timestamp, + **kw, + ) + + @property + def tiledb_config(self): + return self._tiledb_config + + @property + def tiledb_uri(self): + return self._tiledb_uri + + @property + def tiledb_key(self): + return self._tiledb_key + + @property + def tiledb_timestamp(self): + return self._tiledb_timestamp + + @property + def axis_offsets(self): + return self._axis_offsets + + @classmethod + def _get_out_chunk(cls, op, in_chunk): + chunk_op = op.copy().reset_key() + nsplits = op.input.nsplits + axis_offsets = [] + for axis, idx in enumerate(in_chunk.index): + axis_offsets.append(sum(nsplits[axis][:idx])) + chunk_op._axis_offsets = tuple(axis_offsets) + out_chunk_shape = (0,) * in_chunk.ndim + return chunk_op.new_chunk( + [in_chunk], shape=out_chunk_shape, index=in_chunk.index + ) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + if len(out_chunks) == 1: + return out_chunks + + consolidate_op = TensorTileDBConsolidate( + tiledb_config=op.tiledb_config, + tiledb_uri=op.tiledb_uri, + tiledb_key=op.tiledb_key, + sparse=op.sparse, + dtype=op.dtype, + ) + return consolidate_op.new_chunks( + out_chunks, shape=out_chunks[0].shape, index=(0,) * out_chunks[0].ndim + ) + + @classmethod + def tile(cls, op): + import tiledb + + tensor = super().tile(op)[0] + + ctx = tiledb.Ctx(op.tiledb_config) + tiledb_array_type = ( + tiledb.SparseArray if tensor.issparse() else tiledb.DenseArray + ) + try: + tiledb_array_type( + uri=op.tiledb_uri, + key=op.tiledb_key, + timestamp=op.tiledb_timestamp, + ctx=ctx, + ) + except tiledb.TileDBError: + # not exist, try to create TileDB Array by given uri + tiledb_array_schema = get_tiledb_schema_from_tensor( + op.input, ctx, op.input.nsplits + ) + tiledb_array_type.create( + op.tiledb_uri, tiledb_array_schema, key=op.tiledb_key + ) + + return [tensor] + + @classmethod + def execute(cls, ctx, op): + tiledb_ctx = get_tiledb_ctx(op.tiledb_config) + uri = op.tiledb_uri + key = op.tiledb_key + timestamp = op.tiledb_timestamp + axis_offsets = op.axis_offsets + + chunk = op.outputs[0] + if not chunk.issparse(): + # dense + to_store = np.ascontiguousarray(ctx[op.input.key]) + slcs = [] + for axis in range(chunk.ndim): + axis_offset = int(axis_offsets[axis]) + axis_length = int(op.input.shape[axis]) + slcs.append(slice(axis_offset, axis_offset + axis_length)) + 
with tiledb.DenseArray( + uri=uri, ctx=tiledb_ctx, mode="w", key=key, timestamp=timestamp + ) as arr: + arr[tuple(slcs)] = to_store + ctx[chunk.key] = np.empty((0,) * chunk.ndim, dtype=chunk.dtype) + else: + # sparse + to_store = ctx[op.input.key].spmatrix.tocoo() + if to_store.nnz > 0: + with tiledb.SparseArray( + uri=uri, ctx=tiledb_ctx, mode="w", key=key, timestamp=timestamp + ) as arr: + if chunk.ndim == 1: + vec = to_store.col if to_store.shape[0] == 1 else to_store.row + vec += axis_offsets[0] + arr[vec] = to_store.data + else: + i, j = ( + to_store.row + axis_offsets[0], + to_store.col + axis_offsets[1], + ) + arr[i, j] = to_store.data + ctx[chunk.key] = SparseNDArray( + sps.csr_matrix((0, 0), dtype=chunk.dtype), shape=chunk.shape + ) + + +class TensorTileDBConsolidate(TensorOperandMixin, TensorOperand): + _op_type_ = OperandDef.TENSOR_STORE_TILEDB_CONSOLIDATE + + _tiledb_config = DictField("tiledb_config") + # URI of array to write + _tiledb_uri = StringField("tiledb_uri") + # encryption key to decrypt if provided + _tiledb_key = StringField("tiledb_key") + + def __init__(self, tiledb_config=None, tiledb_uri=None, tiledb_key=None, **kw): + super().__init__( + _tiledb_config=tiledb_config, + _tiledb_uri=tiledb_uri, + _tiledb_key=tiledb_key, + **kw, + ) + + def calc_shape(self, *inputs_shape): + return self.outputs[0].shape + + @property + def tiledb_config(self): + return self._tiledb_config + + @property + def tiledb_uri(self): + return self._tiledb_uri + + @property + def tiledb_key(self): + return self._tiledb_key + + @classmethod + def tile(cls, op): + raise TypeError(f"{cls.__name__} is a chunk op, cannot be tiled") + + @classmethod + def execute(cls, ctx, op): + tiledb_config = tiledb.Config(op.tiledb_config) + uri = op.tiledb_uri + key = op.tiledb_key + + tiledb.consolidate(config=tiledb_config, uri=uri, key=key) + ctx[op.outputs[0].key] = ctx[op.inputs[0].key] + + +def totiledb(uri, x, ctx=None, key=None, timestamp=None): + import tiledb + + x = astensor(x) + raw_ctx = ctx + if raw_ctx is None: + ctx = tiledb.Ctx() + + tiledb_array_type = tiledb.SparseArray if x.issparse() else tiledb.DenseArray + try: + tiledb_array = tiledb_array_type(uri=uri, key=key, timestamp=timestamp, ctx=ctx) + # if already created, we will check the shape and dtype + check_tiledb_array_with_tensor(x, tiledb_array) + except tiledb.TileDBError: + # not exist, as we don't know the tile, + # we will create the tiledb array in the tile of tensor + pass + + tiledb_config = None if raw_ctx is None else raw_ctx.config().dict() + op = TensorTileDBDataStore( + tiledb_config=tiledb_config, + tiledb_uri=uri, + tiledb_key=key, + tiledb_timestamp=timestamp, + dtype=x.dtype, + sparse=x.issparse(), + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/datastore/to_vineyard.py b/python/xorbits/_mars/tensor/datastore/to_vineyard.py new file mode 100644 index 000000000..fd771e40b --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/to_vineyard.py @@ -0,0 +1,179 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import numpy as np + +from ... import opcodes as OperandDef +from ...core.operand.base import SchedulingHint +from ...serialization.serializables import FieldTypes, KeyField, StringField, TupleField +from ...storage.base import StorageLevel +from ...utils import lazy_import +from ..datasource import tensor as astensor +from .core import TensorDataStore + +vineyard = lazy_import("vineyard") +vy_data_tensor = lazy_import("vineyard.data.tensor", rename="vy_data_tensor") +vy_data_utils = lazy_import("vineyard.data.utils", rename="vy_data_utils") + + +def resolve_vineyard_socket(ctx, op) -> Tuple[str, bool]: + storage_backend = ctx.get_storage_info(level=StorageLevel.MEMORY) + if storage_backend.get("name", None) == "vineyard": # pragma: no cover + if ( + op.vineyard_socket is not None + and op.vineyard_socket != storage_backend["socket"] + ): + return op.vineyard_socket, True + else: + return storage_backend["socket"], False + else: + return op.vineyard_socket, True + + +class TensorVineyardDataStoreChunk(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_VINEYARD_CHUNK + + _input = KeyField("input") + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + # a dummy attr to make sure ops have different keys + operator_index = TupleField("operator_index", FieldTypes.int32) + + def __init__(self, vineyard_socket=None, **kw): + super().__init__(vineyard_socket=vineyard_socket, **kw) + + @classmethod + def _process_out_chunks(cls, op, out_chunks): + merge_op = TensorVineyardDataStoreMeta( + vineyard_socket=op.vineyard_socket, sparse=op.sparse, dtype=np.dtype("O") + ) + return merge_op.new_chunks( + out_chunks, shape=(1,), dtype=np.dtype("O"), index=(0,) * out_chunks[0].ndim + ) + + @classmethod + def tile(cls, op): + out_chunks = [] + scheduling_hint = SchedulingHint(fuseable=False) + for idx, chunk in enumerate(op.inputs[0].chunks): + chunk_op = op.copy().reset_key() + chunk_op.scheduling_hint = scheduling_hint + chunk_op.operator_index = chunk.index + out_chunk = chunk_op.new_chunk( + [chunk], dtype=np.dtype("O"), shape=(1,), index=(idx,) + ) + out_chunks.append(out_chunk) + out_chunks = cls._process_out_chunks(op, out_chunks) + + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, + shape=(len(out_chunks),), + dtype=np.dtype("O"), + chunks=out_chunks, + nsplits=((1,),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + socket, needs_put = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # some op might be fused and executed twice on different workers + if not needs_put: + # might be fused + try: # pragma: no cover + meta = ctx.get_chunks_meta([op.inputs[0].key])[0] + tensor_id = vineyard.ObjectID(meta["object_ref"]) + if not client.exists(tensor_id): + needs_put = True + except KeyError: + needs_put = True + if needs_put: + tensor_id = client.put( + np.asarray(ctx[op.inputs[0].key]), partition_index=op.inputs[0].index + ) + else: # pragma: no cover + meta = client.get_meta(tensor_id) + new_meta = vineyard.ObjectMeta() + for k, v in meta.items(): + if k not in ["id", "signature", "instance_id"]: + if isinstance(v, vineyard.ObjectMeta): + new_meta.add_member(k, v) + else: + new_meta[k] = v + new_meta["partition_index_"] = vy_data_utils.to_json(op.inputs[0].index) + tensor_id = client.create_metadata(new_meta).id + + 
client.persist(tensor_id) + holder = np.empty((1,), dtype=object) + holder[0] = tensor_id + ctx[op.outputs[0].key] = holder + + +class TensorVineyardDataStoreMeta(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_VINEYARD_META + + _input = KeyField("input") + + # vineyard ipc socket + vineyard_socket = StringField("vineyard_socket") + + def __init__(self, vineyard_socket=None, dtype=None, sparse=None, **kw): + super().__init__( + vineyard_socket=vineyard_socket, dtype=dtype, sparse=sparse, **kw + ) + + @classmethod + def tile(cls, op): + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk( + op.inputs[0].chunks, dtype=np.dtype("O"), shape=(1,), index=(0,) + ) + new_op = op.copy().reset_key() + return new_op.new_tensors( + op.inputs, + shape=(1,), + dtype=np.dtype("O"), + chunks=[out_chunk], + nsplits=((1,),), + ) + + @classmethod + def execute(cls, ctx, op): + if vineyard is None: + raise RuntimeError("vineyard is not available") + + socket, _ = resolve_vineyard_socket(ctx, op) + client = vineyard.connect(socket) + + # # store the result object id to execution context + chunks = [ctx[chunk.key][0] for chunk in op.inputs] + holder = np.empty((1,), dtype=object) + holder[0] = vy_data_tensor.make_global_tensor(client, chunks).id + ctx[op.outputs[0].key] = holder + + +def tovineyard(x, vineyard_socket=None): + x = astensor(x) + op = TensorVineyardDataStoreChunk( + vineyard_socket=vineyard_socket, dtype=x.dtype, sparse=x.issparse() + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/datastore/to_zarr.py b/python/xorbits/_mars/tensor/datastore/to_zarr.py new file mode 100644 index 000000000..91aae4e3f --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/to_zarr.py @@ -0,0 +1,212 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle +from typing import Dict + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...lib.filesystem import FSMap, get_fs +from ...serialization.serializables import ( + BytesField, + FieldTypes, + KeyField, + StringField, + TupleField, +) +from ...utils import has_unknown_shape +from .core import TensorDataStore + + +class ZarrOptions(object): + def __init__(self, options: Dict): + self._options = options + + def todict(self): + return self._options + + @staticmethod + def _stringfy(v): + return pickle.dumps(v) if not isinstance(v, str) else v + + def __mars_tokenize__(self): + return ( + list(self._options.keys()), + list(self._stringfy(v) for v in self._options.values()), + ) + + def __getstate__(self): + return self._options + + def __setstate__(self, state): + self._options = state + + +class TensorToZarrDataStore(TensorDataStore): + _op_type_ = OperandDef.TENSOR_STORE_ZARR + + _input = KeyField("input") + _path = StringField("path") + _group = StringField("group") + _dataset = StringField("dataset") + _zarr_options = BytesField( + "zarr_options", on_serialize=pickle.dumps, on_deserialize=pickle.loads + ) + _axis_offsets = TupleField("axis_offsets", FieldTypes.int32) + + def __init__( + self, + path=None, + group=None, + dataset=None, + zarr_options=None, + axis_offsets=None, + **kw, + ): + super().__init__( + _path=path, + _group=group, + _dataset=dataset, + _zarr_options=zarr_options, + _axis_offsets=axis_offsets, + **kw, + ) + + @property + def path(self): + return self._path + + @property + def group(self): + return self._group + + @property + def dataset(self): + return self._dataset + + @property + def zarr_options(self): + return self._zarr_options + + @property + def axis_offsets(self): + return self._axis_offsets + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + import zarr + + if has_unknown_shape(*op.inputs): + yield + in_tensor = op.input + + # create dataset + fs = get_fs(op.path, None) + path = op.path + if op.group is not None: + path += "/" + op.group + fs_map = FSMap(path, fs) + zarr.open( + fs_map, + "w", + path=op.dataset, + dtype=in_tensor.dtype, + shape=in_tensor.shape, + chunks=tuple(max(ns) for ns in in_tensor.nsplits), + **op.zarr_options.todict(), + ) + + cum_nsplits = [[0] + np.cumsum(ns).tolist() for ns in in_tensor.nsplits] + out_chunks = [] + for chunk in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_op._axis_offsets = tuple( + cs[i] for i, cs in zip(chunk.index, cum_nsplits) + ) + out_chunks.append( + chunk_op.new_chunk([chunk], shape=(0,) * chunk.ndim, index=chunk.index) + ) + + new_op = op.copy() + out = op.outputs[0] + nsplits = tuple((0,) * len(ns) for ns in in_tensor.nsplits) + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + nsplits=nsplits, + chunks=out_chunks, + ) + + @classmethod + def execute(cls, ctx, op): + import zarr + + fs = get_fs(op.path, None) + fs_map = FSMap(op.path, fs) + + group = zarr.Group(store=fs_map, path=op.group) + array = group[op.dataset] + + to_store = ctx[op.inputs[0].key] + axis_offsets = op.axis_offsets + shape = to_store.shape + + array[ + tuple( + slice(offset, offset + size) + for offset, size in zip(axis_offsets, shape) + ) + ] = to_store + + ctx[op.outputs[0].key] = np.empty((0,) * to_store.ndim, dtype=to_store.dtype) + + +def tozarr(path, x, group=None, dataset=None, **zarr_options): + import zarr + + if isinstance(path, zarr.Array): + arr = path + if isinstance(arr.store, FSMap): + root = arr.store.root + path, dataset = 
root.rsplit("/", 1) + else: + path = arr.store.path + if "/" in arr.path and group is None: + group = arr.path.rsplit("/", 1)[0] + dataset = arr.basename + if not dataset: + path, dataset = path.rsplit("/", 1) + for attr in ["compressor", "filters"]: + if getattr(arr, attr): + zarr_options[attr] = getattr(arr, attr) + elif isinstance(path, str): + if dataset is None: + path, dataset = path.rsplit("/", 1) + else: + raise TypeError( + "`path` passed has wrong type, " + "expect str, or zarr.Array" + f"got {type(path)}" + ) + + op = TensorToZarrDataStore( + path=path, group=group, dataset=dataset, zarr_options=ZarrOptions(zarr_options) + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/datastore/utils.py b/python/xorbits/_mars/tensor/datastore/utils.py new file mode 100644 index 000000000..91db50a33 --- /dev/null +++ b/python/xorbits/_mars/tensor/datastore/utils.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + + +def get_tiledb_schema_from_tensor(tensor, tiledb_ctx, nsplits, **kw): + from ..core import TensorOrder + + ctx = tiledb_ctx + + dims = [] + for d in range(tensor.ndim): + extent = tensor.shape[d] + domain = (0, extent - 1) + tile = max(nsplits[d]) + dims.append( + tiledb.Dim(name="", domain=domain, tile=tile, dtype=np.int64, ctx=ctx) + ) + dom = tiledb.Domain(*dims, ctx=ctx) + att = tiledb.Attr(ctx=ctx, dtype=tensor.dtype) + cell_order = "C" if tensor.order == TensorOrder.C_ORDER else "F" + return tiledb.ArraySchema( + ctx=ctx, + domain=dom, + attrs=(att,), + sparse=tensor.issparse(), + cell_order=cell_order, + **kw, + ) + + +def check_tiledb_array_with_tensor(tensor, tiledb_array): + if tensor.ndim != tiledb_array.ndim: + # ndim + raise ValueError( + "ndim of TileDB Array to store is different to tensor, " + f"expect {tensor.ndim}, got {tiledb_array.ndim}" + ) + if tensor.shape != tiledb_array.shape: + # shape + raise ValueError( + "shape of TileDB Array to store is different to tensor, " + f"expect {tensor.shape}, got {tiledb_array.shape}" + ) + if tensor.dtype != tiledb_array.attr(0).dtype: + # dtype + raise ValueError( + "dtype of TileDB Array to store is different to tensor, " + f"expect {tensor.dtype}, got {tiledb_array.domain.dtype}" + ) diff --git a/python/xorbits/_mars/tensor/einsum/__init__.py b/python/xorbits/_mars/tensor/einsum/__init__.py new file mode 100644 index 000000000..f21419396 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import einsum diff --git a/python/xorbits/_mars/tensor/einsum/core.py b/python/xorbits/_mars/tensor/einsum/core.py new file mode 100644 index 000000000..924743c47 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/core.py @@ -0,0 +1,486 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections import defaultdict + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import AnyField, StringField +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_unify_split +from .einsumfunc import einsum_path, parse_einsum_input + + +class TensorEinsum(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.EINSUM + + _subscripts = StringField("subscripts") + _optimize = AnyField("optimize") + _order = StringField("order") + _casting = StringField("casting") + + def __init__(self, subscripts=None, optimize=None, order=None, casting=None, **kw): + super().__init__( + _subscripts=subscripts, + _optimize=optimize, + _order=order, + _casting=casting, + **kw + ) + + @property + def subscripts(self): + return self._subscripts + + @property + def optimize(self): + return self._optimize + + @property + def order(self): + return self._order + + @property + def casting(self): + return self._casting + + def __call__(self, input_tensors, shape): + if self.order in "KA": + if any(t.order == TensorOrder.C_ORDER for t in input_tensors): + order = TensorOrder.C_ORDER + else: + order = TensorOrder.F_ORDER + else: + if self.order == "C": + order = TensorOrder.C_ORDER + else: + order = TensorOrder.F_ORDER + return self.new_tensor( + input_tensors, shape=shape, dtype=self.dtype, order=order + ) + + @classmethod + def tile(cls, op): + out_tensor = op.outputs[0] + input_scripts, output_scripts = op.subscripts.split("->") + tensor_axes = list(zip(op.inputs, input_scripts.split(","))) + + # rechunk to unify nsplits + input_nsplits = defaultdict(list) + for t, axes in tensor_axes: + for splits, ax in zip(t.nsplits, axes): + input_nsplits[ax].append(splits) + input_tensors = [] + for t, axes in tensor_axes: + new_nsplits = tuple( + decide_unify_split(*input_nsplits[ax]) + if t.shape[j] > 1 + else (t.shape[j],) + for j, ax in enumerate(axes) + ) + input_tensors.append((yield from recursive_tile(t.rechunk(new_nsplits)))) + + tensor_indexes = dict() + output_axes = defaultdict(list) + axes_splits = 
dict() + tensor_contract_axes = [] + for i, (t, axes) in enumerate(zip(input_tensors, input_scripts.split(","))): + for j in range(t.ndim): + if axes[j] in output_scripts: + # Record the output tensor's axes and its nsplit. + tensor_indexes[axes[j]] = range(len(t.nsplits[j])) + output_axes[axes[j]].append((i, j)) + axis_splits = dict((axes[j], t.nsplits[j]) for j in range(t.ndim)) + axes_splits.update(axis_splits) + tensor_contract_axes.append([ax for ax in axes if ax not in output_scripts]) + + out_chunks = [] + output_indexes = [tensor_indexes[ax] for ax in output_scripts] + for out_idx in itertools.product(*output_indexes): + all_indexes = [[None] * t.ndim for t in input_tensors] + tensor_shape = [] + for i, idx in enumerate(out_idx): + tensor_shape.append(axes_splits[output_scripts[i]][idx]) + for t_idx, axis in output_axes[output_scripts[i]]: + if input_tensors[t_idx].shape[axis] == 1: + all_indexes[t_idx][axis] = 0 + else: + all_indexes[t_idx][axis] = idx + tensor_shape = tuple(tensor_shape) + einsum_chunks = [] + contract_axes = [ + s + for s in set(input_scripts.replace(",", "")) + if s not in output_scripts + ] + for contract_indexes in itertools.product( + *[range(len(axes_splits[ax])) for ax in contract_axes] + ): + for j, t_contract_axes in enumerate(tensor_contract_axes): + for axis in t_contract_axes: + axis_index = tensor_axes[j][1].index(axis) + all_indexes[j][axis_index] = contract_indexes[ + contract_axes.index(axis) + ] + einsum_op = op.copy().reset_key() + in_chunks = [ + t.cix[tuple(indices)] + for t, indices in zip(input_tensors, all_indexes) + ] + chunk = einsum_op.new_chunk( + in_chunks, shape=tensor_shape, order=out_tensor.order + ) + einsum_chunks.append(chunk) + + if len(einsum_chunks) == 1: + c = einsum_chunks[0] + chunk_op = c.op.copy() + chunk = chunk_op.new_chunk( + c.inputs, shape=c.shape, index=out_idx, order=out_tensor.order + ) + else: + chunk = chunk_tree_add( + op.dtype, einsum_chunks, out_idx, tensor_shape, sparse=op.sparse + ) + out_chunks.append(chunk) + + nsplits = [axes_splits[ax] for ax in output_scripts] + new_op = op.copy() + return new_op.new_tensors( + input_tensors, out_tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np: + ctx[op.outputs[0].key] = xp.einsum( + op.subscripts, + *inputs, + optimize=op.optimize, + dtype=op.dtype, + order=op.order, + casting=op.casting + ) + else: + # Cupy doesn't support `optimize`, `order` and `casting`. + ctx[op.outputs[0].key] = xp.einsum( + op.subscripts, *inputs, dtype=op.dtype + ) + + +def einsum( + subscripts, *operands, dtype=None, order="K", casting="safe", optimize=False +): + """ + Evaluates the Einstein summation convention on the operands. + + Using the Einstein summation convention, many common multi-dimensional, + linear algebraic array operations can be represented in a simple fashion. + In *implicit* mode `einsum` computes these values. + + In *explicit* mode, `einsum` provides further flexibility to compute + other array operations that might not be considered classical Einstein + summation operations, by disabling, or forcing summation over specified + subscript labels. + + See the notes and examples for clarification. + + Parameters + ---------- + subscripts : str + Specifies the subscripts for summation as comma separated list of + subscript labels. 
An implicit (classical Einstein summation) + calculation is performed unless the explicit indicator '->' is + included as well as subscript labels of the precise output form. + operands : list of array_like + These are the arrays for the operation. + dtype : {data-type, None}, optional + If provided, forces the calculation to use the data type specified. + Note that you may have to also give a more liberal `casting` + parameter to allow the conversions. Default is None. + order : {'C', 'F', 'A', 'K'}, optional + Controls the memory layout of the output. 'C' means it should + be C contiguous. 'F' means it should be Fortran contiguous, + 'A' means it should be 'F' if the inputs are all 'F', 'C' otherwise. + 'K' means it should be as close to the layout as the inputs as + is possible, including arbitrarily permuted axes. + Default is 'K'. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Setting this to + 'unsafe' is not recommended, as it can adversely affect accumulations. + + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. + + Default is 'safe'. + optimize : {False, True, 'greedy', 'optimal'}, optional + Controls if intermediate optimization should occur. No optimization + will occur if False and True will default to the 'greedy' algorithm. + Also accepts an explicit contraction list from the ``np.einsum_path`` + function. See ``np.einsum_path`` for more details. Defaults to False. + + Returns + ------- + output : Mars.tensor + The calculation based on the Einstein summation convention. + + The Einstein summation convention can be used to compute + many multi-dimensional, linear algebraic array operations. `einsum` + provides a succinct way of representing these. + + A non-exhaustive list of these operations, + which can be computed by `einsum`, is shown below along with examples: + + * Trace of an array, :py:func:`numpy.trace`. + * Return a diagonal, :py:func:`numpy.diag`. + * Array axis summations, :py:func:`numpy.sum`. + * Transpositions and permutations, :py:func:`numpy.transpose`. + * Matrix multiplication and dot product, :py:func:`numpy.matmul` :py:func:`numpy.dot`. + * Vector inner and outer products, :py:func:`numpy.inner` :py:func:`numpy.outer`. + * Broadcasting, element-wise and scalar multiplication, :py:func:`numpy.multiply`. + * Tensor contractions, :py:func:`numpy.tensordot`. + * Chained array operations, in efficient calculation order, :py:func:`numpy.einsum_path`. + + The subscripts string is a comma-separated list of subscript labels, + where each label refers to a dimension of the corresponding operand. + Whenever a label is repeated it is summed, so ``mt.einsum('i,i', a, b)`` + is equivalent to :py:func:`mt.inner(a,b) `. If a label + appears only once, it is not summed, so ``mt.einsum('i', a)`` produces a + view of ``a`` with no changes. A further example ``mt.einsum('ij,jk', a, b)`` + describes traditional matrix multiplication and is equivalent to + :py:func:`mt.matmul(a,b) `. + + In *implicit mode*, the chosen subscripts are important + since the axes of the output are reordered alphabetically. This + means that ``mt.einsum('ij', a)`` doesn't affect a 2D array, while + ``mt.einsum('ji', a)`` takes its transpose. 
Additionally, + ``mt.einsum('ij,jk', a, b)`` returns a matrix multiplication, while, + ``mt.einsum('ij,jh', a, b)`` returns the transpose of the + multiplication since subscript 'h' precedes subscript 'i'. + + In *explicit mode* the output can be directly controlled by + specifying output subscript labels. This requires the + identifier '->' as well as the list of output subscript labels. + This feature increases the flexibility of the function since + summing can be disabled or forced when required. The call + ``mt.einsum('i->', a)`` is like :py:func:`mt.sum(a, axis=-1) `, + and ``mt.einsum('ii->i', a)`` is like :py:func:`mt.diag(a) `. + The difference is that `einsum` does not allow broadcasting by default. + Additionally ``mt.einsum('ij,jh->ih', a, b)`` directly specifies the + order of the output subscript labels and therefore returns matrix + multiplication, unlike the example above in implicit mode. + + To enable and control broadcasting, use an ellipsis. Default + NumPy-style broadcasting is done by adding an ellipsis + to the left of each term, like ``mt.einsum('...ii->...i', a)``. + To take the trace along the first and last axes, + you can do ``mt.einsum('i...i', a)``, or to do a matrix-matrix + product with the left-most indices instead of rightmost, one can do + ``mt.einsum('ij...,jk...->ik...', a, b)``. + + When there is only one operand, no axes are summed, and no output + parameter is provided, a view into the operand is returned instead + of a new array. Thus, taking the diagonal as ``mt.einsum('ii->i', a)`` + produces a view (changed in version 1.10.0). + + `einsum` also provides an alternative way to provide the subscripts + and operands as ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. + If the output shape is not provided in this format `einsum` will be + calculated in implicit mode, otherwise it will be performed explicitly. + The examples below have corresponding `einsum` calls with the two + parameter methods. 
+ + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.arange(25).reshape(5,5) + >>> b = mt.arange(5) + >>> c = mt.arange(6).reshape(2,3) + Trace of a matrix: + >>> mt.einsum('ii', a).execute() + 60 + >>> mt.einsum(a, [0,0]).execute() + 60 + Extract the diagonal (requires explicit form): + >>> mt.einsum('ii->i', a).execute() + array([ 0, 6, 12, 18, 24]) + >>> mt.einsum(a, [0,0], [0]).execute() + array([ 0, 6, 12, 18, 24]) + >>> mt.diag(a).execute() + array([ 0, 6, 12, 18, 24]) + Sum over an axis (requires explicit form): + >>> mt.einsum('ij->i', a).execute() + array([ 10, 35, 60, 85, 110]) + >>> mt.einsum(a, [0,1], [0]).execute() + array([ 10, 35, 60, 85, 110]) + >>> mt.sum(a, axis=1).execute() + array([ 10, 35, 60, 85, 110]) + For higher dimensional arrays summing a single axis can be done with ellipsis: + >>> mt.einsum('...j->...', a).execute() + array([ 10, 35, 60, 85, 110]) + >>> mt.einsum(a, [Ellipsis,1], [Ellipsis]).execute() + array([ 10, 35, 60, 85, 110]) + Compute a matrix transpose, or reorder any number of axes: + >>> mt.einsum('ji', c).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + >>> mt.einsum('ij->ji', c).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + >>> mt.einsum(c, [1,0]).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + >>> mt.transpose(c).execute() + array([[0, 3], + [1, 4], + [2, 5]]) + Vector inner products: + >>> mt.einsum('i,i', b, b).execute() + 30 + >>> mt.einsum(b, [0], b, [0]).execute() + 30 + >>> mt.inner(b,b).execute() + 30 + Matrix vector multiplication: + >>> mt.einsum('ij,j', a, b).execute() + array([ 30, 80, 130, 180, 230]) + >>> mt.einsum(a, [0,1], b, [1]).execute() + array([ 30, 80, 130, 180, 230]) + >>> mt.dot(a, b).execute() + array([ 30, 80, 130, 180, 230]) + >>> mt.einsum('...j,j', a, b).execute() + array([ 30, 80, 130, 180, 230]) + Broadcasting and scalar multiplication: + >>> mt.einsum('..., ...', 3, c).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + >>> mt.einsum(',ij', 3, c).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + >>> mt.einsum(3, [Ellipsis], c, [Ellipsis]).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + >>> mt.multiply(3, c).execute() + array([[ 0, 3, 6], + [ 9, 12, 15]]) + Vector outer product: + >>> mt.einsum('i,j', mt.arange(2)+1, b).execute() + array([[0, 1, 2, 3, 4], + [0, 2, 4, 6, 8]]) + >>> mt.einsum(mt.arange(2)+1, [0], b, [1]).execute() + array([[0, 1, 2, 3, 4], + [0, 2, 4, 6, 8]]) + >>> mt.outer(mt.arange(2)+1, b).execute() + array([[0, 1, 2, 3, 4], + [0, 2, 4, 6, 8]]) + Tensor contraction: + >>> a = mt.arange(60.).reshape(3,4,5) + >>> b = mt.arange(24.).reshape(4,3,2) + >>> mt.einsum('ijk,jil->kl', a, b).execute() + array([[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + >>> mt.einsum(a, [0,1,2], b, [1,0,3], [2,3]).execute() + array([[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + >>> mt.tensordot(a,b, axes=([1,0],[0,1])).execute() + array([[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + Writeable returned arrays (since version 1.10.0): + >>> a = mt.zeros((3, 3)) + >>> mt.einsum('ii->i', a)[:] = 1 + >>> a.execute() + array([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + Example of ellipsis use: + >>> a = mt.arange(6).reshape((3,2)) + >>> b = mt.arange(12).reshape((4,3)) + >>> mt.einsum('ki,jk->ij', a, b).execute() + array([[10, 28, 46, 64], + [13, 40, 67, 94]]) + >>> mt.einsum('ki,...k->i...', a, b).execute() + array([[10, 28, 46, 64], + [13, 40, 67, 94]]) + >>> 
mt.einsum('k...,jk', a, b).execute() + array([[10, 28, 46, 64], + [13, 40, 67, 94]]) + Chained array operations. For more complicated contractions, speed ups + might be achieved by repeatedly computing a 'greedy' path or pre-computing the + 'optimal' path and repeatedly applying it, using an + `einsum_path` insertion (since version 1.12.0). Performance improvements can be + particularly significant with larger arrays: + >>> a = mt.ones(64).reshape(2,4,8) + Basic `einsum`: ~1520ms (benchmarked on 3.1GHz Intel i5.) + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a) + Sub-optimal `einsum` (due to repeated path calculation time): ~330ms + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal') + Greedy `einsum` (faster optimal path approximation): ~160ms + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='greedy') + Optimal `einsum` (best usage pattern in some use cases): ~110ms + >>> path = mt.einsum_path('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')[0] + >>> for iteration in range(500): + ... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=path) + + """ + + all_inputs = [subscripts] + list(operands) + inputs, outputs, operands = parse_einsum_input(all_inputs) + subscripts = "->".join((inputs, outputs)) + axes_shape = dict() + for axes, op in zip(inputs.split(","), operands): + for ax, s in zip(axes, op.shape): + axes_shape[ax] = s + + if optimize: + optimize, _ = einsum_path(*all_inputs, optimize=optimize) + + shape = tuple(axes_shape[ax] for ax in outputs) + op = TensorEinsum( + subscripts=subscripts, + optimize=optimize, + dtype=dtype or operands[0].dtype, + order=order, + casting=casting, + ) + return op(operands, shape) diff --git a/python/xorbits/_mars/tensor/einsum/einsumfunc.py b/python/xorbits/_mars/tensor/einsum/einsumfunc.py new file mode 100644 index 000000000..23988b0c2 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/einsumfunc.py @@ -0,0 +1,1027 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +from numpy.compat import basestring + +from ..datasource.array import tensor as astensor + +__all__ = ["parse_einsum_input", "einsum_path"] + +einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +einsum_symbols_set = set(einsum_symbols) + + +def _flop_count(idx_contraction, inner, num_terms, size_dictionary): + """ + Computes the number of FLOPS in the contraction. + + Parameters + ---------- + idx_contraction : iterable + The indices involved in the contraction + inner : bool + Does this contraction require an inner product? + num_terms : int + The number of terms in a contraction + size_dictionary : dict + The size of each of the indices in idx_contraction + + Returns + ------- + flop_count : int + The total number of FLOPS required for the contraction. 
+ + Examples + -------- + + >>> _flop_count('abc', False, 1, {'a': 2, 'b':3, 'c':5}) + 30 + + >>> _flop_count('abc', True, 2, {'a': 2, 'b':3, 'c':5}) + 60 + + """ + + overall_size = _compute_size_by_dict(idx_contraction, size_dictionary) + op_factor = max(1, num_terms - 1) + if inner: + op_factor += 1 + + return overall_size * op_factor + + +def _compute_size_by_dict(indices, idx_dict): + """ + Computes the product of the elements in indices based on the dictionary + idx_dict. + + Parameters + ---------- + indices : iterable + Indices to base the product on. + idx_dict : dictionary + Dictionary of index sizes + + Returns + ------- + ret : int + The resulting product. + + Examples + -------- + >>> _compute_size_by_dict('abbc', {'a': 2, 'b':3, 'c':5}) + 90 + + """ + ret = 1 + for i in indices: + ret *= idx_dict[i] + return ret + + +def _find_contraction(positions, input_sets, output_set): + """ + Finds the contraction for a given set of input and output sets. + + Parameters + ---------- + positions : iterable + Integer positions of terms used in the contraction. + input_sets : list + List of sets that represent the lhs side of the einsum subscript + output_set : set + Set that represents the rhs side of the overall einsum subscript + + Returns + ------- + new_result : set + The indices of the resulting contraction + remaining : list + List of sets that have not been contracted, the new set is appended to + the end of this list + idx_removed : set + Indices removed from the entire contraction + idx_contraction : set + The indices used in the current contraction + + Examples + -------- + + # A simple dot product test case + >>> pos = (0, 1) + >>> isets = [set('ab'), set('bc')] + >>> oset = set('ac') + >>> _find_contraction(pos, isets, oset) + ({'a', 'c'}, [{'a', 'c'}], {'b'}, {'a', 'b', 'c'}) + + # A more complex case with additional terms in the contraction + >>> pos = (0, 2) + >>> isets = [set('abd'), set('ac'), set('bdc')] + >>> oset = set('ac') + >>> _find_contraction(pos, isets, oset) + ({'a', 'c'}, [{'a', 'c'}, {'a', 'c'}], {'b', 'd'}, {'a', 'b', 'c', 'd'}) + """ + + idx_contract = set() + idx_remain = output_set.copy() + remaining = [] + for ind, value in enumerate(input_sets): + if ind in positions: + idx_contract |= value + else: + remaining.append(value) + idx_remain |= value + + new_result = idx_remain & idx_contract + idx_removed = idx_contract - new_result + remaining.append(new_result) + + return (new_result, remaining, idx_removed, idx_contract) + + +def _optimal_path(input_sets, output_set, idx_dict, memory_limit): + """ + Computes all possible pair contractions, sieves the results based + on ``memory_limit`` and returns the lowest cost path. This algorithm + scales factorial with respect to the elements in the list ``input_sets``. + + Parameters + ---------- + input_sets : list + List of sets that represent the lhs side of the einsum subscript + output_set : set + Set that represents the rhs side of the overall einsum subscript + idx_dict : dictionary + Dictionary of index sizes + memory_limit : int + The maximum number of elements in a temporary array + + Returns + ------- + path : list + The optimal contraction order within the memory limit constraint. 
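For intuition, the docstring example below can also be reproduced through NumPy's public `einsum_path` with concrete operands whose sizes match the index sizes used there (a sketch only; the reported path depends on those sizes):

    import numpy as np

    x = np.ones((1, 2, 4))  # subscripts 'abd' with sizes a=1, b=2, d=4
    y = np.ones((1, 3))     # subscripts 'ac'  with sizes a=1, c=3
    z = np.ones((2, 4, 3))  # subscripts 'bdc' with sizes b=2, d=4, c=3

    path, report = np.einsum_path("abd,ac,bdc->", x, y, z, optimize="optimal")
    # For these sizes the exhaustive search contracts operands (0, 2) first and
    # then (0, 1), i.e. path == ['einsum_path', (0, 2), (0, 1)].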
+ + Examples + -------- + >>> isets = [set('abd'), set('ac'), set('bdc')] + >>> oset = set() + >>> idx_sizes = {'a': 1, 'b':2, 'c':3, 'd':4} + >>> _optimal_path(isets, oset, idx_sizes, 5000) + [(0, 2), (0, 1)] + """ + + full_results = [(0, [], input_sets)] + for iteration in range(len(input_sets) - 1): + iter_results = [] + + # Compute all unique pairs + for curr in full_results: + cost, positions, remaining = curr + for con in itertools.combinations(range(len(input_sets) - iteration), 2): + # Find the contraction + cont = _find_contraction(con, remaining, output_set) + new_result, new_input_sets, idx_removed, idx_contract = cont + + # Sieve the results based on memory_limit + new_size = _compute_size_by_dict(new_result, idx_dict) + if new_size > memory_limit: + continue + + # Build (total_cost, positions, indices_remaining) + total_cost = cost + _flop_count( + idx_contract, idx_removed, len(con), idx_dict + ) + new_pos = positions + [con] + iter_results.append((total_cost, new_pos, new_input_sets)) + + # Update combinatorial list, if we did not find anything return best + # path + remaining contractions + if iter_results: + full_results = iter_results + else: + path = min(full_results, key=lambda x: x[0])[1] + path += [tuple(range(len(input_sets) - iteration))] + return path + + # If we have not found anything return single einsum contraction + if len(full_results) == 0: + return [tuple(range(len(input_sets)))] + + path = min(full_results, key=lambda x: x[0])[1] + return path + + +def _parse_possible_contraction( + positions, input_sets, output_set, idx_dict, memory_limit, path_cost, naive_cost +): + """Compute the cost (removed size + flops) and resultant indices for + performing the contraction specified by ``positions``. + + Parameters + ---------- + positions : tuple of int + The locations of the proposed tensors to contract. + input_sets : list of sets + The indices found on each tensors. + output_set : set + The output indices of the expression. + idx_dict : dict + Mapping of each index to its size. + memory_limit : int + The total allowed size for an intermediary tensor. + path_cost : int + The contraction cost so far. + naive_cost : int + The cost of the unoptimized expression. + + Returns + ------- + cost : (int, int) + A tuple containing the size of any indices removed, and the flop cost. + positions : tuple of int + The locations of the proposed tensors to contract. + new_input_sets : list of sets + The resulting new list of indices if this proposed contraction is performed. 
+ + """ + + # Find the contraction + contract = _find_contraction(positions, input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + + # Sieve the results based on memory_limit + new_size = _compute_size_by_dict(idx_result, idx_dict) + if new_size > memory_limit: + return None + + # Build sort tuple + old_sizes = (_compute_size_by_dict(input_sets[p], idx_dict) for p in positions) + removed_size = sum(old_sizes) - new_size + + # NB: removed_size used to be just the size of any removed indices i.e.: + # helpers.compute_size_by_dict(idx_removed, idx_dict) + cost = _flop_count(idx_contract, idx_removed, len(positions), idx_dict) + sort = (-removed_size, cost) + + # Sieve based on total cost as well + if (path_cost + cost) > naive_cost: + return None + + # Add contraction to possible choices + return [sort, positions, new_input_sets] + + +def _update_other_results(results, best): + """Update the positions and provisional input_sets of ``results`` based on + performing the contraction result ``best``. Remove any involving the tensors + contracted. + + Parameters + ---------- + results : list + List of contraction results produced by ``_parse_possible_contraction``. + best : list + The best contraction of ``results`` i.e. the one that will be performed. + + Returns + ------- + mod_results : list + The list of modified results, updated with outcome of ``best`` contraction. + """ + + best_con = best[1] + bx, by = best_con + mod_results = [] + + for cost, (x, y), con_sets in results: + # Ignore results involving tensors just contracted + if x in best_con or y in best_con: + continue + + # Update the input_sets + del con_sets[by - int(by > x) - int(by > y)] + del con_sets[bx - int(bx > x) - int(bx > y)] + con_sets.insert(-1, best[2][-1]) + + # Update the position indices + mod_con = x - int(x > bx) - int(x > by), y - int(y > bx) - int(y > by) + mod_results.append((cost, mod_con, con_sets)) + + return mod_results + + +def _greedy_path(input_sets, output_set, idx_dict, memory_limit): + """ + Finds the path by contracting the best pair until the input list is + exhausted. The best pair is found by minimizing the tuple + ``(-prod(indices_removed), cost)``. What this amounts to is prioritizing + matrix multiplication or inner product operations, then Hadamard like + operations, and finally outer operations. Outer products are limited by + ``memory_limit``. This algorithm scales cubically with respect to the + number of elements in the list ``input_sets``. + + Parameters + ---------- + input_sets : list + List of sets that represent the lhs side of the einsum subscript + output_set : set + Set that represents the rhs side of the overall einsum subscript + idx_dict : dictionary + Dictionary of index sizes + memory_limit_limit : int + The maximum number of elements in a temporary array + + Returns + ------- + path : list + The greedy contraction order within the memory limit constraint. 
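As a rough worked example of the `sort = (-removed_size, cost)` key built above, computed by hand for inputs 'abd', 'ac', 'bdc' with index sizes a=1, b=2, c=3, d=4 (illustration only, not a call into the helpers): contracting operands (0, 2) frees the most intermediate volume, so a greedy strategy picks it first.

    from math import prod

    sizes = {"a": 1, "b": 2, "c": 3, "d": 4}

    def size(indices):
        return prod(sizes[i] for i in indices)

    # Candidate pair (0, 2): contract 'abd' with 'bdc', leaving 'ac'.
    removed_size = size("abd") + size("bdc") - size("ac")  # 8 + 24 - 3 = 29
    flops = size("abcd") * 2                                # inner contraction of two terms
    sort_key = (-removed_size, flops)                       # (-29, 48); the smallest key wins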
+ + Examples + -------- + >>> isets = [set('abd'), set('ac'), set('bdc')] + >>> oset = set() + >>> idx_sizes = {'a': 1, 'b':2, 'c':3, 'd':4} + >>> _greedy_path(isets, oset, idx_sizes, 5000) + [(0, 2), (0, 1)] + """ + + # Handle trivial cases that leaked through + if len(input_sets) == 1: + return [(0,)] + elif len(input_sets) == 2: + return [(0, 1)] + + # Build up a naive cost + contract = _find_contraction(range(len(input_sets)), input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + naive_cost = _flop_count(idx_contract, idx_removed, len(input_sets), idx_dict) + + # Initially iterate over all pairs + comb_iter = itertools.combinations(range(len(input_sets)), 2) + known_contractions = [] + + path_cost = 0 + path = [] + + for iteration in range(len(input_sets) - 1): + # Iterate over all pairs on first step, only previously found pairs on subsequent steps + for positions in comb_iter: + # Always initially ignore outer products + if input_sets[positions[0]].isdisjoint(input_sets[positions[1]]): + continue + + result = _parse_possible_contraction( + positions, + input_sets, + output_set, + idx_dict, + memory_limit, + path_cost, + naive_cost, + ) + if result is not None: + known_contractions.append(result) + + # If we do not have a inner contraction, rescan pairs including outer products + if len(known_contractions) == 0: # pragma: no cover + # Then check the outer products + for positions in itertools.combinations(range(len(input_sets)), 2): + result = _parse_possible_contraction( + positions, + input_sets, + output_set, + idx_dict, + memory_limit, + path_cost, + naive_cost, + ) + if result is not None: + known_contractions.append(result) + + # If we still did not find any remaining contractions, default back to einsum like behavior + if len(known_contractions) == 0: + path.append(tuple(range(len(input_sets)))) + break + + # Sort based on first index + best = min(known_contractions, key=lambda x: x[0]) + + # Now propagate as many unused contractions as possible to next iteration + known_contractions = _update_other_results(known_contractions, best) + + # Next iteration only compute contractions with the new tensor + # All other contractions have been accounted for + input_sets = best[2] + new_tensor_pos = len(input_sets) - 1 + comb_iter = ((i, new_tensor_pos) for i in range(new_tensor_pos)) + + # Update path and total cost + path.append(best[1]) + path_cost += best[0][1] + + return path + + +def _can_dot(inputs, result, idx_removed): + """ + Checks if we can use BLAS (np.tensordot) call and its beneficial to do so. + + Parameters + ---------- + inputs : list of str + Specifies the subscripts for summation. + result : str + Resulting summation. + idx_removed : set + Indices that are removed in the summation + + + Returns + ------- + type : bool + Returns true if BLAS should and can be used, else False + + Notes + ----- + If the operations is BLAS level 1 or 2 and is not already aligned + we default back to einsum as the memory movement to copy is more + costly than the operation itself. 
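The point of the check above is that a contraction shaped like a plain matrix product can be handed to a BLAS-backed `tensordot` instead of a generic einsum loop. A small NumPy sketch of the equivalence that makes such a dispatch safe:

    import numpy as np

    a = np.random.rand(3, 4)
    b = np.random.rand(4, 5)

    via_einsum = np.einsum("ij,jk->ik", a, b)
    via_blas = np.tensordot(a, b, axes=([1], [0]))  # GEMM-style contraction over 'j'

    np.testing.assert_allclose(via_einsum, via_blas)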
+ + + Examples + -------- + + # Standard GEMM operation + >>> _can_dot(['ij', 'jk'], 'ik', set('j')) + True + + # Can use the standard BLAS, but requires odd data movement + >>> _can_dot(['ijj', 'jk'], 'ik', set('j')) + False + + # DDOT where the memory is not aligned + >>> _can_dot(['ijk', 'ikj'], '', set('ijk')) + False + + """ + + # All `dot` calls remove indices + if len(idx_removed) == 0: + return False + + # BLAS can only handle two operands + if len(inputs) != 2: + return False + + input_left, input_right = inputs + + for c in set(input_left + input_right): + # can't deal with repeated indices on same input or more than 2 total + nl, nr = input_left.count(c), input_right.count(c) + if (nl > 1) or (nr > 1) or (nl + nr > 2): + return False + + # can't do implicit summation or dimension collapse e.g. + # "ab,bc->c" (implicitly sum over 'a') + # "ab,ca->ca" (take diagonal of 'a') + if nl + nr - 1 == int(c in result): + return False + + # Build a few temporaries + set_left = set(input_left) + set_right = set(input_right) + keep_left = set_left - idx_removed + keep_right = set_right - idx_removed + rs = len(idx_removed) + + # At this point we are a DOT, GEMV, or GEMM operation + + # Handle inner products + + # DDOT with aligned data + if input_left == input_right: + return True + + # DDOT without aligned data (better to use einsum) + if set_left == set_right: + return False + + # Handle the 4 possible (aligned) GEMV or GEMM cases + + # GEMM or GEMV no transpose + if input_left[-rs:] == input_right[:rs]: + return True + + # GEMM or GEMV transpose both + if input_left[:rs] == input_right[-rs:]: + return True + + # GEMM or GEMV transpose right + if input_left[-rs:] == input_right[-rs:]: + return True + + # GEMM or GEMV transpose left + if input_left[:rs] == input_right[:rs]: + return True + + # Einsum is faster than GEMV if we have to copy data + if not keep_left or not keep_right: + return False + + # We are a matrix-matrix product, but we need to copy data + return True + + +def parse_einsum_input(operands): + """ + A reproduction of einsum c side einsum parsing in python. 
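This parser has to reconcile the two calling conventions einsum supports, the subscript string and the interleaved operand/axis-list form, and reduce both to a single subscript string. A quick NumPy illustration of the two spellings it normalizes:

    import numpy as np

    a = np.arange(6).reshape(2, 3)
    b = np.arange(12).reshape(3, 4)

    r1 = np.einsum("ij,jk->ik", a, b)             # subscript-string form
    r2 = np.einsum(a, [0, 1], b, [1, 2], [0, 2])  # interleaved operand/axis-list form

    assert np.array_equal(r1, r2)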
+ + Returns + ------- + input_strings : str + Parsed input strings + output_string : str + Parsed output string + operands : list of array_like + The operands to use in the numpy contraction + + Examples + -------- + The operand list is simplified to reduce printing: + + >>> import mars.tensor as mt + >>> mt.random.seed(123) + >>> a = mt.random.rand(4, 4) + >>> b = mt.random.rand(4, 4, 4) + >>> parse_einsum_input(('...a,...a->...', a, b)) + ('za,xza', 'xz', [a, b]) # may vary + + >>> parse_einsum_input((a, [Ellipsis, 0], b, [Ellipsis, 0])) + ('za,xza', 'xz', [a, b]) # may vary + """ + + if len(operands) == 0: + raise ValueError("No input operands") + + if isinstance(operands[0], basestring): + subscripts = operands[0].replace(" ", "") + operands = [astensor(v) for v in operands[1:]] + + # Ensure all characters are valid + for s in subscripts: + if s in ".,->": + continue + if s not in einsum_symbols: + raise ValueError(f"Character {s} is not a valid symbol.") + + else: # pragma: no cover + tmp_operands = list(operands) + operand_list = [] + subscript_list = [] + for p in range(len(operands) // 2): + operand_list.append(tmp_operands.pop(0)) + subscript_list.append(tmp_operands.pop(0)) + + output_list = tmp_operands[-1] if len(tmp_operands) else None + operands = [astensor(v) for v in operand_list] + subscripts = "" + last = len(subscript_list) - 1 + for num, sub in enumerate(subscript_list): + for s in sub: + if s is Ellipsis: + subscripts += "..." + elif isinstance(s, int): + subscripts += einsum_symbols[s] + else: + raise TypeError( + "For this input type lists must contain " + "either int or Ellipsis" + ) + if num != last: + subscripts += "," + + if output_list is not None: + subscripts += "->" + for s in output_list: + if s is Ellipsis: + subscripts += "..." + elif isinstance(s, int): + subscripts += einsum_symbols[s] + else: + raise TypeError( + "For this input type lists must contain " + "either int or Ellipsis" + ) + # Check for proper "->" + if ("-" in subscripts) or (">" in subscripts): + invalid = (subscripts.count("-") > 1) or (subscripts.count(">") > 1) + if invalid or (subscripts.count("->") != 1): + raise ValueError("Subscripts can only contain one '->'.") + + # Parse ellipses + if "." in subscripts: + used = subscripts.replace(".", "").replace(",", "").replace("->", "") + unused = list(einsum_symbols_set - set(used)) + ellipse_inds = "".join(unused) + longest = 0 + + if "->" in subscripts: + input_tmp, output_sub = subscripts.split("->") + split_subscripts = input_tmp.split(",") + out_sub = True + else: + split_subscripts = subscripts.split(",") + out_sub = False + + for num, sub in enumerate(split_subscripts): + if "." 
in sub: + if (sub.count(".") != 3) or (sub.count("...") != 1): + raise ValueError("Invalid Ellipses.") + + # Take into account numerical values + if operands[num].shape == (): + ellipse_count = 0 + else: + ellipse_count = max(operands[num].ndim, 1) + ellipse_count -= len(sub) - 3 + + if ellipse_count > longest: + longest = ellipse_count + + if ellipse_count < 0: + raise ValueError("Ellipses lengths do not match.") + elif ellipse_count == 0: + split_subscripts[num] = sub.replace("...", "") + else: + rep_inds = ellipse_inds[-ellipse_count:] + split_subscripts[num] = sub.replace("...", rep_inds) + + subscripts = ",".join(split_subscripts) + if longest == 0: + out_ellipse = "" + else: + out_ellipse = ellipse_inds[-longest:] + + if out_sub: + subscripts += "->" + output_sub.replace("...", out_ellipse) + else: + # Special care for outputless ellipses + output_subscript = "" + tmp_subscripts = subscripts.replace(",", "") + for s in sorted(set(tmp_subscripts)): + if s not in (einsum_symbols): + raise ValueError(f"Character {s} is not a valid symbol.") + if tmp_subscripts.count(s) == 1: + output_subscript += s + normal_inds = "".join(sorted(set(output_subscript) - set(out_ellipse))) + + subscripts += "->" + out_ellipse + normal_inds + + # Build output string if does not exist + if "->" in subscripts: + input_subscripts, output_subscript = subscripts.split("->") + else: + input_subscripts = subscripts + # Build output subscripts + tmp_subscripts = subscripts.replace(",", "") + output_subscript = "" + for s in sorted(set(tmp_subscripts)): + if s not in einsum_symbols: + raise ValueError(f"Character {s} is not a valid symbol.") + if tmp_subscripts.count(s) == 1: + output_subscript += s + + # Make sure output subscripts are in the input + for char in output_subscript: + if char not in input_subscripts: + raise ValueError(f"Output character {char} did not appear in the input") + + # Make sure number operands is equivalent to the number of terms + if len(input_subscripts.split(",")) != len(operands): + raise ValueError( + "Number of einsum subscripts must be equal to the number of operands." + ) + + return (input_subscripts, output_subscript, operands) + + +def _einsum_path_dispatcher(*operands, **kwargs): + # NOTE: technically, we should only dispatch on array-like arguments, not + # subscripts (given as strings). But separating operands into + # arrays/subscripts is a little tricky/slow (given einsum's two supported + # signatures), so as a practical shortcut we dispatch on everything. + # Strings will be ignored for dispatching since they don't define + # __array_function__. + return operands + + +def einsum_path(*operands, **kwargs): + """ + einsum_path(subscripts, *operands, optimize='greedy') + + Evaluates the lowest cost contraction order for an einsum expression by + considering the creation of intermediate arrays. + + Parameters + ---------- + subscripts : str + Specifies the subscripts for summation. + *operands : list of array_like + These are the arrays for the operation. + optimize : {bool, list, tuple, 'greedy', 'optimal'} + Choose the type of path. If a tuple is provided, the second argument is + assumed to be the maximum intermediate size created. If only a single + argument is provided the largest input or output array size is used + as a maximum intermediate size. 
+
+ * if a list is given that starts with ``einsum_path``, uses this as the
+ contraction path
+ * if False no optimization is taken
+ * if True defaults to the 'greedy' algorithm
+ * 'optimal' An algorithm that combinatorially explores all possible
+ ways of contracting the listed tensors and chooses the least costly
+ path. Scales exponentially with the number of terms in the
+ contraction.
+ * 'greedy' An algorithm that chooses the best pair contraction
+ at each step. Effectively, this algorithm searches the largest inner,
+ Hadamard, and then outer products at each step. Scales cubically with
+ the number of terms in the contraction. Equivalent to the 'optimal'
+ path for most contractions.
+
+ Default is 'greedy'.
+
+ Returns
+ -------
+ path : list of tuples
+ A list representation of the einsum path.
+ string_repr : str
+ A printable representation of the einsum path.
+
+ Notes
+ -----
+ The resulting path indicates which terms of the input contraction should be
+ contracted first, the result of this contraction is then appended to the
+ end of the contraction list. This list can then be iterated over until all
+ intermediate contractions are complete.
+
+ See Also
+ --------
+ einsum, linalg.multi_dot
+
+ Examples
+ --------
+
+ We can begin with a chain dot example. In this case, it is optimal to
+ contract the ``b`` and ``c`` tensors first as represented by the first
+ element of the path ``(1, 2)``. The resulting tensor is added to the end
+ of the contraction and the remaining contraction ``(0, 1)`` is then
+ completed.
+
+ >>> np.random.seed(123)
+ >>> a = np.random.rand(2, 2)
+ >>> b = np.random.rand(2, 5)
+ >>> c = np.random.rand(5, 2)
+ >>> path_info = np.einsum_path('ij,jk,kl->il', a, b, c, optimize='greedy')
+ >>> print(path_info[0])
+ ['einsum_path', (1, 2), (0, 1)]
+ >>> print(path_info[1])
+ Complete contraction: ij,jk,kl->il # may vary
+ Naive scaling: 4
+ Optimized scaling: 3
+ Naive FLOP count: 1.600e+02
+ Optimized FLOP count: 5.600e+01
+ Theoretical speedup: 2.857
+ Largest intermediate: 4.000e+00 elements
+ -------------------------------------------------------------------------
+ scaling current remaining
+ -------------------------------------------------------------------------
+ 3 kl,jk->jl ij,jl->il
+ 3 jl,ij->il il->il
+
+
+ A more complex index transformation example.
+
+ >>> I = np.random.rand(10, 10, 10, 10)
+ >>> C = np.random.rand(10, 10)
+ >>> path_info = np.einsum_path('ea,fb,abcd,gc,hd->efgh', C, C, I, C, C,
+ ...
optimize='greedy') + + >>> print(path_info[0]) + ['einsum_path', (0, 2), (0, 3), (0, 2), (0, 1)] + >>> print(path_info[1]) + Complete contraction: ea,fb,abcd,gc,hd->efgh # may vary + Naive scaling: 8 + Optimized scaling: 5 + Naive FLOP count: 8.000e+08 + Optimized FLOP count: 8.000e+05 + Theoretical speedup: 1000.000 + Largest intermediate: 1.000e+04 elements + -------------------------------------------------------------------------- + scaling current remaining + -------------------------------------------------------------------------- + 5 abcd,ea->bcde fb,gc,hd,bcde->efgh + 5 bcde,fb->cdef gc,hd,cdef->efgh + 5 cdef,gc->defg hd,defg->efgh + 5 defg,hd->efgh efgh->efgh + """ + + # Make sure all keywords are valid + valid_contract_kwargs = ["optimize", "einsum_call"] + unknown_kwargs = [k for (k, v) in kwargs.items() if k not in valid_contract_kwargs] + if len(unknown_kwargs): + raise TypeError(f"Did not understand the following kwargs: {unknown_kwargs!r}") + + # Figure out what the path really is + path_type = kwargs.pop("optimize", True) + if path_type is True: + path_type = "greedy" + if path_type is None: + path_type = False + + memory_limit = None + + # No optimization or a named path algorithm + if (path_type is False) or isinstance(path_type, basestring): + pass + + # Given an explicit path + elif len(path_type) and (path_type[0] == "einsum_path"): # pragma: no cover + pass + + # Path tuple with memory limit + elif ( + (len(path_type) == 2) + and isinstance(path_type[0], basestring) + and isinstance(path_type[1], (int, float)) + ): # pragma: no cover + memory_limit = int(path_type[1]) + path_type = path_type[0] + + else: # pragma: no cover + raise TypeError(f"Did not understand the path: {path_type}") + + # Hidden option, only einsum should call this + einsum_call_arg = kwargs.pop("einsum_call", False) + + # Python side parsing + input_subscripts, output_subscript, operands = parse_einsum_input(operands) + + # Build a few useful list and sets + input_list = input_subscripts.split(",") + input_sets = [set(x) for x in input_list] + output_set = set(output_subscript) + indices = set(input_subscripts.replace(",", "")) + + # Get length of each unique dimension and ensure all dimensions are correct + dimension_dict = {} + broadcast_indices = [[] for x in range(len(input_list))] + for tnum, term in enumerate(input_list): + sh = operands[tnum].shape + if len(sh) != len(term): + raise ValueError( + "Einstein sum subscript %s does not contain the " + "correct number of indices for operand %d." + % (input_subscripts[tnum], tnum) + ) + for cnum, char in enumerate(term): + dim = sh[cnum] + + # Build out broadcast indices + if dim == 1: + broadcast_indices[tnum].append(char) + + if char in dimension_dict.keys(): + # For broadcasting cases we always want the largest dim size + if dimension_dict[char] == 1: + dimension_dict[char] = dim + elif dim not in (1, dimension_dict[char]): + raise ValueError( + "Size of label '%s' for operand %d (%d) " + "does not match previous terms (%d)." 
+ % (char, tnum, dimension_dict[char], dim) + ) + else: + dimension_dict[char] = dim + + # Convert broadcast inds to sets + broadcast_indices = [set(x) for x in broadcast_indices] + + # Compute size of each input array plus the output array + size_list = [ + _compute_size_by_dict(term, dimension_dict) + for term in input_list + [output_subscript] + ] + max_size = max(size_list) + + if memory_limit is None: + memory_arg = max_size + else: + memory_arg = memory_limit + + # Compute naive cost + # This isn't quite right, need to look into exactly how einsum does this + inner_product = (sum(len(x) for x in input_sets) - len(indices)) > 0 + naive_cost = _flop_count(indices, inner_product, len(input_list), dimension_dict) + + # Compute the path + if (path_type is False) or (len(input_list) in [1, 2]) or (indices == output_set): + # Nothing to be optimized, leave it to einsum + path = [tuple(range(len(input_list)))] + elif path_type == "greedy": + path = _greedy_path(input_sets, output_set, dimension_dict, memory_arg) + elif path_type == "optimal": + path = _optimal_path(input_sets, output_set, dimension_dict, memory_arg) + elif path_type[0] == "einsum_path": # pragma: no cover + path = path_type[1:] + else: # pragma: no cover + raise KeyError("Path name %s not found", path_type) + + cost_list, scale_list, size_list, contraction_list = [], [], [], [] + + # Build contraction tuple (positions, gemm, einsum_str, remaining) + for cnum, contract_inds in enumerate(path): + # Make sure we remove inds from right to left + contract_inds = tuple(sorted(list(contract_inds), reverse=True)) + + contract = _find_contraction(contract_inds, input_sets, output_set) + out_inds, input_sets, idx_removed, idx_contract = contract + + cost = _flop_count( + idx_contract, idx_removed, len(contract_inds), dimension_dict + ) + cost_list.append(cost) + scale_list.append(len(idx_contract)) + size_list.append(_compute_size_by_dict(out_inds, dimension_dict)) + + bcast = set() + tmp_inputs = [] + for x in contract_inds: + tmp_inputs.append(input_list.pop(x)) + bcast |= broadcast_indices.pop(x) + + new_bcast_inds = bcast - idx_removed + + # If we're broadcasting, nix blas + if not len(idx_removed & bcast): + do_blas = _can_dot(tmp_inputs, out_inds, idx_removed) + else: + do_blas = False + + # Last contraction + if (cnum - len(path)) == -1: + idx_result = output_subscript + else: + sort_result = [(dimension_dict[ind], ind) for ind in out_inds] + idx_result = "".join([x[1] for x in sorted(sort_result)]) + + input_list.append(idx_result) + broadcast_indices.append(new_bcast_inds) + einsum_str = ",".join(tmp_inputs) + "->" + idx_result + + contraction = (contract_inds, idx_removed, einsum_str, input_list[:], do_blas) + contraction_list.append(contraction) + + opt_cost = sum(cost_list) + 1 + + if einsum_call_arg: + return (operands, contraction_list) + + # Return the path along with a nice string representation + overall_contraction = input_subscripts + "->" + output_subscript + header = ("scaling", "current", "remaining") + + speedup = naive_cost / opt_cost + max_i = max(size_list) + + path_print = " Complete contraction: %s\n" % overall_contraction # noqa: E221 + path_print += " Naive scaling: %d\n" % len(indices) + path_print += " Optimized scaling: %d\n" % max(scale_list) + path_print += " Naive FLOP count: %.3e\n" % naive_cost + path_print += " Optimized FLOP count: %.3e\n" % opt_cost + path_print += " Theoretical speedup: %3.3f\n" % speedup + path_print += " Largest intermediate: %.3e elements\n" % max_i + path_print += "-" * 74 
+ "\n" + path_print += "%6s %24s %40s\n" % header + path_print += "-" * 74 + + for n, contraction in enumerate(contraction_list): + inds, idx_rm, einsum_str, remaining, blas = contraction + remaining_str = ",".join(remaining) + "->" + output_subscript + path_run = (scale_list[n], einsum_str, remaining_str) + path_print += "\n%4d %24s %40s" % path_run + + path = ["einsum_path"] + path + return path, path_print diff --git a/python/xorbits/_mars/tensor/einsum/tests/__init__.py b/python/xorbits/_mars/tensor/einsum/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/einsum/tests/test_einsum.py b/python/xorbits/_mars/tensor/einsum/tests/test_einsum.py new file mode 100644 index 000000000..271f3d3db --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/tests/test_einsum.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ....core import tile +from ... 
import einsum +from ...datasource import tensor + + +def test_einsum(): + data1 = np.random.rand(3, 4, 5) + data2 = np.random.rand(4, 3, 2) + + t1 = tensor(data1, chunk_size=2) + t2 = tensor(data2, chunk_size=3) + t = einsum("ijk, jil -> kl", t1, t2) + + assert t.shape == (5, 2) + + t = tile(t) + assert len(t.chunks) == 3 + + # multiply(data1, data2) + data1 = np.random.rand(6, 6) + data2 = np.random.rand(6, 6) + t1 = tensor(data1, chunk_size=3) + t2 = tensor(data2, chunk_size=3) + t = einsum("..., ...", t1, t2) + + assert t.shape == (6, 6) + + t = tile(t) + assert len(t.chunks) == 4 + + t = einsum("..., ...", t1, t2, optimize=True) + assert t.op.optimize == ["einsum_path", (0, 1)] + + # test broadcast + data1 = np.random.rand(1, 10, 9) + data2 = np.random.rand(9, 6) + data3 = np.random.rand(10, 6) + data4 = np.random.rand(8) + + t1 = tensor(data1, chunk_size=(1, (5, 5), (3, 3, 3))) + t2 = tensor(data2, chunk_size=((3, 3, 3), (3, 3))) + t3 = tensor(data3, chunk_size=((6, 4), (4, 2))) + t4 = tensor(data4, chunk_size=3) + t = einsum("ajk,kl,jl,a->a", t1, t2, t3, t4, optimize="") + + assert t.shape == (8,) + + t = tile(t) + assert len(t.chunks) == 3 diff --git a/python/xorbits/_mars/tensor/einsum/tests/test_einsum_execution.py b/python/xorbits/_mars/tensor/einsum/tests/test_einsum_execution.py new file mode 100644 index 000000000..675e7f672 --- /dev/null +++ b/python/xorbits/_mars/tensor/einsum/tests/test_einsum_execution.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import einsum +from ...datasource import tensor + + +def test_einsum_execution(setup): + data1 = np.random.rand(3, 4, 5) + data2 = np.random.rand(4, 3, 2) + + t1 = tensor(data1, chunk_size=2) + t2 = tensor(data2, chunk_size=3) + t = einsum("ijk, jil -> kl", t1, t2) + res = t.execute().fetch() + expected = np.einsum("ijk, jil -> kl", data1, data2) + np.testing.assert_almost_equal(res, expected) + + # dot + t = einsum("ijk, jil", t1, t2, optimize=True) + res = t.execute().fetch() + expected = np.einsum("ijk, jil", data1, data2, optimize=True) + np.testing.assert_almost_equal(res, expected) + + # multiply(data1, data2) + data1 = np.random.rand(6, 6) + data2 = np.random.rand(6, 6) + t1 = tensor(data1, chunk_size=3) + t2 = tensor(data2, chunk_size=3) + t = einsum("..., ...", t1, t2, order="C") + res = t.execute().fetch() + expected = np.einsum("..., ...", data1, data2, order="C") + np.testing.assert_almost_equal(res, expected) + + # sum(data, axis=-1) + data = np.random.rand(10) + t1 = tensor(data, chunk_size=3) + t = einsum("i->", t1, order="F") + res = t.execute().fetch() + expected = np.einsum("i->", data, order="F") + np.testing.assert_almost_equal(res, expected) + + # sum(data, axis=0) + t1 = tensor(data) + t = einsum("...i->...", t1) + res = t.execute().fetch() + expected = np.einsum("...i->...", data) + np.testing.assert_almost_equal(res, expected) + + # test broadcast + data1 = np.random.rand(1, 10, 9) + data2 = np.random.rand(9, 6) + data3 = np.random.rand(10, 6) + data4 = np.random.rand(8) + + t1 = tensor(data1, chunk_size=(1, (5, 5), (3, 3, 3))) + t2 = tensor(data2, chunk_size=((3, 3, 3), (3, 3))) + t3 = tensor(data3, chunk_size=((6, 4), (4, 2))) + t4 = tensor(data4, chunk_size=4) + t = einsum("ajk,kl,jl,a->a", t1, t2, t3, t4, optimize="optimal") + res = t.execute().fetch() + expected = np.einsum( + "ajk,kl,jl,a->a", data1, data2, data3, data4, optimize="optimal" + ) + np.testing.assert_almost_equal(res, expected) + + t = einsum("ajk,kl,jl,a->a", t1, t2, t3, t4, optimize="greedy") + res = t.execute().fetch() + expected = np.einsum( + "ajk,kl,jl,a->a", data1, data2, data3, data4, optimize="greedy" + ) + np.testing.assert_almost_equal(res, expected) diff --git a/python/xorbits/_mars/tensor/fetch/__init__.py b/python/xorbits/_mars/tensor/fetch/__init__.py new file mode 100644 index 000000000..eec1e1702 --- /dev/null +++ b/python/xorbits/_mars/tensor/fetch/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import TensorFetch, TensorFetchShuffle diff --git a/python/xorbits/_mars/tensor/fetch/core.py b/python/xorbits/_mars/tensor/fetch/core.py new file mode 100644 index 000000000..d84a9c7f3 --- /dev/null +++ b/python/xorbits/_mars/tensor/fetch/core.py @@ -0,0 +1,59 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core import OutputType, register_fetch_class +from ...core.operand import Fetch, FetchMixin, FetchShuffle +from ...serialization.serializables import DataTypeField +from ..operands import TensorOperandMixin + + +class TensorFetchMixin(TensorOperandMixin, FetchMixin): + __slots__ = () + _output_type_ = OutputType.tensor + + +class TensorFetch(TensorFetchMixin, Fetch): + dtype = DataTypeField("dtype") + + def __init__(self, **kw): + kw.pop("output_types", None) + kw.pop("_output_types", None) + super().__init__(**kw) + + def _new_chunks(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_chunks(inputs, kws=kws, **kw) + + def _new_tileables(self, inputs, kws=None, **kw): + if "_key" in kw and self.source_key is None: + self.source_key = kw["_key"] + return super()._new_tileables(inputs, kws=kws, **kw) + + +class TensorFetchShuffle(TensorFetchMixin, FetchShuffle): + _dtype = DataTypeField("dtype") + + def __init__(self, **kw): + kw.pop("output_types", None) + kw.pop("_output_types", None) + super().__init__(**kw) + + @property + def dtype(self): + return getattr(self, "_dtype", None) + + +register_fetch_class(OutputType.tensor, TensorFetch, TensorFetchShuffle) +register_fetch_class(OutputType.scalar, TensorFetch, TensorFetchShuffle) diff --git a/python/xorbits/_mars/tensor/fft/__init__.py b/python/xorbits/_mars/tensor/fft/__init__.py new file mode 100644 index 000000000..b80c1d708 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
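Back in the fetch module above, `register_fetch_class` follows a simple registry pattern: each output type maps to the fetch operand classes used to materialize data of that type. A toy sketch of the idea only; the real registry lives in the core package and its signature may differ:

    # Hypothetical, simplified registry for illustration.
    _fetch_registry = {}

    def register_fetch_class(output_type, fetch_cls, fetch_shuffle_cls):
        _fetch_registry[output_type] = (fetch_cls, fetch_shuffle_cls)

    def get_fetch_class(output_type, shuffle=False):
        fetch_cls, fetch_shuffle_cls = _fetch_registry[output_type]
        return fetch_shuffle_cls if shuffle else fetch_cls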
+ +from .fft import TensorFFT, fft +from .fft2 import TensorFFT2, fft2 +from .fftfreq import TensorFFTFreq, fftfreq +from .fftn import TensorFFTN, fftn +from .fftshift import TensorFFTShift, fftshift +from .hfft import TensorHFFT, hfft +from .ifft import TensorIFFT, ifft +from .ifft2 import TensorIFFT2, ifft2 +from .ifftn import TensorIFFTN, ifftn +from .ifftshift import TensorIFFTShift, ifftshift +from .ihfft import TensorIHFFT, ihfft +from .irfft import TensorIRFFT, irfft +from .irfft2 import TensorIRFFT2, irfft2 +from .irfftn import TensorIRFFTN, irfftn +from .rfft import TensorRFFT, rfft +from .rfft2 import TensorRFFT2, rfft2 +from .rfftfreq import TensorRFFTFreq, rfftfreq +from .rfftn import TensorRFFTN, rfftn diff --git a/python/xorbits/_mars/tensor/fft/core.py b/python/xorbits/_mars/tensor/fft/core.py new file mode 100644 index 000000000..241f55eb8 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/core.py @@ -0,0 +1,300 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +from ...core import recursive_tile +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + Int64Field, + KeyField, + StringField, + TupleField, +) +from ...utils import has_unknown_shape +from ..array_utils import get_array_module +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import decide_chunk_sizes, validate_axis + + +class TensorFFTBaseMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def _get_shape(cls, op, shape): + raise NotImplementedError + + @classmethod + def _tile_fft(cls, op, axes): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + if any(in_tensor.chunk_shape[axis] != 1 for axis in axes): + if has_unknown_shape(in_tensor): + yield + # fft requires only 1 chunk for the specified axis, so we do rechunk first + chunks = { + validate_axis(in_tensor.ndim, axis): in_tensor.shape[axis] + for axis in axes + } + new_chunks = decide_chunk_sizes( + in_tensor.shape, chunks, in_tensor.dtype.itemsize + ) + in_tensor = yield from recursive_tile(in_tensor.rechunk(new_chunks)) + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = op.copy().reset_key() + chunk_shape = cls._get_shape(op, c.shape) + out_chunk = chunk_op.new_chunk( + [c], shape=chunk_shape, index=c.index, order=out_tensor.order + ) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=nsplits, + ) + + def __call__(self, a, order=None): + shape = self._get_shape(self, a.shape) + order = a.order if order is None else order + return self.new_tensor([a], shape, order=order) + + +class TensorFFTMixin(TensorFFTBaseMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + return (yield from cls._tile_fft(op, [op.axis])) + 
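The rechunk performed in `_tile_fft` above exists because a discrete Fourier transform along an axis cannot be assembled from independent transforms of pieces of that axis; only once the transformed axis sits in a single chunk can the operation be applied chunk by chunk over the remaining axes. A small NumPy check of that fact:

    import numpy as np

    x = np.arange(8.0)

    full = np.fft.fft(x)
    piecewise = np.concatenate([np.fft.fft(x[:4]), np.fft.fft(x[4:])])

    # Transforming the halves independently does not reproduce the full FFT,
    # which is why the transformed axes are rechunked to a single chunk first.
    assert not np.allclose(full, piecewise)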
+ +class TensorComplexFFTMixin(TensorFFTMixin): + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + return tuple(new_shape) + + +def validate_fft(tensor, axis=-1, norm=None): + validate_axis(tensor.ndim, axis) + if norm is not None and norm not in ("ortho",): + raise ValueError(f'Invalid norm value {norm}, should be None or "ortho"') + + +class TensorFFTNMixin(TensorFFTBaseMixin): + @classmethod + def tile(cls, op): + return (yield from cls._tile_fft(op, op.axes)) + + @staticmethod + def _merge_shape(op, shape): + new_shape = list(shape) + if op.shape is not None: + for ss, axis in zip(op.shape, op.axes): + new_shape[axis] = ss + return new_shape + + +class TensorComplexFFTNMixin(TensorFFTNMixin): + @classmethod + def _get_shape(cls, op, shape): + return tuple(cls._merge_shape(op, shape)) + + +class TensorRealFFTNMixin(TensorFFTNMixin): + @classmethod + def _get_shape(cls, op, shape): + new_shape = cls._merge_shape(op, shape) + new_shape[op.axes[-1]] = new_shape[op.axes[-1]] // 2 + 1 + return tuple(new_shape) + + +class TensorRealIFFTNMixin(TensorFFTNMixin): + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + new_shape[op.axes[-1]] = 2 * (new_shape[op.axes[-1]] - 1) + return tuple(cls._merge_shape(op, new_shape)) + + +def validate_fftn(tensor, s=None, axes=None, norm=None): + if axes is None: + if s is None: + axes = tuple(range(tensor.ndim)) + else: + axes = tuple(range(len(s))) + else: + for axis in axes: + validate_axis(tensor.ndim, axis) + if len(set(axes)) < len(axes): + raise ValueError("Duplicate axes not allowed") + + if norm is not None and norm not in ("ortho",): + raise ValueError(f'Invalid norm value {norm}, should be None or "ortho"') + + return axes + + +class TensorFFTShiftMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def _is_inverse(cls): + return False + + @classmethod + def _process_axes(cls, x, axes): + if axes is None: + axes = tuple(range(x.ndim)) + elif isinstance(axes, Iterable): + axes = tuple(axes) + else: + axes = (axes,) + + return axes + + @classmethod + def tile(cls, op): + from ..merge import concatenate + + axes = op.axes + in_tensor = op.input + is_inverse = cls._is_inverse() + + if has_unknown_shape(in_tensor): + yield + + x = in_tensor + for axis in axes: + size = in_tensor.shape[axis] + slice_on = (size + 1) // 2 if not is_inverse else size // 2 + slc1 = [slice(None)] * axis + [slice(slice_on)] + slc2 = [slice(None)] * axis + [slice(slice_on, None)] + x = concatenate([x[slc2], x[slc1]], axis=axis) + + x = yield from recursive_tile(x) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=x.chunks, nsplits=x.nsplits + ) + + +class TensorDiscreteFourierTransform(TensorHasInput): + __slots__ = () + + +class TensorBaseFFT(TensorDiscreteFourierTransform): + _input = KeyField("input") + _norm = StringField("norm") + + @property + def norm(self): + return getattr(self, "_norm", None) + + +class TensorBaseSingleDimensionFFT(TensorBaseFFT): + _n = Int64Field("n") + _axis = Int32Field("axis") + + @property + def n(self): + return self._n + + @property + def axis(self): + return self._axis + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + xp = get_array_module(a) + fun = _get_fft_func(op, xp) + res = fun(a, n=op.n, axis=op.axis, norm=op.norm) + if res.dtype != op.dtype: + res = res.astype(op.dtype) + ctx[op.outputs[0].key] = res + + +class TensorBaseMultipleDimensionFFT(TensorBaseFFT): + 
_shape = TupleField("shape", FieldTypes.int64) + _axes = TupleField("axes", FieldTypes.int32) + + @property + def shape(self): + return self._shape + + @property + def axes(self): + return self._axes + + @classmethod + def execute(cls, ctx, op): + a = ctx[op.inputs[0].key] + xp = get_array_module(a) + fun = _get_fft_func(op, xp) + res = fun(a, s=op.shape, axes=op.axes, norm=op.norm) + if res.dtype != op.dtype: + res = res.astype(op.dtype) + ctx[op.outputs[0].key] = res + + +def _get_fft_func(op, xp): + fun_name = type(op).__name__.lower()[6:] # all op starts with tensor + return getattr(xp.fft, fun_name) + + +class TensorStandardFFT(TensorBaseSingleDimensionFFT): + pass + + +class TensorStandardFFTN(TensorBaseMultipleDimensionFFT): + pass + + +class TensorFFTShiftBase(TensorHasInput): + _input = KeyField("input") + _axes = TupleField("axes", FieldTypes.int32) + + @property + def axes(self): + return self._axes + + +class TensorRealFFT(TensorBaseSingleDimensionFFT): + pass + + +class TensorRealFFTN(TensorBaseMultipleDimensionFFT): + pass + + +class TensorHermitianFFT(TensorBaseSingleDimensionFFT): + pass diff --git a/python/xorbits/_mars/tensor/fft/fft.py b/python/xorbits/_mars/tensor/fft/fft.py new file mode 100644 index 000000000..f1a3e9e8d --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fft.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTMixin, TensorStandardFFT, validate_fft + + +class TensorFFT(TensorStandardFFT, TensorComplexFFTMixin): + _op_type_ = OperandDef.FFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + +def fft(a, n=None, axis=-1, norm=None): + """ + Compute the one-dimensional discrete Fourier Transform. + + This function computes the one-dimensional *n*-point discrete Fourier + Transform (DFT) with the efficient Fast Fourier Transform (FFT) + algorithm [CT]. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + n : int, optional + Length of the transformed axis of the output. + If `n` is smaller than the length of the input, the input is cropped. + If it is larger, the input is padded with zeros. If `n` is not given, + the length of the input along the axis specified by `axis` is used. + axis : int, optional + Axis over which to compute the FFT. If not given, the last axis is + used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + + Raises + ------ + IndexError + if `axes` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : for definition of the DFT and conventions used. + ifft : The inverse of `fft`. + fft2 : The two-dimensional FFT. 
+ fftn : The *n*-dimensional FFT. + rfftn : The *n*-dimensional FFT of real input. + fftfreq : Frequency bins for given FFT parameters. + + Notes + ----- + FFT (Fast Fourier Transform) refers to a way the discrete Fourier + Transform (DFT) can be calculated efficiently, by using symmetries in the + calculated terms. The symmetry is highest when `n` is a power of 2, and + the transform is therefore most efficient for these sizes. + + The DFT is defined, with the conventions used in this implementation, in + the documentation for the `numpy.fft` module. + + References + ---------- + .. [CT] Cooley, James W., and John W. Tukey, 1965, "An algorithm for the + machine calculation of complex Fourier series," *Math. Comput.* + 19: 297-301. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fft.fft(mt.exp(2j * mt.pi * mt.arange(8) / 8)).execute() + array([-2.33486982e-16+1.14423775e-17j, 8.00000000e+00-6.89018570e-16j, + 2.33486982e-16+2.33486982e-16j, 0.00000000e+00+0.00000000e+00j, + -1.14423775e-17+2.33486982e-16j, 0.00000000e+00+1.99159850e-16j, + 1.14423775e-17+1.14423775e-17j, 0.00000000e+00+0.00000000e+00j]) + + In this example, real input has an FFT which is Hermitian, i.e., symmetric + in the real part and anti-symmetric in the imaginary part, as described in + the `numpy.fft` documentation: + + >>> import matplotlib.pyplot as plt + >>> t = mt.arange(256) + >>> sp = mt.fft.fft(mt.sin(t)) + >>> freq = mt.fft.fftfreq(t.shape[-1]) + >>> plt.plot(freq.execute(), sp.real.execute(), freq.execute(), sp.imag.execute()) + [, ] + >>> plt.show() + + """ + a = astensor(a) + validate_fft(a, axis, norm) + op = TensorFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/fft2.py b/python/xorbits/_mars/tensor/fft/fft2.py new file mode 100644 index 000000000..3f76cb009 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fft2.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorFFT2(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.FFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def fft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional discrete Fourier Transform + + This function computes the *n*-dimensional discrete Fourier Transform + over any axes in an *M*-dimensional array by means of the + Fast Fourier Transform (FFT). By default, the transform is computed over + the last two axes of the input array, i.e., a 2-dimensional FFT. + + Parameters + ---------- + a : array_like + Input tensor, can be complex + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). 
+ This corresponds to ``n`` for ``fft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last two + axes are used. A repeated index in `axes` means the transform over + that axis is performed multiple times. A one-element sequence means + that a one-dimensional FFT is performed. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or the last two axes if `axes` is not given. + + Raises + ------ + ValueError + If `s` and `axes` have different length, or `axes` not given and + ``len(s) != 2``. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + ifft2 : The inverse two-dimensional FFT. + fft : The one-dimensional FFT. + fftn : The *n*-dimensional FFT. + fftshift : Shifts zero-frequency terms to the center of the array. + For two-dimensional input, swaps first and third quadrants, and second + and fourth quadrants. + + Notes + ----- + `fft2` is just `fftn` with a different default for `axes`. + + The output, analogously to `fft`, contains the term for zero frequency in + the low-order corner of the transformed axes, the positive frequency terms + in the first half of these axes, the term for the Nyquist frequency in the + middle of the axes and the negative frequency terms in the second half of + the axes, in order of decreasingly negative frequency. + + See `fftn` for details and a plotting example, and `mt.fft` for + definitions and conventions used. + + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.mgrid[:5, :5][0] + >>> mt.fft.fft2(a).execute() + array([[ 50.0 +0.j , 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5+17.20477401j, 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5 +4.0614962j , 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5 -4.0614962j , 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ], + [-12.5-17.20477401j, 0.0 +0.j , 0.0 +0.j , + 0.0 +0.j , 0.0 +0.j ]]) + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/fftfreq.py b/python/xorbits/_mars/tensor/fft/fftfreq.py new file mode 100644 index 000000000..b24d2cddb --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fftfreq.py @@ -0,0 +1,160 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import NotSupportTile, recursive_tile +from ...serialization.serializables import Float64Field, Int32Field, KeyField +from ..core import TensorOrder +from ..datasource import arange +from ..operands import TensorHasInput, TensorOperand, TensorOperandMixin + + +class TensorFFTFreq(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.FFTFREQ + + _n = Int32Field("n") + _d = Float64Field("d") + + def __init__(self, n=None, d=None, **kw): + super().__init__(_n=n, _d=d, **kw) + + @property + def n(self): + return self._n + + @property + def d(self): + return self._d + + def __call__(self, chunk_size=None): + shape = (self.n,) + return self.new_tensor( + None, shape, raw_chunk_size=chunk_size, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + in_tensor = yield from recursive_tile( + arange( + op.n, + gpu=op.gpu, + dtype=op.dtype, + chunks=tensor.extra_params.raw_chunk_size, + ) + ) + + out_chunks = [] + for c in in_tensor.chunks: + chunk_op = TensorFFTFreqChunk(n=op.n, d=op.d, dtype=op.dtype) + out_chunk = chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + **tensor.extra_params + ) + + +class TensorFFTFreqChunk(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.FFTFREQ_CHUNK + + _input = KeyField("input") + _n = Int32Field("n") + _d = Float64Field("d") + + def __init__(self, n=None, d=None, dtype=None, **kw): + super().__init__(_n=n, _d=d, dtype=dtype, **kw) + + @property + def n(self): + return self._n + + @property + def d(self): + return self._d + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @classmethod + def tile(cls, op): + raise NotSupportTile( + "FFTFreqChunk is a chunk operand which does not support tile" + ) + + @classmethod + def execute(cls, ctx, op): + n, d = op.n, op.d + x = ctx[op.inputs[0].key].copy() + x[x >= (n + 1) // 2] -= n + x /= n * d + ctx[op.outputs[0].key] = x + + +def fftfreq(n, d=1.0, gpu=None, chunk_size=None): + """ + Return the Discrete Fourier Transform sample frequencies. + + The returned float tensor `f` contains the frequency bin centers in cycles + per unit of the sample spacing (with zero at the start). For instance, if + the sample spacing is in seconds, then the frequency unit is cycles/second. + + Given a window length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n) if n is odd + + Parameters + ---------- + n : int + Window length. + d : scalar, optional + Sample spacing (inverse of the sampling rate). Defaults to 1. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + f : Tensor + Array of length `n` containing the sample frequencies. + + Examples + -------- + >>> import mars.tensor as mt + + >>> signal = mt.array([-2, 8, 6, 4, 1, 0, 3, 5], dtype=float) + >>> fourier = mt.fft.fft(signal) + >>> n = signal.size + >>> timestep = 0.1 + >>> freq = mt.fft.fftfreq(n, d=timestep) + >>> freq.execute() + array([ 0. , 1.25, 2.5 , 3.75, -5. 
, -3.75, -2.5 , -1.25]) + + """ + n, d = int(n), float(d) + op = TensorFFTFreq(n=n, d=d, dtype=np.dtype(float), gpu=gpu) + return op(chunk_size) diff --git a/python/xorbits/_mars/tensor/fft/fftn.py b/python/xorbits/_mars/tensor/fft/fftn.py new file mode 100644 index 000000000..cf4ce1ec6 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fftn.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorFFTN(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.FFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def fftn(a, s=None, axes=None, norm=None): + """ + Compute the N-dimensional discrete Fourier Transform. + + This function computes the *N*-dimensional discrete Fourier Transform over + any number of axes in an *M*-dimensional tensor by means of the Fast Fourier + Transform (FFT). + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + Repeated indices in `axes` means that the transform over that axis is + performed multiple times. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` and `a`, + as explained in the parameters section above. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + ifftn : The inverse of `fftn`, the inverse *n*-dimensional FFT. + fft : The one-dimensional FFT, with definitions and conventions used. + rfftn : The *n*-dimensional FFT of real input. + fft2 : The two-dimensional FFT. 
+ fftshift : Shifts zero-frequency terms to centre of tensor + + Notes + ----- + The output, analogously to `fft`, contains the term for zero frequency in + the low-order corner of all axes, the positive frequency terms in the + first half of all axes, the term for the Nyquist frequency in the middle + of all axes and the negative frequency terms in the second half of all + axes, in order of decreasingly negative frequency. + + See `mt.fft` for details, definitions and conventions used. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.mgrid[:3, :3, :3][0] + >>> mt.fft.fftn(a, axes=(1, 2)).execute() + array([[[ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]], + [[ 9.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]], + [[ 18.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]]]) + >>> mt.fft.fftn(a, (2, 2), axes=(0, 1)).execute() + array([[[ 2.+0.j, 2.+0.j, 2.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]], + [[-2.+0.j, -2.+0.j, -2.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j]]]) + + >>> import matplotlib.pyplot as plt + >>> [X, Y] = mt.meshgrid(2 * mt.pi * mt.arange(200) / 12, + ... 2 * mt.pi * mt.arange(200) / 34) + >>> S = mt.sin(X) + mt.cos(Y) + mt.random.uniform(0, 1, X.shape) + >>> FS = mt.fft.fftn(S) + >>> plt.imshow(mt.log(mt.abs(mt.fft.fftshift(FS))**2).execute()) + + >>> plt.show() + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/fftshift.py b/python/xorbits/_mars/tensor/fft/fftshift.py new file mode 100644 index 000000000..801eb4fa8 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/fftshift.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTShiftBase, TensorFFTShiftMixin + + +class TensorFFTShift(TensorFFTShiftBase, TensorFFTShiftMixin): + _op_type_ = OperandDef.FFTSHIFT + + def __init__(self, axes=None, **kw): + super().__init__(_axes=axes, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, x): + return self.new_tensor([x], x.shape) + + +def fftshift(x, axes=None): + """ + Shift the zero-frequency component to the center of the spectrum. + + This function swaps half-spaces for all axes listed (defaults to all). + Note that ``y[0]`` is the Nyquist component only if ``len(x)`` is even. + + Parameters + ---------- + x : array_like + Input tensor. + axes : int or shape tuple, optional + Axes over which to shift. Default is None, which shifts all axes. + + Returns + ------- + y : Tensor + The shifted tensor. + + See Also + -------- + ifftshift : The inverse of `fftshift`. 
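+
+    Notes
+    -----
+    As an informal sketch (assuming ``mt.roll`` mirrors ``numpy.roll``), for a
+    one-dimensional tensor of length ``n`` the reordering performed by
+    ``fftshift`` is simply a roll by ``n // 2``:
+
+    >>> import mars.tensor as mt
+
+    >>> mt.fft.fftshift(mt.arange(6)).execute()
+    array([3, 4, 5, 0, 1, 2])
+    >>> mt.roll(mt.arange(6), 3).execute()  # same reordering
+    array([3, 4, 5, 0, 1, 2])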
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> freqs = mt.fft.fftfreq(10, 0.1) + >>> freqs.execute() + array([ 0., 1., 2., 3., 4., -5., -4., -3., -2., -1.]) + >>> mt.fft.fftshift(freqs).execute() + array([-5., -4., -3., -2., -1., 0., 1., 2., 3., 4.]) + + Shift the zero-frequency component only along the second axis: + + >>> freqs = mt.fft.fftfreq(9, d=1./9).reshape(3, 3) + >>> freqs.execute() + array([[ 0., 1., 2.], + [ 3., 4., -4.], + [-3., -2., -1.]]) + >>> mt.fft.fftshift(freqs, axes=(1,)).execute() + array([[ 2., 0., 1.], + [-4., 3., 4.], + [-1., -3., -2.]]) + + """ + x = astensor(x) + dtype = np.fft.fftshift(np.empty((1,) * max(1, x.ndim), dtype=x.dtype)).dtype + axes = TensorFFTShift._process_axes(x, axes) + op = TensorFFTShift(axes=axes, dtype=dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/fft/hfft.py b/python/xorbits/_mars/tensor/fft/hfft.py new file mode 100644 index 000000000..6e3b071cd --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/hfft.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorHermitianFFT, validate_fft + + +class TensorHFFT(TensorHermitianFFT, TensorFFTMixin): + _op_type_ = OperandDef.HFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + else: + new_shape[op.axis] = 2 * (shape[op.axis] - 1) + return tuple(new_shape) + + +def hfft(a, n=None, axis=-1, norm=None): + """ + Compute the FFT of a signal that has Hermitian symmetry, i.e., a real + spectrum. + + Parameters + ---------- + a : array_like + The input tensor. + n : int, optional + Length of the transformed axis of the output. For `n` output + points, ``n//2 + 1`` input points are necessary. If the input is + longer than this, it is cropped. If it is shorter than this, it is + padded with zeros. If `n` is not given, it is determined from the + length of the input along the axis specified by `axis`. + axis : int, optional + Axis over which to compute the FFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + The length of the transformed axis is `n`, or, if `n` is not given, + ``2*m - 2`` where ``m`` is the length of the transformed axis of + the input. To get an odd number of output points, `n` must be + specified, for instance as ``2*m - 1`` in the typical case, + + Raises + ------ + IndexError + If `axis` is larger than the last axis of `a`. + + See also + -------- + rfft : Compute the one-dimensional FFT for real input. 
+ ihfft : The inverse of `hfft`. + + Notes + ----- + `hfft`/`ihfft` are a pair analogous to `rfft`/`irfft`, but for the + opposite case: here the signal has Hermitian symmetry in the time + domain and is real in the frequency domain. So here it's `hfft` for + which you must supply the length of the result if it is to be odd. + + * even: ``ihfft(hfft(a, 2*len(a) - 2) == a``, within roundoff error, + * odd: ``ihfft(hfft(a, 2*len(a) - 1) == a``, within roundoff error. + + Examples + -------- + >>> import mars.tensor as mt + + >>> signal = mt.array([1, 2, 3, 4, 3, 2]) + >>> mt.fft.fft(signal).execute() + array([ 15.+0.j, -4.+0.j, 0.+0.j, -1.-0.j, 0.+0.j, -4.+0.j]) + >>> mt.fft.hfft(signal[:4]).execute() # Input first half of signal + array([ 15., -4., 0., -1., 0., -4.]) + >>> mt.fft.hfft(signal, 6).execute() # Input entire signal and truncate + array([ 15., -4., 0., -1., 0., -4.]) + + + >>> signal = mt.array([[1, 1.j], [-1.j, 2]]) + >>> (mt.conj(signal.T) - signal).execute() # check Hermitian symmetry + array([[ 0.-0.j, 0.+0.j], + [ 0.+0.j, 0.-0.j]]) + >>> freq_spectrum = mt.fft.hfft(signal) + >>> freq_spectrum.execute() + array([[ 1., 1.], + [ 2., -2.]]) + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorHFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifft.py b/python/xorbits/_mars/tensor/fft/ifft.py new file mode 100644 index 000000000..7256f6a72 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifft.py @@ -0,0 +1,116 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTMixin, TensorStandardFFT, validate_fft + + +class TensorIFFT(TensorStandardFFT, TensorComplexFFTMixin): + _op_type_ = OperandDef.IFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + +def ifft(a, n=None, axis=-1, norm=None): + """ + Compute the one-dimensional inverse discrete Fourier Transform. + + This function computes the inverse of the one-dimensional *n*-point + discrete Fourier transform computed by `fft`. In other words, + ``ifft(fft(a)) == a`` to within numerical accuracy. + For a general description of the algorithm and definitions, + see `mt.fft`. + + The input should be ordered in the same way as is returned by `fft`, + i.e., + + * ``a[0]`` should contain the zero frequency term, + * ``a[1:n//2]`` should contain the positive-frequency terms, + * ``a[n//2 + 1:]`` should contain the negative-frequency terms, in + increasing order starting from the most negative frequency. + + For an even number of input points, ``A[n//2]`` represents the sum of + the values at the positive and negative Nyquist frequencies, as the two + are aliased together. See `numpy.fft` for details. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. 
+ n : int, optional + Length of the transformed axis of the output. + If `n` is smaller than the length of the input, the input is cropped. + If it is larger, the input is padded with zeros. If `n` is not given, + the length of the input along the axis specified by `axis` is used. + See notes about padding issues. + axis : int, optional + Axis over which to compute the inverse DFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `numpy.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + + Raises + ------ + IndexError + If `axes` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : An introduction, with definitions and general explanations. + fft : The one-dimensional (forward) FFT, of which `ifft` is the inverse + ifft2 : The two-dimensional inverse FFT. + ifftn : The n-dimensional inverse FFT. + + Notes + ----- + If the input parameter `n` is larger than the size of the input, the input + is padded by appending zeros at the end. Even though this is the common + approach, it might lead to surprising results. If a different padding is + desired, it must be performed before calling `ifft`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fft.ifft([0, 4, 0, 0]).execute() + array([ 1.+0.j, 0.+1.j, -1.+0.j, 0.-1.j]) + + Create and plot a band-limited signal with random phases: + + >>> import matplotlib.pyplot as plt + >>> t = mt.arange(400) + >>> n = mt.zeros((400,), dtype=complex) + >>> n[40:60] = mt.exp(1j*mt.random.uniform(0, 2*mt.pi, (20,))) + >>> s = mt.fft.ifft(n) + >>> plt.plot(t.execute(), s.real.execute(), 'b-', t.execute(), s.imag.execute(), 'r--') + ... + >>> plt.legend(('real', 'imaginary')) + ... + >>> plt.show() + + """ + a = astensor(a) + validate_fft(a, axis, norm) + op = TensorIFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifft2.py b/python/xorbits/_mars/tensor/fft/ifft2.py new file mode 100644 index 000000000..37ce72d9f --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifft2.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorIFFT2(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.IFFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def ifft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional inverse discrete Fourier Transform. 
+ + This function computes the inverse of the 2-dimensional discrete Fourier + Transform over any number of axes in an M-dimensional array by means of + the Fast Fourier Transform (FFT). In other words, ``ifft2(fft2(a)) == a`` + to within numerical accuracy. By default, the inverse transform is + computed over the last two axes of the input array. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fft2`, i.e. it should have the term for zero frequency + in the low-order corner of the two axes, the positive frequency terms in + the first half of these axes, the term for the Nyquist frequency in the + middle of the axes and the negative frequency terms in the second half of + both axes, in order of decreasingly negative frequency. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + s : sequence of ints, optional + Shape (length of each axis) of the output (``s[0]`` refers to axis 0, + ``s[1]`` to axis 1, etc.). This corresponds to `n` for ``ifft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. See notes for issue on `ifft` zero padding. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last two + axes are used. A repeated index in `axes` means the transform over + that axis is performed multiple times. A one-element sequence means + that a one-dimensional FFT is performed. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or the last two axes if `axes` is not given. + + Raises + ------ + ValueError + If `s` and `axes` have different length, or `axes` not given and + ``len(s) != 2``. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + fft2 : The forward 2-dimensional FFT, of which `ifft2` is the inverse. + ifftn : The inverse of the *n*-dimensional FFT. + fft : The one-dimensional FFT. + ifft : The one-dimensional inverse FFT. + + Notes + ----- + `ifft2` is just `ifftn` with a different default for `axes`. + + See `ifftn` for details and a plotting example, and `numpy.fft` for + definition and conventions used. + + Zero-padding, analogously with `ifft`, is performed by appending zeros to + the input along the specified dimension. Although this is the common + approach, it might lead to surprising results. If another form of zero + padding is desired, it must be performed before `ifft2` is called. 
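+
+    As a rough sanity-check sketch of the inverse relationship stated above
+    (``mt.random.rand`` and ``mt.allclose`` are assumed here to mirror their
+    NumPy counterparts):
+
+    >>> import mars.tensor as mt
+
+    >>> a = mt.random.rand(4, 4)
+    >>> mt.allclose(mt.fft.ifft2(mt.fft.fft2(a)), a).execute()
+    True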
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = 4 * mt.eye(4) + >>> mt.fft.ifft2(a).execute() + array([[ 1.+0.j, 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j, 1.+0.j], + [ 0.+0.j, 0.+0.j, 1.+0.j, 0.+0.j], + [ 0.+0.j, 1.+0.j, 0.+0.j, 0.+0.j]]) + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifftn.py b/python/xorbits/_mars/tensor/fft/ifftn.py new file mode 100644 index 000000000..4c563d5ca --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifftn.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorComplexFFTNMixin, TensorStandardFFTN, validate_fftn + + +class TensorIFFTN(TensorStandardFFTN, TensorComplexFFTNMixin): + _op_type_ = OperandDef.IFFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def ifftn(a, s=None, axes=None, norm=None): + """ + Compute the N-dimensional inverse discrete Fourier Transform. + + This function computes the inverse of the N-dimensional discrete + Fourier Transform over any number of axes in an M-dimensional tensor by + means of the Fast Fourier Transform (FFT). In other words, + ``ifftn(fftn(a)) == a`` to within numerical accuracy. + For a description of the definitions and conventions used, see `mt.fft`. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fftn`, i.e. it should have the term for zero frequency + in all axes in the low-order corner, the positive frequency terms in the + first half of all axes, the term for the Nyquist frequency in the middle + of all axes and the negative frequency terms in the second half of all + axes, in order of decreasingly negative frequency. + + Parameters + ---------- + a : array_like + Input tensor, can be complex. + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``ifft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. See notes for issue on `ifft` zero padding. + axes : sequence of ints, optional + Axes over which to compute the IFFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + Repeated indices in `axes` means that the inverse transform over that + axis is performed multiple times. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. 
+ + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` or `a`, + as explained in the parameters section above. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + mt.fft : Overall view of discrete Fourier transforms, with definitions + and conventions used. + fftn : The forward *n*-dimensional FFT, of which `ifftn` is the inverse. + ifft : The one-dimensional inverse FFT. + ifft2 : The two-dimensional inverse FFT. + ifftshift : Undoes `fftshift`, shifts zero-frequency terms to beginning + of tensor. + + Notes + ----- + See `mt.fft` for definitions and conventions used. + + Zero-padding, analogously with `ifft`, is performed by appending zeros to + the input along the specified dimension. Although this is the common + approach, it might lead to surprising results. If another form of zero + padding is desired, it must be performed before `ifftn` is called. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.eye(4) + >>> mt.fft.ifftn(mt.fft.fftn(a, axes=(0,)), axes=(1,)).execute() + array([[ 1.+0.j, 0.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 1.+0.j, 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 1.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j, 0.+0.j, 1.+0.j]]) + + + Create and plot an image with band-limited frequency content: + + >>> import matplotlib.pyplot as plt + >>> n = mt.zeros((200,200), dtype=complex) + >>> n[60:80, 20:40] = mt.exp(1j*mt.random.uniform(0, 2*mt.pi, (20, 20))) + >>> im = mt.fft.ifftn(n).real + >>> plt.imshow(im.execute()) + + >>> plt.show() + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/ifftshift.py b/python/xorbits/_mars/tensor/fft/ifftshift.py new file mode 100644 index 000000000..e83943aa9 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ifftshift.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTShiftBase, TensorFFTShiftMixin + + +class TensorIFFTShift(TensorFFTShiftBase, TensorFFTShiftMixin): + _op_type_ = OperandDef.IFFTSHIFT + + def __init__(self, axes=None, **kw): + super().__init__(_axes=axes, **kw) + + @classmethod + def _is_inverse(cls): + return True + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, x): + return self.new_tensor([x], x.shape) + + +def ifftshift(x, axes=None): + """ + The inverse of `fftshift`. Although identical for even-length `x`, the + functions differ by one sample for odd-length `x`. + + Parameters + ---------- + x : array_like + Input tensor. 
+ axes : int or shape tuple, optional + Axes over which to calculate. Defaults to None, which shifts all axes. + + Returns + ------- + y : Tensor + The shifted tensor. + + See Also + -------- + fftshift : Shift zero-frequency component to the center of the spectrum. + + Examples + -------- + >>> import mars.tensor as mt + + >>> freqs = mt.fft.fftfreq(9, d=1./9).reshape(3, 3) + >>> freqs.execute() + array([[ 0., 1., 2.], + [ 3., 4., -4.], + [-3., -2., -1.]]) + >>> mt.fft.ifftshift(mt.fft.fftshift(freqs)).execute() + array([[ 0., 1., 2.], + [ 3., 4., -4.], + [-3., -2., -1.]]) + + """ + x = astensor(x) + dtype = np.fft.ifftshift(np.empty((1,) * max(1, x.ndim), dtype=x.dtype)).dtype + axes = TensorIFFTShift._process_axes(x, axes) + op = TensorIFFTShift(axes=axes, dtype=dtype) + return op(x) diff --git a/python/xorbits/_mars/tensor/fft/ihfft.py b/python/xorbits/_mars/tensor/fft/ihfft.py new file mode 100644 index 000000000..bc4b1c150 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/ihfft.py @@ -0,0 +1,95 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorHermitianFFT, validate_fft + + +class TensorIHFFT(TensorHermitianFFT, TensorFFTMixin): + _op_type_ = OperandDef.IHFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + shape = op.n if op.n is not None else shape[op.axis] + if shape % 2 == 0: + shape = (shape // 2) + 1 + else: + shape = (shape + 1) // 2 + new_shape[op.axis] = shape + return tuple(new_shape) + + +def ihfft(a, n=None, axis=-1, norm=None): + """ + Compute the inverse FFT of a signal that has Hermitian symmetry. + + Parameters + ---------- + a : array_like + Input tensor. + n : int, optional + Length of the inverse FFT, the number of points along + transformation axis in the input to use. If `n` is smaller than + the length of the input, the input is cropped. If it is larger, + the input is padded with zeros. If `n` is not given, the length of + the input along the axis specified by `axis` is used. + axis : int, optional + Axis over which to compute the inverse FFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `numpy.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + The length of the transformed axis is ``n//2 + 1``. + + See also + -------- + hfft, irfft + + Notes + ----- + `hfft`/`ihfft` are a pair analogous to `rfft`/`irfft`, but for the + opposite case: here the signal has Hermitian symmetry in the time + domain and is real in the frequency domain. 
So here it's `hfft` for + which you must supply the length of the result if it is to be odd: + + * even: ``ihfft(hfft(a, 2*len(a) - 2) == a``, within roundoff error, + * odd: ``ihfft(hfft(a, 2*len(a) - 1) == a``, within roundoff error. + + Examples + -------- + >>> import mars.tensor as mt + + >>> spectrum = mt.array([ 15, -4, 0, -1, 0, -4]) + >>> mt.fft.ifft(spectrum).execute() + array([ 1.+0.j, 2.-0.j, 3.+0.j, 4.+0.j, 3.+0.j, 2.-0.j]) + >>> mt.fft.ihfft(spectrum).execute() + array([ 1.-0.j, 2.-0.j, 3.-0.j, 4.-0.j]) + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorIHFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/irfft.py b/python/xorbits/_mars/tensor/fft/irfft.py new file mode 100644 index 000000000..ece39e164 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/irfft.py @@ -0,0 +1,121 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorRealFFT, validate_fft + + +class TensorIRFFT(TensorRealFFT, TensorFFTMixin): + _op_type_ = OperandDef.IRFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + else: + new_shape[op.axis] = 2 * (new_shape[op.axis] - 1) + return tuple(new_shape) + + +def irfft(a, n=None, axis=-1, norm=None): + """ + Compute the inverse of the n-point DFT for real input. + + This function computes the inverse of the one-dimensional *n*-point + discrete Fourier Transform of real input computed by `rfft`. + In other words, ``irfft(rfft(a), len(a)) == a`` to within numerical + accuracy. (See Notes below for why ``len(a)`` is necessary here.) + + The input is expected to be in the form returned by `rfft`, i.e. the + real zero-frequency term followed by the complex positive frequency terms + in order of increasing frequency. Since the discrete Fourier Transform of + real input is Hermitian-symmetric, the negative frequency terms are taken + to be the complex conjugates of the corresponding positive frequency terms. + + Parameters + ---------- + a : array_like + The input tensor. + n : int, optional + Length of the transformed axis of the output. + For `n` output points, ``n//2+1`` input points are necessary. If the + input is longer than this, it is cropped. If it is shorter than this, + it is padded with zeros. If `n` is not given, it is determined from + the length of the input along the axis specified by `axis`. + axis : int, optional + Axis over which to compute the inverse FFT. If not given, the last + axis is used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. 
+ + Returns + ------- + out : Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + The length of the transformed axis is `n`, or, if `n` is not given, + ``2*(m-1)`` where ``m`` is the length of the transformed axis of the + input. To get an odd number of output points, `n` must be specified. + + Raises + ------ + IndexError + If `axis` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : For definition of the DFT and conventions used. + rfft : The one-dimensional FFT of real input, of which `irfft` is inverse. + fft : The one-dimensional FFT. + irfft2 : The inverse of the two-dimensional FFT of real input. + irfftn : The inverse of the *n*-dimensional FFT of real input. + + Notes + ----- + Returns the real valued `n`-point inverse discrete Fourier transform + of `a`, where `a` contains the non-negative frequency terms of a + Hermitian-symmetric sequence. `n` is the length of the result, not the + input. + + If you specify an `n` such that `a` must be zero-padded or truncated, the + extra/removed values will be added/removed at high frequencies. One can + thus resample a series to `m` points via Fourier interpolation by: + ``a_resamp = irfft(rfft(a), m)``. + + Examples + -------- + >>> import mars.tenosr as mt + + >>> mt.fft.ifft([1, -1j, -1, 1j]).execute() + array([ 0.+0.j, 1.+0.j, 0.+0.j, 0.+0.j]) + >>> mt.fft.irfft([1, -1j, -1]).execute() + array([ 0., 1., 0., 0.]) + + Notice how the last term in the input to the ordinary `ifft` is the + complex conjugate of the second term, and the output has zero imaginary + part everywhere. When calling `irfft`, the negative frequencies are not + specified, and the output array is purely real. + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorIRFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/irfft2.py b/python/xorbits/_mars/tensor/fft/irfft2.py new file mode 100644 index 000000000..0a975605b --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/irfft2.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealIFFTNMixin, validate_fftn + + +class TensorIRFFT2(TensorRealFFTN, TensorRealIFFTNMixin): + _op_type_ = OperandDef.IRFFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def irfft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional inverse FFT of a real array. + + Parameters + ---------- + a : array_like + The input tensor + s : sequence of ints, optional + Shape of the inverse FFT. + axes : sequence of ints, optional + The axes over which to compute the inverse fft. + Default is the last two axes. 
+ norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : Tensor + The result of the inverse real 2-D FFT. + + See Also + -------- + irfftn : Compute the inverse of the N-dimensional FFT of real input. + + Notes + ----- + This is really `irfftn` with different defaults. + For more details see `irfftn`. + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIRFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/irfftn.py b/python/xorbits/_mars/tensor/fft/irfftn.py new file mode 100644 index 000000000..ea6cf21be --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/irfftn.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealIFFTNMixin, validate_fftn + + +class TensorIRFFTN(TensorRealFFTN, TensorRealIFFTNMixin): + _op_type_ = OperandDef.IRFFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def irfftn(a, s=None, axes=None, norm=None): + """ + Compute the inverse of the N-dimensional FFT of real input. + + This function computes the inverse of the N-dimensional discrete + Fourier Transform for real input over any number of axes in an + M-dimensional tensor by means of the Fast Fourier Transform (FFT). In + other words, ``irfftn(rfftn(a), a.shape) == a`` to within numerical + accuracy. (The ``a.shape`` is necessary like ``len(a)`` is for `irfft`, + and for the same reason.) + + The input should be ordered in the same way as is returned by `rfftn`, + i.e. as for `irfft` for the final transformation axis, and as for `ifftn` + along all the other axes. + + Parameters + ---------- + a : array_like + Input tensor. + s : sequence of ints, optional + Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the + number of input points used along this axis, except for the last axis, + where ``s[-1]//2+1`` points of the input are used. + Along any axis, if the shape indicated by `s` is smaller than that of + the input, the input is cropped. If it is larger, the input is padded + with zeros. If `s` is not given, the shape of the input along the + axes specified by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the inverse FFT. If not given, the last + `len(s)` axes are used, or all axes if `s` is also not specified. + Repeated indices in `axes` means that the inverse transform over that + axis is performed multiple times. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. 
+ + Returns + ------- + out : Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` or `a`, + as explained in the parameters section above. + The length of each transformed axis is as given by the corresponding + element of `s`, or the length of the input in every axis except for the + last one if `s` is not given. In the final transformed axis the length + of the output when `s` is not given is ``2*(m-1)`` where ``m`` is the + length of the final transformed axis of the input. To get an odd + number of output points in the final axis, `s` must be specified. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + rfftn : The forward n-dimensional FFT of real input, + of which `ifftn` is the inverse. + fft : The one-dimensional FFT, with definitions and conventions used. + irfft : The inverse of the one-dimensional FFT of real input. + irfft2 : The inverse of the two-dimensional FFT of real input. + + Notes + ----- + See `fft` for definitions and conventions used. + + See `rfft` for definitions and conventions used for real input. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.zeros((3, 2, 2)) + >>> a[0, 0, 0] = 3 * 2 * 2 + >>> mt.fft.irfftn(a).execute() + array([[[ 1., 1.], + [ 1., 1.]], + [[ 1., 1.], + [ 1., 1.]], + [[ 1., 1.], + [ 1., 1.]]]) + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorIRFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.float_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/rfft.py b/python/xorbits/_mars/tensor/fft/rfft.py new file mode 100644 index 000000000..110c77966 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfft.py @@ -0,0 +1,118 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorFFTMixin, TensorRealFFT, validate_fft + + +class TensorRFFT(TensorRealFFT, TensorFFTMixin): + _op_type_ = OperandDef.RFFT + + def __init__(self, n=None, axis=-1, norm=None, **kw): + super().__init__(_n=n, _axis=axis, _norm=norm, **kw) + + @classmethod + def _get_shape(cls, op, shape): + new_shape = list(shape) + if op.n is not None: + new_shape[op.axis] = op.n + new_shape[op.axis] = new_shape[op.axis] // 2 + 1 + return tuple(new_shape) + + +def rfft(a, n=None, axis=-1, norm=None): + """ + Compute the one-dimensional discrete Fourier Transform for real input. + + This function computes the one-dimensional *n*-point discrete Fourier + Transform (DFT) of a real-valued array by means of an efficient algorithm + called the Fast Fourier Transform (FFT). + + Parameters + ---------- + a : array_like + Input tensor + n : int, optional + Number of points along transformation axis in the input to use. 
+ If `n` is smaller than the length of the input, the input is cropped. + If it is larger, the input is padded with zeros. If `n` is not given, + the length of the input along the axis specified by `axis` is used. + axis : int, optional + Axis over which to compute the FFT. If not given, the last axis is + used. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axis + indicated by `axis`, or the last one if `axis` is not specified. + If `n` is even, the length of the transformed axis is ``(n/2)+1``. + If `n` is odd, the length is ``(n+1)/2``. + + Raises + ------ + IndexError + If `axis` is larger than the last axis of `a`. + + See Also + -------- + mt.fft : For definition of the DFT and conventions used. + irfft : The inverse of `rfft`. + fft : The one-dimensional FFT of general (complex) input. + fftn : The *n*-dimensional FFT. + rfftn : The *n*-dimensional FFT of real input. + + Notes + ----- + When the DFT is computed for purely real input, the output is + Hermitian-symmetric, i.e. the negative frequency terms are just the complex + conjugates of the corresponding positive-frequency terms, and the + negative-frequency terms are therefore redundant. This function does not + compute the negative frequency terms, and the length of the transformed + axis of the output is therefore ``n//2 + 1``. + + When ``A = rfft(a)`` and fs is the sampling frequency, ``A[0]`` contains + the zero-frequency term 0*fs, which is real due to Hermitian symmetry. + + If `n` is even, ``A[-1]`` contains the term representing both positive + and negative Nyquist frequency (+fs/2 and -fs/2), and must also be purely + real. If `n` is odd, there is no term at fs/2; ``A[-1]`` contains + the largest positive frequency (fs/2*(n-1)/n), and is complex in the + general case. + + If the input `a` contains an imaginary part, it is silently discarded. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.fft.fft([0, 1, 0, 0]).execute() + array([ 1.+0.j, 0.-1.j, -1.+0.j, 0.+1.j]) + >>> mt.fft.rfft([0, 1, 0, 0]).execute() + array([ 1.+0.j, 0.-1.j, -1.+0.j]) + + Notice how the final element of the `fft` output is the complex conjugate + of the second element, for real input. For `rfft`, this symmetry is + exploited to compute only the non-negative frequency terms. + + """ + a = astensor(a) + validate_fft(a, axis=axis, norm=norm) + op = TensorRFFT(n=n, axis=axis, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/rfft2.py b/python/xorbits/_mars/tensor/fft/rfft2.py new file mode 100644 index 000000000..8a47517d5 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfft2.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealFFTNMixin, validate_fftn + + +class TensorRFFT2(TensorRealFFTN, TensorRealFFTNMixin): + _op_type_ = OperandDef.RFFT2 + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def rfft2(a, s=None, axes=(-2, -1), norm=None): + """ + Compute the 2-dimensional FFT of a real tensor. + + Parameters + ---------- + a : array_like + Input tensor, taken to be real. + s : sequence of ints, optional + Shape of the FFT. + axes : sequence of ints, optional + Axes over which to compute the FFT. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : Tensor + The result of the real 2-D FFT. + + See Also + -------- + rfftn : Compute the N-dimensional discrete Fourier Transform for real + input. + + Notes + ----- + This is really just `rfftn` with different default behavior. + For more details see `rfftn`. + + """ + if len(axes) != 2: + raise ValueError("axes length should be 2") + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorRFFT2(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/rfftfreq.py b/python/xorbits/_mars/tensor/fft/rfftfreq.py new file mode 100644 index 000000000..1d4819cee --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfftfreq.py @@ -0,0 +1,122 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import Float64Field, Int32Field +from ..core import TensorOrder +from ..datasource import arange +from ..operands import TensorOperand, TensorOperandMixin + + +class TensorRFFTFreq(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.RFFTFREQ + + _n = Int32Field("n") + _d = Float64Field("d") + + def __init__(self, n=None, d=None, **kw): + super().__init__(_n=n, _d=d, **kw) + + @property + def n(self): + return self._n + + @property + def d(self): + return self._d + + def __call__(self, chunk_size=None): + shape = (self.n // 2 + 1,) + return self.new_tensor( + None, shape, raw_chunk_size=chunk_size, order=TensorOrder.C_ORDER + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + t = arange( + tensor.shape[0], + dtype=op.dtype, + gpu=op.gpu, + chunk_size=tensor.extra_params.raw_chunk_size, + ) + t = t / (op.n * op.d) + t = yield from recursive_tile(t) + + new_op = op.copy() + return new_op.new_tensors( + None, + tensor.shape, + order=tensor.order, + chunks=t.chunks, + nsplits=t.nsplits, + **tensor.extra_params + ) + + +def rfftfreq(n, d=1.0, gpu=None, chunk_size=None): + """ + Return the Discrete Fourier Transform sample frequencies + (for usage with rfft, irfft). 
+ + The returned float tensor `f` contains the frequency bin centers in cycles + per unit of the sample spacing (with zero at the start). For instance, if + the sample spacing is in seconds, then the frequency unit is cycles/second. + + Given a window length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, n/2] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2-1, (n-1)/2] / (d*n) if n is odd + + Unlike `fftfreq` (but like `scipy.fftpack.rfftfreq`) + the Nyquist frequency component is considered to be positive. + + Parameters + ---------- + n : int + Window length. + d : scalar, optional + Sample spacing (inverse of the sampling rate). Defaults to 1. + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + + Returns + ------- + f : Tensor + Tensor of length ``n//2 + 1`` containing the sample frequencies. + + Examples + -------- + >>> import mars.tensor as mt + + >>> signal = mt.array([-2, 8, 6, 4, 1, 0, 3, 5, -3, 4], dtype=float) + >>> fourier = mt.fft.rfft(signal) + >>> n = signal.size + >>> sample_rate = 100 + >>> freq = mt.fft.fftfreq(n, d=1./sample_rate) + >>> freq.execute() + array([ 0., 10., 20., 30., 40., -50., -40., -30., -20., -10.]) + >>> freq = mt.fft.rfftfreq(n, d=1./sample_rate) + >>> freq.execute() + array([ 0., 10., 20., 30., 40., 50.]) + + """ + n, d = int(n), float(d) + op = TensorRFFTFreq(n=n, d=d, dtype=np.dtype(float), gpu=gpu) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/fft/rfftn.py b/python/xorbits/_mars/tensor/fft/rfftn.py new file mode 100644 index 000000000..bdfceeb1a --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/rfftn.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorRealFFTN, TensorRealFFTNMixin, validate_fftn + + +class TensorRFFTN(TensorRealFFTN, TensorRealFFTNMixin): + _op_type_ = OperandDef.RFFTN + + def __init__(self, shape=None, axes=None, norm=None, **kw): + super().__init__(_shape=shape, _axes=axes, _norm=norm, **kw) + + +def rfftn(a, s=None, axes=None, norm=None): + """ + Compute the N-dimensional discrete Fourier Transform for real input. + + This function computes the N-dimensional discrete Fourier Transform over + any number of axes in an M-dimensional real tensor by means of the Fast + Fourier Transform (FFT). By default, all axes are transformed, with the + real transform performed over the last axis, while the remaining + transforms are complex. + + Parameters + ---------- + a : array_like + Input tensor, taken to be real. + s : sequence of ints, optional + Shape (length along each transformed axis) to use from the input. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). 
+ The final element of `s` corresponds to `n` for ``rfft(x, n)``, while + for the remaining axes, it corresponds to `n` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes : sequence of ints, optional + Axes over which to compute the FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm : {None, "ortho"}, optional + Normalization mode (see `mt.fft`). Default is None. + + Returns + ------- + out : complex Tensor + The truncated or zero-padded input, transformed along the axes + indicated by `axes`, or by a combination of `s` and `a`, + as explained in the parameters section above. + The length of the last axis transformed will be ``s[-1]//2+1``, + while the remaining transformed axes will have lengths according to + `s`, or unchanged from the input. + + Raises + ------ + ValueError + If `s` and `axes` have different length. + IndexError + If an element of `axes` is larger than than the number of axes of `a`. + + See Also + -------- + irfftn : The inverse of `rfftn`, i.e. the inverse of the n-dimensional FFT + of real input. + fft : The one-dimensional FFT, with definitions and conventions used. + rfft : The one-dimensional FFT of real input. + fftn : The n-dimensional FFT. + rfft2 : The two-dimensional FFT of real input. + + Notes + ----- + The transform for real input is performed over the last transformation + axis, as by `rfft`, then the transform over the remaining axes is + performed as by `fftn`. The order of the output is as for `rfft` for the + final transformation axis, and as for `fftn` for the remaining + transformation axes. + + See `fft` for details, definitions and conventions used. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.ones((2, 2, 2)) + >>> mt.fft.rfftn(a).execute() + array([[[ 8.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j]], + [[ 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j]]]) + + >>> mt.fft.rfftn(a, axes=(2, 0)).execute() + array([[[ 4.+0.j, 0.+0.j], + [ 4.+0.j, 0.+0.j]], + [[ 0.+0.j, 0.+0.j], + [ 0.+0.j, 0.+0.j]]]) + + """ + a = astensor(a) + axes = validate_fftn(a, s=s, axes=axes, norm=norm) + op = TensorRFFTN(shape=s, axes=axes, norm=norm, dtype=np.dtype(np.complex_)) + return op(a) diff --git a/python/xorbits/_mars/tensor/fft/tests/__init__.py b/python/xorbits/_mars/tensor/fft/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/fft/tests/test_fft.py b/python/xorbits/_mars/tensor/fft/tests/test_fft.py new file mode 100644 index 000000000..84e02f43f --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/tests/test_fft.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ....core import tile +from ...datasource import ones +from .. import ( + fft, + fft2, + fftfreq, + fftn, + fftshift, + hfft, + ifft, + ifft2, + ifftn, + ifftshift, + ihfft, + irfft, + irfft2, + irfftn, + rfft, + rfft2, + rfftfreq, + rfftn, +) + + +def test_standard_fft(): + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = fft(t) + assert t1.shape == (10, 20, 30) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ifft(t) + assert t1.shape == (10, 20, 30) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = fft2(t, s=(23, 21)) + assert t1.shape == (10, 23, 21) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ifft2(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == (10, 11, 9) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = fftn(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == (10, 11, 9) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ifftn(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == (10, 11, 9) + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + +def test_real_fft(): + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = rfft(t) + assert t1.shape == np.fft.rfft(np.ones(t.shape)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = irfft(t) + assert t1.shape == np.fft.irfft(np.ones(t.shape)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = rfft2(t, s=(23, 21)) + assert t1.shape == np.fft.rfft2(np.ones(t.shape), s=(23, 21)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = irfft2(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == np.fft.irfft2(np.ones(t.shape), s=(11, 9), axes=(1, 2)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = rfftn(t, s=(11, 30), axes=(1, 2)) + assert t1.shape == np.fft.rfftn(np.ones(t.shape), s=(11, 30), axes=(1, 2)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = irfftn(t, s=(11, 9), axes=(1, 2)) + assert t1.shape == np.fft.irfftn(np.ones(t.shape), s=(11, 9), axes=(1, 2)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + +def test_hermitian_fft(): + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = hfft(t) + assert t1.shape == np.fft.hfft(np.ones(t.shape)).shape + t1 = tile(t1) + 
assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = hfft(t, n=100) + assert t1.shape == np.fft.hfft(np.ones(t.shape), n=100).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ihfft(t) + assert t1.shape == np.fft.ihfft(np.ones(t.shape)).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t = ones((10, 20, 30), chunk_size=(3, 20, 30)) + + t1 = ihfft(t, n=100) + assert t1.shape == np.fft.ihfft(np.ones(t.shape), n=100).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + t1 = ihfft(t, n=101) + assert t1.shape == np.fft.ihfft(np.ones(t.shape), n=101).shape + t1 = tile(t1) + assert t1.shape == tuple(sum(ns) for ns in t1.nsplits) + + +def test_fft_shift(): + freqs = fftfreq(9, d=1.0 / 9).reshape(3, 3) + t = ifftshift(fftshift(freqs)) + + assert t.dtype is not None + expect_dtype = np.fft.ifftshift( + np.fft.fftshift(np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3)) + ).dtype + assert t.dtype == expect_dtype + + +def test_fft_freq(): + t = fftfreq(10, 0.1, chunk_size=3) + + assert t.shape == np.fft.fftfreq(10, 0.1).shape + t = tile(t) + assert t.shape == tuple(sum(ns) for ns in t.nsplits) + + t = rfftfreq(10, 0.1, chunk_size=3) + + assert t.shape == np.fft.rfftfreq(10, 0.1).shape + t = tile(t) + assert t.shape == tuple(sum(ns) for ns in t.nsplits) diff --git a/python/xorbits/_mars/tensor/fft/tests/test_fft_execution.py b/python/xorbits/_mars/tensor/fft/tests/test_fft_execution.py new file mode 100644 index 000000000..9e520bc90 --- /dev/null +++ b/python/xorbits/_mars/tensor/fft/tests/test_fft_execution.py @@ -0,0 +1,541 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ....lib.mkl_interface import mkl_free_buffers +from ...datasource import tensor +from .. 
import ( + fft, + fft2, + fftfreq, + fftn, + fftshift, + hfft, + ifft, + ifft2, + ifftn, + ifftshift, + ihfft, + irfft, + irfft2, + irfftn, + rfft, + rfft2, + rfftfreq, + rfftn, +) + + +def test_fft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = fft(t) + res = r.execute().fetch() + expected = np.fft.fft(raw) + np.testing.assert_allclose(res, expected) + + r = fft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft(t, n=11) + res = r.execute().fetch() + expected = np.fft.fft(raw, n=11) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 8)) + + r = fft(t) + res = r.execute().fetch() + expected = np.fft.fft(raw) + np.testing.assert_allclose(res, expected) + + r = fft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft(t, n=11) + res = r.execute().fetch() + expected = np.fft.fft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_ifft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = ifft(t) + res = r.execute().fetch() + expected = np.fft.ifft(raw) + np.testing.assert_allclose(res, expected) + + r = ifft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft(t, n=11) + res = r.execute().fetch() + expected = np.fft.ifft(raw, n=11) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 10)) + + r = ifft(t) + res = r.execute().fetch() + expected = np.fft.ifft(raw) + np.testing.assert_allclose(res, expected) + + r = ifft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft(t, n=11) + res = r.execute().fetch() + expected = np.fft.ifft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_fft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 20, 30)) + + r = fft2(t) + res = r.execute().fetch() + expected = np.fft.fft2(raw) + np.testing.assert_allclose(res, expected) + + r = fft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 10, 12)) + + r = fft2(t) + res = r.execute().fetch() + expected = np.fft.fft2(raw) + np.testing.assert_allclose(res, expected) + + r = fft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = fft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.fft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_ifft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = 
tensor(raw, chunk_size=(8, 20, 30)) + + r = ifft2(t) + res = r.execute().fetch() + expected = np.fft.ifft2(raw) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 6, 10)) + + r = ifft2(t) + res = r.execute().fetch() + expected = np.fft.ifft2(raw) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = ifft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.ifft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_fftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = fftn(t) + res = r.execute().fetch() + expected = np.fft.fftn(raw) + np.testing.assert_allclose(res, expected) + + r = fftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(6, 6, 8)) + + r = fftn(t) + res = r.execute().fetch() + expected = np.fft.fftn(raw) + np.testing.assert_allclose(res, expected) + + r = fftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.fftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = fftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.fftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_ifftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = ifftn(t) + res = r.execute().fetch() + expected = np.fft.ifftn(raw) + np.testing.assert_allclose(res, expected) + + r = ifftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(6, 8, 14)) + + r = ifftn(t) + res = r.execute().fetch() + expected = np.fft.ifftn(raw) + 
np.testing.assert_allclose(res, expected) + + r = ifftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ifftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = ifftn(t, s=(11, 12, 5), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.ifftn(raw, s=(11, 12, 5), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_rfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = rfft(t) + res = r.execute().fetch() + expected = np.fft.rfft(raw) + np.testing.assert_allclose(res, expected) + + r = rfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.rfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = rfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.rfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_irfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = irfft(t) + res = r.execute().fetch() + expected = np.fft.irfft(raw) + np.testing.assert_allclose(res, expected) + + r = irfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.irfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = irfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.irfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_rfft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 20, 30)) + + r = rfft2(t) + res = r.execute().fetch() + expected = np.fft.rfft2(raw) + np.testing.assert_allclose(res, expected) + + r = rfft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.rfft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = rfft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.rfft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = rfft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.rfft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_irfft2_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 20, 30)) + + r = irfft2(t) + res = r.execute().fetch() + expected = np.fft.irfft2(raw) + np.testing.assert_allclose(res, expected) + + r = irfft2(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.irfft2(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = irfft2(t, s=(11, 12)) + res = r.execute().fetch() + expected = np.fft.irfft2(raw, s=(11, 12)) + np.testing.assert_allclose(res, expected) + + r = irfft2(t, s=(11, 12), axes=(-1, -2)) + res = r.execute().fetch() + expected = np.fft.irfft2(raw, s=(11, 12), axes=(-1, -2)) + np.testing.assert_allclose(res, expected) + + +def test_rfftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = rfftn(t) + res = r.execute().fetch() + expected = np.fft.rfftn(raw) + np.testing.assert_allclose(res, expected) + + r = rfftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.rfftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = rfftn(t, s=(11, 12, 5)) + res = r.execute().fetch() + expected = np.fft.rfftn(raw, s=(11, 12, 5)) + np.testing.assert_allclose(res, expected) + + r = rfftn(t, s=(11, 12, 11), axes=(-1, -2, -3)) + res = 
r.execute().fetch() + expected = np.fft.rfftn(raw, s=(11, 12, 11), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_irfftn_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(10, 20, 30)) + + r = irfftn(t) + res = r.execute().fetch() + expected = np.fft.irfftn(raw) + np.testing.assert_allclose(res, expected) + + r = irfftn(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.irfftn(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = irfftn(t, s=(11, 21, 5)) + res = r.execute().fetch() + expected = np.fft.irfftn(raw, s=(11, 21, 5)) + np.testing.assert_allclose(res, expected) + + # a bug in mkl version will cause the section below to fail + if mkl_free_buffers is None: + r = irfftn(t, s=(11, 21, 30), axes=(-1, -2, -3)) + res = r.execute().fetch() + expected = np.fft.irfftn(raw, s=(11, 21, 30), axes=(-1, -2, -3)) + np.testing.assert_allclose(res, expected) + + +def test_hfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = hfft(t) + res = r.execute().fetch() + expected = np.fft.hfft(raw) + np.testing.assert_allclose(res, expected) + + r = hfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.hfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = hfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.hfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + +def test_ihfft_execution(setup): + raw = np.random.rand(10, 20, 30) + t = tensor(raw, chunk_size=(8, 8, 30)) + + r = ihfft(t) + res = r.execute().fetch() + expected = np.fft.ihfft(raw) + np.testing.assert_allclose(res, expected) + + r = ihfft(t, norm="ortho") + res = r.execute().fetch() + expected = np.fft.ihfft(raw, norm="ortho") + np.testing.assert_allclose(res, expected) + + r = ihfft(t, n=11) + res = r.execute().fetch() + expected = np.fft.ihfft(raw, n=11) + np.testing.assert_allclose(res, expected) + + r = ihfft(t, n=12) + res = r.execute().fetch() + expected = np.fft.ihfft(raw, n=12) + np.testing.assert_allclose(res, expected) + + +def test_fft_freq_execution(setup): + t = fftfreq(10, 0.1, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.fftfreq(10, 0.1)) + + t = fftfreq(11, 0.01, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.fftfreq(11, 0.01)) + + +def test_rfft_freq_execution(setup): + t = rfftfreq(20, 0.1, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.rfftfreq(20, 0.1)) + + t = rfftfreq(21, 0.01, chunk_size=6) + + res = t.execute().fetch() + np.testing.assert_allclose(res, np.fft.rfftfreq(21, 0.01)) + + +def test_fft_shift_execution(setup): + t = fftfreq(10, 0.1, chunk_size=6) + r = fftshift(t) + + res = r.execute().fetch() + np.testing.assert_allclose(res, np.fft.fftshift(np.fft.fftfreq(10, 0.1))) + + freqs = fftfreq(9, d=1.0 / 9, chunk_size=4).reshape(3, 3) + r = fftshift(freqs, axes=(1,)) + + res = r.execute().fetch() + expected = np.fft.fftshift(np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3), axes=(1,)) + np.testing.assert_allclose(res, expected) + + +def test_ifft_shift_execution(setup): + t = fftfreq(9, d=1.0 / 9, chunk_size=4).reshape(3, 3) + r = ifftshift(t) + + res = r.execute().fetch() + expected = np.fft.ifftshift(np.fft.fftfreq(9, d=1.0 / 9).reshape(3, 3)) + np.testing.assert_allclose(res, expected) diff --git a/python/xorbits/_mars/tensor/fuse/__init__.py b/python/xorbits/_mars/tensor/fuse/__init__.py new file mode 
100644 index 000000000..1b08a8e25 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import TensorFuseChunk +from .cupy import TensorCpFuseChunk +from .numexpr import TensorNeFuseChunk diff --git a/python/xorbits/_mars/tensor/fuse/core.py b/python/xorbits/_mars/tensor/fuse/core.py new file mode 100644 index 000000000..8cda209c4 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/core.py @@ -0,0 +1,25 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...core.operand import FuseChunkMixin +from ..operands import TensorFuse, TensorOperandMixin + + +class TensorFuseChunkMixin(FuseChunkMixin, TensorOperandMixin): + __slots__ = () + + +class TensorFuseChunk(TensorFuse, TensorFuseChunkMixin): + def __init__(self, dtype=None, **kw): + super().__init__(dtype=dtype, **kw) diff --git a/python/xorbits/_mars/tensor/fuse/cupy.py b/python/xorbits/_mars/tensor/fuse/cupy.py new file mode 100644 index 000000000..f95db4a3b --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/cupy.py @@ -0,0 +1,79 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from string import ascii_letters + +from ...utils import tokenize +from .. 
import arithmetic +from ..operands import TensorFuse +from .core import TensorFuseChunkMixin + + +class TensorCpFuseChunk(TensorFuse, TensorFuseChunkMixin): + # use for cupy-fused operand + _op_type_ = None # no opcode, cannot be serialized + + @classmethod + def execute(cls, ctx, op): + import cupy as cp + + chunk = op.outputs[0] + func = cp.ElementwiseKernel(*_evaluate(chunk)) + ctx[chunk.key] = func(*[ctx[i.key] for i in op.inputs]) + + +# execution part +CP_BINOP_TO_STRING = { + arithmetic.TensorSubtract: "-", + arithmetic.TensorMultiply: "*", + arithmetic.TensorTrueDiv: "/", +} + +CP_UNARYOP_TO_STRING = { + arithmetic.TensorSqrt: "sqrt", +} + + +def _evaluate(chunk): + letters = iter(letter for letter in ascii_letters if letter not in "ni") + + input_types = [i.dtype.name for i in chunk.op.inputs] + input_names = {i: next(letters) for i in chunk.op.inputs} + input_arguments = ", ".join( + [f"{tp} {input_names[i]}" for i, tp in zip(chunk.op.inputs, input_types)] + ) + output_type = chunk.op.dtype.name + output_name = next(letters) + output_argument = f"{output_type} {output_name}" + body = dict(input_names) + + for node in chunk.composed: + op_cls = type(node.op) + if op_cls in CP_BINOP_TO_STRING: + input_bodies = [body.get(i, repr(i)) for i in (node.op.lhs, node.op.rhs)] + body[node] = f" {CP_BINOP_TO_STRING[op_cls]} ".join(input_bodies) + elif op_cls in CP_UNARYOP_TO_STRING: + input_data = body[node.op.inputs[0]] + body[node] = f"{CP_UNARYOP_TO_STRING[op_cls]}({input_data})" + else: + raise NotImplementedError + + body = f"{output_name} = {body[chunk.composed[-1]]}" + key = tokenize(input_arguments, output_argument, body) + return ( + input_arguments, + output_argument, + body, + f"{type(chunk.op).__name__.lower()}_{key}", + ) diff --git a/python/xorbits/_mars/tensor/fuse/numexpr.py b/python/xorbits/_mars/tensor/fuse/numexpr.py new file mode 100644 index 000000000..6546c1dcd --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/numexpr.py @@ -0,0 +1,198 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from itertools import count + +try: + import numexpr as ne + + NUMEXPR_INSTALLED = True +except ImportError: + ne = None + NUMEXPR_INSTALLED = False +import numpy as np + +from .. import arithmetic, reduction +from ..array_utils import as_same_device +from ..operands import TensorFuse +from .core import TensorFuseChunkMixin + + +class TensorNeFuseChunk(TensorFuse, TensorFuseChunkMixin): + _op_type_ = None # no opcode, cannot be serialized + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + inputs = as_same_device([ctx[c.key] for c in op.inputs], device=op.device) + counter = count() + # Unified the var names to V_0, V_1, ... for better cache hit. 
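+ # For example, a fused chunk that evaluates ``(a + b) * a`` over two input
+ # chunks is rendered as ``(V_0 + V_1) * V_0`` regardless of the actual chunk
+ # keys (placeholders are assigned in input order), so structurally identical
+ # expressions reuse numexpr's compiled-expression cache across executions.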
+ key_to_var = defaultdict(lambda: f"V_{counter.__next__()}") + local_dict = {key_to_var[c.key]: i for c, i in zip(op.inputs, inputs)} + expr = _evaluate(chunk).format_map(key_to_var) + # The numexpr.evaluate is thread safe: https://github.com/pydata/numexpr/pull/200 + try: + res = ne.evaluate(expr, local_dict=local_dict, global_dict={}) + except Exception as e: + raise RuntimeError( + f"Failed to evaluate numexpr {repr(expr)} on local dict {local_dict}." + ) from e + res = _maybe_keepdims(chunk, res) + if chunk.ndim == 0 and res.ndim == 1 and res.size == 0: + res = res.dtype.type(0) + ctx[chunk.key] = res + + +# execution part +NE_UNARYOP_TO_STRING = { + arithmetic.TensorNegative: "-", + arithmetic.TensorAbs: "abs", + arithmetic.TensorConj: "conj", + arithmetic.TensorExp: "exp", + arithmetic.TensorLog: "log", + arithmetic.TensorLog10: "log10", + arithmetic.TensorExpm1: "expm1", + arithmetic.TensorLog1p: "log1p", + arithmetic.TensorSqrt: "sqrt", + arithmetic.TensorSin: "sin", + arithmetic.TensorCos: "cos", + arithmetic.TensorTan: "tan", + arithmetic.TensorArcsin: "arcsin", + arithmetic.TensorArccos: "arccos", + arithmetic.TensorArctan: "arctan", + arithmetic.TensorSinh: "sinh", + arithmetic.TensorCosh: "cosh", + arithmetic.TensorTanh: "tanh", + arithmetic.TensorArcsinh: "arcsinh", + arithmetic.TensorArccosh: "arccosh", + arithmetic.TensorArctanh: "arctanh", + arithmetic.TensorFloor: "floor", + arithmetic.TensorCeil: "ceil", + arithmetic.TensorNot: "~", +} + + +NE_BINOP_TO_STRING = { + arithmetic.TensorAdd: "+", + arithmetic.TensorSubtract: "-", + arithmetic.TensorMultiply: "*", + arithmetic.TensorDivide: "/", + arithmetic.TensorMod: "%", + arithmetic.TensorPower: "**", + arithmetic.TensorLshift: "<<", + arithmetic.TensorRshift: ">>", + arithmetic.TensorEqual: "==", + arithmetic.TensorNotEqual: "!=", + arithmetic.TensorLessThan: "<", + arithmetic.TensorLessEqual: "<=", + arithmetic.TensorGreaterThan: ">", + arithmetic.TensorGreaterEqual: ">=", + arithmetic.TensorAnd: "and", + arithmetic.TensorOr: "or", +} + +NE_TREE_OP_TO_STRING = { + arithmetic.TensorTreeAdd: "+", + arithmetic.TensorTreeMultiply: "*", +} + +NE_REDUCTION_TO_STRING = { + reduction.TensorSum: "sum", + reduction.TensorProd: "prod", + reduction.TensorMax: "max", + reduction.TensorMin: "min", +} + + +class _Default(dict): + def __missing__(self, key): + return f"{{{key}}}" + + +def _handle_unary(chunk): + if len(chunk.inputs) != 1: + raise ValueError("unary operand inputs should be 1") + data = chunk.inputs[0] + unary_op = NE_UNARYOP_TO_STRING[type(chunk.op)] + return f"{unary_op}({{{data.key}}})" + + +def _decompose(chunk): + expr = f"{{{chunk.key}}}" + for node in reversed(chunk.composed): + _expr = _evaluate(node) + expr = expr.format_map(_Default([(node.key, f"({_expr})")])) + return expr + + +def _handle_bin(chunk): + op = chunk.op + lhs = str(op.lhs) if np.isscalar(op.lhs) else f"{{{op.lhs.key}}}" + rhs = str(op.rhs) if np.isscalar(op.rhs) else f"{{{op.rhs.key}}}" + reverse = getattr(op, "reverse", False) + op = NE_BINOP_TO_STRING[type(op)] + if reverse: + exprs = [rhs, lhs] + else: + exprs = [lhs, rhs] + return op.join(exprs) + + +def _handle_tree(chunk): + op = NE_TREE_OP_TO_STRING[type(chunk.op)] + return op.join(f"{{{c.key}}}" for c in chunk.inputs) + + +def _wrap_bool(data): + if data.dtype == np.bool_: + return f"where({{{data.key}}}, 1, 0)" + + return f"{{{data.key}}}" + + +def _handle_reduction(chunk): + ax = chunk.op.axis + data = chunk.inputs[0] + op_str = NE_REDUCTION_TO_STRING[type(chunk.op)] + # TODO(hks): 
delete it if numexpr.sum fixed + if len(ax) == data.ndim: + return f"{op_str}({_wrap_bool(data)})" + elif len(ax) == 1: + return f"{op_str}({_wrap_bool(data)},axis={ax[0]})" + else: + raise ValueError("numexpr cannot encode axis") + + +def _evaluate(chunk): + op_type = type(chunk.op) + if op_type in NE_UNARYOP_TO_STRING: + return _handle_unary(chunk) + elif op_type in NE_BINOP_TO_STRING: + return _handle_bin(chunk) + elif op_type in NE_TREE_OP_TO_STRING: + return _handle_tree(chunk) + elif op_type in NE_REDUCTION_TO_STRING: + return _handle_reduction(chunk) + elif op_type is TensorNeFuseChunk: + return _decompose(chunk) + else: + raise TypeError(f"unsupported operator in numexpr: {op_type.__name__}") + + +def _maybe_keepdims(chunk, res): + out_chunk = chunk.composed[-1] if type(chunk.op) == TensorNeFuseChunk else chunk + if type(out_chunk.op) in NE_REDUCTION_TO_STRING and out_chunk.op.keepdims: + res = np.reshape(res, out_chunk.shape) + return res diff --git a/python/xorbits/_mars/tensor/fuse/tests/__init__.py b/python/xorbits/_mars/tensor/fuse/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/fuse/tests/test_numexpr_execution.py b/python/xorbits/_mars/tensor/fuse/tests/test_numexpr_execution.py new file mode 100644 index 000000000..1d11bacc8 --- /dev/null +++ b/python/xorbits/_mars/tensor/fuse/tests/test_numexpr_execution.py @@ -0,0 +1,206 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
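+
+# Tests for the numexpr-fused execution path: results of fused tensor
+# expressions are validated against plain NumPy and against runs of the same
+# expression with ``fuse_enabled=False``, which bypasses fusion.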
+ +import numpy as np + +from ....utils import ignore_warning +from ...arithmetic import abs as mt_abs +from ...datasource import arange, tensor +from ...reduction import sum as mt_sum + + +def test_base_execution(setup): + rs = np.random.RandomState(0) + raw1 = rs.randint(10, size=(10, 10, 10)) + raw2 = rs.randint(10, size=(10, 10, 10)) + arr1 = tensor(raw1, chunk_size=5) + arr2 = tensor(raw2, chunk_size=5) + + arr3 = arr1 + arr2 + 10 + arr4 = 10 + arr1 + arr2 + res3 = arr3.execute().fetch() + res3_cmp = arr4.execute().fetch() + np.testing.assert_array_equal(res3, res3_cmp) + + a = arange(10) + b = arange(10) * 0.1 + raw_a = np.arange(10) + raw_b = np.arange(10) * 0.1 + c = a * b - 4.1 * a > 2.5 * b + res4_cmp = raw_a * raw_b - 4.1 * raw_a > 2.5 * raw_b + res4 = c.execute().fetch() + np.testing.assert_array_equal(res4, res4_cmp) + + c = mt_sum(1) * (-1) + r = c.execute().fetch() + assert r == -1 + + c = -mt_abs(mt_sum(mt_abs(-1))) + r = c.execute().fetch() + assert r == -1 + + +def _gen_pairs(seq): + test_seq = np.random.RandomState(0).permutation(seq) + for i in range(0, len(seq), 2): + j = (i + 1) % len(seq) + yield test_seq[i], test_seq[j] + + +@ignore_warning +def test_unary_execution(setup): + from ...arithmetic import UNARY_UFUNC, arccosh, conj, invert, logical_not, sin + + _sp_unary_ufunc = {arccosh, invert, conj, logical_not} + _new_unary_ufunc = list(UNARY_UFUNC - _sp_unary_ufunc)[:3] + + def _normalize_by_sin(func1, func2, arr): + return func1(abs(sin((func2(arr))))) + + tested = set() + rs = np.random.RandomState(0) + for func1, func2 in _gen_pairs(_new_unary_ufunc): + raw = rs.random((8, 8, 8)) + arr1 = tensor(raw, chunk_size=4) + + arr2 = _normalize_by_sin(func1, func2, arr1) + res = arr2.execute() + res_cmp = arr2.execute(fuse_enabled=False) + np.testing.assert_allclose(res[0], res_cmp[0]) + tested.update([func1, func2]) + # make sure all functions tested + assert tested == set(_new_unary_ufunc) + + raw = rs.randint(100, size=(8, 8, 8)) + arr1 = tensor(raw, chunk_size=4) + arr2 = arccosh(1 + abs(invert(arr1))) + res = arr2.execute(fuse_enabled=False).fetch() + res_cmp = arccosh(1 + abs(~raw)) + np.testing.assert_array_almost_equal(res[0], res_cmp[0]) + + +@ignore_warning +def test_bin_execution(setup): + from ...arithmetic import ( + BIN_UFUNC, + bitand, + bitor, + bitxor, + fmod, + ldexp, + logical_and, + logical_or, + lshift, + mod, + rshift, + ) + + _sp_bin_ufunc = [ + mod, + fmod, + bitand, + bitor, + bitxor, + lshift, + rshift, + logical_and, + logical_or, + ] + _new_bin_ufunc = list(BIN_UFUNC - set(_sp_bin_ufunc) - {ldexp}) + + tested = set() + rs = np.random.RandomState(0) + for func1, func2 in _gen_pairs(_new_bin_ufunc): + raw = rs.random((9, 9, 9)) + arr1 = tensor(raw, chunk_size=5) + + arr2 = func1(1, func2(2, arr1)) + res = arr2.execute().fetch() + res_cmp = arr2.execute(fuse_enabled=False).fetch() + np.testing.assert_array_almost_equal(res, res_cmp) + tested.update([func1, func2]) + # make sure all functions tested + assert tested == set(_new_bin_ufunc) + + tested = set() + for func1, func2 in _gen_pairs(_sp_bin_ufunc): + raw = rs.randint(1, 100, size=(10, 10, 10)) + arr1 = tensor(raw, chunk_size=6) + + arr2 = func1(10, func2(arr1, 5)) + res = arr2.execute().fetch() + res_cmp = arr2.execute(fuse_enabled=False).fetch() + np.testing.assert_array_almost_equal(res, res_cmp) + tested.update([func1, func2]) + # make sure all functions tested + assert tested == set(_sp_bin_ufunc) + + +def test_reduction_execution(setup): + rs = np.random.RandomState(0) + raw1 = 
rs.randint(5, size=(8, 8, 8)) + raw2 = rs.randint(5, size=(8, 8, 8)) + arr1 = tensor(raw1, chunk_size=4) + arr2 = tensor(raw2, chunk_size=4) + + res1 = (arr1 + 1).sum(keepdims=True).execute().fetch() + res2 = (arr1 + 1).prod(keepdims=True).execute().fetch() + np.testing.assert_array_equal((raw1 + 1).sum(keepdims=True), res1) + np.testing.assert_array_equal((raw1 + 1).prod(keepdims=True), res2) + + res1 = (arr1 + 1).sum(axis=1).execute().fetch() + res2 = (arr1 + 1).prod(axis=1).execute().fetch() + res3 = (arr1 + 1).max(axis=1).execute().fetch() + res4 = (arr1 + 1).min(axis=1).execute().fetch() + np.testing.assert_array_equal((raw1 + 1).sum(axis=1), res1) + np.testing.assert_array_equal((raw1 + 1).prod(axis=1), res2) + np.testing.assert_array_equal((raw1 + 1).max(axis=1), res3) + np.testing.assert_array_equal((raw1 + 1).min(axis=1), res4) + + raw3 = raw2 - raw1 + 10 + arr3 = -arr1 + arr2 + 10 + + res1 = arr3.sum(axis=(0, 1)).execute().fetch() + res2 = arr3.prod(axis=(0, 1)).execute().fetch() + res3 = arr3.max(axis=(0, 1)).execute().fetch() + res4 = arr3.min(axis=(0, 1)).execute().fetch() + np.testing.assert_array_equal(raw3.sum(axis=(0, 1)), res1) + np.testing.assert_array_equal(raw3.prod(axis=(0, 1)), res2) + np.testing.assert_array_equal(raw3.max(axis=(0, 1)), res3) + np.testing.assert_array_equal(raw3.min(axis=(0, 1)), res4) + + +def test_bool_reduction_execution(setup): + rs = np.random.RandomState(0) + raw = rs.randint(5, size=(8, 8, 8)) + arr = tensor(raw, chunk_size=4) + + res = (arr > 3).sum(axis=1).execute().fetch() + np.testing.assert_array_equal(res, (raw > 3).sum(axis=1)) + + res = (arr > 3).sum().execute().fetch() + np.testing.assert_array_equal(res, (raw > 3).sum()) + + +def test_order_execution(setup): + rs = np.random.RandomState(0) + raw = np.asfortranarray(rs.rand(4, 5, 6)) + arr = tensor(raw, chunk_size=3) + + res = (arr * 3 + 1).execute().fetch() + expected = raw * 3 + 1 + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] diff --git a/python/xorbits/_mars/tensor/images/__init__.py b/python/xorbits/_mars/tensor/images/__init__.py new file mode 100644 index 000000000..79fb234f5 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .imread import imread diff --git a/python/xorbits/_mars/tensor/images/imread.py b/python/xorbits/_mars/tensor/images/imread.py new file mode 100644 index 000000000..41ed282f0 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/imread.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...lib.filesystem import file_size, glob, open_file +from ...serialization.serializables import AnyField +from ...utils import ModulePlaceholder, ceildiv +from ..operands import TensorOperand, TensorOperandMixin + +try: + from PIL import Image +except ImportError: + Image = ModulePlaceholder("PIL") + + +def _read_image(fpath): + return np.asarray(Image.open(fpath)) + + +class TensorImread(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.IMREAD + + _filepath = AnyField("filepath") + + def __init__(self, filepath=None, **kwargs): + super().__init__(_filepath=filepath, **kwargs) + + @property + def filepath(self): + return self._filepath + + @classmethod + def tile(cls, op): + out_shape = op.outputs[0].shape + paths = ( + op.filepath if isinstance(op.filepath, (tuple, list)) else glob(op.filepath) + ) + chunk_size = op.outputs[0].extra_params.raw_chunk_size + n_chunks = ceildiv(len(paths), chunk_size) + if len(paths) > 1: + chunks = [] + splits = [] + for i in range(n_chunks): + chunk_op = op.copy().reset_key() + chunk_op._filepath = paths[i * chunk_size : (i + 1) * chunk_size] + file_nums = len(chunk_op._filepath) + shape = (file_nums,) + out_shape[1:] + chunk = chunk_op.new_chunk( + None, shape=shape, index=(i,) + (0,) * (len(out_shape) - 1) + ) + chunks.append(chunk) + splits.append(file_nums) + nsplits = (tuple(splits),) + tuple((s,) for s in out_shape[1:]) + else: + chunk_op = op.copy().reset_key() + chunks = [ + chunk_op.new_chunk(None, shape=out_shape, index=(0,) * len(out_shape)) + ] + nsplits = tuple((s,) for s in out_shape) + new_op = op.copy() + return new_op.new_tensors(None, shape=out_shape, chunks=chunks, nsplits=nsplits) + + @classmethod + def execute(cls, ctx, op): + if isinstance(op.filepath, list): + arrays = np.empty(op.outputs[0].shape) + for i, path in enumerate(op.filepath): + with open_file(path, "rb") as f: + arrays[i] = _read_image(f) + ctx[op.outputs[0].key] = np.array(arrays) + else: + with open_file(op.filepath, "rb") as f: + ctx[op.outputs[0].key] = np.array(_read_image(f)) + + def __call__(self, shape, chunk_size): + return self.new_tensor(None, shape, raw_chunk_size=chunk_size) + + +def imread(path, chunk_size=None): + paths = path if isinstance(path, (tuple, list)) else glob(path) + with open_file(paths[0], "rb") as f: + sample_data = _read_image(f) + img_shape = sample_data.shape + img_size = file_size(paths[0]) + if len(paths) > 1: + shape = (len(paths),) + img_shape + else: + shape = img_shape + if chunk_size is None: + chunk_size = int(options.chunk_store_limit / img_size) + op = TensorImread(filepath=path, chunk_size=chunk_size, dtype=sample_data.dtype) + return op(shape=shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/images/tests/__init__.py b/python/xorbits/_mars/tensor/images/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/images/tests/test_images.py b/python/xorbits/_mars/tensor/images/tests/test_images.py new file mode 100644 index 000000000..97994c069 --- /dev/null +++ b/python/xorbits/_mars/tensor/images/tests/test_images.py @@ -0,0 +1,65 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pytest + +try: + from PIL import Image +except ImportError: + Image = None + +from ....core import tile +from ...images import imread + + +@pytest.mark.skipif(not Image, reason="Pillow not installed") +def test_imread(): + with tempfile.TemporaryDirectory() as tempdir: + raws = [] + for i in range(10): + array = np.random.randint(0, 256, 2500 * 3, dtype=np.uint8).reshape( + (50, 50, 3) + ) + raws.append(array) + im = Image.fromarray(array) + im.save(os.path.join(tempdir, f"random_{i}.png")) + + t = imread(os.path.join(tempdir, "random_0.png")) + assert t.shape == (50, 50, 3) + assert t.dtype == np.dtype("uint8") + + tiled = tile(t) + assert len(tiled.chunks) == 1 + assert tiled.chunks[0].shape == (50, 50, 3) + assert tiled.chunks[0].dtype == np.dtype("uint8") + + t = imread(os.path.join(tempdir, "random_*.png"), chunk_size=3) + assert t.shape == (10, 50, 50, 3) + + tiled = tile(t) + assert len(tiled.chunks) == 4 + assert tiled.nsplits == ((3, 3, 3, 1), (50,), (50,), (3,)) + assert tiled.chunks[0].dtype == np.dtype("uint8") + assert tiled.chunks[0].index == (0, 0, 0, 0) + assert tiled.chunks[0].shape == (3, 50, 50, 3) + assert tiled.chunks[1].index == (1, 0, 0, 0) + assert tiled.chunks[1].shape == (3, 50, 50, 3) + assert tiled.chunks[2].index == (2, 0, 0, 0) + assert tiled.chunks[2].shape == (3, 50, 50, 3) + assert tiled.chunks[3].index == (3, 0, 0, 0) + assert tiled.chunks[3].shape == (1, 50, 50, 3) diff --git a/python/xorbits/_mars/tensor/images/tests/test_images_execution.py b/python/xorbits/_mars/tensor/images/tests/test_images_execution.py new file mode 100644 index 000000000..35b4816ba --- /dev/null +++ b/python/xorbits/_mars/tensor/images/tests/test_images_execution.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import numpy as np +import pytest + +try: + from PIL import Image +except ImportError: + Image = None + +from ...images import imread + + +@pytest.mark.skipif(not Image, reason="Pillow not installed") +def test_imread_execution(setup): + with tempfile.TemporaryDirectory() as tempdir: + raws = [] + for i in range(10): + array = np.random.randint(0, 256, 2500, dtype=np.uint8).reshape((50, 50)) + raws.append(array) + im = Image.fromarray(array) + im.save(os.path.join(tempdir, f"random_{i}.png")) + # Single image + t = imread(os.path.join(tempdir, "random_0.png")) + res = t.execute().fetch() + np.testing.assert_array_equal(res, raws[0]) + + t2 = imread(os.path.join(tempdir, "random_*.png")) + res = t2.execute().fetch() + np.testing.assert_array_equal(np.sort(res, axis=0), np.sort(raws, axis=0)) + + t3 = imread(os.path.join(tempdir, "random_*.png"), chunk_size=4) + res = t3.execute().fetch() + np.testing.assert_array_equal(np.sort(res, axis=0), np.sort(raws, axis=0)) + + t4 = imread(os.path.join(tempdir, "random_*.png"), chunk_size=4) + res = t4.execute().fetch() + np.testing.assert_array_equal(np.sort(res, axis=0), np.sort(raws, axis=0)) diff --git a/python/xorbits/_mars/tensor/indexing/__init__.py b/python/xorbits/_mars/tensor/indexing/__init__.py new file mode 100644 index 000000000..d1102bac3 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/__init__.py @@ -0,0 +1,47 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
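+
+# Besides re-exporting the indexing operands, the ``_install`` hook below
+# attaches ``__getitem__``/``__setitem__`` and the ``take``, ``compress``,
+# ``choose`` and ``nonzero`` methods to ``Tensor`` at import time, enabling
+# NumPy-style indexing syntax on tensors.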
+ +from .choose import TensorChoose, choose +from .compress import compress +from .extract import extract +from .fill_diagonal import TensorFillDiagonal, fill_diagonal +from .flatnonzero import flatnonzero +from .getitem import FancyIndexingConcat, FancyIndexingDistribute, TensorIndex +from .nonzero import TensorNonzero, nonzero +from .setitem import TensorIndexSetValue +from .slice import TensorSlice +from .take import take +from .unravel_index import TensorUnravelIndex, unravel_index + + +def _install(): + from ..core import Tensor, TensorData + from .getitem import _getitem + from .setitem import _setitem + + setattr(Tensor, "__getitem__", _getitem) + setattr(TensorData, "__getitem__", _getitem) + setattr(Tensor, "__setitem__", _setitem) + setattr(Tensor, "take", take) + setattr( + Tensor, + "compress", + lambda a, condition, axis=None: compress(condition, a, axis=axis), + ) + setattr(Tensor, "choose", choose) + setattr(Tensor, "nonzero", nonzero) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/indexing/choose.py b/python/xorbits/_mars/tensor/indexing/choose.py new file mode 100644 index 000000000..5209bb95c --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/choose.py @@ -0,0 +1,226 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, ListField, StringField +from ..array_utils import as_same_device, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, check_out_param + + +class TensorChoose(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CHOOSE + + _a = KeyField("a") + _choices = ListField("choices", FieldTypes.key) + _mode = StringField("mode") + + def __init__(self, mode=None, **kw): + super().__init__(_mode=mode, **kw) + + def __setattr__(self, key, value): + if key == "_mode" and value not in ("raise", "wrap", "clip"): + raise ValueError(f"mode should be raise, wrap or clip, not {value}") + + super().__setattr__(key, value) + + @property + def a(self): + return self._a + + @property + def choices(self): + return self._choices + + @property + def mode(self): + return self._mode + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + self._choices = self._inputs[1:] + + def __call__(self, a, choices, out=None): + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + inputs = [a] + choices + shape = broadcast_shape(a.shape, *[c.shape for c in choices]) + order = TensorOrder.C_ORDER if out is None else out.order + t = self.new_tensor(inputs, shape, order=order) + + if out is None: + return t + + check_out_param(out, t, "unsafe") + out_shape, out_dtype = out.shape, out.dtype + # if `out` is specified, use out's dtype and shape + if out_shape != t.shape: + raise ValueError(f"output shape should be {t.shape}, got {out_shape}") + setattr(self, "dtype", out_dtype) + out.data = t.data + return out + + @classmethod + def tile(cls, op): + from ..arithmetic.core import TensorElementWise + + return (yield from TensorElementWise.tile(op)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + a, choices = inputs[0], inputs[1:] + + out = op.outputs[0] + with device(device_id): + ctx[out.key] = xp.choose(a, choices, mode=op.mode).astype( + op.dtype, order=out.order.value, copy=False + ) + + +def choose(a, choices, out=None, mode="raise"): + """ + Construct a tensor from an index tensor and a set of tensors to choose from. + + First of all, if confused or uncertain, definitely look at the Examples - + in its full generality, this function is less simple than it might + seem from the following code description (below ndi = + `mt.lib.index_tricks`): + + ``mt.choose(a,c) == mt.array([c[a[I]][I] for I in ndi.ndindex(a.shape)])``. + + But this omits some subtleties. Here is a fully general summary: + + Given an "index" tensor (`a`) of integers and a sequence of `n` tensors + (`choices`), `a` and each choice tensor are first broadcast, as necessary, + to tensors of a common shape; calling these *Ba* and *Bchoices[i], i = + 0,...,n-1* we have that, necessarily, ``Ba.shape == Bchoices[i].shape`` + for each `i`. 
Then, a new array with shape ``Ba.shape`` is created as + follows: + + * if ``mode=raise`` (the default), then, first of all, each element of + `a` (and thus `Ba`) must be in the range `[0, n-1]`; now, suppose that + `i` (in that range) is the value at the `(j0, j1, ..., jm)` position + in `Ba` - then the value at the same position in the new array is the + value in `Bchoices[i]` at that same position; + + * if ``mode=wrap``, values in `a` (and thus `Ba`) may be any (signed) + integer; modular arithmetic is used to map integers outside the range + `[0, n-1]` back into that range; and then the new array is constructed + as above; + + * if ``mode=clip``, values in `a` (and thus `Ba`) may be any (signed) + integer; negative integers are mapped to 0; values greater than `n-1` + are mapped to `n-1`; and then the new tensor is constructed as above. + + Parameters + ---------- + a : int tensor + This tensor must contain integers in `[0, n-1]`, where `n` is the number + of choices, unless ``mode=wrap`` or ``mode=clip``, in which cases any + integers are permissible. + choices : sequence of tensors + Choice tensors. `a` and all of the choices must be broadcastable to the + same shape. If `choices` is itself a tensor (not recommended), then + its outermost dimension (i.e., the one corresponding to + ``choices.shape[0]``) is taken as defining the "sequence". + out : tensor, optional + If provided, the result will be inserted into this tensor. It should + be of the appropriate shape and dtype. + mode : {'raise' (default), 'wrap', 'clip'}, optional + Specifies how indices outside `[0, n-1]` will be treated: + + * 'raise' : an exception is raised + * 'wrap' : value becomes value mod `n` + * 'clip' : values < 0 are mapped to 0, values > n-1 are mapped to n-1 + + Returns + ------- + merged_array : Tensor + The merged result. + + Raises + ------ + ValueError: shape mismatch + If `a` and each choice tensor are not all broadcastable to the same + shape. + + See Also + -------- + Tensor.choose : equivalent method + + Notes + ----- + To reduce the chance of misinterpretation, even though the following + "abuse" is nominally supported, `choices` should neither be, nor be + thought of as, a single tensor, i.e., the outermost sequence-like container + should be either a list or a tuple. + + Examples + -------- + + >>> import mars.tensor as mt + + >>> choices = [[0, 1, 2, 3], [10, 11, 12, 13], + ... [20, 21, 22, 23], [30, 31, 32, 33]] + >>> mt.choose([2, 3, 1, 0], choices + ... # the first element of the result will be the first element of the + ... # third (2+1) "array" in choices, namely, 20; the second element + ... # will be the second element of the fourth (3+1) choice array, i.e., + ... # 31, etc. + ... 
).execute() + array([20, 31, 12, 3]) + >>> mt.choose([2, 4, 1, 0], choices, mode='clip').execute() # 4 goes to 3 (4-1) + array([20, 31, 12, 3]) + >>> # because there are 4 choice arrays + >>> mt.choose([2, 4, 1, 0], choices, mode='wrap').execute() # 4 goes to (4 mod 4) + array([20, 1, 12, 3]) + >>> # i.e., 0 + + A couple examples illustrating how choose broadcasts: + + >>> a = [[1, 0, 1], [0, 1, 0], [1, 0, 1]] + >>> choices = [-10, 10] + >>> mt.choose(a, choices).execute() + array([[ 10, -10, 10], + [-10, 10, -10], + [ 10, -10, 10]]) + + >>> # With thanks to Anne Archibald + >>> a = mt.array([0, 1]).reshape((2,1,1)) + >>> c1 = mt.array([1, 2, 3]).reshape((1,3,1)) + >>> c2 = mt.array([-1, -2, -3, -4, -5]).reshape((1,1,5)) + >>> mt.choose(a, (c1, c2)).execute() # result is 2x3x5, res[0,:,:]=c1, res[1,:,:]=c2 + array([[[ 1, 1, 1, 1, 1], + [ 2, 2, 2, 2, 2], + [ 3, 3, 3, 3, 3]], + [[-1, -2, -3, -4, -5], + [-1, -2, -3, -4, -5], + [-1, -2, -3, -4, -5]]]) + + """ + a = astensor(a, dtype="i8") + choices = [astensor(c) for c in choices] + + dtype = np.result_type(*[c.dtype for c in choices]) + op = TensorChoose(mode=mode, dtype=dtype) + return op(a, choices, out=out) diff --git a/python/xorbits/_mars/tensor/indexing/compress.py b/python/xorbits/_mars/tensor/indexing/compress.py new file mode 100644 index 000000000..f0168f93a --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/compress.py @@ -0,0 +1,122 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..core import Tensor +from ..datasource import tensor as astensor +from ..utils import validate_axis + + +def compress(condition, a, axis=None, out=None): + """ + Return selected slices of a tensor along given axis. + + When working along a given axis, a slice along that axis is returned in + `output` for each index where `condition` evaluates to True. When + working on a 1-D array, `compress` is equivalent to `extract`. + + Parameters + ---------- + condition : 1-D tensor of bools + Tensor that selects which entries to return. If len(condition) + is less than the size of `a` along the given axis, then output is + truncated to the length of the condition tensor. + a : array_like + Tensor from which to extract a part. + axis : int, optional + Axis along which to take slices. If None (default), work on the + flattened tensor. + out : Tensor, optional + Output tensor. Its type is preserved and it must be of the right + shape to hold the output. + + Returns + ------- + compressed_array : Tensor + A copy of `a` without the slices along axis for which `condition` + is false. 
+ + See Also + -------- + take, choose, diag, diagonal, select + Tensor.compress : Equivalent method in ndarray + mt.extract: Equivalent method when working on 1-D arrays + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4], [5, 6]]) + >>> a.execute() + array([[1, 2], + [3, 4], + [5, 6]]) + >>> mt.compress([0, 1], a, axis=0).execute() + array([[3, 4]]) + >>> mt.compress([False, True, True], a, axis=0).execute() + array([[3, 4], + [5, 6]]) + >>> mt.compress([False, True], a, axis=1).execute() + array([[2], + [4], + [6]]) + + Working on the flattened tensor does not return slices along an axis but + selects elements. + + >>> mt.compress([False, True], a).execute() + array([2]) + + """ + a = astensor(a) + condition = astensor(condition, dtype=bool) + + if condition.ndim != 1: + raise ValueError("condition must be an 1-d tensor") + + if axis is None: + a = a.ravel() + if len(condition) < a.size: + a = a[: len(condition)] + return a[condition] + + try: + axis = validate_axis(a.ndim, axis) + except ValueError: + raise np.AxisError( + f"axis {axis} is out of bounds for tensor of dimension {a.ndim}" + ) + + try: + if len(condition) < a.shape[axis]: + a = a[(slice(None),) * axis + (slice(len(condition)),)] + t = a[(slice(None),) * axis + (condition,)] + if out is None: + return t + + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + if not np.can_cast(out.dtype, t.dtype, "safe"): + raise TypeError( + f"Cannot cast array data from dtype('{out.dtype}') to dtype('{t.dtype}') " + "according to the rule 'safe'" + ) + # skip shape check because out shape is unknown + out.data = t.astype(out.dtype, order=out.order.value).data + return out + except IndexError: + raise np.AxisError( + f"axis {len(condition)} is out of bounds for tensor of dimension 1" + ) diff --git a/python/xorbits/_mars/tensor/indexing/core.py b/python/xorbits/_mars/tensor/indexing/core.py new file mode 100644 index 000000000..13d1136e4 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/core.py @@ -0,0 +1,193 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
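+
+# Shared helpers for tensor indexing: ``preprocess_index`` validates raw index
+# objects and converts list/ndarray indexes where needed, ``process_index``
+# pads the item so that every axis gets an explicit entry, and ``calc_shape``
+# computes the shape of the indexed result.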
+ + +import itertools +from numbers import Integral + +import numpy as np + +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..datasource import tensor as astensor +from ..utils import broadcast_shape, calc_sliced_size, index_ndim, replace_ellipsis + +_INDEX_ERROR_MSG = ( + "only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean arrays are valid indices" +) + + +def calc_shape(tensor_shape, index): + shape = [] + in_axis = 0 + out_axis = 0 + fancy_index = None + fancy_index_shapes = [] + for ind in index: + if ( + isinstance(ind, TENSOR_TYPE + TENSOR_CHUNK_TYPE + (np.ndarray,)) + and ind.dtype == np.bool_ + ): + # bool + shape.append(np.nan if not isinstance(ind, np.ndarray) else int(ind.sum())) + for i, t_size, size in zip( + itertools.count(0), + ind.shape, + tensor_shape[in_axis : ind.ndim + in_axis], + ): + if not np.isnan(t_size) and not np.isnan(size) and t_size != size: + raise IndexError( + f"boolean index did not match indexed array along dimension {in_axis + i}; " + f"dimension is {size} but corresponding boolean dimension is {t_size}" + ) + in_axis += ind.ndim + out_axis += 1 + elif isinstance(ind, TENSOR_TYPE + TENSOR_CHUNK_TYPE + (np.ndarray,)): + first_fancy_index = False + if fancy_index is None: + first_fancy_index = True + fancy_index = out_axis + if isinstance(ind, np.ndarray) and np.any(ind >= tensor_shape[in_axis]): + out_of_range_index = next( + i for i in ind.flat if i >= tensor_shape[in_axis] + ) + raise IndexError( + f"IndexError: index {out_of_range_index} is out of " + f"bounds with size {tensor_shape[in_axis]}" + ) + fancy_index_shapes.append(ind.shape) + in_axis += 1 + if first_fancy_index: + out_axis += ind.ndim + elif isinstance(ind, slice): + if np.isnan(tensor_shape[in_axis]): + shape.append(np.nan) + else: + shape.append(calc_sliced_size(tensor_shape[in_axis], ind)) + in_axis += 1 + out_axis += 1 + elif isinstance(ind, Integral): + size = tensor_shape[in_axis] + if not np.isnan(size) and ind >= size: + raise IndexError( + f"index {ind} is out of bounds for axis {in_axis} with size {size}" + ) + in_axis += 1 + else: + assert ind is None + shape.append(1) + + if fancy_index is not None: + try: + if any(np.isnan(np.prod(s)) for s in fancy_index_shapes): + fancy_index_shape = (np.nan,) * len(fancy_index_shapes[0]) + else: + fancy_index_shape = broadcast_shape(*fancy_index_shapes) + shape = shape[:fancy_index] + list(fancy_index_shape) + shape[fancy_index:] + except ValueError: + raise IndexError( + "shape mismatch: indexing arrays could not be broadcast together " + "with shapes {0}".format(" ".join(str(s) for s in fancy_index_shapes)) + ) + + return shape + + +def preprocess_index(index, convert_bool_to_fancy=None): + from .nonzero import nonzero + + inds = [] + fancy_indexes = [] + bool_indexes = [] + all_fancy_index_ndarray = True + all_bool_index_ndarray = True + for j, ind in enumerate(index): + if isinstance(ind, (list, np.ndarray) + TENSOR_TYPE): + if not isinstance(ind, TENSOR_TYPE): + ind = np.array(ind) + if ind.dtype.kind not in "biu": + raise IndexError(_INDEX_ERROR_MSG) + if ind.dtype.kind == "b": + # bool indexing + bool_indexes.append(j) + if not isinstance(ind, np.ndarray): + all_bool_index_ndarray = False + else: + # fancy indexing + fancy_indexes.append(j) + if not isinstance(ind, np.ndarray): + all_fancy_index_ndarray = False + elif ( + not isinstance(ind, (slice, Integral)) + and ind is not None + and ind is not Ellipsis + ): + raise IndexError(_INDEX_ERROR_MSG) + inds.append(ind) + + if 
convert_bool_to_fancy is None:
+        convert_bool_to_fancy = (fancy_indexes and len(bool_indexes) > 0) or len(
+            bool_indexes
+        ) > 1
+
+    if not all_fancy_index_ndarray or (
+        convert_bool_to_fancy and not all_bool_index_ndarray
+    ):
+        # if not all fancy indexes are ndarrays,
+        # or bool indexes need to be converted to fancy indexes
+        # and not all bool indexes are ndarrays,
+        # convert all fancy indexes to tensors
+        for fancy_index in fancy_indexes:
+            inds[fancy_index] = astensor(inds[fancy_index])
+
+    # convert bool indexes to fancy indexes when either of the situations
+    # below is met:
+    # 1. fancy indexes and bool indexes both exist
+    # 2. there is more than one bool index
+    if convert_bool_to_fancy:
+        default_m = None
+        if len(fancy_indexes) > 0:
+            default_m = (
+                np.nonzero
+                if isinstance(inds[fancy_indexes[0]], np.ndarray)
+                else nonzero
+            )
+        for bool_index in bool_indexes:
+            ind = inds[bool_index]
+            m = default_m
+            if m is None:
+                m = np.nonzero if isinstance(ind, np.ndarray) else nonzero
+            ind = m(ind)[0]
+            inds[bool_index] = ind
+
+    return tuple(inds)
+
+
+def process_index(tensor_ndim, item, convert_bool_to_fancy=None):
+    if isinstance(item, list):
+        arr = np.array(item)
+        if arr.dtype == object:
+            item = tuple(item)
+        elif arr.dtype.kind == "f":
+            raise IndexError(_INDEX_ERROR_MSG)
+        else:
+            item = (arr,)
+    elif not isinstance(item, tuple):
+        item = (item,)
+
+    index = preprocess_index(item, convert_bool_to_fancy=convert_bool_to_fancy)
+    index = replace_ellipsis(index, tensor_ndim)
+    missing = tensor_ndim - sum(index_ndim(i) for i in index)
+    if missing < 0:
+        raise IndexError("too many indices for tensor")
+    return index + (slice(None),) * missing
diff --git a/python/xorbits/_mars/tensor/indexing/extract.py b/python/xorbits/_mars/tensor/indexing/extract.py
new file mode 100644
index 000000000..243d6cdb0
--- /dev/null
+++ b/python/xorbits/_mars/tensor/indexing/extract.py
@@ -0,0 +1,69 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..datasource import tensor as astensor
+
+
+def extract(condition, a):
+    """
+    Return the elements of a tensor that satisfy some condition.
+
+    This is equivalent to ``mt.compress(ravel(condition), ravel(arr))``. If
+    `condition` is boolean ``mt.extract`` is equivalent to ``arr[condition]``.
+
+    Note that `place` does the exact opposite of `extract`.
+
+    Parameters
+    ----------
+    condition : array_like
+        An array whose nonzero or True entries indicate the elements of `arr`
+        to extract.
+    a : array_like
+        Input tensor of the same size as `condition`.
+
+    Returns
+    -------
+    extract : Tensor
+        Rank 1 tensor of values from `arr` where `condition` is True.
+ + See Also + -------- + take, put, copyto, compress, place + + Examples + -------- + >>> import mars.tensor as mt + + >>> arr = mt.arange(12).reshape((3, 4)) + >>> arr.execute() + array([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]]) + >>> condition = mt.mod(arr, 3)==0 + >>> condition.execute() + array([[ True, False, False, True], + [False, False, True, False], + [False, True, False, False]]) + >>> mt.extract(condition, arr).execute() + array([0, 3, 6, 9]) + + + If `condition` is boolean: + + >>> arr[condition].execute() + array([0, 3, 6, 9]) + + """ + condition = astensor(condition, dtype=bool) + return a[condition] diff --git a/python/xorbits/_mars/tensor/indexing/fill_diagonal.py b/python/xorbits/_mars/tensor/indexing/fill_diagonal.py new file mode 100644 index 000000000..5337e2dd1 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/fill_diagonal.py @@ -0,0 +1,465 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...serialization.serializables import AnyField, BoolField, Int32Field, KeyField +from ...utils import ceildiv, has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_TYPE, Tensor +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_unify_split + + +class TensorFillDiagonal(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.FILL_DIAGONAL + + _input = KeyField("input") + _val = AnyField("val") + _wrap = BoolField("wrap") + # used for chunk + _k = Int32Field("k") + + def __init__(self, val=None, wrap=None, k=None, **kw): + super().__init__(_val=val, _wrap=wrap, _k=k, **kw) + + @property + def input(self): + return self._input + + @property + def val(self): + return self._val + + @property + def wrap(self): + return self._wrap + + @property + def k(self): + return self._k + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(self._inputs) == 2: + self._val = self._inputs[1] + + def __call__(self, a, val=None): + inputs = [a] + if val is not None: + inputs.append(val) + return self.new_tensor(inputs, shape=a.shape, order=a.order) + + @staticmethod + def _process_val(val, a, wrap): + """ + given the `val`, `a`, `wrap` which are the arguments in `fill_diagonal`, + do some preprocess on `val` includes: + + 1. calculate the length to fill on diagonal, 2-d and n-d(n > 2) + as well as that `wrap` is True and `a` is a tall matrix need to be considered. + 2. if val is a Tensor, rechunk it into one chunk. 
+ """ + + from ..base import tile + from ..datasource import diag + + is_val_tensor = isinstance(val, TENSOR_TYPE) + + if a.ndim == 2: + if wrap and TensorFillDiagonal._is_tall(a): + size = sum( + diag(sub).shape[0] + for sub in TensorFillDiagonal._split_tall_matrix(a) + ) + else: + size = diag(a).shape[0] + else: + # every dimension has same shape + size = a.shape[0] + + repeat_method = tile if is_val_tensor else np.tile + val_size = val.size + if val_size < size: + n = ceildiv(size, val_size) + val = repeat_method(val, n)[:size] + elif val_size > size: + val = val[:size] + + if is_val_tensor and val.ndim > 0: + val = yield from recursive_tile(val) + val = val.rechunk({0: val.size}) + + return (yield from recursive_tile(val)) if is_val_tensor else val + + @staticmethod + def _gen_val(val, diag_idx, cum_sizes): + """ + Given a tensor-level `val`, calculate the chunk-level `val`. + Consider both the cases that `val` could be a tensor or ndarray. + + :param val: tensor-level `val` + :diag_idx: chunk index on the diagonal direction + :cum_sizes: accumulative chunk sizes on the diagonal direction + """ + from .slice import TensorSlice + + if val.ndim == 0: + if isinstance(val, TENSOR_TYPE): + return val.chunks[0] + else: + return val + + if isinstance(val, TENSOR_TYPE): + start, stop = cum_sizes[diag_idx], cum_sizes[diag_idx + 1] + slc = slice(start, stop) + slc_op = TensorSlice(slices=[slc], dtype=val.dtype) + return slc_op.new_chunk( + [val.chunks[0]], + shape=(stop - start,), + order=val.order, + index=(diag_idx,), + ) + else: + return val[cum_sizes[diag_idx] : cum_sizes[diag_idx + 1]] + + @classmethod + def _tile_2d(cls, op, val): + from ..datasource import diag + + d = yield from recursive_tile(diag(op.input)) + index_to_diag_chunk = {c.inputs[0].index: c for c in d.chunks} + cum_sizes = [0] + np.cumsum(d.nsplits[0]).tolist() + + out_chunks = [] + for chunk in op.input.chunks: + if chunk.index not in index_to_diag_chunk: + out_chunks.append(chunk) + else: + diag_chunk = index_to_diag_chunk[chunk.index] + diag_idx = diag_chunk.index[0] + input_chunks = [chunk] + chunk_val = cls._gen_val(val, diag_idx, cum_sizes) + if len(op.inputs) == 2: + input_chunks.append(chunk_val) + chunk_op = op.copy().reset_key() + chunk_op._wrap = False + chunk_op._k = diag_chunk.op.k + chunk_op._val = chunk_val + out_chunk = chunk_op.new_chunk( + input_chunks, + shape=chunk.shape, + order=chunk.order, + index=chunk.index, + ) + out_chunks.append(out_chunk) + + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=op.input.nsplits, + ) + + @classmethod + def _tile_nd(cls, op, val): + # if more than 3d, we will rechunk the tensor into square chunk + # on the diagonal direction + in_tensor = op.input + nsplits = [tuple(np.array(split)) for split in in_tensor.nsplits] + if len(set(nsplits)) != 1: + # need rechunk + nsplit = decide_unify_split(*in_tensor.nsplits) + in_tensor = yield from recursive_tile( + in_tensor.rechunk(tuple(nsplit for _ in range(in_tensor.ndim))) + ) + cum_sizes = [0] + np.cumsum(in_tensor.nsplits[0]).tolist() + + out_chunks = [] + for chunk in in_tensor.chunks: + if len(set(chunk.index)) == 1: + # chunk on the diagonal direction + chunk_op = op.copy().reset_key() + chunk_op._k = 0 + chunk_inputs = [chunk] + chunk_val = cls._gen_val(val, chunk.index[0], cum_sizes) + if len(op.inputs) == 2: + chunk_inputs.append(chunk_val) + chunk_op._val = chunk_val + out_chunk = chunk_op.new_chunk( + chunk_inputs, + 
shape=chunk.shape, + order=chunk.order, + index=chunk.index, + ) + out_chunks.append(out_chunk) + else: + out_chunks.append(chunk) + + out = op.outputs[0] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def _tile_one_chunk(cls, op, val): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_inputs = [op.input.chunks[0]] + if isinstance(val, TENSOR_TYPE): + chunk_inputs.append(val.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, shape=out.shape, order=out.order, index=(0,) * out.ndim + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=((s,) for s in out.shape), + ) + + @staticmethod + def _is_tall(x): + return x.shape[0] > x.shape[1] + 1 + + @staticmethod + def _split_tall_matrix(a): + blocksize = a.shape[1] + 1 + n_block = ceildiv(a.shape[0], blocksize) + return [a[i * blocksize : (i + 1) * blocksize] for i in range(n_block)] + + @classmethod + def tile(cls, op): + # input tensor must have no unknown chunk shape + if has_unknown_shape(*op.inputs): + yield + + in_tensor = op.input + is_in_tensor_tall = cls._is_tall(in_tensor) + + if op.val.ndim > 0: + val = yield from cls._process_val(op.val, in_tensor, op.wrap) + else: + val = op.val + + if len(in_tensor.chunks) == 1: + return cls._tile_one_chunk(op, val) + + if op.input.ndim == 2: + if op.wrap and is_in_tensor_tall: + from ..merge import concatenate + + sub_tensors = cls._split_tall_matrix(in_tensor) + for i, sub_tensor in enumerate(sub_tensors): + if val.ndim > 0: + sub_val = val[ + i * sub_tensor.shape[1] : (i + 1) * sub_tensor.shape[1] + ] + else: + sub_val = val + fill_diagonal(sub_tensor, sub_val, wrap=False) + out_tensor = concatenate(sub_tensors) + return [(yield from recursive_tile(out_tensor))] + else: + return (yield from cls._tile_2d(op, val)) + else: + return (yield from cls._tile_nd(op, val)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + if len(inputs) == 2: + val = inputs[1] + else: + val = op.val + + with device(device_id): + if not op.k: + a = a.copy() + xp.fill_diagonal(a, val, wrap=op.wrap) + else: + assert a.ndim == 2 + k = op.k or 0 + n_rows, n_cols = a.shape + if k > 0: + n_cols -= k + elif k < 0: + n_rows += k + n = min(n_rows, n_cols) + + # generate indices + rows, cols = np.diag_indices(n) + if k > 0: + cols = cols.copy() + cols += k + elif k < 0: + rows = rows.copy() + rows -= k + + a = a.copy() + a[rows, cols] = val + + ctx[op.outputs[0].key] = a + + +def fill_diagonal(a, val, wrap=False): + """Fill the main diagonal of the given tensor of any dimensionality. + + For a tensor `a` with ``a.ndim >= 2``, the diagonal is the list of + locations with indices ``a[i, ..., i]`` all identical. This function + modifies the input tensor in-place, it does not return a value. + + Parameters + ---------- + a : Tensor, at least 2-D. + Tensor whose diagonal is to be filled, it gets modified in-place. + + val : scalar + Value to be written on the diagonal, its type must be compatible with + that of the tensor a. + + wrap : bool + For tall matrices in NumPy version up to 1.6.2, the + diagonal "wrapped" after N columns. You can have this behavior + with this option. This affects only tall matrices. 
+ + See also + -------- + diag_indices, diag_indices_from + + Notes + ----- + + This functionality can be obtained via `diag_indices`, but internally + this version uses a much faster implementation that never constructs the + indices and uses simple slicing. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.zeros((3, 3), int) + >>> mt.fill_diagonal(a, 5) + >>> a.execute() + array([[5, 0, 0], + [0, 5, 0], + [0, 0, 5]]) + + The same function can operate on a 4-D tensor: + + >>> a = mt.zeros((3, 3, 3, 3), int) + >>> mt.fill_diagonal(a, 4) + + We only show a few blocks for clarity: + + >>> a[0, 0].execute() + array([[4, 0, 0], + [0, 0, 0], + [0, 0, 0]]) + >>> a[1, 1].execute() + array([[0, 0, 0], + [0, 4, 0], + [0, 0, 0]]) + >>> a[2, 2].execute() + array([[0, 0, 0], + [0, 0, 0], + [0, 0, 4]]) + + The wrap option affects only tall matrices: + + >>> # tall matrices no wrap + >>> a = mt.zeros((5, 3), int) + >>> mt.fill_diagonal(a, 4) + >>> a.execute() + array([[4, 0, 0], + [0, 4, 0], + [0, 0, 4], + [0, 0, 0], + [0, 0, 0]]) + + >>> # tall matrices wrap + >>> a = mt.zeros((5, 3), int) + >>> mt.fill_diagonal(a, 4, wrap=True) + >>> a.execute() + array([[4, 0, 0], + [0, 4, 0], + [0, 0, 4], + [0, 0, 0], + [4, 0, 0]]) + + >>> # wide matrices + >>> a = mt.zeros((3, 5), int) + >>> mt.fill_diagonal(a, 4, wrap=True) + >>> a.execute() + array([[4, 0, 0, 0, 0], + [0, 4, 0, 0, 0], + [0, 0, 4, 0, 0]]) + + The anti-diagonal can be filled by reversing the order of elements + using either `numpy.flipud` or `numpy.fliplr`. + + >>> a = mt.zeros((3, 3), int) + >>> mt.fill_diagonal(mt.fliplr(a), [1,2,3]) # Horizontal flip + >>> a.execute() + array([[0, 0, 1], + [0, 2, 0], + [3, 0, 0]]) + >>> mt.fill_diagonal(mt.flipud(a), [1,2,3]) # Vertical flip + >>> a.execute() + array([[0, 0, 3], + [0, 2, 0], + [1, 0, 0]]) + + Note that the order in which the diagonal is filled varies depending + on the flip function. + """ + + if not isinstance(a, Tensor): + raise TypeError(f"`a` should be a tensor, got {type(a)}") + if a.ndim < 2: + raise ValueError("array must be at least 2-d") + if a.ndim > 2 and len(set(a.shape)) != 1: + raise ValueError("All dimensions of input must be of equal length") + + # process val + if isinstance(val, ENTITY_TYPE): + val = astensor(val) + if val.ndim > 1: + val = val.ravel() + val_input = val + else: + val = np.asarray(val) + if val.ndim > 1: + val = val.ravel() + val_input = None + + op = TensorFillDiagonal(val=val, wrap=wrap, dtype=a.dtype) + t = op(a, val=val_input) + a.data = t.data diff --git a/python/xorbits/_mars/tensor/indexing/flatnonzero.py b/python/xorbits/_mars/tensor/indexing/flatnonzero.py new file mode 100644 index 000000000..935011c07 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/flatnonzero.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .nonzero import nonzero + + +def flatnonzero(a): + """ + Return indices that are non-zero in the flattened version of a. 
+ + This is equivalent to a.ravel().nonzero()[0]. + + Parameters + ---------- + a : Tensor + Input tensor. + + Returns + ------- + res : Tensor + Output tensor, containing the indices of the elements of `a.ravel()` + that are non-zero. + + See Also + -------- + nonzero : Return the indices of the non-zero elements of the input tensor. + ravel : Return a 1-D tensor containing the elements of the input tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(-2, 3) + >>> x.execute() + array([-2, -1, 0, 1, 2]) + >>> mt.flatnonzero(x).execute() + array([0, 1, 3, 4]) + + Use the indices of the non-zero elements as an index array to extract + these elements: + + >>> x.ravel()[mt.flatnonzero(x)].execute() # TODO(jisheng): accomplish this after fancy indexing is supported + + """ + from ..base import ravel + + return nonzero(ravel(a))[0] diff --git a/python/xorbits/_mars/tensor/indexing/getitem.py b/python/xorbits/_mars/tensor/indexing/getitem.py new file mode 100644 index 000000000..e2d33f9d1 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/getitem.py @@ -0,0 +1,388 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE +from ...core.operand import OperandStage +from ...serialization.serializables import ( + FieldTypes, + Int32Field, + KeyField, + ListField, + TupleField, +) +from ..array_utils import get_array_module +from ..core import TENSOR_TYPE, TensorOrder +from ..operands import TensorHasInput, TensorMapReduceOperand, TensorOperandMixin +from ..utils import calc_pos, filter_inputs, split_indexes_into_chunks +from .core import calc_shape, process_index +from .index_lib import TensorIndexesHandler + +FANCY_INDEX_TYPES = TENSOR_TYPE + (np.ndarray,) + + +class TensorIndex(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.INDEX + + _input = KeyField("input") + _indexes = ListField("indexes") + + def __init__(self, indexes=None, **kw): + super().__init__(_indexes=indexes, **kw) + + @property + def indexes(self): + return self._indexes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs[1:]) + new_indexes = [ + next(inputs_iter) if isinstance(index, ENTITY_TYPE) else index + for index in self._indexes + ] + self._indexes = new_indexes + + def on_output_modify(self, new_output): + from .setitem import TensorIndexSetValue + + if self.create_view: + a = self.input + op = TensorIndexSetValue( + dtype=a.dtype, + sparse=a.issparse(), + indexes=tuple(self._indexes), + value=new_output, + ) + return op(a, self._indexes, new_output) + + def on_input_modify(self, new_input): + if self.create_view: + new_op = self.copy().reset_key() + new_inputs = [new_input] + self.inputs[1:] + return new_op.new_tensor(new_inputs, shape=self.outputs[0].shape) + + def __call__(self, a, index, shape, order): + self._indexes = list(index) + return self.new_tensor(filter_inputs([a] 
+ list(index)), shape, order=order) + + @classmethod + def _tile_one_chunk(cls, op: "TensorIndex"): + inp = op.inputs[0] + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_params = out.params.copy() + chunk_params["shape"] = shape = tuple(calc_shape(inp.shape, op.indexes)) + chunk_params["index"] = (0,) * out.ndim + chunk = chunk_op.new_chunk( + [inp.chunks[0] for inp in op.inputs], kws=[chunk_params] + ) + params = out.params.copy() + params["chunks"] = [chunk] + params["nsplits"] = tuple((s,) for s in shape) + return op.copy().new_tensors(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + if all(len(inp.chunks) == 1 for inp in op.inputs): + return cls._tile_one_chunk(op) + + handler = TensorIndexesHandler() + return [(yield from handler.handle(op))] + + @classmethod + def execute(cls, ctx, op): + indexes = tuple( + ctx[index.key] if hasattr(index, "key") else index for index in op.indexes + ) + input_ = ctx[op.inputs[0].key] + xp = get_array_module(input_) + ret = xp.asarray(input_)[indexes] + if hasattr(ret, "astype"): + ret = ret.astype(ret.dtype, order=op.outputs[0].order.value, copy=False) + ctx[op.outputs[0].key] = ret + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + shape = chunk.shape + + if any(np.isnan(s) for s in shape): + return super().estimate_size(ctx, op) + + new_indexes = [index for index in op._indexes if index is not None] + new_shape = [] + first_fancy_index = False + for index in new_indexes: + if isinstance(index, ENTITY_TYPE): + if index.dtype != np.bool_: + if not first_fancy_index: + first_fancy_index = True + else: + continue + new_shape.append(ctx[index.key][0] // index.dtype.itemsize) + + rough_shape = [] + idx = 0 + for s in shape: + if np.isnan(s): + rough_shape.append(new_shape[idx]) + idx += 1 + else: + rough_shape.append(s) + result = int(np.prod(rough_shape) * chunk.dtype.itemsize) + ctx[chunk.key] = (result, result) + + +class FancyIndexingDistribute(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.FANCY_INDEX_DISTRIBUTE + + _input = KeyField("input") + _dest_nsplits = TupleField("dest_nsplits", FieldTypes.tuple(FieldTypes.uint64)) + _axes = TupleField("axes", FieldTypes.int32) + + def __init__(self, dest_nsplits=None, axes=None, **kw): + super().__init__(_dest_nsplits=dest_nsplits, _axes=axes, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + @property + def output_limit(self): + if self.stage == OperandStage.map: + return 1 + # return fancy indexes on each axis as well as original position + return len(self._axes) + 1 + + @property + def dest_nsplits(self): + return self._dest_nsplits + + @property + def axes(self): + return self._axes + + @classmethod + def _execute_map(cls, ctx, op): + nsplits = op.dest_nsplits + axes = op.axes + fancy_index_nsplits = [nsplits[ax] for ax in axes] + indexes = ctx[op.inputs[0].key] + flatten_indexes = indexes.reshape(indexes.shape[0], -1) + idx_to_fancy_indexes, idx_to_poses = split_indexes_into_chunks( + fancy_index_nsplits, flatten_indexes, False + ) + for idx in idx_to_fancy_indexes: + ctx[op.outputs[0].key, idx] = (idx_to_fancy_indexes[idx], idx_to_poses[idx]) + + @classmethod + def _execute_reduce(cls, ctx, op: "FancyIndexingDistribute"): + fancy_indexes = [] + poses = [] + xp = None + for fancy_index, pos in op.iter_mapper_data(ctx): + if xp is None: + xp = get_array_module(fancy_index) + if fancy_index.size == 0: + fancy_index = fancy_index.reshape(len(op.axes), 0) + 
fancy_indexes.append(fancy_index) + poses.append(pos) + + fancy_index = np.hstack(fancy_indexes) + pos = np.hstack(poses) + + assert len(op.outputs) - 1 == len(fancy_index) + for out_chunk, axis_fancy_index in zip(op.outputs[:-1], fancy_index): + ctx[out_chunk.key] = axis_fancy_index + ctx[op.outputs[-1].key] = np.asarray([len(p) for p in poses]), pos + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + @classmethod + def estimate_size(cls, ctx, op): + if op.stage == OperandStage.map: + fancy_index_size = len(op.axes) + inp_size = ctx[op.inputs[0].key][0] + factor = ( + 1 / float(fancy_index_size) + fancy_index_size + ) # 1/#fancy_index is the poses + ctx[op.outputs[0].key] = (inp_size * factor,) * 2 + else: + sum_size = 0 + for shuffle_input in op.inputs[0].inputs or (): + sum_size += ctx[shuffle_input.key] + for out_chunk in op.outputs: + ctx[out_chunk.key] = sum_size, sum_size + + +class FancyIndexingConcat(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.FANCY_INDEX_CONCAT + + _fancy_index_axis = Int32Field("fancy_index_axis") + _fancy_index_shape = TupleField("fancy_index_shape", FieldTypes.int64) + + def __init__(self, fancy_index_axis=None, fancy_index_shape=None, **kw): + super().__init__( + _fancy_index_axis=fancy_index_axis, + _fancy_index_shape=fancy_index_shape, + **kw + ) + + @property + def input(self): + return self._input + + @property + def fancy_index_axis(self): + return self._fancy_index_axis + + @property + def fancy_index_shape(self): + return self._fancy_index_shape + + @classmethod + def _execute_map(cls, ctx, op): + indexed_array = ctx[op.inputs[0].key] + sizes, pos = ctx[op.inputs[1].key] + acc_sizes = np.cumsum(sizes) + fancy_index_axis = op.fancy_index_axis + + for i in range(len(sizes)): + start = 0 if i == 0 else acc_sizes[i - 1] + end = acc_sizes[i] + select = (slice(None),) * fancy_index_axis + (slice(start, end),) + ctx[op.outputs[0].key, (i,)] = (indexed_array[select], pos[start:end]) + + @classmethod + def _execute_reduce(cls, ctx, op: "FancyIndexingConcat"): + fancy_index_axis = op.fancy_index_axis + fancy_index_shape = op.fancy_index_shape + + indexed_arrays = [] + poses = [] + for index_array, pos in op.iter_mapper_data(ctx): + indexed_arrays.append(index_array) + poses.append(pos) + + concat_array = get_array_module(indexed_arrays[0]).concatenate( + indexed_arrays, axis=fancy_index_axis + ) + concat_pos = get_array_module(poses[0]).hstack(poses) + select_pos = calc_pos( + fancy_index_shape, concat_pos, xp=get_array_module(poses[0]) + ) + select = (slice(None),) * fancy_index_axis + (select_pos,) + ctx[op.outputs[0].key] = concat_array[select] + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + else: + cls._execute_reduce(ctx, op) + + @classmethod + def estimate_size(cls, ctx, op): + if op.stage == OperandStage.map: + input_size = ctx[op.inputs[0].key][0] + pos_size = ctx[op.inputs[0].key][0] + ctx[op.outputs[0].key] = input_size + pos_size, input_size + pos_size * 2 + else: + chunk = op.outputs[0] + input_sizes = [ctx[c.key][0] for c in op.inputs[0].inputs or ()] + ctx[chunk.key] = chunk.nbytes, chunk.nbytes + sum(input_sizes) + + +def _is_bool_index(index_obj): + return isinstance(index_obj, TENSOR_TYPE) and index_obj.dtype == np.bool_ + + +def _is_fancy_index(index_obj): + return isinstance(index_obj, FANCY_INDEX_TYPES) and index_obj.dtype != np.bool_ + + +def 
_is_create_view(index): + # is view if all of index is slice, int or newaxis + return all(isinstance(ind, (slice, Integral)) or ind is None for ind in index) + + +def _calc_order(a, index): + if a.order == TensorOrder.C_ORDER: + return TensorOrder.C_ORDER + + in_axis = 0 + for ind in index: + if _is_bool_index(ind): + in_axis += ind.ndim + return TensorOrder.C_ORDER + elif _is_fancy_index(ind): + in_axis += 1 + return TensorOrder.C_ORDER + elif ind is None: + continue + elif isinstance(ind, slice): + shape = a.shape[in_axis] + slc = ind.indices(shape) + if slc[0] == 0 and slc[1] == shape and slc[2] == 1: + continue + else: + return TensorOrder.C_ORDER + else: + assert isinstance(ind, Integral) + in_axis += 1 + return TensorOrder.C_ORDER + + return TensorOrder.F_ORDER + + +def _getitem_nocheck(a, item, convert_bool_to_fancy=None): + index = process_index(a.ndim, item, convert_bool_to_fancy=convert_bool_to_fancy) + if convert_bool_to_fancy is False: + # come from __setitem__, the bool index is not converted to fancy index + # if multiple bool indexes or bool + fancy indexes exist, + # thus the shape will be wrong, + # here we just convert when calculating shape, + # refer to issue #1282. + shape = calc_shape(a.shape, process_index(a.ndim, index)) + else: + shape = calc_shape(a.shape, index) + tensor_order = _calc_order(a, index) + op = TensorIndex( + dtype=a.dtype, + sparse=a.issparse(), + indexes=list(index), + create_view=_is_create_view(index), + ) + return op(a, index, tuple(shape), order=tensor_order) + + +def _getitem(a, item): + if isinstance(item, (list, tuple)) and all( + isinstance(it, slice) and it == slice(None) for it in item + ): + # nothing to do + return a + + # TODO(jisheng): field access, e.g. t['a'], t[['a', 'b']] + return _getitem_nocheck(a, item) diff --git a/python/xorbits/_mars/tensor/indexing/index_lib.py b/python/xorbits/_mars/tensor/indexing/index_lib.py new file mode 100644 index 000000000..54f8c73d5 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/index_lib.py @@ -0,0 +1,1062 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import itertools +from abc import ABC, abstractmethod +from collections import OrderedDict, namedtuple +from enum import Enum +from numbers import Integral +from operator import itemgetter +from typing import List, Tuple, Union + +import numpy as np + +from ...core import Tileable, recursive_tile +from ...core.operand import OperandStage +from ...utils import calc_nsplits, has_unknown_shape +from ..core import TENSOR_TYPE, Chunk, TensorOrder +from ..operands import TensorShuffleProxy +from ..utils import ( + broadcast_shape, + calc_pos, + calc_sliced_size, + filter_inputs, + slice_split, + split_indexes_into_chunks, + unify_chunks, +) + + +class IndexType(Enum): + new_axis = 0 + slice = 1 + label_slice = 2 # e.g. 'a': 'd' used for pandas etc + integer = 3 + label = 4 # e.g. 'a' used for pandas etc + bool_index = 5 + fancy_index = 6 + label_fancy_index = 7 # e.g. 
['a', 'b', 'c'] for pandas etc + + +class IndexInfo: + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + self.index_type = index_type + self.input_axis = input_axis + self.output_axis = output_axis + self.raw_index = raw_index + self.handler = handler + + +class FancyIndexInfo(IndexInfo): + def __init__( + self, + index_type: IndexType, + input_axis: int, + output_axis: int, + raw_index, + handler, + ): + super().__init__(index_type, input_axis, output_axis, raw_index, handler) + + # extra info for fancy index + # shape broadcast index + self.shape_unified_index = None + # split info + # - chunk_index_to_fancy_index_arrays + # - chunk_index_to_raw_positions + # - is_fancy_index_asc_sorted + self.split_info = None + + +ChunkIndexAxisInfo = namedtuple( + "chunk_index_axis_info", ["output_axis_index", "processed_index", "output_shape"] +) + + +class ChunkIndexInfo: + def __init__(self): + self.indexes = [] + self.output_chunk_index = [] + self.output_chunk_shape = [] + + def set(self, info: ChunkIndexAxisInfo): + output_axis_index = info.output_axis_index + if output_axis_index is not None: + self.output_chunk_index.append(output_axis_index) + self.indexes.append(info.processed_index) + output_shape = info.output_shape + if output_shape is not None: + if not isinstance(output_shape, tuple): + self.output_chunk_shape.append(output_shape) + else: + self.output_chunk_shape.extend(output_shape) + + +class IndexHandlerContext(ABC): + def __init__(self, op): + self.parsed_infos = [] + self.input_axis = 0 + self.output_axis = 0 + + # store index_type -> positions + # for a quick search on indexes of a specified index type + self._index_type_to_positions = dict() + + # store chunk index -> ChunkIndexInfo + # for the IndexHandler to process + self.chunk_index_to_info = OrderedDict() + self.op = op + self.tileable = op.input + self.set_tileable(self.tileable) + + # chunks and nsplits, used for store intermediate result + self.processed_chunks = None + self.out_chunks = None + self.out_nsplits = None + + def append(self, index_info: IndexInfo): + position = len(self.parsed_infos) + if index_info.index_type not in self._index_type_to_positions: + self._index_type_to_positions[index_info.index_type] = [] + self._index_type_to_positions[index_info.index_type].append(position) + self.parsed_infos.append(index_info) + + def get_positions(self, index_type: IndexType) -> List[int]: + return self._index_type_to_positions.get(index_type, []) + + def get_indexes(self, index_type: IndexType): + return [self.parsed_infos[i] for i in self.get_positions(index_type)] + + def set_tileable(self, tileable: Tileable): + for chunk in tileable.chunks: + self.chunk_index_to_info[chunk.index] = ChunkIndexInfo() + + @abstractmethod + def concat_chunks(self, chunks: List[Chunk], axis: Union[Tuple, int]) -> Chunk: + pass + + @abstractmethod + def create_chunk( + self, chunk_index: Tuple[int], chunk_index_info: ChunkIndexInfo + ) -> Chunk: + pass + + def create_tileable(self) -> Tileable: + out = self.op.outputs[0] + params = out.params + params["chunks"] = self.out_chunks + params["nsplits"] = self.out_nsplits + if "shape" in params and any(np.isnan(s) for s in params["shape"]): + params["shape"] = tuple(sum(ns) for ns in self.out_nsplits) + new_op = out.op.copy() + return new_op.new_tileable(out.inputs, kws=[params]) + + +class TensorIndexHandlerContext(IndexHandlerContext): + def concat_chunks(self, chunks: List[Chunk], axis: Union[Tuple[int], int]) -> Chunk: + 
from ..merge import TensorConcatenate + + assert isinstance(axis, int), "axis to concat could only be int for tensor" + + shape = list(chunks[0].shape) + shape[axis] = sum(c.shape[axis] for c in chunks) + chunk_index = list(chunks[0].index) + chunk_index[axis] = 0 + + op = TensorConcatenate( + axis=axis, dtype=chunks[0].dtype, sparse=chunks[0].issparse() + ) + return op.new_chunk( + chunks, + shape=tuple(shape), + index=tuple(chunk_index), + order=TensorOrder.C_ORDER, + ) + + def create_chunk( + self, chunk_index: Tuple[int], chunk_index_info: ChunkIndexInfo + ) -> Chunk: + chunk_op = self.op.copy().reset_key() + chunk_op._indexes = indexes = chunk_index_info.indexes + chunk_input = ( + self.tileable.chunks[0] + if self.tileable.ndim == 0 + else self.tileable.cix[chunk_index] + ) + chunk_inputs = filter_inputs([chunk_input] + indexes) + return chunk_op.new_chunk( + chunk_inputs, + shape=tuple(chunk_index_info.output_chunk_shape), + index=tuple(chunk_index_info.output_chunk_index), + order=self.op.outputs[0].order, + ) + + +_type_to_instance = {} + + +class IndexHandler(ABC): + @classmethod + def get_instance(cls): + if cls not in _type_to_instance: + _type_to_instance[cls] = cls() + return _type_to_instance[cls] + + @abstractmethod + def accept(cls, raw_index): + pass + + @abstractmethod + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + pass + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + pass + + @abstractmethod + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + pass + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + pass + + @classmethod + def set_chunk_index_info( + cls, + context: IndexHandlerContext, + index_info: IndexInfo, + chunk_index: Tuple[int], + chunk_index_info: ChunkIndexInfo, + output_axis_index: int, + index, + output_shape: int, + ): + _ = context, index_info, chunk_index + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=index, + output_shape=output_shape, + ) + ) + + +class NewaxisIndexHandler(IndexHandler): + def accept(self, raw_index): + return raw_index is np.newaxis + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.new_axis, context.input_axis, context.output_axis, raw_index, self + ) + context.output_axis += 1 + context.append(info) + return info + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + for chunk_index_info in context.chunk_index_to_info.values(): + # index on axis and index object + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=0, processed_index=None, output_shape=1 + ) + ) + + +class SliceIndexHandler(IndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, slice) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.slice, context.input_axis, context.output_axis, raw_index, self + ) + context.input_axis += 1 + context.output_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # make sure input tileable has known chunk shapes + if has_unknown_shape(context.tileable): + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + # slice.step < 0 + is_reversed = (index_info.raw_index.step or 0) < 0 + + # 
e.g. slice_split(slice(3, 10), [2, 2, 7, 5]) + # return {1: slice(1, 2, 1), 2: slice(0, 6, 1)} + effected_i_to_slice = slice_split( + index_info.raw_index, tileable.nsplits[index_info.input_axis] + ) + output_axis_index_range = ( + range(len(effected_i_to_slice)) + if not is_reversed + else range(len(effected_i_to_slice) - 1, -1, -1) + ) + other_index_to_iter = dict() + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + other_index = chunk_index[:input_axis] + chunk_index[input_axis + 1 :] + size = tileable.nsplits[input_axis][i] + if i not in effected_i_to_slice: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + slc = effected_i_to_slice[i] + output_shape = calc_sliced_size(size, slc) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = iter(output_axis_index_range) + output_axis_index = next(other_index_to_iter[other_index]) + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + slc, + output_shape, + ) + + +class IntegralIndexHandler(IndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, Integral) + + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.integer, context.input_axis, context.output_axis, raw_index, self + ) + context.input_axis += 1 + context.append(info) + return info + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if has_unknown_shape(context.tileable): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + + # e.g. 
slice_split(6, [2, 2, 7, 5]) + # return {2: 2} + effected_i_to_slice = slice_split( + index_info.raw_index, tileable.nsplits[index_info.input_axis] + ) + + index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in index_to_info.items(): + i = chunk_index[input_axis] + if i not in effected_i_to_slice: + # delete it, the input chunk could be ignored + del context.chunk_index_to_info[chunk_index] + else: + slc = effected_i_to_slice[i] + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=None, processed_index=slc, output_shape=None + ) + ) + + +class _BoolIndexHandler(IndexHandler): + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + info = IndexInfo( + IndexType.bool_index, + context.input_axis, + context.output_axis, + raw_index, + self, + ) + context.input_axis += raw_index.ndim + context.output_axis += 1 + context.append(info) + return info + + @classmethod + def _is_first_bool_index( + self, context: IndexHandlerContext, index_info: IndexInfo + ) -> bool: + bool_index_infos = [ + info + for info in context.parsed_infos + if info.index_type == IndexType.bool_index + ] + return bool_index_infos[0] is index_info + + +class NDArrayBoolIndexHandler(_BoolIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, np.ndarray) and raw_index.dtype == np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + if has_unknown_shape(context.tileable): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + is_first_bool_index = self._is_first_bool_index(context, index_info) + + axes = list(range(input_axis, input_axis + index_info.raw_index.ndim)) + cum_sizes = [] + for axis in axes: + cum_sizes.append(np.cumsum((0,) + tileable.nsplits[axis])) + + other_index_to_iter = dict() + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + slcs = [] + for j, axis in enumerate(axes): + axis_index = chunk_index[axis] + slcs.append( + slice(cum_sizes[j][axis_index], cum_sizes[j][axis_index + 1]) + ) + other_index = chunk_index[: axes[0]] + chunk_index[axes[-1] + 1 :] + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + index = index_info.raw_index[tuple(slcs)] + output_axis_index = next(other_index_to_iter[other_index]) + + # if more than 1 bool index, getitem will rewrite them into fancy + # but for now, setitem will keep them, thus we cannot record + # index or shape for this one + output_axis_index = None if not is_first_bool_index else output_axis_index + output_size = None if not is_first_bool_index else int(index.sum()) + + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + index, + output_size, + ) + + +class TensorBoolIndexHandler(_BoolIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, TENSOR_TYPE) and raw_index.dtype == np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + # check both input tileable and index object itself + if has_unknown_shape(context.tileable): + yield + if has_unknown_shape(index_info.raw_index): # pragma: no cover + yield + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + tileable = context.tileable + input_axis = index_info.input_axis + index = index_info.raw_index + # rechunk index into the same chunk size 
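+        # (the boolean mask is retiled to the input tensor's nsplits over the
+        # axes it covers, so each input chunk below pairs with the mask chunk
+        # spanning exactly the same region)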
+ nsplits = tileable.nsplits[input_axis : input_axis + index.ndim] + index = yield from recursive_tile(index.rechunk(nsplits)) + is_first_bool_index = self._is_first_bool_index(context, index_info) + + other_index_to_iter = dict() + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + effected_chunk_index = chunk_index[input_axis : input_axis + index.ndim] + other_index = ( + chunk_index[:input_axis] + chunk_index[input_axis + index.ndim :] + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + + # if more than 1 bool index, getitem will rewrite them into fancy + # but for now, setitem will keep them, thus we cannot record + # index or shape for this one + output_axis_index = None if not is_first_bool_index else output_axis_index + output_size = None if not is_first_bool_index else np.nan + + self.set_chunk_index_info( + context, + index_info, + chunk_index, + chunk_index_info, + output_axis_index, + index.cix[tuple(effected_chunk_index)], + output_size, + ) + + +class _FancyIndexHandler(IndexHandler): + def parse(self, raw_index, context: IndexHandlerContext) -> IndexInfo: + prev_fancy_indexes = context.get_indexes(IndexType.fancy_index) + is_first_fancy_index = len(prev_fancy_indexes) == 0 + + if is_first_fancy_index: + output_axis = context.output_axis + else: + output_axis = prev_fancy_indexes[0].output_axis + info = FancyIndexInfo( + IndexType.fancy_index, context.input_axis, output_axis, raw_index, self + ) + + context.input_axis += 1 + if is_first_fancy_index: + context.output_axis += 1 + context.append(info) + return info + + @classmethod + def is_first(cls, index_info: IndexInfo, context: IndexHandlerContext) -> bool: + # check if is first fancy index after parsing + fancy_indexes = context.get_indexes(index_info.index_type) + i = fancy_indexes.index(index_info) + if i > 0: + # only process for the first fancy indexes + return False + else: + return True + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_indexe_infos = context.get_indexes(index_info.index_type) + # check all fancy indexes are all ndarrays + for fancy_index_info in fancy_indexe_infos: + if not self.accept(fancy_index_info.raw_index): # pragma: no cover + raise TypeError("Fancy indexes should be all ndarrays or tensors") + + +class NDArrayFancyIndexHandler(_FancyIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, np.ndarray) and raw_index.dtype != np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + is_first = self.is_first(index_info, context) + if not is_first: + return + + # check if all ndarrays + super().preprocess(index_info, context) + if has_unknown_shape(context.tileable): # pragma: no cover + yield + + fancy_index_infos = context.get_indexes(index_info.index_type) + # unify shapes of all fancy indexes + shape = broadcast_shape(*(info.raw_index.shape for info in fancy_index_infos)) + for fancy_index_info in fancy_index_infos: + fancy_index_info.shape_unified_index = np.broadcast_to( + fancy_index_info.raw_index, shape + ) + + # concat all fancy index together + concat_fancy_index = np.stack( + [info.shape_unified_index.ravel() for info in fancy_index_infos] + ) + effected_nsplits = [ + context.tileable.nsplits[info.input_axis] for info in fancy_index_infos + ] + # split concatenated fancy index into chunks according to input tileable + split_info = 
split_indexes_into_chunks(effected_nsplits, concat_fancy_index) + fancy_index_infos[0].split_info = split_info + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_index_infos = context.get_indexes(index_info.index_type) + fancy_index_axes = [info.input_axis for info in fancy_index_infos] + split_info = fancy_index_infos[0].split_info + chunk_index_to_fancy_index_arrays = split_info[0] + i_fancy_index = fancy_index_infos.index(index_info) + + other_index_to_iter = dict() + chunk_index_to_info = context.chunk_index_to_info.copy() + for chunk_index, chunk_index_info in chunk_index_to_info.items(): + effected_chunk_index = tuple(chunk_index[ax] for ax in fancy_index_axes) + fancy_index_array = chunk_index_to_fancy_index_arrays[effected_chunk_index][ + i_fancy_index + ] + + if fancy_index_array.size == 0: + # not effected + del context.chunk_index_to_info[chunk_index] + continue + + if i_fancy_index == 0: + other_index = tuple( + ci for i, ci in enumerate(chunk_index) if i not in fancy_index_axes + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = fancy_index_array.shape[0] + else: + output_axis_index = None + output_axis_shape = None + + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=fancy_index_array, + output_shape=output_axis_shape, + ) + ) + + @classmethod + def need_postprocess(cls, context: IndexHandlerContext) -> bool: + fancy_indexes = context.get_indexes(IndexType.fancy_index) + + if ( + fancy_indexes[0].split_info[2] + and fancy_indexes[0].shape_unified_index.ndim == 1 + ): + # if fancy indexes are asc sorted, + # and they are 1-d, no further computation required + return False + + return True + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_indexes = context.get_indexes(index_info.index_type) + + if not self.need_postprocess(context): + return + + is_first = self.is_first(index_info, context) + if not is_first: + # only need to postprocess fancy indexes once + return + + # current chunks and nsplits + chunks, nsplits = context.out_chunks, context.out_nsplits + + index_to_chunks = {c.index: c for c in chunks} + fancy_index_shape = fancy_indexes[0].shape_unified_index.shape + reorder_index = calc_pos(fancy_index_shape, fancy_indexes[0].split_info[1]) + + to_concat_axis = index_info.output_axis + new_out_chunks = [] + for chunk_index in itertools.product( + *(range(len(ns)) for ax, ns in enumerate(nsplits) if ax != to_concat_axis) + ): + # concat chunks on output axis of first fancy index + to_concat_chunks = [] + for i in range(len(nsplits[to_concat_axis])): + to_concat_index = list(chunk_index) + to_concat_index.insert(to_concat_axis, i) + to_concat_chunks.append(index_to_chunks[tuple(to_concat_index)]) + concat_chunk = context.concat_chunks(to_concat_chunks, to_concat_axis) + + reorder_chunk_op = context.op.copy().reset_key() + reorder_chunk_op._indexes = [slice(None)] * to_concat_axis + [reorder_index] + reorder_shape = ( + concat_chunk.shape[:to_concat_axis] + + fancy_index_shape + + concat_chunk.shape[to_concat_axis + 1 :] + ) + chunk_reorder_index = ( + concat_chunk.index[:to_concat_axis] + + (0,) * len(fancy_index_shape) + + concat_chunk.index[to_concat_axis + 1 :] + ) + reorder_chunk = reorder_chunk_op.new_chunk( + [concat_chunk], + shape=reorder_shape, + index=chunk_reorder_index, + 
order=TensorOrder.C_ORDER, + ) + new_out_chunks.append(reorder_chunk) + + new_nsplits = ( + nsplits[:to_concat_axis] + + tuple((s,) for s in fancy_index_shape) + + nsplits[to_concat_axis + 1 :] + ) + context.out_chunks = new_out_chunks + context.out_nsplits = new_nsplits + + +class TensorFancyIndexHandler(_FancyIndexHandler): + def accept(self, raw_index): + return isinstance(raw_index, TENSOR_TYPE) and raw_index.dtype != np.bool_ + + def preprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + from ..base import broadcast_to + from ..merge import stack + + is_first = self.is_first(index_info, context) + if not is_first: + return + + fancy_index_infos = context.get_indexes(index_info.index_type) + + # check if all tensors + super().preprocess(index_info, context) + to_check = [context.tileable] + list( + info.raw_index for info in fancy_index_infos + ) + if has_unknown_shape(*to_check): + yield + + # unify shapes of all fancy indexes + shape = broadcast_shape(*(info.raw_index.shape for info in fancy_index_infos)) + fancy_indexes = [] + for fancy_index_info in fancy_index_infos: + fancy_index = yield from recursive_tile( + broadcast_to(fancy_index_info.raw_index, shape) + ) + fancy_indexes.append(fancy_index) + shape_unified_fancy_indexes = yield from unify_chunks(*fancy_indexes) + for fancy_index_info, shape_unified_fancy_index in zip( + fancy_index_infos, shape_unified_fancy_indexes + ): + fancy_index_info.shape_unified_index = shape_unified_fancy_index + + fancy_index_axes = tuple(info.input_axis for info in fancy_index_infos) + + # stack fancy indexes into one + concat_fancy_index = yield from recursive_tile( + stack( + [ + fancy_index_info.shape_unified_index + for fancy_index_info in fancy_index_infos + ] + ) + ) + concat_fancy_index = yield from recursive_tile( + concat_fancy_index.rechunk({0: len(fancy_index_infos)}) + ) + + self._shuffle_fancy_indexes( + concat_fancy_index, context, index_info, fancy_index_axes + ) + + @classmethod + def _shuffle_fancy_indexes( + cls, + concat_fancy_index: Tileable, + context: IndexHandlerContext, + index_info: IndexInfo, + axes: Tuple, + ): + from .getitem import FancyIndexingDistribute + + tileable = context.tileable + + # generate shuffle map, for concatenated fancy index, + # calculated a counterpart index chunk for each chunk of input tensor + map_chunks = [] + for chunk in concat_fancy_index.chunks: + map_op = FancyIndexingDistribute( + stage=OperandStage.map, + dest_nsplits=tileable.nsplits, + axes=axes, + dtype=chunk.dtype, + ) + map_chunk = map_op.new_chunk( + [chunk], shape=(np.nan,), index=chunk.index, order=TensorOrder.C_ORDER + ) + map_chunks.append(map_chunk) + # shuffle proxy + proxy_chunk = TensorShuffleProxy(dtype=concat_fancy_index.dtype).new_chunk( + map_chunks, shape=(), order=TensorOrder.C_ORDER + ) + chunk_index_to_fancy_index_chunks = OrderedDict() + chunk_index_to_raw_positions = OrderedDict() + out_indices = list( + itertools.product(*(range(tileable.chunk_shape[ax]) for ax in axes)) + ) + for chunk_index in out_indices: + reduce_op = FancyIndexingDistribute( + stage=OperandStage.reduce, + axes=axes, + dtype=proxy_chunk.dtype, + n_reducers=len(out_indices), + ) + # chunks of fancy indexes on each axis + kws = [ + { + "axis": ax, + "shape": (np.nan,), + "index": chunk_index, + "order": context.op.outputs[0].order, + } + for ax in axes + ] + kws.append({"pos": True, "shape": (np.nan,), "index": chunk_index}) + reduce_chunks = reduce_op.new_chunks([proxy_chunk], kws=kws) + 
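# A minimal, self-contained NumPy sketch (separate from the operand graph being
# built here) of the bucketing idea behind this shuffle: each element of the
# stacked fancy index is routed to the input chunk it falls into, rebased to
# chunk-local coordinates, and tagged with its original position so the
# concatenated result can be reordered afterwards. This is the same bucketing
# that the eager ndarray path performs via split_indexes_into_chunks; the
# helper name and return layout below are illustrative only, not the _mars API.
import numpy as np

def bucket_fancy_index(nsplits, stacked_index):
    # nsplits: per-axis chunk sizes, e.g. ((3, 3), (2, 2)) for a (6, 4) tensor
    # stacked_index: shape (n_axes, n) stacked fancy indexes, one row per axis
    offsets = [np.cumsum((0,) + tuple(ns)) for ns in nsplits]
    # which chunk each element falls into, per axis
    chunk_ids = np.stack(
        [np.searchsorted(off, idx, side="right") - 1
         for off, idx in zip(offsets, stacked_index)]
    )
    buckets = {}
    for pos in range(stacked_index.shape[1]):
        key = tuple(int(i) for i in chunk_ids[:, pos])
        rebased = tuple(
            int(stacked_index[ax, pos] - offsets[ax][key[ax]])
            for ax in range(stacked_index.shape[0])
        )
        # chunk index -> [(chunk-local index, original position), ...]
        buckets.setdefault(key, []).append((rebased, pos))
    return buckets

# bucket_fancy_index(((3, 3), (2, 2)), np.array([[5, 0, 4], [1, 3, 2]]))
# routes element 0 to chunk (1, 0), element 1 to chunk (0, 1), element 2 to (1, 1).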
chunk_index_to_fancy_index_chunks[chunk_index] = reduce_chunks[:-1] + chunk_index_to_raw_positions[chunk_index] = reduce_chunks[-1] + + # split info + # - chunk_index_to_fancy_index_chunks + # - chunk_index_to_raw_positions + # - is_fancy_index_asc_sorted, False for tensor fancy indexes + index_info.split_info = ( + chunk_index_to_fancy_index_chunks, + chunk_index_to_raw_positions, + False, + ) + + def process(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + fancy_index_infos = context.get_indexes(index_info.index_type) + fancy_index_axes = [info.input_axis for info in fancy_index_infos] + split_info = fancy_index_infos[0].split_info + chunk_index_to_fancy_index_chunks = split_info[0] + i_fancy_index = fancy_index_infos.index(index_info) + + other_index_to_iter = dict() + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + effected_chunk_index = tuple(chunk_index[ax] for ax in fancy_index_axes) + fancy_index_chunk = chunk_index_to_fancy_index_chunks[effected_chunk_index][ + i_fancy_index + ] + + if i_fancy_index == 0: + other_index = tuple( + ci for i, ci in enumerate(chunk_index) if i not in fancy_index_axes + ) + if other_index not in other_index_to_iter: + other_index_to_iter[other_index] = itertools.count() + output_axis_index = next(other_index_to_iter[other_index]) + output_axis_shape = fancy_index_chunk.shape[0] + else: + output_axis_index = output_axis_shape = None + + chunk_index_info.set( + ChunkIndexAxisInfo( + output_axis_index=output_axis_index, + processed_index=fancy_index_chunk, + output_shape=output_axis_shape, + ) + ) + + def postprocess(self, index_info: IndexInfo, context: IndexHandlerContext) -> None: + from .getitem import FancyIndexingConcat + + fancy_index_infos = context.get_indexes(index_info.index_type) + + is_first = self.is_first(index_info, context) + if not is_first: + # only need to postprocess fancy indexes once + return + + # current chunks and nsplits + chunks, nsplits = context.out_chunks, context.out_nsplits + chunk_shape = tuple(len(ns) for ns in nsplits) + to_concat_axis = index_info.output_axis + tileable = context.tileable + fancy_index_effected_input_chunk_shapes = tuple( + tileable.chunk_shape[info.input_axis] for info in fancy_index_infos + ) + fancy_indexes = [info.shape_unified_index for info in fancy_index_infos] + + concat_index_to_chunks = dict() + for chunk in chunks: + effected_chunk_index = np.unravel_index( + chunk.index[to_concat_axis], fancy_index_effected_input_chunk_shapes + ) + raw_position_chunk = fancy_index_infos[0].split_info[1][ + effected_chunk_index + ] + concat_map_op = FancyIndexingConcat( + stage=OperandStage.map, + fancy_index_axis=to_concat_axis, + sparse=chunk.issparse(), + dtype=chunk.dtype, + ) + map_chunk_shape = ( + chunk.shape[:to_concat_axis] + + (np.nan,) + + chunk.shape[to_concat_axis + 1 :] + ) + concat_map_chunk = concat_map_op.new_chunk( + [chunk, raw_position_chunk], + index=chunk.index, + shape=map_chunk_shape, + order=TensorOrder.C_ORDER, + ) + concat_index_to_chunks[concat_map_chunk.index] = concat_map_chunk + + other_index_chunk_shape = ( + chunk_shape[:to_concat_axis] + chunk_shape[to_concat_axis + 1 :] + ) + out_chunks = [] + for chunk_index in itertools.product( + *(range(s) for s in other_index_chunk_shape) + ): + to_shuffle_chunks = [] + other_shape = None + for i in range(chunk_shape[to_concat_axis]): + to_concat_chunk_index = ( + chunk_index[:to_concat_axis] + (i,) + chunk_index[to_concat_axis:] + ) + to_concat_chunk = 
concat_index_to_chunks[to_concat_chunk_index] + to_shuffle_chunks.append(to_concat_chunk) + if other_shape is None: + other_shape = tuple( + s + for ax, s in enumerate(to_concat_chunk.shape) + if ax != to_concat_axis + ) + + proxy_chunk = TensorShuffleProxy( + dtype=to_shuffle_chunks[0].dtype + ).new_chunk(to_shuffle_chunks, shape=(), order=TensorOrder.C_ORDER) + + it = itertools.count() + out_indices = list( + itertools.product(*(range(s) for s in fancy_indexes[0].chunk_shape)) + ) + for ordinal, reduce_index in enumerate(out_indices): + fancy_index_chunk = fancy_indexes[0].cix[reduce_index] + concat_reduce_op = FancyIndexingConcat( + stage=OperandStage.reduce, + fancy_index_axis=to_concat_axis, + fancy_index_shape=fancy_index_chunk.shape, + dtype=proxy_chunk.dtype, + sparse=to_shuffle_chunks[0].issparse(), + reducer_index=(next(it),), + n_reducers=len(out_indices), + ) + reduce_chunk_shape = ( + other_shape[:to_concat_axis] + + fancy_index_chunk.shape + + other_shape[to_concat_axis:] + ) + reduce_chunk_index = ( + chunk_index[:to_concat_axis] + + fancy_index_chunk.index + + chunk_index[to_concat_axis:] + ) + concat_reduce_chunk = concat_reduce_op.new_chunk( + [proxy_chunk], + shape=reduce_chunk_shape, + index=reduce_chunk_index, + order=TensorOrder.C_ORDER, + ) + out_chunks.append(concat_reduce_chunk) + + context.out_chunks = out_chunks + context.out_nsplits = ( + nsplits[:to_concat_axis] + + fancy_indexes[0].nsplits + + nsplits[to_concat_axis + 1 :] + ) + + +class IndexesHandler(ABC): + def __init__(self): + self.available_index_handlers = [] + + def register(self, *handlers): + self.available_index_handlers.extend(h.get_instance() for h in handlers) + + @abstractmethod + def create_context(self, op): + pass + + def handle(self, op, return_context: bool = False): + indexes = op.indexes + # create context + context = self.create_context(op) + + # parse index infos + index_infos = [] + for index in indexes: + parsed = False + for index_handler in self.available_index_handlers: + if index_handler.accept(index): + parsed = True + index_infos.append(index_handler.parse(index, context)) + break + if not parsed: + raise TypeError(f"unable to parse index {index}") + + yield from self._preprocess(context, index_infos) + yield from self._process(context, index_infos) + self._postprocess(context, index_infos) + + if return_context: + return context + else: + return context.create_tileable() + + @classmethod + def _preprocess(cls, context: IndexHandlerContext, index_infos: List[IndexInfo]): + # preprocess + for index_info in index_infos: + preprocess = index_info.handler.preprocess(index_info, context) + if inspect.isgenerator(preprocess): + yield from preprocess + + @classmethod + def _process(cls, context, index_infos): + # process + for index_info in index_infos: + process = index_info.handler.process(index_info, context) + if inspect.isgenerator(process): + yield from process + + context.processed_chunks = context.out_chunks = out_chunks = [] + for chunk_index, chunk_index_info in context.chunk_index_to_info.items(): + out_chunks.append(context.create_chunk(chunk_index, chunk_index_info)) + index_to_shape = OrderedDict( + sorted([(c.index, c.shape) for c in out_chunks], key=itemgetter(0)) + ) + context.out_nsplits = calc_nsplits(index_to_shape) + + @classmethod + def _postprocess(cls, context, index_infos): + # post process + for index_info in index_infos: + index_info.handler.postprocess(index_info, context) + + +class NDArrayIndexesHandler(IndexesHandler): + # indexes handler only for slice, 
integer, + # boolean ndarray, integer ndarray and None + def __init__(self): + super().__init__() + self.register( + NewaxisIndexHandler, + SliceIndexHandler, + IntegralIndexHandler, + NDArrayBoolIndexHandler, + NDArrayFancyIndexHandler, + ) + + def create_context(self, op): + return TensorIndexHandlerContext(op) + + +class TensorIndexesHandler(IndexesHandler): + def __init__(self): + super().__init__() + self.register( + NewaxisIndexHandler, + SliceIndexHandler, + IntegralIndexHandler, + NDArrayBoolIndexHandler, + TensorBoolIndexHandler, + NDArrayFancyIndexHandler, + TensorFancyIndexHandler, + ) + + def create_context(self, op): + return TensorIndexHandlerContext(op) diff --git a/python/xorbits/_mars/tensor/indexing/nonzero.py b/python/xorbits/_mars/tensor/indexing/nonzero.py new file mode 100644 index 000000000..58d9ba953 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/nonzero.py @@ -0,0 +1,139 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple, recursive_tile +from ...serialization.serializables import KeyField +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .unravel_index import unravel_index + + +class TensorNonzero(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.NONZERO + + _input = KeyField("input") + + @property + def output_limit(self): + return float("inf") + + def __call__(self, a): + kws = [ + {"shape": (np.nan,), "order": TensorOrder.C_ORDER, "_idx_": i} + for i in range(a.ndim) + ] + return ExecutableTuple(self.new_tensors([a], kws=kws, output_limit=len(kws))) + + @classmethod + def tile(cls, op): + from ..datasource import arange + + in_tensor = astensor(op.input) + + flattened = in_tensor.astype(bool).flatten() + flattened = yield from recursive_tile(flattened) + indices = arange(flattened.size, dtype=np.intp, chunk_size=flattened.nsplits) + indices = indices[flattened] + dim_indices = unravel_index(indices, in_tensor.shape) + dim_indices = yield from recursive_tile(dim_indices) + + kws = [ + {"nsplits": ind.nsplits, "chunks": ind.chunks, "shape": o.shape} + for ind, o in zip(dim_indices, op.outputs) + ] + new_op = op.copy() + return new_op.new_tensors(op.inputs, kws=kws, output_limit=len(kws)) + + +def nonzero(a): + """ + Return the indices of the elements that are non-zero. + + Returns a tuple of tensors, one for each dimension of `a`, + containing the indices of the non-zero elements in that + dimension. The values in `a` are always tested and returned. + The corresponding non-zero + values can be obtained with:: + + a[nonzero(a)] + + To group the indices by element, rather than dimension, use:: + + transpose(nonzero(a)) + + The result of this is always a 2-D array, with a row for + each non-zero element. + + Parameters + ---------- + a : array_like + Input tensor. 
+ + Returns + ------- + tuple_of_arrays : tuple + Indices of elements that are non-zero. + + See Also + -------- + flatnonzero : + Return indices that are non-zero in the flattened version of the input + tensor. + Tensor.nonzero : + Equivalent tensor method. + count_nonzero : + Counts the number of non-zero elements in the input tensor. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([[1,0,0], [0,2,0], [1,1,0]]) + >>> x.execute() + array([[1, 0, 0], + [0, 2, 0], + [1, 1, 0]]) + >>> mt.nonzero(x).execute() + (array([0, 1, 2, 2]), array([0, 1, 0, 1])) + + >>> x[mt.nonzero(x)].execute() # TODO(jisheng): accomplish this after fancy indexing is supported + + >>> mt.transpose(mt.nonzero(x)).execute() # TODO(jisheng): accomplish this later + + A common use for ``nonzero`` is to find the indices of an array, where + a condition is True. Given an array `a`, the condition `a` > 3 is a + boolean array and since False is interpreted as 0, np.nonzero(a > 3) + yields the indices of the `a` where the condition is true. + + >>> a = mt.array([[1,2,3],[4,5,6],[7,8,9]]) + >>> (a > 3).execute() + array([[False, False, False], + [ True, True, True], + [ True, True, True]]) + >>> mt.nonzero(a > 3).execute() + (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2])) + + The ``nonzero`` method of the boolean array can also be called. + + >>> (a > 3).nonzero().execute() + (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2])) + + """ + a = astensor(a) + op = TensorNonzero(dtype=np.dtype(np.intp)) + return op(a) diff --git a/python/xorbits/_mars/tensor/indexing/setitem.py b/python/xorbits/_mars/tensor/indexing/setitem.py new file mode 100644 index 000000000..74a89171d --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/setitem.py @@ -0,0 +1,372 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +import operator +from numbers import Integral +from typing import Union + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...core.context import Context +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, BoolField, KeyField, TupleField +from ...tensor import tensor as astensor +from ...utils import has_unknown_shape +from ..base import broadcast_to +from ..core import TENSOR_TYPE, TensorOrder +from ..operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ..utils import broadcast_shape, filter_inputs +from .core import process_index + + +class TensorIndexSetValue(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.INDEXSETVALUE + + input = KeyField("input") + indexes = TupleField("indexes") + value = AnyField("value") + is_fancy_index = BoolField("is_fancy_index") + input_nsplits = TupleField("input_nsplits") + chunk_offsets = TupleField("chunk_offsets") + shuffle_axes = TupleField("shuffle_axes") + + def __init__( + self, + indexes=None, + value=None, + is_fancy_index=None, + input_nsplits=None, + chunk_offsets=None, + shuffle_axes=None, + **kw, + ): + super().__init__( + indexes=indexes, + value=value, + is_fancy_index=is_fancy_index, + input_nsplits=input_nsplits, + chunk_offsets=chunk_offsets, + shuffle_axes=shuffle_axes, + **kw, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if self.stage == OperandStage.reduce: + self.input = self._inputs[0] + return + elif self.stage == OperandStage.map: + inputs_iter = iter(self._inputs) + else: + self.input = self._inputs[0] + inputs_iter = iter(self._inputs[1:]) + new_indexes = [ + next(inputs_iter) if isinstance(index, ENTITY_TYPE) else index + for index in self.indexes + ] + self.indexes = tuple(new_indexes) + if isinstance(self.value, ENTITY_TYPE): + self.value = next(inputs_iter) + + def __call__(self, a, index, value): + inputs = filter_inputs([a] + list(index) + [value]) + self.indexes = tuple(index) + self.value = value + return self.new_tensor(inputs, a.shape, order=a.order) + + def on_output_modify(self, new_output): + return new_output + + def on_input_modify(self, new_input): + new_op = self.copy().reset_key() + new_inputs = [new_input] + self.inputs[1:] + return new_op.new_tensor(new_inputs, shape=self.outputs[0].shape) + + @classmethod + def _tile_fancy_index(cls, op: "TensorIndexSetValue"): + from ..utils import unify_chunks + + tensor = op.outputs[0] + inp = op.inputs[0] + value = op.value + indexes = op.indexes + + if has_unknown_shape(inp): + yield + + fancy_indexes = [index for index in indexes if isinstance(index, ENTITY_TYPE)] + shape = broadcast_shape(*[ind.shape for ind in fancy_indexes]) + fancy_indexes = [broadcast_to(ind, shape) for ind in fancy_indexes] + if isinstance(value, ENTITY_TYPE): + value = broadcast_to(value, shape) + value, *fancy_indexes = yield from unify_chunks(value, *fancy_indexes) + value = value.chunks + else: + fancy_indexes = yield from unify_chunks(*fancy_indexes) + value = [value] * len(fancy_indexes[0].chunks) + input_nsplits = inp.nsplits + shuffle_axes = tuple( + axis for axis, ind in enumerate(indexes) if isinstance(ind, ENTITY_TYPE) + ) + + map_chunks = [] + for value_chunk, *index_chunks in zip( + value, *[index.chunks for index in fancy_indexes] + ): + map_op = TensorIndexSetValue( + stage=OperandStage.map, + input_nsplits=input_nsplits, + value=value_chunk, + indexes=tuple(index_chunks), + shuffle_axes=shuffle_axes, + dtype=tensor.dtype, + ) + inputs = filter_inputs([value_chunk] + list(index_chunks)) + map_chunk = 
map_op.new_chunk( + inputs, + shape=(np.nan,), + index=index_chunks[0].index, + order=TensorOrder.C_ORDER, + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy(dtype=tensor.dtype).new_chunk( + map_chunks, shape=(), order=TensorOrder.C_ORDER + ) + + reducer_chunks = [] + offsets_on_axis = [np.cumsum([0] + list(split)) for split in input_nsplits] + for input_chunk in inp.chunks: + chunk_offsets = tuple( + offsets_on_axis[axis][input_chunk.index[axis]] + for axis in range(len(inp.shape)) + ) + reducer_op = TensorIndexSetValue( + stage=OperandStage.reduce, + n_reducers=len(inp.chunks), + dtype=input_chunk.dtype, + shuffle_axes=shuffle_axes, + chunk_offsets=chunk_offsets, + ) + reducer_chunk = reducer_op.new_chunk( + [input_chunk, proxy_chunk], + index=input_chunk.index, + shape=input_chunk.shape, + order=input_chunk.order, + ) + reducer_chunks.append(reducer_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=reducer_chunks, + nsplits=op.input.nsplits, + ) + + @classmethod + def _tile(cls, op: "TensorIndexSetValue"): + from ..base import broadcast_to + from .getitem import _getitem_nocheck + + tensor = op.outputs[0] + value = op.value + indexed = yield from recursive_tile( + _getitem_nocheck(op.input, op.indexes, convert_bool_to_fancy=False) + ) + is_value_tensor = isinstance(value, TENSOR_TYPE) + + if is_value_tensor and value.ndim > 0: + if has_unknown_shape(indexed, value): + exec_chunks = indexed.chunks + op.input.chunks + for c in indexed.chunks: + exec_chunks.extend(c.inputs) + yield exec_chunks + [indexed] + + nsplits = indexed.nsplits + value = yield from recursive_tile( + broadcast_to(value, indexed.shape) + .astype(op.input.dtype, copy=False) + .rechunk(nsplits) + ) + + chunk_mapping = {c.op.input.index: c for c in indexed.chunks} + out_chunks = [] + for chunk in indexed.op.input.chunks: + index_chunk = chunk_mapping.get(chunk.index) + if index_chunk is None: + out_chunks.append(chunk) + continue + + if is_value_tensor: + if value.ndim > 0: + value_chunk = value.cix[index_chunk.index] + else: + value_chunk = value.chunks[0] + else: + # non tensor + value_chunk = value + chunk_op = TensorIndexSetValue( + dtype=op.dtype, + sparse=op.sparse, + indexes=tuple(index_chunk.op.indexes), + value=value_chunk, + ) + chunk_inputs = filter_inputs( + [chunk] + index_chunk.op.indexes + [value_chunk] + ) + out_chunk = chunk_op.new_chunk( + chunk_inputs, shape=chunk.shape, index=chunk.index, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=op.input.nsplits, + ) + + @classmethod + def tile(cls, op: "TensorIndexSetValue"): + if op.is_fancy_index: + return (yield from cls._tile_fancy_index(op)) + else: + return (yield from cls._tile(op)) + + @classmethod + def execute(cls, ctx: Union[dict, Context], op: "TensorIndexSetValue"): + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + return cls._execute_reduce(ctx, op) + else: + return cls._execute(ctx, op) + + @classmethod + def _execute(cls, ctx, op): + indexes = [ + ctx[index.key] if hasattr(index, "key") else index for index in op.indexes + ] + input_ = ctx[op.inputs[0].key].copy() + value = ctx[op.value.key] if hasattr(op.value, "key") else op.value + if hasattr(input_, "flags") and not input_.flags.writeable: + input_.setflags(write=True) + input_[tuple(indexes)] = value + 
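# A minimal, self-contained NumPy sketch (separate from the operand code around
# it) of how TensorIndexSetValue lands values into chunks. The plain path right
# here simply assigns into a writable copy of the chunk; the fancy-index path
# tiled in _tile_fancy_index above keeps, for each destination chunk, only the
# index/value pairs whose coordinates fall inside that chunk's range on every
# shuffled axis, then rebases them by the chunk offset before assigning (the
# map and reduce stages that follow). Names below are illustrative only.
import numpy as np

data = np.zeros(6, dtype=int)
nsplits = (3, 3)                          # two chunks along axis 0
offsets = np.cumsum((0,) + nsplits)       # array([0, 3, 6])
index = np.array([1, 4, 5])               # global fancy index
value = np.array([10, 40, 50])

chunks = [data[offsets[i]:offsets[i + 1]].copy() for i in range(len(nsplits))]
for i, chunk in enumerate(chunks):
    start, end = offsets[i], offsets[i + 1]
    mask = (index >= start) & (index < end)   # "map": filter to this chunk
    chunk[index[mask] - start] = value[mask]  # "reduce": rebase and assign

# np.concatenate(chunks) -> array([ 0, 10,  0,  0, 40, 50])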
ctx[op.outputs[0].key] = input_ + + @classmethod + def _execute_map(cls, ctx, op): + nsplits = op.input_nsplits + shuffle_axes = op.shuffle_axes + all_inputs = [ctx[inp.key] for inp in op.inputs] + if hasattr(op.value, "key"): + value = ctx[op.value.key] + indexes = all_inputs[1:] + else: + value = op.value + indexes = all_inputs + + offsets_on_axis = [np.cumsum([0] + list(split)) for split in nsplits] + for reducer_index in itertools.product( + *(map(range, [len(s) for s in nsplits])) + ): + chunk_filters = [] + indexes_iter = iter(indexes) + for axis, _ in enumerate(reducer_index): + start = offsets_on_axis[axis][reducer_index[axis]] + end = offsets_on_axis[axis][reducer_index[axis] + 1] + if axis in shuffle_axes: + index_on_axis = next(indexes_iter) + filtered = (index_on_axis >= start) & (index_on_axis < end) + chunk_filters.append(filtered) + combined_filter = functools.reduce(operator.and_, chunk_filters) + if hasattr(op.value, "key"): + ctx[op.outputs[0].key, reducer_index] = tuple( + inp[combined_filter] for inp in all_inputs + ) + else: + ctx[op.outputs[0].key, reducer_index] = tuple( + [value] + [inp[combined_filter] for inp in all_inputs] + ) + + @classmethod + def _execute_reduce(cls, ctx, op): + input_data = ctx[op.inputs[0].key].copy() + for index_value in op.iter_mapper_data(ctx, input_id=1): + value = index_value[0] + indexes_with_offset = index_value[1:] + indexes = [] + index_iter = iter(indexes_with_offset) + for axis in range(input_data.ndim): + if axis in op.shuffle_axes: + indexes.append(next(index_iter) - op.chunk_offsets[axis]) + input_data[tuple(indexes)] = value + + ctx[op.outputs[0].key] = input_data + + +def _check_support(indexes): + if all( + ( + isinstance(ix, (TENSOR_TYPE, np.ndarray)) + and ix.dtype != np.bool_ + or isinstance(ix, slice) + and ix == slice(None) + ) + for ix in indexes + ): + if any(isinstance(ix, (TENSOR_TYPE, np.ndarray)) for ix in indexes): + return True + for index in indexes: + if isinstance(index, (slice, Integral)): + pass + elif isinstance(index, (np.ndarray, TENSOR_TYPE)) and index.dtype == np.bool_: + pass + else: # pragma: no cover + raise NotImplementedError( + "Only slice, int, or bool indexing " + f"supported by now, got {type(index)}" + ) + return False + + +def _setitem(a, item, value): + index = process_index(a.ndim, item, convert_bool_to_fancy=False) + if not (np.isscalar(value) or (isinstance(value, tuple) and a.dtype.fields)): + # do not convert for tuple when dtype is record type. + value = astensor(value) + + is_fancy_index = _check_support(index) + if is_fancy_index: + index = [astensor(ind) if isinstance(ind, np.ndarray) else ind for ind in index] + + # __setitem__ on a view should be still a view, see GH #732. + op = TensorIndexSetValue( + dtype=a.dtype, + sparse=a.issparse(), + is_fancy_index=is_fancy_index, + indexes=tuple(index), + value=value, + create_view=a.op.create_view, + ) + ret = op(a, index, value) + a.data = ret.data diff --git a/python/xorbits/_mars/tensor/indexing/slice.py b/python/xorbits/_mars/tensor/indexing/slice.py new file mode 100644 index 000000000..cd8ba473c --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/slice.py @@ -0,0 +1,68 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ... import opcodes as OperandDef +from ...serialization.serializables import KeyField, ListField +from ..array_utils import get_array_module +from ..core import TensorOrder +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorSlice(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SLICE + + _input = KeyField("input") + _slices = ListField("slices") + + def __init__(self, slices=None, **kw): + super().__init__(_slices=slices, **kw) + + @property + def slices(self): + return self._slices + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def _get_order(self, kw, i): + order = kw.pop("order", None) + if order is None: + inp = self.input + if inp is None or inp.order == TensorOrder.C_ORDER: + return TensorOrder.C_ORDER + + for shape, slc in zip(inp.shape, self._slices): + if slc is None: + continue + s = slc.indices(shape) + if s[0] == 0 and s[1] == shape and s[2] == 1: + continue + else: + return TensorOrder.C_ORDER + + return inp.order + + return order[i] if isinstance(order, (list, tuple)) else order + + @classmethod + def execute(cls, ctx, op): + inp = ctx[op.inputs[0].key] + if op.input.ndim == 0 and not hasattr(inp, "shape"): + # scalar, but organize it into an array + inp = get_array_module(inp).array(inp) + x = inp[tuple(op.slices)] + out = op.outputs[0] + ctx[out.key] = x.astype(x.dtype, order=out.order.value, copy=False) diff --git a/python/xorbits/_mars/tensor/indexing/take.py b/python/xorbits/_mars/tensor/indexing/take.py new file mode 100644 index 000000000..f2b2a5ba7 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/take.py @@ -0,0 +1,128 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from ..utils import check_out_param, validate_axis + + +def take(a, indices, axis=None, out=None): + """ + Take elements from a tensor along an axis. + + When axis is not None, this function does the same thing as "fancy" + indexing (indexing arrays using tensors); however, it can be easier to use + if you need elements along a given axis. A call such as + ``mt.take(arr, indices, axis=3)`` is equivalent to + ``arr[:,:,:,indices,...]``. 
+ + Explained without fancy indexing, this is equivalent to the following use + of `ndindex`, which sets each of ``ii``, ``jj``, and ``kk`` to a tuple of + indices:: + + Ni, Nk = a.shape[:axis], a.shape[axis+1:] + Nj = indices.shape + for ii in ndindex(Ni): + for jj in ndindex(Nj): + for kk in ndindex(Nk): + out[ii + jj + kk] = a[ii + (indices[jj],) + kk] + + Parameters + ---------- + a : array_like (Ni..., M, Nk...) + The source tensor. + indices : array_like (Nj...) + The indices of the values to extract. + + Also allow scalars for indices. + axis : int, optional + The axis over which to select values. By default, the flattened + input tensor is used. + out : Tensor, optional (Ni..., Nj..., Nk...) + If provided, the result will be placed in this tensor. It should + be of the appropriate shape and dtype. + mode : {'raise', 'wrap', 'clip'}, optional + Specifies how out-of-bounds indices will behave. + + * 'raise' -- raise an error (default) + * 'wrap' -- wrap around + * 'clip' -- clip to the range + + 'clip' mode means that all indices that are too large are replaced + by the index that addresses the last element along that axis. Note + that this disables indexing with negative numbers. + + Returns + ------- + out : Tensor (Ni..., Nj..., Nk...) + The returned tensor has the same type as `a`. + + See Also + -------- + compress : Take elements using a boolean mask + Tensor.take : equivalent method + + Notes + ----- + + By eliminating the inner loop in the description above, and using `s_` to + build simple slice objects, `take` can be expressed in terms of applying + fancy indexing to each 1-d slice:: + + Ni, Nk = a.shape[:axis], a.shape[axis+1:] + for ii in ndindex(Ni): + for kk in ndindex(Nj): + out[ii + s_[...,] + kk] = a[ii + s_[:,] + kk][indices] + + For this reason, it is equivalent to (but faster than) the following use + of `apply_along_axis`:: + + out = mt.apply_along_axis(lambda a_1d: a_1d[indices], axis, a) + + Examples + -------- + >>> import mars.tensor as mt + >>> a = [4, 3, 5, 7, 6, 8] + >>> indices = [0, 1, 4] + >>> mt.take(a, indices).execute() + array([4, 3, 6]) + + In this example if `a` is a tensor, "fancy" indexing can be used. + + >>> a = mt.array(a) + >>> a[indices].execute() + array([4, 3, 6]) + + If `indices` is not one dimensional, the output also has these dimensions. + + >>> mt.take(a, [[0, 1], [2, 3]]).execute() + array([[4, 3], + [5, 7]]) + """ + a = astensor(a) + if axis is None: + t = a.ravel()[indices] + else: + axis = validate_axis(a.ndim, axis) + t = a[(slice(None),) * axis + (indices,)] + + if out is None: + return t + + if out.shape != t.shape: + raise ValueError( + f"output tensor has wrong shape, expect: {t.shape}, got: {out.shape}" + ) + check_out_param(out, t, "unsafe") + out.data = t.data + return out diff --git a/python/xorbits/_mars/tensor/indexing/tests/__init__.py b/python/xorbits/_mars/tensor/indexing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/indexing/tests/test_indexing.py b/python/xorbits/_mars/tensor/indexing/tests/test_indexing.py new file mode 100644 index 000000000..28217b953 --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/tests/test_indexing.py @@ -0,0 +1,414 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....config import option_context +from ....core import tile +from ...base.broadcast_to import TensorBroadcastTo +from ...datasource import array, empty, ones, tensor +from ...datasource.ones import TensorOnes +from ...merge.concatenate import TensorConcatenate +from .. import choose, compress, fill_diagonal, nonzero, unravel_index +from ..setitem import TensorIndexSetValue + + +def test_bool_indexing(): + t = ones((100, 200, 300)) + indexed = t[t < 2] + assert len(indexed.shape) == 1 + assert np.isnan(indexed.shape[0]) + + t2 = ones((100, 200)) + indexed = t[t2 < 2] + assert len(indexed.shape) == 2 + assert np.isnan(indexed.shape[0]) + assert indexed.shape[1] == 300 + + t2 = ones((100, 200)) + indexed = t[t2 < 2] + 1 + assert len(indexed.shape) == 2 + assert np.isnan(indexed.shape[0]) + assert indexed.shape[1] == 300 + + t2 = ones((10, 20)) + rs = np.random.RandomState(0) + i1 = np.zeros(10, dtype=bool) + i1[rs.permutation(np.arange(10))[:5]] = True + i2 = np.zeros(20, dtype=bool) + i2[rs.permutation(np.arange(20))[:5]] = True + indexed = t2[i1, i2] + assert len(indexed.shape) == 1 + assert indexed.shape[0] == 5 + + t2 = tile(indexed) + assert t2.chunks[0].index == (0,) + + t3 = ones((101, 200)) + with pytest.raises(IndexError) as cm: + _ = t[t3 < 2] # noqa: F841 + e = cm.value.args[0] + assert "along dimension 0" in e + assert "dimension is 100 but corresponding boolean dimension is 101" in e + + t4 = ones((100, 201)) + with pytest.raises(IndexError) as cm: + _ = t[t4 < 2] # noqa: F841 + e = cm.value.args[0] + assert "along dimension 1" in e + assert "dimension is 200 but corresponding boolean dimension is 201" in e + + +def test_slice(): + t = ones((100, 200, 300)) + t2 = t[10:30, 199:, -30:303] + assert t2.shape == (20, 1, 30) + + t3 = t[10:90:4, 20:80:5] + s1 = len(list(range(100))[10:90:4]) + s2 = len(list(range(200))[20:80:5]) + assert t3.shape == (s1, s2, 300) + + +def test_fancy_indexing(): + t = ones((100, 200, 300)) + t2 = t[[0, 1], [2, 3]] + assert t2.shape == (2, 300) + + t3 = t[[[0, 1], [2, 3]], [4, 5]] + assert t3.shape == (2, 2, 300) + + with pytest.raises(IndexError) as cm: + _ = t[[1, 2], [3, 4, 5]] # noqa: F841 + e = cm.value.args[0] + assert ( + e == "shape mismatch: indexing arrays could not be broadcast " + "together with shapes (2,) (3,)" + ) + + with pytest.raises(IndexError): + t[[100]] + + t = ones((100, 200, 300), chunk_size=10) + + # fancy index on numpy ndarrays + + t4 = tile(t[:10, -10:, [13, 244, 151, 242, 34]]) + assert t4.shape == (10, 10, 5) + assert 
t4.chunk_shape == (1, 1, 1) + + t5 = tile(t[:10, -10:, [1, 10, 20, 33, 34, 200]]) + assert t5.shape == (10, 10, 6) + assert t5.chunk_shape == (1, 1, 5) + + t6 = tile(t[[20, 1, 33, 22, 11], :15, [255, 211, 2, 11, 121]]) + assert t6.shape == (5, 15) + # need a concat, because the fancy indexes are not ascending according to chunk index + assert t6.chunk_shape == (1, 2) + assert t6.chunks[0].ndim == 2 + assert t6.nsplits == ((5,), (10, 5)) + + t7 = tile(t[[5, 6, 33, 66], :15, [0, 9, 2, 11]]) + assert t7.shape == (4, 15) + # not need a concat + assert t7.chunk_shape == (3, 2) + assert t7.chunks[0].ndim == 2 + assert t7.nsplits == ((2, 1, 1), (10, 5)) + + t8 = tile(t[[[5, 33], [66, 6]], :15, [255, 11]]) + assert t8.shape == (2, 2, 15) + assert t8.chunk_shape == (1, 1, 2) + assert t8.chunks[0].ndim == 3 + assert t8.nsplits == ((2,), (2,), (10, 5)) + + # fancy index on tensors + + t9 = tile(t[:10, -10:, tensor([13, 244, 151, 242, 34], chunk_size=2)]) + assert t9.shape == (10, 10, 5) + assert t9.chunk_shape == (1, 1, 3) + + t10 = tile(t[:10, -10:, tensor([1, 10, 20, 33, 34, 200], chunk_size=4)]) + assert t10.shape == (10, 10, 6) + assert t10.chunk_shape == (1, 1, 2) + + t11 = tile( + t[ + tensor([20, 1, 33, 22, 11], chunk_size=2), + :15, + tensor([255, 211, 2, 11, 121], chunk_size=3), + ] + ) + assert t11.shape == (5, 15) + # need a concat, because the fancy indexes are not ascending according to chunk index + assert t11.chunk_shape == (4, 2) + assert t11.chunks[0].ndim == 2 + assert t11.nsplits == ((2, 1, 1, 1), (10, 5)) + + t12 = tile(t[tensor([5, 6, 33, 66], chunk_size=2), :15, [0, 9, 2, 11]]) + assert t12.shape == (4, 15) + # not need a concat + assert t12.chunk_shape == (2, 2) + assert t12.chunks[0].ndim == 2 + assert t12.nsplits == ((2, 2), (10, 5)) + + t13 = tile(t[tensor([[5, 33], [66, 6]]), :15, tensor([255, 11])]) + assert t13.shape == (2, 2, 15) + assert t13.chunk_shape == (1, 1, 2) + assert t13.chunks[0].ndim == 3 + assert t13.nsplits == ((2,), (2,), (10, 5)) + + +def test_mixed_indexing(): + t = ones((100, 200, 300, 400)) + + with pytest.raises(IndexError): + _ = t[ones((100, 200), dtype=float)] # noqa: F841 + + t2 = t[ones(100) < 2, ..., 20::101, 2] + assert len(t2.shape) == 3 + assert np.isnan(t2.shape[0]) + + t3 = ones((2, 3, 4, 5)) + t4 = t3[1] + assert t4.flags["C_CONTIGUOUS"] == np.ones((2, 3, 4, 5))[1].flags["C_CONTIGUOUS"] + assert t4.flags["F_CONTIGUOUS"] == np.ones((2, 3, 4, 5))[1].flags["F_CONTIGUOUS"] + + +def test_bool_indexing_tiles(): + t = ones((100, 200, 300), chunk_size=30) + indexed = t[t < 2] + indexed, t = tile(indexed, t) + + assert len(indexed.chunks) == 280 + assert indexed.chunks[0].index == (0,) + assert indexed.chunks[20].index == (20,) + assert indexed.chunks[20].inputs[0] is t.cix[(0, 2, 0)].data + assert indexed.chunks[20].inputs[1] is indexed.op.indexes[0].cix[0, 2, 0].data + + t = ones((100, 200, 300), chunk_size=30) + t2 = ones((100, 200), chunk_size=30) + indexed2 = t[t2 < 2] + indexed2, t = tile(indexed2, t) + + assert len(indexed2.chunks) == 280 + assert len(indexed2.chunks[0].shape) == 2 + assert np.isnan(indexed2.chunks[0].shape[0]) + assert indexed2.chunks[0].shape[1] == 30 + assert indexed2.chunks[20].inputs[0] == t.cix[(0, 2, 0)].data + assert indexed2.chunks[20].inputs[1] == indexed2.op.indexes[0].cix[0, 2].data + + +def test_slice_tiles(): + t = ones((100, 200, 300), chunk_size=30) + t2 = t[10:40, 199:, -30:303] + t, t2 = tile(t, t2) + + assert t2.chunk_shape == (2, 1, 1) + assert t2.chunks[0].inputs[0] == t.cix[0, -1, -1].data + assert 
t2.chunks[0].op.indexes == [slice(10, 30, 1), slice(19, 20, 1), slice(None)] + assert t2.chunks[0].index == (0, 0, 0) + assert t2.chunks[1].inputs[0] == t.cix[1, -1, -1].data + assert t2.chunks[1].op.indexes == [slice(0, 10, 1), slice(19, 20, 1), slice(None)] + assert t2.chunks[1].index == (1, 0, 0) + + +def test_indices_indexing_tiles(): + t = ones((10, 20, 30), chunk_size=(2, 20, 30)) + t2 = t[3] + t, t2 = tile(t, t2) + + assert len(t2.chunks) == 1 + assert t2.chunks[0].inputs[0] is t.cix[1, 0, 0].data + assert t2.chunks[0].op.indexes[0] == 1 + + t = ones((10, 20, 30), chunk_size=(2, 20, 30)) + t3 = t[4] + t, t3 = tile(t, t3) + + assert len(t3.chunks) == 1 + assert t3.chunks[0].inputs[0] is t.cix[2, 0, 0].data + assert t3.chunks[0].op.indexes[0] == 0 + + +def test_mixed_indexing_tiles(): + t = ones((100, 200, 300, 400), chunk_size=24) + + cmp = ones(400, chunk_size=24) < 2 + t2 = t[10:90:3, 5, ..., None, cmp] + t2, cmp = tile(t2, cmp) + + assert t2.shape[:-1] == (27, 300, 1) + assert np.isnan(t2.shape[-1]) + assert t2.chunk_shape == (4, 13, 1, 17) + assert t2.chunks[0].op.indexes == [ + slice(10, 24, 3), + 5, + slice(None), + None, + cmp.cix[(0,)].data, + ] + + +def test_setitem(): + shape = (10, 20, 30, 40) + t = ones(shape, chunk_size=5, dtype="i4") + t[5:20:3, 5, ..., :-5] = 2.2 + + assert isinstance(t.op, TensorIndexSetValue) + assert t.shape == shape + assert isinstance(t.inputs[0].op.outputs[0].op, TensorOnes) + + t = tile(t) + assert isinstance(t.chunks[0].op, TensorOnes) + assert isinstance(t.cix[1, 1, 0, 0].op, TensorIndexSetValue) + assert t.cix[1, 1, 0, 0].op.value == 2.2 + + t2 = ones(shape, chunk_size=5, dtype="i4") + shape = t2[5:20:3, 5, ..., :-5].shape + t2[5:20:3, 5, ..., :-5] = ones(shape, chunk_size=4, dtype="i4") * 2 + + t2 = tile(t2) + assert isinstance(t2.chunks[0].op, TensorOnes) + assert isinstance(t2.cix[1, 1, 0, 0].op, TensorIndexSetValue) + assert isinstance(t2.cix[1, 1, 0, 0].op.value.op, TensorConcatenate) + + +def test_setitem_structured(): + # Check to value is properly broadcast for `setitem` on complex record dtype arrays. 
+ rec_type = np.dtype( + [ + ("a", np.int32), + ("b", np.double), + ("c", np.dtype([("a", np.int16), ("b", np.int64)])), + ] + ) + + t = ones((4, 5), dtype=rec_type, chunk_size=3) + + # assign tuple to record + t[1:4, 1] = (3, 4.0, (5, 6)) + tt = tile(t) + assert tt.cix[0, 0].op.value == (3, 4.0, (5, 6)) + + # assign scalar to record + t[1:4, 2] = 8 + tt = tile(t) + assert tt.cix[0, 0].op.value == 8 + + # assign scalar array to record array with broadcast + t[1:3] = np.arange(5) + tt = tile(t) + slices_op = tt.cix[0, 0].op.value.op + assert slices_op.slices == [slice(None, None, None), slice(0, 3, None)] + broadcast_op = slices_op.inputs[0].op.inputs[0].op + assert isinstance(broadcast_op, TensorBroadcastTo) + assert broadcast_op.shape == (2, 5) + np.testing.assert_array_equal(broadcast_op.inputs[0].op.data, np.arange(5)) + + # assign scalar array to record array of same shape, no broadcast + t[2:4] = np.arange(10).reshape(2, 5) + tt = tile(t) + slices_op = tt.cix[0, 0].op.value.op + assert slices_op.slices == [slice(0, 1, None), slice(0, 3, None)] + np.testing.assert_array_equal( + slices_op.inputs[0].op.inputs[0].op.data, np.arange(10).reshape(2, 5) + ) + + +def test_choose(): + with option_context() as options: + options.chunk_size = 2 + + choices = [[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]] + a = choose([2, 3, 1, 0], choices) + + a = tile(a) + assert len(a.chunks) == 2 + assert isinstance(a.chunks[0].op, type(a.op)) + assert len(a.chunks[0].inputs) == 5 + + with pytest.raises(TypeError): + choose([2, 3, 1, 0], choices, out=1) + + with pytest.raises(ValueError): + choose([2, 3, 1, 0], choices, out=tensor(np.empty((1, 4)))) + + +def test_unravel_index(): + indices = tensor([22, 41, 37], chunk_size=1) + t = unravel_index(indices, (7, 6)) + + assert len(t) == 2 + + t = [tile(r) for r in t] + + assert len(t[0].chunks) == 3 + assert len(t[1].chunks) == 3 + + with pytest.raises(TypeError): + unravel_index([22, 41, 37], (7, 6), order="B") + + +def test_nonzero(): + x = tensor([[1, 0, 0], [0, 2, 0], [1, 1, 0]], chunk_size=2) + y = nonzero(x) + + assert len(y) == 2 + + tile(y[0]) + + +def test_compress(): + a = np.array([[1, 2], [3, 4], [5, 6]]) + + with pytest.raises(TypeError): + compress([0, 1], a, axis=0, out=1) + + with pytest.raises(TypeError): + compress( + [0, 1], + array([[1, 2], [3, 4], [5, 6]], dtype="i8"), + axis=0, + out=empty((1, 2), dtype="f8"), + ) + + +def test_operand_key(): + t = ones((10, 2), chunk_size=5) + t_slice1 = t[:5] + t_slice2 = t[5:] + + assert t_slice1.op.key != t_slice2.op.key + + +def test_fill_diagonal(): + a = tensor(np.random.rand(10, 13)) + fill_diagonal(a, 10) + + assert a.shape == (10, 13) + + # must be Tensor + with pytest.raises(TypeError): + fill_diagonal(np.random.rand(11, 10), 1) + + # at least 2-d required + with pytest.raises(ValueError): + a = tensor(np.random.rand(4)) + fill_diagonal(a, 1) + + # for more than 2-d, shape on each dimension should be equal + with pytest.raises(ValueError): + a = tensor(np.random.rand(11, 10, 11)) + fill_diagonal(a, 1) diff --git a/python/xorbits/_mars/tensor/indexing/tests/test_indexing_execution.py b/python/xorbits/_mars/tensor/indexing/tests/test_indexing_execution.py new file mode 100644 index 000000000..74d70d88c --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/tests/test_indexing_execution.py @@ -0,0 +1,897 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....config import options +from ... import hstack, mod, stack +from ...datasource import arange, tensor, zeros +from .. import ( + choose, + compress, + extract, + fill_diagonal, + flatnonzero, + nonzero, + take, + unravel_index, +) + + +def test_bool_indexing_execution(setup): + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=6) + + index = arr < 0.5 + arr2 = arr[index] + # size_res = self.executor.execute_tensor(arr2, mock=True) + res = arr2.execute().fetch() + + # assert sum(s[0] for s in size_res) == arr.nbytes + np.testing.assert_array_equal(np.sort(res), np.sort(raw[raw < 0.5])) + + index2 = tensor(raw[:, :, 0, 0], chunk_size=3) < 0.5 + arr3 = arr[index2] + res = arr3.execute().fetch() + + expected = raw[raw[:, :, 0, 0] < 0.5] + assert sum(it.size for it in res) == expected.size + assert res.shape == expected.shape + + raw = np.asfortranarray(np.random.random((11, 8, 12, 14))) + arr = tensor(raw, chunk_size=3) + + index = tensor(raw[:, :, 0, 0], chunk_size=3) < 0.5 + arr2 = arr[index] + res = arr2.execute().fetch() + expected = raw[raw[:, :, 0, 0] < 0.5].copy("A") + + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_fancy_indexing_numpy_execution(setup): + # test fancy index of type numpy ndarray + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=(6, 5, 7, 8)) + + index = [9, 10, 3, 1, 8, 10] + arr2 = arr[index] + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw[index]) + + index = np.random.permutation(8) + arr3 = arr[:2, ..., index] + + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, raw[:2, ..., index]) + + index = [1, 3, 9, 10] + arr4 = arr[..., index, :5] + + res = arr4.execute().fetch() + np.testing.assert_array_equal(res, raw[..., index, :5]) + + index1 = [8, 10, 3, 1, 9, 10] + index2 = [1, 3, 9, 10, 2, 7] + arr5 = arr[index1, :, index2] + + res = arr5.execute().fetch() + np.testing.assert_array_equal(res, raw[index1, :, index2]) + + index1 = [1, 3, 5, 7, 9, 10] + index2 = [1, 9, 9, 10, 2, 7] + arr6 = arr[index1, :, index2] + + res = arr6.execute().fetch() + np.testing.assert_array_equal(res, raw[index1, :, index2]) + + index1 = [[8, 10, 3], [1, 9, 10]] + index2 = [[1, 3, 9], [10, 2, 7]] + arr7 = arr[index1, :, index2] + + res = arr7.execute().fetch() + np.testing.assert_array_equal(res, raw[index1, :, index2]) + + index1 = [[1, 3], [3, 7], [7, 7]] + index2 = [1, 9] + arr8 = arr[0, index1, :, index2] + + res = arr8.execute().fetch() + np.testing.assert_array_equal(res, raw[0, index1, :, index2]) + + +def test_fancy_indexing_tensor_execution(setup): + # test fancy index of type tensor + + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=(6, 5, 7, 8)) + + raw_index = [8, 10, 3, 1, 9, 10] + index = tensor(raw_index, chunk_size=4) + arr2 = arr[index] + + res = arr2.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index]) + + raw_index = np.random.permutation(8) + index = 
tensor(raw_index, chunk_size=3) + arr3 = arr[:2, ..., index] + + res = arr3.execute().fetch() + np.testing.assert_array_equal(res, raw[:2, ..., raw_index]) + + raw_index = [1, 3, 9, 10] + index = tensor(raw_index) + arr4 = arr[..., index, :5] + + res = arr4.execute().fetch() + np.testing.assert_array_equal(res, raw[..., raw_index, :5]) + + raw_index1 = [8, 10, 3, 1, 9, 10] + raw_index2 = [1, 3, 9, 10, 2, 7] + index1 = tensor(raw_index1, chunk_size=4) + index2 = tensor(raw_index2, chunk_size=3) + arr5 = arr[index1, :, index2] + + res = arr5.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2]) + + raw_index1 = [1, 3, 5, 7, 9, 10] + raw_index2 = [1, 9, 9, 10, 2, 7] + index1 = tensor(raw_index1, chunk_size=3) + index2 = tensor(raw_index2, chunk_size=4) + arr6 = arr[index1, :, index2] + + res = arr6.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2]) + + raw_index1 = [[8, 10, 3], [1, 9, 10]] + raw_index2 = [[1, 3, 9], [10, 2, 7]] + index1 = tensor(raw_index1) + index2 = tensor(raw_index2, chunk_size=2) + arr7 = arr[index1, :, index2] + + res = arr7.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index1, :, raw_index2]) + + raw_index1 = [[1, 3], [3, 7], [7, 7]] + raw_index2 = [1, 9] + index1 = tensor(raw_index1, chunk_size=(2, 1)) + index2 = tensor(raw_index2) + arr8 = arr[0, index1, :, index2] + + res = arr8.execute().fetch() + np.testing.assert_array_equal(res, raw[0, raw_index1, :, raw_index2]) + + raw_a = np.random.rand(30, 30) + a = tensor(raw_a, chunk_size=(13, 17)) + b = a.argmax(axis=0) + c = a[b, arange(30)] + res = c.execute().fetch() + + np.testing.assert_array_equal(res, raw_a[raw_a.argmax(axis=0), np.arange(30)]) + + # test one chunk + arr = tensor(raw, chunk_size=20) + + raw_index = [8, 10, 3, 1, 9, 10] + index = tensor(raw_index, chunk_size=20) + arr9 = arr[index] + + res = arr9.execute().fetch() + np.testing.assert_array_equal(res, raw[raw_index]) + + raw_index1 = [[1, 3], [3, 7], [7, 7]] + raw_index2 = [1, 9] + index1 = tensor(raw_index1) + index2 = tensor(raw_index2) + arr10 = arr[0, index1, :, index2] + + res = arr10.execute().fetch() + np.testing.assert_array_equal(res, raw[0, raw_index1, :, raw_index2]) + + # test order + raw = np.asfortranarray(np.random.random((11, 8, 12, 14))) + arr = tensor(raw, chunk_size=(6, 5, 7, 8)) + + raw_index = [8, 10, 3, 1, 9, 10] + index = tensor(raw_index, chunk_size=4) + arr11 = arr[index] + + res = arr11.execute().fetch() + expected = raw[raw_index].copy("A") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_slice_execution(setup): + raw = np.random.random((11, 8, 12, 14)) + arr = tensor(raw, chunk_size=6) + + arr2 = arr[2:9:2, 3:7, -1:-9:-2, 12:-11:-4] + res = arr2.execute().fetch() + + np.testing.assert_array_equal(res, raw[2:9:2, 3:7, -1:-9:-2, 12:-11:-4]) + + arr3 = arr[-4, 2:] + res = arr3.execute().fetch() + np.testing.assert_equal(res, raw[-4, 2:]) + + raw = sps.random(12, 14, density=0.1) + arr = tensor(raw, chunk_size=6) + + arr2 = arr[-1:-9:-2, 12:-11:-4] + res = arr2.execute().fetch() + + np.testing.assert_equal(res.toarray(), raw.toarray()[-1:-9:-2, 12:-11:-4]) + + # test order + raw = np.asfortranarray(np.random.random((11, 8, 12, 14))) + arr = tensor(raw, chunk_size=6) + + arr2 = arr[2:9:2, 3:7, -1:-9:-2, 12:-11:-4] + res = arr2.execute().fetch() + expected = raw[2:9:2, 3:7, -1:-9:-2, 
12:-11:-4].copy("A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + arr3 = arr[0:13, :, None] + res = arr3.execute().fetch() + expected = raw[0:13, :, None].copy("A") + + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_mixed_indexing_execution(setup): + rs = np.random.RandomState(0) + raw = rs.random((11, 8, 12, 13)) + arr = tensor(raw, chunk_size=6) + + raw_cond = raw[0, :, 0, 0] < 0.5 + cond = tensor(raw[0, :, 0, 0], chunk_size=3) < 0.5 + arr2 = arr[10::-2, cond, None, ..., :5] + # size_res = self.executor.execute_tensor(arr2, mock=True) + res = arr2.execute().fetch() + + new_shape = list(arr2.shape) + new_shape[1] = cond.shape[0] + # assert sum(s[0] for s in size_res) == int(np.prod(new_shape) * arr2.dtype.itemsize) + np.testing.assert_array_equal(res, raw[10::-2, raw_cond, None, ..., :5]) + + b_raw = np.random.random(8) + raw_cond = b_raw < 0.5 + conds = [raw_cond, tensor(b_raw, chunk_size=2) < 0.5] + for cond in conds: + arr3 = arr[-2::-3, cond, ...] + res = arr3.execute().fetch() + + np.testing.assert_array_equal(res, raw[-2::-3, raw_cond, ...]) + + # test multiple bool index and fancy index + cond1 = np.zeros(11, dtype=bool) + cond1[rs.permutation(11)[:5]] = True + cond2 = np.zeros(12, dtype=bool) + cond2[rs.permutation(12)[:5]] = True + f3 = np.random.randint(13, size=5) + + expected = raw[cond1, ..., cond2, f3] + + t = arr[cond1, ..., cond2, f3] + res = t.execute().fetch() + np.testing.assert_array_equal(res, expected) + + t = arr[tensor(cond1), ..., tensor(cond2), tensor(f3)] + res = t.execute().fetch() + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.ray_dag +def test_setitem_fancy_index_execution(setup): + rs = np.random.RandomState(0) + + raw = rs.randint(0, 10, size=(11, 12)) + + # index is a ndarray, value is a scalar + arr = tensor(raw.copy(), chunk_size=5) + idx = rs.randint(0, 11, (5,)) + arr[idx] = 20 + res = arr.execute().fetch() + expected = raw.copy() + expected[idx] = 20 + np.testing.assert_array_equal(res, expected) + + # index is a tensor, value is a scalar + arr = tensor(raw.copy(), chunk_size=5) + raw_index = rs.randint(0, 11, (8,)) + idx = tensor(raw_index.copy(), chunk_size=5) + arr[idx] = 2 + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index] = 2 + np.testing.assert_array_equal(res, expected) + + # indexes are all tensors + arr = tensor(raw.copy(), chunk_size=6) + raw_index1 = rs.randint(0, 11, (20,)) + idx1 = tensor(raw_index1.copy(), chunk_size=8) + raw_index2 = rs.randint(0, 12, (20,)) + idx2 = tensor(raw_index2.copy(), chunk_size=8) + arr[idx1, idx2] = 2 + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index1, raw_index2] = 2 + np.testing.assert_array_equal(res, expected) + + # indexes all tensors, value is also a tensor + arr = tensor(raw.copy(), chunk_size=6) + raw_index1 = rs.randint(0, 11, (20,)) + idx1 = tensor(raw_index1.copy(), chunk_size=8) + raw_index2 = rs.randint(0, 12, (20,)) + idx2 = tensor(raw_index2.copy(), chunk_size=8) + raw_value = rs.randint(0, 10, (20,)) + arr[idx1, idx2] = tensor(raw_value, chunk_size=4) + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index1, raw_index2] = raw_value + np.testing.assert_array_equal(res, expected) + + raw = rs.randint(0, 10, size=(20,)) + 
arr = tensor(raw.copy(), chunk_size=6) + raw_index = rs.randint(0, 11, (9,)) + raw_value = rs.randint(0, 10, (9,)) + index = tensor(raw_index, chunk_size=3) + arr[index] = tensor(raw_value, chunk_size=4) + res = arr.execute().fetch() + expected = raw.copy() + expected[raw_index] = raw_value + np.testing.assert_array_equal(res, expected) + + # input's nsplits is unknown + raw = rs.randint(0, 10, size=(11, 11)) + arr = tensor(raw.copy(), chunk_size=6) + arr1 = arr[arr[0] < 20, :] + raw_index1 = rs.randint(0, 11, (10,)) + idx1 = tensor(raw_index1.copy(), chunk_size=3) + raw_index2 = rs.randint(0, 11, (10,)) + idx2 = tensor(raw_index2.copy(), chunk_size=4) + raw_value = rs.randint(100, 110, (10,)) + arr1[idx1, idx2] = tensor(raw_value, chunk_size=4) + res = arr1.execute().fetch() + expected = raw.copy() + expected = expected[expected[0] < 20, :] + expected[raw_index1, raw_index2] = raw_value + np.testing.assert_array_equal(res, expected) + + +def test_setitem_execution(setup): + rs = np.random.RandomState(0) + + raw = data = rs.randint(0, 10, size=(11, 8, 12, 13)) + arr = tensor(raw.copy(), chunk_size=6) + raw = raw.copy() + + idx = slice(2, 9, 2), slice(3, 7), slice(-1, -9, -2), 2 + arr[idx] = 20 + res = arr.execute().fetch() + + raw[idx] = 20 + np.testing.assert_array_equal(res, raw) + assert res.flags["C_CONTIGUOUS"] == raw.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == raw.flags["F_CONTIGUOUS"] + + raw = data + shape = raw[idx].shape + + arr2 = tensor(raw.copy(), chunk_size=6) + raw = raw.copy() + + replace = rs.randint(10, 20, size=shape[:-1] + (1,)).astype("f4") + arr2[idx] = tensor(replace, chunk_size=7) + res = arr2.execute().fetch() + + raw[idx] = replace + np.testing.assert_array_equal(res, raw) + + raw = np.asfortranarray(np.random.randint(0, 10, size=(11, 8, 12, 13))) + arr = tensor(raw.copy("A"), chunk_size=6) + raw = raw.copy("A") + + idx = slice(2, 9, 2), slice(3, 7), slice(-1, -9, -2), 2 + arr[idx] = 20 + res = arr.execute().fetch() + + raw[idx] = 20 + np.testing.assert_array_equal(res, raw) + assert res.flags["C_CONTIGUOUS"] == raw.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == raw.flags["F_CONTIGUOUS"] + + # test bool indexing set + raw = data + + arr = tensor(raw.copy(), chunk_size=6) + raw1 = rs.rand(11) + arr[tensor(raw1, chunk_size=4) < 0.6, 2:7] = 3 + res = arr.execute().fetch() + + raw[raw1 < 0.6, 2:7] = 3 + np.testing.assert_array_equal(res, raw) + + raw = np.random.randint(3, size=10).astype(np.int64) + raw2 = np.arange(3) + + arr = zeros((10, 3)) + arr[tensor(raw) == 1, tensor(raw2) == 1] = 1 + res = arr.execute().fetch() + + expected = np.zeros((10, 3)) + expected[raw == 1, raw2 == 1] = 1 + np.testing.assert_array_equal(res, expected) + + raw = data + + arr = tensor(raw.copy(), chunk_size=6) + raw1 = rs.rand(11) + set_data = rs.rand((raw1 < 0.8).sum(), 8, 12, 13) + arr[tensor(raw1, chunk_size=4) < 0.8] = tensor(set_data) + + res = arr.execute().fetch() + + raw[raw1 < 0.8] = set_data + np.testing.assert_array_equal(res, raw) + + # test error + with pytest.raises(ValueError): + t = tensor(raw, chunk_size=3) + t[0, 0, 0, 0] = zeros(2, chunk_size=10) + t.execute() + + +def test_setitem_structured_execution(setup): + rec_type = np.dtype( + [ + ("a", np.int32), + ("b", np.double), + ("c", np.dtype([("a", np.int16), ("b", np.int64)])), + ] + ) + + raw = np.zeros((4, 5), dtype=rec_type) + arr = tensor(raw.copy(), chunk_size=3) + + arr[1:4, 1] = (3, 4.0, (5, 6)) + arr[1:4, 2] = 8 + arr[1:3] = np.arange(5) + arr[2:4] = np.arange(10).reshape(2, 5) + 
arr[0] = np.arange(5) + + raw[1:4, 1] = (3, 4.0, (5, 6)) + raw[1:4, 2] = 8 + raw[1:3] = np.arange(5) + raw[2:4] = np.arange(10).reshape(2, 5) + raw[0] = np.arange(5) + + res = arr.execute().fetch() + assert arr.dtype == raw.dtype + assert arr.shape == raw.shape + np.testing.assert_array_equal(res, raw) + + +def test_take_execution(setup): + data = np.random.rand(10, 20, 30) + t = tensor(data, chunk_size=10) + + a = t.take([4, 1, 2, 6, 200]) + + res = a.execute().fetch() + expected = np.take(data, [4, 1, 2, 6, 200]) + np.testing.assert_array_equal(res, expected) + + a = take(t, [5, 19, 2, 13], axis=1) + + res = a.execute().fetch() + expected = np.take(data, [5, 19, 2, 13], axis=1) + np.testing.assert_array_equal(res, expected) + + with pytest.raises(ValueError): + take(t, [1, 3, 4], out=tensor(np.random.rand(4))) + + out = tensor([1, 2, 3, 4]) + a = take(t, [4, 19, 2, 8], out=out) + + res = out.execute().fetch() + expected = np.take(data, [4, 19, 2, 8]) + np.testing.assert_array_equal(res, expected) + + +def test_compress_execution(setup): + data = np.array([[1, 2], [3, 4], [5, 6]]) + a = tensor(data, chunk_size=1) + + t = compress([0, 1], a, axis=0) + + res = t.execute().fetch() + expected = np.compress([0, 1], data, axis=0) + np.testing.assert_array_equal(res, expected) + + t = compress([0, 1], a, axis=1) + + res = t.execute().fetch() + expected = np.compress([0, 1], data, axis=1) + np.testing.assert_array_equal(res, expected) + + t = a.compress([0, 1, 1]) + + res = t.execute().fetch() + expected = np.compress([0, 1, 1], data) + np.testing.assert_array_equal(res, expected) + + t = compress([False, True, True], a, axis=0) + + res = t.execute().fetch() + expected = np.compress([False, True, True], data, axis=0) + np.testing.assert_array_equal(res, expected) + + t = compress([False, True], a, axis=1) + + res = t.execute().fetch() + expected = np.compress([False, True], data, axis=1) + np.testing.assert_array_equal(res, expected) + + with pytest.raises(np.AxisError): + compress([0, 1, 1], a, axis=1) + + # test order + data = np.asfortranarray([[1, 2], [3, 4], [5, 6]]) + a = tensor(data, chunk_size=1) + + t = compress([0, 1, 1], a, axis=0) + + res = t.execute().fetch() + expected = np.compress([0, 1, 1], data, axis=0) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + t = compress( + [0, 1, 1], a, axis=0, out=tensor(np.empty((2, 2), order="F", dtype=int)) + ) + + res = t.execute().fetch() + expected = np.compress( + [0, 1, 1], data, axis=0, out=np.empty((2, 2), order="F", dtype=int) + ) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_extract_execution(setup): + data = np.arange(12).reshape((3, 4)) + a = tensor(data, chunk_size=2) + condition = mod(a, 3) == 0 + + t = extract(condition, a) + + res = t.execute().fetch() + expected = np.extract(np.mod(data, 3) == 0, data) + np.testing.assert_array_equal(res, expected) + + +def test_choose_execution(setup): + options.chunk_size = 2 + + choices = [[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]] + a = choose([2, 3, 1, 0], choices) + + res = a.execute().fetch() + expected = np.choose([2, 3, 1, 0], choices) + + np.testing.assert_array_equal(res, expected) + + a = choose([2, 4, 1, 0], choices, mode="clip") # 4 goes to 3 (4-1) + expected = 
np.choose([2, 4, 1, 0], choices, mode="clip") + + res = a.execute().fetch() + np.testing.assert_array_equal(res, expected) + + a = choose([2, 4, 1, 0], choices, mode="wrap") # 4 goes to (4 mod 4) + expected = np.choose([2, 4, 1, 0], choices, mode="wrap") # 4 goes to (4 mod 4) + + res = a.execute().fetch() + np.testing.assert_array_equal(res, expected) + + a = [[1, 0, 1], [0, 1, 0], [1, 0, 1]] + choices = [-10, 10] + + b = choose(a, choices) + expected = np.choose(a, choices) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + + a = np.array([0, 1]).reshape((2, 1, 1)) + c1 = np.array([1, 2, 3]).reshape((1, 3, 1)) + c2 = np.array([-1, -2, -3, -4, -5]).reshape((1, 1, 5)) + + b = choose(a, (c1, c2)) + expected = np.choose(a, (c1, c2)) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + + # test order + a = np.array([0, 1]).reshape((2, 1, 1), order="F") + c1 = np.array([1, 2, 3]).reshape((1, 3, 1), order="F") + c2 = np.array([-1, -2, -3, -4, -5]).reshape((1, 1, 5), order="F") + + b = choose(a, (c1, c2)) + expected = np.choose(a, (c1, c2)) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + b = choose(a, (c1, c2), out=tensor(np.empty(res.shape, order="F"))) + expected = np.choose(a, (c1, c2), out=np.empty(res.shape, order="F")) + + res = b.execute().fetch() + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + +def test_unravel_execution(setup): + a = tensor([22, 41, 37], chunk_size=1) + t = stack(unravel_index(a, (7, 6))) + + res = t.execute().fetch() + expected = np.stack(np.unravel_index([22, 41, 37], (7, 6))) + + np.testing.assert_array_equal(res, expected) + + +def test_nonzero_execution(setup): + data = np.array([[1, 0, 0], [0, 2, 0], [1, 1, 0]]) + x = tensor(data, chunk_size=2) + t = hstack(nonzero(x)) + + res = t.execute().fetch() + expected = np.hstack(np.nonzero(data)) + + np.testing.assert_array_equal(res, expected) + + t = hstack((x > 1).nonzero()) + + res = t.execute().fetch() + expected = np.hstack(np.nonzero(data > 1)) + + np.testing.assert_array_equal(res, expected) + + +def test_flatnonzero_execution(setup): + x = arange(-2, 3, chunk_size=2) + + t = flatnonzero(x) + + res = t.execute().fetch() + expected = np.flatnonzero(np.arange(-2, 3)) + + np.testing.assert_equal(res, expected) + + +def test_fill_diagonal_execution(setup): + # 2-d + raws = [ + np.random.rand(30, 11), + np.random.rand(15, 15), + np.random.rand(11, 30), + sps.random(30, 11, density=0.1, format="csr"), + ] + + def copy(x): + if hasattr(x, "nnz"): + # sparse + return x.A + else: + return x.copy() + + for raw in raws: + # test 1 chunk, wrap=False + t = tensor(raw, chunk_size=30) + fill_diagonal(t, 1) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test 1 chunk, wrap=True + t = tensor(raw, chunk_size=30) + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1, wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunks, wrap=False + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, 1) + + res = t.execute().fetch() + expected = copy(raw) + 
np.fill_diagonal(expected, 1) + + np.testing.assert_array_equal(np.asarray(res), expected) + + t = tensor(raw, chunk_size=(4, 12)) + fill_diagonal(t, 1) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with list type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, [1, 2, 3]) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with tensor type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, tensor([1, 2, 3])) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunks, wrap=True + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1, wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + t = tensor(raw, chunk_size=(4, 12)) + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, 1, wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with list type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, [1, 2, 3], wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3], wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # test multiple chunk, val with tensor type + t = tensor(raw, chunk_size=(12, 4)) + fill_diagonal(t, tensor([[1, 2], [3, 4]]), wrap=True) + + res = t.execute().fetch() + expected = copy(raw) + np.fill_diagonal(expected, [1, 2, 3, 4], wrap=True) + + np.testing.assert_array_equal(np.asarray(res), expected) + + # 3-d + raw = np.random.rand(11, 11, 11) + + expected = raw.copy() + np.fill_diagonal(expected, 1) + expected2 = raw.copy() + np.fill_diagonal(expected2, 1, wrap=True) + np.testing.assert_array_equal(expected, expected2) + + # test 1 chunk + t = tensor(raw, chunk_size=30) + fill_diagonal(t, 1) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + t = tensor(raw, chunk_size=30) + # wrap = True does not take effect when ndim > 2 + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + # test multiple chunk + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, 1) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + t = tensor(raw, chunk_size=(3, 4, 5)) + # wrap = True does not take effect when ndim > 2 + fill_diagonal(t, 1, wrap=True) + + res = t.execute().fetch() + + np.testing.assert_array_equal(res, expected) + + # test val with list type + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, [[1, 2], [3, 4]]) + + res = t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, [1, 2, 3, 4]) + + np.testing.assert_array_equal(res, expected) + + # test val with tensor type + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, tensor([1, 2, 3])) + + res = t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, [1, 2, 3]) + + np.testing.assert_array_equal(res, expected) + + # test val with tensor type which ndim == 0 + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, tensor([1, 2, 3]).sum()) + + res = 
t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, 6) + + np.testing.assert_array_equal(res, expected) + + # test val with ndarray type which size is too long + t = tensor(raw, chunk_size=(3, 4, 5)) + fill_diagonal(t, np.arange(20)) + + res = t.execute().fetch() + expected = raw.copy() + np.fill_diagonal(expected, np.arange(20)) + + np.testing.assert_array_equal(res, expected) diff --git a/python/xorbits/_mars/tensor/indexing/unravel_index.py b/python/xorbits/_mars/tensor/indexing/unravel_index.py new file mode 100644 index 000000000..a2773a63e --- /dev/null +++ b/python/xorbits/_mars/tensor/indexing/unravel_index.py @@ -0,0 +1,150 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import FieldTypes, KeyField, StringField, TupleField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorUnravelIndex(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.UNRAVEL_INDEX + + _input = KeyField("input") + _dims = TupleField("dims", FieldTypes.int32) + _order = StringField("order") + + def __init__(self, dims=None, order=None, **kw): + super().__init__(_dims=dims, _order=order, **kw) + if self._order is None: + self._order = "C" + + @property + def dims(self): + return self._dims + + @property + def order(self): + return self._order + + @property + def output_limit(self): + return float("inf") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, indices): + order = TensorOrder.C_ORDER if self._order == "C" else TensorOrder.F_ORDER + kws = [{"pos": i, "order": order} for i in range(len(self._dims))] + return ExecutableTuple( + self.new_tensors([indices], indices.shape, kws=kws, output_limit=len(kws)) + ) + + @classmethod + def tile(cls, op): + indices = op.inputs[0] + dims = op.dims + order = op.outputs[0].order + + out_chunks = [list() for _ in range(len(dims))] + for in_chunk in indices.chunks: + chunk_op = op.copy().reset_key() + chunk_kws = [ + {"pos": i, "index": in_chunk.index, "order": order} + for i in range(len(dims)) + ] + chunks = chunk_op.new_chunks( + [in_chunk], shape=in_chunk.shape, kws=chunk_kws, output_limit=len(dims) + ) + for out_chunk, c in zip(out_chunks, chunks): + out_chunk.append(c) + + new_op = op.copy() + kws = [ + {"chunks": out_chunk, "nsplits": indices.nsplits, "shape": o.shape} + for out_chunk, o in zip(out_chunks, op.outputs) + ] + return new_op.new_tensors( + op.inputs, kws=kws, output_limit=len(dims), order=order + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + indices = inputs[0] + + with 
device(device_id): + outputs = xp.unravel_index(indices, op.dims, order=op.order) + for o, output in zip(op.outputs, outputs): + ctx[o.key] = output + + +def unravel_index(indices, dims, order="C"): + """ + Converts a flat index or tensor of flat indices into a tuple + of coordinate tensors. + + Parameters + ---------- + indices : array_like + An integer tensor whose elements are indices into the flattened + version of a tensor of dimensions ``dims``. + dims : tuple of ints + The shape of the tensor to use for unraveling ``indices``. + order : {'C', 'F'}, optional + Determines whether the indices should be viewed as indexing in + row-major (C-style) or column-major (Fortran-style) order. + + Returns + ------- + unraveled_coords : tuple of Tensor + Each tensor in the tuple has the same shape as the ``indices`` + tensor. + + See Also + -------- + ravel_multi_index + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.unravel_index([22, 41, 37], (7,6)).execute() + (array([3, 6, 6]), array([4, 5, 1])) + + >>> mt.unravel_index(1621, (6,7,8,9)).execute() + (3, 1, 4, 1) + """ + indices = astensor(indices) + if isinstance(dims, Iterable): + dims = tuple(dims) + else: + dims = (dims,) + + if order not in "CF": + raise TypeError("only 'C' or 'F' order is permitted") + + op = TensorUnravelIndex(dims=dims, dtype=np.dtype(np.intp), order=order) + return op(indices) diff --git a/python/xorbits/_mars/tensor/lib/__init__.py b/python/xorbits/_mars/tensor/lib/__init__.py new file mode 100644 index 000000000..01608ff0d --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import index_tricks +from .index_tricks import nd_grid diff --git a/python/xorbits/_mars/tensor/lib/index_tricks.py b/python/xorbits/_mars/tensor/lib/index_tricks.py new file mode 100644 index 000000000..0d7a3305a --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/index_tricks.py @@ -0,0 +1,497 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +from numpy.core.numeric import ScalarType +from numpy.core.numerictypes import find_common_type +from numpy.lib.index_tricks import ndindex + +from .. import datasource as _nx +from ..base import ndim +from ..core import Tensor +from ..merge import concatenate + + +class nd_grid(object): + """ + Construct a multi-dimensional "meshgrid". 
+ + ``grid = nd_grid()`` creates an instance which will return a mesh-grid + when indexed. The dimension and number of the output arrays are equal + to the number of indexing dimensions. If the step length is not a + complex number, then the stop is not inclusive. + + However, if the step length is a **complex number** (e.g. 5j), then the + integer part of its magnitude is interpreted as specifying the + number of points to create between the start and stop values, where + the stop value **is inclusive**. + + If instantiated with an argument of ``sparse=True``, the mesh-grid is + open (or not fleshed out) so that only one-dimension of each returned + argument is greater than 1. + + Parameters + ---------- + sparse : bool, optional + Whether the grid is sparse or not. Default is False. + + Notes + ----- + Two instances of `nd_grid` are made available in the Mars.tensor namespace, + `mgrid` and `ogrid`:: + + mgrid = nd_grid(sparse=False) + ogrid = nd_grid(sparse=True) + + Users should use these pre-defined instances instead of using `nd_grid` + directly. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mgrid = mt.lib.index_tricks.nd_grid() + >>> mgrid[0:5,0:5] + array([[[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 3], + [4, 4, 4, 4, 4]], + [[0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4]]]) + >>> mgrid[-1:1:5j] + array([-1. , -0.5, 0. , 0.5, 1. ]) + + >>> ogrid = mt.lib.index_tricks.nd_grid(sparse=True) + >>> ogrid[0:5,0:5] + [array([[0], + [1], + [2], + [3], + [4]]), array([[0, 1, 2, 3, 4]])] + + """ + + def __init__(self, sparse=False): + self.sparse = sparse + + def __getitem__(self, key): + try: + size = [] + typ = int + for k in key: + step = k.step + start = k.start + if start is None: + start = 0 + if step is None: + step = 1 + if isinstance(step, complex): + size.append(int(abs(step))) + typ = float + else: + size.append(int(math.ceil((k.stop - start) / (step * 1.0)))) + if ( + isinstance(step, float) + or isinstance(start, float) + or isinstance(k.stop, float) + ): + typ = float + if self.sparse: + nn = [ + _nx.arange(_x, dtype=_t) for _x, _t in zip(size, (typ,) * len(size)) + ] + else: + nn = _nx.indices(size, typ) + for k in range(len(size)): + step = key[k].step + start = key[k].start + if start is None: + start = 0 + if step is None: + step = 1 + if isinstance(step, complex): + step = int(abs(step)) + if step != 1: + step = (key[k].stop - start) / float(step - 1) + nn[k] = nn[k] * step + start + if self.sparse: + slobj = [np.newaxis] * len(size) + for k in range(len(size)): + slobj[k] = slice(None, None) + nn[k] = nn[k][slobj] + slobj[k] = np.newaxis + return nn + except (IndexError, TypeError): # pragma: no cover + step = key.step + stop = key.stop + start = key.start + if start is None: + start = 0 + if isinstance(step, complex): + step = abs(step) + length = int(step) + if step != 1: + step = (key.stop - start) / float(step - 1) + stop = key.stop + step + return _nx.arange(0, length, 1, float) * step + start + else: + return _nx.arange(start, stop, step) + + def __len__(self): + return 0 + + +class MGridClass(nd_grid): + """ + `nd_grid` instance which returns a dense multi-dimensional "meshgrid". + + An instance of `numpy.lib.index_tricks.nd_grid` which returns an dense + (or fleshed out) mesh-grid when indexed, so that each returned argument + has the same shape. The dimensions and number of the output arrays are + equal to the number of indexing dimensions. 
If the step length is not a + complex number, then the stop is not inclusive. + + However, if the step length is a **complex number** (e.g. 5j), then + the integer part of its magnitude is interpreted as specifying the + number of points to create between the start and stop values, where + the stop value **is inclusive**. + + Returns + ------- + mesh-grid `ndarrays` all of the same dimensions + + See Also + -------- + lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects + ogrid : like mgrid but returns open (not fleshed out) mesh grids + meshgrid: return coordinate matrices from coordinate vectors + r_ : array concatenator + :ref:`how-to-partition` + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.mgrid[0:5, 0:5] + array([[[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 3], + [4, 4, 4, 4, 4]], + [[0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4]]]) + >>> mt.mgrid[-1:1:5j] + array([-1. , -0.5, 0. , 0.5, 1. ]) + + """ + + def __init__(self): + super().__init__(sparse=False) + + +class OGridClass(nd_grid): + """ + `nd_grid` instance which returns an open multi-dimensional "meshgrid". + + An instance of `numpy.lib.index_tricks.nd_grid` which returns an open + (i.e. not fleshed out) mesh-grid when indexed, so that only one dimension + of each returned array is greater than 1. The dimension and number of the + output arrays are equal to the number of indexing dimensions. If the step + length is not a complex number, then the stop is not inclusive. + + However, if the step length is a **complex number** (e.g. 5j), then + the integer part of its magnitude is interpreted as specifying the + number of points to create between the start and stop values, where + the stop value **is inclusive**. + + Returns + ------- + mesh-grid + `ndarrays` with only one dimension not equal to 1 + + See Also + -------- + np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects + mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids + meshgrid: return coordinate matrices from coordinate vectors + r_ : array concatenator + :ref:`how-to-partition` + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.ogrid[-1:1:5j] + array([-1. , -0.5, 0. , 0.5, 1. 
]) + >>> mt.ogrid[0:5,0:5] + [array([[0], + [1], + [2], + [3], + [4]]), array([[0, 1, 2, 3, 4]])] + + """ + + def __init__(self): + super().__init__(sparse=True) + + +mgrid = MGridClass() +ogrid = OGridClass() + + +class AxisConcatenator: + def __init__(self, axis=0, matrix=False, ndmin=1, trans1d=-1): + self.axis = axis + self.matrix = matrix + self.trans1d = trans1d + self.ndmin = ndmin + + def __getitem__(self, key): + # handle matrix builder syntax + if isinstance(key, str): # pragma: no cover + raise NotImplementedError("Does not support operation on matrix") + + if not isinstance(key, tuple): + key = (key,) + + # copy attributes, since they can be overridden in the first argument + trans1d = self.trans1d + ndmin = self.ndmin + matrix = self.matrix + axis = self.axis + + objs = [] + scalars = [] + arraytypes = [] + scalartypes = [] + + for k, item in enumerate(key): + scalar = False + if isinstance(item, slice): + step = item.step + start = item.start + stop = item.stop + if start is None: + start = 0 + if step is None: + step = 1 + if isinstance(step, complex): + size = int(abs(step)) + newobj = _nx.linspace(start, stop, num=size) + else: + newobj = _nx.arange(start, stop, step) + if ndmin > 1: + newobj = _nx.array(newobj, copy=False, ndmin=ndmin) + if trans1d != -1: + newobj = newobj.swapaxes(-1, trans1d) + elif isinstance(item, str): + if k != 0: + raise ValueError("special directives must be the first entry.") + if item in ("r", "c"): # pragma: no cover + raise NotImplementedError("Does not support operation on matrix") + if "," in item: + vec = item.split(",") + try: + axis, ndmin = [int(x) for x in vec[:2]] + if len(vec) == 3: + trans1d = int(vec[2]) + continue + except Exception: # pragma: no cover + raise ValueError("unknown special directive") + try: + axis = int(item) + continue + except (ValueError, TypeError): # pragma: no cover# pragma: no cover + raise ValueError("unknown special directive") + elif type(item) in ScalarType: + newobj = np.array(item, ndmin=ndmin) + scalars.append(len(objs)) + scalar = True + scalartypes.append(newobj.dtype) + else: + item_ndim = ndim(item) + newobj = _nx.array(item, copy=False, ndmin=ndmin) + if trans1d != -1 and item_ndim < ndmin: + k2 = ndmin - item_ndim + k1 = trans1d + if k1 < 0: + k1 += k2 + 1 + defaxes = list(range(ndmin)) + axes = defaxes[:k1] + defaxes[k2:] + defaxes[k1:k2] + newobj = newobj.transpose(axes) + objs.append(newobj) + if not scalar and isinstance(newobj, Tensor): + arraytypes.append(newobj.dtype) + + # Ensure that scalars won't up-cast unless warranted + final_dtype = find_common_type(arraytypes, scalartypes) + if final_dtype is not None: + for k in scalars: + objs[k] = objs[k].astype(final_dtype) + + res = concatenate(tuple(objs), axis=axis) + + if matrix: # pragma: no cover + raise NotImplementedError("Does not support operation on matrix") + return res + + def __len__(self): + return 0 + + +# separate classes are used here instead of just making r_ = concatentor(0), +# etc. because otherwise we couldn't get the doc string to come out right +# in help(r_) + + +class RClass(AxisConcatenator): + """ + Translates slice objects to concatenation along the first axis. + + This is a simple way to build up tensor quickly. There are two use cases. + + 1. If the index expression contains comma separated tensors, then stack + them along their first axis. + 2. If the index expression contains slice notation or scalars then create + a 1-D tensor with a range indicated by the slice notation. 
+ + If slice notation is used, the syntax ``start:stop:step`` is equivalent + to ``mt.arange(start, stop, step)`` inside of the brackets. However, if + ``step`` is an imaginary number (i.e. 100j) then its integer portion is + interpreted as a number-of-points desired and the start and stop are + inclusive. In other words ``start:stop:stepj`` is interpreted as + ``mt.linspace(start, stop, step, endpoint=1)`` inside of the brackets. + After expansion of slice notation, all comma separated sequences are + concatenated together. + + Optional character strings placed as the first element of the index + expression can be used to change the output. The strings 'r' or 'c' result + in matrix output. If the result is 1-D and 'r' is specified a 1 x N (row) + matrix is produced. If the result is 1-D and 'c' is specified, then a N x 1 + (column) matrix is produced. If the result is 2-D then both provide the + same matrix result. + + A string integer specifies which axis to stack multiple comma separated + tensors along. A string of two comma-separated integers allows indication + of the minimum number of dimensions to force each entry into as the + second integer (the axis to concatenate along is still the first integer). + + A string with three comma-separated integers allows specification of the + axis to concatenate along, the minimum number of dimensions to force the + entries to, and which axis should contain the start of the tensors which + are less than the specified number of dimensions. In other words the third + integer allows you to specify where the 1's should be placed in the shape + of the tensors that have their shapes upgraded. By default, they are placed + in the front of the shape tuple. The third argument allows you to specify + where the start of the tensor should be instead. Thus, a third argument of + '0' would place the 1's at the end of the tensor shape. Negative integers + specify where in the new shape tuple the last dimension of upgraded tensors + should be placed, so the default is '-1'. + + Parameters + ---------- + Not a function, so takes no parameters + + + Returns + ------- + A concatenated tensor or matrix. + + See Also + -------- + concatenate : Join a sequence of tensors along an existing axis. + c_ : Translates slice objects to concatenation along the second axis. + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.r_[mt.array([1,2,3]), 0, 0, mt.array([4,5,6])].execute() + array([1, 2, 3, ..., 4, 5, 6]) + >>> mt.r_[-1:1:6j, [0]*3, 5, 6].execute() + array([-1. , -0.6, -0.2, 0.2, 0.6, 1. , 0. , 0. , 0. , 5. , 6. ]) + + String integers specify the axis to concatenate along or the minimum + number of dimensions to force entries into. + + >>> a = mt.array([[0, 1, 2], [3, 4, 5]]) + >>> mt.r_['-1', a, a].execute() # concatenate along last axis + array([[0, 1, 2, 0, 1, 2], + [3, 4, 5, 3, 4, 5]]) + >>> mt.r_['0,2', [1,2,3], [4,5,6]].execute() # concatenate along first axis, dim>=2 + array([[1, 2, 3], + [4, 5, 6]]) + + >>> mt.r_['0,2,0', [1,2,3], [4,5,6]].execute() + array([[1], + [2], + [3], + [4], + [5], + [6]]) + >>> mt.r_['1,2,0', [1,2,3], [4,5,6]].execute() + array([[1, 4], + [2, 5], + [3, 6]]) + """ + + def __init__(self): + AxisConcatenator.__init__(self, 0) + + +r_ = RClass() + + +class CClass(AxisConcatenator): + """ + Translates slice objects to concatenation along the second axis. + + This is short-hand for ``mt.r_['-1,2,0', index expression]``, which is + useful because of its common occurrence. 
In particular, tensors will be + stacked along their last axis after being upgraded to at least 2-D with + 1's post-pended to the shape (column vectors made out of 1-D tensors). + + See Also + -------- + column_stack : Stack 1-D tensors as columns into a 2-D tensor. + r_ : For more detailed documentation. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.c_[mt.array([1,2,3]), mt.array([4,5,6])].execute() + array([[1, 4], + [2, 5], + [3, 6]]) + >>> mt.c_[mt.array([[1,2,3]]), 0, 0, mt.array([[4,5,6]])].execute() + array([[1, 2, 3, ..., 4, 5, 6]]) + + """ + + def __init__(self): + AxisConcatenator.__init__(self, -1, ndmin=2, trans1d=0) + + +c_ = CClass() + + +__all__ = ["ndindex", "mgrid", "ogrid", "r_", "c_"] diff --git a/python/xorbits/_mars/tensor/lib/tests/__init__.py b/python/xorbits/_mars/tensor/lib/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/lib/tests/test_index_tricks.py b/python/xorbits/_mars/tensor/lib/tests/test_index_tricks.py new file mode 100644 index 000000000..13b7fbc69 --- /dev/null +++ b/python/xorbits/_mars/tensor/lib/tests/test_index_tricks.py @@ -0,0 +1,105 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... 
import tensor as mt +from ....core import tile +from ...lib import nd_grid + + +def test_index_tricks(): + mgrid = nd_grid() + g = mgrid[0:5, 0:5] + tile(g) # tileable means no loop exists + + ogrid = nd_grid(sparse=True) + o = ogrid[0:5, 0:5] + tile(*o) # tilesable means no loop exists + + +def test_r_(setup): + r = mt.r_[mt.array([1, 2, 3]), 0, 0, mt.array([4, 5, 6])] + + result = r.execute().fetch() + expected = np.r_[np.array([1, 2, 3]), 0, 0, np.array([4, 5, 6])] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_[-1:1:6j, [0] * 3, 5, 6] + + result = r.execute().fetch() + expected = np.r_[-1:1:6j, [0] * 3, 5, 6] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_[-1:1:6j] + + result = r.execute().fetch() + expected = np.r_[-1:1:6j] + + np.testing.assert_array_equal(result, expected) + + raw = [[0, 1, 2], [3, 4, 5]] + a = mt.array(raw, chunk_size=2) + r = mt.r_["-1", a, a] + + result = r.execute().fetch() + expected = np.r_["-1", raw, raw] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_["0,2", [1, 2, 3], [4, 5, 6]] + + result = r.execute().fetch() + expected = np.r_["0,2", [1, 2, 3], [4, 5, 6]] + + np.testing.assert_array_equal(result, expected) + + r = mt.r_["0,2,0", [1, 2, 3], [4, 5, 6]] + + result = r.execute().fetch() + expected = np.r_["0,2,0", [1, 2, 3], [4, 5, 6]] + np.testing.assert_array_equal(result, expected) + + r = mt.r_["1,2,0", [1, 2, 3], [4, 5, 6]] + + result = r.execute().fetch() + expected = np.r_["1,2,0", [1, 2, 3], [4, 5, 6]] + np.testing.assert_array_equal(result, expected) + + assert len(mt.r_) == 0 + + with pytest.raises(ValueError): + _ = mt.r_[:3, "wrong"] + + +def test_c_(setup): + r = mt.c_[mt.array([1, 2, 3]), mt.array([4, 5, 6])] + + result = r.execute().fetch() + expected = np.c_[np.array([1, 2, 3]), np.array([4, 5, 6])] + np.testing.assert_array_equal(result, expected) + + r = mt.c_[mt.array([[1, 2, 3]]), 0, 0, mt.array([[4, 5, 6]])] + + result = r.execute().fetch() + expected = np.c_[np.array([[1, 2, 3]]), 0, 0, np.array([[4, 5, 6]])] + np.testing.assert_array_equal(result, expected) + + r = mt.c_[:3, 1:4] + result = r.execute().fetch() + expected = np.c_[:3, 1:4] + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/tensor/linalg/__init__.py b/python/xorbits/_mars/tensor/linalg/__init__.py new file mode 100644 index 000000000..4c62f4087 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
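As a quick cross-check of the complex-step convention described in the `nd_grid`/`mgrid` docstrings and exercised in the tests above, the same rule holds in NumPy itself: a step such as ``5j`` is read as "5 points, stop inclusive", i.e. the slice behaves like ``linspace``. A minimal illustration (plain NumPy only, not part of the modules added in this patch):

    import numpy as np

    # A complex step such as 5j is interpreted as a point count with the stop
    # included, so mgrid[-1:1:5j] is equivalent to linspace(-1, 1, 5).
    np.testing.assert_allclose(np.mgrid[-1:1:5j], np.linspace(-1.0, 1.0, 5))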
+ +from .cholesky import TensorCholesky, cholesky +from .dot import TensorDot, dot +from .inner import inner, innerproduct +from .inv import TensorInv, inv +from .lu import TensorLU, lu +from .matmul import TensorMatmul, matmul +from .norm import TensorNorm, norm +from .qr import TensorQR, qr +from .randomized_svd import randomized_svd +from .solve import solve +from .solve_triangular import TensorSolveTriangular, solve_triangular +from .svd import TensorSVD, svd +from .tensordot import TensorTensorDot, tensordot +from .vdot import vdot + + +def _install(): + from ..core import Tensor, TensorData + + setattr(Tensor, "__matmul__", matmul) + setattr(Tensor, "dot", dot) + setattr(TensorData, "__matmul__", matmul) + setattr(TensorData, "dot", dot) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/linalg/cholesky.py b/python/xorbits/_mars/tensor/linalg/cholesky.py new file mode 100644 index 000000000..7537cfeb7 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/cholesky.py @@ -0,0 +1,325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import BoolField, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperand, TensorOperandMixin + + +class TensorCholesky(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.CHOLESKY + + _input = KeyField("input") + _lower = BoolField("lower") + + def __init__(self, lower=None, **kw): + super().__init__(_lower=lower, **kw) + + @property + def lower(self): + return self._lower + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + return self.new_tensor([a], a.shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + from ..base import TensorTranspose + from ..datasource.zeros import TensorZeros + from ..utils import reverse_order + from .dot import TensorDot + from .solve_triangular import TensorSolveTriangular + + tensor = op.outputs[0] + in_tensor = op.input + if has_unknown_shape(in_tensor): + yield + if in_tensor.nsplits[0] != in_tensor.nsplits[1]: + # all chunks on diagonal should be square + nsplits = in_tensor.nsplits[0] + in_tensor = yield from recursive_tile(in_tensor.rechunk([nsplits, nsplits])) + + lower_chunks, upper_chunks = {}, {} + for i in range(in_tensor.chunk_shape[0]): + for j in range(in_tensor.chunk_shape[1]): + if i < j: + lower_shape = (in_tensor.nsplits[0][i], in_tensor.nsplits[1][j]) + lower_chunk = TensorZeros( + dtype=tensor.dtype, shape=lower_shape, order=tensor.order.value + ).new_chunk( + None, + shape=lower_shape, + index=(i, j), + order=tensor.order, + ) + upper_shape = (in_tensor.nsplits[1][j], in_tensor.nsplits[0][i]) + upper_chunk = 
TensorZeros( + dtype=tensor.dtype, shape=upper_shape, order=tensor.order.value + ).new_chunk( + None, + shape=upper_shape, + index=(j, i), + order=tensor.order, + ) + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + elif i == j: + target = in_tensor.cix[i, j] + if i > 0: + prev_chunks = [] + for p in range(i): + a, b = lower_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot(dtype=tensor.dtype).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=tensor.order, + ) + prev_chunks.append(prev_chunk) + + cholesky_fuse_op = TensorCholeskyFuse() + lower_chunk = cholesky_fuse_op.new_chunk( + [target] + prev_chunks, + shape=target.shape, + index=(i, j), + order=tensor.order, + ) + else: + lower_chunk = TensorCholesky( + lower=True, dtype=tensor.dtype + ).new_chunk( + [target], + shape=target.shape, + index=(i, j), + order=tensor.order, + ) + + upper_chunk = TensorTranspose(dtype=lower_chunk.dtype).new_chunk( + [lower_chunk], + shape=lower_chunk.shape[::-1], + index=lower_chunk.index[::-1], + order=reverse_order(lower_chunk.order), + ) + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + else: + target = in_tensor.cix[j, i] + if j > 0: + prev_chunks = [] + for p in range(j): + a, b = lower_chunks[j, p], upper_chunks[p, i] + prev_chunk = TensorDot(dtype=tensor.dtype).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=tensor.order, + ) + prev_chunks.append(prev_chunk) + cholesky_fuse_op = TensorCholeskyFuse(by_solve_triangular=True) + upper_chunk = cholesky_fuse_op.new_chunk( + [target] + [lower_chunks[j, j]] + prev_chunks, + shape=target.shape, + index=(j, i), + order=tensor.order, + ) + else: + upper_chunk = TensorSolveTriangular( + lower=True, dtype=tensor.dtype + ).new_chunk( + [lower_chunks[j, j], target], + shape=target.shape, + index=(j, i), + order=tensor.order, + ) + lower_chunk = TensorTranspose(dtype=upper_chunk.dtype).new_chunk( + [upper_chunk], + shape=upper_chunk.shape[::-1], + index=upper_chunk.index[::-1], + order=reverse_order(upper_chunk.order), + ) + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + + new_op = op.copy() + if op.lower: + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=list(lower_chunks.values()), + nsplits=in_tensor.nsplits, + ) + else: + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=list(upper_chunks.values()), + nsplits=in_tensor.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np: + import scipy.linalg + + ctx[chunk.key] = scipy.linalg.cholesky(a, lower=op.lower) + return + + r = xp.linalg.cholesky(a) + if not chunk.op.lower: + r = r.T.conj() + + ctx[chunk.key] = r + + +class TensorCholeskyFuse(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CHOLESKY_FUSE + + _by_solve_triangular = BoolField("by_solve_triangular") + + def __init__(self, by_solve_triangular=None, **kw): + super().__init__(_by_solve_triangular=by_solve_triangular, **kw) + + @property + def by_solve_triangular(self): + return self._by_solve_triangular + + @classmethod + def _execute_by_cholesky(cls, inputs): + import scipy.linalg + + target = inputs[0] + return scipy.linalg.cholesky((target - sum(inputs[1:])), lower=True) + + @classmethod + def 
_execute_by_solve_striangular(cls, inputs): + import scipy.linalg + + target = inputs[0] + lower = inputs[1] + return scipy.linalg.solve_triangular( + lower, (target - sum(inputs[2:])), lower=True + ) + + @classmethod + def execute(cls, ctx, op): + inputs = [ctx[c.key] for c in op.inputs] + if op.by_solve_triangular: + ret = cls._execute_by_solve_striangular(inputs) + else: + ret = cls._execute_by_cholesky(inputs) + ctx[op.outputs[0].key] = ret + + +def cholesky(a, lower=False): + """ + Cholesky decomposition. + + Return the Cholesky decomposition, `L * L.H`, of the square matrix `a`, + where `L` is lower-triangular and .H is the conjugate transpose operator + (which is the ordinary transpose if `a` is real-valued). `a` must be + Hermitian (symmetric if real-valued) and positive-definite. Only `L` is + actually returned. + + Parameters + ---------- + a : (..., M, M) array_like + Hermitian (symmetric if all elements are real), positive-definite + input matrix. + lower : bool + Whether to compute the upper or lower triangular Cholesky + factorization. Default is upper-triangular. + + Returns + ------- + L : (..., M, M) array_like + Upper or lower-triangular Cholesky factor of `a`. + + Raises + ------ + LinAlgError + If the decomposition fails, for example, if `a` is not + positive-definite. + + Notes + ----- + + Broadcasting rules apply, see the `mt.linalg` documentation for + details. + + The Cholesky decomposition is often used as a fast way of solving + + .. math:: A \\mathbf{x} = \\mathbf{b} + + (when `A` is both Hermitian/symmetric and positive-definite). + + First, we solve for :math:`\\mathbf{y}` in + + .. math:: L \\mathbf{y} = \\mathbf{b}, + + and then for :math:`\\mathbf{x}` in + + .. math:: L.H \\mathbf{x} = \\mathbf{y}. + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.array([[1,-2j],[2j,5]]) + >>> A.execute() + array([[ 1.+0.j, 0.-2.j], + [ 0.+2.j, 5.+0.j]]) + >>> L = mt.linalg.cholesky(A, lower=True) + >>> L.execute() + array([[ 1.+0.j, 0.+0.j], + [ 0.+2.j, 1.+0.j]]) + >>> mt.dot(L, L.T.conj()).execute() # verify that L * L.H = A + array([[ 1.+0.j, 0.-2.j], + [ 0.+2.j, 5.+0.j]]) + >>> A = [[1,-2j],[2j,5]] # what happens if A is only array_like? + >>> mt.linalg.cholesky(A, lower=True).execute() + array([[ 1.+0.j, 0.+0.j], + [ 0.+2.j, 1.+0.j]]) + + """ + a = astensor(a) + + if a.ndim != 2: # pragma: no cover + raise LinAlgError( + f"{a.ndim}-dimensional array given. Tensor must be two-dimensional" + ) + if a.shape[0] != a.shape[1]: # pragma: no cover + raise LinAlgError("Input must be square") + + cho = np.linalg.cholesky(np.array([[1, 2], [2, 5]], dtype=a.dtype)) + + op = TensorCholesky(lower=lower, dtype=cho.dtype) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/core.py b/python/xorbits/_mars/tensor/linalg/core.py new file mode 100644 index 000000000..f05f6f264 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/core.py @@ -0,0 +1,320 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
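For context on the tiling logic in ``cholesky.py`` above: ``TensorCholesky.tile`` assembles the standard right-looking block Cholesky recurrence chunk by chunk, and ``TensorCholeskyFuse`` fuses the ``target - sum(prev_chunks)`` update with the trailing ``cholesky`` / ``solve_triangular`` call. A minimal plain NumPy/SciPy sketch of that recurrence on a single 2x2 block partition (illustrative only; the matrix, block size and variable names here are made up, not taken from the patch):

    import numpy as np
    from scipy.linalg import cholesky, solve_triangular

    rng = np.random.default_rng(0)
    m = rng.standard_normal((6, 6))
    A = m @ m.T + 6 * np.eye(6)                   # symmetric positive-definite input
    A11, A12 = A[:3, :3], A[:3, 3:]
    A21, A22 = A[3:, :3], A[3:, 3:]

    L11 = cholesky(A11, lower=True)               # diagonal block: plain Cholesky
    U12 = solve_triangular(L11, A12, lower=True)  # off-diagonal block of U = L.T
    L21 = U12.T
    # Trailing diagonal block: Cholesky of the Schur complement, i.e. the
    # "target - sum(prev)" update performed by TensorCholeskyFuse.
    L22 = cholesky(A22 - L21 @ U12, lower=True)

    L = np.block([[L11, np.zeros((3, 3))], [L21, L22]])
    np.testing.assert_allclose(L @ L.T, A, atol=1e-10)   # L @ L.T reconstructs A

The distributed version differs only in that each block above is a chunk, the ``sum(prev)`` updates are expressed as ``TensorDot`` chunks over previously computed factors, and each upper chunk is obtained by transposing the corresponding lower chunk (and vice versa).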
+ +import numpy as np + +from ...core import recursive_tile +from ...utils import has_unknown_shape +from ..core import TensorOrder +from ..utils import decide_chunk_sizes +from .utils import calc_svd_shapes + + +class SFQR: + __slots__ = () + + @classmethod + def tile(cls, op): + """ + Short-and-Fat QR + + Q [R_1 R_2 ...] = [A_1 A_2 ...] + """ + from ..base import TensorTranspose + from .dot import TensorDot + from .qr import TensorQR + + a = op.input + q, r = op.outputs + + tinyq, tinyr = np.linalg.qr(np.ones((1, 1), dtype=a.dtype)) + q_dtype, r_dtype = tinyq.dtype, tinyr.dtype + + check_nan_shape = False + rechunk_size = dict() + if a.chunk_shape[0] != 1: + check_nan_shape = True + rechunk_size[0] = a.shape[0] + + if len(a.chunks) > 1: + check_nan_shape = True + if a.chunks[0].shape[0] > a.chunks[0].shape[1]: + rechunk_size[1] = a.shape[0] + + if check_nan_shape: + if has_unknown_shape(a): + yield + + if rechunk_size: + new_chunks = decide_chunk_sizes(a.shape, rechunk_size, a.dtype.itemsize) + a = yield from recursive_tile(a.rechunk(new_chunks)) + + # A_1's QR decomposition + r_chunks = [] + first_chunk = a.chunks[0] + x, y = first_chunk.shape + q_shape, r_shape = ( + (first_chunk.shape, (y, y)) if x > y else ((x, x), first_chunk.shape) + ) + qr_op = TensorQR() + q_chunk, r_chunk = qr_op.new_chunks( + [first_chunk], + index=(0, 0), + kws=[ + {"side": "q", "dtype": q_dtype, "shape": q_shape, "order": q.order}, + {"side": "r", "dtype": r_dtype, "shape": r_shape, "order": r.order}, + ], + ) + # q is an orthogonal matrix, so q.T and inverse of q is equal + trans_op = TensorTranspose() + q_transpose = trans_op.new_chunk([q_chunk], shape=q_chunk.shape) + r_chunks.append(r_chunk) + + r_rest = [ + TensorDot().new_chunk( + [q_transpose, c], + shape=(q_transpose.shape[0], c.shape[1]), + index=c.index, + order=q.order, + ) + for c in a.chunks[1:] + ] + r_chunks.extend(r_rest) + + new_op = op.copy() + q_nsplits = ((q_chunk.shape[0],), (q_chunk.shape[1],)) + r_nsplits = ((r_chunks[0].shape[0],), tuple(c.shape[1] for c in r_chunks)) + kws = [ + { + "chunks": [q_chunk], + "nsplits": q_nsplits, + "dtype": q.dtype, + "shape": q.shape, + }, + { + "chunks": r_chunks, + "nsplits": r_nsplits, + "dtype": r.dtype, + "shape": r.shape, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + + +class TSQR: + __slots__ = () + + @classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + from ..merge.concatenate import TensorConcatenate + from .dot import TensorDot + from .qr import TensorQR + from .svd import TensorSVD + + calc_svd = getattr(op, "_is_svd", lambda: None)() or False + + a = op.input + + tinyq, tinyr = np.linalg.qr(np.ones((1, 1), dtype=a.dtype)) + q_dtype, r_dtype = tinyq.dtype, tinyr.dtype + + if a.chunk_shape[1] != 1: + if has_unknown_shape(a): + yield + new_chunk_size = decide_chunk_sizes( + a.shape, {1: a.shape[1]}, a.dtype.itemsize + ) + a = yield from recursive_tile(a.rechunk(new_chunk_size)) + + # stage 1, map phase + stage1_q_chunks, stage1_r_chunks = stage1_chunks = [[], []] # Q and R chunks + for c in a.chunks: + x, y = c.shape + q_shape, r_shape = (c.shape, (y, y)) if x > y else ((x, x), c.shape) + qr_op = TensorQR() + qr_chunks = qr_op.new_chunks( + [c], + index=c.index, + kws=[ + {"side": "q", "dtype": q_dtype, "shape": q_shape}, + {"side": "r", "dtype": r_dtype, "shape": r_shape}, + ], + ) + stage1_chunks[0].append(qr_chunks[0]) + stage1_chunks[1].append(qr_chunks[1]) + + # stage 2, reduce phase + # concatenate all r chunks into one + shape = (sum(c.shape[0] for 
c in stage1_r_chunks), stage1_r_chunks[0].shape[1]) + concat_op = TensorConcatenate(axis=0, dtype=stage1_r_chunks[0].dtype) + concat_r_chunk = concat_op.new_chunk( + stage1_r_chunks, shape=shape, index=(0, 0), order=TensorOrder.C_ORDER + ) + qr_op = TensorQR() + qr_chunks = qr_op.new_chunks( + [concat_r_chunk], + index=concat_r_chunk.index, + kws=[ + { + "side": "q", + "dtype": q_dtype, + "order": TensorOrder.C_ORDER, + "shape": (concat_r_chunk.shape[0], min(concat_r_chunk.shape)), + }, + { + "side": "r", + "dtype": r_dtype, + "order": TensorOrder.C_ORDER, + "shape": (min(concat_r_chunk.shape), concat_r_chunk.shape[1]), + }, + ], + ) + stage2_q_chunk, stage2_r_chunk = qr_chunks + + # stage 3, map phase + # split stage2_q_chunk into the same size as stage1_q_chunks + q_splits = np.cumsum([c.shape[1] for c in stage1_q_chunks]).tolist() + q_slices = [ + slice(q_splits[i]) if i == 0 else slice(q_splits[i - 1], q_splits[i]) + for i in range(len(q_splits)) + ] + stage2_q_chunks = [] + for c, s in zip(stage1_q_chunks, q_slices): + slice_op = TensorSlice(slices=[s], dtype=c.dtype) + slice_length = s.stop - (s.start or 0) + stage2_q_chunks.append( + slice_op.new_chunk( + [stage2_q_chunk], + index=c.index, + order=TensorOrder.C_ORDER, + shape=(slice_length, stage2_q_chunk.shape[1]), + ) + ) + stage3_q_chunks = [] + for c1, c2 in zip(stage1_q_chunks, stage2_q_chunks): + dot_op = TensorDot(dtype=q_dtype) + shape = (c1.shape[0], c2.shape[1]) + stage3_q_chunks.append( + dot_op.new_chunk( + [c1, c2], shape=shape, index=c1.index, order=TensorOrder.C_ORDER + ) + ) + + if not calc_svd: + q, r = op.outputs + new_op = op.copy() + q_nsplits = ( + tuple(c.shape[0] for c in stage3_q_chunks), + (stage3_q_chunks[0].shape[1],), + ) + r_nsplits = ((stage2_r_chunk.shape[0],), (stage2_r_chunk.shape[1],)) + kws = [ + # Q + { + "chunks": stage3_q_chunks, + "nsplits": q_nsplits, + "dtype": q.dtype, + "shape": q.shape, + }, + # R, calculate from stage2 + { + "chunks": [stage2_r_chunk], + "nsplits": r_nsplits, + "dtype": r.dtype, + "shape": r.shape, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + else: + U, s, V = op.outputs + U_dtype, s_dtype, V_dtype = U.dtype, s.dtype, V.dtype + U_shape, s_shape, V_shape = U.shape, s.shape, V.shape + + svd_op = TensorSVD() + u_shape, s_shape, v_shape = calc_svd_shapes(stage2_r_chunk) + stage2_usv_chunks = svd_op.new_chunks( + [stage2_r_chunk], + kws=[ + { + "side": "U", + "dtype": U_dtype, + "index": stage2_r_chunk.index, + "shape": u_shape, + "order": TensorOrder.C_ORDER, + }, + { + "side": "s", + "dtype": s_dtype, + "index": stage2_r_chunk.index[1:], + "shape": s_shape, + "order": TensorOrder.C_ORDER, + }, + { + "side": "V", + "dtype": V_dtype, + "index": stage2_r_chunk.index, + "shape": v_shape, + "order": TensorOrder.C_ORDER, + }, + ], + ) + stage2_u_chunk, stage2_s_chunk, stage2_v_chunk = stage2_usv_chunks + + # stage 4, U = Q @ u + stage4_u_chunks = [] + if U is not None: # U is not garbage collected + for c1 in stage3_q_chunks: + dot_op = TensorDot(dtype=U_dtype) + shape = (c1.shape[0], stage2_u_chunk.shape[1]) + stage4_u_chunks.append( + dot_op.new_chunk( + [c1, stage2_u_chunk], + shape=shape, + index=c1.index, + order=TensorOrder.C_ORDER, + ) + ) + + new_op = op.copy() + u_nsplits = ( + tuple(c.shape[0] for c in stage4_u_chunks), + (stage4_u_chunks[0].shape[1],), + ) + s_nsplits = ((stage2_s_chunk.shape[0],),) + v_nsplits = ((stage2_v_chunk.shape[0],), (stage2_v_chunk.shape[1],)) + kws = [ + { + "chunks": stage4_u_chunks, + "nsplits": u_nsplits, + "dtype": U_dtype, + 
"shape": U_shape, + "order": U.order, + }, # U + { + "chunks": [stage2_s_chunk], + "nsplits": s_nsplits, + "dtype": s_dtype, + "shape": s_shape, + "order": s.order, + }, # s + { + "chunks": [stage2_v_chunk], + "nsplits": v_nsplits, + "dtype": V_dtype, + "shape": V_shape, + "order": V.order, + }, # V + ] + return new_op.new_tensors(op.inputs, kws=kws) diff --git a/python/xorbits/_mars/tensor/linalg/dot.py b/python/xorbits/_mars/tensor/linalg/dot.py new file mode 100644 index 000000000..280289623 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/dot.py @@ -0,0 +1,164 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ...serialization.serializables import KeyField +from ..array_utils import as_same_device, device, is_sparse_module +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from .tensordot import tensordot + + +class TensorDot(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.DOT + + _a = KeyField("a") + _b = KeyField("b") + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a, self._b = self._inputs + + @classmethod + def execute(cls, ctx, op): + chunk = op.outputs[0] + (a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if not op.sparse and is_sparse_module(xp): + # tell sparse to do calculation on numpy or cupy dot + ctx[chunk.key] = xp.dot(a, b, sparse=False) + else: + ctx[chunk.key] = xp.dot(a, b) + + +def dot(a, b, out=None, sparse=None): + """ + Dot product of two arrays. Specifically, + + - If both `a` and `b` are 1-D arrays, it is inner product of vectors + (without complex conjugation). + + - If both `a` and `b` are 2-D arrays, it is matrix multiplication, + but using :func:`matmul` or ``a @ b`` is preferred. + + - If either `a` or `b` is 0-D (scalar), it is equivalent to :func:`multiply` + and using ``numpy.multiply(a, b)`` or ``a * b`` is preferred. + + - If `a` is an N-D array and `b` is a 1-D array, it is a sum product over + the last axis of `a` and `b`. + + - If `a` is an N-D array and `b` is an M-D array (where ``M>=2``), it is a + sum product over the last axis of `a` and the second-to-last axis of `b`:: + + dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m]) + + Parameters + ---------- + a : array_like + First argument. + b : array_like + Second argument. + out : Tensor, optional + Output argument. This must have the exact kind that would be returned + if it was not used. In particular, it must have the right type, must be + C-contiguous, and its dtype must be the dtype that would be returned + for `dot(a,b)`. This is a performance feature. Therefore, if these + conditions are not met, an exception is raised, instead of attempting + to be flexible. 
+ + Returns + ------- + output : Tensor + Returns the dot product of `a` and `b`. If `a` and `b` are both + scalars or both 1-D arrays then a scalar is returned; otherwise + a tensor is returned. + If `out` is given, then it is returned. + + Raises + ------ + ValueError + If the last dimension of `a` is not the same size as + the second-to-last dimension of `b`. + + See Also + -------- + vdot : Complex-conjugating dot product. + tensordot : Sum products over arbitrary axes. + einsum : Einstein summation convention. + matmul : '@' operator as method with out parameter. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.dot(3, 4).execute() + 12 + + Neither argument is complex-conjugated: + + >>> mt.dot([2j, 3j], [2j, 3j]).execute() + (-13+0j) + + For 2-D arrays it is the matrix product: + + >>> a = [[1, 0], [0, 1]] + >>> b = [[4, 1], [2, 2]] + >>> mt.dot(a, b).execute() + array([[4, 1], + [2, 2]]) + + >>> a = mt.arange(3*4*5*6).reshape((3,4,5,6)) + >>> b = mt.arange(3*4*5*6)[::-1].reshape((5,4,6,3)) + >>> mt.dot(a, b)[2,3,2,1,2,2].execute() + 499128 + >>> mt.sum(a[2,3,2,:] * b[1,2,:,2]).execute() + 499128 + """ + a, b = astensor(a), astensor(b) + if a.isscalar() and b.isscalar(): + ret = a * b + else: + ret = tensordot(a, b, axes=((a.ndim - 1,), (b.ndim - 2,)), sparse=sparse) + + if out is None: + return ret + + # set to out + if not isinstance(out, Tensor): + raise TypeError(f"`out` must be a Tensor, got {type(out)} instead") + if out.shape != ret.shape: + raise ValueError("output tensor has wrong dimensions") + if not ( + out.dtype == ret.dtype + and out.ndim == ret.ndim + and out.order == TensorOrder.C_ORDER + ): + raise ValueError( + "output tensor is not acceptable " + "(must have the right datatype, number of dimensions and be a C-Tensor" + ) + out.data = ret.astype(out.dtype, order=out.order.value, copy=False).data + return out diff --git a/python/xorbits/_mars/tensor/linalg/inner.py b/python/xorbits/_mars/tensor/linalg/inner.py new file mode 100644 index 000000000..48f7ef361 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/inner.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .tensordot import tensordot + + +def inner(a, b, sparse=None): + """ + Returns the inner product of a and b for arrays of floating point types. + + Like the generic NumPy equivalent the product sum is over the last dimension + of a and b. The first argument is not conjugated. + + """ + a, b = astensor(a), astensor(b) + if a.isscalar() and b.isscalar(): + ret = a * b + else: + ret = tensordot(a, b, axes=(-1, -1), sparse=sparse) + + return ret + + +innerproduct = inner diff --git a/python/xorbits/_mars/tensor/linalg/inv.py b/python/xorbits/_mars/tensor/linalg/inv.py new file mode 100644 index 000000000..1b2898f11 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/inv.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import KeyField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorInv(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.INV + + _input = KeyField("input") + + def __call__(self, a): + a = astensor(a) + return self.new_tensor([a], a.shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_params = out.params + chunk_params["index"] = (0,) * out.ndim + out_chunk = chunk_op.new_chunk(op.inputs[0].chunks, kws=[chunk_params]) + + new_op = op.copy() + params = out.params + params["nsplits"] = tuple((s,) for s in out.shape) + params["chunks"] = [out_chunk] + return new_op.new_tensors(op.inputs, kws=[params]) + + @classmethod + def tile(cls, op): + """ + Use LU decomposition to compute inverse of matrix. + Given a square matrix A: + P, L, U = lu(A) + b_eye is an identity matrix with the same shape as matrix A, then, + (P * L * U) * A_inv = b_eye + L * (U * A_inv) = P.T * b_eye + use `solve_triangular` twice to compute the inverse of matrix A. + """ + from ..base.transpose import TensorTranspose + from ..datasource import eye + from .lu import lu + from .solve_triangular import solve_triangular + from .tensordot import tensordot + + in_tensor = op.input + is_sparse = in_tensor.is_sparse() + + if len(in_tensor.chunks) == 1: + return cls._tile_one_chunk(op) + + b_eye = eye(in_tensor.shape[0], chunk_size=in_tensor.nsplits, sparse=is_sparse) + + p, l, u = lu(in_tensor) + + # transposed p equals to inverse of p + p_transpose = TensorTranspose( + dtype=p.dtype, sparse=p.op.sparse, axes=list(range(in_tensor.ndim))[::-1] + ).new_tensor([p], p.shape) + + b = tensordot( + p_transpose, b_eye, axes=((p_transpose.ndim - 1,), (b_eye.ndim - 2,)) + ) + + # as `l` is a lower matrix, `lower=True` should be specified. + uy = solve_triangular(l, b, lower=True, sparse=op.sparse) + + a_inv = solve_triangular(u, uy, sparse=op.sparse) + a_inv = yield from recursive_tile(a_inv) + return [a_inv] + + @classmethod + def execute(cls, ctx, op): + (inp,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.linalg.inv(inp) + + +def inv(a, sparse=None): + """ + Compute the (multiplicative) inverse of a matrix. + Given a square matrix `a`, return the matrix `ainv` satisfying + ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``. + + Parameters + ---------- + a : (..., M, M) array_like + Matrix to be inverted. + sparse: bool, optional + Return sparse value or not. 
+ + Returns + ------- + ainv : (..., M, M) ndarray or matrix + (Multiplicative) inverse of the matrix `a`. + + Raises + ------ + LinAlgError + If `a` is not square or inversion fails. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = np.array([[1., 2.], [3., 4.]]) + >>> ainv = mt.linalg.inv(a) + >>> mt.allclose(mt.dot(a, ainv), mt.eye(2)).execute() + True + + >>> mt.allclose(mt.dot(ainv, a), mt.eye(2)).execute() + True + + >>> ainv.execute() + array([[ -2. , 1. ], + [ 1.5, -0.5]]) + """ + + # TODO: using some parallel algorithm for matrix inversion. + a = astensor(a) + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional array given. Tensor must be two-dimensional" + ) + if a.shape[0] != a.shape[1]: + raise LinAlgError("Input must be square") + + tiny_inv = np.linalg.inv(np.array([[1, 2], [2, 5]], dtype=a.dtype)) + sparse = sparse if sparse is not None else a.issparse() + op = TensorInv(dtype=tiny_inv.dtype, sparse=sparse) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/lu.py b/python/xorbits/_mars/tensor/linalg/lu.py new file mode 100644 index 000000000..eba59f767 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/lu.py @@ -0,0 +1,510 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... import opcodes as OperandDef +from ...core import ExecutableTuple, recursive_tile +from ...serialization.serializables import KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device, is_sparse_module +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorLU(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.LU + + _input = KeyField("input") + + @property + def output_limit(self): + return 3 + + def __call__(self, a): + import scipy.linalg + + a = astensor(a) + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional array given. 
Tensor must be two-dimensional" + ) + + if a.shape[0] > a.shape[1]: + p_shape = (a.shape[0],) * 2 + l_shape = a.shape + u_shape = (a.shape[1],) * 2 + elif a.shape[0] < a.shape[1]: + p_shape = (a.shape[0],) * 2 + l_shape = (a.shape[0],) * 2 + u_shape = a.shape + else: + p_shape, l_shape, u_shape = (a.shape,) * 3 + + tiny_p, tiny_l, tiny_u = scipy.linalg.lu( + np.array([[1, 2], [2, 5]], dtype=a.dtype) + ) + + order = a.order + p, l, u = self.new_tensors( + [a], + kws=[ + {"side": "p", "dtype": tiny_p.dtype, "shape": p_shape, "order": order}, + {"side": "l", "dtype": tiny_l.dtype, "shape": l_shape, "order": order}, + {"side": "u", "dtype": tiny_u.dtype, "shape": u_shape, "order": order}, + ], + ) + return ExecutableTuple([p, l, u]) + + @classmethod + def _tile_one_chunk(cls, op): + p, l, u = op.outputs + chunk_op = op.copy().reset_key() + chunk_kws = [ + { + "side": "p", + "dtype": p.dtype, + "shape": p.shape, + "order": p.order, + "index": (0,) * p.ndim, + }, + { + "side": "l", + "dtype": l.dtype, + "shape": l.shape, + "order": l.order, + "index": (0,) * l.ndim, + }, + { + "side": "u", + "dtype": u.dtype, + "shape": u.shape, + "order": u.order, + "index": (0,) * u.ndim, + }, + ] + chunks = chunk_op.new_chunks(op.input.chunks, kws=chunk_kws) + + new_op = op.copy() + kws = [p.params, l.params, u.params] + for i, out in enumerate([p, l, u]): + kws[i]["nsplits"] = tuple((s,) for s in out.shape) + kws[i]["chunks"] = [chunks[i]] + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def tile(cls, op): + if len(op.input.chunks) == 1: + return cls._tile_one_chunk(op) + + from ..arithmetic.add import TensorTreeAdd + from ..arithmetic.subtract import TensorSubtract + from ..base.transpose import TensorTranspose + from ..datasource.zeros import TensorZeros, zeros + from ..merge.hstack import hstack + from ..merge.vstack import vstack + from .dot import TensorDot + from .solve_triangular import TensorSolveTriangular + + P, L, U = op.outputs + raw_in_tensor = in_tensor = op.input + out_tensor = op.outputs[0] + + if in_tensor.shape[0] > in_tensor.shape[1]: + zero_tensor = zeros( + (in_tensor.shape[0], in_tensor.shape[0] - in_tensor.shape[1]), + dtype=in_tensor.dtype, + sparse=in_tensor.issparse(), + gpu=in_tensor.op.gpu, + chunk_size=(in_tensor.nsplits[0], max(in_tensor.nsplits[1])), + order=in_tensor.order.value, + ) + in_tensor = yield from recursive_tile(hstack([in_tensor, zero_tensor])) + elif in_tensor.shape[0] < in_tensor.shape[1]: + zero_tensor = zeros( + (in_tensor.shape[1] - in_tensor.shape[0], in_tensor.shape[1]), + dtype=in_tensor.dtype, + sparse=in_tensor.issparse(), + gpu=in_tensor.op.gpu, + chunk_size=(max(in_tensor.nsplits[0]), in_tensor.nsplits[1]), + order=in_tensor.order.value, + ) + in_tensor = yield from recursive_tile(vstack([in_tensor, zero_tensor])) + + if has_unknown_shape(in_tensor): + yield + if in_tensor.nsplits[0] != in_tensor.nsplits[1]: + # all chunks on diagonal should be square + nsplits = in_tensor.nsplits[0] + in_tensor = yield from recursive_tile(in_tensor.rechunk([nsplits, nsplits])) + + p_chunks, p_invert_chunks, lower_chunks, l_permuted_chunks, upper_chunks = ( + {}, + {}, + {}, + {}, + {}, + ) + for i in range(in_tensor.chunk_shape[0]): + for j in range(in_tensor.chunk_shape[1]): + if i < j: + chunk_shape = (in_tensor.nsplits[0][i], in_tensor.nsplits[1][j]) + p_chunk = TensorZeros( + sparse=op.sparse, + order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + lower_chunk = 
TensorZeros( + sparse=op.sparse, + order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + p_chunks[p_chunk.index] = p_chunk + lower_chunks[lower_chunk.index] = lower_chunk + + target_u = in_tensor.cix[i, j] + p_invert = p_invert_chunks[i, i] + target = TensorDot(dtype=U.dtype, sparse=U.op.sparse).new_chunk( + [p_invert, target_u], + shape=(p_invert.shape[0], target_u.shape[1]), + order=out_tensor.order, + ) + if i > 0: + prev_chunks_u = [] + for p in range(i): + a, b = lower_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot( + dtype=U.dtype, sparse=U.op.sparse + ).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=out_tensor.order, + ) + prev_chunks_u.append(prev_chunk) + if len(prev_chunks_u) == 1: + s = prev_chunks_u[0] + else: + tree_add_op = TensorTreeAdd( + args=prev_chunks_u, + dtype=prev_chunks_u[0].dtype, + sparse=op.sparse, + ) + s = tree_add_op.new_chunk( + prev_chunks_u, shape=prev_chunks_u[0].shape + ) + target = TensorSubtract( + dtype=U.dtype, + lhs=target, + rhs=s, + order=out_tensor.order.value, + ).new_chunk( + [target, s], shape=target.shape, order=out_tensor.order + ) + upper_chunk = TensorSolveTriangular( + lower=True, + dtype=U.dtype, + strict=False, + sparse=lower_chunks[i, i].op.sparse, + ).new_chunk( + [lower_chunks[i, i], target], + shape=target.shape, + index=(i, j), + order=out_tensor.order, + ) + upper_chunks[upper_chunk.index] = upper_chunk + elif i == j: + target = in_tensor.cix[i, j] + if i > 0: + prev_chunks = [] + for p in range(i): + a, b = l_permuted_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot( + dtype=a.dtype, sparse=op.sparse + ).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=out_tensor.order, + ) + prev_chunks.append(prev_chunk) + if len(prev_chunks) == 1: + s = prev_chunks[0] + else: + tree_add_op = TensorTreeAdd( + args=prev_chunks, + dtype=prev_chunks[0].dtype, + sparse=op.sparse, + ) + s = tree_add_op.new_chunk( + prev_chunks, shape=prev_chunks[0].shape + ) + target = TensorSubtract( + dtype=L.dtype, + lhs=target, + rhs=s, + order=out_tensor.order.value, + ).new_chunk([target, s], shape=target.shape) + new_op = TensorLU(dtype=op.dtype, sparse=target.op.sparse) + lu_chunks = new_op.new_chunks( + [target], + index=(i, j), + order=out_tensor.order, + kws=[ + {"side": "p", "dtype": P.dtype, "shape": target.shape}, + {"side": "l", "dtype": L.dtype, "shape": target.shape}, + {"side": "u", "dtype": U.dtype, "shape": target.shape}, + ], + ) + p_chunk, lower_chunk, upper_chunk = lu_chunks + # transposed p equals to inverted p + p_chunk_invert = TensorTranspose( + dtype=p_chunk.dtype, sparse=op.sparse + ).new_chunk( + [p_chunk], + shape=p_chunk.shape, + index=p_chunk.index, + order=out_tensor.order, + ) + p_chunks[p_chunk.index] = p_chunk + p_invert_chunks[p_chunk_invert.index] = p_chunk_invert + lower_chunks[lower_chunk.index] = lower_chunk + upper_chunks[upper_chunk.index] = upper_chunk + + # l_permuted should be transferred to the final lower triangular + for p in range(i): + l_permuted_chunk = l_permuted_chunks[i, p] + l_chunk = TensorDot( + dtype=L.dtype, sparse=L.op.sparse + ).new_chunk( + [p_chunk_invert, l_permuted_chunk], + shape=(p_chunk_invert.shape[0], l_permuted_chunk.shape[1]), + index=l_permuted_chunk.index, + order=out_tensor.order, + ) + lower_chunks[l_permuted_chunk.index] = l_chunk + else: + chunk_shape = (in_tensor.nsplits[0][i], in_tensor.nsplits[1][j]) + p_chunk = TensorZeros( + sparse=op.sparse, + 
order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + upper_chunk = TensorZeros( + sparse=op.sparse, + order=out_tensor.order.value, + shape=chunk_shape, + ).new_chunk( + None, shape=chunk_shape, index=(i, j), order=out_tensor.order + ) + p_chunks[p_chunk.index] = p_chunk + upper_chunks[upper_chunk.index] = upper_chunk + target_l = in_tensor.cix[i, j] + if j > 0: + prev_chunks_l = [] + for p in range(j): + a, b = l_permuted_chunks[i, p], upper_chunks[p, j] + prev_chunk = TensorDot( + dtype=L.dtype, sparse=L.op.sparse + ).new_chunk( + [a, b], + shape=(a.shape[0], b.shape[1]), + order=out_tensor.order, + ) + prev_chunks_l.append(prev_chunk) + if len(prev_chunks_l) == 1: + s = prev_chunks_l[0] + else: + tree_add_op = TensorTreeAdd( + args=prev_chunks_l, + dtype=prev_chunks_l[0].dtype, + sparse=op.sparse, + ) + s = tree_add_op.new_chunk( + prev_chunks_l, shape=prev_chunks_l[0].shape + ) + target_l = TensorSubtract( + dtype=L.dtype, + lhs=target_l, + rhs=s, + order=out_tensor.order.value, + ).new_chunk( + [target_l, s], shape=target_l.shape, order=out_tensor.order + ) + u = upper_chunks[j, j] + a_transpose = TensorTranspose( + dtype=u.dtype, sparse=op.sparse + ).new_chunk([u], shape=u.shape) + target_transpose = TensorTranspose( + dtype=target_l.dtype, sparse=op.sparse + ).new_chunk([target_l], shape=target_l.shape) + lower_permuted_chunk = TensorSolveTriangular( + lower=True, dtype=L.dtype, strict=False, sparse=op.sparse + ).new_chunk( + [a_transpose, target_transpose], + shape=target_l.shape, + index=(i, j), + order=out_tensor.order, + ) + lower_transpose = TensorTranspose( + dtype=lower_permuted_chunk.dtype, sparse=op.sparse + ).new_chunk( + [lower_permuted_chunk], + shape=lower_permuted_chunk.shape, + index=lower_permuted_chunk.index, + ) + l_permuted_chunks[lower_permuted_chunk.index] = lower_transpose + + new_op = op.copy() + kws = [ + { + "chunks": list(p_chunks.values()), + "nsplits": in_tensor.nsplits, + "dtype": P.dtype, + "shape": P.shape, + "order": P.order, + }, + { + "chunks": list(lower_chunks.values()), + "nsplits": in_tensor.nsplits, + "dtype": L.dtype, + "shape": L.shape, + "order": L.order, + }, + { + "chunks": list(upper_chunks.values()), + "nsplits": in_tensor.nsplits, + "dtype": U.dtype, + "shape": U.shape, + "order": U.order, + }, + ] + if raw_in_tensor.shape[0] == raw_in_tensor.shape[1]: + return new_op.new_tensors(op.inputs, kws=kws) + + p, l_, u = new_op.new_tensors(op.inputs, kws=kws) + if raw_in_tensor.shape[0] > raw_in_tensor.shape[1]: + l_ = yield from recursive_tile(l_[:, : raw_in_tensor.shape[1]]) + u = yield from recursive_tile( + u[: raw_in_tensor.shape[1], : raw_in_tensor.shape[1]] + ) + else: + p = yield from recursive_tile( + p[: raw_in_tensor.shape[0], : raw_in_tensor.shape[0]] + ) + l_ = yield from recursive_tile( + l_[: raw_in_tensor.shape[0], : raw_in_tensor.shape[0]] + ) + u = yield from recursive_tile(u[: raw_in_tensor.shape[0], :]) + kws = [ + { + "chunks": p.chunks, + "nsplits": p.nsplits, + "dtype": P.dtype, + "shape": p.shape, + "order": p.order, + }, + { + "chunks": l_.chunks, + "nsplits": l_.nsplits, + "dtype": l_.dtype, + "shape": l_.shape, + "order": l_.order, + }, + { + "chunks": u.chunks, + "nsplits": u.nsplits, + "dtype": u.dtype, + "shape": u.shape, + "order": u.order, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], 
device=op.device, ret_extra=True + ) + + with device(device_id): + if xp is np: + import scipy.linalg + + p, l, u = scipy.linalg.lu(a) + elif is_sparse_module(xp): + p, l, u = xp.lu(a) + else: + raise NotImplementedError + pc, lc, uc = op.outputs + + ctx[pc.key] = p + ctx[lc.key] = l + ctx[uc.key] = u + + +def lu(a): + """ + LU decomposition + + The decomposition is:: + A = P L U + where P is a permutation matrix, L lower triangular with unit diagonal elements, + and U upper triangular. + + Parameters + ---------- + a : (M, N) array_like + Array to decompose + + Returns + ------- + p : (M, M) ndarray + Permutation matrix + l : (M, K) ndarray + Lower triangular or trapezoidal matrix with unit diagonal. + K = min(M, N) + u : (K, N) ndarray + Upper triangular or trapezoidal matrix + + Examples + -------- + >>> import mars.tensor as mt + + >>> A = mt.array([[1,2],[2,3]]) + >>> A.execute() + array([[ 1, 2], + [ 2, 3]]) + >>> P, L, U = mt.linalg.lu(A) + >>> P.execute() + array([[ 0, 1], + [ 1, 0]]) + >>> L.execute() + array([[ 1, 0], + [ 0.5, 1]]) + >>> U.execute() + array([[ 2, 3], + [ 0, 0.5]]) + >>> mt.dot(P.dot(L), U).execute() # verify that PL * U = A + array([[ 1, 2], + [ 2, 3]]) + + """ + op = TensorLU(sparse=a.issparse()) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/matmul.py b/python/xorbits/_mars/tensor/linalg/matmul.py new file mode 100644 index 000000000..6fcc74e99 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/matmul.py @@ -0,0 +1,336 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import KeyField, StringField +from ...utils import has_unknown_shape +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device, is_sparse_module +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, check_order, check_out_param, unify_chunks + + +class TensorMatmul(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.MATMUL + + _a = KeyField("a") + _b = KeyField("b") + _casting = StringField("casting") + _order = StringField("order") + + def __init__(self, casting=None, order=None, **kw): + super().__init__(_casting=casting, _order=order, **kw) + if self._casting is None: + self._casting = "same_kind" + if self._order is None: + self._order = "K" + check_order(self._order) + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + @property + def casting(self): + return self._casting + + @property + def order(self): + return self._order + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + self._b = self._inputs[1] + + def _calc_order(self, a, b, out): + if out is not None: + return out.order + + if self._order in "A": + if a.order == TensorOrder.C_ORDER or b.order == TensorOrder.C_ORDER: + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + elif self._order in "CK": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + def __call__(self, a, b, out=None): + from ..base import broadcast_to + + if a.ndim == 0 or b.ndim == 0: + raise ValueError("Scalar operands are not allowed, use '*' instead") + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out must be a Tensor, got {type(out)} instead") + + a_is_1d = False + if a.ndim == 1: + a_is_1d = True + a = a[np.newaxis, :] + + b_is_1d = False + if b.ndim == 1: + b_is_1d = True + b = b[:, np.newaxis] + + if a.ndim < b.ndim: + a = a[(b.ndim - a.ndim) * (np.newaxis,)] + elif a.ndim > b.ndim: + b = b[(a.ndim - b.ndim) * (np.newaxis,)] + + if a.shape[-1] != b.shape[-2]: + raise ValueError( + f"shape {a.shape} and {b.shape} not aligned: " + f"{a.shape[-1]} (dim {a.ndim - 1}) != {b.shape[-2]} (dim {b.ndim - 2})" + ) + + shape = broadcast_shape(a.shape[:-2], b.shape[:-2]) + (a.shape[-2], b.shape[-1]) + order = self._calc_order(a, b, out) + t = self.new_tensor([a, b], shape, order=order) + + if a_is_1d: + t = t[..., 0, :] + if b_is_1d: + t = t[..., 0] + + if out is not None: + check_out_param(out, t, self._casting) + t = broadcast_to(t, out.shape) + out.data = t.data + return out + + return t + + @classmethod + def tile(cls, op): + a, b = op.inputs + tensor = op.outputs[0] + # the axes to align on + a_axes = list(range(a.ndim - 2))[::-1] + [tensor.ndim - 2, tensor.ndim - 1] + b_axes = list(range(b.ndim - 2))[::-1] + [tensor.ndim - 1, tensor.ndim] + if has_unknown_shape(a, b): + yield + a, b = yield from unify_chunks((a, a_axes), (b, b_axes)) + + get_nsplit = lambda i: a.nsplits[i] if a.nsplits[i] != (1,) else b.nsplits[i] + get_idx = lambda ch, idx: tuple( + 0 if ch.nsplits[j] == (1,) else ix for j, ix in enumerate(idx) + ) + + prefix_idxes = [range(len(get_nsplit(i))) for i in range(a.ndim - 2)] + out_idxes = prefix_idxes + [ + range(len(a.nsplits[-2])), + range(len(b.nsplits[-1])), + ] + + out_chunks = [] + for out_idx in itertools.product(*out_idxes): + chunks = [] + get_s = lambda x, 
idx: x[idx] if x != (1,) else x[0] + shape = tuple( + max(get_s(a_s, j), get_s(b_s, j)) + for a_s, b_s, j in zip(a.nsplits[:-2], b.nsplits[:-2], out_idx[:-2]) + ) + (get_s(a.nsplits[-2], out_idx[-2]), get_s(b.nsplits[-1], out_idx[-1])) + + for contract_idx in range(len(a.nsplits[-1])): + a_idx = get_idx(a, out_idx[: a.ndim - 1] + (contract_idx,)) + a_chunk = a.cix[a_idx] + b_idx = get_idx( + b, out_idx[: b.ndim - 2] + (contract_idx,) + out_idx[-1:] + ) + b_chunk = b.cix[b_idx] + chunk_op = op.copy().reset_key() + c = chunk_op.new_chunk( + [a_chunk, b_chunk], shape=shape, order=tensor.order + ) + chunks.append(c) + + if len(chunks) == 1: + c = chunks[0] + out_chunk_op = c.op.copy() + out_chunk = out_chunk_op.new_chunk( + out_chunk_op.inputs, + shape=c.shape, + index=out_idx, + order=tensor.order, + ) + else: + out_chunk = chunk_tree_add( + tensor.op.dtype, chunks, out_idx, shape, sparse=tensor.op.sparse + ) + + out_chunks.append(out_chunk) + + nsplits = tuple(get_nsplit(i) for i in range(a.ndim - 2)) + ( + a.nsplits[-2], + b.nsplits[-1], + ) + new_op = op.copy() + return new_op.new_tensors( + [a, b], tensor.shape, order=tensor.order, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + (a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + if not op.sparse and is_sparse_module(xp): + # tell sparse to do calculation on numpy or cupy matmul + ctx[op.outputs[0].key] = xp.matmul(a, b, sparse=False) + else: + try: + # `np.matmul` support `order` argument in version 1.16 + ctx[op.outputs[0].key] = xp.matmul( + a, b, casting=op.casting, order=op.order + ) + except TypeError: # pragma: no cover + ctx[op.outputs[0].key] = xp.matmul(a, b).astype( + dtype=op.dtype, casting=op.casting, order=op.order + ) + + +def matmul(a, b, sparse=None, out=None, **kw): + """ + Matrix product of two tensors. + + The behavior depends on the arguments in the following way. + + - If both arguments are 2-D they are multiplied like conventional + matrices. + - If either argument is N-D, N > 2, it is treated as a stack of + matrices residing in the last two indexes and broadcast accordingly. + - If the first argument is 1-D, it is promoted to a matrix by + prepending a 1 to its dimensions. After matrix multiplication + the prepended 1 is removed. + - If the second argument is 1-D, it is promoted to a matrix by + appending a 1 to its dimensions. After matrix multiplication + the appended 1 is removed. + + Multiplication by a scalar is not allowed, use ``*`` instead. Note that + multiplying a stack of matrices with a vector will result in a stack of + vectors, but matmul will not recognize it as such. + + ``matmul`` differs from ``dot`` in two important ways. + + - Multiplication by scalars is not allowed. + - Stacks of matrices are broadcast together as if the matrices + were elements. + + Parameters + ---------- + a : array_like + First argument. + b : array_like + Second argument. + out : Tensor, optional + Output argument. This must have the exact kind that would be returned + if it was not used. In particular, it must have the right type, + and its dtype must be the dtype that would be returned + for `dot(a,b)`. This is a performance feature. Therefore, if these + conditions are not met, an exception is raised, instead of attempting + to be flexible. + + Returns + ------- + output : Tensor + Returns the dot product of `a` and `b`. 
If `a` and `b` are both + 1-D arrays then a scalar is returned; otherwise an array is + returned. If `out` is given, then it is returned. + + Raises + ------ + ValueError + If the last dimension of `a` is not the same size as + the second-to-last dimension of `b`. + + If scalar value is passed. + + See Also + -------- + vdot : Complex-conjugating dot product. + tensordot : Sum products over arbitrary axes. + dot : alternative matrix product with different broadcasting rules. + + Notes + ----- + The matmul function implements the semantics of the `@` operator introduced + in Python 3.5 following PEP465. + + Examples + -------- + For 2-D arrays it is the matrix product: + + >>> import mars.tensor as mt + + >>> a = [[1, 0], [0, 1]] + >>> b = [[4, 1], [2, 2]] + >>> mt.matmul(a, b).execute() + array([[4, 1], + [2, 2]]) + + For 2-D mixed with 1-D, the result is the usual. + + >>> a = [[1, 0], [0, 1]] + >>> b = [1, 2] + >>> mt.matmul(a, b).execute() + array([1, 2]) + >>> mt.matmul(b, a).execute() + array([1, 2]) + + + Broadcasting is conventional for stacks of arrays + + >>> a = mt.arange(2*2*4).reshape((2,2,4)) + >>> b = mt.arange(2*2*4).reshape((2,4,2)) + >>> mt.matmul(a,b).shape + (2, 2, 2) + >>> mt.matmul(a,b)[0,1,1].execute() + 98 + >>> mt.sum(a[0,1,:] * b[0,:,1]).execute() + 98 + + Vector, vector returns the scalar inner product, but neither argument + is complex-conjugated: + + >>> mt.matmul([2j, 3j], [2j, 3j]).execute() + (-13+0j) + + Scalar multiplication raises an error. + + >>> mt.matmul([1,2], 3) + Traceback (most recent call last): + ... + ValueError: Scalar operands are not allowed, use '*' instead + """ + a = astensor(a) + b = astensor(b) + + sparse = sparse if sparse is not None else a.issparse() and b.issparse() + op = TensorMatmul(dtype=np.promote_types(a.dtype, b.dtype), sparse=sparse, **kw) + return op(a, b, out=out) diff --git a/python/xorbits/_mars/tensor/linalg/norm.py b/python/xorbits/_mars/tensor/linalg/norm.py new file mode 100644 index 000000000..44e8028df --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/norm.py @@ -0,0 +1,342 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Iterable + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + KeyField, + TupleField, +) +from ..arithmetic import sqrt +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import validate_axis +from .svd import svd + + +class TensorNorm(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.NORM + + _input = KeyField("input") + _ord = AnyField("ord") + _axis = TupleField("axis", FieldTypes.int32) + _keepdims = BoolField("keepdims") + + def __init__(self, ord=None, axis=None, keepdims=None, **kw): + super().__init__(_ord=ord, _axis=axis, _keepdims=keepdims, **kw) + + @property + def ord(self): + return getattr(self, "_ord", None) + + @property + def axis(self): + return self._axis + + @property + def keepdims(self): + return self._keepdims + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, x): + r = x.astype(self.dtype) + shape = self._norm(r, self._ord, self._axis, self._keepdims).shape + return self.new_tensor([x], shape) + + @classmethod + def tile(cls, op): + x = astensor(op.input) + axis = op.axis + ord = op.ord + keepdims = op.keepdims + + axis_chunk_shapes = tuple(x.chunk_shape[i] for i in axis) + can_apply_norm = all(s == 1 for s in axis_chunk_shapes) + + if can_apply_norm: + axis_set = set(axis) + get_shape = lambda shape: tuple( + s if i not in axis_set else 1 + for i, s in enumerate(shape) + if i not in axis_set or keepdims + ) + + out_chunk_shape = get_shape(x.chunk_shape) + out_chunks = [] + for idx in itertools.product(*[range(s) for s in out_chunk_shape]): + idx_iter = iter(idx) + in_idx = tuple( + 0 if i in axis_set and not keepdims else next(idx_iter) + for i in range(x.ndim) + ) + + c = x.cix[in_idx] + chunk_op = op.copy().reset_key() + out_chunk = chunk_op.new_chunk([c], shape=get_shape(c.shape), index=idx) + out_chunks.append(out_chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in out_chunks + if all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(out_chunks[0].shape)) + ] + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=out_chunks, nsplits=nsplits + ) + + r = yield from recursive_tile( + cls._norm(x.astype(op.outputs[0].dtype), ord, axis, keepdims) + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, op.outputs[0].shape, chunks=r.chunks, nsplits=r.nsplits + ) + + @staticmethod + def _norm(r, ord, axis, keepdims): + if ord is None: + return sqrt((abs(r) ** 2).sum(axis=axis, keepdims=keepdims)) + elif ord == "nuc": + if len(axis) == 1: + raise ValueError("Invalid norm order for vectors.") + return svd(r)[1][np.newaxis].sum(keepdims=keepdims) + elif ord == np.inf: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + r = abs(r) + if len(axis) == 1: + return r.max(axis=axis, keepdims=keepdims) + else: + return r.sum(axis=axis[1], keepdims=keepdims).max(keepdims=keepdims) + elif ord == -np.inf: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + r = abs(r) + if len(axis) == 1: + return r.min(axis=axis, keepdims=keepdims) + else: + return r.sum(axis=axis[1], keepdims=keepdims).min(keepdims=keepdims) + elif ord == 0: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + if len(axis) == 2: + raise ValueError("Invalid norm order for 
matrices.") + return (r != 0).astype(r.dtype).sum(axis=axis, keepdims=keepdims) + elif ord == 1: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + r = abs(r) + if len(axis) == 1: + return r.sum(axis=axis, keepdims=keepdims) + else: + return r.sum(axis=axis[0], keepdims=keepdims).max(keepdims=keepdims) + elif ord == -1 and len(axis) == 2: + if r.ndim > 2: + raise ValueError("Improper number of dimensions to norm.") + return abs(r).sum(axis=axis[0], keepdims=keepdims).min(keepdims=keepdims) + elif ord == 2 and len(axis) == 2: + return svd(r)[1][np.newaxis].max(keepdims=keepdims) + elif ord == -2 and len(axis) == 2: + return svd(r)[1][np.newaxis].min(keepdims=keepdims) + else: + if len(axis) == 2: + raise ValueError("Invalid norm order for matrices.") + + return (abs(r) ** ord).sum(axis=axis, keepdims=keepdims) ** (1.0 / ord) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.linalg.norm( + x, ord=op.ord, axis=op.axis, keepdims=op.keepdims + ) + + +def norm(x, ord=None, axis=None, keepdims=False): + r""" + Matrix or vector norm. + + This function is able to return one of eight different matrix norms, + or one of an infinite number of vector norms (described below), depending + on the value of the ``ord`` parameter. + + Parameters + ---------- + x : array_like + Input tensor. If `axis` is None, `x` must be 1-D or 2-D. + ord : {non-zero int, inf, -inf, 'fro', 'nuc'}, optional + Order of the norm (see table under ``Notes``). inf means mars tensor's + `inf` object. + axis : {int, 2-tuple of ints, None}, optional + If `axis` is an integer, it specifies the axis of `x` along which to + compute the vector norms. If `axis` is a 2-tuple, it specifies the + axes that hold 2-D matrices, and the matrix norms of these matrices + are computed. If `axis` is None then either a vector norm (when `x` + is 1-D) or a matrix norm (when `x` is 2-D) is returned. + keepdims : bool, optional + If this is set to True, the axes which are normed over are left in the + result as dimensions with size one. With this option the result will + broadcast correctly against the original `x`. + + Returns + ------- + n : float or Tensor + Norm of the matrix or vector(s). + + Notes + ----- + For values of ``ord <= 0``, the result is, strictly speaking, not a + mathematical 'norm', but it may still be useful for various numerical + purposes. + + The following norms can be calculated: + + ===== ============================ ========================== + ord norm for matrices norm for vectors + ===== ============================ ========================== + None Frobenius norm 2-norm + 'fro' Frobenius norm -- + 'nuc' nuclear norm -- + inf max(sum(abs(x), axis=1)) max(abs(x)) + -inf min(sum(abs(x), axis=1)) min(abs(x)) + 0 -- sum(x != 0) + 1 max(sum(abs(x), axis=0)) as below + -1 min(sum(abs(x), axis=0)) as below + 2 2-norm (largest sing. value) as below + -2 smallest singular value as below + other -- sum(abs(x)**ord)**(1./ord) + ===== ============================ ========================== + + The Frobenius norm is given by [1]_: + + :math:`||A||_F = [\\sum_{i,j} abs(a_{i,j})^2]^{1/2}` + + The nuclear norm is the sum of the singular values. + + References + ---------- + .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, + Baltimore, MD, Johns Hopkins University Press, 1985, pg. 
15 + + Examples + -------- + >>> from mars.tensor import linalg as LA + >>> import mars.tensor as mt + >>> a = mt.arange(9) - 4 + >>> a.execute() + array([-4, -3, -2, -1, 0, 1, 2, 3, 4]) + >>> b = a.reshape((3, 3)) + >>> b.execute() + array([[-4, -3, -2], + [-1, 0, 1], + [ 2, 3, 4]]) + + >>> LA.norm(a).execute() + 7.745966692414834 + >>> LA.norm(b).execute() + 7.745966692414834 + >>> LA.norm(b, 'fro').execute() + 7.745966692414834 + >>> LA.norm(a, mt.inf).execute() + 4.0 + >>> LA.norm(b, mt.inf).execute() + 9.0 + >>> LA.norm(a, -mt.inf).execute() + 0.0 + >>> LA.norm(b, -mt.inf).execute() + 2.0 + + >>> LA.norm(a, 1).execute() + 20.0 + >>> LA.norm(b, 1).execute() + 7.0 + >>> LA.norm(a, -1).execute() + 0.0 + >>> LA.norm(b, -1).execute() + 6.0 + >>> LA.norm(a, 2).execute() + 7.745966692414834 + >>> LA.norm(b, 2).execute() + 7.3484692283495345 + + >>> LA.norm(a, -2).execute() + 0.0 + >>> LA.norm(b, -2).execute() + 4.351066026358965e-18 + >>> LA.norm(a, 3).execute() + 5.8480354764257312 + >>> LA.norm(a, -3).execute() + 0.0 + + Using the `axis` argument to compute vector norms: + + >>> c = mt.array([[ 1, 2, 3], + ... [-1, 1, 4]]) + >>> LA.norm(c, axis=0).execute() + array([ 1.41421356, 2.23606798, 5. ]) + >>> LA.norm(c, axis=1).execute() + array([ 3.74165739, 4.24264069]) + >>> LA.norm(c, ord=1, axis=1).execute() + array([ 6., 6.]) + + Using the `axis` argument to compute matrix norms: + + >>> m = mt.arange(8).reshape(2,2,2) + >>> LA.norm(m, axis=(1,2)).execute() + array([ 3.74165739, 11.22497216]) + >>> LA.norm(m[0, :, :]).execute(), LA.norm(m[1, :, :]).execute() + (3.7416573867739413, 11.224972160321824) + + """ + x = astensor(x) + ndim = x.ndim + + if ord == "fro": + ord = None + if axis is not None: + if isinstance(axis, Iterable): + axis = tuple(validate_axis(ndim, a) for a in axis) + else: + axis = (validate_axis(ndim, axis),) + else: + axis = tuple(range(x.ndim)) + + op = TensorNorm( + ord=ord, + axis=axis, + keepdims=keepdims, + dtype=np.result_type(x.dtype, np.float_), + sparse=x.issparse(), + ) + return op(x) diff --git a/python/xorbits/_mars/tensor/linalg/qr.py b/python/xorbits/_mars/tensor/linalg/qr.py new file mode 100644 index 000000000..c0e338d7c --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/qr.py @@ -0,0 +1,193 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... 
import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import KeyField, StringField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .core import SFQR, TSQR + + +class TensorQR(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.QR + + _input = KeyField("input") + _method = StringField("method") + + def __init__(self, method=None, **kw): + super().__init__(_method=method, **kw) + + @property + def method(self): + return self._method + + @property + def output_limit(self): + return 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + a = astensor(a) + + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional tensor given. Tensor must be two-dimensional" + ) + + tiny_q, tiny_r = np.linalg.qr(np.ones((1, 1), dtype=a.dtype)) + + x, y = a.shape + q_shape, r_shape = (a.shape, (y, y)) if x > y else ((x, x), a.shape) + q, r = self.new_tensors( + [a], + kws=[ + { + "side": "q", + "dtype": tiny_q.dtype, + "shape": q_shape, + "order": TensorOrder.C_ORDER, + }, + { + "side": "r", + "dtype": tiny_r.dtype, + "shape": r_shape, + "order": TensorOrder.C_ORDER, + }, + ], + ) + return ExecutableTuple([q, r]) + + @classmethod + def tile(cls, op): + q, r = op.outputs + q_dtype, r_dtype = q.dtype, r.dtype + q_shape, r_shape = q.shape, r.shape + in_tensor = op.input + if in_tensor.chunk_shape == (1, 1): + in_chunk = in_tensor.chunks[0] + chunk_op = op.copy().reset_key() + qr_chunks = chunk_op.new_chunks( + [in_chunk], + kws=[ + {"side": "q", "shape": q_shape, "index": in_chunk.index}, + {"side": "r", "shape": r_shape, "index": in_chunk.index}, + ], + ) + q_chunk, r_chunk = qr_chunks + + new_op = op.copy() + kws = [ + { + "chunks": [q_chunk], + "nsplits": ((q_shape[0],), (q_shape[1],)), + "dtype": q_dtype, + "shape": q_shape, + "order": q.order, + }, + { + "chunks": [r_chunk], + "nsplits": ((r_shape[0],), (r_shape[1],)), + "dtype": r_dtype, + "shape": r_shape, + "order": r.order, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + elif op.method == "tsqr": + return (yield from TSQR.tile(op)) + elif op.method == "sfqr": + return (yield from SFQR.tile(op)) + else: + raise NotImplementedError("Only tsqr method supported for now") + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + q, r = xp.linalg.qr(a) + qc, rc = op.outputs + ctx[qc.key] = q + ctx[rc.key] = r + + +def qr(a, method="tsqr"): + """ + Compute the qr factorization of a matrix. + + Factor the matrix `a` as *qr*, where `q` is orthonormal and `r` is + upper-triangular. + + Parameters + ---------- + a : array_like, shape (M, N) + Matrix to be factored. + method: {'tsqr', 'sfqr'}, optional + method to calculate qr factorization, tsqr as default + + TSQR is presented in: + + A. Benson, D. Gleich, and J. Demmel. + Direct QR factorizations for tall-and-skinny matrices in + MapReduce architectures. + IEEE International Conference on Big Data, 2013. + http://arxiv.org/abs/1301.1071 + + FSQR is a QR decomposition for fat and short matrix: + A = [A1, A2, A3, ...], A1 may be decomposed as A1 = Q1 * R1, + for A = Q * R, Q = Q1, R = [R1, R2, R3, ...] where A2 = Q1 * R2, A3 = Q1 * R3, ... 
+ + Returns + ------- + q : Tensor of float or complex, optional + A matrix with orthonormal columns. When mode = 'complete' the + result is an orthogonal/unitary matrix depending on whether or not + a is real/complex. The determinant may be either +/- 1 in that + case. + r : Tensor of float or complex, optional + The upper-triangular matrix. + + Raises + ------ + LinAlgError + If factoring fails. + + Notes + ----- + For more information on the qr factorization, see for example: + http://en.wikipedia.org/wiki/QR_factorization + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.random.randn(9, 6) + >>> q, r = mt.linalg.qr(a) + >>> mt.allclose(a, mt.dot(q, r)).execute() # a does equal qr + True + + """ + op = TensorQR(method=method) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/randomized_svd.py b/python/xorbits/_mars/tensor/linalg/randomized_svd.py new file mode 100644 index 000000000..66e491646 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/randomized_svd.py @@ -0,0 +1,230 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from ..utils import check_random_state +from .lu import lu +from .qr import qr +from .svd import svd +from .utils import svd_flip + +# --------------------------------------------------------------------- +# Original implementation is in `sklearn.utils.extmath.randomized_svd`. +# --------------------------------------------------------------------- + + +def randomized_range_finder( + A, size, n_iter, power_iteration_normalizer="auto", random_state=None +): + r"""Computes an orthonormal matrix whose range approximates the range of A. + + .. versionadded:: 0.1.3 + + Parameters + ---------- + A : 2D tensor + The input data tensor + + size : integer + Size of the return tensor + + n_iter : integer + Number of power iterations used to stabilize the result + + power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' + Whether the power iterations are normalized with step-by-step + QR factorization (the slowest but most accurate), 'none' + (the fastest but numerically unstable when `n_iter` is large, e.g. + typically 5 or larger), or 'LU' factorization (numerically stable + but can lose slightly in accuracy). The 'auto' mode applies no + normalization if `n_iter` <= 2 and switches to LU otherwise. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + + Returns + ------- + Q : 2D array + A (size x size) projection matrix, the range of which + approximates well the range of the input matrix A. 
+ + Notes + ----- + + Follows Algorithm 4.3 of + Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf + + An implementation of a randomized algorithm for principal component + analysis + A. Szlam et al. 2014 + """ + random_state = check_random_state(random_state) + + # Generating normal random vectors with shape: (A.shape[1], size) + Q = random_state.normal(size=(A.shape[1], size)) + if A.dtype.kind == "f": + # Ensure f32 is preserved as f32 + Q = Q.astype(A.dtype, copy=False) + + # Deal with "auto" mode + if power_iteration_normalizer == "auto": + if n_iter <= 2: + power_iteration_normalizer = "none" + else: + power_iteration_normalizer = "LU" + + # Perform power iterations with Q to further 'imprint' the top + # singular vectors of A in Q + for _ in range(n_iter): + if power_iteration_normalizer == "none": + Q = A.dot(Q) + Q = A.T.dot(Q) + elif power_iteration_normalizer == "LU": + # TODO: directly get Q when lu supports `permute_l` + p, l, _ = lu(A.dot(Q)) + Q = p.dot(l) + p, l, _ = lu(A.T.dot(Q)) + Q = p.dot(l) + elif power_iteration_normalizer == "QR": + Q, _ = qr(A.dot(Q)) + Q, _ = qr(A.T.dot(Q)) + + # Sample the range of A using by linear projection of Q + # Extract an orthonormal basis + Q, _ = qr(A.dot(Q)) + return Q + + +def randomized_svd( + M, + n_components, + n_oversamples=10, + n_iter="auto", + power_iteration_normalizer="auto", + transpose="auto", + flip_sign=True, + random_state=0, +): + r""" + Computes a truncated randomized SVD + + .. versionadded:: 0.1.4 + + Parameters + ---------- + M : Tensor + tensor to decompose + n_components : int + Number of singular values and vectors to extract. + n_oversamples : int (default is 10) + Additional number of random vectors to sample the range of M so as + to ensure proper conditioning. The total number of random vectors + used to find the range of M is n_components + n_oversamples. Smaller + number can improve speed but can negatively impact the quality of + approximation of singular vectors and singular values. + n_iter : int or 'auto' (default is 'auto') + Number of power iterations. It can be used to deal with very noisy + problems. When 'auto', it is set to 4, unless `n_components` is small + (< .1 * min(X.shape)) `n_iter` in which case is set to 7. + This improves precision with few components. + power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none' + Whether the power iterations are normalized with step-by-step + QR factorization (the slowest but most accurate), 'none' + (the fastest but numerically unstable when `n_iter` is large, e.g. + typically 5 or larger), or 'LU' factorization (numerically stable + but can lose slightly in accuracy). The 'auto' mode applies no + normalization if `n_iter` <= 2 and switches to LU otherwise. + transpose : True, False or 'auto' (default) + Whether the algorithm should be applied to M.T instead of M. The + result should approximately be the same. The 'auto' mode will + trigger the transposition if M.shape[1] > M.shape[0] since this + implementation of randomized SVD tend to be a little faster in that + case. + flip_sign : boolean, (True by default) + The output of a singular value decomposition is only unique up to a + permutation of the signs of the singular vectors. If `flip_sign` is + set to `True`, the sign ambiguity is resolved by making the largest + loadings for each component in the left singular vectors positive. 
+ random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + Notes + ----- + This algorithm finds a (usually very good) approximate truncated + singular value decomposition using randomization to speed up the + computations. It is particularly fast on large matrices on which + you wish to extract only a small number of components. In order to + obtain further speed up, `n_iter` can be set <=2 (at the cost of + loss of precision). + References + ---------- + * Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 https://arxiv.org/abs/0909.4061 + * A randomized algorithm for the decomposition of matrices + Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert + * An implementation of a randomized algorithm for principal component + analysis + A. Szlam et al. 2014 + """ + M = astensor(M) + random_state = check_random_state(random_state) + n_random = n_components + n_oversamples + n_samples, n_features = M.shape + + if n_iter == "auto": + # Check if the number of iterations is explicitly specified + # Adjust n_iter. 7 was found a good compromise for PCA. + # https://github.com/scikit-learn/scikit-learn/pull/5299 + n_iter = 7 if n_components < 0.1 * min(M.shape) else 4 + + if transpose == "auto": + transpose = n_samples < n_features + if transpose: + # this implementation is a bit faster with smaller shape[1] + M = M.T + + Q = randomized_range_finder( + M, n_random, n_iter, power_iteration_normalizer, random_state + ) + # project M to the (k + p) dimensional space using the basis vectors + B = Q.T.dot(M) + + # compute the SVD on the thin matrix: (k + p) wide + Uhat, s, V = svd(B) + + U = Q.dot(Uhat) + + if flip_sign: + if not transpose: + U, V = svd_flip(U, V) + else: + # In case of transpose u_based_decision=false + # to actually flip based on u and not v. + U, V = svd_flip(U, V, u_based_decision=False) + + if transpose: + # transpose back the results according to the input convention + return V[:n_components, :].T, s[:n_components], U[:, :n_components].T + else: + return U[:, :n_components], s[:n_components], V[:n_components, :] diff --git a/python/xorbits/_mars/tensor/linalg/solve.py b/python/xorbits/_mars/tensor/linalg/solve.py new file mode 100644 index 000000000..216ca57ea --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/solve.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .cholesky import cholesky +from .lu import lu +from .solve_triangular import solve_triangular + + +def solve(a, b, sym_pos=False, sparse=None): + """ + Solve the equation ``a x = b`` for ``x``. 
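A small usage sketch for the `randomized_svd` defined above. The import path and the implicit default session are assumptions based on the docstring and test conventions in this diff; the docstrings use `mars.tensor`, while in this repository the package is vendored under `xorbits._mars`.

    import numpy as np
    import mars.tensor as mt
    from mars.tensor.linalg import randomized_svd  # path assumed from the test imports

    rs = np.random.RandomState(0)
    data = rs.randn(200, 5) @ rs.randn(5, 80)      # exactly rank-5 matrix

    M = mt.tensor(data, chunk_size=50)
    U, s, V = randomized_svd(M, n_components=5, random_state=0)

    # The recovered singular values should closely match the exact ones,
    # since the matrix has rank 5 and power iterations are applied.
    s_exact = np.linalg.svd(data, compute_uv=False)[:5]
    np.testing.assert_allclose(s.execute().fetch(), s_exact, rtol=1e-6)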
+ + Parameters + ---------- + a : (M, M) array_like + A square matrix. + b : (M,) or (M, N) array_like + Right-hand side matrix in ``a x = b``. + sym_pos : bool + Assume `a` is symmetric and positive definite. If ``True``, use Cholesky + decomposition. + sparse: bool, optional + Return sparse value or not. + + Returns + ------- + x : (M,) or (M, N) ndarray + Solution to the system ``a x = b``. Shape of the return matches the + shape of `b`. + + Raises + ------ + LinAlgError + If `a` is singular. + + Examples + -------- + Given `a` and `b`, solve for `x`: + + >>> import mars.tensor as mt + >>> a = mt.array([[3, 2, 0], [1, -1, 0], [0, 5, 1]]) + >>> b = mt.array([2, 4, -1]) + >>> x = mt.linalg.solve(a, b) + >>> x.execute() + array([ 2., -2., 9.]) + + >>> mt.dot(a, x).execute() # Check the result + array([ 2., 4., -1.]) + """ + a = astensor(a) + b = astensor(b) + if sym_pos: + l_ = cholesky(a, lower=True) + u = l_.T + else: + p, l_, u = lu(a) + b = p.T.dot(b) + sparse = sparse if sparse is not None else a.issparse() + uy = solve_triangular(l_, b, lower=True, sparse=sparse) + return solve_triangular(u, uy, sparse=sparse) diff --git a/python/xorbits/_mars/tensor/linalg/solve_triangular.py b/python/xorbits/_mars/tensor/linalg/solve_triangular.py new file mode 100644 index 000000000..c1ce4e83a --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/solve_triangular.py @@ -0,0 +1,234 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... 
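For reference, the factor-then-substitute scheme that `solve` above builds lazily can be written directly with SciPy (Cholesky takes the place of LU when `sym_pos=True`); this is a sketch of the numerical idea, not the distributed implementation.

    import numpy as np
    import scipy.linalg

    rng = np.random.default_rng(0)
    a = rng.integers(1, 10, (5, 5)).astype(float)
    b = rng.integers(1, 10, 5).astype(float)

    p, l, u = scipy.linalg.lu(a)                               # a == p @ l @ u
    y = scipy.linalg.solve_triangular(l, p.T @ b, lower=True)  # forward substitution
    x = scipy.linalg.solve_triangular(u, y)                    # backward substitution

    np.testing.assert_allclose(a @ x, b)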
import opcodes as OperandDef +from ...core import recursive_tile +from ...serialization.serializables import BoolField, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, cp, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import decide_unify_split + + +class TensorSolveTriangular(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.SOLVE_TRIANGULAR + + _a = KeyField("a") + _b = KeyField("b") + _lower = BoolField("lower") + _strict = BoolField("strict") + + def __init__(self, lower=None, strict=None, **kw): + super().__init__(_lower=lower, _strict=strict, **kw) + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + @property + def lower(self): + return self._lower + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a, self._b = self._inputs + + def __call__(self, a, b): + shape = (a.shape[1],) if len(b.shape) == 1 else (a.shape[1], b.shape[1]) + return self.new_tensor([a, b], shape, order=TensorOrder.F_ORDER) + + @property + def strict(self): + return self._strict + + @classmethod + def tile(cls, op): + from ..arithmetic.subtract import TensorSubtract + from ..arithmetic.utils import chunk_tree_add + from .dot import TensorDot + + if has_unknown_shape(*op.inputs): + yield + + a, b = op.a, op.b + unified_nsplit = decide_unify_split(a.nsplits[0], a.nsplits[1], b.nsplits[0]) + a = yield from recursive_tile(a.rechunk((unified_nsplit, unified_nsplit))) + b = yield from recursive_tile(b.rechunk((unified_nsplit,) + b.nsplits[1:])) + + b_multi_dim = b.ndim > 1 + b_hsplits = b.chunk_shape[1] if b_multi_dim else 1 + + def _x_shape(a_shape, b_shape): + return (a_shape[1],) if len(b_shape) == 1 else (a_shape[1], b_shape[1]) + + def _dot_shape(a_shape, b_shape): + return (a_shape[0],) if len(b_shape) == 1 else (a_shape[0], b_shape[1]) + + lower = op.lower + out_chunks = {} + if lower: + i_range = range(a.chunk_shape[0]) + else: + i_range = range(a.chunk_shape[0] - 1, -1, -1) + for i in i_range: + target_a = a.cix[i, i] + for j in range(b_hsplits): + idx = (i, j) if b_multi_dim else (i,) + target_b = b.cix[idx] + if (lower and i > 0) or (not lower and i < a.chunk_shape[0] - 1): + prev_chunks = [] + if lower: + k_range = range(i) + else: + k_range = range(i + 1, a.chunk_shape[0]) + for k in k_range: + a_chunk, b_chunk = ( + a.cix[i, k], + out_chunks[(k, j) if b_multi_dim else (k,)], + ) + prev_chunk = TensorDot( + dtype=op.dtype, sparse=a_chunk.issparse() + ).new_chunk( + [a_chunk, b_chunk], + shape=_dot_shape(a_chunk.shape, b_chunk.shape), + ) + prev_chunks.append(prev_chunk) + if len(prev_chunks) == 1: + s = prev_chunks[0] + else: + s = chunk_tree_add( + prev_chunks[0].dtype, + prev_chunks, + None, + prev_chunks[0].shape, + sparse=op.sparse, + ) + target_b = TensorSubtract( + dtype=op.dtype, lhs=target_b, rhs=s + ).new_chunk([target_b, s], shape=target_b.shape) + out_chunk = TensorSolveTriangular( + lower=lower, sparse=op.sparse, dtype=op.dtype + ).new_chunk( + [target_a, target_b], + shape=_x_shape(target_a.shape, target_b.shape), + index=idx, + order=op.outputs[0].order, + ) + out_chunks[out_chunk.index] = out_chunk + + new_op = op.copy() + nsplits = (a.nsplits[0],) if b.ndim == 1 else (a.nsplits[0], b.nsplits[1]) + return new_op.new_tensors( + op.inputs, + op.outputs[0].shape, + chunks=list(out_chunks.values()), + nsplits=nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + 
(a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + chunk = op.outputs[0] + with device(device_id): + if xp is np: + import scipy.linalg + + try: + ctx[chunk.key] = scipy.linalg.solve_triangular(a, b, lower=op.lower) + except np.linalg.LinAlgError: + if op.strict is not False: + raise + ctx[chunk.key] = np.linalg.lstsq(a, b, rcond=-1)[0] + elif xp is cp: + import cupyx + + ctx[chunk.key] = cupyx.scipy.linalg.solve_triangular( + a, b, lower=op.lower + ) + else: + ctx[chunk.key] = xp.solve_triangular( + a, b, lower=op.lower, sparse=op.sparse + ) + + +def solve_triangular(a, b, lower=False, sparse=None): + """ + Solve the equation `a x = b` for `x`, assuming a is a triangular matrix. + + Parameters + ---------- + a : (M, M) array_like + A triangular matrix + b : (M,) or (M, N) array_like + Right-hand side matrix in `a x = b` + lower : bool, optional + Use only data contained in the lower triangle of `a`. + Default is to use upper triangle. + sparse: bool, optional + Return sparse value or not. + + Returns + ------- + x : (M,) or (M, N) ndarray + Solution to the system `a x = b`. Shape of return matches `b`. + + Examples + -------- + Solve the lower triangular system a x = b, where:: + [3 0 0 0] [4] + a = [2 1 0 0] b = [2] + [1 0 1 0] [4] + [1 1 1 1] [2] + + >>> import mars.tensor as mt + >>> a = mt.array([[3, 0, 0, 0], [2, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]) + >>> b = mt.array([4, 2, 4, 2]) + >>> x = mt.linalg.solve_triangular(a, b, lower=True) + >>> x.execute() + array([ 1.33333333, -0.66666667, 2.66666667, -1.33333333]) + + >>> a.dot(x).execute() # Check the result + array([ 4., 2., 4., 2.]) + """ + import scipy.linalg + + a = astensor(a) + b = astensor(b) + + if a.ndim != 2: + raise LinAlgError("a must be 2 dimensional") + if b.ndim <= 2: + if a.shape[1] != b.shape[0]: + raise LinAlgError("a.shape[1] and b.shape[0] must be equal") + else: + raise LinAlgError("b must be 1 or 2 dimensional") + + tiny_x = scipy.linalg.solve_triangular( + np.array([[2, 0], [2, 1]], dtype=a.dtype), np.array([[2], [3]], dtype=b.dtype) + ) + sparse = sparse if sparse is not None else a.issparse() + op = TensorSolveTriangular(lower=lower, dtype=tiny_x.dtype, sparse=sparse) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/linalg/svd.py b/python/xorbits/_mars/tensor/linalg/svd.py new file mode 100644 index 000000000..ec1919dca --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/svd.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from numpy.linalg import LinAlgError + +from ... 
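The chunked forward/backward substitution that `TensorSolveTriangular.tile` expresses as an operator graph can be sketched serially in NumPy; below is a minimal lower-triangular, single-block-column version for intuition only.

    import numpy as np
    import scipy.linalg

    rng = np.random.default_rng(0)
    n, blk = 6, 3
    A = np.tril(rng.integers(1, 10, (n, n))).astype(float)     # lower triangular
    b = rng.integers(1, 10, n).astype(float)

    x = np.empty_like(b)
    for i in range(0, n, blk):
        # Subtract contributions of already-solved blocks, then solve the
        # diagonal block (the role played by the recursive chunk op above).
        rhs = b[i:i + blk] - A[i:i + blk, :i] @ x[:i]
        x[i:i + blk] = scipy.linalg.solve_triangular(
            A[i:i + blk, i:i + blk], rhs, lower=True
        )

    np.testing.assert_allclose(A @ x, b)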
import opcodes as OperandDef +from ...core import ExecutableTuple +from ...serialization.serializables import KeyField, StringField +from ..array_utils import as_same_device, device +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from .core import TSQR +from .utils import calc_svd_shapes + + +class TensorSVD(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.SVD + + _input = KeyField("input") + _method = StringField("method") + + def __init__(self, method=None, **kw): + super().__init__(_method=method, **kw) + + @property + def method(self): + return self._method + + @property + def output_limit(self): + return 3 + + @classmethod + def _is_svd(cls): + return True + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def __call__(self, a): + a = astensor(a) + + if a.ndim != 2: + raise LinAlgError( + f"{a.ndim}-dimensional tensor given. Tensor must be two-dimensional" + ) + + tiny_U, tiny_s, tiny_V = np.linalg.svd(np.ones((1, 1), dtype=a.dtype)) + + # if a's shape is (6, 18), U's shape is (6, 6), s's shape is (6,), V's shape is (6, 18) + # if a's shape is (18, 6), U's shape is (18, 6), s's shape is (6,), V's shape is (6, 6) + U_shape, s_shape, V_shape = calc_svd_shapes(a) + U, s, V = self.new_tensors( + [a], + order=TensorOrder.C_ORDER, + kws=[ + {"side": "U", "dtype": tiny_U.dtype, "shape": U_shape}, + {"side": "s", "dtype": tiny_s.dtype, "shape": s_shape}, + {"side": "V", "dtype": tiny_V.dtype, "shape": V_shape}, + ], + ) + return ExecutableTuple([U, s, V]) + + @classmethod + def tile(cls, op): + U, s, V = op.outputs + U_dtype, s_dtype, V_dtype = U.dtype, s.dtype, V.dtype + U_shape, s_shape, V_shape = U.shape, s.shape, V.shape + in_tensor = op.input + if in_tensor.chunk_shape == (1, 1): + in_chunk = in_tensor.chunks[0] + chunk_op = op.copy().reset_key() + svd_chunks = chunk_op.new_chunks( + [in_chunk], + kws=[ + { + "side": "U", + "dtype": U_dtype, + "index": in_chunk.index, + "shape": U_shape, + "order": U.order, + }, + { + "side": "s", + "dtype": s_dtype, + "index": in_chunk.index[1:], + "shape": s_shape, + "order": s.order, + }, + { + "side": "V", + "dtype": V_dtype, + "index": in_chunk.index, + "shape": V_shape, + "order": V.order, + }, + ], + ) + U_chunk, s_chunk, V_chunk = svd_chunks + + new_op = op.copy() + kws = [ + { + "chunks": [U_chunk], + "nsplits": tuple((s,) for s in U_shape), + "dtype": U_dtype, + "shape": U_shape, + }, + { + "chunks": [s_chunk], + "nsplits": tuple((s,) for s in s_shape), + "dtype": s_dtype, + "shape": s_shape, + }, + { + "chunks": [V_chunk], + "nsplits": tuple((s,) for s in V_shape), + "dtype": V_dtype, + "shape": V_shape, + }, + ] + return new_op.new_tensors(op.inputs, kws=kws) + elif op.method == "tsqr": + return (yield from TSQR.tile(op)) + else: + raise NotImplementedError("Only tsqr method supported for now") + + @classmethod + def execute(cls, ctx, op): + (a,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + u, s, v = xp.linalg.svd(a, full_matrices=False) + uc, sc, vc = op.outputs + ctx[uc.key] = u + ctx[sc.key] = s + ctx[vc.key] = v + + +def svd(a, method="tsqr"): + """ + Singular Value Decomposition. + + When `a` is a 2D tensor, it is factorized as ``u @ np.diag(s) @ vh + = (u * s) @ vh``, where `u` and `vh` are 2D unitary tensors and `s` is a 1D + tensor of `a`'s singular values. 
When `a` is higher-dimensional, SVD is + applied in stacked mode as explained below. + + Parameters + ---------- + a : (..., M, N) array_like + A real or complex tensor with ``a.ndim >= 2``. + method: {'tsqr'}, optional + method to calculate qr factorization, tsqr as default + + TSQR is presented in: + + A. Benson, D. Gleich, and J. Demmel. + Direct QR factorizations for tall-and-skinny matrices in + MapReduce architectures. + IEEE International Conference on Big Data, 2013. + http://arxiv.org/abs/1301.1071 + + + Returns + ------- + u : { (..., M, M), (..., M, K) } tensor + Unitary tensor(s). The first ``a.ndim - 2`` dimensions have the same + size as those of the input `a`. The size of the last two dimensions + depends on the value of `full_matrices`. Only returned when + `compute_uv` is True. + s : (..., K) tensor + Vector(s) with the singular values, within each vector sorted in + descending order. The first ``a.ndim - 2`` dimensions have the same + size as those of the input `a`. + vh : { (..., N, N), (..., K, N) } tensor + Unitary tensor(s). The first ``a.ndim - 2`` dimensions have the same + size as those of the input `a`. The size of the last two dimensions + depends on the value of `full_matrices`. Only returned when + `compute_uv` is True. + + Raises + ------ + LinAlgError + If SVD computation does not converge. + + Notes + ----- + + SVD is usually described for the factorization of a 2D matrix :math:`A`. + The higher-dimensional case will be discussed below. In the 2D case, SVD is + written as :math:`A = U S V^H`, where :math:`A = a`, :math:`U= u`, + :math:`S= \\mathtt{np.diag}(s)` and :math:`V^H = vh`. The 1D tensor `s` + contains the singular values of `a` and `u` and `vh` are unitary. The rows + of `vh` are the eigenvectors of :math:`A^H A` and the columns of `u` are + the eigenvectors of :math:`A A^H`. In both cases the corresponding + (possibly non-zero) eigenvalues are given by ``s**2``. + + If `a` has more than two dimensions, then broadcasting rules apply, as + explained in :ref:`routines.linalg-broadcasting`. This means that SVD is + working in "stacked" mode: it iterates over all indices of the first + ``a.ndim - 2`` dimensions and for each combination SVD is applied to the + last two indices. The matrix `a` can be reconstructed from the + decomposition with either ``(u * s[..., None, :]) @ vh`` or + ``u @ (s[..., None] * vh)``. (The ``@`` operator can be replaced by the + function ``mt.matmul`` for python versions below 3.5.) + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.random.randn(9, 6) + 1j*mt.random.randn(9, 6) + >>> b = mt.random.randn(2, 7, 8, 3) + 1j*mt.random.randn(2, 7, 8, 3) + + Reconstruction based on reduced SVD, 2D case: + + >>> u, s, vh = mt.linalg.svd(a) + >>> u.shape, s.shape, vh.shape + ((9, 6), (6,), (6, 6)) + >>> np.allclose(a, np.dot(u * s, vh)) + True + >>> smat = np.diag(s) + >>> np.allclose(a, np.dot(u, np.dot(smat, vh))) + True + + """ + op = TensorSVD(method=method) + return op(a) diff --git a/python/xorbits/_mars/tensor/linalg/tensordot.py b/python/xorbits/_mars/tensor/linalg/tensordot.py new file mode 100644 index 000000000..d31fedbad --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tensordot.py @@ -0,0 +1,337 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
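The "tsqr" method referenced by the `svd` docstring above combines block QR factorizations of a tall-and-skinny matrix; here is a NumPy-only sketch of the idea (not the chunked implementation), showing how the reduced SVD falls out of the stacked R factors.

    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.standard_normal((18, 6))
    blocks = np.split(a, 3, axis=0)                      # three (6, 6) row blocks

    # Local QR of every row block, then one more QR of the stacked R factors.
    qs, rs = zip(*(np.linalg.qr(blk) for blk in blocks))
    q2, r = np.linalg.qr(np.vstack(rs))
    q = np.vstack([qi @ q2[6 * i:6 * (i + 1)] for i, qi in enumerate(qs)])

    np.testing.assert_allclose(q @ r, a, atol=1e-10)     # q has orthonormal columns

    # SVD of the small R gives the reduced SVD of the full matrix.
    u_small, s, vh = np.linalg.svd(r, full_matrices=False)
    u = q @ u_small
    np.testing.assert_allclose(u @ np.diag(s) @ vh, a, atol=1e-10)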
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, KeyField, TupleField +from ...utils import has_unknown_shape +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device, is_sparse_module +from ..core import TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import unify_chunks + + +class TensorTensorDot(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.TENSORDOT + + _a = KeyField("a") + _b = KeyField("b") + _a_axes = TupleField("a_axes", FieldTypes.int32) + _b_axes = TupleField("b_axes", FieldTypes.int32) + + def __init__(self, a_axes=None, b_axes=None, **kw): + super().__init__(_a_axes=a_axes, _b_axes=b_axes, **kw) + + @property + def a(self): + return self._a + + @property + def b(self): + return self._b + + @property + def a_axes(self): + return self._a_axes + + @property + def b_axes(self): + return self._b_axes + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + self._b = self._inputs[1] + + def __call__(self, a, b): + shape = tuple( + s for i, s in enumerate(a.shape) if i not in set(self._a_axes) + ) + tuple(s for i, s in enumerate(b.shape) if i not in set(self._b_axes)) + return self.new_tensor([a, b], shape, order=TensorOrder.C_ORDER) + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + if chunk.is_sparse(): + return super().estimate_size(ctx, op) + + # empirical value in real environments + calc_usage = chunk.nbytes + + # add input sizes when sparse-to-dense is needed + for inp in chunk.inputs: + if inp.is_sparse(): + calc_usage += inp.nbytes + + ctx[chunk.key] = (chunk.nbytes, calc_usage) + + @classmethod + def tile(cls, op): + a, b, a_axes, b_axes = op.a, op.b, op.a_axes, op.b_axes + + c = itertools.count(max(a.ndim, b.ndim)) + a_ax = tuple(a_axes.index(i) if i in a_axes else next(c) for i in range(a.ndim)) + b_ax = tuple(b_axes.index(i) if i in b_axes else next(c) for i in range(b.ndim)) + if has_unknown_shape(*op.inputs): + yield + a, b = yield from unify_chunks((a, a_ax), (b, b_ax)) + out = op.outputs[0] + + a_output_indexes = [ + range(len(a.nsplits[i])) for i in range(a.ndim) if i not in a_axes + ] + b_output_indexes = [ + range(len(b.nsplits[i])) for i in range(b.ndim) if i not in b_axes + ] + output_axes = [(0, i) for i in range(a.ndim) if i not in a_axes] + [ + (1, i) for i in range(b.ndim) if i not in b_axes + ] + + out_chunks = [] + for out_idx in itertools.product( + *itertools.chain(a_output_indexes, b_output_indexes) + ): + a_indexes = [None] * a.ndim + b_indexes = [None] * b.ndim + tensor_shape = [] + for i, idx in enumerate(out_idx): + t_idx, axis = output_axes[i] + t = (a, b)[t_idx] + (a_indexes if t_idx == 0 else b_indexes)[axis] = idx + tensor_shape.append(t.nsplits[axis][idx]) + tensor_shape = tuple(tensor_shape) + + tensordot_chunks = [] + for contract_indexes in itertools.product( + *[range(len(a.nsplits[ax])) for ax in a_axes] + ): 
+ a_indices, b_indices = list(a_indexes), list(b_indexes) + for a_axis, contract_index in zip(a_axes, contract_indexes): + a_indices[a_axis] = contract_index + for b_axis, contract_index in zip(b_axes, contract_indexes): + b_indices[b_axis] = contract_index + + tensordot_chunk_op = op.copy().reset_key() + tensordot_chunk = tensordot_chunk_op.new_chunk( + [a.cix[tuple(a_indices)], b.cix[tuple(b_indices)]], + shape=tensor_shape, + order=out.order, + ) + tensordot_chunks.append(tensordot_chunk) + + if len(tensordot_chunks) == 1: + c = tensordot_chunks[0] + chunk_op = c.op.copy() + chunk = chunk_op.new_chunk( + c.inputs, shape=c.shape, index=out_idx, order=out.order + ) + else: + chunk = chunk_tree_add( + op.dtype, tensordot_chunks, out_idx, tensor_shape, sparse=op.sparse + ) + out_chunks.append(chunk) + + get_nsplits = lambda t_idx, i: (a, b)[t_idx].nsplits[i] + nsplits = [get_nsplits(*it) for it in output_axes] + new_op = op.copy() + return new_op.new_tensors([a, b], out.shape, chunks=out_chunks, nsplits=nsplits) + + @classmethod + def execute(cls, ctx, op): + (a, b), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axes = op.a_axes, op.b_axes + with device(device_id): + if not op.sparse and is_sparse_module(xp): + # tell sparse to do calculation on numpy or cupy dot + ctx[op.outputs[0].key] = xp.tensordot(a, b, axes, sparse=False) + else: + ret = xp.tensordot(a, b, axes) + out = op.outputs[0] + ctx[out.key] = ret.astype(ret.dtype, order=out.order.value, copy=False) + + +def tensordot(a, b, axes=2, sparse=None): + """ + Compute tensor dot product along specified axes for tensors >= 1-D. + + Given two tensors (arrays of dimension greater than or equal to one), + `a` and `b`, and an array_like object containing two array_like + objects, ``(a_axes, b_axes)``, sum the products of `a`'s and `b`'s + elements (components) over the axes specified by ``a_axes`` and + ``b_axes``. The third argument can be a single non-negative + integer_like scalar, ``N``; if it is such, then the last ``N`` + dimensions of `a` and the first ``N`` dimensions of `b` are summed + over. + + Parameters + ---------- + a, b : array_like, len(shape) >= 1 + Tensors to "dot". + axes : int or (2,) array_like + * integer_like + If an int N, sum over the last N axes of `a` and the first N axes + of `b` in order. The sizes of the corresponding axes must match. + * (2,) array_like + Or, a list of axes to be summed over, first sequence applying to `a`, + second to `b`. Both elements array_like must be of the same length. + + See Also + -------- + dot, einsum + + Notes + ----- + Three common use cases are: + + * ``axes = 0`` : tensor product :math:`a\\otimes b` + * ``axes = 1`` : tensor dot product :math:`a\\cdot b` + * ``axes = 2`` : (default) tensor double contraction :math:`a:b` + + When `axes` is integer_like, the sequence for evaluation will be: first + the -Nth axis in `a` and 0th axis in `b`, and the -1th axis in `a` and + Nth axis in `b` last. + + When there is more than one axis to sum over - and they are not the last + (first) axes of `a` (`b`) - the argument `axes` should consist of + two sequences of the same length, with the first axis to sum over given + first in both sequences, the second axis second, and so forth. 
+ + Examples + -------- + >>> import mars.tensor as mt + + A "traditional" example: + + >>> a = mt.arange(60.).reshape(3,4,5) + >>> b = mt.arange(24.).reshape(4,3,2) + >>> c = mt.tensordot(a,b, axes=([1,0],[0,1])) + >>> c.shape + (5, 2) + + >>> r = c.execute() + >>> r + array([[ 4400., 4730.], + [ 4532., 4874.], + [ 4664., 5018.], + [ 4796., 5162.], + [ 4928., 5306.]]) + + >>> # A slower but equivalent way of computing the same... + >>> ra = np.arange(60.).reshape(3,4,5) + >>> rb = np.arange(24.).reshape(4,3,2) + >>> d = np.zeros((5,2)) + >>> for i in range(5): + ... for j in range(2): + ... for k in range(3): + ... for n in range(4): + ... d[i,j] += ra[k,n,i] * rb[n,k,j] + >>> r == d + array([[ True, True], + [ True, True], + [ True, True], + [ True, True], + [ True, True]], dtype=bool) + + An extended example taking advantage of the overloading of + and \\*: + + >>> a = mt.array(range(1, 9)) + >>> a.shape = (2, 2, 2) + >>> A = mt.array(('a', 'b', 'c', 'd'), dtype=object) + >>> A.shape = (2, 2) + >>> a.execute(); A.execute() + array([[[1, 2], + [3, 4]], + [[5, 6], + [7, 8]]]) + array([[a, b], + [c, d]], dtype=object) + + >>> mt.tensordot(a, A).execute() # third argument default is 2 for double-contraction + array([abbcccdddd, aaaaabbbbbbcccccccdddddddd], dtype=object) + + >>> mt.tensordot(a, A, 1).execute() + array([[[acc, bdd], + [aaacccc, bbbdddd]], + [[aaaaacccccc, bbbbbdddddd], + [aaaaaaacccccccc, bbbbbbbdddddddd]]], dtype=object) + + >>> mt.tensordot(a, A, 0).execute() # tensor product (result too long to incl.) + array([[[[[a, b], + [c, d]], + ... + + >>> mt.tensordot(a, A, (0, 1)).execute() + array([[[abbbbb, cddddd], + [aabbbbbb, ccdddddd]], + [[aaabbbbbbb, cccddddddd], + [aaaabbbbbbbb, ccccdddddddd]]], dtype=object) + + >>> mt.tensordot(a, A, (2, 1)).execute() + array([[[abb, cdd], + [aaabbbb, cccdddd]], + [[aaaaabbbbbb, cccccdddddd], + [aaaaaaabbbbbbbb, cccccccdddddddd]]], dtype=object) + + >>> mt.tensordot(a, A, ((0, 1), (0, 1))).execute() + array([abbbcccccddddddd, aabbbbccccccdddddddd], dtype=object) + + >>> mt.tensordot(a, A, ((2, 1), (1, 0))).execute() + array([acccbbdddd, aaaaacccccccbbbbbbdddddddd], dtype=object) + """ + a = astensor(a) + b = astensor(b) + + if isinstance(axes, Iterable): + a_axes, b_axes = axes + else: + a_axes = tuple(range(a.ndim - 1, a.ndim - axes - 1, -1)) + b_axes = tuple(range(0, axes)) + + if isinstance(a_axes, Iterable): + a_axes = tuple(a_axes) + else: + a_axes = (a_axes,) + a_axes = tuple(axis if axis >= 0 else a.ndim + axis for axis in a_axes) + if isinstance(b_axes, Iterable): + b_axes = tuple(b_axes) + else: + b_axes = (b_axes,) + b_axes = tuple(axis if axis >= 0 else b.ndim + axis for axis in b_axes) + + if ( + a.shape + and b.shape + and not np.array_equal( + np.array(a.shape)[list(a_axes)], np.array(b.shape)[list(b_axes)] + ) + ): + raise ValueError("shape-mismatch for sum") + + sparse = sparse if sparse is not None else a.issparse() and b.issparse() + op = TensorTensorDot( + a_axes=a_axes, + b_axes=b_axes, + dtype=np.promote_types(a.dtype, b.dtype), + sparse=sparse, + ) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/linalg/tests/__init__.py b/python/xorbits/_mars/tensor/linalg/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
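A quick cross-check of the axes semantics documented for `tensordot` above against NumPy; the `mars.tensor` import and the implicit default session follow the docstring examples and are assumptions here (in this repository the package lives under `xorbits._mars`).

    import numpy as np
    import mars.tensor as mt

    ra = np.arange(60.).reshape(3, 4, 5)
    rb = np.arange(24.).reshape(4, 3, 2)

    # Sequence form: contract axes (1, 0) of `a` with axes (0, 1) of `b`.
    c = mt.tensordot(mt.tensor(ra, chunk_size=2), mt.tensor(rb, chunk_size=2),
                     axes=([1, 0], [0, 1]))
    np.testing.assert_allclose(c.execute().fetch(),
                               np.tensordot(ra, rb, axes=([1, 0], [0, 1])))

    # Integer form: axes=1 contracts the last axis of `a` with the first of `b`.
    m1, m2 = np.arange(12.).reshape(3, 4), np.arange(20.).reshape(4, 5)
    d = mt.tensordot(mt.tensor(m1), mt.tensor(m2), axes=1)
    np.testing.assert_allclose(d.execute().fetch(), m1 @ m2)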
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/linalg/tests/test_linalg.py b/python/xorbits/_mars/tensor/linalg/tests/test_linalg.py new file mode 100644 index 000000000..10fcea1ef --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tests/test_linalg.py @@ -0,0 +1,481 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from .... import tensor as mt +from ....core import tile +from ... import dot, empty, ones, tensor +from ...core import SparseTensor, Tensor +from .. import matmul +from ..inv import TensorInv + + +def test_qr(): + a = mt.random.rand(9, 6, chunk_size=(3, 6)) + q, r = mt.linalg.qr(a) + + assert q.shape == (9, 6) + assert r.shape == (6, 6) + + q, r = tile(q, r) + + assert len(q.chunks) == 3 + assert len(r.chunks) == 1 + assert q.nsplits == ((3, 3, 3), (6,)) + assert r.nsplits == ((6,), (6,)) + + assert q.chunks[0].shape == (3, 6) + assert q.chunks[0].inputs[0].shape == (3, 3) + assert q.chunks[0].inputs[1].shape == (3, 6) + + a = mt.random.rand(18, 6, chunk_size=(9, 6)) + q, r = mt.linalg.qr(a) + + assert q.shape == (18, 6) + assert r.shape == (6, 6) + + q, r = tile(q, r) + + assert len(q.chunks) == 2 + assert len(r.chunks) == 1 + assert q.nsplits == ((9, 9), (6,)) + assert r.nsplits == ((6,), (6,)) + + assert q.chunks[0].shape == (9, 6) + assert q.chunks[0].inputs[0].shape == (9, 6) + assert q.chunks[0].inputs[1].shape == (6, 6) + + # for Short-and-Fat QR + a = mt.random.rand(6, 18, chunk_size=(6, 6)) + q, r = mt.linalg.qr(a, method="sfqr") + + assert q.shape == (6, 6) + assert r.shape == (6, 18) + + q, r = tile(q, r) + + assert len(q.chunks) == 1 + assert len(r.chunks) == 3 + assert q.nsplits == ((6,), (6,)) + assert r.nsplits == ((6,), (6, 6, 6)) + + # chunk width less than height + a = mt.random.rand(6, 9, chunk_size=(6, 3)) + q, r = mt.linalg.qr(a, method="sfqr") + + assert q.shape == (6, 6) + assert r.shape == (6, 9) + + q, r = tile(q, r) + + assert len(q.chunks) == 1 + assert len(r.chunks) == 2 + assert q.nsplits == ((6,), (6,)) + assert r.nsplits == ((6,), (6, 3)) + + a = mt.random.rand(9, 6, chunk_size=(9, 3)) + q, r = mt.linalg.qr(a, method="sfqr") + + assert q.shape == (9, 6) + assert r.shape == (6, 6) + + q, r = tile(q, r) + + assert len(q.chunks) == 1 + assert len(r.chunks) == 1 + assert q.nsplits == ((9,), (6,)) + assert r.nsplits == ((6,), (6,)) + + +def test_norm(): + data = np.random.rand(9, 6) + + a = mt.tensor(data, chunk_size=(2, 6)) + + for ord in (None, "nuc", np.inf, -np.inf, 0, 1, -1, 2, -2): + for axis in (0, 1, (0, 1)): + for keepdims in (True, 
False): + try: + res = mt.linalg.norm(a, ord=ord, axis=axis, keepdims=keepdims) + expect_shape = np.linalg.norm( + data, ord=ord, axis=axis, keepdims=keepdims + ).shape + assert res.shape == expect_shape + except ValueError: + continue + + +def test_svd(): + a = mt.random.rand(9, 6, chunk_size=(3, 6)) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (9, 6) + assert s.shape == (6,) + assert V.shape == (6, 6) + + U, s, V = tile(U, s, V) + + assert len(U.chunks) == 3 + assert U.chunks[0].shape == (3, 6) + assert len(s.chunks) == 1 + assert s.chunks[0].shape == (6,) + assert len(V.chunks) == 1 + assert V.chunks[0].shape == (6, 6) + + assert U.chunks[0].inputs[0].shape == (3, 6) + assert U.chunks[0].inputs[0].inputs[0].shape == (3, 3) + assert U.chunks[0].inputs[0].inputs[1].shape == (3, 6) + + assert s.ndim == 1 + assert len(s.chunks[0].index) == 1 + + a = mt.random.rand(9, 6, chunk_size=(9, 6)) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (9, 6) + assert s.shape == (6,) + assert V.shape == (6, 6) + + U, s, V = tile(U, s, V) + + assert len(U.chunks) == 1 + assert U.chunks[0].shape == (9, 6) + assert len(s.chunks) == 1 + assert s.chunks[0].shape == (6,) + assert len(V.chunks) == 1 + assert V.chunks[0].shape == (6, 6) + + assert s.ndim == 1 + assert len(s.chunks[0].index) == 1 + + a = mt.random.rand(6, 20, chunk_size=10) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (6, 6) + assert s.shape == (6,) + assert V.shape == (6, 20) + + U, s, V = tile(U, s, V) + + assert len(U.chunks) == 1 + assert U.chunks[0].shape == (6, 6) + assert len(s.chunks) == 1 + assert s.chunks[0].shape == (6,) + assert len(V.chunks) == 1 + assert V.chunks[0].shape == (6, 20) + + a = mt.random.rand(6, 9, chunk_size=(6, 9)) + U, s, V = mt.linalg.svd(a) + + assert U.shape == (6, 6) + assert s.shape == (6,) + assert V.shape == (6, 9) + + rs = mt.random.RandomState(1) + + a = rs.rand(20, 10, chunk_size=10) + _, s, _ = mt.linalg.svd(a) + del _ + graph = s.build_graph() + assert len(graph) == 4 + + +def test_lu(): + a = mt.random.randint(1, 10, (6, 6), chunk_size=3) + p, l_, u = mt.linalg.lu(a) + + p, l_, u = tile(p, l_, u) + + assert l_.shape == (6, 6) + assert u.shape == (6, 6) + assert p.shape == (6, 6) + + a = mt.random.randint(1, 10, (6, 6), chunk_size=(3, 2)) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (6, 6) + assert u.shape == (6, 6) + assert p.shape == (6, 6) + + assert p.nsplits == ((3, 3), (3, 3)) + assert l_.nsplits == ((3, 3), (3, 3)) + assert u.nsplits == ((3, 3), (3, 3)) + + a = mt.random.randint(1, 10, (7, 7), chunk_size=4) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (7, 7) + assert u.shape == (7, 7) + assert p.shape == (7, 7) + + assert p.nsplits == ((4, 3), (4, 3)) + assert l_.nsplits == ((4, 3), (4, 3)) + assert u.nsplits == ((4, 3), (4, 3)) + + a = mt.random.randint(1, 10, (7, 5), chunk_size=4) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (7, 5) + assert u.shape == (5, 5) + assert p.shape == (7, 7) + + a = mt.random.randint(1, 10, (5, 7), chunk_size=4) + p, l_, u = mt.linalg.lu(a) + p, l_, u = tile(p, l_, u) + + assert l_.shape == (5, 5) + assert u.shape == (5, 7) + assert p.shape == (5, 5) + + # test sparse + data = sps.csr_matrix( + [ + [2, 0, 0, 0, 5, 2], + [0, 6, 1, 0, 0, 6], + [8, 0, 9, 0, 0, 2], + [0, 6, 0, 8, 7, 3], + [7, 0, 6, 1, 7, 0], + [0, 0, 0, 7, 0, 8], + ] + ) + t = mt.tensor(data, chunk_size=3) + p, l_, u = mt.linalg.lu(t) + + assert p.op.sparse is True + assert isinstance(p, 
SparseTensor) + assert l_.op.sparse is True + assert isinstance(l_, SparseTensor) + assert u.op.sparse is True + assert isinstance(u, SparseTensor) + + p, l_, u = tile(p, l_, u) + + assert all(c.is_sparse() for c in p.chunks) is True + assert all(c.is_sparse() for c in l_.chunks) is True + assert all(c.is_sparse() for c in u.chunks) is True + + +def test_solve(): + a = mt.random.randint(1, 10, (20, 20)) + b = mt.random.randint(1, 10, (20,)) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20,) + + a = mt.random.randint(1, 10, (20, 20), chunk_size=5) + b = mt.random.randint(1, 10, (20, 3), chunk_size=5) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20, 3) + + a = mt.random.randint(1, 10, (20, 20), chunk_size=12) + b = mt.random.randint(1, 10, (20, 3)) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20, 3) + assert x.nsplits == ((12, 8), (3,)) + + # test sparse + a = sps.csr_matrix(np.random.randint(1, 10, (20, 20))) + b = mt.random.randint(1, 10, (20,), chunk_size=3) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20,) + assert x.op.sparse is True + assert x.chunks[0].op.sparse is True + + a = mt.tensor(a, chunk_size=7) + b = mt.random.randint(1, 10, (20,)) + x = tile(mt.linalg.solve(a, b)) + + assert x.shape == (20,) + assert x.nsplits == ((7, 7, 6),) + + x = tile(mt.linalg.solve(a, b, sparse=False)) + assert x.op.sparse is False + assert x.chunks[0].op.sparse is False + + +def test_inv(): + a = mt.random.randint(1, 10, (20, 20), chunk_size=8) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + + # test 1 chunk + a = mt.random.randint(1, 10, (20, 20), chunk_size=20) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + assert len(a_inv.chunks) == 1 + assert isinstance(a_inv.chunks[0].op, TensorInv) + + a = mt.random.randint(1, 10, (20, 20), chunk_size=11) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + assert a_inv.nsplits == ((11, 9), (11, 9)) + + b = a.T.dot(a) + b_inv = tile(mt.linalg.inv(b)) + assert b_inv.shape == (20, 20) + + # test sparse + data = sps.csr_matrix(np.random.randint(1, 10, (20, 20))) + a = mt.tensor(data, chunk_size=10) + a_inv = tile(mt.linalg.inv(a)) + + assert a_inv.shape == (20, 20) + + assert a_inv.op.sparse is True + assert isinstance(a_inv, SparseTensor) + assert all(c.is_sparse() for c in a_inv.chunks) is True + + b = a.T.dot(a) + b_inv = tile(mt.linalg.inv(b)) + assert b_inv.shape == (20, 20) + + assert b_inv.op.sparse is True + assert isinstance(b_inv, SparseTensor) + assert all(c.is_sparse() for c in b_inv.chunks) is True + + b_inv = tile(mt.linalg.inv(b, sparse=False)) + assert b_inv.op.sparse is False + assert not all(c.is_sparse() for c in b_inv.chunks) is True + + +def test_tensordot(): + from .. 
import dot, inner, tensordot + + t1 = ones((3, 4, 6), chunk_size=2) + t2 = ones((4, 3, 5), chunk_size=2) + t3 = tensordot(t1, t2, axes=((0, 1), (1, 0))) + + assert t3.shape == (6, 5) + + t3 = tile(t3) + + assert t3.shape == (6, 5) + assert len(t3.chunks) == 9 + + a = ones((10000, 20000), chunk_size=5000) + b = ones((20000, 1000), chunk_size=5000) + + with pytest.raises(ValueError): + tensordot(a, b) + + a = ones(10, chunk_size=2) + b = ones((10, 20), chunk_size=2) + c = dot(a, b) + assert c.shape == (20,) + c = tile(c) + assert c.shape == tuple(sum(s) for s in c.nsplits) + + a = ones((10, 20), chunk_size=2) + b = ones(20, chunk_size=2) + c = dot(a, b) + assert c.shape == (10,) + c = tile(c) + assert c.shape == tuple(sum(s) for s in c.nsplits) + + v = ones((100, 100), chunk_size=10) + tv = v.dot(v) + assert tv.shape == (100, 100) + tv = tile(tv) + assert tv.shape == tuple(sum(s) for s in tv.nsplits) + + a = ones((10, 20), chunk_size=2) + b = ones((30, 20), chunk_size=2) + c = inner(a, b) + assert c.shape == (10, 30) + c = tile(c) + assert c.shape == tuple(sum(s) for s in c.nsplits) + + +def test_dot(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + t2 = t1.T + + assert t1.dot(t2).issparse() is True + assert type(t1.dot(t2)) is SparseTensor + assert t1.dot(t2, sparse=False).issparse() is False + assert type(t1.dot(t2, sparse=False)) is Tensor + + with pytest.raises(TypeError): + dot(t1, t2, out=1) + + with pytest.raises(ValueError): + dot(t1, t2, empty((3, 6))) + + with pytest.raises(ValueError): + dot(t1, t2, empty((3, 3), dtype="i4")) + + with pytest.raises(ValueError): + dot(t1, t2, empty((3, 3), order="F")) + + t1.dot(t2, out=empty((2, 2), dtype=t1.dtype)) + + +def test_matmul(): + t1 = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse() + t2 = t1.T + + t3 = matmul(t1, t2, out=empty((2, 2), dtype=t1.dtype, order="F")) + assert t3.order.value == "F" + + with pytest.raises(TypeError): + matmul(t1, t2, out=1) + + with pytest.raises(TypeError): + matmul(t1, t2, out=empty((2, 2), dtype="?")) + + with pytest.raises(ValueError): + matmul(t1, t2, out=empty((3, 2), dtype=t1.dtype)) + + raw1 = np.asfortranarray(np.random.rand(3, 3)) + raw2 = np.asfortranarray(np.random.rand(3, 3)) + raw3 = np.random.rand(3, 3) + + assert ( + matmul(tensor(raw1), tensor(raw2)).flags["C_CONTIGUOUS"] + == np.matmul(raw1, raw2).flags["C_CONTIGUOUS"] + ) + assert ( + matmul(tensor(raw1), tensor(raw2)).flags["F_CONTIGUOUS"] + == np.matmul(raw1, raw2).flags["F_CONTIGUOUS"] + ) + + assert ( + matmul(tensor(raw1), tensor(raw2), order="A").flags["C_CONTIGUOUS"] + == np.matmul(raw1, raw2, order="A").flags["C_CONTIGUOUS"] + ) + assert ( + matmul(tensor(raw1), tensor(raw2), order="A").flags["F_CONTIGUOUS"] + == np.matmul(raw1, raw2, order="A").flags["F_CONTIGUOUS"] + ) + + assert ( + matmul(tensor(raw1), tensor(raw3), order="A").flags["C_CONTIGUOUS"] + == np.matmul(raw1, raw3, order="A").flags["C_CONTIGUOUS"] + ) + assert ( + matmul(tensor(raw1), tensor(raw3), order="A").flags["F_CONTIGUOUS"] + == np.matmul(raw1, raw3, order="A").flags["F_CONTIGUOUS"] + ) diff --git a/python/xorbits/_mars/tensor/linalg/tests/test_linalg_execution.py b/python/xorbits/_mars/tensor/linalg/tests/test_linalg_execution.py new file mode 100644 index 000000000..e459c0a5a --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/tests/test_linalg_execution.py @@ -0,0 +1,991 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import scipy.sparse as sps + +from ....learn.datasets.samples_generator import make_low_rank_matrix +from ....lib.sparse import SparseNDArray, issparse +from ....utils import ignore_warning +from ...datasource import arange, diag, ones, tensor +from ...random import uniform +from .. import ( + cholesky, + dot, + inner, + inv, + lu, + matmul, + norm, + qr, + randomized_svd, + solve, + solve_triangular, + svd, + tensordot, + vdot, +) + + +def test_qr_execution(setup): + rs = np.random.RandomState(0) + data = rs.randn(18, 6) + + a = tensor(data, chunk_size=(3, 6)) + q, r = qr(a) + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(9, 6)) + q, r = qr(a) + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=3) + q, r = qr(a) + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + # test for Short-and-Fat QR + data = rs.randn(6, 18) + + a = tensor(data, chunk_size=(6, 9)) + q, r = qr(a, method="sfqr") + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(3, 3)) + q, r = qr(a, method="sfqr") + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(6, 3)) + q, r = qr(a, method="sfqr") + t = q.dot(r) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + +def test_svd_execution(setup): + rs = np.random.RandomState() + data = rs.randn(18, 6) + 1j * rs.randn(18, 6) + + a = tensor(data, chunk_size=(9, 6)) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(18, 6)) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + a = tensor(data, chunk_size=(2, 6)) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + data = rs.randn(6, 18) + 1j * rs.randn(6, 18) + + a = tensor(data) + U, s, V = svd(a) + t = U.dot(diag(s).dot(V)) + + res = t.execute().fetch() + np.testing.assert_array_almost_equal(res, data) + + # test for matrix of ones + data = np.ones((20, 10)) + + a = tensor(data, chunk_size=10) + s = svd(a)[1] + res = s.execute().fetch() + expected = np.linalg.svd(a)[1] + np.testing.assert_array_almost_equal(res, expected) + + +def test_randomized_svd_execution(setup): + n_samples = 100 + n_features = 500 + rank = 5 + k = 10 + for dtype in (np.int64, np.float64): + # generate a matrix X of approximate effective rank `rank` and no noise + # component (very structured signal): + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=rank, + tail_strength=0.0, + random_state=0, + ).astype(dtype, 
copy=False) + assert X.shape == (n_samples, n_features) + dtype = np.dtype(dtype) + decimal = 5 if dtype == np.float32 else 7 + + # compute the singular values of X using the slow exact method + X_res = X.execute().fetch() + U, s, V = np.linalg.svd(X_res, full_matrices=False) + + # Convert the singular values to the specific dtype + U = U.astype(dtype, copy=False) + s = s.astype(dtype, copy=False) + V = V.astype(dtype, copy=False) + + for normalizer in ["auto", "LU", "QR"]: # 'none' would not be stable + # compute the singular values of X using the fast approximate method + Ua, sa, Va = randomized_svd( + X, k, n_iter=1, power_iteration_normalizer=normalizer, random_state=0 + ) + + # If the input dtype is float, then the output dtype is float of the + # same bit size (f32 is not upcast to f64) + # But if the input dtype is int, the output dtype is float64 + if dtype.kind == "f": + assert Ua.dtype == dtype + assert sa.dtype == dtype + assert Va.dtype == dtype + else: + assert Ua.dtype == np.float64 + assert sa.dtype == np.float64 + assert Va.dtype == np.float64 + + assert Ua.shape == (n_samples, k) + assert sa.shape == (k,) + assert Va.shape == (k, n_features) + + # ensure that the singular values of both methods are equal up to the + # real rank of the matrix + sa_res = sa.execute().fetch() + np.testing.assert_almost_equal(s[:k], sa_res, decimal=decimal) + + # check the singular vectors too (while not checking the sign) + dot_res = dot(Ua, Va).execute().fetch() + np.testing.assert_almost_equal( + np.dot(U[:, :k], V[:k, :]), dot_res, decimal=decimal + ) + + +def test_cholesky_execution(setup): + rs = np.random.RandomState(0) + data = rs.randint(1, 10, (10, 10)) + symmetric_data = data.dot(data.T) + + a = tensor(symmetric_data, chunk_size=5) + + U = cholesky(a) + t = U.T.dot(U) + + res_u = U.execute().fetch() + np.testing.assert_allclose(np.triu(res_u), res_u) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + L = cholesky(a, lower=True) + U = cholesky(a) + t = L.dot(U) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + a = tensor(symmetric_data, chunk_size=5) + + L = cholesky(a, lower=True) + U = cholesky(a) + t = L.dot(U) + + res_u = U.execute().fetch() + np.testing.assert_allclose(np.triu(res_u), res_u) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + a = tensor(symmetric_data, chunk_size=(2, 3)) + + L = cholesky(a, lower=True) + U = cholesky(a) + t = L.dot(U) + + res = t.execute().fetch() + np.testing.assert_allclose(res, symmetric_data) + + +def test_lu_execution(setup): + rs = np.random.RandomState(0) + + # square matrix + data = rs.randint(1, 10, (6, 6)) + + a = tensor(data) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=(3, 4)) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + # shape[0] > shape[1] + data = rs.randint(1, 10, (10, 6)) + + a = tensor(data) + P, L, U = lu(a) + + # 
check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=5) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=(4, 5)) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + # shape[0] < shape[1] + data = rs.randint(1, 10, (6, 10)) + + a = tensor(data) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=5) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + a = tensor(data, chunk_size=(4, 5)) + P, L, U = lu(a) + + # check lower and upper triangular matrix + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_allclose(res, data) + + # test for sparse + data = sps.csr_matrix( + [ + [2, 0, 0, 0, 5, 2], + [0, 6, 1, 0, 0, 6], + [8, 0, 9, 0, 0, 2], + [0, 6, 0, 8, 7, 3], + [7, 0, 6, 1, 7, 0], + [0, 0, 0, 7, 0, 8], + ] + ) + + a = tensor(data) + P, L, U = lu(a) + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + # check lower and upper triangular matrix + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + assert isinstance(result_l, SparseNDArray) + assert isinstance(result_u, SparseNDArray) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_array_almost_equal(data.A, res) + + a = tensor(data, chunk_size=5) + P, L, U = lu(a) + result_l = L.execute().fetch() + result_u = U.execute().fetch() + + # check lower and upper triangular matrix + np.testing.assert_allclose(np.tril(result_l), result_l) + np.testing.assert_allclose(np.triu(result_u), result_u) + assert isinstance(result_l, SparseNDArray) + assert isinstance(result_u, SparseNDArray) + + t = P.dot(L).dot(U) + res = t.execute().fetch() + np.testing.assert_array_almost_equal(data.A, res) + + +def test_solve_triangular(setup): + from ... 
import tril, triu + + rs = np.random.RandomState(0) + + data1 = rs.randint(1, 10, (20, 20)) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=20) + b = tensor(data2, chunk_size=20) + + x = solve_triangular(A, b) + t = triu(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + x = solve_triangular(A, b, lower=True) + t = tril(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve_triangular(A, b) + t = triu(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + x = solve_triangular(A, b, lower=True) + t = tril(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + data1 = rs.randint(1, 10, (10, 10)) + data2 = rs.randint(1, 10, (10, 5)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve_triangular(A, b) + t = triu(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + x = solve_triangular(A, b, lower=True) + t = tril(A).dot(x) + + res = t.execute().fetch() + np.testing.assert_allclose(res, data2) + + # test sparse + data1 = sps.csr_matrix(np.triu(rs.randint(1, 10, (10, 10)))) + data2 = rs.random((10,)) + + A = tensor(data1, chunk_size=5) + b = tensor(data2, chunk_size=5) + + x = solve_triangular(A, b) + + result_x = x.execute().fetch() + result_b = data1.dot(result_x) + + assert isinstance(result_x, SparseNDArray) + np.testing.assert_allclose(result_b, data2) + + data1 = sps.csr_matrix(np.triu(rs.randint(1, 10, (10, 10)))) + data2 = rs.random((10, 2)) + + A = tensor(data1, chunk_size=5) + b = tensor(data2, chunk_size=5) + + x = solve_triangular(A, b) + + result_x = x.execute().fetch() + result_b = data1.dot(result_x) + + assert isinstance(result_x, SparseNDArray) + np.testing.assert_allclose(result_b, data2) + + +def test_solve(setup): + import scipy.linalg + + rs = np.random.RandomState(0) + + data1 = rs.randint(1, 10, (20, 20)) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + data2 = rs.randint(1, 10, (20, 5)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + # test for not all chunks are square in matrix A + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + A = tensor(data1, chunk_size=(10, 15)) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + # test sparse + data1 = sps.csr_matrix(rs.randint(1, 10, (20, 20))) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = x.execute().fetch() + assert isinstance(res, SparseNDArray) + 
np.testing.assert_allclose(data1.dot(res), data2) + + data2 = rs.randint(1, 10, (20, 5)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = A.dot(x).execute().fetch() + assert isinstance(res, SparseNDArray) + np.testing.assert_allclose(res, data2) + + # test for not all chunks are square in matrix A + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b) + + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + +def test_solve_sym_pos(setup): + import scipy.linalg + + rs = np.random.RandomState(0) + + data = rs.randint(1, 10, (20, 20)) + data_l = np.tril(data) + data1 = data_l.dot(data_l.T) + data2 = rs.randint(1, 10, (20,)) + + A = tensor(data1, chunk_size=10) + b = tensor(data2, chunk_size=10) + + x = solve(A, b, sym_pos=True) + + res = x.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2)) + res = A.dot(x).execute().fetch() + np.testing.assert_allclose(res, data2) + + +def test_inv(setup): + import scipy.linalg + + rs = np.random.RandomState(0) + + data = rs.randint(1, 10, (20, 20)) + + A = tensor(data) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + A = tensor(data, chunk_size=10) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test 1 chunk + A = tensor(data, chunk_size=20) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_allclose(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + B = A.T.dot(A) + inv_B = inv(B) + res = inv_B.execute().fetch() + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data.T.dot(data))) + res = B.dot(inv_B).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test for not all chunks are square in matrix A + A = tensor(data, chunk_size=8) + inv_A = inv(A) + + res = inv_A.execute().fetch() + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test sparse + data = rs.randint(1, 10, (20, 20)) + sp_data = sps.csr_matrix(data) + + A = tensor(sp_data, chunk_size=10) + inv_A = inv(A) + + res = inv_A.execute().fetch() + assert isinstance(res, SparseNDArray) + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + # test for not all chunks are square in matrix A + A = tensor(sp_data, chunk_size=12) + inv_A = inv(A) + + res = inv_A.execute().fetch() + assert isinstance(res, SparseNDArray) + np.testing.assert_array_almost_equal(res, scipy.linalg.inv(data)) + res = A.dot(inv_A).execute().fetch() + np.testing.assert_array_almost_equal(res, np.eye(data.shape[0], dtype=float)) + + +@ignore_warning +def test_norm_execution(setup): + d = np.arange(9) - 4 + d2 = d.reshape(3, 3) + + ma = [tensor(d, chunk_size=2), tensor(d2, chunk_size=(2, 3))] + + for i, a in enumerate(ma): + data = d if i < 1 
else d2 + for ord in (None, "nuc", np.inf, -np.inf, 0, 1, -1, 2, -2): + for axis in (0, 1, (0, 1), -1): + for keepdims in (True, False): + try: + expected = np.linalg.norm( + data, ord=ord, axis=axis, keepdims=keepdims + ) + t = norm(a, ord=ord, axis=axis, keepdims=keepdims) + res = t.execute().fetch() + + expected_shape = expected.shape + t_shape = t.shape + assert expected_shape == t_shape + + np.testing.assert_allclose(res, expected, atol=0.0001) + except ValueError: + continue + + m = norm(tensor(d)) + expected = m.execute().fetch() + res = np.linalg.norm(d) + assert expected == res + + d = uniform(-0.5, 0.5, size=(5000, 2), chunk_size=1000) + inside = (norm(d, axis=1) < 0.5).sum().astype(float) + t = inside / 5000 * 4 + res = t.execute().fetch() + np.testing.assert_almost_equal(3.14, res, decimal=1) + + raw = np.random.RandomState(0).rand(10, 10) + d = norm(tensor(raw, chunk_size=5)) + expected = d.execute().fetch() + result = np.linalg.norm(raw) + np.testing.assert_allclose(expected, result) + + +def test_tensordot_execution(setup): + rs = np.random.RandomState(0) + # size_executor = ExecutorForTest(sync_provider_type=ExecutorForTest.SyncProviderType.MOCK) + # + # a_data = np.arange(60).reshape(3, 4, 5) + # a = tensor(a_data, chunk_size=2) + # b_data = np.arange(24).reshape(4, 3, 2) + # b = tensor(b_data, chunk_size=2) + # + # axes = ([1, 0], [0, 1]) + # c = tensordot(a, b, axes=axes) + # size_res = size_executor.execute_tensor(c, mock=True) + # assert sum(s[0] for s in size_res) == c.nbytes + # assert sum(s[1] for s in size_res) == c.nbytes + + a = ones((100, 200), chunk_size=50) + b = ones((200, 10), chunk_size=50) + c = dot(a, b) + res = c.execute().fetch() + expected = np.dot(np.ones((100, 200)), np.ones((200, 10))) + np.testing.assert_array_equal(res, expected) + + a = ones((10, 8), chunk_size=4) + b = ones((8, 10), chunk_size=4) + c = a.dot(b) + res = c.execute().fetch() + np.testing.assert_array_equal(res, np.tile([8], [10, 10])) + + a = ones((500, 500), chunk_size=500) + b = ones((500, 100), chunk_size=500) + c = a.dot(b) + res = c.execute().fetch() + np.testing.assert_array_equal(res, np.tile([500], [500, 100])) + + raw_a = rs.random((100, 200, 50)) + raw_b = rs.random((200, 10, 100)) + a = tensor(raw_a, chunk_size=50) + b = tensor(raw_b, chunk_size=33) + c = tensordot(a, b, axes=((0, 1), (2, 0))) + res = c.execute().fetch() + expected = np.tensordot(raw_a, raw_b, axes=(c.op.a_axes, c.op.b_axes)) + np.testing.assert_array_almost_equal(res, expected) + + a = ones((100, 200), chunk_size=50) + b = ones((10, 200), chunk_size=50) + c = inner(a, b) + res = c.execute().fetch() + expected = np.inner(np.ones((100, 200)), np.ones((10, 200))) + np.testing.assert_array_equal(res, expected) + + a = ones((100, 100), chunk_size=30) + b = ones((100, 100), chunk_size=30) + c = a.dot(b) + res = c.execute().fetch() + np.testing.assert_array_equal(res, np.ones((100, 100)) * 100) + + +# def test_sparse_dot_size_execution(): +# from mars.tensor.linalg.tensordot import TensorTensorDot +# from mars.executor import register, register_default +# chunk_sizes = dict() +# chunk_nbytes = dict() +# chunk_input_sizes = dict() +# chunk_input_nbytes = dict() +# +# def execute_size(t): +# def _tensordot_size_recorder(ctx, op): +# TensorTensorDot.estimate_size(ctx, op) +# +# chunk_key = op.outputs[0].key +# chunk_sizes[chunk_key] = ctx[chunk_key] +# chunk_nbytes[chunk_key] = op.outputs[0].nbytes +# +# input_sizes = dict((inp.op.key, ctx[inp.key][0]) for inp in op.inputs) +# chunk_input_sizes[chunk_key] = 
sum(input_sizes.values()) +# input_nbytes = dict((inp.op.key, inp.nbytes) for inp in op.inputs) +# chunk_input_nbytes[chunk_key] = sum(input_nbytes.values()) +# +# size_executor = ExecutorForTest(sync_provider_type=ExecutorForTest.SyncProviderType.MOCK) +# try: +# chunk_sizes.clear() +# chunk_nbytes.clear() +# chunk_input_sizes.clear() +# chunk_input_nbytes.clear() +# register(TensorTensorDot, size_estimator=_tensordot_size_recorder) +# size_executor.execute_tensor(t, mock=True) +# finally: +# register_default(TensorTensorDot) +# +# a_data = sps.random(5, 9, density=.1) +# b_data = sps.random(9, 10, density=.2) +# a = tensor(a_data, chunk_size=2) +# b = tensor(b_data, chunk_size=3) +# +# c = dot(a, b) +# execute_size(c) +# +# for key in chunk_input_sizes.keys(): +# assert chunk_sizes[key][1] >= chunk_input_sizes[key] +# +# c2 = dot(a, b, sparse=False) +# execute_size(c2) +# +# for key in chunk_input_sizes.keys(): +# assert chunk_sizes[key][0] == chunk_nbytes[key] +# assert chunk_sizes[key][1] == chunk_input_nbytes[key] + chunk_nbytes[key] + + +def test_sparse_dot_execution(setup): + rs = np.random.RandomState(0) + + a_data = sps.random(5, 9, density=0.1) + b_data = sps.random(9, 10, density=0.2) + a = tensor(a_data, chunk_size=2) + b = tensor(b_data, chunk_size=3) + + c = dot(a, b) + + res = c.execute().fetch() + assert issparse(res) is True + np.testing.assert_allclose(res.toarray(), a_data.dot(b_data).toarray()) + + c2 = dot(a, b, sparse=False) + + res = c2.execute().fetch() + assert issparse(res) is False + np.testing.assert_allclose(res, a_data.dot(b_data).toarray()) + + c3 = tensordot(a, b.T, (-1, -1), sparse=False) + + res = c3.execute().fetch() + assert issparse(res) is False + np.testing.assert_allclose(res, a_data.dot(b_data).toarray()) + + c = inner(a, b.T) + + res = c.execute().fetch() + assert issparse(res) is True + np.testing.assert_allclose(res.toarray(), a_data.dot(b_data).toarray()) + + c = inner(a, b.T, sparse=False) + + res = c.execute().fetch() + assert issparse(res) is False + np.testing.assert_allclose(res, a_data.dot(b_data).toarray()) + + # test vector inner + a_data = rs.rand(5) + b_data = rs.rand(5) + a = tensor(a_data, chunk_size=2).tosparse() + b = tensor(b_data, chunk_size=2).tosparse() + + c = inner(a, b) + + res = c.execute().fetch() + assert np.isscalar(res) is True + np.testing.assert_allclose(res, np.inner(a_data, b_data)) + + +def test_vdot_execution(setup): + a_data = np.array([1 + 2j, 3 + 4j]) + b_data = np.array([5 + 6j, 7 + 8j]) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = vdot(a, b) + + res = t.execute().fetch() + expected = np.vdot(a_data, b_data) + np.testing.assert_equal(res, expected) + + a_data = np.array([[1, 4], [5, 6]]) + b_data = np.array([[4, 1], [2, 2]]) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=1) + + t = vdot(a, b) + + res = t.execute().fetch() + expected = np.vdot(a_data, b_data) + np.testing.assert_equal(res, expected) + + +def test_matmul_execution(setup): + rs = np.random.RandomState(0) + + data_a = rs.randn(10, 20) + data_b = rs.randn(20) + + a = tensor(data_a, chunk_size=5) + b = tensor(data_b, chunk_size=6) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul(data_a, data_b) + np.testing.assert_allclose(res, expected) + + data_a = rs.randn(10, 20) + data_b = rs.randn(10) + + a = tensor(data_a, chunk_size=5) + b = tensor(data_b, chunk_size=6) + c = matmul(b, a) + + res = c.execute().fetch() + expected = np.matmul(data_b, data_a) + 
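+ # a 1-D first operand is promoted to a row vector (numpy.matmul semantics), so (10,) @ (10, 20) -> (20,)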
np.testing.assert_allclose(res, expected) + + data_a = rs.randn(15, 1, 20, 30) + data_b = rs.randn(1, 11, 30, 20) + + a = tensor(data_a, chunk_size=12) + b = tensor(data_b, chunk_size=13) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul(data_a, data_b) + np.testing.assert_allclose(res, expected, atol=0.0001) + + a = arange(2 * 2 * 4, chunk_size=1).reshape((2, 2, 4)) + b = arange(2 * 2 * 4, chunk_size=1).reshape((2, 4, 2)) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul( + np.arange(2 * 2 * 4).reshape(2, 2, 4), np.arange(2 * 2 * 4).reshape(2, 4, 2) + ) + np.testing.assert_allclose(res, expected, atol=0.0001) + + data_a = sps.random(10, 20) + data_b = sps.random(20, 5) + + a = tensor(data_a, chunk_size=5) + b = tensor(data_b, chunk_size=6) + c = matmul(a, b) + + res = c.execute().fetch() + expected = np.matmul(data_a.toarray(), data_b.toarray()) + np.testing.assert_allclose(res.toarray(), expected) + + # test order + data_a = np.asfortranarray(rs.randn(10, 20)) + data_b = np.asfortranarray(rs.randn(20, 30)) + + a = tensor(data_a, chunk_size=12) + b = tensor(data_b, chunk_size=13) + + c = matmul(a, b) + res = c.execute().fetch() + expected = np.matmul(data_a, data_b) + + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + c = matmul(a, b, order="A") + res = c.execute().fetch() + expected = np.matmul(data_a, data_b, order="A") + + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + c = matmul(a, b, order="C") + res = c.execute().fetch() + expected = np.matmul(data_a, data_b, order="C") + + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] diff --git a/python/xorbits/_mars/tensor/linalg/utils.py b/python/xorbits/_mars/tensor/linalg/utils.py new file mode 100644 index 000000000..8c0fcdda9 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/utils.py @@ -0,0 +1,78 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def calc_svd_shapes(a): + """ + Calculate output shapes of singular value decomposition. + Follow the behavior of `numpy`: + if a's shape is (6, 18), U's shape is (6, 6), s's shape is (6,), V's shape is (6, 18) + if a's shape is (18, 6), U's shape is (18, 6), s's shape is (6,), V's shape is (6, 6) + :param a: input tensor + :return: (U.shape, s.shape, V.shape) + """ + x, y = a.shape + if x > y: + return (x, y), (y,), (y, y) + else: + return (x, x), (x,), (x, y) + + +def svd_flip(u, v, u_based_decision=True): + """ + Sign correction to ensure deterministic output from SVD. 
+ + Adjusts the columns of u and the rows of v such that the loadings in the + columns in u that are largest in absolute value are always positive. + + Parameters + ---------- + u : Tensor + u and v are the output of `linalg.svd` or + `randomized_svd`, with matching inner dimensions + so one can compute `mt.dot(u * s, v)`. + + v : Tensor + u and v are the output of `linalg.svd` or + `randomized_svd`, with matching inner dimensions + so one can compute `mt.dot(u * s, v)`. + + u_based_decision : boolean, (default=True) + If True, use the columns of u as the basis for sign flipping. + Otherwise, use the rows of v. The choice of which variable to base the + decision on is generally algorithm dependent. + + + Returns + ------- + u_adjusted, v_adjusted : arrays with the same dimensions as the input. + + """ + from ... import tensor as mt + + if u_based_decision: + # columns of u, rows of v + max_abs_cols = mt.argmax(mt.abs(u), axis=0) + signs = mt.sign(u[max_abs_cols, np.arange(u.shape[1])]) + u *= signs + v *= signs[:, mt.newaxis] + else: + # rows of v, columns of u + max_abs_rows = mt.argmax(mt.abs(v), axis=1) + signs = mt.sign(v[np.arange(v.shape[0]), max_abs_rows]) + u *= signs + v *= signs[:, mt.newaxis] + return u, v diff --git a/python/xorbits/_mars/tensor/linalg/vdot.py b/python/xorbits/_mars/tensor/linalg/vdot.py new file mode 100644 index 000000000..298812af8 --- /dev/null +++ b/python/xorbits/_mars/tensor/linalg/vdot.py @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .dot import dot + + +def vdot(a, b): + """ + Return the dot product of two vectors. + + The vdot(`a`, `b`) function handles complex numbers differently than + dot(`a`, `b`). If the first argument is complex the complex conjugate + of the first argument is used for the calculation of the dot product. + + Note that `vdot` handles multidimensional tensors differently than `dot`: + it does *not* perform a matrix product, but flattens input arguments + to 1-D vectors first. Consequently, it should only be used for vectors. + + Parameters + ---------- + a : array_like + If `a` is complex the complex conjugate is taken before calculation + of the dot product. + b : array_like + Second argument to the dot product. + + Returns + ------- + output : Tensor + Dot product of `a` and `b`. Can be an int, float, or + complex depending on the types of `a` and `b`. + + See Also + -------- + dot : Return the dot product without using the complex conjugate of the + first argument. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1+2j,3+4j]) + >>> b = mt.array([5+6j,7+8j]) + >>> mt.vdot(a, b).execute() + (70-8j) + >>> mt.vdot(b, a).execute() + (70+8j) + + Note that higher-dimensional arrays are flattened! 
+ + >>> a = mt.array([[1, 4], [5, 6]]) + >>> b = mt.array([[4, 1], [2, 2]]) + >>> mt.vdot(a, b).execute() + 30 + >>> mt.vdot(b, a).execute() + 30 + >>> 1*4 + 4*1 + 5*2 + 6*2 + 30 + """ + a, b = astensor(a), astensor(b) + return dot(a.conj().ravel(), b.ravel()) diff --git a/python/xorbits/_mars/tensor/merge/__init__.py b/python/xorbits/_mars/tensor/merge/__init__.py new file mode 100644 index 000000000..5c0d34f70 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .append import append +from .block import block +from .column_stack import column_stack +from .concatenate import TensorConcatenate, concatenate +from .dstack import dstack +from .hstack import hstack +from .stack import TensorStack, stack +from .union1d import union1d +from .vstack import vstack diff --git a/python/xorbits/_mars/tensor/merge/append.py b/python/xorbits/_mars/tensor/merge/append.py new file mode 100644 index 000000000..48844e150 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/append.py @@ -0,0 +1,74 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base.ravel import ravel +from ..datasource.array import asarray +from .concatenate import concatenate + + +def append(arr, values, axis=None): + """ + Append values to the end of an array. + + Parameters + ---------- + arr : array_like + Values are appended to a copy of this array. + values : array_like + These values are appended to a copy of `arr`. It must be of the + correct shape (the same shape as `arr`, excluding `axis`). If + `axis` is not specified, `values` can be any shape and will be + flattened before use. + axis : int, optional + The axis along which `values` are appended. If `axis` is not + given, both `arr` and `values` are flattened before use. + + Returns + ------- + append : Tensor + A copy of `arr` with `values` appended to `axis`. Note that + `append` does not occur in-place: a new array is allocated and + filled. If `axis` is None, `out` is a flattened array. + + See Also + -------- + insert : Insert elements into an array. + delete : Delete elements from an array. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.append([1, 2, 3], [[4, 5, 6], [7, 8, 9]]).execute() + array([1, 2, 3, ..., 7, 8, 9]) + + When `axis` is specified, `values` must have the correct shape. 
+ + >>> mt.append([[1, 2, 3], [4, 5, 6]], [[7, 8, 9]], axis=0).execute() + array([[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]) + >>> mt.append([[1, 2, 3], [4, 5, 6]], [7, 8, 9], axis=0) + Traceback (most recent call last): + ... + ValueError: all the input tensors must have same number of dimensions + + """ + arr = asarray(arr) + if axis is None: + if arr.ndim != 1: + arr = arr.ravel() + values = ravel(values) + axis = arr.ndim - 1 + return concatenate((arr, values), axis=axis) diff --git a/python/xorbits/_mars/tensor/merge/block.py b/python/xorbits/_mars/tensor/merge/block.py new file mode 100644 index 000000000..25e76ee4c --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/block.py @@ -0,0 +1,474 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +import operator + +import numpy as np + +from ..datasource.array import array +from ..datasource.empty import empty + +# Internal functions to eliminate the overhead of repeated dispatch in one of +# the two possible paths inside mt.block. +# Use getattr to protect against __array_function__ being disabled. +_size = getattr(np.size, "__wrapped__", np.size) +_ndim = getattr(np.ndim, "__wrapped__", np.ndim) + + +def _block_format_index(index): + """ + Convert a list of indices ``[0, 1, 2]`` into ``"arrays[0][1][2]"``. + """ + idx_str = "".join("[{}]".format(i) for i in index if i is not None) + return "arrays" + idx_str + + +def _block_check_depths_match(arrays, parent_index=[]): + """ + Recursive function checking that the depths of nested lists in `arrays` + all match. Mismatch raises a ValueError as described in the block + docstring below. + + The entire index (rather than just the depth) needs to be calculated + for each innermost list, in case an error needs to be raised, so that + the index of the offending list can be printed as part of the error. + + Parameters + ---------- + arrays : nested list of arrays + The arrays to check + parent_index : list of int + The full index of `arrays` within the nested lists passed to + `_block_check_depths_match` at the top of the recursion. + + Returns + ------- + first_index : list of int + The full index of an element from the bottom of the nesting in + `arrays`. If any element at the bottom is an empty list, this will + refer to it, and the last index along the empty axis will be None. + max_arr_ndim : int + The maximum of the ndims of the arrays nested in `arrays`. + final_size: int + The number of elements in the final array. This is used the motivate + the choice of algorithm used using benchmarking wisdom. + + """ + if type(arrays) is tuple: + # not strictly necessary, but saves us from: + # - more than one way to do things - no point treating tuples like + # lists + # - horribly confusing behaviour that results when tuples are + # treated like ndarray + raise TypeError( + "{} is a tuple. 
" + "Only lists can be used to arrange blocks, and mt.block does " + "not allow implicit conversion from tuple to ndarray.".format( + _block_format_index(parent_index) + ) + ) + elif type(arrays) is list and len(arrays) > 0: + idxs_ndims = ( + _block_check_depths_match(arr, parent_index + [i]) + for i, arr in enumerate(arrays) + ) + + first_index, max_arr_ndim, final_size = next(idxs_ndims) + for index, ndim, size in idxs_ndims: + final_size += size + if ndim > max_arr_ndim: + max_arr_ndim = ndim + if len(index) != len(first_index): + raise ValueError( + "List depths are mismatched. First element was at depth " + "{}, but there is an element at depth {} ({})".format( + len(first_index), len(index), _block_format_index(index) + ) + ) + # propagate our flag that indicates an empty list at the bottom + if index[-1] is None: + first_index = index + + return first_index, max_arr_ndim, final_size + elif type(arrays) is list and len(arrays) == 0: + # We've 'bottomed out' on an empty list + return parent_index + [None], 0, 0 + else: + # We've 'bottomed out' - arrays is either a scalar or an array + size = _size(arrays) + return parent_index, _ndim(arrays), size + + +def _atleast_nd(a, ndim): + # Ensures `a` has at least `ndim` dimensions by prepending + # ones to `a.shape` as necessary + return array(a, ndmin=ndim, copy=False) + + +def _accumulate(values): + return list(itertools.accumulate(values)) + + +def _concatenate_shapes(shapes, axis): + """Given array shapes, return the resulting shape and slices prefixes. + These help in nested concatenation. + + Returns + ------- + shape: tuple of int + This tuple satisfies: + ``` + shape, _ = _concatenate_shapes([arr.shape for shape in arrs], axis) + shape == concatenate(arrs, axis).shape + ``` + slice_prefixes: tuple of (slice(start, end), ) + For a list of arrays being concatenated, this returns the slice + in the larger array at axis that needs to be sliced into. + For example, the following holds: + ``` + ret = concatenate([a, b, c], axis) + _, (sl_a, sl_b, sl_c) = concatenate_slices([a, b, c], axis) + ret[(slice(None),) * axis + sl_a] == a + ret[(slice(None),) * axis + sl_b] == b + ret[(slice(None),) * axis + sl_c] == c + ``` + These are called slice prefixes since they are used in the recursive + blocking algorithm to compute the left-most slices during the + recursion. Therefore, they must be prepended to rest of the slice + that was computed deeper in the recursion. + These are returned as tuples to ensure that they can quickly be added + to existing slice tuple without creating a new tuple every time. + """ + # Cache a result that will be reused. 
+ shape_at_axis = [shape[axis] for shape in shapes] + + # Take a shape, any shape + first_shape = shapes[0] + first_shape_pre = first_shape[:axis] + first_shape_post = first_shape[axis + 1 :] + + if any( + shape[:axis] != first_shape_pre or shape[axis + 1 :] != first_shape_post + for shape in shapes + ): + raise ValueError("Mismatched array shapes in block along axis {}.".format(axis)) + + shape = first_shape_pre + (sum(shape_at_axis),) + first_shape[axis + 1 :] + + offsets_at_axis = _accumulate(shape_at_axis) + slice_prefixes = [ + (slice(start, end),) + for start, end in zip([0] + offsets_at_axis, offsets_at_axis) + ] + return shape, slice_prefixes + + +def _block_info_recursion(arrays, max_depth, result_ndim, depth=0): + """ + Returns the shape of the final array, along with a list + of slices and a list of arrays that can be used for assignment inside the + new array + + Parameters + ---------- + arrays : nested list of arrays + The arrays to check + max_depth : list of int + The number of nested lists + result_ndim: int + The number of dimensions in thefinal array. + + Returns + ------- + shape : tuple of int + The shape that the final array will take on. + slices: list of tuple of slices + The slices into the full array required for assignment. These are + required to be prepended with ``(Ellipsis, )`` to obtain to correct + final index. + arrays: list of ndarray + The data to assign to each slice of the full array + + """ + if depth < max_depth: + shapes, slices, arrays = zip( + *[ + _block_info_recursion(arr, max_depth, result_ndim, depth + 1) + for arr in arrays + ] + ) + + axis = result_ndim - max_depth + depth + shape, slice_prefixes = _concatenate_shapes(shapes, axis) + + # Prepend the slice prefix and flatten the slices + slices = [ + slice_prefix + the_slice + for slice_prefix, inner_slices in zip(slice_prefixes, slices) + for the_slice in inner_slices + ] + + # Flatten the array list + arrays = functools.reduce(operator.add, arrays) + + return shape, slices, arrays + else: + # We've 'bottomed out' - arrays is either a scalar or an array + # type(arrays) is not list + # Return the slice and the array inside a list to be consistent with + # the recursive case. + arr = _atleast_nd(arrays, result_ndim) + return arr.shape, [()], [arr] + + +def _block(arrays, max_depth, result_ndim, depth=0): + """ + Internal implementation of block based on repeated concatenation. + `arrays` is the argument passed to + block. `max_depth` is the depth of nested lists within `arrays` and + `result_ndim` is the greatest of the dimensions of the arrays in + `arrays` and the depth of the lists in `arrays` (see block docstring + for details). + """ + from ..merge.concatenate import concatenate + + if depth < max_depth: + arrs = [_block(arr, max_depth, result_ndim, depth + 1) for arr in arrays] + return concatenate(arrs, axis=-(max_depth - depth)) + else: + # We've 'bottomed out' - arrays is either a scalar or an array + # type(arrays) is not list + return _atleast_nd(arrays, result_ndim) + + +def block(arrays): + """ + Assemble an nd-array from nested lists of blocks. + + Blocks in the innermost lists are concatenated (see `concatenate`) along + the last dimension (-1), then these are concatenated along the + second-last dimension (-2), and so on until the outermost list is reached. + + Blocks can be of any dimension, but will not be broadcasted using the normal + rules. Instead, leading axes of size 1 are inserted, to make ``block.ndim`` + the same for all blocks. 
This is primarily useful for working with scalars, + and means that code like ``mt.block([v, 1])`` is valid, where + ``v.ndim == 1``. + + When the nested list is two levels deep, this allows block matrices to be + constructed from their components. + + .. versionadded:: 1.13.0 + + Parameters + ---------- + arrays : nested list of array_like or scalars (but not tuples) + If passed a single ndarray or scalar (a nested list of depth 0), this + is returned unmodified (and not copied). + + Elements shapes must match along the appropriate axes (without + broadcasting), but leading 1s will be prepended to the shape as + necessary to make the dimensions match. + + Returns + ------- + block_array : Tensor + The array assembled from the given blocks. + + The dimensionality of the output is equal to the greatest of: + * the dimensionality of all the inputs + * the depth to which the input list is nested + + Raises + ------ + ValueError + * If list depths are mismatched - for instance, ``[[a, b], c]`` is + illegal, and should be spelt ``[[a, b], [c]]`` + * If lists are empty - for instance, ``[[a, b], []]`` + + See Also + -------- + concatenate : Join a sequence of arrays along an existing axis. + stack : Join a sequence of arrays along a new axis. + vstack : Stack arrays in sequence vertically (row wise). + hstack : Stack arrays in sequence horizontally (column wise). + dstack : Stack arrays in sequence depth wise (along third axis). + column_stack : Stack 1-D arrays as columns into a 2-D array. + vsplit : Split an array into multiple sub-arrays vertically (row-wise). + + Notes + ----- + + When called with only scalars, ``mt.block`` is equivalent to an ndarray + call. So ``mt.block([[1, 2], [3, 4]])`` is equivalent to + ``mt.array([[1, 2], [3, 4]])``. + + This function does not enforce that the blocks lie on a fixed grid. + ``mt.block([[a, b], [c, d]])`` is not restricted to arrays of the form:: + + AAAbb + AAAbb + cccDD + + But is also allowed to produce, for some ``a, b, c, d``:: + + AAAbb + AAAbb + cDDDD + + Since concatenation happens along the last axis first, `block` is _not_ + capable of producing the following directly:: + + AAAbb + cccbb + cccDD + + Matlab's "square bracket stacking", ``[A, B, ...; p, q, ...]``, is + equivalent to ``mt.block([[A, B, ...], [p, q, ...]])``. + + Examples + -------- + The most common use of this function is to build a block matrix + + >>> import mars.tensor as mt + >>> A = mt.eye(2) * 2 + >>> B = mt.eye(3) * 3 + >>> mt.block([ + ... [A, mt.zeros((2, 3))], + ... [mt.ones((3, 2)), B ] + ... 
]).execute() + array([[2., 0., 0., 0., 0.], + [0., 2., 0., 0., 0.], + [1., 1., 3., 0., 0.], + [1., 1., 0., 3., 0.], + [1., 1., 0., 0., 3.]]) + + With a list of depth 1, `block` can be used as `hstack` + + >>> mt.block([1, 2, 3]).execute() # hstack([1, 2, 3]) + array([1, 2, 3]) + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.block([a, b, 10]).execute() # hstack([a, b, 10]) + array([ 1, 2, 3, 2, 3, 4, 10]) + + >>> A = mt.ones((2, 2), int) + >>> B = 2 * A + >>> mt.block([A, B]).execute() # hstack([A, B]) + array([[1, 1, 2, 2], + [1, 1, 2, 2]]) + + With a list of depth 2, `block` can be used in place of `vstack`: + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.block([[a], [b]]).execute() # vstack([a, b]) + array([[1, 2, 3], + [2, 3, 4]]) + + >>> A = mt.ones((2, 2), int) + >>> B = 2 * A + >>> mt.block([[A], [B]]).execute() # vstack([A, B]) + array([[1, 1], + [1, 1], + [2, 2], + [2, 2]]) + + It can also be used in places of `atleast_1d` and `atleast_2d` + + >>> a = mt.array(0) + >>> b = mt.array([1]) + >>> mt.block([a]).execute() # atleast_1d(a) + array([0]) + >>> mt.block([b]).execute() # atleast_1d(b) + array([1]) + + >>> mt.block([[a]]).execute() # atleast_2d(a) + array([[0]]) + >>> mt.block([[b]]).execute() # atleast_2d(b) + array([[1]]) + + + """ + arrays, list_ndim, result_ndim, final_size = _block_setup(arrays) + + # It was found through benchmarking that making an array of final size + # around 256x256 was faster by straight concatenation on a + # i7-7700HQ processor and dual channel ram 2400MHz. + # It didn't seem to matter heavily on the dtype used. + # + # A 2D array using repeated concatenation requires 2 copies of the array. + # + # The fastest algorithm will depend on the ratio of CPU power to memory + # speed. + # One can monitor the results of the benchmark + # https://pv.github.io/numpy-bench/#bench_shape_base.Block2D.time_block2d + # to tune this parameter until a C version of the `_block_info_recursion` + # algorithm is implemented which would likely be faster than the python + # version. + if list_ndim * final_size > (2 * 512 * 512): + return _block_slicing(arrays, list_ndim, result_ndim) + else: + return _block_concatenate(arrays, list_ndim, result_ndim) + + +# These helper functions are mostly used for testing. +# They allow us to write tests that directly call `_block_slicing` +# or `_block_concatenate` without blocking large arrays to force the wisdom +# to trigger the desired path. 
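A minimal usage sketch for the size-based dispatch above (illustrative only; it assumes the ``import mars.tensor as mt`` convention used by the docstring examples in this module, and the element counts quoted apply to this particular input):

def _block_usage_sketch():  # pragma: no cover - illustrative sketch
    import mars.tensor as mt

    a = mt.eye(2) * 2     # (2, 2) -> 4 elements
    b = mt.ones((2, 3))   # (2, 3) -> 6 elements
    c = mt.ones((3, 2))   # (3, 2) -> 6 elements
    d = mt.zeros((3, 3))  # (3, 3) -> 9 elements
    # list_ndim == 2 and final_size == 25, so list_ndim * final_size is far
    # below 2 * 512 * 512 and block() takes the repeated-concatenation path;
    # a result with hundreds of thousands of elements would instead be
    # assembled by slice assignment into one pre-allocated tensor.
    return mt.block([[a, b], [c, d]])  # assembles a (5, 5) block matrix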
+def _block_setup(arrays): + """ + Returns + (`arrays`, list_ndim, result_ndim, final_size) + """ + bottom_index, arr_ndim, final_size = _block_check_depths_match(arrays) + list_ndim = len(bottom_index) + if bottom_index and bottom_index[-1] is None: + raise ValueError( + "List at {} cannot be empty".format(_block_format_index(bottom_index)) + ) + result_ndim = max(arr_ndim, list_ndim) + return arrays, list_ndim, result_ndim, final_size + + +def _block_slicing(arrays, list_ndim, result_ndim): + shape, slices, arrays = _block_info_recursion(arrays, list_ndim, result_ndim) + dtype = np.result_type(*[arr.dtype for arr in arrays]) + + # Test preferring F only in the case that all input arrays are F + F_order = all(arr.flags["F_CONTIGUOUS"] for arr in arrays) + C_order = all(arr.flags["C_CONTIGUOUS"] for arr in arrays) + order = "F" if F_order and not C_order else "C" + result = empty(shape=shape, dtype=dtype, order=order) + # Note: In a c implementation, the function + # PyArray_CreateMultiSortedStridePerm could be used for more advanced + # guessing of the desired order. + + for the_slice, arr in zip(slices, arrays): + result[(Ellipsis,) + the_slice] = arr + return result + + +def _block_concatenate(arrays, list_ndim, result_ndim): + result = _block(arrays, list_ndim, result_ndim) + if list_ndim == 0: + # Catch an edge case where _block returns a view because + # `arrays` is a single mars array and not a list of mars arrays. + # This might copy scalars or lists twice, but this isn't a likely + # usecase for those interested in performance + result = result.copy() + return result diff --git a/python/xorbits/_mars/tensor/merge/column_stack.py b/python/xorbits/_mars/tensor/merge/column_stack.py new file mode 100644 index 000000000..e4e7c011c --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/column_stack.py @@ -0,0 +1,63 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..datasource import tensor as astensor +from .concatenate import concatenate + + +def column_stack(tup): + """ + Stack 1-D tensors as columns into a 2-D tensor. + + Take a sequence of 1-D tensors and stack them as columns + to make a single 2-D tensor. 2-D tensors are stacked as-is, + just like with `hstack`. 1-D tensors are turned into 2-D columns + first. + + Parameters + ---------- + tup : sequence of 1-D or 2-D tensors. + Tensors to stack. All of them must have the same first dimension. + + Returns + ------- + stacked : 2-D tensor + The tensor formed by stacking the given tensors. 
+ + See Also + -------- + stack, hstack, vstack, concatenate + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array((1,2,3)) + >>> b = mt.array((2,3,4)) + >>> mt.column_stack((a,b)).execute() + array([[1, 2], + [2, 3], + [3, 4]]) + + """ + from ..datasource import array + + arrays = [] + for a in tup: + a = astensor(a) + if a.ndim < 2: + a = array(a, ndmin=2).T + arrays.append(a) + + return concatenate(arrays, 1) diff --git a/python/xorbits/_mars/tensor/merge/concatenate.py b/python/xorbits/_mars/tensor/merge/concatenate.py new file mode 100644 index 000000000..512b11dad --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/concatenate.py @@ -0,0 +1,327 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import operator +import tempfile +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import ( + AnyField, + BoolField, + SliceField, + StringField, + TupleField, +) +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..indexing.slice import TensorSlice +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import unify_chunks, validate_axis + + +def _get_index(chunk): + try: + return chunk.index + except AttributeError: + if isinstance(chunk.op, TensorSlice): + return chunk.inputs[0].index + raise + + +def _norm_axis(axis): + if isinstance(axis, int): + return axis, True + if isinstance(axis, Iterable): + axis = sorted(tuple(axis)) + if len(axis) == 1: + return axis[0], True + return axis, False + + assert axis is None + return None, False + + +class TensorConcatenate(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CONCATENATE + + _axis = AnyField("axis") + + # for mmap + _mmap = BoolField("mmap") + _file_prefix = StringField("file_prefix") + _create_mmap_file = BoolField("create_mmap_file") + _partition_slice = SliceField("partition_slice") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + mmap=None, + file_prefix=None, + create_mmap_file=None, + partition_slice=None, + total_shape=None, + **kw + ): + super().__init__( + _axis=axis, + _mmap=mmap, + _file_prefix=file_prefix, + _create_mmap_file=create_mmap_file, + _partition_slice=partition_slice, + _total_shape=total_shape, + **kw + ) + + @property + def axis(self): + return getattr(self, "_axis", None) + + @property + def mmap(self): + return self._mmap + + @property + def file_prefix(self): + return self._file_prefix + + @property + def create_mmap_file(self): + return self._create_mmap_file + + @property + def partition_slice(self): + return self._partition_slice + + @property + def total_shape(self): + return self._total_shape + + def __call__(self, tensors): + if len(set(t.ndim for t in tensors)) != 1: + raise ValueError( + "all the input tensors must have same number of dimensions" + ) + + axis = self._axis + shapes = [t.shape[:axis] + t.shape[axis + 1 :] for t in 
tensors] + if len(set(shapes)) != 1: + raise ValueError( + "all the input tensor dimensions " + "except for the concatenation axis must match exactly" + ) + + shape = [ + 0 if i == axis else tensors[0].shape[i] for i in range(tensors[0].ndim) + ] + shape[axis] = sum(t.shape[axis] for t in tensors) + + if any(np.isnan(s) for i, s in enumerate(shape) if i != axis): + raise ValueError("cannot concatenate tensor with unknown shape") + + return self.new_tensor(tensors, shape=tuple(shape)) + + @classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + + inputs = op.inputs + output = op.outputs[0] + axis = op.axis + + c = itertools.count(inputs[0].ndim) + tensor_axes = [ + (t, tuple(i if i != axis else next(c) for i in range(t.ndim))) + for t in inputs + ] + inputs = yield from unify_chunks(*tensor_axes) + + out_chunk_shape = [ + 0 if i == axis else inputs[0].chunk_shape[i] for i in range(inputs[0].ndim) + ] + out_chunk_shape[axis] = sum(t.chunk_shape[axis] for t in inputs) + out_nsplits = [ + None if i == axis else inputs[0].nsplits[i] for i in range(inputs[0].ndim) + ] + out_nsplits[axis] = tuple(itertools.chain(*[t.nsplits[axis] for t in inputs])) + + out_chunks = [] + axis_cum_chunk_shape = np.cumsum([t.chunk_shape[axis] for t in inputs]) + for out_idx in itertools.product(*[range(s) for s in out_chunk_shape]): + axis_index = np.searchsorted( + axis_cum_chunk_shape, out_idx[axis], side="right" + ) + t = inputs[axis_index] + axis_inner_index = out_idx[axis] - ( + 0 if axis_index < 1 else axis_cum_chunk_shape[axis_index - 1] + ) + idx = out_idx[:axis] + (axis_inner_index,) + out_idx[axis + 1 :] + in_chunk = t.cix[idx] + if idx == out_idx: + # if index is the same, just use the input chunk + out_chunks.append(in_chunk) + else: + chunk_op = TensorSlice( + slices=[slice(None) for _ in range(in_chunk.ndim)], + dtype=in_chunk.dtype, + sparse=in_chunk.op.sparse, + ) + out_chunk = chunk_op.new_chunk( + [in_chunk], shape=in_chunk.shape, index=out_idx, order=output.order + ) + + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + output.shape, + order=output.order, + nsplits=out_nsplits, + chunks=out_chunks, + ) + + @staticmethod + def _ensure_order(result, order): + return result.astype(result.dtype, order=order.value, copy=False) + + @classmethod + def execute(cls, ctx, op): + if op.mmap: # pragma: no cover + cls._execute_with_mmap(ctx, op) + else: + cls._execute(ctx, op) + + @classmethod + def _execute(cls, ctx, op): + def _base_concatenate(chunk, inputs): + inputs, device_id, xp = as_same_device( + inputs, device=chunk.op.device, ret_extra=True + ) + + axis, single_axis = _norm_axis(chunk.op.axis) + if single_axis: + with device(device_id): + res = xp.concatenate(tuple(inputs), axis=axis) + else: + axes = axis or list(range(chunk.ndim)) + chunks = [ + (_get_index(input), data) + for input, data in zip(chunk.inputs, inputs) + ] + with device(device_id): + for i in range(len(axes) - 1): + new_chunks = [] + for idx, cs in itertools.groupby( + chunks, key=lambda t: t[0][:-1] + ): + cs = list(map(operator.itemgetter(1), cs)) + new_chunks.append( + (idx, xp.concatenate(cs, axis=len(axes) - i - 1)) + ) + chunks = new_chunks + res = xp.concatenate( + list(map(operator.itemgetter(1), chunks)), axis=axes[0] + ) + return res + + chunk = op.outputs[0] + inputs = [ctx[input.key] for input in op.inputs] + + if isinstance(inputs[0], tuple): + ctx[chunk.key] = tuple( + cls._ensure_order( + _base_concatenate(chunk, [input[i] for input in inputs]), + 
chunk.order, + ) + for i in range(len(inputs[0])) + ) + else: + ctx[chunk.key] = cls._ensure_order( + _base_concatenate(chunk, inputs), chunk.order + ) + + @classmethod + def _execute_with_mmap(cls, ctx, op): # pragma: no cover + if op.create_mmap_file: + path = tempfile.mkstemp(prefix=op.file_prefix, suffix=".dat")[1] + np.memmap(path, dtype=op.dtype, mode="w+", shape=op.total_shape) + ctx[op.outputs[0].key] = path + else: + path = ctx[op.inputs[0].key] + array = ctx[op.inputs[1].key] + fp = np.memmap(path, dtype=op.dtype, mode="r+", shape=op.total_shape) + fp[op.partition_slice] = array + ctx[op.outputs[0].key] = path + + +def concatenate(tensors, axis=0): + """ + Join a sequence of arrays along an existing axis. + + Parameters + ---------- + a1, a2, ... : sequence of array_like + The tensors must have the same shape, except in the dimension + corresponding to `axis` (the first, by default). + axis : int, optional + The axis along which the tensors will be joined. Default is 0. + + Returns + ------- + res : Tensor + The concatenated tensor. + + See Also + -------- + array_split : Split a tensor into multiple sub-arrays of equal or + near-equal size. + split : Split tensor into a list of multiple sub-tensors of equal size. + hsplit : Split tensor into multiple sub-tensors horizontally (column wise) + vsplit : Split tensor into multiple sub-tensors vertically (row wise) + dsplit : Split tensor into multiple sub-tensors along the 3rd axis (depth). + stack : Stack a sequence of tensors along a new axis. + hstack : Stack tensors in sequence horizontally (column wise) + vstack : Stack tensors in sequence vertically (row wise) + dstack : Stack tensors in sequence depth wise (along third dimension) + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> b = mt.array([[5, 6]]) + >>> mt.concatenate((a, b), axis=0).execute() + array([[1, 2], + [3, 4], + [5, 6]]) + >>> mt.concatenate((a, b.T), axis=1).execute() + array([[1, 2, 5], + [3, 4, 6]]) + + """ + if axis is None: + axis = 0 + tensors = [astensor(t) for t in tensors] + + axis = validate_axis(tensors[0].ndim, axis) + dtype = np.result_type(*(t.dtype for t in tensors)) + sparse = all(t.issparse() for t in tensors) + + op = TensorConcatenate(axis=axis, dtype=dtype, sparse=sparse) + return op(tensors) diff --git a/python/xorbits/_mars/tensor/merge/dstack.py b/python/xorbits/_mars/tensor/merge/dstack.py new file mode 100644 index 000000000..2b9bf1da0 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/dstack.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base import atleast_3d +from .concatenate import concatenate + + +def dstack(tup): + """ + Stack tensors in sequence depth wise (along third axis). + + This is equivalent to concatenation along the third axis after 2-D tensors + of shape `(M,N)` have been reshaped to `(M,N,1)` and 1-D arrays of shape + `(N,)` have been reshaped to `(1,N,1)`. Rebuilds arrays divided by + `dsplit`. 
+ + This function makes most sense for arrays with up to 3 dimensions. For + instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions `concatenate`, `stack` and + `block` provide more general stacking and concatenation operations. + + Parameters + ---------- + tup : sequence of tensors + The tensors must have the same shape along all but the third axis. + 1-D or 2-D arrays must have the same shape. + + Returns + ------- + stacked : Tensor + The array formed by stacking the given tensors, will be at least 3-D. + + See Also + -------- + stack : Join a sequence of tensors along a new axis. + vstack : Stack along first axis. + hstack : Stack along second axis. + concatenate : Join a sequence of arrays along an existing axis. + dsplit : Split tensor along third axis. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array((1,2,3)) + >>> b = mt.array((2,3,4)) + >>> mt.dstack((a,b)).execute() + array([[[1, 2], + [2, 3], + [3, 4]]]) + + >>> a = mt.array([[1],[2],[3]]) + >>> b = mt.array([[2],[3],[4]]) + >>> mt.dstack((a,b)).execute() + array([[[1, 2]], + [[2, 3]], + [[3, 4]]]) + + """ + return concatenate([atleast_3d(t) for t in tup], axis=2) diff --git a/python/xorbits/_mars/tensor/merge/hstack.py b/python/xorbits/_mars/tensor/merge/hstack.py new file mode 100644 index 000000000..1e01f9745 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/hstack.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .concatenate import concatenate + + +def hstack(tup): + """ + Stack tensors in sequence horizontally (column wise). + + This is equivalent to concatenation along the second axis, except for 1-D + tensors where it concatenates along the first axis. Rebuilds tensors divided + by `hsplit`. + + This function makes most sense for tensors with up to 3 dimensions. For + instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions `concatenate`, `stack` and + `block` provide more general stacking and concatenation operations. + + Parameters + ---------- + tup : sequence of tensors + The tensors must have the same shape along all but the second axis, + except 1-D tensors which can be any length. + + Returns + ------- + stacked : Tensor + The tensor formed by stacking the given tensors. + + See Also + -------- + stack : Join a sequence of tensors along a new axis. + vstack : Stack tensors in sequence vertically (row wise). + dstack : Stack tensors in sequence depth wise (along third axis). + concatenate : Join a sequence of tensors along an existing axis. + hsplit : Split tensor along second axis. + block : Assemble tensors from blocks. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array((1,2,3)) + >>> b = mt.array((2,3,4)) + >>> mt.hstack((a,b)).execute() + array([1, 2, 3, 2, 3, 4]) + >>> a = mt.array([[1],[2],[3]]) + >>> b = mt.array([[2],[3],[4]]) + >>> mt.hstack((a,b)).execute() + array([[1, 2], + [2, 3], + [3, 4]]) + + """ + if all(x.ndim == 1 for x in tup): + return concatenate(tup, axis=0) + else: + return concatenate(tup, axis=1) diff --git a/python/xorbits/_mars/tensor/merge/stack.py b/python/xorbits/_mars/tensor/merge/stack.py new file mode 100644 index 000000000..3bc88f711 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/stack.py @@ -0,0 +1,217 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import check_out_param, unify_chunks + + +class TensorStack(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.STACK + + _axis = Int32Field("axis") + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @property + def axis(self): + return self._axis + + def __call__(self, tensors, out=None): + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"`out` must be a Tensor, got {type(out)} instead") + + shape = ( + tensors[0].shape[: self._axis] + + (len(tensors),) + + tensors[0].shape[self._axis :] + ) + tensor_order = TensorOrder.C_ORDER if out is None else out.order + t = self.new_tensor(tensors, shape, order=tensor_order) + + if out is None: + return t + + if out.shape != t.shape: + raise ValueError("Output tensor has wrong dimensionality") + check_out_param(out, t, "same_kind") + out.data = t.data + return out + + @classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + + if has_unknown_shape(*op.inputs): + yield + + if len(set([inp.shape for inp in op.inputs])) != 1: + # check shape again when input has unknown shape + raise ValueError("all input tensors must have the same shape") + + inputs = yield from unify_chunks(*op.inputs) + output = op.outputs[0] + axis = op.axis + + output_nsplits = ( + inputs[0].nsplits[:axis] + ((1,) * len(inputs),) + inputs[0].nsplits[axis:] + ) + output_idxes = itertools.product( + *[range(len(nsplit)) for nsplit in output_nsplits] + ) + + out_chunks = [] + for idx in output_idxes: + input_idx = idx[:axis] + idx[axis + 1 :] + i = idx[axis] + input_chunk = inputs[i].cix[input_idx] + slices = ( + [slice(None)] * axis + + [np.newaxis] + + [slice(None)] * (len(input_idx) - axis) + ) + shape = input_chunk.shape[:axis] + (1,) + input_chunk.shape[axis:] + chunk_op = TensorSlice(slices=slices, dtype=op.dtype, sparse=op.sparse) + out_chunk = chunk_op.new_chunk( + [input_chunk], 
shape=shape, index=idx, order=output.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, output.shape, chunks=out_chunks, nsplits=output_nsplits + ) + + @classmethod + def execute(cls, ctx, op): + raw_inputs = [ctx[c.key] for c in op.inputs] + is_input_tuple = isinstance(raw_inputs[0], tuple) + input_tuple_len = len(raw_inputs[0]) if is_input_tuple else 1 + + if is_input_tuple: + # situation that stack is used during tiling, not created by user + inputs = list(itertools.chain.from_iterable(raw_inputs)) + else: + inputs = raw_inputs + # move all the data to the same device + inputs, device_id, xp = as_same_device(inputs, device=op.device, ret_extra=True) + if is_input_tuple: + inputs = [ + inputs[i * input_tuple_len : (i + 1) * input_tuple_len] + for i in range(len(raw_inputs)) + ] + else: + inputs = [[inp] for inp in inputs] + + axis = op.axis + out = op.outputs[0] + with device(device_id): + rets = [] + for i in range(input_tuple_len): + ret = xp.stack([inp[i] for inp in inputs], axis=axis) + # make sure order is identical to out's order + ret = ret.astype(ret.dtype, order=out.order.value, copy=False) + rets.append(ret) + ctx[out.key] = rets if is_input_tuple else rets[0] + + +def stack(tensors, axis=0, out=None): + """ + Join a sequence of tensors along a new axis. + + The `axis` parameter specifies the index of the new axis in the dimensions + of the result. For example, if ``axis=0`` it will be the first dimension + and if ``axis=-1`` it will be the last dimension. + + Parameters + ---------- + tensors : sequence of array_like + Each tensor must have the same shape. + axis : int, optional + The axis in the result tensor along which the input tensors are stacked. + out : Tensor, optional + If provided, the destination to place the result. The shape must be + correct, matching that of what stack would have returned if no + out argument were specified. + + Returns + ------- + stacked : Tensor + The stacked tensor has one more dimension than the input tensors. + + See Also + -------- + concatenate : Join a sequence of tensors along an existing axis. + split : Split tensor into a list of multiple sub-tensors of equal size. + block : Assemble tensors from blocks. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> arrays = [mt.random.randn(3, 4) for _ in range(10)] + >>> mt.stack(arrays, axis=0).shape + (10, 3, 4) + + >>> mt.stack(arrays, axis=1).shape + (3, 10, 4) + + >>> mt.stack(arrays, axis=2).shape + (3, 4, 10) + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.stack((a, b)).execute() + array([[1, 2, 3], + [2, 3, 4]]) + + >>> mt.stack((a, b), axis=-1).execute() + array([[1, 2], + [2, 3], + [3, 4]]) + + """ + tensors = [astensor(t) for t in tensors] + + to_check_shapes = [] + for t in tensors: + if not any(np.isnan(s) for s in t.shape): + to_check_shapes.append(t.shape) + if to_check_shapes and len(set(to_check_shapes)) != 1: + raise ValueError("all input tensors must have the same shape") + + ndim = len(tensors[0].shape) + raw_axis = axis + if axis < 0: + axis = ndim + axis + 1 + if axis > ndim or axis < 0: + raise np.AxisError( + f"axis {raw_axis} is out of bounds for tensor of dimension {ndim}" + ) + + dtype = np.result_type(*[t.dtype for t in tensors]) + sparse = all(t.issparse() for t in tensors) + + op = TensorStack(axis=axis, dtype=dtype, sparse=sparse) + return op(tensors, out=out) diff --git a/python/xorbits/_mars/tensor/merge/tests/__init__.py b/python/xorbits/_mars/tensor/merge/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/merge/tests/test_merge.py b/python/xorbits/_mars/tensor/merge/tests/test_merge.py new file mode 100644 index 000000000..8c552a7fa --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/tests/test_merge.py @@ -0,0 +1,103 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....core import tile +from ...datasource import empty, ones +from .. 
import concatenate, stack + + +def test_concatenate(): + a = ones((10, 20, 30), chunk_size=10) + b = ones((20, 20, 30), chunk_size=20) + + c = concatenate([a, b]) + assert c.shape == (30, 20, 30) + + a = ones((10, 20, 30), chunk_size=10) + b = ones((10, 20, 40), chunk_size=20) + + c = concatenate([a, b], axis=-1) + assert c.shape == (10, 20, 70) + + with pytest.raises(ValueError): + a = ones((10, 20, 30), chunk_size=10) + b = ones((20, 30, 30), chunk_size=20) + + concatenate([a, b]) + + with pytest.raises(ValueError): + a = ones((10, 20, 30), chunk_size=10) + b = ones((20, 20), chunk_size=20) + + concatenate([a, b]) + + a = ones((10, 20, 30), chunk_size=5) + b = ones((20, 20, 30), chunk_size=10) + + a, c = tile(a, concatenate([a, b])) + assert c.chunk_shape[0] == 4 + assert c.chunk_shape[1] == 4 + assert c.chunk_shape[2] == 6 + assert c.nsplits == ((5, 5, 10, 10), (5,) * 4, (5,) * 6) + assert c.cix[0, 0, 0].key == a.cix[0, 0, 0].key + assert c.cix[1, 0, 0].key == a.cix[1, 0, 0].key + + +def test_stack(): + raw_arrs = [ones((3, 4), chunk_size=2) for _ in range(10)] + arr2 = stack(raw_arrs, axis=0) + + assert arr2.shape == (10, 3, 4) + + arr2 = tile(arr2) + assert arr2.nsplits == ((1,) * 10, (2, 1), (2, 2)) + + arr3 = stack(raw_arrs, axis=1) + + assert arr3.shape == (3, 10, 4) + + arr3 = tile(arr3) + assert arr3.nsplits == ((2, 1), (1,) * 10, (2, 2)) + + arr4 = stack(raw_arrs, axis=2) + + assert arr4.shape == (3, 4, 10) + + arr4 = tile(arr4) + assert arr4.nsplits == ((2, 1), (2, 2), (1,) * 10) + + with pytest.raises(ValueError): + raw_arrs2 = [ones((3, 4), chunk_size=2), ones((4, 3), chunk_size=2)] + stack(raw_arrs2) + + with pytest.raises(np.AxisError): + stack(raw_arrs, axis=3) + + arr5 = tile(stack(raw_arrs, -1)) + assert arr5.nsplits == ((2, 1), (2, 2), (1,) * 10) + + arr6 = tile(stack(raw_arrs, -3)) + assert arr6.nsplits == ((1,) * 10, (2, 1), (2, 2)) + + with pytest.raises(np.AxisError): + stack(raw_arrs, axis=-4) + + with pytest.raises(TypeError): + stack(raw_arrs, out=1) + + with pytest.raises(ValueError): + stack(raw_arrs, empty((1, 10, 3, 4))) diff --git a/python/xorbits/_mars/tensor/merge/tests/test_merge_execution.py b/python/xorbits/_mars/tensor/merge/tests/test_merge_execution.py new file mode 100644 index 000000000..9e86ecd11 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/tests/test_merge_execution.py @@ -0,0 +1,371 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ... 
import ( + append, + array, + block, + column_stack, + concatenate, + dstack, + hstack, + stack, + union1d, + vstack, +) +from ...datasource import empty, eye, ones, tensor, zeros + + +def test_concatenate_execution(setup): + a_data = np.random.rand(10, 20, 30) + b_data = np.random.rand(10, 20, 40) + c_data = np.random.rand(10, 20, 50) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=10) + c = tensor(c_data, chunk_size=15) + + d = concatenate([a, b, c], axis=-1) + res = d.execute().fetch() + expected = np.concatenate([a_data, b_data, c_data], axis=-1) + np.testing.assert_array_equal(res, expected) + + a_data = sps.random(10, 30) + b_data = sps.rand(10, 40) + c_data = sps.rand(10, 50) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=10) + c = tensor(c_data, chunk_size=15) + + d = concatenate([a, b, c], axis=-1) + res = d.execute().fetch() + expected = np.concatenate([a_data.A, b_data.A, c_data.A], axis=-1) + np.testing.assert_array_equal(res.toarray(), expected) + + +def test_stack_execution(setup): + raw = [np.random.randn(3, 4) for _ in range(10)] + arrs = [tensor(a, chunk_size=3) for a in raw] + + arr2 = stack(arrs) + res = arr2.execute().fetch() + assert np.array_equal(res, np.stack(raw)) is True + + arr3 = stack(arrs, axis=1) + res = arr3.execute().fetch() + assert np.array_equal(res, np.stack(raw, axis=1)) is True + + arr4 = stack(arrs, axis=2) + res = arr4.execute().fetch() + assert np.array_equal(res, np.stack(raw, axis=2)) is True + + raw2 = [np.asfortranarray(np.random.randn(3, 4)) for _ in range(10)] + arr5 = [tensor(a, chunk_size=3) for a in raw2] + + arr6 = stack(arr5) + res = arr6.execute().fetch() + expected = np.stack(raw2).copy("A") + np.testing.assert_array_equal(res, expected) + + arr7 = stack(arr5, out=empty((10, 3, 4), order="F")) + res = arr7.execute().fetch() + expected = np.stack(raw2, out=np.empty((10, 3, 4), order="F")).copy("A") + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + # test stack with unknown shapes + t = tensor(raw[0], chunk_size=3) + t2 = t[t[:, 0] > 0.0] + t3 = t2 + 1 + + arr8 = stack([t2, t3]) + result = arr8.execute().fetch() + e = raw[0] + e2 = e[e[:, 0] > 0.0] + e3 = e2 + 1 + np.testing.assert_array_equal(result, np.stack([e2, e3])) + + +def test_h_stack_execution(setup): + a_data = np.random.rand(10) + b_data = np.random.rand(20) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=8) + + c = hstack([a, b]) + res = c.execute().fetch() + expected = np.hstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + a_data = np.random.rand(10, 20) + b_data = np.random.rand(10, 5) + + a = tensor(a_data, chunk_size=6) + b = tensor(b_data, chunk_size=8) + + c = hstack([a, b]) + res = c.execute().fetch() + expected = np.hstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + +def test_v_stack_execution(setup): + a_data = np.random.rand(10) + b_data = np.random.rand(10) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=8) + + c = vstack([a, b]) + res = c.execute().fetch() + expected = np.vstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + a_data = np.random.rand(10, 20) + b_data = np.random.rand(5, 20) + + a = tensor(a_data, chunk_size=6) + b = tensor(b_data, chunk_size=8) + + c = vstack([a, b]) + res = c.execute().fetch() + expected = np.vstack([a_data, b_data]) + assert 
np.array_equal(res, expected) is True + + +def test_d_stack_execution(setup): + a_data = np.random.rand(10) + b_data = np.random.rand(10) + + a = tensor(a_data, chunk_size=8) + b = tensor(b_data, chunk_size=8) + + c = dstack([a, b]) + res = c.execute().fetch() + expected = np.dstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + a_data = np.random.rand(10, 20) + b_data = np.random.rand(10, 20) + + a = tensor(a_data, chunk_size=6) + b = tensor(b_data, chunk_size=8) + + c = dstack([a, b]) + res = c.execute().fetch() + expected = np.dstack([a_data, b_data]) + assert np.array_equal(res, expected) is True + + +def test_column_stack_execution(setup): + a_data = np.array((1, 2, 3)) + b_data = np.array((2, 3, 4)) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=2) + + c = column_stack((a, b)) + res = c.execute().fetch() + expected = np.column_stack((a_data, b_data)) + np.testing.assert_equal(res, expected) + + a_data = np.random.rand(4, 2, 3) + b_data = np.random.rand(4, 2, 3) + a = tensor(a_data, chunk_size=1) + b = tensor(b_data, chunk_size=2) + + c = column_stack((a, b)) + res = c.execute().fetch() + expected = np.column_stack((a_data, b_data)) + np.testing.assert_equal(res, expected) + + +def test_union1d_execution(setup): + rs = np.random.RandomState(0) + raw1 = rs.random(10) + raw2 = rs.random(9) + + t1 = tensor(raw1, chunk_size=3) + t2 = tensor(raw2, chunk_size=4) + + t = union1d(t1, t2, aggregate_size=1) + res = t.execute().fetch() + expected = np.union1d(raw1, raw2) + np.testing.assert_array_equal(res, expected) + + t = union1d(t1, t2) + res = t.execute().fetch() + expected = np.union1d(raw1, raw2) + np.testing.assert_array_equal(res, expected) + + +def test_block_execution(setup): + # arrays is a tuple. + with pytest.raises(TypeError): + block((1, 2, 3)) + + # List depths are mismatched. + with pytest.raises(ValueError): + block([[1, 2], [[3, 4]]]) + + # List at arrays cannot be empty. + with pytest.raises(ValueError): + block([]) + + # List at arrays[1] cannot be empty. + with pytest.raises(ValueError): + block([[1, 2], []]) + + # Mismatched array shapes. + with pytest.raises(ValueError): + block([eye(512), eye(512), ones((511, 1))]) + + # Test large block. + block([eye(512), eye(512), ones((512, 1))]) + + # Test block inputs a single array. 
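+    # (block on a bare array is expected to behave like atleast_1d and return
+    # the data unchanged, which the assertion below checks.)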
+ c = block(array([1, 2, 3])) + r = c.execute().fetch() + np.testing.assert_array_equal(r, array([1, 2, 3])) + + a = eye(2) * 2 + b = eye(3) * 3 + c = block([[a, zeros((2, 3))], [ones((3, 2)), b]]) + r = c.execute().fetch() + expected = array( + [ + [2.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 3.0, 0.0, 0.0], + [1.0, 1.0, 0.0, 3.0, 0.0], + [1.0, 1.0, 0.0, 0.0, 3.0], + ] + ) + np.testing.assert_array_equal(r, expected) + + # eye with different chunk sizes + a = eye(5, chunk_size=2) * 2 + b = eye(4, chunk_size=3) * 3 + c = block([[a, zeros((5, 4), chunk_size=4)], [ones((4, 5), chunk_size=5), b]]) + r = c.execute().fetch() + expected = array( + [ + [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 3.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 3.0], + ] + ) + np.testing.assert_array_equal(r, expected) + + # hstack([1, 2, 3]) + c = block([1, 2, 3]) + r = c.execute().fetch() + expected = array([1, 2, 3]) + np.testing.assert_array_equal(r, expected) + + # hstack([a, b, 10]) + a = array([1, 2, 3]) + b = array([2, 3, 4]) + c = block([a, b, 10]) + r = c.execute().fetch() + expected = array([1, 2, 3, 2, 3, 4, 10]) + np.testing.assert_array_equal(r, expected) + + # hstack([a, b, 10]) with different chunk sizes + a = array([1, 2, 3, 4, 5, 6, 7], chunk_size=3) + b = array([2, 3, 4, 5], chunk_size=4) + c = block([a, b, 10]) + r = c.execute().fetch() + expected = array([1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 10]) + np.testing.assert_array_equal(r, expected) + + # hstack([A, B]) + A = ones((2, 2), int) + B = 2 * A + c = block([A, B]) + r = c.execute().fetch() + expected = array([[1, 1, 2, 2], [1, 1, 2, 2]]) + np.testing.assert_array_equal(r, expected) + + # vstack([a, b]) + a = array([1, 2, 3]) + b = array([2, 3, 4]) + c = block([[a], [b]]) + r = c.execute().fetch() + expected = array([[1, 2, 3], [2, 3, 4]]) + np.testing.assert_array_equal(r, expected) + + # vstack([a, b]) with different chunk sizes + a = array([1, 2, 3, 4, 5, 6, 7], chunk_size=5) + b = array([2, 3, 4, 5, 6, 7, 8], chunk_size=6) + c = block([[a], [b]]) + r = c.execute().fetch() + expected = array([[1, 2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 7, 8]]) + np.testing.assert_array_equal(r, expected) + + # vstack([A, B]) + A = ones((2, 2), int) + B = 2 * A + c = block([[A], [B]]) + r = c.execute().fetch() + expected = array([[1, 1], [1, 1], [2, 2], [2, 2]]) + np.testing.assert_array_equal(r, expected) + + a = array(0) + b = array([1]) + # atleast_1d(a) + c = block([a]) + r = c.execute().fetch() + expected = array([0]) + np.testing.assert_array_equal(r, expected) + # atleast_1d(b) + c = block([b]) + r = c.execute().fetch() + expected = array([1]) + np.testing.assert_array_equal(r, expected) + # atleast_2d(a) + c = block([[a]]) + r = c.execute().fetch() + expected = array([[0]]) + np.testing.assert_array_equal(r, expected) + # atleast_2d(b) + c = block([[b]]) + r = c.execute().fetch() + expected = array([[1]]) + np.testing.assert_array_equal(r, expected) + + +@pytest.mark.parametrize("axis", [0, None]) +def test_append_execution(setup, axis): + raw1 = np.random.rand(10, 3) + raw2 = np.random.rand(6, 3) + + a1 = tensor(raw1, chunk_size=3) + a2 = tensor(raw2, chunk_size=4) + r = append(a1, a2, axis=axis) + 
result = r.execute().fetch() + expected = np.append(raw1, raw2, axis=axis) + np.testing.assert_array_equal(result, expected) diff --git a/python/xorbits/_mars/tensor/merge/union1d.py b/python/xorbits/_mars/tensor/merge/union1d.py new file mode 100644 index 000000000..275d7827c --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/union1d.py @@ -0,0 +1,55 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def union1d(ar1, ar2, aggregate_size=None): + """ + Find the union of two tensors. + + Return the unique, sorted tensor of values that are in either of the two + input tensors. + + Parameters + ---------- + ar1, ar2 : array_like + Input tensors. They are flattened if they are not already 1D. + + Returns + ------- + union1d : Tensor + Unique, sorted union of the input tensors. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.union1d([-1, 0, 1], [-2, 0, 2]).execute() + array([-2, -1, 0, 1, 2]) + + To find the union of more than two arrays, use functools.reduce: + + >>> from functools import reduce + >>> reduce(mt.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2])).execute() + array([1, 2, 3, 4, 6]) + """ + + from ..base import sort, unique + from .concatenate import concatenate + + result = unique(concatenate((ar1, ar2), axis=None), aggregate_size=aggregate_size) + if aggregate_size == 1: + return result + # make sure the result is sorted + # TODO(xuye.qin): remove when `mt.unique` supports sort shuffle + return sort(result) diff --git a/python/xorbits/_mars/tensor/merge/vstack.py b/python/xorbits/_mars/tensor/merge/vstack.py new file mode 100644 index 000000000..1d0764820 --- /dev/null +++ b/python/xorbits/_mars/tensor/merge/vstack.py @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base import atleast_2d +from .concatenate import concatenate + + +def vstack(tup): + """ + Stack tensors in sequence vertically (row wise). + + This is equivalent to concatenation along the first axis after 1-D tensors + of shape `(N,)` have been reshaped to `(1,N)`. Rebuilds tensors divided by + `vsplit`. + + This function makes most sense for tensors with up to 3 dimensions. For + instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions `concatenate`, `stack` and + `block` provide more general stacking and concatenation operations. 
+ + Parameters + ---------- + tup : sequence of tensors + The tensors must have the same shape along all but the first axis. + 1-D tensors must have the same length. + + Returns + ------- + stacked : Tensor + The tensor formed by stacking the given tensors, will be at least 2-D. + + See Also + -------- + stack : Join a sequence of tensors along a new axis. + hstack : Stack tensors in sequence horizontally (column wise). + dstack : Stack tensors in sequence depth wise (along third dimension). + concatenate : Join a sequence of tensors along an existing axis. + vsplit : Split tensor into a list of multiple sub-arrays vertically. + block : Assemble tensors from blocks. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1, 2, 3]) + >>> b = mt.array([2, 3, 4]) + >>> mt.vstack((a,b)).execute() + array([[1, 2, 3], + [2, 3, 4]]) + + >>> a = mt.array([[1], [2], [3]]) + >>> b = mt.array([[2], [3], [4]]) + >>> mt.vstack((a,b)).execute() + array([[1], + [2], + [3], + [2], + [3], + [4]]) + + """ + return concatenate([atleast_2d(t) for t in tup], axis=0) diff --git a/python/xorbits/_mars/tensor/operands.py b/python/xorbits/_mars/tensor/operands.py new file mode 100644 index 000000000..df2934095 --- /dev/null +++ b/python/xorbits/_mars/tensor/operands.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
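+
+# This module defines the tensor-level operand base classes: ``TensorOperandMixin``
+# adds the ``new_tensor``/``new_tensors`` construction helpers and the chunk
+# concatenation/fusion hooks on top of ``TileableOperandMixin``, while
+# ``TensorOperand``, ``TensorHasInput``, ``TensorShuffleProxy``,
+# ``TensorMapReduceOperand`` and ``TensorFuse`` fix the output type to
+# ``OutputType.tensor`` and carry a ``dtype`` field.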
+ +from ..core import OutputType +from ..core.operand import ( + Fuse, + HasInput, + MapReduceOperand, + Operand, + ShuffleProxy, + TileableOperandMixin, +) +from ..serialization.serializables import DataTypeField +from ..utils import calc_nsplits + + +class TensorOperandMixin(TileableOperandMixin): + __slots__ = () + _op_module_ = "tensor" + _output_type_ = OutputType.tensor + + def new_tensors( + self, + inputs, + shape=None, + dtype=None, + order=None, + chunks=None, + nsplits=None, + output_limit=None, + kws=None, + **kw + ): + return self.new_tileables( + inputs, + shape=shape, + chunks=chunks, + nsplits=nsplits, + output_limit=output_limit, + kws=kws, + dtype=dtype, + order=order, + **kw + ) + + def new_tensor(self, inputs, shape, dtype=None, order=None, **kw): + if getattr(self, "output_limit") != 1: + raise TypeError("cannot new tensor with more than 1 outputs") + return self.new_tensors(inputs, shape=shape, dtype=dtype, order=order, **kw)[0] + + @classmethod + def concat_tileable_chunks(cls, tileable): + from .merge.concatenate import TensorConcatenate + + tensor = tileable + assert not tensor.is_coarse() + + op = TensorConcatenate(dtype=tensor.dtype) + chunk = TensorConcatenate(dtype=tensor.dtype).new_chunk( + tensor.chunks, shape=tensor.shape, index=(0,) * tileable.ndim + ) + return op.new_tensor( + [tensor], + tensor.shape, + chunks=[chunk], + nsplits=tuple((s,) for s in tensor.shape), + ) + + @classmethod + def create_tileable_from_chunks(cls, chunks, inputs=None, **kw): + chunk_idx_to_shape = {c.index: c.shape for c in chunks} + nsplits = calc_nsplits(chunk_idx_to_shape) + shape = tuple(sum(ns) for ns in nsplits) + op = chunks[0].op.copy().reset_key() + return op.new_tensor( + inputs, + shape=shape, + chunks=chunks, + nsplits=nsplits, + dtype=chunks[0].dtype, + **kw + ) + + def get_fuse_op_cls(self, _): + from .fuse import TensorFuseChunk + + return TensorFuseChunk + + +class TensorOperand(Operand): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + +class TensorHasInput(HasInput): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + +class TensorShuffleProxy(ShuffleProxy, TensorOperandMixin): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + @classmethod + def execute(cls, ctx, op): + pass + + +class TensorMapReduceOperand(MapReduceOperand): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) + + +class TensorFuse(Fuse): + _output_type_ = OutputType.tensor + + dtype = DataTypeField("dtype", default=None) diff --git a/python/xorbits/_mars/tensor/random/__init__.py b/python/xorbits/_mars/tensor/random/__init__.py new file mode 100644 index 000000000..ebd129073 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/__init__.py @@ -0,0 +1,166 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
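+
+# ``_install()`` below attaches every distribution function as a method of
+# ``RandomState``, and the module-level names (``rand``, ``randint``, ``normal``,
+# ...) are then bound to the shared ``_random_state`` instance so the package
+# mirrors the ``numpy.random`` API. A minimal usage sketch (parameter values
+# are illustrative only):
+#
+#     import mars.tensor as mt
+#
+#     mt.random.seed(0)
+#     t = mt.random.rand(1000, 1000, chunk_size=250)   # uniform over [0, 1)
+#     mt.random.normal(0.0, 1.0, size=(100,)).execute()
+#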
+ +from .beta import TensorRandBeta, beta +from .binomial import TensorBinomial, binomial +from .bytes import bytes +from .chisquare import TensorChisquareDist, chisquare +from .choice import TensorChoice, choice +from .core import RandomState, RandomStateField, _random_state +from .dirichlet import TensorDirichlet, dirichlet +from .exponential import TensorExponential, exponential +from .f import TensorF, f +from .gamma import TensorRandGamma, gamma +from .geometric import TensorGeometric, geometric +from .gumbel import TensorGumbel, gumbel +from .hypergeometric import TensorHypergeometric, hypergeometric +from .laplace import TensorLaplace, laplace +from .logistic import TensorLogistic, logistic +from .lognormal import TensorLognormal, lognormal +from .logseries import TensorLogseries, logseries +from .multinomial import TensorMultinomial, multinomial +from .multivariate_normal import TensorMultivariateNormal, multivariate_normal +from .negative_binomial import TensorNegativeBinomial, negative_binomial +from .noncentral_chisquare import TensorNoncentralChisquare, noncentral_chisquare +from .noncentral_f import TensorNoncentralF, noncentral_f +from .normal import TensorNormal, normal +from .pareto import TensorPareto, pareto +from .permutation import TensorPermutation, permutation +from .poisson import TensorPoisson, poisson +from .power import TensorRandomPower, power +from .rand import TensorRand, rand +from .randint import TensorRandint, randint +from .randn import TensorRandn, randn +from .random_integers import TensorRandomIntegers, random_integers +from .random_sample import TensorRandomSample, random_sample +from .rayleigh import TensorRayleigh, rayleigh +from .shuffle import shuffle +from .standard_cauchy import TensorStandardCauchy, standard_cauchy +from .standard_exponential import TensorStandardExponential, standard_exponential +from .standard_gamma import TensorStandardGamma, standard_gamma +from .standard_normal import TensorStandardNormal, standard_normal +from .standard_t import TensorStandardT, standard_t +from .triangular import TensorTriangular, triangular +from .uniform import TensorUniform, uniform +from .vonmises import TensorVonmises, vonmises +from .wald import TensorWald, wald +from .weibull import TensorWeibull, weibull +from .zipf import TensorZipf, zipf + + +def _install(): + setattr(RandomState, "rand", rand) + setattr(RandomState, "randn", randn) + setattr(RandomState, "randint", randint) + setattr(RandomState, "random_integers", random_integers) + setattr(RandomState, "random_sample", random_sample) + setattr(RandomState, "ranf", random_sample) + setattr(RandomState, "random", random_sample) + setattr(RandomState, "sample", random_sample) + setattr(RandomState, "choice", choice) + setattr(RandomState, "bytes", bytes) + setattr(RandomState, "beta", beta) + setattr(RandomState, "binomial", binomial) + setattr(RandomState, "chisquare", chisquare) + setattr(RandomState, "dirichlet", dirichlet) + setattr(RandomState, "exponential", exponential) + setattr(RandomState, "f", f) + setattr(RandomState, "gamma", gamma) + setattr(RandomState, "geometric", geometric) + setattr(RandomState, "gumbel", gumbel) + setattr(RandomState, "hypergeometric", hypergeometric) + setattr(RandomState, "laplace", laplace) + setattr(RandomState, "logistic", logistic) + setattr(RandomState, "lognormal", lognormal) + setattr(RandomState, "logseries", logseries) + setattr(RandomState, "multinomial", multinomial) + setattr(RandomState, "multivariate_normal", multivariate_normal) + 
setattr(RandomState, "negative_binomial", negative_binomial) + setattr(RandomState, "noncentral_chisquare", noncentral_chisquare) + setattr(RandomState, "noncentral_f", noncentral_f) + setattr(RandomState, "normal", normal) + setattr(RandomState, "pareto", pareto) + setattr(RandomState, "poisson", poisson) + setattr(RandomState, "power", power) + setattr(RandomState, "rayleigh", rayleigh) + setattr(RandomState, "standard_cauchy", standard_cauchy) + setattr(RandomState, "standard_exponential", standard_exponential) + setattr(RandomState, "standard_gamma", standard_gamma) + setattr(RandomState, "standard_normal", standard_normal) + setattr(RandomState, "standard_t", standard_t) + setattr(RandomState, "triangular", triangular) + setattr(RandomState, "uniform", uniform) + setattr(RandomState, "vonmises", vonmises) + setattr(RandomState, "wald", wald) + setattr(RandomState, "weibull", weibull) + setattr(RandomState, "zipf", zipf) + setattr(RandomState, "permutation", permutation) + setattr(RandomState, "shuffle", shuffle) + + +_install() +del _install + + +seed = _random_state.seed + +rand = _random_state.rand +randn = _random_state.randn +randint = _random_state.randint +random_integers = _random_state.random_integers +random_sample = _random_state.random_sample +random = _random_state.random +ranf = _random_state.ranf +sample = _random_state.sample +choice = _random_state.choice +bytes = _random_state.bytes + +permutation = _random_state.permutation +shuffle = _random_state.shuffle + +beta = _random_state.beta +binomial = _random_state.binomial +chisquare = _random_state.chisquare +dirichlet = _random_state.dirichlet +exponential = _random_state.exponential +f = _random_state.f +gamma = _random_state.gamma +geometric = _random_state.geometric +gumbel = _random_state.gumbel +hypergeometric = _random_state.hypergeometric +laplace = _random_state.laplace +logistic = _random_state.logistic +lognormal = _random_state.lognormal +logseries = _random_state.logseries +multinomial = _random_state.multinomial +multivariate_normal = _random_state.multivariate_normal +negative_binomial = _random_state.negative_binomial +noncentral_chisquare = _random_state.noncentral_chisquare +noncentral_f = _random_state.noncentral_f +normal = _random_state.normal +pareto = _random_state.pareto +poisson = _random_state.poisson +power = _random_state.power +rayleigh = _random_state.rayleigh +standard_cauchy = _random_state.standard_cauchy +standard_exponential = _random_state.standard_exponential +standard_gamma = _random_state.standard_gamma +standard_normal = _random_state.standard_normal +standard_t = _random_state.standard_t +triangular = _random_state.triangular +uniform = _random_state.uniform +vonmises = _random_state.vonmises +wald = _random_state.wald +weibull = _random_state.weibull +zipf = _random_state.zipf diff --git a/python/xorbits/_mars/tensor/random/beta.py b/python/xorbits/_mars/tensor/random/beta.py new file mode 100644 index 000000000..7cddc16a3 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/beta.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRandBeta(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a", "b"] + _op_type_ = OperandDef.RAND_BETA + + _fields_ = "a", "b", "size" + a = AnyField("a") + b = AnyField("b") + _func_name = "beta" + + def __call__(self, a, b, chunk_size=None): + return self.new_tensor([a, b], None, raw_chunk_size=chunk_size) + + +def beta(random_state, a, b, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Beta distribution. + + The Beta distribution is a special case of the Dirichlet distribution, + and is related to the Gamma distribution. It has the probability + distribution function + + .. math:: f(x; a,b) = \frac{1}{B(\alpha, \beta)} x^{\alpha - 1} + (1 - x)^{\beta - 1}, + + where the normalisation, B, is the beta function, + + .. math:: B(\alpha, \beta) = \int_0^1 t^{\alpha - 1} + (1 - t)^{\beta - 1} dt. + + It is often seen in Bayesian inference and order statistics. + + Parameters + ---------- + a : float or array_like of floats + Alpha, non-negative. + b : float or array_like of floats + Beta, non-negative. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` and ``b`` are both scalars. + Otherwise, ``mt.broadcast(a, b).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized beta distribution. + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .beta(handle_array(a), handle_array(b), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandBeta(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(a, b, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/binomial.py b/python/xorbits/_mars/tensor/random/binomial.py new file mode 100644 index 000000000..e2dd5700e --- /dev/null +++ b/python/xorbits/_mars/tensor/random/binomial.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorBinomial(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["n", "p"] + _op_type_ = OperandDef.RAND_BINOMIAL + + _fields_ = "n", "p", "size" + n = AnyField("n") + p = AnyField("p") + _func_name = "binomial" + + def __call__(self, n, p, chunk_size=None): + return self.new_tensor([n, p], None, raw_chunk_size=chunk_size) + + +def binomial(random_state, n, p, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a binomial distribution. + + Samples are drawn from a binomial distribution with specified + parameters, n trials and p probability of success where + n an integer >= 0 and p is in the interval [0,1]. (n may be + input as a float, but it is truncated to an integer in use) + + Parameters + ---------- + n : int or array_like of ints + Parameter of the distribution, >= 0. Floats are also accepted, + but they will be truncated to integers. + p : float or array_like of floats + Parameter of the distribution, >= 0 and <=1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``n`` and ``p`` are both scalars. + Otherwise, ``mt.broadcast(n, p).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized binomial distribution, where + each sample is equal to the number of successes over the n trials. + + See Also + -------- + scipy.stats.binom : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the binomial distribution is + + .. math:: P(N) = \binom{n}{N}p^N(1-p)^{n-N}, + + where :math:`n` is the number of trials, :math:`p` is the probability + of success, and :math:`N` is the number of successes. + + When estimating the standard error of a proportion in a population by + using a random sample, the normal distribution works well unless the + product p*n <=5, where p = population proportion estimate, and n = + number of samples, in which case the binomial distribution is used + instead. For example, a sample of 15 people shows 4 who are left + handed, and 11 who are right handed. Then p = 4/15 = 27%. 0.27*15 = 4, + so the binomial distribution should be used in this case. + + References + ---------- + .. [1] Dalgaard, Peter, "Introductory Statistics with R", + Springer-Verlag, 2002. + .. [2] Glantz, Stanton A. "Primer of Biostatistics.", McGraw-Hill, + Fifth Edition, 2002. + .. [3] Lentner, Marvin, "Elementary Applied Statistics", Bogden + and Quigley, 1972. + .. [4] Weisstein, Eric W. "Binomial Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/BinomialDistribution.html + .. 
[5] Wikipedia, "Binomial distribution", + http://en.wikipedia.org/wiki/Binomial_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> n, p = 10, .5 # number of trials, probability of each trial + >>> s = mt.random.binomial(n, p, 1000).execute() + # result of flipping a coin 10 times, tested 1000 times. + + A real world example. A company drills 9 wild-cat oil exploration + wells, each with an estimated probability of success of 0.1. All nine + wells fail. What is the probability of that happening? + + Let's do 20,000 trials of the model, and count the number that + generate zero positive results. + + >>> (mt.sum(mt.random.binomial(9, 0.1, 20000) == 0)/20000.).execute() + # answer = 0.38885, or 38%. + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .binomial(handle_array(n), handle_array(p), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorBinomial(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(n, p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/bytes.py b/python/xorbits/_mars/tensor/random/bytes.py new file mode 100644 index 000000000..707f922bb --- /dev/null +++ b/python/xorbits/_mars/tensor/random/bytes.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def bytes(random_state, length): + """ + Return random bytes. + + Parameters + ---------- + length : int + Number of random bytes. + + Returns + ------- + out : str + String of length `length`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.bytes(10) + ' eh\x85\x022SZ\xbf\xa4' #random + """ + return random_state._random_state.bytes(length) diff --git a/python/xorbits/_mars/tensor/random/chisquare.py b/python/xorbits/_mars/tensor/random/chisquare.py new file mode 100644 index 000000000..2d1736b5b --- /dev/null +++ b/python/xorbits/_mars/tensor/random/chisquare.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorChisquareDist(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["df"] + _op_type_ = OperandDef.RAND_CHISQUARE + + _fields_ = "df", "size" + df = AnyField("df") + _func_name = "chisquare" + + def __call__(self, df, chunk_size=None): + return self.new_tensor([df], self.size, raw_chunk_size=chunk_size) + + +def chisquare(random_state, df, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a chi-square distribution. + + When `df` independent random variables, each with standard normal + distributions (mean 0, variance 1), are squared and summed, the + resulting distribution is chi-square (see Notes). This distribution + is often used in hypothesis testing. + + Parameters + ---------- + df : float or array_like of floats + Number of degrees of freedom, should be > 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``df`` is a scalar. Otherwise, + ``mt.array(df).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized chi-square distribution. + + Raises + ------ + ValueError + When `df` <= 0 or when an inappropriate `size` (e.g. ``size=-1``) + is given. + + Notes + ----- + The variable obtained by summing the squares of `df` independent, + standard normally distributed random variables: + + .. math:: Q = \sum_{i=0}^{\mathtt{df}} X^2_i + + is chi-square distributed, denoted + + .. math:: Q \sim \chi^2_k. + + The probability density function of the chi-squared distribution is + + .. math:: p(x) = \frac{(1/2)^{k/2}}{\Gamma(k/2)} + x^{k/2 - 1} e^{-x/2}, + + where :math:`\Gamma` is the gamma function, + + .. math:: \Gamma(x) = \int_0^{-\infty} t^{x - 1} e^{-t} dt. + + References + ---------- + .. [1] NIST "Engineering Statistics Handbook" + http://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.chisquare(2,4).execute() + array([ 1.89920014, 9.00867716, 3.13710533, 5.62318272]) + """ + if dtype is None: + dtype = np.random.RandomState().chisquare(handle_array(df), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorChisquareDist(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(df, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/choice.py b/python/xorbits/_mars/tensor/random/choice.py new file mode 100644 index 000000000..7d009140a --- /dev/null +++ b/python/xorbits/_mars/tensor/random/choice.py @@ -0,0 +1,381 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...config import options +from ...core import recursive_tile +from ...serialization.serializables import ( + AnyField, + BoolField, + FieldTypes, + KeyField, + TupleField, +) +from ...utils import ceildiv, has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import arange, array +from ..operands import TensorOperandMixin +from ..utils import decide_chunk_sizes, gen_random_seeds, normalize_chunk_sizes +from .core import RandomState, TensorRandomOperand + + +class TensorChoice(TensorRandomOperand, TensorOperandMixin): + _op_type_ = OperandDef.RAND_CHOICE + + a = AnyField("a") + size = TupleField("size", FieldTypes.int64) + replace = BoolField("replace") + p = KeyField("p") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.a, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self.a = self._inputs[0] + if isinstance(self.p, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self.p = self._inputs[-1] + + def __call__(self, a, p, chunk_size=None): + inputs = [] + if isinstance(a, TENSOR_TYPE): + inputs.append(a) + if isinstance(p, TENSOR_TYPE): + inputs.append(p) + return self.new_tensor( + inputs, + shape=self.size, + raw_chunk_size=chunk_size, + order=TensorOrder.C_ORDER, + ) + + @classmethod + def _tile_one_chunk(cls, op, a, p): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_op.seed = gen_random_seeds(1, np.random.RandomState(op.seed))[0] + chunk_inputs = [] + if isinstance(a, TENSOR_TYPE): + chunk_op.a = a.chunks[0] + chunk_inputs.append(chunk_op.a) + else: + chunk_op.a = a + if isinstance(p, TENSOR_TYPE): + chunk_op.p = p.chunks[0] + chunk_inputs.append(chunk_op.p) + else: + chunk_op.p = p + chunk = chunk_op.new_chunk( + chunk_inputs, shape=out.shape, index=(0,) * out.ndim, order=out.order + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=tuple((s,) for s in out.shape), + ) + + @classmethod + def _tile_sample_with_replacement(cls, op, a, nsplits): + out_shape = tuple(sum(ns) for ns in nsplits) + out_size = np.prod(out_shape).item() + most_chunk_size = np.prod([max(ns) for ns in nsplits]).item() + + is_a_int = False + if isinstance(a, Integral): + is_a_int = True + a_size = a + else: + a = array(a) + a_size = a.size + + rs = RandomState.from_numpy(np.random.RandomState(op.seed)) + + if is_a_int: + # the indices is just the result + ret = rs.randint(a_size, size=out_shape, chunk_size=nsplits) + else: + # gen indices first, need to be flattened + indices = rs.randint(a_size, size=out_size, chunk_size=most_chunk_size) + # get result via fancy indexing + ret = a[indices] + if len(out_shape) > 1: + # reshape back if out's ndim > 1 + ret = ret.reshape(out_shape) + ret = ret.rechunk(nsplits) + + return [(yield from recursive_tile(ret))] + + @classmethod + def _tile_sample_without_replacement(cls, op, a, nsplits): + from ..base import searchsorted + from ..indexing.getitem import 
TensorIndex + from ..merge.stack import TensorStack + + out = op.outputs[0] + out_shape = tuple(sum(ns) for ns in nsplits) + # to sample count + m = np.prod(out_shape).item() + + if isinstance(a, Integral): + a_size = a + a = arange(a) + else: + a = array(a) + a_size = a.size + a = yield from recursive_tile(a) + + if any(cs < m for cs in a.nsplits[0]): + # make sure all chunk > m + n_chunk = min(max(a.size // (m + 1), 1), a.chunk_shape[0]) + chunk_size = ceildiv(a.size, n_chunk) + chunk_sizes = normalize_chunk_sizes(a.size, chunk_size)[0] + if chunk_sizes[-1] < m and len(chunk_sizes) > 1: + # the last chunk may still less than m + # merge it into previous one + chunk_sizes[-2] += chunk_sizes[-1] + chunk_sizes = chunk_sizes[:-1] + a = yield from recursive_tile(a.rechunk({0: chunk_sizes})) + if len(chunk_sizes) == 1: + return cls._tile_one_chunk(op, a, None) + + # for each chunk in a, do regular sampling + sampled_chunks = [] + sample_seeds = gen_random_seeds(len(a.chunks), np.random.RandomState(op.seed)) + for seed, chunk in zip(sample_seeds, a.chunks): + chunk_op = op.copy().reset_key() + chunk_op._a = chunk + chunk_op.size = (m,) + chunk_op.seed = seed + sampled_chunk = chunk_op.new_chunk( + [chunk], shape=(m,), order=out.order, index=chunk.index + ) + sampled_chunks.append(sampled_chunk) + + if len(sampled_chunks) == 1: + out_chunk = sampled_chunks[0] + else: + stacked_chunk = TensorStack( + axis=0, dtype=sampled_chunks[0].dtype + ).new_chunk( + sampled_chunks, shape=(len(a.chunks), m), order=TensorOrder.C_ORDER + ) + + # gen indices with length m from 0...a.size + state = RandomState.from_numpy(np.random.RandomState(op.seed)) + indices = state.randint(a_size, size=(m,)) + cum_offsets = np.cumsum(a.nsplits[0]) + ind = yield from recursive_tile( + searchsorted(cum_offsets, indices, side="right") + ) + ind_chunk = ind.chunks[0] + + # do fancy index to find result + arange_tensor = yield from recursive_tile(arange(m)) + indexes = [ind_chunk, arange_tensor.chunks[0]] + out_chunk = TensorIndex( + dtype=stacked_chunk.dtype, indexes=indexes + ).new_chunk( + [stacked_chunk] + list(indexes), shape=(m,), order=TensorOrder.C_ORDER + ) + + ret = op.copy().new_tensor( + op.inputs, shape=(m,), order=out.order, nsplits=((m,),), chunks=[out_chunk] + ) + if len(out_shape) > 0: + ret = yield from recursive_tile(ret.reshape(out_shape)) + ret = yield from recursive_tile(ret.rechunk(nsplits)) + return [ret] + + @classmethod + def tile(cls, op): + if has_unknown_shape(*op.inputs): + yield + + out = op.outputs[0] + chunk_size = out.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes(out.shape, chunk_size, out.dtype.itemsize) + inputs = op.inputs + + a, p = op.a, op.p + if p is not None: + # we cannot handle p in a parallel fashion + inputs = [] + if isinstance(a, TENSOR_TYPE): + a = yield from recursive_tile(a.rechunk(a.shape)) + inputs.append(a) + p = yield from recursive_tile(p.rechunk(p.shape)) + inputs.append(p) + + # ignore nsplits if p is specified + nsplits = ((s,) for s in out.shape) + + # all inputs and outputs has 1 chunk + if all(len(inp.chunks) == 1 for inp in inputs) and all( + len(ns) == 1 for ns in nsplits + ): + return cls._tile_one_chunk(op, a, p) + + if op.replace: + return (yield from cls._tile_sample_with_replacement(op, a, nsplits)) + else: + return (yield from cls._tile_sample_without_replacement(op, a, nsplits)) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, 
ret_extra=True + ) + if isinstance(op.a, TENSOR_CHUNK_TYPE): + a = inputs[0] + else: + a = op.a + if isinstance(op.p, TENSOR_CHUNK_TYPE): + p = inputs[-1] + else: + p = op.p + + with device(device_id): + rs = xp.random.RandomState(op.seed) + ctx[op.outputs[0].key] = rs.choice(a, size=op.size, replace=op.replace, p=p) + + +def choice(random_state, a, size=None, replace=True, p=None, chunk_size=None, gpu=None): + """ + Generates a random sample from a given 1-D array + + Parameters + ----------- + a : 1-D array-like or int + If a tensor, a random sample is generated from its elements. + If an int, the random sample is generated as if a were mt.arange(a) + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + replace : boolean, optional + Whether the sample is with or without replacement + p : 1-D array-like, optional + The probabilities associated with each entry in a. + If not given the sample assumes a uniform distribution over all + entries in a. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + + Returns + -------- + samples : single item or tensor + The generated random samples + + Raises + ------- + ValueError + If a is an int and less than zero, if a or p are not 1-dimensional, + if a is an array-like of size 0, if p is not a vector of + probabilities, if a and p have different lengths, or if + replace=False and the sample size is greater than the population + size + + See Also + --------- + randint, shuffle, permutation + + Examples + --------- + Generate a uniform random sample from mt.arange(5) of size 3: + + >>> import mars.tensor as mt + + >>> mt.random.choice(5, 3).execute() + array([0, 3, 4]) + >>> #This is equivalent to mt.random.randint(0,5,3) + + Generate a non-uniform random sample from np.arange(5) of size 3: + + >>> mt.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0]).execute() + array([3, 3, 0]) + + Generate a uniform random sample from mt.arange(5) of size 3 without + replacement: + + >>> mt.random.choice(5, 3, replace=False).execute() + array([3,1,0]) + >>> #This is equivalent to np.random.permutation(np.arange(5))[:3] + + Generate a non-uniform random sample from mt.arange(5) of size + 3 without replacement: + + >>> mt.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0]).execute() + array([2, 3, 0]) + + Any of the above can be repeated with an arbitrary array-like + instead of just integers. 
For instance: + + >>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher'] + >>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3]) + array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'], + dtype='|S11') + """ + + if isinstance(a, Integral): + if a <= 0: + raise ValueError("a must be greater than 0") + a_size = a + dtype = np.random.choice( + 1, size=(), p=np.array([1]) if p is not None else p + ).dtype + else: + a = array(a) + if a.ndim != 1: + raise ValueError("a must be one dimensional") + a_size = a.size + dtype = a.dtype + + if p is not None: + if not isinstance(p, TENSOR_TYPE): + p = np.asarray(p) + if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0): + raise ValueError("probabilities do not sum to 1") + p = array(p, chunk_size=p.size) + if p.ndim != 1: + raise ValueError("p must be one dimensional") + + if size is None: + size = () + length = 1 + else: + try: + tuple(size) + length = np.prod(size) + except TypeError: + length = size + if replace is False and length > a_size: + raise ValueError( + "Cannot take a larger sample than population when 'replace=False'" + ) + + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorChoice( + a=a, p=p, seed=seed, replace=replace, size=size, dtype=dtype, gpu=gpu + ) + return op(a, p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/core.py b/python/xorbits/_mars/tensor/random/core.py new file mode 100644 index 000000000..3623d6773 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/core.py @@ -0,0 +1,416 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import itertools +from collections.abc import Iterable +from contextlib import contextmanager + +import numpy as np + +from ...config import options +from ...core import recursive_tile +from ...serialization.serializables import FieldTypes, Int32Field, TupleField +from ..array_utils import array_module, device +from ..base import broadcast_to +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE +from ..datasource import tensor as astensor +from ..operands import TensorMapReduceOperand, TensorOperand, TensorOperandMixin +from ..utils import broadcast_shape, decide_chunk_sizes, gen_random_seeds + + +class RandomState: + def __init__(self, seed=None): + self._random_state = np.random.RandomState(seed=seed) + + def seed(self, seed=None): + """ + Seed the generator. + + This method is called when `RandomState` is initialized. It can be + called again to re-seed the generator. For details, see `RandomState`. + + Parameters + ---------- + seed : int or 1-d array_like, optional + Seed for `RandomState`. + Must be convertible to 32 bit unsigned integers. 
+ + See Also + -------- + RandomState + """ + self._random_state.seed(seed=seed) + + def to_numpy(self): + return self._random_state + + @classmethod + def from_numpy(cls, np_random_state): + state = RandomState() + state._random_state = np_random_state + return state + + @classmethod + def _handle_size(cls, size): + if size is None: + return size + try: + return tuple(int(s) for s in size) + except TypeError: + return (size,) + + +_random_state = RandomState() + + +def handle_array(arg): + if not isinstance(arg, TENSOR_TYPE): + if not isinstance(arg, Iterable): + return arg + + arg = np.asarray(arg) + return arg[(0,) * max(1, arg.ndim)] + elif hasattr(arg, "op") and hasattr(arg.op, "data"): + return arg.op.data[(0,) * max(1, arg.ndim)] + + return np.empty((0,), dtype=arg.dtype) + + +class TensorRandomOperandMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + fields = getattr(op, "_input_fields_", []) + to_one_chunk_fields = set(getattr(op, "_into_one_chunk_fields_", list())) + + new_inputs = [] + changed = False + for field in fields: + t = getattr(op, field) + if not isinstance(t, TENSOR_TYPE): + continue + + if field not in to_one_chunk_fields: + t_nsplits = nsplits + else: + t_nsplits = t.shape # into 1 chunk + rechunked = t.rechunk(t_nsplits) + if rechunked is not t: + yield from recursive_tile(rechunked) + changed = True + new_inputs.append(rechunked) + else: + new_inputs.append(t) + if changed: + op.inputs = new_inputs + + idxes = list(itertools.product(*[range(len(s)) for s in nsplits])) + seeds = gen_random_seeds(len(idxes), np.random.RandomState(op.seed)) + + out_chunks = [] + for seed, idx, shape in zip(seeds, idxes, itertools.product(*nsplits)): + inputs = [] + for inp in op.inputs: + if len(inp.chunks) == 1: + inputs.append(inp.chunks[0]) + else: + inputs.append(inp.cix[idx]) + try: + s = len(tuple(op.size)) + size = shape[:s] + except TypeError: + if op.size is None: + size = None + else: + size = shape[:1] + except AttributeError: + size = shape + + chunk_op = op.copy().reset_key() + chunk_op.seed = int(seed) + chunk_op.size = size + out_chunk = chunk_op.new_chunk( + inputs, shape=shape, index=idx, order=tensor.order + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=nsplits, + **tensor.extra_params + ) + + @classmethod + def execute(cls, ctx, op): + xp = array_module(op.gpu) + if xp is np: + device_id = -1 + else: + device_id = op.device or 0 + get_val = lambda x: ctx[x.key] if isinstance(x, TENSOR_CHUNK_TYPE) else x + + with device(device_id): + rs = xp.random.RandomState(op.seed) + + method_name = getattr(cls, "_func_name") + try: + if method_name in ("rand", "randn"): + try: + res = getattr(rs, method_name)(*op.size, dtype=op.dtype) + except TypeError: + res = getattr(rs, method_name)(*op.size) + elif method_name == "randint": + try: + res = rs.randint( + get_val(op.low), + get_val(op.high), + size=op.size, + dtype=op.dtype, + ) + except TypeError: + res = rs.randint( + get_val(op.low), get_val(op.high), size=op.size + ) + else: + try: + res = getattr(rs, method_name)( + *(get_val(getattr(op, arg)) for arg in op.args), + dtype=op.dtype + ) + except TypeError: + res = getattr(rs, method_name)( + *(get_val(getattr(op, arg)) for arg in op.args) + ) + if 
hasattr(res, "dtype") and res.dtype != op.dtype: + res = res.astype(op.dtype, copy=False) + if xp is not np: + ctx[op.outputs[0].key] = xp.asarray(res) + else: + ctx[op.outputs[0].key] = res + except AttributeError: + if xp is not np: + # cupy cannot generate data, fallback to numpy + rs = np.random.RandomState(op.seed) + if method_name in ("rand", "randn"): + res = getattr(rs, method_name)(*op.size) + else: + res = getattr(rs, method_name)( + *(get_val(getattr(op, arg)) for arg in op.args) + ) + if res.dtype != op.dtype: + res = res.astype(op.dtype, copy=False) + ctx[op.outputs[0].key] = xp.asarray(res) + else: + raise + + def _calc_shape(self, shapes): + shapes = list(shapes) + if getattr(self, "size", None) is not None: + shapes.append(getattr(self, "size")) + return broadcast_shape(*shapes) + + @classmethod + def _handle_arg(cls, arg, chunk_size): + if isinstance(arg, (list, np.ndarray)): + arg = astensor(arg, chunk_size=chunk_size) + + return arg + + @contextmanager + def _get_inputs_shape_by_given_fields( + self, inputs, shape, raw_chunk_size=None, tensor=True + ): + fields = getattr(self, "_input_fields_", []) + to_one_chunk_fields = set(getattr(self, "_into_one_chunk_fields_", list())) + + field_to_obj = dict() + to_broadcast_shapes = [] + if fields: + if getattr(self, fields[0], None) is None: + # create from beginning + for field, val in zip(fields, inputs): + if field not in to_one_chunk_fields: + if isinstance(val, list): + val = np.asarray(val) + if tensor: + val = self._handle_arg(val, raw_chunk_size) + if isinstance(val, TENSOR_TYPE + TENSOR_CHUNK_TYPE): + field_to_obj[field] = val + if field not in to_one_chunk_fields: + to_broadcast_shapes.append(val.shape) + setattr(self, field, val) + else: + inputs_iter = iter(inputs) + for field in fields: + if isinstance( + getattr(self, field), TENSOR_TYPE + TENSOR_CHUNK_TYPE + ): + field_to_obj[field] = next(inputs_iter) + + if tensor: + if shape is None: + shape = self._calc_shape(to_broadcast_shapes) + + for field, inp in field_to_obj.items(): + if field not in to_one_chunk_fields: + field_to_obj[field] = broadcast_to(inp, shape) + + yield [field_to_obj[f] for f in fields if f in field_to_obj], shape + + inputs_iter = iter(getattr(self, "_inputs")) + for field in fields: + if field in field_to_obj: + setattr(self, field, next(inputs_iter)) + + @classmethod + def _get_shape(cls, kws, kw): + if kw.get("shape") is not None: + return kw.get("shape") + elif kws is not None and len(kws) > 0: + return kws[0].get("shape") + + def _new_tileables(self, inputs, kws=None, **kw): + raw_chunk_size = kw.get("chunk_size", None) + shape = self._get_shape(kws, kw) + with self._get_inputs_shape_by_given_fields( + inputs, shape, raw_chunk_size, True + ) as (inputs, shape): + kw["shape"] = shape + return super()._new_tileables(inputs, kws=kws, **kw) + + def _new_chunks(self, inputs, kws=None, **kw): + shape = self._get_shape(kws, kw) + with self._get_inputs_shape_by_given_fields(inputs, shape, None, False) as ( + inputs, + shape, + ): + kw["shape"] = shape + return super()._new_chunks(inputs, kws=kws, **kw) + + +def _on_serialize_random_state(rs): + return rs.get_state() if rs is not None else None + + +def _on_deserialize_random_state(tup): + if tup is None: + return None + + rs = np.random.RandomState() + rs.set_state(tup) + return rs + + +def RandomStateField(name, **kwargs): + kwargs.update( + dict( + on_serialize=_on_serialize_random_state, + on_deserialize=_on_deserialize_random_state, + ) + ) + return TupleField(name, **kwargs) + + +class 
TensorSeedOperandMixin(object): + @property + def seed(self): + return getattr(self, "seed", None) + + @property + def args(self): + if hasattr(self, "_fields_"): + return self._fields_ + else: + return [ + field + for field in self._FIELDS + if field not in TensorRandomOperand._FIELDS + ] + + +class TensorRandomOperand(TensorSeedOperandMixin, TensorOperand): + seed = Int32Field("seed") + + def __init__(self, dtype=None, **kw): + dtype = np.dtype(dtype) if dtype is not None else dtype + if "state" in kw: + kw["_state"] = kw.pop("state") + super().__init__(dtype=dtype, **kw) + + +class TensorRandomMapReduceOperand(TensorSeedOperandMixin, TensorMapReduceOperand): + seed = Int32Field("seed") + + def __init__(self, dtype=None, **kw): + dtype = np.dtype(dtype) if dtype is not None else dtype + if "state" in kw: + kw["_state"] = kw.pop("state") + super().__init__(dtype=dtype, **kw) + + +class TensorDistribution(TensorRandomOperand): + size = TupleField("size", FieldTypes.int64) + + @classmethod + def execute(cls, ctx, op): + xp = array_module(op.gpu) + if xp is np: + device_id = -1 + else: + device_id = op.device or 0 + + with device(device_id): + rs = xp.random.RandomState(op.seed) + + args = [] + for k in op.args: + val = getattr(op, k, None) + if isinstance(val, TENSOR_CHUNK_TYPE): + args.append(ctx[val.key]) + else: + args.append(val) + + method_name = getattr(cls, "_func_name") + try: + res = getattr(rs, method_name)(*args) + if xp is not np: + ctx[op.outputs[0].key] = xp.asarray(res) + else: + ctx[op.outputs[0].key] = res + except AttributeError: + if xp is not np: + # cupy cannot generate, fall back to numpy + rs = np.random.RandomState(op.seed) + res = getattr(rs, method_name)(*args) + ctx[op.outputs[0].key] = xp.asarray(res) + else: + raise + + +class TensorSimpleRandomData(TensorRandomOperand): + size = TupleField("size", FieldTypes.int64) + + def __init__(self, size=None, **kw): + if type(size) is int: + size = (size,) + super().__init__(size=size, **kw) diff --git a/python/xorbits/_mars/tensor/random/dirichlet.py b/python/xorbits/_mars/tensor/random/dirichlet.py new file mode 100644 index 000000000..60f532f97 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/dirichlet.py @@ -0,0 +1,152 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections.abc import Iterable + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...serialization.serializables import TupleField +from ..utils import decide_chunk_sizes, gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorDirichlet(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_DIRICHLET + + _fields_ = "alpha", "size" + alpha = TupleField("alpha", default=None) + _func_name = "dirichlet" + + def _calc_shape(self, shapes): + shape = super()._calc_shape(shapes) + return shape + (len(self.alpha),) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes( + tensor.shape[:-1], chunk_size, tensor.dtype.itemsize + ) + nsplits += ((len(op.alpha),),) + + idxes = list(itertools.product(*[range(len(s)) for s in nsplits])) + seeds = gen_random_seeds(len(idxes), np.random.RandomState(op.seed)) + + out_chunks = [] + for seed, idx, shape in zip(seeds, idxes, itertools.product(*nsplits)): + inputs = [inp.cix[idx] for inp in op.inputs] + size = shape[:-1] + + chunk_op = op.copy().reset_key() + chunk_op._state = None + chunk_op.seed = seed + chunk_op.size = size + out_chunk = chunk_op.new_chunk(inputs, shape=shape, index=idx) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + +def dirichlet(random_state, alpha, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from the Dirichlet distribution. + + Draw `size` samples of dimension k from a Dirichlet distribution. A + Dirichlet-distributed random variable can be seen as a multivariate + generalization of a Beta distribution. Dirichlet pdf is the conjugate + prior of a multinomial in Bayesian inference. + + Parameters + ---------- + alpha : array + Parameter of the distribution (k dimension for sample of + dimension k). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + samples : Tensor + The drawn samples, of shape (size, alpha.ndim). + + Raises + ------- + ValueError + If any value in alpha is less than or equal to zero + + Notes + ----- + .. math:: X \approx \prod_{i=1}^{k}{x^{\alpha_i-1}_i} + + Uses the following property for computation: for each dimension, + draw a random sample y_i from a standard gamma generator of shape + `alpha_i`, then + :math:`X = \frac{1}{\sum_{i=1}^k{y_i}} (y_1, \ldots, y_n)` is + Dirichlet distributed. + + References + ---------- + .. [1] David McKay, "Information Theory, Inference and Learning + Algorithms," chapter 23, + http://www.inference.phy.cam.ac.uk/mackay/ + .. 
[2] Wikipedia, "Dirichlet distribution", + http://en.wikipedia.org/wiki/Dirichlet_distribution + + Examples + -------- + Taking an example cited in Wikipedia, this distribution can be used if + one wanted to cut strings (each of initial length 1.0) into K pieces + with different lengths, where each piece had, on average, a designated + average length, but allowing some variation in the relative sizes of + the pieces. + + >>> import mars.tensor as mt + + >>> s = mt.random.dirichlet((10, 5, 3), 20).transpose() + + >>> import matplotlib.pyplot as plt + + >>> plt.barh(range(20), s[0].execute()) + >>> plt.barh(range(20), s[1].execute(), left=s[0].execute(), color='g') + >>> plt.barh(range(20), s[2].execute(), left=(s[0]+s[1]).execute(), color='r') + >>> plt.title("Lengths of Strings") + """ + if isinstance(alpha, Iterable): + alpha = tuple(alpha) + else: + raise TypeError("`alpha` should be an array") + if dtype is None: + dtype = np.random.RandomState().dirichlet(alpha, size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorDirichlet(seed=seed, alpha=alpha, size=size, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/exponential.py b/python/xorbits/_mars/tensor/random/exponential.py new file mode 100644 index 000000000..97de3eed4 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/exponential.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorExponential(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["scale"] + _op_type_ = OperandDef.RAND_EXPONENTIAL + + _fields_ = "scale", "size" + scale = AnyField("scale") + _func_name = "exponential" + + def __call__(self, scale, chunk_size=None): + return self.new_tensor([scale], self.size, raw_chunk_size=chunk_size) + + +def exponential( + random_state, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from an exponential distribution. + + Its probability density function is + + .. math:: f(x; \frac{1}{\beta}) = \frac{1}{\beta} \exp(-\frac{x}{\beta}), + + for ``x > 0`` and 0 elsewhere. :math:`\beta` is the scale parameter, + which is the inverse of the rate parameter :math:`\lambda = 1/\beta`. + The rate parameter is an alternative, widely used parameterization + of the exponential distribution [3]_. + + The exponential distribution is a continuous analogue of the + geometric distribution. It describes many common situations, such as + the size of raindrops measured over many rainstorms [1]_, or the time + between page requests to Wikipedia [2]_. + + Parameters + ---------- + scale : float or array_like of floats + The scale parameter, :math:`\beta = 1/\lambda`. 
+ size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``scale`` is a scalar. Otherwise, + ``np.array(scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized exponential distribution. + + References + ---------- + .. [1] Peyton Z. Peebles Jr., "Probability, Random Variables and + Random Signal Principles", 4th ed, 2001, p. 57. + .. [2] Wikipedia, "Poisson process", + http://en.wikipedia.org/wiki/Poisson_process + .. [3] Wikipedia, "Exponential distribution", + http://en.wikipedia.org/wiki/Exponential_distribution + """ + if dtype is None: + dtype = ( + np.random.RandomState().exponential(handle_array(scale), size=(0,)).dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorExponential(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/f.py b/python/xorbits/_mars/tensor/random/f.py new file mode 100644 index 000000000..b644a7af7 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/f.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorF(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["dfnum", "dfden"] + _op_type_ = OperandDef.RAND_F + + _fields_ = "dfnum", "dfden", "size" + dfnum = AnyField("dfnum") + dfden = AnyField("dfden") + _func_name = "f" + + def __call__(self, dfnum, dfden, chunk_size=None): + return self.new_tensor([dfnum, dfden], None, raw_chunk_size=chunk_size) + + +def f(random_state, dfnum, dfden, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Draw samples from an F distribution. + + Samples are drawn from an F distribution with specified parameters, + `dfnum` (degrees of freedom in numerator) and `dfden` (degrees of + freedom in denominator), where both parameters should be greater than + zero. + + The random variate of the F distribution (also known as the + Fisher distribution) is a continuous probability distribution + that arises in ANOVA tests, and is the ratio of two chi-square + variates. + + Parameters + ---------- + dfnum : float or array_like of floats + Degrees of freedom in numerator, should be > 0. + dfden : float or array_like of float + Degrees of freedom in denominator, should be > 0. + size : int or tuple of ints, optional + Output shape. 
If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``dfnum`` and ``dfden`` are both scalars. + Otherwise, ``np.broadcast(dfnum, dfden).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Fisher distribution. + + See Also + -------- + scipy.stats.f : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The F statistic is used to compare in-group variances to between-group + variances. Calculating the distribution depends on the sampling, and + so it is a function of the respective degrees of freedom in the + problem. The variable `dfnum` is the number of samples minus one, the + between-groups degrees of freedom, while `dfden` is the within-groups + degrees of freedom, the sum of the number of samples in each group + minus the number of groups. + + References + ---------- + .. [1] Glantz, Stanton A. "Primer of Biostatistics.", McGraw-Hill, + Fifth Edition, 2002. + .. [2] Wikipedia, "F-distribution", + http://en.wikipedia.org/wiki/F-distribution + + Examples + -------- + An example from Glantz[1], pp 47-40: + + Two groups, children of diabetics (25 people) and children from people + without diabetes (25 controls). Fasting blood glucose was measured, + case group had a mean value of 86.1, controls had a mean value of + 82.2. Standard deviations were 2.09 and 2.49 respectively. Are these + data consistent with the null hypothesis that the parents diabetic + status does not affect their children's blood glucose levels? + Calculating the F statistic from the data gives a value of 36.01. + + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> dfnum = 1. # between group degrees of freedom + >>> dfden = 48. # within groups degrees of freedom + >>> s = mt.random.f(dfnum, dfden, 1000).execute() + + The lower bound for the top 1% of the samples is : + + >>> sorted(s)[-10] + 7.61988120985 + + So there is about a 1% chance that the F statistic will exceed 7.62, + the measured value is 36, so the null hypothesis is rejected at the 1% + level. + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .f(handle_array(dfnum), handle_array(dfden), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorF(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(dfnum, dfden, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/gamma.py b/python/xorbits/_mars/tensor/random/gamma.py new file mode 100644 index 000000000..490e82f74 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/gamma.py @@ -0,0 +1,126 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRandGamma(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["shape", "scale"] + _op_type_ = OperandDef.RAND_GAMMA + + _fields_ = "shape", "scale", "size" + shape = AnyField("shape") + scale = AnyField("scale") + _func_name = "gamma" + + def __call__(self, shape, scale, chunk_size=None): + return self.new_tensor([shape, scale], None, raw_chunk_size=chunk_size) + + +def gamma( + random_state, shape, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a Gamma distribution. + + Samples are drawn from a Gamma distribution with specified parameters, + `shape` (sometimes designated "k") and `scale` (sometimes designated + "theta"), where both parameters are > 0. + + Parameters + ---------- + shape : float or array_like of floats + The shape of the gamma distribution. Should be greater than zero. + scale : float or array_like of floats, optional + The scale of the gamma distribution. Should be greater than zero. + Default is equal to 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``shape`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(shape, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized gamma distribution. + + See Also + -------- + scipy.stats.gamma : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Gamma distribution is + + .. math:: p(x) = x^{k-1}\frac{e^{-x/\theta}}{\theta^k\Gamma(k)}, + + where :math:`k` is the shape and :math:`\theta` the scale, + and :math:`\Gamma` is the Gamma function. + + The Gamma distribution is often used to model the times to failure of + electronic components, and arises naturally in processes for which the + waiting times between Poisson distributed events are relevant. + + References + ---------- + .. [1] Weisstein, Eric W. "Gamma Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/GammaDistribution.html + .. [2] Wikipedia, "Gamma distribution", + http://en.wikipedia.org/wiki/Gamma_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> shape, scale = 2., 2. # mean=4, std=2*sqrt(2) + >>> s = mt.random.gamma(shape, scale, 1000).execute() + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> import scipy.special as sps + >>> import numpy as np + >>> count, bins, ignored = plt.hist(s, 50, normed=True) + >>> y = bins**(shape-1)*(np.exp(-bins/scale) / + ... 
(sps.gamma(shape)*scale**shape)) + >>> plt.plot(bins, y, linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .gamma(handle_array(shape), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandGamma(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(shape, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/geometric.py b/python/xorbits/_mars/tensor/random/geometric.py new file mode 100644 index 000000000..3afcae356 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/geometric.py @@ -0,0 +1,91 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorGeometric(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["p"] + _op_type_ = OperandDef.RAND_GEOMETRIC + + _fields_ = "p", "size" + p = AnyField("p") + _func_name = "geometric" + + def __call__(self, p, chunk_size=None): + return self.new_tensor([p], None, raw_chunk_size=chunk_size) + + +def geometric(random_state, p, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Draw samples from the geometric distribution. + + Bernoulli trials are experiments with one of two outcomes: + success or failure (an example of such an experiment is flipping + a coin). The geometric distribution models the number of trials + that must be run in order to achieve success. It is therefore + supported on the positive integers, ``k = 1, 2, ...``. + + The probability mass function of the geometric distribution is + + .. math:: f(k) = (1 - p)^{k - 1} p + + where `p` is the probability of success of an individual trial. + + Parameters + ---------- + p : float or array_like of floats + The probability of success of an individual trial. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``p`` is a scalar. Otherwise, + ``mt.array(p).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized geometric distribution. + + Examples + -------- + Draw ten thousand values from the geometric distribution, + with the probability of an individual success equal to 0.35: + + >>> import mars.tensor as mt + + >>> z = mt.random.geometric(p=0.35, size=10000) + + How many trials succeeded after a single run? 
+ + >>> ((z == 1).sum() / 10000.).execute() + 0.34889999999999999 #random + """ + if dtype is None: + dtype = np.random.RandomState().geometric(handle_array(p), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorGeometric(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/gumbel.py b/python/xorbits/_mars/tensor/random/gumbel.py new file mode 100644 index 000000000..d5b538117 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/gumbel.py @@ -0,0 +1,165 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorGumbel(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_GUMBEL + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "gumbel" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def gumbel( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a Gumbel distribution. + + Draw samples from a Gumbel distribution with specified location and + scale. For more information on the Gumbel distribution, see + Notes and References below. + + Parameters + ---------- + loc : float or array_like of floats, optional + The location of the mode of the distribution. Default is 0. + scale : float or array_like of floats, optional + The scale parameter of the distribution. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Gumbel distribution. + + See Also + -------- + scipy.stats.gumbel_l + scipy.stats.gumbel_r + scipy.stats.genextreme + weibull + + Notes + ----- + The Gumbel (or Smallest Extreme Value (SEV) or the Smallest Extreme + Value Type I) distribution is one of a class of Generalized Extreme + Value (GEV) distributions used in modeling extreme value problems. + The Gumbel is a special case of the Extreme Value Type I distribution + for maximums from distributions with "exponential-like" tails. 
+ + The probability density for the Gumbel distribution is + + .. math:: p(x) = \frac{e^{-(x - \mu)/ \beta}}{\beta} e^{ -e^{-(x - \mu)/ + \beta}}, + + where :math:`\mu` is the mode, a location parameter, and + :math:`\beta` is the scale parameter. + + The Gumbel (named for German mathematician Emil Julius Gumbel) was used + very early in the hydrology literature, for modeling the occurrence of + flood events. It is also used for modeling maximum wind speed and + rainfall rates. It is a "fat-tailed" distribution - the probability of + an event in the tail of the distribution is larger than if one used a + Gaussian, hence the surprisingly frequent occurrence of 100-year + floods. Floods were initially modeled as a Gaussian process, which + underestimated the frequency of extreme events. + + It is one of a class of extreme value distributions, the Generalized + Extreme Value (GEV) distributions, which also includes the Weibull and + Frechet. + + The function has a mean of :math:`\mu + 0.57721\beta` and a variance + of :math:`\frac{\pi^2}{6}\beta^2`. + + References + ---------- + .. [1] Gumbel, E. J., "Statistics of Extremes," + New York: Columbia University Press, 1958. + .. [2] Reiss, R.-D. and Thomas, M., "Statistical Analysis of Extreme + Values from Insurance, Finance, Hydrology and Other Fields," + Basel: Birkhauser Verlag, 2001. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, beta = 0, 0.1 # location and scale + >>> s = mt.random.gumbel(mu, beta, 1000).execute() + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> count, bins, ignored = plt.hist(s, 30, normed=True) + >>> plt.plot(bins, (1/beta)*np.exp(-(bins - mu)/beta) + ... * np.exp( -np.exp( -(bins - mu) /beta) ), + ... linewidth=2, color='r') + >>> plt.show() + + Show how an extreme value distribution can arise from a Gaussian process + and compare to a Gaussian: + + >>> means = [] + >>> maxima = [] + >>> for i in range(0,1000) : + ... a = mt.random.normal(mu, beta, 1000) + ... means.append(a.mean().execute()) + ... maxima.append(a.max().execute()) + >>> count, bins, ignored = plt.hist(maxima, 30, normed=True) + >>> beta = mt.std(maxima) * mt.sqrt(6) / mt.pi + >>> mu = mt.mean(maxima) - 0.57721*beta + >>> plt.plot(bins, ((1/beta)*mt.exp(-(bins - mu)/beta) + ... * mt.exp(-mt.exp(-(bins - mu)/beta))).execute(), + ... linewidth=2, color='r') + >>> plt.plot(bins, (1/(beta * mt.sqrt(2 * mt.pi)) + ... * mt.exp(-(bins - mu)**2 / (2 * beta**2))).execute(), + ... linewidth=2, color='g') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .gumbel(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorGumbel(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/hypergeometric.py b/python/xorbits/_mars/tensor/random/hypergeometric.py new file mode 100644 index 000000000..378ddffb2 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/hypergeometric.py @@ -0,0 +1,146 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorHypergeometric(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["ngood", "nbad", "nsample"] + _op_type_ = OperandDef.RAND_HYPERGEOMETRIC + + _fields_ = "ngood", "nbad", "nsample", "size" + ngood = AnyField("ngood") + nbad = AnyField("nbad") + nsample = AnyField("nsample") + _func_name = "hypergeometric" + + def __call__(self, ngood, nbad, nsample, chunk_size=None): + return self.new_tensor([ngood, nbad, nsample], None, raw_chunk_size=chunk_size) + + +def hypergeometric( + random_state, ngood, nbad, nsample, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a Hypergeometric distribution. + + Samples are drawn from a hypergeometric distribution with specified + parameters, ngood (ways to make a good selection), nbad (ways to make + a bad selection), and nsample = number of items sampled, which is less + than or equal to the sum ngood + nbad. + + Parameters + ---------- + ngood : int or array_like of ints + Number of ways to make a good selection. Must be nonnegative. + nbad : int or array_like of ints + Number of ways to make a bad selection. Must be nonnegative. + nsample : int or array_like of ints + Number of items sampled. Must be at least 1 and at most + ``ngood + nbad``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``ngood``, ``nbad``, and ``nsample`` + are all scalars. Otherwise, ``np.broadcast(ngood, nbad, nsample).size`` + samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized hypergeometric distribution. + + See Also + -------- + scipy.stats.hypergeom : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Hypergeometric distribution is + + .. math:: P(x) = \frac{\binom{m}{n}\binom{N-m}{n-x}}{\binom{N}{n}}, + + where :math:`0 \le x \le m` and :math:`n+m-N \le x \le n` + + for P(x) the probability of x successes, n = ngood, m = nbad, and + N = number of samples. + + Consider an urn with black and white marbles in it, ngood of them + black and nbad are white. If you draw nsample balls without + replacement, then the hypergeometric distribution describes the + distribution of black balls in the drawn sample. + + Note that this distribution is very similar to the binomial + distribution, except that in this case, samples are drawn without + replacement, whereas in the Binomial case samples are drawn with + replacement (or the sample space is infinite). 
As the sample space + becomes large, this distribution approaches the binomial. + + References + ---------- + .. [1] Lentner, Marvin, "Elementary Applied Statistics", Bogden + and Quigley, 1972. + .. [2] Weisstein, Eric W. "Hypergeometric Distribution." From + MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/HypergeometricDistribution.html + .. [3] Wikipedia, "Hypergeometric distribution", + http://en.wikipedia.org/wiki/Hypergeometric_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> ngood, nbad, nsamp = 100, 2, 10 + # number of good, number of bad, and number of samples + >>> s = mt.random.hypergeometric(ngood, nbad, nsamp, 1000) + >>> hist(s) + # note that it is very unlikely to grab both bad items + + Suppose you have an urn with 15 white and 15 black marbles. + If you pull 15 marbles at random, how likely is it that + 12 or more of them are one color? + + >>> s = mt.random.hypergeometric(15, 15, 15, 100000) + >>> (mt.sum(s>=12)/100000. + mt.sum(s<=3)/100000.).execute() + # answer = 0.003 ... pretty unlikely! + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .hypergeometric( + handle_array(ngood), + handle_array(nbad), + handle_array(nsample), + size=(0,), + ) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorHypergeometric(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(ngood, nbad, nsample, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/laplace.py b/python/xorbits/_mars/tensor/random/laplace.py new file mode 100644 index 000000000..1290264da --- /dev/null +++ b/python/xorbits/_mars/tensor/random/laplace.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLaplace(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_LAPLACE + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "laplace" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def laplace( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from the Laplace or double exponential distribution with + specified location (or mean) and scale (decay). + + The Laplace distribution is similar to the Gaussian/normal distribution, + but is sharper at the peak and has fatter tails. It represents the + difference between two independent, identically distributed exponential + random variables. 
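+
+    As a quick illustrative check of that property (a sketch only; the sample
+    size and the 0.05 tolerance below are arbitrary), the difference of two
+    independent exponential samples is centred on zero, as a Laplace variable
+    should be:
+
+    >>> import mars.tensor as mt
+
+    >>> d = mt.random.exponential(1., 100000) - mt.random.exponential(1., 100000)
+    >>> bool(abs(d.mean().execute()) < 0.05)  # tolerance chosen loosely
+    True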
+ + Parameters + ---------- + loc : float or array_like of floats, optional + The position, :math:`\mu`, of the distribution peak. Default is 0. + scale : float or array_like of floats, optional + :math:`\lambda`, the exponential decay. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Laplace distribution. + + Notes + ----- + It has the probability density function + + .. math:: f(x; \mu, \lambda) = \frac{1}{2\lambda} + \exp\left(-\frac{|x - \mu|}{\lambda}\right). + + The first law of Laplace, from 1774, states that the frequency + of an error can be expressed as an exponential function of the + absolute magnitude of the error, which leads to the Laplace + distribution. For many problems in economics and health + sciences, this distribution seems to model the data better + than the standard Gaussian distribution. + + References + ---------- + .. [1] Abramowitz, M. and Stegun, I. A. (Eds.). "Handbook of + Mathematical Functions with Formulas, Graphs, and Mathematical + Tables, 9th printing," New York: Dover, 1972. + .. [2] Kotz, Samuel, et. al. "The Laplace Distribution and + Generalizations, " Birkhauser, 2001. + .. [3] Weisstein, Eric W. "Laplace Distribution." + From MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/LaplaceDistribution.html + .. [4] Wikipedia, "Laplace distribution", + http://en.wikipedia.org/wiki/Laplace_distribution + + Examples + -------- + Draw samples from the distribution + + >>> import mars.tensor as mt + + >>> loc, scale = 0., 1. + >>> s = mt.random.laplace(loc, scale, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 30, normed=True) + >>> x = mt.arange(-8., 8., .01) + >>> pdf = mt.exp(-abs(x-loc)/scale)/(2.*scale) + >>> plt.plot(x.execute(), pdf.execute()) + + Plot Gaussian for comparison: + + >>> g = (1/(scale * mt.sqrt(2 * np.pi)) * + ... mt.exp(-(x - loc)**2 / (2 * scale**2))) + >>> plt.plot(x.execute(),g.execute()) + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .laplace(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLaplace(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/logistic.py b/python/xorbits/_mars/tensor/random/logistic.py new file mode 100644 index 000000000..1184fce3f --- /dev/null +++ b/python/xorbits/_mars/tensor/random/logistic.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLogistic(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_LOGISTIC + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "logistic" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def logistic( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a logistic distribution. + + Samples are drawn from a logistic distribution with specified + parameters, loc (location or mean, also median), and scale (>0). + + Parameters + ---------- + loc : float or array_like of floats, optional + Parameter of the distribution. Default is 0. + scale : float or array_like of floats, optional + Parameter of the distribution. Should be greater than zero. + Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized logistic distribution. + + See Also + -------- + scipy.stats.logistic : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Logistic distribution is + + .. math:: P(x) = \frac{e^{-(x-\mu)/s}}{s(1+e^{-(x-\mu)/s})^2}, + + where :math:`\mu` = location and :math:`s` = scale. + + The Logistic distribution is used in Extreme Value problems where it + can act as a mixture of Gumbel distributions, in Epidemiology, and by + the World Chess Federation (FIDE) where it is used in the Elo ranking + system, assuming the performance of each player is a logistically + distributed random variable. + + References + ---------- + .. [1] Reiss, R.-D. and Thomas M. (2001), "Statistical Analysis of + Extreme Values, from Insurance, Finance, Hydrology and Other + Fields," Birkhauser Verlag, Basel, pp 132-133. + .. [2] Weisstein, Eric W. "Logistic Distribution." From + MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/LogisticDistribution.html + ..
[3] Wikipedia, "Logistic-distribution", + http://en.wikipedia.org/wiki/Logistic_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> loc, scale = 10, 1 + >>> s = mt.random.logistic(loc, scale, 10000) + >>> count, bins, ignored = plt.hist(s.execute(), bins=50) + + # plot against distribution + + >>> def logist(x, loc, scale): + ... return mt.exp((loc-x)/scale)/(scale*(1+mt.exp((loc-x)/scale))**2) + >>> plt.plot(bins, logist(bins, loc, scale).execute()*count.max()/\ + ... logist(bins, loc, scale).max().execute()) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .logistic(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLogistic(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/lognormal.py b/python/xorbits/_mars/tensor/random/lognormal.py new file mode 100644 index 000000000..c0788d299 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/lognormal.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLognormal(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["mean", "sigma"] + _op_type_ = OperandDef.RAND_LOGNORMAL + + _fields_ = "mean", "sigma", "size" + mean = AnyField("mean") + sigma = AnyField("sigma") + _func_name = "lognormal" + + def __call__(self, mean, sigma, chunk_size=None): + return self.new_tensor([mean, sigma], None, raw_chunk_size=chunk_size) + + +def lognormal( + random_state, mean=0.0, sigma=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a log-normal distribution. + + Draw samples from a log-normal distribution with specified mean, + standard deviation, and array shape. Note that the mean and standard + deviation are not the values for the distribution itself, but of the + underlying normal distribution it is derived from. + + Parameters + ---------- + mean : float or array_like of floats, optional + Mean value of the underlying normal distribution. Default is 0. + sigma : float or array_like of floats, optional + Standard deviation of the underlying normal distribution. Should + be greater than zero. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``mean`` and ``sigma`` are both scalars. + Otherwise, ``np.broadcast(mean, sigma).size`` samples are drawn. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized log-normal distribution. + + See Also + -------- + scipy.stats.lognorm : probability density function, distribution, + cumulative density function, etc. + + Notes + ----- + A variable `x` has a log-normal distribution if `log(x)` is normally + distributed. The probability density function for the log-normal + distribution is: + + .. math:: p(x) = \frac{1}{\sigma x \sqrt{2\pi}} + e^{(-\frac{(ln(x)-\mu)^2}{2\sigma^2})} + + where :math:`\mu` is the mean and :math:`\sigma` is the standard + deviation of the normally distributed logarithm of the variable. + A log-normal distribution results if a random variable is the *product* + of a large number of independent, identically-distributed variables in + the same way that a normal distribution results if the variable is the + *sum* of a large number of independent, identically-distributed + variables. + + References + ---------- + .. [1] Limpert, E., Stahel, W. A., and Abbt, M., "Log-normal + Distributions across the Sciences: Keys and Clues," + BioScience, Vol. 51, No. 5, May, 2001. + http://stat.ethz.ch/~stahel/lognormal/bioscience.pdf + .. [2] Reiss, R.D. and Thomas, M., "Statistical Analysis of Extreme + Values," Basel: Birkhauser Verlag, 2001, pp. 31-32. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, sigma = 3., 1. # mean and standard deviation + >>> s = mt.random.lognormal(mu, sigma, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 100, normed=True, align='mid') + + >>> x = mt.linspace(min(bins), max(bins), 10000) + >>> pdf = (mt.exp(-(mt.log(x) - mu)**2 / (2 * sigma**2)) + ... / (x * sigma * mt.sqrt(2 * mt.pi))) + + >>> plt.plot(x.execute(), pdf.execute(), linewidth=2, color='r') + >>> plt.axis('tight') + >>> plt.show() + + Demonstrate that taking the products of random samples from a uniform + distribution can be fit well by a log-normal probability density + function. + + >>> # Generate a thousand samples: each is the product of 100 random + >>> # values, drawn from a normal distribution. + >>> b = [] + >>> for i in range(1000): + ... a = 10. + mt.random.random(100) + ... b.append(mt.product(a).execute()) + + >>> b = mt.array(b) / mt.min(b) # scale values to be positive + >>> count, bins, ignored = plt.hist(b.execute(), 100, normed=True, align='mid') + >>> sigma = mt.std(mt.log(b)) + >>> mu = mt.mean(mt.log(b)) + + >>> x = mt.linspace(min(bins), max(bins), 10000) + >>> pdf = (mt.exp(-(mt.log(x) - mu)**2 / (2 * sigma**2)) + ... 
/ (x * sigma * mt.sqrt(2 * mt.pi))) + + >>> plt.plot(x.execute(), pdf.execute(), color='r', linewidth=2) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .lognormal(handle_array(mean), handle_array(sigma), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLognormal(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(mean, sigma, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/logseries.py b/python/xorbits/_mars/tensor/random/logseries.py new file mode 100644 index 000000000..2f7444ab8 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/logseries.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorLogseries(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["p"] + _op_type_ = OperandDef.RAND_LOGSERIES + + _fields_ = "p", "size" + p = AnyField("p") + _func_name = "logseries" + + def __call__(self, p, chunk_size=None): + return self.new_tensor([p], None, raw_chunk_size=chunk_size) + + +def logseries(random_state, p, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a logarithmic series distribution. + + Samples are drawn from a log series distribution with specified + shape parameter, 0 < ``p`` < 1. + + Parameters + ---------- + p : float or array_like of floats + Shape parameter for the distribution. Must be in the range (0, 1). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``p`` is a scalar. Otherwise, + ``np.array(p).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized logarithmic series distribution. + + See Also + -------- + scipy.stats.logser : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Log Series distribution is + + .. math:: P(k) = \frac{-p^k}{k \ln(1-p)}, + + where p = probability. + + The log series distribution is frequently used to represent species + richness and occurrence, first proposed by Fisher, Corbet, and + Williams in 1943 [2]. It may also be used to model the numbers of + occupants seen in cars [3]. + + References + ---------- + .. 
[1] Buzas, Martin A.; Culver, Stephen J., Understanding regional + species diversity through the log series distribution of + occurrences: BIODIVERSITY RESEARCH Diversity & Distributions, + Volume 5, Number 5, September 1999 , pp. 187-195(9). + .. [2] Fisher, R.A,, A.S. Corbet, and C.B. Williams. 1943. The + relation between the number of species and the number of + individuals in a random sample of an animal population. + Journal of Animal Ecology, 12:42-58. + .. [3] D. J. Hand, F. Daly, D. Lunn, E. Ostrowski, A Handbook of Small + Data Sets, CRC Press, 1994. + .. [4] Wikipedia, "Logarithmic distribution", + http://en.wikipedia.org/wiki/Logarithmic_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> a = .6 + >>> s = mt.random.logseries(a, 10000) + >>> count, bins, ignored = plt.hist(s.execute()) + + # plot against distribution + + >>> def logseries(k, p): + ... return -p**k/(k*mt.log(1-p)) + >>> plt.plot(bins, (logseries(bins, a)*count.max()/ + ... logseries(bins, a).max()).execute(), 'r') + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().logseries(handle_array(p), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorLogseries(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(p, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/multinomial.py b/python/xorbits/_mars/tensor/random/multinomial.py new file mode 100644 index 000000000..d0c488053 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/multinomial.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import FieldTypes, Int64Field, TupleField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorMultinomial(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_MULTINOMIAL + + _fields_ = "n", "pvals", "size" + n = Int64Field("n") + pvals = TupleField("pvals", FieldTypes.float64) + _func_name = "multinomial" + + def __call__(self, chunk_size=None): + if self.size is None: + shape = (len(self.pvals),) + else: + try: + shape = tuple(self.size) + (len(self.pvals),) + except TypeError: + shape = (self.size, len(self.pvals)) + return self.new_tensor(None, shape, raw_chunk_size=chunk_size) + + +def multinomial( + random_state, n, pvals, size=None, chunk_size=None, gpu=None, dtype=None +): + """ + Draw samples from a multinomial distribution. + + The multinomial distribution is a multivariate generalisation of the + binomial distribution. Take an experiment with one of ``p`` + possible outcomes. An example of such an experiment is throwing a dice, + where the outcome can be 1 through 6. Each sample drawn from the + distribution represents `n` such experiments. 
Its values, + ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the + outcome was ``i``. + + Parameters + ---------- + n : int + Number of experiments. + pvals : sequence of floats, length p + Probabilities of each of the ``p`` different outcomes. These + should sum to 1 (however, the last element is always assumed to + account for the remaining probability, as long as + ``sum(pvals[:-1]) <= 1)``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor + The drawn samples, of shape *size*, if that was provided. If not, + the shape is ``(N,)``. + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + + Examples + -------- + Throw a dice 20 times: + + >>> import mars.tensor as mt + + >>> mt.random.multinomial(20, [1/6.]*6, size=1).execute() + array([[4, 1, 7, 5, 2, 1]]) + + It landed 4 times on 1, once on 2, etc. + + Now, throw the dice 20 times, and 20 times again: + + >>> mt.random.multinomial(20, [1/6.]*6, size=2).execute() + array([[3, 4, 3, 3, 4, 3], + [2, 4, 3, 4, 0, 7]]) + + For the first run, we threw 3 times 1, 4 times 2, etc. For the second, + we threw 2 times 1, 4 times 2, etc. + + A loaded die is more likely to land on number 6: + + >>> mt.random.multinomial(100, [1/7.]*5 + [2/7.]).execute() + array([11, 16, 14, 17, 16, 26]) + + The probability inputs should be normalized. As an implementation + detail, the value of the last entry is ignored and assumed to take + up any leftover probability mass, but this should not be relied on. + A biased coin which has twice as much weight on one side as on the + other should be sampled like so: + + >>> mt.random.multinomial(100, [1.0 / 3, 2.0 / 3]).execute() # RIGHT + array([38, 62]) + + not like: + + >>> mt.random.multinomial(100, [1.0, 2.0]).execute() # WRONG + array([100, 0]) + """ + n = int(n) + pvals = tuple(pvals) + if dtype is None: + dtype = np.random.RandomState().multinomial(n, pvals, size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorMultinomial(n=n, pvals=pvals, seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/multivariate_normal.py b/python/xorbits/_mars/tensor/random/multivariate_normal.py new file mode 100644 index 000000000..47a3ec5c3 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/multivariate_normal.py @@ -0,0 +1,266 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...config import options +from ...serialization.serializables import Float64Field, NDArrayField, StringField +from ..array_utils import array_module, device +from ..utils import decide_chunk_sizes, gen_random_seeds +from .core import TENSOR_CHUNK_TYPE, TensorDistribution, TensorRandomOperandMixin + + +class TensorMultivariateNormal(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_MULTIVARIATE_NORMAL + + _fields_ = "mean", "cov", "size", "check_valid", "tol" + mean = NDArrayField("mean") + cov = NDArrayField("cov") + check_valid = StringField("check_valid") + tol = Float64Field("tol") + _func_name = "multivariate_normal" + + def __call__(self, chunk_size=None): + N = self.mean.size + if self.size is None: + shape = (N,) + else: + try: + shape = tuple(self.size) + (N,) + except TypeError: + shape = (self.size, N) + + return self.new_tensor(None, shape, raw_chunk_size=chunk_size) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + nsplits = decide_chunk_sizes( + tensor.shape[:-1], chunk_size, tensor.dtype.itemsize + ) + ((tensor.shape[-1],),) + + mean_chunk = op.mean.chunks[0] if hasattr(op.mean, "chunks") else op.mean + cov_chunk = op.cov.chunks[0] if hasattr(op.cov, "chunks") else op.cov + + idxes = list(itertools.product(*[range(len(s)) for s in nsplits])) + seeds = gen_random_seeds(len(idxes), np.random.RandomState(op.seed)) + + out_chunks = [] + for seed, out_idx, shape in zip(seeds, idxes, itertools.product(*nsplits)): + chunk_op = op.copy().reset_key() + chunk_op._state = None + chunk_op.seed = seed + chunk_op.size = shape[:-1] + out_chunk = chunk_op.new_chunk( + [mean_chunk, cov_chunk], shape=shape, index=out_idx + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, tensor.shape, chunks=out_chunks, nsplits=nsplits + ) + + @classmethod + def execute(cls, ctx, op): + xp = array_module(op.gpu) + if xp is np: + device_id = -1 + else: + device_id = op.device or 0 + + with device(device_id): + rs = xp.random.RandomState(op.seed) + + args = [] + for k in op.args: + val = getattr(op, k, None) + if isinstance(val, TENSOR_CHUNK_TYPE): + args.append(ctx[val.key]) + else: + args.append(val) + mean, cov = args[:2] + kw = {} + if args[2] is not None: + kw["size"] = args[2] + if args[3] is not None: + kw["check_valid"] = args[3] + if args[4] is not None: + kw["tol"] = args[4] + + try: + res = rs.multivariate_normal(mean, cov, **kw) + if xp is not np: + ctx[op.outputs[0].key] = xp.asarray(res) + else: + ctx[op.outputs[0].key] = res + except AttributeError: + if xp is not np: + # cupy cannot generate data, fallback to numpy first + rs = np.random.RandomState(op.seed) + res = rs.multivariate_normal(mean, cov, **kw) + ctx[op.outputs[0].key] = xp.asarray(res) + else: + raise + + +def multivariate_normal( + random_state, + mean, + cov, + size=None, + check_valid=None, + tol=None, + chunk_size=None, + gpu=None, + dtype=None, +): + """ + Draw random samples from a multivariate normal distribution. + + The multivariate normal, multinormal or Gaussian distribution is a + generalization of the one-dimensional normal distribution to higher + dimensions. Such a distribution is specified by its mean and + covariance matrix. These parameters are analogous to the mean + (average or "center") and variance (standard deviation, or "width," + squared) of the one-dimensional normal distribution. 
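+
+    As a minimal usage sketch (illustrative only: the drawn values are
+    random, and calling ``execute()`` assumes a Mars session is available):
+
+    >>> import mars.tensor as mt
+    >>> s = mt.random.multivariate_normal([0, 0], [[1, 0], [0, 1]], size=500)
+    >>> s.shape
+    (500, 2)
+    >>> m = s.mean(axis=0).execute()  # each component is close to 0 for large samples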
+ + Parameters + ---------- + mean : 1-D array_like, of length N + Mean of the N-dimensional distribution. + cov : 2-D array_like, of shape (N, N) + Covariance matrix of the distribution. It must be symmetric and + positive-semidefinite for proper sampling. + size : int or tuple of ints, optional + Given a shape of, for example, ``(m,n,k)``, ``m*n*k`` samples are + generated, and packed in an `m`-by-`n`-by-`k` arrangement. Because + each sample is `N`-dimensional, the output shape is ``(m,n,k,N)``. + If no shape is specified, a single (`N`-D) sample is returned. + check_valid : { 'warn', 'raise', 'ignore' }, optional + Behavior when the covariance matrix is not positive semidefinite. + tol : float, optional + Tolerance when checking the singular values in covariance matrix. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor + The drawn samples, of shape *size*, if that was provided. If not, + the shape is ``(N,)``. + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + + Notes + ----- + The mean is a coordinate in N-dimensional space, which represents the + location where samples are most likely to be generated. This is + analogous to the peak of the bell curve for the one-dimensional or + univariate normal distribution. + + Covariance indicates the level to which two variables vary together. + From the multivariate normal distribution, we draw N-dimensional + samples, :math:`X = [x_1, x_2, ... x_N]`. The covariance matrix + element :math:`C_{ij}` is the covariance of :math:`x_i` and :math:`x_j`. + The element :math:`C_{ii}` is the variance of :math:`x_i` (i.e. its + "spread"). + + Instead of specifying the full covariance matrix, popular + approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements, and only on + the diagonal) + + This geometrical property can be seen in two dimensions by plotting + generated data-points: + + >>> mean = [0, 0] + >>> cov = [[1, 0], [0, 100]] # diagonal covariance + + Diagonal covariance means that points are oriented along x or y-axis: + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> x, y = mt.random.multivariate_normal(mean, cov, 5000).T + >>> plt.plot(x.execute(), y.execute(), 'x') + >>> plt.axis('equal') + >>> plt.show() + + Note that the covariance matrix must be positive semidefinite (a.k.a. + nonnegative-definite). Otherwise, the behavior of this method is + undefined and backwards compatibility is not guaranteed. + + References + ---------- + .. [1] Papoulis, A., "Probability, Random Variables, and Stochastic + Processes," 3rd ed., New York: McGraw-Hill, 1991. + .. [2] Duda, R. O., Hart, P. E., and Stork, D. G., "Pattern + Classification," 2nd ed., New York: Wiley, 2001. 
+
+    Examples
+    --------
+    >>> mean = (1, 2)
+    >>> cov = [[1, 0], [0, 1]]
+    >>> x = mt.random.multivariate_normal(mean, cov, (3, 3))
+    >>> x.shape
+    (3, 3, 2)
+
+    The following is probably true, given that 0.6 is roughly twice the
+    standard deviation:
+
+    >>> list(((x[0,0,:] - mean) < 0.6).execute())
+    [True, True]
+    """
+    mean = np.asarray(mean)
+    cov = np.asarray(cov)
+
+    if mean.ndim != 1:
+        raise ValueError("mean must be 1 dimensional")
+    if cov.ndim != 2:
+        raise ValueError("cov must be 2 dimensional")
+    if len(set(mean.shape + cov.shape)) != 1:
+        raise ValueError("mean and cov must have same length")
+
+    if dtype is None:
+        small_kw = {}
+        if check_valid:
+            small_kw["check_valid"] = check_valid
+        if tol:
+            small_kw["tol"] = tol
+        dtype = np.random.multivariate_normal(mean, cov, size=(0,), **small_kw).dtype
+
+    size = random_state._handle_size(size)
+    seed = gen_random_seeds(1, random_state.to_numpy())[0]
+    op = TensorMultivariateNormal(
+        mean=mean,
+        cov=cov,
+        size=size,
+        check_valid=check_valid,
+        tol=tol,
+        seed=seed,
+        gpu=gpu,
+        dtype=dtype,
+    )
+    return op(chunk_size=chunk_size)
diff --git a/python/xorbits/_mars/tensor/random/negative_binomial.py b/python/xorbits/_mars/tensor/random/negative_binomial.py
new file mode 100644
index 000000000..4d6a547cc
--- /dev/null
+++ b/python/xorbits/_mars/tensor/random/negative_binomial.py
@@ -0,0 +1,123 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ... import opcodes as OperandDef
+from ...serialization.serializables import AnyField
+from ..utils import gen_random_seeds
+from .core import TensorDistribution, TensorRandomOperandMixin, handle_array
+
+
+class TensorNegativeBinomial(TensorDistribution, TensorRandomOperandMixin):
+    _input_fields_ = ["n", "p"]
+    _op_type_ = OperandDef.RAND_NEGATIVE_BINOMIAL
+
+    _fields_ = "n", "p", "size"
+    n = AnyField("n")
+    p = AnyField("p")
+    _func_name = "negative_binomial"
+
+    def __call__(self, n, p, chunk_size=None):
+        return self.new_tensor([n, p], None, raw_chunk_size=chunk_size)
+
+
+def negative_binomial(
+    random_state, n, p, size=None, chunk_size=None, gpu=None, dtype=None
+):
+    r"""
+    Draw samples from a negative binomial distribution.
+
+    Samples are drawn from a negative binomial distribution with specified
+    parameters, `n` trials and `p` probability of success where `n` is an
+    integer > 0 and `p` is in the interval [0, 1].
+
+    Parameters
+    ----------
+    n : int or array_like of ints
+        Parameter of the distribution, > 0. Floats are also accepted,
+        but they will be truncated to integers.
+    p : float or array_like of floats
+        Parameter of the distribution, >= 0 and <=1.
+    size : int or tuple of ints, optional
+        Output shape. If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn. If size is ``None`` (default),
+        a single value is returned if ``n`` and ``p`` are both scalars.
+        Otherwise, ``np.broadcast(n, p).size`` samples are drawn.
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized negative binomial distribution, + where each sample is equal to N, the number of trials it took to + achieve n - 1 successes, N - (n - 1) failures, and a success on the, + (N + n)th trial. + + Notes + ----- + The probability density for the negative binomial distribution is + + .. math:: P(N;n,p) = \binom{N+n-1}{n-1}p^{n}(1-p)^{N}, + + where :math:`n-1` is the number of successes, :math:`p` is the + probability of success, and :math:`N+n-1` is the number of trials. + The negative binomial distribution gives the probability of n-1 + successes and N failures in N+n-1 trials, and success on the (N+n)th + trial. + + If one throws a die repeatedly until the third time a "1" appears, + then the probability distribution of the number of non-"1"s that + appear before the third "1" is a negative binomial distribution. + + References + ---------- + .. [1] Weisstein, Eric W. "Negative Binomial Distribution." From + MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/NegativeBinomialDistribution.html + .. [2] Wikipedia, "Negative binomial distribution", + http://en.wikipedia.org/wiki/Negative_binomial_distribution + + Examples + -------- + Draw samples from the distribution: + + A real world example. A company drills wild-cat oil + exploration wells, each with an estimated probability of + success of 0.1. What is the probability of having one success + for each successive well, that is what is the probability of a + single success after drilling 5 wells, after 6 wells, etc.? + + >>> import mars.tensor as mt + + >>> s = mt.random.negative_binomial(1, 0.1, 100000) + >>> for i in range(1, 11): + ... probability = (mt.sum(s 0. + nonc : float or array_like of floats + Non-centrality, should be non-negative. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``df`` and ``nonc`` are both scalars. + Otherwise, ``mt.broadcast(df, nonc).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized noncentral chi-square distribution. + + Notes + ----- + The probability density function for the noncentral Chi-square + distribution is + + .. math:: P(x;df,nonc) = \sum^{\infty}_{i=0} + \frac{e^{-nonc/2}(nonc/2)^{i}}{i!} + \P_{Y_{df+2i}}(x), + + where :math:`Y_{q}` is the Chi-square with q degrees of freedom. + + In Delhi (2007), it is noted that the noncentral chi-square is + useful in bombing and coverage problems, the probability of + killing the point target given by the noncentral chi-squared + distribution. + + References + ---------- + .. [1] Delhi, M.S. Holla, "On a noncentral chi-square distribution in + the analysis of weapon systems effectiveness", Metrika, + Volume 15, Number 1 / December, 1970. + .. 
[2] Wikipedia, "Noncentral chi-square distribution" + http://en.wikipedia.org/wiki/Noncentral_chi-square_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> values = plt.hist(mt.random.noncentral_chisquare(3, 20, 100000).execute(), + ... bins=200, normed=True) + >>> plt.show() + + Draw values from a noncentral chisquare with very small noncentrality, + and compare to a chisquare. + + >>> plt.figure() + >>> values = plt.hist(mt.random.noncentral_chisquare(3, .0000001, 100000).execute(), + ... bins=mt.arange(0., 25, .1).execute(), normed=True) + >>> values2 = plt.hist(mt.random.chisquare(3, 100000).execute(), + ... bins=mt.arange(0., 25, .1).execute(), normed=True) + >>> plt.plot(values[1][0:-1], values[0]-values2[0], 'ob') + >>> plt.show() + + Demonstrate how large values of non-centrality lead to a more symmetric + distribution. + + >>> plt.figure() + >>> values = plt.hist(mt.random.noncentral_chisquare(3, 20, 100000).execute(), + ... bins=200, normed=True) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .noncentral_chisquare(handle_array(df), handle_array(nonc), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorNoncentralChisquare(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(df, nonc, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/noncentral_f.py b/python/xorbits/_mars/tensor/random/noncentral_f.py new file mode 100644 index 000000000..f8923c372 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/noncentral_f.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorNoncentralF(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["dfnum", "dfden", "nonc"] + _op_type_ = OperandDef.RAND_NONCENTRAL_F + + _fields_ = "dfnum", "dfden", "nonc", "size" + dfnum = AnyField("dfnum") + dfden = AnyField("dfden") + nonc = AnyField("nonc") + _func_name = "noncentral_f" + + def __call__(self, dfnum, dfden, nonc, chunk_size=None): + return self.new_tensor([dfnum, dfden, nonc], None, raw_chunk_size=chunk_size) + + +def noncentral_f( + random_state, dfnum, dfden, nonc, size=None, chunk_size=None, gpu=None, dtype=None +): + """ + Draw samples from the noncentral F distribution. + + Samples are drawn from an F distribution with specified parameters, + `dfnum` (degrees of freedom in numerator) and `dfden` (degrees of + freedom in denominator), where both parameters > 1. + `nonc` is the non-centrality parameter. + + Parameters + ---------- + dfnum : float or array_like of floats + Numerator degrees of freedom, should be > 0. 
+ dfden : float or array_like of floats + Denominator degrees of freedom, should be > 0. + nonc : float or array_like of floats + Non-centrality parameter, the sum of the squares of the numerator + means, should be >= 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``dfnum``, ``dfden``, and ``nonc`` + are all scalars. Otherwise, ``np.broadcast(dfnum, dfden, nonc).size`` + samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized noncentral Fisher distribution. + + Notes + ----- + When calculating the power of an experiment (power = probability of + rejecting the null hypothesis when a specific alternative is true) the + non-central F statistic becomes important. When the null hypothesis is + true, the F statistic follows a central F distribution. When the null + hypothesis is not true, then it follows a non-central F statistic. + + References + ---------- + .. [1] Weisstein, Eric W. "Noncentral F-Distribution." + From MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/NoncentralF-Distribution.html + .. [2] Wikipedia, "Noncentral F-distribution", + http://en.wikipedia.org/wiki/Noncentral_F-distribution + + Examples + -------- + In a study, testing for a specific alternative to the null hypothesis + requires use of the Noncentral F distribution. We need to calculate the + area in the tail of the distribution that exceeds the value of the F + distribution for the null hypothesis. We'll plot the two probability + distributions for comparison. + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> dfnum = 3 # between group deg of freedom + >>> dfden = 20 # within groups degrees of freedom + >>> nonc = 3.0 + >>> nc_vals = mt.random.noncentral_f(dfnum, dfden, nonc, 1000000) + >>> NF = np.histogram(nc_vals.execute(), bins=50, normed=True) # TODO(jisheng): implement mt.histogram + >>> c_vals = mt.random.f(dfnum, dfden, 1000000) + >>> F = np.histogram(c_vals.execute(), bins=50, normed=True) + >>> plt.plot(F[1][1:], F[0]) + >>> plt.plot(NF[1][1:], NF[0]) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .noncentral_f( + handle_array(dfnum), handle_array(dfden), handle_array(nonc), size=(0,) + ) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorNoncentralF(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(dfnum, dfden, nonc, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/normal.py b/python/xorbits/_mars/tensor/random/normal.py new file mode 100644 index 000000000..8c3f74485 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/normal.py @@ -0,0 +1,141 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorNormal(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["loc", "scale"] + _op_type_ = OperandDef.RAND_NORMAL + + _fields_ = "loc", "scale", "size" + loc = AnyField("loc") + scale = AnyField("scale") + _func_name = "normal" + + def __call__(self, loc, scale, chunk_size=None): + return self.new_tensor([loc, scale], None, raw_chunk_size=chunk_size) + + +def normal( + random_state, loc=0.0, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw random samples from a normal (Gaussian) distribution. + + The probability density function of the normal distribution, first + derived by De Moivre and 200 years later by both Gauss and Laplace + independently [2]_, is often called the bell curve because of + its characteristic shape (see the example below). + + The normal distributions occurs often in nature. For example, it + describes the commonly occurring distribution of samples influenced + by a large number of tiny, random disturbances, each with its own + unique distribution [2]_. + + Parameters + ---------- + loc : float or array_like of floats + Mean ("centre") of the distribution. + scale : float or array_like of floats + Standard deviation (spread or "width") of the distribution. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``loc`` and ``scale`` are both scalars. + Otherwise, ``mt.broadcast(loc, scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized normal distribution. + + See Also + -------- + scipy.stats.norm : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Gaussian distribution is + + .. math:: p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }} + e^{ - \frac{ (x - \mu)^2 } {2 \sigma^2} }, + + where :math:`\mu` is the mean and :math:`\sigma` the standard + deviation. The square of the standard deviation, :math:`\sigma^2`, + is called the variance. + + The function has its peak at the mean, and its "spread" increases with + the standard deviation (the function reaches 0.607 times its maximum at + :math:`x + \sigma` and :math:`x - \sigma` [2]_). This implies that + `numpy.random.normal` is more likely to return samples lying close to + the mean, rather than those far away. + + References + ---------- + .. [1] Wikipedia, "Normal distribution", + http://en.wikipedia.org/wiki/Normal_distribution + .. [2] P. R. 
Peebles Jr., "Central Limit Theorem" in "Probability, + Random Variables and Random Signal Principles", 4th ed., 2001, + pp. 51, 51, 125. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, sigma = 0, 0.1 # mean and standard deviation + >>> s = mt.random.normal(mu, sigma, 1000) + + Verify the mean and the variance: + + >>> (abs(mu - mt.mean(s)) < 0.01).execute() + True + + >>> (abs(sigma - mt.std(s, ddof=1)) < 0.01).execute() + True + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 30, normed=True) + >>> plt.plot(bins, (1/(sigma * mt.sqrt(2 * mt.pi)) * + ... mt.exp( - (bins - mu)**2 / (2 * sigma**2) )).execute(), + ... linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .normal(handle_array(loc), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorNormal(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(loc, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/pareto.py b/python/xorbits/_mars/tensor/random/pareto.py new file mode 100644 index 000000000..526659f66 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/pareto.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorPareto(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_PARETO + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "pareto" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def pareto(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Pareto II or Lomax distribution with + specified shape. + + The Lomax or Pareto II distribution is a shifted Pareto + distribution. The classical Pareto distribution can be + obtained from the Lomax distribution by adding 1 and + multiplying by the scale parameter ``m`` (see Notes). The + smallest value of the Lomax distribution is zero while for the + classical Pareto distribution it is ``mu``, where the standard + Pareto distribution has location ``mu = 1``. Lomax can also + be considered as a simplified version of the Generalized + Pareto distribution (available in SciPy), with the scale set + to one and the location set to zero. + + The Pareto distribution must be greater than zero, and is + unbounded above. It is also known as the "80-20 rule". 
In + this distribution, 80 percent of the weights are in the lowest + 20 percent of the range, while the other 20 percent fill the + remaining 80 percent of the range. + + Parameters + ---------- + a : float or array_like of floats + Shape of the distribution. Should be greater than zero. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Pareto distribution. + + See Also + -------- + scipy.stats.lomax : probability density function, distribution or + cumulative density function, etc. + scipy.stats.genpareto : probability density function, distribution or + cumulative density function, etc. + + Notes + ----- + The probability density for the Pareto distribution is + + .. math:: p(x) = \frac{am^a}{x^{a+1}} + + where :math:`a` is the shape and :math:`m` the scale. + + The Pareto distribution, named after the Italian economist + Vilfredo Pareto, is a power law probability distribution + useful in many real world problems. Outside the field of + economics it is generally referred to as the Bradford + distribution. Pareto developed the distribution to describe + the distribution of wealth in an economy. It has also found + use in insurance, web page access statistics, oil field sizes, + and many other problems, including the download frequency for + projects in Sourceforge [1]_. It is one of the so-called + "fat-tailed" distributions. + + + References + ---------- + .. [1] Francis Hunt and Paul Johnson, On the Pareto Distribution of + Sourceforge projects. + .. [2] Pareto, V. (1896). Course of Political Economy. Lausanne. + .. [3] Reiss, R.D., Thomas, M.(2001), Statistical Analysis of Extreme + Values, Birkhauser Verlag, Basel, pp 23-30. + .. [4] Wikipedia, "Pareto distribution", + http://en.wikipedia.org/wiki/Pareto_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a, m = 3., 2. # shape and mode + >>> s = (mt.random.pareto(a, 1000) + 1) * m + + Display the histogram of the samples, along with the probability + density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, _ = plt.hist(s.execute(), 100, normed=True) + >>> fit = a*m**a / bins**(a+1) + >>> plt.plot(bins, max(count)*fit/max(fit), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().pareto(handle_array(a), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorPareto(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/permutation.py b/python/xorbits/_mars/tensor/random/permutation.py new file mode 100644 index 000000000..78bc63133 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/permutation.py @@ -0,0 +1,239 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from numbers import Integral + +import numpy as np + +from ... import opcodes as OperandDef +from ...core.operand import OperandStage +from ...serialization.serializables import Int32Field, KeyField +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorOperandMixin, TensorShuffleProxy +from ..utils import gen_random_seeds, validate_axis +from .core import TensorRandomMapReduceOperand + + +def _permutation_on_axis(ar, axis, rs, xp): + try: + return rs.permutation(ar, axis=axis) + except TypeError: + # numpy starts to support axis from 1.18 + if axis == 0: + return rs.permutation(ar) + indices = xp.arange(ar.shape[axis]) + rs.shuffle(indices) + slc = (slice(None),) * axis + (indices,) + return ar[slc] + + +class TensorPermutation(TensorRandomMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PERMUTATION + + input = KeyField("input") + axis = Int32Field("axis") + + reduce_size = Int32Field("reduce_size") + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self.input = self._inputs[0] + + def __call__(self, x): + return self.new_tensor([x], x.shape, order=x.order) + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + + state = np.random.RandomState(op.seed) + if len(op.input.chunks) == 1: + chunk_op = op.copy().reset_key() + chunk_op._state = None + chunk_op.seed = gen_random_seeds(1, state)[0] + c = op.input.chunks[0] + chunk = chunk_op.new_chunk([c], shape=c.shape, index=c.index, order=c.order) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=op.input.nsplits, + chunks=[chunk], + ) + + chunk_size = in_tensor.chunk_shape[op.axis] + map_seeds = gen_random_seeds(chunk_size, state) + reduce_seeds = gen_random_seeds(chunk_size, state) + reduce_chunks = [] + if in_tensor.ndim > 1: + cs = in_tensor.chunk_shape + left_chunk_shape = cs[: op.axis] + cs[op.axis + 1 :] + idx_iter = itertools.product(*[range(s) for s in left_chunk_shape]) + else: + idx_iter = [()] + for idx in idx_iter: + map_chunks = [] + for j in range(chunk_size): + in_idx = list(idx) + in_idx.insert(op.axis, j) + c = in_tensor.cix[tuple(in_idx)] + chunk_op = TensorPermutation( + stage=OperandStage.map, + seed=map_seeds[c.index[op.axis]], + axis=op.axis, + reduce_size=chunk_size, + dtype=c.dtype, + gpu=c.op.gpu, + ) + map_chunk = chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=out_tensor.order + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy( + dtype=out_tensor.dtype, _tensor_keys=[in_tensor.key] + ).new_chunk(map_chunks, shape=()) + + for c in map_chunks: + chunk_op = TensorPermutation( + stage=OperandStage.reduce, + n_reducers=len(map_chunks), + seed=reduce_seeds[c.index[op.axis]], + axis=op.axis, + ) + chunk_shape = list(c.shape) + chunk_shape[op.axis] = np.nan + reduce_chunk = chunk_op.new_chunk( + [proxy_chunk], + shape=tuple(chunk_shape), + order=out_tensor.order, + index=c.index, + dtype=out_tensor.dtype, + ) + reduce_chunks.append(reduce_chunk) + + 
new_op = op.copy() + nsplits = list(in_tensor.nsplits) + nsplits[op.axis] = [np.nan] * len(nsplits[op.axis]) + return new_op.new_tensors( + op.inputs, + out_tensor.shape, + order=out_tensor.order, + chunks=reduce_chunks, + nsplits=nsplits, + ) + + @classmethod + def _execute_map(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + out_chunk = op.outputs[0] + reduce_size = op.reduce_size + with device(device_id): + rs = xp.random.RandomState(op.seed) + to_reduce_idxes = rs.randint(reduce_size, size=x.shape[op.axis]) + for to_reduce_idx in range(reduce_size): + reduce_idx = ( + out_chunk.index[: op.axis] + + (to_reduce_idx,) + + out_chunk.index[op.axis + 1 :] + ) + slc = (slice(None),) * op.axis + (to_reduce_idxes == to_reduce_idx,) + ctx[out_chunk.key, reduce_idx] = x[slc] + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorPermutation"): + inputs = list(op.iter_mapper_data(ctx)) + inputs, device_id, xp = as_same_device(inputs, device=op.device, ret_extra=True) + + with device(device_id): + rs = xp.random.RandomState(op.seed) + data = xp.concatenate(inputs, axis=op.axis) + if op.axis == 0: + rs.shuffle(data) + else: + data[...] = _permutation_on_axis(data, op.axis, rs, xp) + ctx[op.outputs[0].key] = data + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + rs = xp.random.RandomState(op.seed) + ctx[op.outputs[0].key] = _permutation_on_axis(x, op.axis, rs, xp) + + +def permutation(random_state, x, axis=0, chunk_size=None): + r""" + Randomly permute a sequence, or return a permuted range. + + Parameters + ---------- + x : int or array_like + If `x` is an integer, randomly permute ``mt.arange(x)``. + If `x` is an array, make a copy and shuffle the elements + randomly. + axis : int, optional + The axis which `x` is shuffled along. Default is 0. + chunk_size : : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + Returns + ------- + out : Tensor + Permuted sequence or tensor range. + Examples + -------- + >>> import mars.tensor as mt + >>> rng = mt.random.RandomState() + >>> rng.permutation(10).execute() + array([1, 2, 3, 7, 9, 8, 0, 6, 4, 5]) # random + >>> rng.permutation([1, 4, 9, 12, 15]).execute() + array([ 9, 4, 12, 1, 15]) # random + >>> arr = mt.arange(9).reshape((3, 3)) + >>> rng.permutation(arr).execute() + array([[3, 4, 5], # random + [6, 7, 8], + [0, 1, 2]]) + >>> rng.permutation("abc") + Traceback (most recent call last): + ... 
+ numpy.AxisError: x must be an integer or at least 1-dimensional + """ + if isinstance(x, (Integral, np.integer)): + from ..datasource import arange + + x = arange(x, chunk_size=chunk_size) + else: + x = astensor(x, chunk_size=chunk_size) + if x.ndim < 1: + raise np.AxisError("x must be an integer or at least 1-dimensional") + + axis = validate_axis(x.ndim, axis) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorPermutation(seed=seed, axis=axis, dtype=x.dtype, gpu=x.op.gpu) + return op(x) diff --git a/python/xorbits/_mars/tensor/random/poisson.py b/python/xorbits/_mars/tensor/random/poisson.py new file mode 100644 index 000000000..b63d62724 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/poisson.py @@ -0,0 +1,109 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorPoisson(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["lam"] + _op_type_ = OperandDef.RAND_POSSION + + _fields_ = "lam", "size" + lam = AnyField("lam") + _func_name = "poisson" + + def __call__(self, lam, chunk_size=None): + return self.new_tensor([lam], None, raw_chunk_size=chunk_size) + + +def poisson(random_state, lam=1.0, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Poisson distribution. + + The Poisson distribution is the limit of the binomial distribution + for large N. + + Parameters + ---------- + lam : float or array_like of floats + Expectation of interval, should be >= 0. A sequence of expectation + intervals must be broadcastable over the requested size. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``lam`` is a scalar. Otherwise, + ``mt.array(lam).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Poisson distribution. + + Notes + ----- + The Poisson distribution + + .. math:: f(k; \lambda)=\frac{\lambda^k e^{-\lambda}}{k!} + + For events with an expected separation :math:`\lambda` the Poisson + distribution :math:`f(k; \lambda)` describes the probability of + :math:`k` events occurring within the observed + interval :math:`\lambda`. + + Because the output is limited to the range of the C long type, a + ValueError is raised when `lam` is within 10 sigma of the maximum + representable value. + + References + ---------- + .. [1] Weisstein, Eric W. "Poisson Distribution." 
+ From MathWorld--A Wolfram Web Resource. + http://mathworld.wolfram.com/PoissonDistribution.html + .. [2] Wikipedia, "Poisson distribution", + http://en.wikipedia.org/wiki/Poisson_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + >>> s = mt.random.poisson(5, 10000) + + Display histogram of the sample: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 14, normed=True) + >>> plt.show() + + Draw each 100 values for lambda 100 and 500: + + >>> s = mt.random.poisson(lam=(100., 500.), size=(100, 2)) + """ + if dtype is None: + dtype = np.random.RandomState().poisson(handle_array(lam), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorPoisson(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(lam, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/power.py b/python/xorbits/_mars/tensor/random/power.py new file mode 100644 index 000000000..685cd2d71 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/power.py @@ -0,0 +1,140 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRandomPower(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_POWER + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "power" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def power(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draws samples in [0, 1] from a power distribution with positive + exponent a - 1. + + Also known as the power function distribution. + + Parameters + ---------- + a : float or array_like of floats + Parameter of the distribution. Should be greater than zero. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized power distribution. + + Raises + ------ + ValueError + If a < 1. + + Notes + ----- + The probability density function is + + .. math:: P(x; a) = ax^{a-1}, 0 \le x \le 1, a>0. + + The power function distribution is just the inverse of the Pareto + distribution. 
It may also be seen as a special case of the Beta + distribution. + + It is used, for example, in modeling the over-reporting of insurance + claims. + + References + ---------- + .. [1] Christian Kleiber, Samuel Kotz, "Statistical size distributions + in economics and actuarial sciences", Wiley, 2003. + .. [2] Heckert, N. A. and Filliben, James J. "NIST Handbook 148: + Dataplot Reference Manual, Volume 2: Let Subcommands and Library + Functions", National Institute of Standards and Technology + Handbook Series, June 2003. + http://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/powpdf.pdf + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a = 5. # shape + >>> samples = 1000 + >>> s = mt.random.power(a, samples) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), bins=30) + >>> x = mt.linspace(0, 1, 100) + >>> y = a*x**(a-1.) + >>> normed_y = samples*mt.diff(bins)[0]*y + >>> plt.plot(x.execute(), normed_y.execute()) + >>> plt.show() + + Compare the power function distribution to the inverse of the Pareto. + + >>> from scipy import stats + >>> rvs = mt.random.power(5, 1000000) + >>> rvsp = mt.random.pareto(5, 1000000) + >>> xx = mt.linspace(0,1,100) + >>> powpdf = stats.powerlaw.pdf(xx.execute(),5) + + >>> plt.figure() + >>> plt.hist(rvs.execute(), bins=50, normed=True) + >>> plt.plot(xx.execute(),powpdf,'r-') + >>> plt.title('np.random.power(5)') + + >>> plt.figure() + >>> plt.hist((1./(1.+rvsp)).execute(), bins=50, normed=True) + >>> plt.plot(xx.execute(),powpdf,'r-') + >>> plt.title('inverse of 1 + np.random.pareto(5)') + + >>> plt.figure() + >>> plt.hist((1./(1.+rvsp)).execute(), bins=50, normed=True) + >>> plt.plot(xx.execute(),powpdf,'r-') + >>> plt.title('inverse of stats.pareto(5)') + """ + if dtype is None: + dtype = np.random.RandomState().power(handle_array(a), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandomPower(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/rand.py b/python/xorbits/_mars/tensor/random/rand.py new file mode 100644 index 000000000..fed83c38a --- /dev/null +++ b/python/xorbits/_mars/tensor/random/rand.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRand(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RAND + _func_name = "rand" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def rand(random_state, *dn, **kw): + """ + Random values in a given shape. 
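+
+    As a minimal sketch (the values are random; calling ``execute()``
+    assumes a Mars session is available):
+
+    >>> import mars.tensor as mt
+    >>> t = mt.random.rand(2, 3)
+    >>> t.shape
+    (2, 3)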
+ + Create a tensor of the given shape and populate it with + random samples from a uniform distribution + over ``[0, 1)``. + + Parameters + ---------- + d0, d1, ..., dn : int, optional + The dimensions of the returned tensor, should all be positive. + If no argument is given a single Python float is returned. + + Returns + ------- + out : Tensor, shape ``(d0, d1, ..., dn)`` + Random values. + + See Also + -------- + random + + Notes + ----- + This is a convenience function. If you want an interface that + takes a shape-tuple as the first argument, refer to + mt.random.random_sample . + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.rand(3, 2).execute() + array([[ 0.14022471, 0.96360618], #random + [ 0.37601032, 0.25528411], #random + [ 0.49313049, 0.94909878]]) #random + """ + if len(dn) == 1 and isinstance(dn[0], (tuple, list)): + raise TypeError("'tuple' object cannot be interpreted as an integer") + if "dtype" not in kw: + kw["dtype"] = np.dtype("f8") + chunk_size = kw.pop("chunk_size", None) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRand(seed=seed, size=dn, **kw) + + for key in op.extra_params: + if not key.startswith("_"): + raise ValueError(f"rand got unexpected key arguments {key}") + + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/randint.py b/python/xorbits/_mars/tensor/random/randint.py new file mode 100644 index 000000000..7dd0def25 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/randint.py @@ -0,0 +1,173 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...
import opcodes as OperandDef +from ...serialization.serializables import Float64Field, Int64Field +from ..array_utils import array_module +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandint(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDINT + + _fields_ = "low", "high", "density", "size" + low = Int64Field("low") + high = Int64Field("high") + density = Float64Field("density") + _func_name = "randint" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + @classmethod + def execute(cls, ctx, op): + if op.sparse: + cls.execute_sparse(ctx, op) + else: + super().execute(ctx, op) + + @classmethod + def execute_sparse(cls, ctx, op): + from ...lib.sparse import SparseNDArray + from ...lib.sparse.core import cps, sps + + xp = array_module(op.gpu) + if op.seed: + rs = np.random.RandomState(op.seed) + else: + rs = None + + chunk = op.outputs[0] + if chunk.ndim > 2: + raise NotImplementedError + + low = 1 if op.low == 0 else op.low + + rs = rs or xp.random + size = int(np.ceil(np.prod(chunk.shape) * op.density)) + xps = cps if op.gpu else sps + ij = xp.empty((2, size)) + ij[0] = rs.randint(chunk.shape[0], size=size) + ij[1] = rs.randint(chunk.shape[1], size=size) + data = rs.randint(low, op.high, size=size).astype(op.dtype) + m = xps.coo_matrix((data, ij), chunk.shape).tocsr() + m.data[m.data >= op.high] = op.high - 1 + + # scipy.sparse is too slow, we remove the precise version due to the performance + # m = sps.random(*chunk.shape, density=op.density, format='csr') + # m.data = (rs or xp.random).randint(low, op.high, size=m.data.size)\ + # .astype(op.dtype) + + ctx[chunk.key] = SparseNDArray(m) + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + if not op.sparse or not getattr(op, "_density", None): + super().estimate_size(ctx, op) + else: + # use density to estimate real memory usage + nbytes = int(chunk.nbytes * getattr(chunk.op, "_density")) + ctx[chunk.key] = (nbytes, nbytes) + + +def randint( + random_state, + low, + high=None, + size=None, + dtype="l", + density=None, + chunk_size=None, + gpu=None, +): + """ + Return random integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution of + the specified dtype in the "half-open" interval [`low`, `high`). If + `high` is None (the default), then results are from [0, `low`). + + Parameters + ---------- + low : int + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is one above the + *highest* such integer). + high : int, optional + If provided, one above the largest (signed) integer to be drawn + from the distribution (see above for behavior if ``high=None``). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + dtype : dtype, optional + Desired dtype of the result. All dtypes are determined by their + name, i.e., 'int64', 'int', etc, so byteorder is not available + and a specific precision may have different C types depending + on the platform. The default value is 'np.int'. 
+ density: float, optional + if density specified, a sparse tensor will be created + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : int or Tensor of ints + `size`-shaped tensor of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + See Also + -------- + random.random_integers : similar to `randint`, only for the closed + interval [`low`, `high`], and 1 is the lowest value if `high` is + omitted. In particular, this other one is the one to use to generate + uniformly distributed discrete non-integers. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.randint(2, size=10).execute() + array([1, 0, 0, 0, 1, 1, 0, 0, 1, 0]) + >>> mt.random.randint(1, size=10).execute() + array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + + Generate a 2 x 4 tensor of ints between 0 and 4, inclusive: + + >>> mt.random.randint(5, size=(2, 4)).execute() + array([[4, 0, 2, 1], + [3, 2, 2, 0]]) + """ + sparse = bool(density) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandint( + seed=seed, + low=low, + high=high, + size=size, + dtype=dtype, + gpu=gpu, + sparse=sparse, + density=density, + ) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/randn.py b/python/xorbits/_mars/tensor/random/randn.py new file mode 100644 index 000000000..790992e3b --- /dev/null +++ b/python/xorbits/_mars/tensor/random/randn.py @@ -0,0 +1,94 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandn(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDN + _func_name = "randn" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def randn(random_state, *dn, **kw): + r""" + Return a sample (or samples) from the "standard normal" distribution. + + If positive, int_like or int-convertible arguments are provided, + `randn` generates an array of shape ``(d0, d1, ..., dn)``, filled + with random floats sampled from a univariate "normal" (Gaussian) + distribution of mean 0 and variance 1 (if any of the :math:`d_i` are + floats, they are first converted to integers by truncation). A single + float randomly sampled from the distribution is returned if no + argument is provided. + + This is a convenience function. If you want an interface that takes a + tuple as the first argument, use `numpy.random.standard_normal` instead. + + Parameters + ---------- + d0, d1, ..., dn : int, optional + The dimensions of the returned tensor, should be all positive. 
+ If no argument is given a single Python float is returned. + + Returns + ------- + Z : Tensor or float + A ``(d0, d1, ..., dn)``-shaped array of floating-point samples from + the standard normal distribution, or a single such float if + no parameters were supplied. + + See Also + -------- + random.standard_normal : Similar, but takes a tuple as its argument. + + Notes + ----- + For random samples from :math:`N(\mu, \sigma^2)`, use: + + ``sigma * mt.random.randn(...) + mu`` + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.randn().execute() + 2.1923875335537315 #random + + Two-by-four tensor of samples from N(3, 6.25): + + >>> (2.5 * mt.random.randn(2, 4) + 3).execute() + array([[-4.49401501, 4.00950034, -1.81814867, 7.29718677], #random + [ 0.39924804, 4.68456316, 4.99394529, 4.84057254]]) #random + """ + if len(dn) == 1 and isinstance(dn[0], (tuple, list)): + raise TypeError("'tuple' object cannot be interpreted as an integer") + if "dtype" not in kw: + kw["dtype"] = np.dtype("f8") + chunk_size = kw.pop("chunk_size", None) + + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandn(seed=seed, size=dn, **kw) + + for key in op.extra_params: + if not key.startswith("_"): + raise ValueError(f"randn got unexpected key arguments {key}") + + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/random_integers.py b/python/xorbits/_mars/tensor/random/random_integers.py new file mode 100644 index 000000000..e986a0f81 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/random_integers.py @@ -0,0 +1,121 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int64Field +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandomIntegers(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDOM_INTEGERS + + _fields_ = "low", "high", "size" + low = Int64Field("low") + high = Int64Field("high") + _func_name = "random_integers" + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def random_integers(random_state, low, high=None, size=None, chunk_size=None, gpu=None): + """ + Random integers of type mt.int between `low` and `high`, inclusive. + + Return random integers of type mt.int from the "discrete uniform" + distribution in the closed interval [`low`, `high`]. If `high` is + None (the default), then results are from [1, `low`]. The np.int + type translates to the C long type used by Python 2 for "short" + integers and its precision is platform dependent. + + This function has been deprecated. Use randint instead. + + Parameters + ---------- + low : int + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). 
+ high : int, optional + If provided, the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + + Returns + ------- + out : int or Tensor of ints + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + See Also + -------- + random.randint : Similar to `random_integers`, only for the half-open + interval [`low`, `high`), and 0 is the lowest value if `high` is + omitted. + + Notes + ----- + To sample from N evenly spaced floating-point numbers between a and b, + use:: + + a + (b - a) * (np.random.random_integers(N) - 1) / (N - 1.) + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.random_integers(5).execute() + 4 + >>> type(mt.random.random_integers(5).execute()) + + >>> mt.random.random_integers(5, size=(3,2)).execute() + array([[5, 4], + [3, 3], + [4, 5]]) + + Choose five random numbers from the set of five evenly-spaced + numbers between 0 and 2.5, inclusive (*i.e.*, from the set + :math:`{0, 5/8, 10/8, 15/8, 20/8}`): + + >>> (2.5 * (mt.random.random_integers(5, size=(5,)) - 1) / 4.).execute() + array([ 0.625, 1.25 , 0.625, 0.625, 2.5 ]) + + Roll two six sided dice 1000 times and sum the results: + + >>> d1 = mt.random.random_integers(1, 6, 1000) + >>> d2 = mt.random.random_integers(1, 6, 1000) + >>> dsums = d1 + d2 + + Display results as a histogram: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(dsums.execute(), 11, normed=True) + >>> plt.show() + """ + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandomIntegers( + seed=seed, size=size, dtype=np.dtype(int), low=low, high=high, gpu=gpu + ) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/random_sample.py b/python/xorbits/_mars/tensor/random/random_sample.py new file mode 100644 index 000000000..7e13dd802 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/random_sample.py @@ -0,0 +1,84 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorRandomOperandMixin, TensorSimpleRandomData + + +class TensorRandomSample(TensorSimpleRandomData, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_RANDOM_SAMPLE + + _fields_ = ("size",) + _func_name = "random_sample" + + def __call__(self, chunk_size): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def random_sample(random_state, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Return random floats in the half-open interval [0.0, 1.0). + + Results are from the "continuous uniform" distribution over the + stated interval. To sample :math:`Unif[a, b), b > a` multiply + the output of `random_sample` by `(b-a)` and add `a`:: + + (b - a) * random_sample() + a + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : float or Tensor of floats + Array of random floats of shape `size` (unless ``size=None``, in which + case a single float is returned). + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.random.random_sample().execute() + 0.47108547995356098 + >>> type(mt.random.random_sample().execute()) + + >>> mt.random.random_sample((5,)).execute() + array([ 0.30220482, 0.86820401, 0.1654503 , 0.11659149, 0.54323428]) + + Three-by-two array of random numbers from [-5, 0): + + >>> (5 * mt.random.random_sample((3, 2)) - 5).execute() + array([[-3.99149989, -0.52338984], + [-2.99091858, -0.79479508], + [-1.23204345, -1.75224494]]) + """ + if dtype is None: + dtype = np.dtype("f8") + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRandomSample(seed=seed, size=size, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/rayleigh.py b/python/xorbits/_mars/tensor/random/rayleigh.py new file mode 100644 index 000000000..fbfdc5bba --- /dev/null +++ b/python/xorbits/_mars/tensor/random/rayleigh.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorRayleigh(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["scale"] + _op_type_ = OperandDef.RAND_RAYLEIGH + + _fields_ = "scale", "size" + scale = AnyField("scale") + _func_name = "rayleigh" + + def __call__(self, scale, chunk_size=None): + return self.new_tensor([scale], None, raw_chunk_size=chunk_size) + + +def rayleigh(random_state, scale=1.0, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Rayleigh distribution. + + The :math:`\chi` and Weibull distributions are generalizations of the + Rayleigh. + + Parameters + ---------- + scale : float or array_like of floats, optional + Scale, also equals the mode. Should be >= 0. Default is 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``scale`` is a scalar. Otherwise, + ``mt.array(scale).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Rayleigh distribution. + + Notes + ----- + The probability density function for the Rayleigh distribution is + + .. math:: P(x;scale) = \frac{x}{scale^2}e^{\frac{-x^2}{2 \cdotp scale^2}} + + The Rayleigh distribution would arise, for example, if the East + and North components of the wind velocity had identical zero-mean + Gaussian distributions. Then the wind speed would have a Rayleigh + distribution. + + References + ---------- + .. [1] Brighton Webs Ltd., "Rayleigh Distribution," + http://www.brighton-webs.co.uk/distributions/rayleigh.asp + .. [2] Wikipedia, "Rayleigh distribution" + http://en.wikipedia.org/wiki/Rayleigh_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + + >>> values = plt.hist(mt.random.rayleigh(3, 100000).execute(), bins=200, normed=True) + + Wave heights tend to follow a Rayleigh distribution. If the mean wave + height is 1 meter, what fraction of waves are likely to be larger than 3 + meters? + + >>> meanvalue = 1 + >>> modevalue = mt.sqrt(2 / mt.pi) * meanvalue + >>> s = mt.random.rayleigh(modevalue, 1000000) + + The percentage of waves larger than 3 meters is: + + >>> (100.*mt.sum(s>3)/1000000.).execute() + 0.087300000000000003 + """ + if dtype is None: + dtype = np.random.RandomState().rayleigh(handle_array(scale), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorRayleigh(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/shuffle.py b/python/xorbits/_mars/tensor/random/shuffle.py new file mode 100644 index 000000000..0db013543 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/shuffle.py @@ -0,0 +1,61 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..core import TENSOR_TYPE +from ..datasource import tensor as astensor + + +def shuffle(random_state, x, axis=0): + r""" + Modify a sequence in-place by shuffling its contents. + The order of sub-arrays is changed but their contents remains the same. + + Parameters + ---------- + x : array_like + The array or list to be shuffled. + axis : int, optional + The axis which `x` is shuffled along. Default is 0. + + Returns + ------- + None + + Examples + -------- + >>> import mars.tensor as mt + >>> rng = mt.random.RandomState() + >>> arr = mt.arange(10) + >>> rng.shuffle(arr) + >>> arr.execute() + array([0, 1, 4, 2, 8, 6, 5, 9, 3, 7]) # random + + >>> arr = mt.arange(9).reshape((3, 3)) + >>> rng.shuffle(arr) + >>> arr.execute() + array([[6, 7, 8], # random + [0, 1, 2], + [3, 4, 5]]) + """ + from .permutation import permutation + + if isinstance(x, (list, np.ndarray, TENSOR_TYPE)): + x = astensor(x) + else: + raise TypeError("x should be list, numpy ndarray or tensor") + + ret = permutation(random_state, x, axis=axis) + x.data = ret.data diff --git a/python/xorbits/_mars/tensor/random/standard_cauchy.py b/python/xorbits/_mars/tensor/random/standard_cauchy.py new file mode 100644 index 000000000..9a35029c7 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_cauchy.py @@ -0,0 +1,103 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorStandardCauchy(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_STANDARD_CAUCHY + _func_name = "standard_cauchy" + _fields_ = ("size",) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def standard_cauchy(random_state, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a standard Cauchy distribution with mode = 0. + + Also known as the Lorentz distribution. + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + samples : Tensor or scalar + The drawn samples. 
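# [Editor's note -- illustrative sketch, not part of the patch; NumPy only.]
# The heavy tails discussed in the Notes below mean the sample mean of Cauchy
# draws never settles down as the sample grows, unlike the Gaussian:
import numpy as np

rng = np.random.RandomState(42)
for n in (1_000, 100_000, 1_000_000):
    print(n, rng.standard_cauchy(n).mean(), rng.standard_normal(n).mean())
# The Cauchy column keeps jumping around; the normal column shrinks toward 0.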
+ + Notes + ----- + The probability density function for the full Cauchy distribution is + + .. math:: P(x; x_0, \gamma) = \frac{1}{\pi \gamma \bigl[ 1+ + (\frac{x-x_0}{\gamma})^2 \bigr] } + + and the Standard Cauchy distribution just sets :math:`x_0=0` and + :math:`\gamma=1` + + The Cauchy distribution arises in the solution to the driven harmonic + oscillator problem, and also describes spectral line broadening. It + also describes the distribution of values at which a line tilted at + a random angle will cut the x axis. + + When studying hypothesis tests that assume normality, seeing how the + tests perform on data from a Cauchy distribution is a good indicator of + their sensitivity to a heavy-tailed distribution, since the Cauchy looks + very much like a Gaussian distribution, but with heavier tails. + + References + ---------- + .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "Cauchy + Distribution", + http://www.itl.nist.gov/div898/handbook/eda/section3/eda3663.htm + .. [2] Weisstein, Eric W. "Cauchy Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/CauchyDistribution.html + .. [3] Wikipedia, "Cauchy distribution" + http://en.wikipedia.org/wiki/Cauchy_distribution + + Examples + -------- + Draw samples and plot the distribution: + + >>> import mars.tensor as mt + >>> import matplotlib.pyplot as plt + + >>> s = mt.random.standard_cauchy(1000000) + >>> s = s[(s>-25) & (s<25)] # truncate distribution so it plots well + >>> plt.hist(s.execute(), bins=100) + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().standard_cauchy(size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardCauchy(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_exponential.py b/python/xorbits/_mars/tensor/random/standard_exponential.py new file mode 100644 index 000000000..248d37586 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_exponential.py @@ -0,0 +1,70 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorStandardExponential(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_STANDARD_EXPONENTIAL + _func_name = "standard_exponential" + _fields_ = ("size",) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def standard_exponential( + random_state, size=None, chunk_size=None, gpu=None, dtype=None +): + """ + Draw samples from the standard exponential distribution. + + `standard_exponential` is identical to the exponential distribution + with a scale parameter of 1. + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. 
If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : float or Tensor + Drawn samples. + + Examples + -------- + Output a 3x8000 tensor: + + >>> import mars.tensor as mt + >>> n = mt.random.standard_exponential((3, 8000)) + """ + if dtype is None: + dtype = np.random.RandomState().standard_exponential(size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardExponential(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_gamma.py b/python/xorbits/_mars/tensor/random/standard_gamma.py new file mode 100644 index 000000000..30a0032f5 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_gamma.py @@ -0,0 +1,118 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorStandardGamma(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["shape"] + _op_type_ = OperandDef.RAND_STANDARD_GAMMMA + + _fields_ = "shape", "size" + shape = AnyField("shape") + _func_name = "standard_gamma" + + def __call__(self, shape, chunk_size=None): + return self.new_tensor([shape], None, raw_chunk_size=chunk_size) + + +def standard_gamma( + random_state, shape, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a standard Gamma distribution. + + Samples are drawn from a Gamma distribution with specified parameters, + shape (sometimes designated "k") and scale=1. + + Parameters + ---------- + shape : float or array_like of floats + Parameter, should be > 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``shape`` is a scalar. Otherwise, + ``mt.array(shape).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized standard gamma distribution. + + See Also + -------- + scipy.stats.gamma : probability density function, distribution or + cumulative density function, etc. 
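# [Editor's note -- illustrative sketch, not part of the patch; NumPy only.]
# standard_gamma draws from Gamma(shape=k, scale=1), so the sample mean and
# variance should both sit near k:
import numpy as np

k = 2.0
draws = np.random.RandomState(0).standard_gamma(k, 1_000_000)
print(draws.mean(), draws.var())  # both approximately k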
+ + Notes + ----- + The probability density for the Gamma distribution is + + .. math:: p(x) = x^{k-1}\frac{e^{-x/\theta}}{\theta^k\Gamma(k)}, + + where :math:`k` is the shape and :math:`\theta` the scale, + and :math:`\Gamma` is the Gamma function. + + The Gamma distribution is often used to model the times to failure of + electronic components, and arises naturally in processes for which the + waiting times between Poisson distributed events are relevant. + + References + ---------- + .. [1] Weisstein, Eric W. "Gamma Distribution." From MathWorld--A + Wolfram Web Resource. + http://mathworld.wolfram.com/GammaDistribution.html + .. [2] Wikipedia, "Gamma distribution", + http://en.wikipedia.org/wiki/Gamma_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> shape, scale = 2., 1. # mean and width + >>> s = mt.random.standard_gamma(shape, 1000000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> import scipy.special as sps + >>> count, bins, ignored = plt.hist(s.execute(), 50, normed=True) + >>> y = bins**(shape-1) * ((mt.exp(-bins/scale))/ \ + ... (sps.gamma(shape) * scale**shape)) + >>> plt.plot(bins, y.execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState().standard_gamma(handle_array(shape), size=(0,)).dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardGamma(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(shape, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_normal.py b/python/xorbits/_mars/tensor/random/standard_normal.py new file mode 100644 index 000000000..f80603363 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_normal.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin + + +class TensorStandardNormal(TensorDistribution, TensorRandomOperandMixin): + _op_type_ = OperandDef.RAND_STANDARD_NORMAL + _func_name = "standard_normal" + _fields_ = ("size",) + + def __call__(self, chunk_size=None): + return self.new_tensor(None, None, raw_chunk_size=chunk_size) + + +def standard_normal(random_state, size=None, chunk_size=None, gpu=None, dtype=None): + """ + Draw samples from a standard Normal distribution (mean=0, stdev=1). + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : float or Tensor + Drawn samples. + + Examples + -------- + >>> import mars.tensor as mt + + >>> s = mt.random.standard_normal(8000) + >>> s.execute() + array([ 0.6888893 , 0.78096262, -0.89086505, ..., 0.49876311, #random + -0.38672696, -0.4685006 ]) #random + >>> s.shape + (8000,) + >>> s = mt.random.standard_normal(size=(3, 4, 2)) + >>> s.shape + (3, 4, 2) + """ + if dtype is None: + dtype = np.random.RandomState().standard_normal(size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorStandardNormal(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/standard_t.py b/python/xorbits/_mars/tensor/random/standard_t.py new file mode 100644 index 000000000..6b4006fa7 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/standard_t.py @@ -0,0 +1,133 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorStandardT(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["df"] + _op_type_ = OperandDef.RAND_STANDARD_T + + _fields_ = "df", "size" + df = AnyField("df") + _func_name = "standard_t" + + def __call__(self, df, chunk_size=None): + return self.new_tensor([df], None, raw_chunk_size=chunk_size) + + +def standard_t(random_state, df, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a standard Student's t distribution with `df` degrees + of freedom. + + A special case of the hyperbolic distribution. As `df` gets + large, the result resembles that of the standard normal + distribution (`standard_normal`). + + Parameters + ---------- + df : float or array_like of floats + Degrees of freedom, should be > 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``df`` is a scalar. Otherwise, + ``mt.array(df).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized standard Student's t distribution. + + Notes + ----- + The probability density function for the t distribution is + + .. 
math:: P(x, df) = \frac{\Gamma(\frac{df+1}{2})}{\sqrt{\pi df} + \Gamma(\frac{df}{2})}\Bigl( 1+\frac{x^2}{df} \Bigr)^{-(df+1)/2} + + The t test is based on an assumption that the data come from a + Normal distribution. The t test provides a way to test whether + the sample mean (that is the mean calculated from the data) is + a good estimate of the true mean. + + The derivation of the t-distribution was first published in + 1908 by William Gosset while working for the Guinness Brewery + in Dublin. Due to proprietary issues, he had to publish under + a pseudonym, and so he used the name Student. + + References + ---------- + .. [1] Dalgaard, Peter, "Introductory Statistics With R", + Springer, 2002. + .. [2] Wikipedia, "Student's t-distribution" + http://en.wikipedia.org/wiki/Student's_t-distribution + + Examples + -------- + From Dalgaard page 83 [1]_, suppose the daily energy intake for 11 + women in Kj is: + + >>> import mars.tensor as mt + + >>> intake = mt.array([5260., 5470, 5640, 6180, 6390, 6515, 6805, 7515, \ + ... 7515, 8230, 8770]) + + Does their energy intake deviate systematically from the recommended + value of 7725 kJ? + + We have 10 degrees of freedom, so is the sample mean within 95% of the + recommended value? + + >>> s = mt.random.standard_t(10, size=100000) + >>> mt.mean(intake).execute() + 6753.636363636364 + >>> intake.std(ddof=1).execute() + 1142.1232221373727 + + Calculate the t statistic, setting the ddof parameter to the unbiased + value so the divisor in the standard deviation will be degrees of + freedom, N-1. + + >>> t = (mt.mean(intake)-7725)/(intake.std(ddof=1)/mt.sqrt(len(intake))) + >>> import matplotlib.pyplot as plt + >>> h = plt.hist(s.execute(), bins=100, normed=True) + + For a one-sided t-test, how far out in the distribution does the t + statistic appear? 
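# [Editor's note -- illustrative sketch, not part of the patch; assumes SciPy
# is available.] The question above can also be answered analytically: the
# one-sided tail probability of the observed t statistic comes straight from
# scipy.stats.t rather than from the simulated samples.
import numpy as np
from scipy import stats

intake = np.array([5260., 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770])
t_stat = (intake.mean() - 7725) / (intake.std(ddof=1) / np.sqrt(len(intake)))
print(t_stat, stats.t.cdf(t_stat, df=len(intake) - 1))  # roughly -2.8 and 0.009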
+ + >>> (mt.sum(s= 1).toarray().sum() == pytest.approx(30 * 50 * 0.1, abs=20) + + +random_test_options = namedtuple("random_test_options", ["func_name", "args", "kwargs"]) + +random_params = [ + random_test_options("beta", ([1, 2], [3, 4]), dict(chunk_size=2)), + random_test_options("binomial", (10, 0.5, 100), dict(chunk_size=50)), + random_test_options("chisquare", (2, 100), dict(chunk_size=50)), + random_test_options("dirichlet", ((10, 5, 3), 100), dict(chunk_size=50)), + random_test_options("exponential", (1.0, 100), dict(chunk_size=50)), + random_test_options("f", (1.0, 2.0, 100), dict(chunk_size=50)), + random_test_options("gamma", (1.0, 2.0, 100), dict(chunk_size=50)), + random_test_options("geometric", (1.0, 100), dict(chunk_size=50)), + random_test_options("gumbel", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("hypergeometric", (10, 20, 15, 100), dict(chunk_size=50)), + random_test_options("laplace", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("logistic", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("lognormal", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("logseries", (0.5, 100), dict(chunk_size=50)), + random_test_options("multinomial", (10, [0.2, 0.5, 0.3], 100), dict(chunk_size=50)), + random_test_options( + "multivariate_normal", ([1, 2], [[1, 0], [0, 1]], 100), dict(chunk_size=50) + ), + random_test_options("negative_binomial", (5, 1.0, 100), dict(chunk_size=50)), + random_test_options("noncentral_chisquare", (0.5, 1.0, 100), dict(chunk_size=50)), + random_test_options("noncentral_f", (1.5, 1.0, 1.1, 100), dict(chunk_size=50)), + random_test_options("pareto", (1.0, 100), dict(chunk_size=50)), + random_test_options("poisson", (1.0, 100), dict(chunk_size=50)), + random_test_options("power", (1.0, 100), dict(chunk_size=50)), + random_test_options("rayleigh", (1.0, 100), dict(chunk_size=50)), + random_test_options("standard_cauchy", (100,), dict(chunk_size=50)), + random_test_options("standard_exponential", (100,), dict(chunk_size=50)), + random_test_options("standard_gamma", (1.0, 100), dict(chunk_size=50)), + random_test_options("standard_normal", (100,), dict(chunk_size=50)), + random_test_options("standard_t", (1.0, 100), dict(chunk_size=50)), + random_test_options("triangular", (0.1, 0.2, 0.3, 100), dict(chunk_size=50)), + random_test_options("uniform", (0.1, 0.2, 100), dict(chunk_size=50)), + random_test_options("vonmises", (0.1, 0.2, 100), dict(chunk_size=50)), + random_test_options("wald", (0.1, 0.2, 100), dict(chunk_size=50)), + random_test_options("weibull", (0.1, 100), dict(chunk_size=50)), + random_test_options("zipf", (1.1, 100), dict(chunk_size=50)), +] + + +@pytest.mark.parametrize("test_opts", random_params) +def test_random_execute(setup, test_opts): + rs = tensor.random.RandomState(0) + arr1 = getattr(rs, test_opts.func_name)(*test_opts.args, **test_opts.kwargs) + rs = tensor.random.RandomState(0) + arr2 = getattr(rs, test_opts.func_name)(*test_opts.args, **test_opts.kwargs) + assert np.array_equal(arr1.execute().fetch(), arr2.execute().fetch()) + + +def test_permutation_execute(setup): + rs = tensor.random.RandomState(0) + x = rs.permutation(10) + res = x.execute().fetch() + assert not np.all(res[:-1] < res[1:]) + np.testing.assert_array_equal(np.sort(res), np.arange(10)) + + arr = from_ndarray([1, 4, 9, 12, 15], chunk_size=2) + x = rs.permutation(arr) + res = x.execute().fetch() + assert not np.all(res[:-1] < res[1:]) + np.testing.assert_array_equal(np.sort(res), np.asarray([1, 4, 9, 12, 15])) + 
+ arr = from_ndarray(np.arange(48).reshape(12, 4), chunk_size=2) + # axis = 0 + x = rs.permutation(arr) + res = x.execute().fetch() + assert not np.all(res[:-1] < res[1:]) + np.testing.assert_array_equal(np.sort(res, axis=0), np.arange(48).reshape(12, 4)) + # axis != 0 + x2 = rs.permutation(arr, axis=1) + res = x2.execute().fetch() + assert not np.all(res[:, :-1] < res[:, 1:]) + np.testing.assert_array_equal(np.sort(res, axis=1), np.arange(48).reshape(12, 4)) diff --git a/python/xorbits/_mars/tensor/random/triangular.py b/python/xorbits/_mars/tensor/random/triangular.py new file mode 100644 index 000000000..a923495e6 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/triangular.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorTriangular(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["left", "mode", "right"] + _op_type_ = OperandDef.RAND_TRIANGULAR + + _fields_ = "left", "mode", "right", "size" + left = AnyField("left") + mode = AnyField("mode") + right = AnyField("right") + _func_name = "triangular" + + def __call__(self, left, mode, right, chunk_size=None): + return self.new_tensor([left, mode, right], None, raw_chunk_size=chunk_size) + + +def triangular( + random_state, left, mode, right, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from the triangular distribution over the + interval ``[left, right]``. + + The triangular distribution is a continuous probability + distribution with lower limit left, peak at mode, and upper + limit right. Unlike the other distributions, these parameters + directly define the shape of the pdf. + + Parameters + ---------- + left : float or array_like of floats + Lower limit. + mode : float or array_like of floats + The value where the peak of the distribution occurs. + The value should fulfill the condition ``left <= mode <= right``. + right : float or array_like of floats + Upper limit, should be larger than `left`. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``left``, ``mode``, and ``right`` + are all scalars. Otherwise, ``mt.broadcast(left, mode, right).size`` + samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized triangular distribution. + + Notes + ----- + The probability density function for the triangular distribution is + + .. 
math:: P(x;l, m, r) = \begin{cases} + \frac{2(x-l)}{(r-l)(m-l)}& \text{for $l \leq x \leq m$},\\ + \frac{2(r-x)}{(r-l)(r-m)}& \text{for $m \leq x \leq r$},\\ + 0& \text{otherwise}. + \end{cases} + + The triangular distribution is often used in ill-defined + problems where the underlying distribution is not known, but + some knowledge of the limits and mode exists. Often it is used + in simulations. + + References + ---------- + .. [1] Wikipedia, "Triangular distribution" + http://en.wikipedia.org/wiki/Triangular_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram: + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> h = plt.hist(mt.random.triangular(-3, 0, 8, 100000).execute(), bins=200, + ... normed=True) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .triangular( + handle_array(left), handle_array(mode), handle_array(right), size=(0,) + ) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorTriangular(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(left, mode, right, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/uniform.py b/python/xorbits/_mars/tensor/random/uniform.py new file mode 100644 index 000000000..ac7225bcf --- /dev/null +++ b/python/xorbits/_mars/tensor/random/uniform.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorUniform(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["low", "high"] + _op_type_ = OperandDef.RAND_UNIFORM + + _fields_ = "low", "high", "size" + low = AnyField("low") + high = AnyField("high") + _func_name = "uniform" + + def __call__(self, low, high, chunk_size=None): + return self.new_tensor([low, high], None, raw_chunk_size=chunk_size) + + +def uniform( + random_state, low=0.0, high=1.0, size=None, chunk_size=None, gpu=None, dtype=None +): + r""" + Draw samples from a uniform distribution. + + Samples are uniformly distributed over the half-open interval + ``[low, high)`` (includes low, but excludes high). In other words, + any value within the given interval is equally likely to be drawn + by `uniform`. + + Parameters + ---------- + low : float or array_like of floats, optional + Lower boundary of the output interval. All values generated will be + greater than or equal to low. The default value is 0. + high : float or array_like of floats + Upper boundary of the output interval. All values generated will be + less than high. The default value is 1.0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. 
If size is ``None`` (default), + a single value is returned if ``low`` and ``high`` are both scalars. + Otherwise, ``mt.broadcast(low, high).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized uniform distribution. + + See Also + -------- + randint : Discrete uniform distribution, yielding integers. + random_integers : Discrete uniform distribution over the closed + interval ``[low, high]``. + random_sample : Floats uniformly distributed over ``[0, 1)``. + random : Alias for `random_sample`. + rand : Convenience function that accepts dimensions as input, e.g., + ``rand(2,2)`` would generate a 2-by-2 array of floats, + uniformly distributed over ``[0, 1)``. + + Notes + ----- + The probability density function of the uniform distribution is + + .. math:: p(x) = \frac{1}{b - a} + + anywhere within the interval ``[a, b)``, and zero elsewhere. + + When ``high`` == ``low``, values of ``low`` will be returned. + If ``high`` < ``low``, the results are officially undefined + and may eventually raise an error, i.e. do not rely on this + function to behave when passed arguments satisfying that + inequality condition. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> s = mt.random.uniform(-1,0,1000) + + All values are within the given interval: + + >>> mt.all(s >= -1).execute() + True + >>> mt.all(s < 0).execute() + True + + Display the histogram of the samples, along with the + probability density function: + + >>> import matplotlib.pyplot as plt + >>> count, bins, ignored = plt.hist(s.execute(), 15, normed=True) + >>> plt.plot(bins, mt.ones_like(bins).execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .uniform(handle_array(low), handle_array(high), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorUniform(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(low, high, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/vonmises.py b/python/xorbits/_mars/tensor/random/vonmises.py new file mode 100644 index 000000000..083ac29b1 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/vonmises.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorVonmises(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["mu", "kappa"] + _op_type_ = OperandDef.RAND_VONMISES + + _fields_ = "mu", "kappa", "size" + mu = AnyField("mu") + kappa = AnyField("kappa") + _func_name = "vonmises" + + def __call__(self, mu, kappa, chunk_size=None): + return self.new_tensor([mu, kappa], None, raw_chunk_size=chunk_size) + + +def vonmises(random_state, mu, kappa, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a von Mises distribution. + + Samples are drawn from a von Mises distribution with specified mode + (mu) and dispersion (kappa), on the interval [-pi, pi]. + + The von Mises distribution (also known as the circular normal + distribution) is a continuous probability distribution on the unit + circle. It may be thought of as the circular analogue of the normal + distribution. + + Parameters + ---------- + mu : float or array_like of floats + Mode ("center") of the distribution. + kappa : float or array_like of floats + Dispersion of the distribution, has to be >=0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``mu`` and ``kappa`` are both scalars. + Otherwise, ``np.broadcast(mu, kappa).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized von Mises distribution. + + See Also + -------- + scipy.stats.vonmises : probability density function, distribution, or + cumulative density function, etc. + + Notes + ----- + The probability density for the von Mises distribution is + + .. math:: p(x) = \frac{e^{\kappa cos(x-\mu)}}{2\pi I_0(\kappa)}, + + where :math:`\mu` is the mode and :math:`\kappa` the dispersion, + and :math:`I_0(\kappa)` is the modified Bessel function of order 0. + + The von Mises is named for Richard Edler von Mises, who was born in + Austria-Hungary, in what is now the Ukraine. He fled to the United + States in 1939 and became a professor at Harvard. He worked in + probability theory, aerodynamics, fluid mechanics, and philosophy of + science. + + References + ---------- + .. [1] Abramowitz, M. and Stegun, I. A. (Eds.). "Handbook of + Mathematical Functions with Formulas, Graphs, and Mathematical + Tables, 9th printing," New York: Dover, 1972. + .. [2] von Mises, R., "Mathematical Theory of Probability + and Statistics", New York: Academic Press, 1964. 
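# [Editor's note -- illustrative sketch, not part of the patch; NumPy only.]
# Von Mises draws stay inside [-pi, pi] and concentrate around mu as kappa
# (the dispersion) grows, as described above:
import numpy as np

rng = np.random.RandomState(1)
for kappa in (0.5, 4.0, 50.0):
    s = rng.vonmises(0.0, kappa, 100_000)
    print(kappa, s.min() >= -np.pi, s.max() <= np.pi, s.std())
# The min/max checks hold for every kappa; the spread shrinks as kappa grows.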
+ + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> mu, kappa = 0.0, 4.0 # mean and dispersion + >>> s = mt.random.vonmises(mu, kappa, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> from scipy.special import i0 + >>> plt.hist(s.execute(), 50, normed=True) + >>> x = mt.linspace(-mt.pi, mt.pi, num=51) + >>> y = mt.exp(kappa*mt.cos(x-mu))/(2*mt.pi*i0(kappa)) + >>> plt.plot(x.execute(), y.execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .vonmises(handle_array(mu), handle_array(kappa), size=(0,)) + .dtype + ) + + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorVonmises(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(mu, kappa, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/wald.py b/python/xorbits/_mars/tensor/random/wald.py new file mode 100644 index 000000000..08e195d12 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/wald.py @@ -0,0 +1,112 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorWald(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["mean", "scale"] + _op_type_ = OperandDef.RAND_WALD + + _fields_ = "mean", "scale", "size" + mean = AnyField("mean") + scale = AnyField("scale") + _func_name = "wald" + + def __call__(self, mean, scale, chunk_size=None): + return self.new_tensor([mean, scale], None, raw_chunk_size=chunk_size) + + +def wald(random_state, mean, scale, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Wald, or inverse Gaussian, distribution. + + As the scale approaches infinity, the distribution becomes more like a + Gaussian. Some references claim that the Wald is an inverse Gaussian + with mean equal to 1, but this is by no means universal. + + The inverse Gaussian distribution was first studied in relationship to + Brownian motion. In 1956 M.C.K. Tweedie used the name inverse Gaussian + because there is an inverse relationship between the time to cover a + unit distance and distance covered in unit time. + + Parameters + ---------- + mean : float or array_like of floats + Distribution mean, should be > 0. + scale : float or array_like of floats + Scale parameter, should be >= 0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``mean`` and ``scale`` are both scalars. + Otherwise, ``np.broadcast(mean, scale).size`` samples are drawn. 
+ chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Wald distribution. + + Notes + ----- + The probability density function for the Wald distribution is + + .. math:: P(x;mean,scale) = \sqrt{\frac{scale}{2\pi x^3}}e^ + \frac{-scale(x-mean)^2}{2\cdotp mean^2x} + + As noted above the inverse Gaussian distribution first arise + from attempts to model Brownian motion. It is also a + competitor to the Weibull for use in reliability modeling and + modeling stock returns and interest rate processes. + + References + ---------- + .. [1] Brighton Webs Ltd., Wald Distribution, + http://www.brighton-webs.co.uk/distributions/wald.asp + .. [2] Chhikara, Raj S., and Folks, J. Leroy, "The Inverse Gaussian + Distribution: Theory : Methodology, and Applications", CRC Press, + 1988. + .. [3] Wikipedia, "Wald distribution" + http://en.wikipedia.org/wiki/Wald_distribution + + Examples + -------- + Draw values from the distribution and plot the histogram: + + >>> import matplotlib.pyplot as plt + >>> import mars.tensor as mt + >>> h = plt.hist(mt.random.wald(3, 2, 100000).execute(), bins=200, normed=True) + >>> plt.show() + """ + if dtype is None: + dtype = ( + np.random.RandomState() + .wald(handle_array(mean), handle_array(scale), size=(0,)) + .dtype + ) + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorWald(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(mean, scale, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/weibull.py b/python/xorbits/_mars/tensor/random/weibull.py new file mode 100644 index 000000000..2dc93c895 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/weibull.py @@ -0,0 +1,138 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorWeibull(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_WEIBULL + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "weibull" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def weibull(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Weibull distribution. + + Draw samples from a 1-parameter Weibull distribution with the given + shape parameter `a`. + + .. math:: X = (-ln(U))^{1/a} + + Here, U is drawn from the uniform distribution over (0,1]. 
+ + The more common 2-parameter Weibull, including a scale parameter + :math:`\lambda` is just :math:`X = \lambda(-ln(U))^{1/a}`. + + Parameters + ---------- + a : float or array_like of floats + Shape of the distribution. Should be greater than zero. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Weibull distribution. + + See Also + -------- + scipy.stats.weibull_max + scipy.stats.weibull_min + scipy.stats.genextreme + gumbel + + Notes + ----- + The Weibull (or Type III asymptotic extreme value distribution + for smallest values, SEV Type III, or Rosin-Rammler + distribution) is one of a class of Generalized Extreme Value + (GEV) distributions used in modeling extreme value problems. + This class includes the Gumbel and Frechet distributions. + + The probability density for the Weibull distribution is + + .. math:: p(x) = \frac{a} + {\lambda}(\frac{x}{\lambda})^{a-1}e^{-(x/\lambda)^a}, + + where :math:`a` is the shape and :math:`\lambda` the scale. + + The function has its peak (the mode) at + :math:`\lambda(\frac{a-1}{a})^{1/a}`. + + When ``a = 1``, the Weibull distribution reduces to the exponential + distribution. + + References + ---------- + .. [1] Waloddi Weibull, Royal Technical University, Stockholm, + 1939 "A Statistical Theory Of The Strength Of Materials", + Ingeniorsvetenskapsakademiens Handlingar Nr 151, 1939, + Generalstabens Litografiska Anstalts Forlag, Stockholm. + .. [2] Waloddi Weibull, "A Statistical Distribution Function of + Wide Applicability", Journal Of Applied Mechanics ASME Paper + 1951. + .. [3] Wikipedia, "Weibull distribution", + http://en.wikipedia.org/wiki/Weibull_distribution + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a = 5. # shape + >>> s = mt.random.weibull(a, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> x = mt.arange(1,100.)/50. + >>> def weib(x,n,a): + ... return (a / n) * (x / n)**(a - 1) * mt.exp(-(x / n)**a) + + >>> count, bins, ignored = plt.hist(mt.random.weibull(5.,1000).execute()) + >>> x = mt.arange(1,100.)/50. + >>> scale = count.max()/weib(x, 1., 5.).max() + >>> plt.plot(x.execute(), (weib(x, 1., 5.)*scale).execute()) + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().weibull(handle_array(a), size=(0,)).dtype + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorWeibull(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/random/zipf.py b/python/xorbits/_mars/tensor/random/zipf.py new file mode 100644 index 000000000..cef646748 --- /dev/null +++ b/python/xorbits/_mars/tensor/random/zipf.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ..utils import gen_random_seeds +from .core import TensorDistribution, TensorRandomOperandMixin, handle_array + + +class TensorZipf(TensorDistribution, TensorRandomOperandMixin): + _input_fields_ = ["a"] + _op_type_ = OperandDef.RAND_ZIPF + + _fields_ = "a", "size" + a = AnyField("a") + _func_name = "zipf" + + def __call__(self, a, chunk_size=None): + return self.new_tensor([a], None, raw_chunk_size=chunk_size) + + +def zipf(random_state, a, size=None, chunk_size=None, gpu=None, dtype=None): + r""" + Draw samples from a Zipf distribution. + + Samples are drawn from a Zipf distribution with specified parameter + `a` > 1. + + The Zipf distribution (also known as the zeta distribution) is a + continuous probability distribution that satisfies Zipf's law: the + frequency of an item is inversely proportional to its rank in a + frequency table. + + Parameters + ---------- + a : float or array_like of floats + Distribution parameter. Should be greater than 1. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a single value is returned if ``a`` is a scalar. Otherwise, + ``mt.array(a).size`` samples are drawn. + chunk_size : int or tuple of int or tuple of ints, optional + Desired chunk size on each dimension + gpu : bool, optional + Allocate the tensor on GPU if True, False as default + dtype : data-type, optional + Data-type of the returned tensor. + + Returns + ------- + out : Tensor or scalar + Drawn samples from the parameterized Zipf distribution. + + See Also + -------- + scipy.stats.zipf : probability density function, distribution, or + cumulative density function, etc. + + Notes + ----- + The probability density for the Zipf distribution is + + .. math:: p(x) = \frac{x^{-a}}{\zeta(a)}, + + where :math:`\zeta` is the Riemann Zeta function. + + It is named for the American linguist George Kingsley Zipf, who noted + that the frequency of any word in a sample of a language is inversely + proportional to its rank in the frequency table. + + References + ---------- + .. [1] Zipf, G. K., "Selected Studies of the Principle of Relative + Frequency in Language," Cambridge, MA: Harvard Univ. Press, + 1932. + + Examples + -------- + Draw samples from the distribution: + + >>> import mars.tensor as mt + + >>> a = 2. # parameter + >>> s = mt.random.zipf(a, 1000) + + Display the histogram of the samples, along with + the probability density function: + + >>> import matplotlib.pyplot as plt + >>> from scipy import special + + Truncate s values at 50 so plot is interesting: + + >>> count, bins, ignored = plt.hist(s[s<50].execute(), 50, normed=True) + >>> x = mt.arange(1., 50.) 
+ >>> y = x**(-a) / special.zetac(a) + >>> plt.plot(x.execute(), (y/mt.max(y)).execute(), linewidth=2, color='r') + >>> plt.show() + """ + if dtype is None: + dtype = np.random.RandomState().zipf(handle_array(a), size=(0,)).dtype + + size = random_state._handle_size(size) + seed = gen_random_seeds(1, random_state.to_numpy())[0] + op = TensorZipf(size=size, seed=seed, gpu=gpu, dtype=dtype) + return op(a, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/rechunk/__init__.py b/python/xorbits/_mars/tensor/rechunk/__init__.py new file mode 100644 index 000000000..1f747bf42 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .rechunk import rechunk + + +def _install(): + from ..core import Tensor, TensorData + + setattr(Tensor, "rechunk", rechunk) + setattr(TensorData, "rechunk", rechunk) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/rechunk/core.py b/python/xorbits/_mars/tensor/rechunk/core.py new file mode 100644 index 000000000..cb21458f7 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/core.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from dataclasses import dataclass +from typing import List, Tuple, Union + +import numpy as np + +from ...typing import ChunkType, TileableType +from ..utils import decide_chunk_sizes + +chunk_size_type = Union[int, Tuple[int], Tuple[Tuple[int], ...]] + + +def get_nsplits( + tileable: TileableType, new_chunk_size: chunk_size_type, itemsize: int +) -> Tuple[Tuple[int], ...]: + if isinstance(new_chunk_size, dict): + chunk_size = list(tileable.nsplits) + for idx, c in new_chunk_size.items(): + chunk_size[idx] = c + else: + chunk_size = new_chunk_size + + return decide_chunk_sizes(tileable.shape, chunk_size, itemsize) + + +@dataclass +class RechunkInfo: + out_index: Tuple[int] + shape: Tuple[int] + input_chunks: List[ChunkType] + input_slices: List[Tuple[slice]] + input_chunk_shape: List[int] + + +def gen_rechunk_infos( + inp: TileableType, chunk_size: Tuple[Tuple[int], ...] 
+) -> List[RechunkInfo]: + cum_in_nsplits = [np.cumsum(ns) for ns in inp.nsplits] + cum_out_nsplits = [np.cumsum(ns) for ns in chunk_size] + out_starts = [[0] + cum_ns[:-1].tolist() for cum_ns in cum_out_nsplits] + out_ends = cum_out_nsplits + out_start_indexes = [ + np.searchsorted(cum_ns, starts) + for cum_ns, starts in zip(cum_in_nsplits, out_starts) + ] + out_end_indexes = [ + np.searchsorted(cum_ns, ends) for cum_ns, ends in zip(cum_in_nsplits, out_ends) + ] + + chunk_index_iter = itertools.product(*(range(len(s)) for s in chunk_size)) + rechunk_infos = [] + for chunk_index in chunk_index_iter: + shape = tuple(chunk_size[dim][i] for dim, i in enumerate(chunk_index)) + inp_chunk_slices = [list() for _ in range(len(chunk_index))] + inp_chunk_indexes = [list() for _ in range(len(chunk_index))] + for dim, i in enumerate(chunk_index): + size_start = out_starts[dim][i] + size_end = out_ends[dim][i] + start_index = out_start_indexes[dim][i] + end_index = out_end_indexes[dim][i] + for inp_i in range(start_index, end_index + 1): + inp_start = cum_in_nsplits[dim][inp_i - 1] if inp_i > 0 else 0 + inp_end = cum_in_nsplits[dim][inp_i] + slice_start = max(inp_start, size_start) - inp_start + slice_end = min(inp_end, size_end) - inp_start + if slice_start == 0 and slice_end == inp_end - inp_start: + # slice all + slc = slice(None) + elif slice_start == slice_end and size_start != size_end: + continue + else: + slc = slice(slice_start, slice_end) + inp_chunk_slices[dim].append(slc) + inp_chunk_indexes[dim].append(inp_i) + + inp_chunks = [] + inp_slices = [] + rechunk_info = RechunkInfo( + out_index=chunk_index, + shape=shape, + input_chunks=inp_chunks, + input_slices=inp_slices, + input_chunk_shape=list(len(s) for s in inp_chunk_indexes), + ) + for inp_chunk_index, inp_chunk_slice in zip( + itertools.product(*inp_chunk_indexes), + itertools.product(*inp_chunk_slices), + ): + inp_chunk = inp.cix[tuple(inp_chunk_index)] + inp_chunks.append(inp_chunk) + inp_slices.append(inp_chunk_slice) + rechunk_infos.append(rechunk_info) + + return rechunk_infos diff --git a/python/xorbits/_mars/tensor/rechunk/rechunk.py b/python/xorbits/_mars/tensor/rechunk/rechunk.py new file mode 100644 index 000000000..ab25afdd8 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/rechunk.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField +from ...utils import has_unknown_shape +from ..core import Tensor +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import calc_sliced_size +from .core import chunk_size_type, gen_rechunk_infos, get_nsplits + + +class TensorRechunk(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.RECHUNK + + chunk_size = AnyField("chunk_size") + + def __call__(self, tensor: Tensor): + return self.new_tensor([tensor], tensor.shape, order=tensor.order) + + @classmethod + def tile(cls, op: "TensorRechunk"): + from ..indexing.slice import TensorSlice + from ..merge.concatenate import TensorConcatenate + + if has_unknown_shape(*op.inputs): + yield + + out = op.outputs[0] + tensor = astensor(op.inputs[0]) + chunk_size = get_nsplits(tensor, op.chunk_size, tensor.dtype.itemsize) + if chunk_size == tensor.nsplits: + return [tensor] + + rechunk_infos = gen_rechunk_infos(tensor, chunk_size) + out_chunks = [] + for rechunk_info in rechunk_infos: + chunk_index = rechunk_info.out_index + shape = rechunk_info.shape + inp_chunks = rechunk_info.input_chunks + inp_chunk_slices = rechunk_info.input_slices + inp_slice_chunks = [] + for inp_chunk, inp_chunk_slice in zip(inp_chunks, inp_chunk_slices): + if all(slc == slice(None) for slc in inp_chunk_slice): + inp_slice_chunks.append(inp_chunk) + else: + slc_chunk = TensorSlice(slices=list(inp_chunk_slice)).new_chunk( + [inp_chunk], + dtype=inp_chunk.dtype, + shape=tuple( + calc_sliced_size(s, slc) + for s, slc in zip(inp_chunk.shape, inp_chunk_slice) + ), + index=inp_chunk.index, + ) + inp_slice_chunks.append(slc_chunk) + + if len(inp_slice_chunks) > 1 or inp_slice_chunks[0].index != chunk_index: + chunk_op = TensorConcatenate() + out_chunk = chunk_op.new_chunk( + inp_slice_chunks, + shape=shape, + index=chunk_index, + dtype=out.dtype, + order=out.order, + ) + out_chunks.append(out_chunk) + else: + out_chunks.append(inp_slice_chunks[0]) + + new_op = op.copy() + params = out.params + params["nsplits"] = chunk_size + params["chunks"] = out_chunks + tensor = new_op.new_tileable(op.inputs, kws=[params]) + + if op.reassign_worker: + for c in tensor.chunks: + c.op.reassign_worker = True + + return [tensor] + + +def rechunk( + tensor: Tensor, chunk_size: chunk_size_type, reassign_worker=False +) -> Tensor: + if not any(np.isnan(s) for s in tensor.shape) and not tensor.is_coarse(): + if not has_unknown_shape(tensor): + # do client check only when tensor has no unknown shape, + # otherwise, recalculate chunk_size in `tile` + chunk_size = get_nsplits(tensor, chunk_size, tensor.dtype.itemsize) + if chunk_size == tensor.nsplits: + return tensor + + op = TensorRechunk( + chunk_size=chunk_size, + reassign_worker=reassign_worker, + dtype=tensor.dtype, + sparse=tensor.issparse(), + ) + return op(tensor) diff --git a/python/xorbits/_mars/tensor/rechunk/tests/__init__.py b/python/xorbits/_mars/tensor/rechunk/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/rechunk/tests/test_rechunk.py b/python/xorbits/_mars/tensor/rechunk/tests/test_rechunk.py new file mode 100644 index 000000000..88c985267 --- /dev/null +++ b/python/xorbits/_mars/tensor/rechunk/tests/test_rechunk.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from .... import tensor as mt + +# dense +raw = np.random.RandomState(0).rand(12, 9) +raw2 = raw.copy() +raw2.ravel()[::2] = 0 +# dense, F-order +raw3 = np.asfortranarray(raw) +# sparse +raw_s = sps.csr_matrix(raw2) + + +@pytest.mark.parametrize("data", [raw, raw3, raw_s]) +@pytest.mark.parametrize("chunk_size", [3, (12, 9), (4, 8)]) +def test_rechunk_execute(setup, data, chunk_size): + tensor = mt.tensor(data, chunk_size=4) + new_tensor = tensor.rechunk(chunk_size) + result = new_tensor.execute().fetch() + if hasattr(result, "toarray"): + # sparse + result = result.toarray() + data = data.toarray() + assert result.flags["C_CONTIGUOUS"] == data.flags["C_CONTIGUOUS"] + np.testing.assert_allclose(result, data) diff --git a/python/xorbits/_mars/tensor/reduction/__init__.py b/python/xorbits/_mars/tensor/reduction/__init__.py new file mode 100644 index 000000000..9dd96d7c7 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/__init__.py @@ -0,0 +1,64 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
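(Editorial note, not part of the diff: a minimal usage sketch for the rechunk operand added above, mirroring the test case; it assumes a default local session is created implicitly on execute(), as in the docstring examples elsewhere in this diff.)

import numpy as np
from xorbits._mars import tensor as mt

data = np.random.RandomState(0).rand(12, 9)
t = mt.tensor(data, chunk_size=4)     # initially split into 4x4 chunks
r = t.rechunk((6, 9))                 # re-partition into two 6x9 chunks
np.testing.assert_allclose(r.execute().fetch(), data)  # same values, new chunking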
+ +from .all import TensorAll, all +from .allclose import allclose +from .any import TensorAny, any +from .argmax import TensorArgmax, argmax +from .argmin import TensorArgmin, argmin +from .array_equal import array_equal +from .count_nonzero import TensorCountNonzero, count_nonzero +from .cumprod import TensorCumprod, cumprod +from .cumsum import TensorCumsum, cumsum +from .max import TensorMax, max +from .mean import TensorMean, mean +from .min import TensorMin, min +from .nanargmax import TensorNanArgmax, nanargmax +from .nanargmin import TensorNanArgmin, nanargmin +from .nancumprod import TensorNanCumprod, nancumprod +from .nancumsum import TensorNanCumsum, nancumsum +from .nanmax import TensorNanMax, nanmax +from .nanmean import TensorNanMean, nanmean +from .nanmin import TensorNanMin, nanmin +from .nanprod import TensorNanProd, nanprod +from .nanstd import nanstd +from .nansum import TensorNanSum, nansum +from .nanvar import TensorNanMoment, TensorNanVar, nanvar +from .prod import TensorProd, prod +from .std import std +from .sum import TensorSum, sum +from .var import TensorMoment, TensorVar, var + + +def _install(): + from ..core import Tensor, TensorData + + for cls in (Tensor, TensorData): + setattr(cls, "sum", sum) + setattr(cls, "prod", prod) + setattr(cls, "max", max) + setattr(cls, "min", min) + setattr(cls, "all", all) + setattr(cls, "any", any) + setattr(cls, "mean", mean) + setattr(cls, "argmax", argmax) + setattr(cls, "argmin", argmin) + setattr(cls, "cumsum", cumsum) + setattr(cls, "cumprod", cumprod) + setattr(cls, "var", var) + setattr(cls, "std", std) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/reduction/all.py b/python/xorbits/_mars/tensor/reduction/all.py new file mode 100644 index 000000000..1005b0c63 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/all.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorAll(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.ALL + _func_name = "all" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def all(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Test whether all array elements along a given axis evaluate to True. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + axis : None or int or tuple of ints, optional + Axis or axes along which a logical AND reduction is performed. + The default (`axis` = `None`) is to perform a logical AND over all + the dimensions of the input array. `axis` may be negative, in + which case it counts from the last to the first axis. 
+ + If this is a tuple of ints, a reduction is performed on multiple + axes, instead of a single axis or all the axes as before. + out : Tensor, optional + Alternate output tensor in which to place the result. + It must have the same shape as the expected output and its + type is preserved (e.g., if ``dtype(out)`` is float, the result + will consist of 0.0's and 1.0's). See `doc.ufuncs` (Section + "Output arguments") for more details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `all` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + all : Tensor, bool + A new boolean or tensor is returned unless `out` is specified, + in which case a reference to `out` is returned. + + See Also + -------- + Tensor.all : equivalent method + + any : Test whether any element along a given axis evaluates to True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to `True` because these are not equal to zero. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.all([[True,False],[True,True]]).execute() + False + + >>> mt.all([[True,False],[True,True]], axis=0).execute() + array([ True, False]) + + >>> mt.all([-1, 4, 5]).execute() + True + + >>> mt.all([1.0, mt.nan]).execute() + True + + """ + a = astensor(a) + if a.dtype == object: + dtype = a.dtype + else: + dtype = np.dtype(bool) + op = TensorAll(axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/allclose.py b/python/xorbits/_mars/tensor/reduction/allclose.py new file mode 100644 index 000000000..c8f1db1ff --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/allclose.py @@ -0,0 +1,86 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): + """ + Returns True if two tensors are element-wise equal within a tolerance. + + The tolerance values are positive, typically very small numbers. The + relative difference (`rtol` * abs(`b`)) and the absolute difference + `atol` are added together to compare against the absolute difference + between `a` and `b`. + + If either array contains one or more NaNs, False is returned. + Infs are treated as equal if they are in the same place and of the same + sign in both tensors. + + Parameters + ---------- + a, b : array_like + Input tensors to compare. + rtol : float + The relative tolerance parameter (see Notes). + atol : float + The absolute tolerance parameter (see Notes). 
+ equal_nan : bool + Whether to compare NaN's as equal. If True, NaN's in `a` will be + considered equal to NaN's in `b` in the output tensor. + + Returns + ------- + allclose : bool + Returns True if the two tensors are equal within the given + tolerance; False otherwise. + + See Also + -------- + isclose, all, any, equal + + Notes + ----- + If the following equation is element-wise True, then allclose returns + True. + + absolute(`a` - `b`) <= (`atol` + `rtol` * absolute(`b`)) + + The above equation is not symmetric in `a` and `b`, so that + ``allclose(a, b)`` might be different from ``allclose(b, a)`` in + some rare cases. + + The comparison of `a` and `b` uses standard broadcasting, which + means that `a` and `b` need not have the same shape in order for + ``allclose(a, b)`` to evaluate to True. The same is true for + `equal` but not `array_equal`. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.allclose([1e10,1e-7], [1.00001e10,1e-8]).execute() + False + >>> mt.allclose([1e10,1e-8], [1.00001e10,1e-9]).execute() + True + >>> mt.allclose([1e10,1e-8], [1.0001e10,1e-9]).execute() + False + >>> mt.allclose([1.0, mt.nan], [1.0, mt.nan]).execute() + False + >>> mt.allclose([1.0, mt.nan], [1.0, mt.nan], equal_nan=True).execute() + True + + """ + from ..arithmetic.isclose import isclose + from .all import all + + return all(isclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)) diff --git a/python/xorbits/_mars/tensor/reduction/any.py b/python/xorbits/_mars/tensor/reduction/any.py new file mode 100644 index 000000000..658db4627 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/any.py @@ -0,0 +1,115 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorAny(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.ANY + _func_name = "any" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def any(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Test whether any tensor element along a given axis evaluates to True. + + Returns single boolean unless `axis` is not ``None`` + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to an array. + axis : None or int or tuple of ints, optional + Axis or axes along which a logical OR reduction is performed. + The default (`axis` = `None`) is to perform a logical OR over all + the dimensions of the input array. `axis` may be negative, in + which case it counts from the last to the first axis. + + If this is a tuple of ints, a reduction is performed on multiple + axes, instead of a single axis or all the axes as before. 
+ out : Tensor, optional + Alternate output tensor in which to place the result. It must have + the same shape as the expected output and its type is preserved + (e.g., if it is of type float, then it will remain so, returning + 1.0 for True and 0.0 for False, regardless of the type of `a`). + See `doc.ufuncs` (Section "Output arguments") for details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `any` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + any : bool or Tensor + A new boolean or `Tensor` is returned unless `out` is specified, + in which case a reference to `out` is returned. + + See Also + -------- + Tensor.any : equivalent method + + all : Test whether all elements along a given axis evaluate to True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity evaluate + to `True` because these are not equal to zero. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.any([[True, False], [True, True]]).execute() + True + + >>> mt.any([[True, False], [False, False]], axis=0).execute() + array([ True, False]) + + >>> mt.any([-1, 0, 5]).execute() + True + + >>> mt.any(mt.nan).execute() + True + + """ + a = astensor(a) + if a.dtype == object: + dtype = a.dtype + else: + dtype = np.dtype(bool) + op = TensorAny(axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/argmax.py b/python/xorbits/_mars/tensor/reduction/argmax.py new file mode 100644 index 000000000..d01a45cb1 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/argmax.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorArgmax(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.ARGMAX + _func_name = "argmax" + _agg_func_name = "max" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def argmax(a, axis=None, out=None, combine_size=None): + """ + Returns the indices of the maximum values along an axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + By default, the index is into the flattened tensor, otherwise + along the specified axis. + out : Tensor, optional + If provided, the result will be inserted into this tensor. It should + be of the appropriate shape and dtype. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor of ints + Tensor of indices into the tensor. It has the same shape as `a.shape` + with the dimension along `axis` removed. + + See Also + -------- + Tensor.argmax, argmin + amax : The maximum value along a given axis. + unravel_index : Convert a flat index into an index tuple. + + Notes + ----- + In case of multiple occurrences of the maximum values, the indices + corresponding to the first occurrence are returned. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(6).reshape(2,3) + >>> a.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.argmax(a).execute() + 5 + >>> mt.argmax(a, axis=0).execute() + array([1, 1, 1]) + >>> mt.argmax(a, axis=1).execute() + array([2, 2]) + + Indexes of the maximal elements of a N-dimensional tensor: + + >>> ind = mt.unravel_index(mt.argmax(a, axis=None), a.shape) + >>> ind.execute() + (1, 2) + >>> a[ind].execute() # TODO(jisheng): accomplish when fancy index on tensor is supported + + >>> b = mt.arange(6) + >>> b[1] = 5 + >>> b.execute() + array([0, 5, 2, 3, 4, 5]) + >>> mt.argmax(b).execute() # Only the first occurrence is returned. + 1 + + """ + op = TensorArgmax(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/argmin.py b/python/xorbits/_mars/tensor/reduction/argmin.py new file mode 100644 index 000000000..d5ae03129 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/argmin.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorArgmin(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.ARGMIN + _func_name = "argmin" + _agg_func_name = "min" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def argmin(a, axis=None, out=None, combine_size=None): + """ + Returns the indices of the minimum values along an axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + By default, the index is into the flattened tensor, otherwise + along the specified axis. + out : Tensor, optional + If provided, the result will be inserted into this tensor. It should + be of the appropriate shape and dtype. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor of ints + Tensor of indices into the tensor. It has the same shape as `a.shape` + with the dimension along `axis` removed. + + See Also + -------- + Tensor.argmin, argmax + amin : The minimum value along a given axis. + unravel_index : Convert a flat index into an index tuple. + + Notes + ----- + In case of multiple occurrences of the minimum values, the indices + corresponding to the first occurrence are returned. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(6).reshape(2,3) + >>> a.execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.argmin(a).execute() + 0 + >>> mt.argmin(a, axis=0).execute() + array([0, 0, 0]) + >>> mt.argmin(a, axis=1).execute() + array([0, 0]) + + Indices of the minimum elements of a N-dimensional tensor: + + >>> ind = mt.unravel_index(mt.argmin(a, axis=None), a.shape) + >>> ind.execute() + (0, 0) + >>> a[ind] # TODO(jisheng): accomplish when fancy index on tensor is supported + + >>> b = mt.arange(6) + >>> b[4] = 0 + >>> b.execute() + array([0, 1, 2, 3, 0, 5]) + >>> mt.argmin(b).execute() # Only the first occurrence is returned. + 0 + + """ + op = TensorArgmin(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/array_equal.py b/python/xorbits/_mars/tensor/reduction/array_equal.py new file mode 100644 index 000000000..0d8967619 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/array_equal.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
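(Editorial note, not part of the diff: a hedged usage sketch for the argmax/argmin reductions defined above; it assumes an implicit default session as in the docstring examples.)

import numpy as np
from xorbits._mars import tensor as mt

a = mt.tensor(np.array([[3, 7, 1], [9, 2, 5]]), chunk_size=2)
print(mt.argmax(a).execute())           # 3 -> flat index of the maximum value 9
print(mt.argmin(a, axis=1).execute())   # [2 1] -> per-row positions of the minima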
+ + +def array_equal(a1, a2): + """ + True if two tensors have the same shape and elements, False otherwise. + + Parameters + ---------- + a1, a2 : array_like + Input arrays. + + Returns + ------- + b : bool + Returns True if the tensors are equal. + + See Also + -------- + allclose: Returns True if two tensors are element-wise equal within a + tolerance. + array_equiv: Returns True if input tensors are shape consistent and all + elements equal. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.array_equal([1, 2], [1, 2]).execute() + True + >>> mt.array_equal(mt.array([1, 2]), mt.array([1, 2])).execute() + True + >>> mt.array_equal([1, 2], [1, 2, 3]).execute() + False + >>> mt.array_equal([1, 2], [1, 4]).execute() + False + + """ + from ..datasource import tensor as astensor + from ..datasource.scalar import scalar + from .all import all + + try: + a1, a2 = astensor(a1), astensor(a2) + except Exception: + return scalar(False) + + if a1.shape != a2.shape: + return scalar(False) + return all(astensor(a1 == a2)) diff --git a/python/xorbits/_mars/tensor/reduction/core.py b/python/xorbits/_mars/tensor/reduction/core.py new file mode 100644 index 000000000..54b5b0a9c --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/core.py @@ -0,0 +1,659 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
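(Editorial note, not part of the diff: array_equal above demands identical shape and exact element equality, while allclose tolerates small numeric differences; a sketch assuming an implicit default session.)

from xorbits._mars import tensor as mt

x = mt.tensor([1.0, 2.0, 3.0])
y = x + 1e-9
print(mt.array_equal(x, y).execute())   # False: values differ, if only slightly
print(mt.allclose(x, y).execute())      # True: within the default rtol/atol
print(mt.array_equal(x, mt.tensor([1.0, 2.0])).execute())  # False: shapes differ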
+ +import builtins +import copy +import inspect +import itertools +import operator +from collections.abc import Iterable +from functools import reduce +from math import ceil, log + +import numpy as np + +from ...config import options +from ...core.operand import OperandStage +from ...serialization.serializables import AnyField, BoolField, Int32Field, KeyField +from ..array_utils import as_same_device, cp, device, get_array_module +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin +from ..utils import check_out_param, validate_axis + + +def numel(x, **kwargs): + xp = get_array_module(x) + return xp.sum(xp.ones_like(x), **kwargs) + + +def nannumel(x, **kwargs): + x_size = reduce(operator.mul, x.shape) + xp = get_array_module(x) + return x_size - xp.sum(xp.isnan(x), **kwargs) + + +class TensorReductionMixin(TensorOperandMixin): + __slots__ = () + + @classmethod + def _is_cum(cls): + return False + + @classmethod + def _calc_order(cls, a, out): + return out.order if out is not None else a.order + + @classmethod + def _is_sparse(cls, input_sparse, shape): + return False + + def _call(self, a, out): + a = astensor(a) + if out is not None and not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + axis = getattr(self, "axis", None) + keepdims = getattr(self, "keepdims", None) + order = self._calc_order(a, out) + + if self._is_cum(): + if axis is None: + a, axis = a.ravel(), 0 + setattr(self, "_axis", axis) + shape = a.shape + else: + axis = list(range(len(a.shape))) if axis is None else axis + if not isinstance(axis, Iterable): + axis = (validate_axis(a.ndim, axis),) + axis = set(axis) + + shape = tuple( + s if i not in axis else 1 + for i, s in enumerate(a.shape) + if keepdims or i not in axis + ) + + self.sparse = self._is_sparse(a.issparse(), shape) + t = self.new_tensor([a], shape, order=order) + + if out is None: + return t + + check_out_param(out, t, "same_kind") + out_shape, out_dtype = out.shape, out.dtype + # if `out` is specified, use out's dtype and shape + if out_shape != t.shape: + if out.ndim > t.ndim: + raise ValueError("output has too many dimensions") + raise ValueError(f"output shape should be {t.shape}, got {out_shape}") + + setattr(self, "dtype", out_dtype) + + out.data = t.data + return out + + def _new_chunks(self, inputs, kws=None, **kw): + chunks = super()._new_chunks(inputs, kws=kws, **kw) + setattr(self, "_input", getattr(self, "_inputs")[0]) + return chunks + + def _new_tileables(self, inputs, kws=None, **kw): + tensors = super()._new_tileables(inputs, kws=kws, **kw) + setattr(self, "_input", getattr(self, "_inputs")[0]) + return tensors + + def __call__(self, a, out=None): + return self._call(a, out=out) + + @staticmethod + def _reduced_shape(shape, axes): + return tuple(1 if i in axes else s for i, s in enumerate(shape)) + + @staticmethod + def _reduced_nsplits(nsplits, axes): + return tuple((1,) * len(c) if i in axes else c for i, c in enumerate(nsplits)) + + @staticmethod + def _concatenate_shape(tensor, combine_block): + return tuple( + builtins.sum(nsplit[i] for i in cb) + for nsplit, cb in zip(tensor.nsplits, combine_block) + ) + + @staticmethod + def _combine_split(ax, combine_size, chunk_shape): + if ax not in combine_size: + return tuple((i,) for i in range(chunk_shape[ax])) + else: + size = combine_size[ax] + shape = chunk_shape[ax] + index = tuple(range(shape)) + return tuple(index[i : i + size] for i in range(0, 
shape, size)) + + def _get_op_kw(self): + return None + + @classmethod + def get_axis(cls, axis): + return tuple(axis) if axis is not None else axis + + @classmethod + def get_arg_axis(cls, axis, ndim): + return None if len(axis) == ndim or ndim == 1 else axis[0] + + @classmethod + def _tree_reduction(cls, tensor, axis): + op = tensor.op + kw = getattr(op, "_get_op_kw")() or {} + keepdims = op.keepdims + combine_size = op.combine_size or options.combine_size + if isinstance(combine_size, dict): + combine_size = dict((ax, combine_size.get(ax)) for ax in axis) + else: + assert isinstance(combine_size, int) + n = builtins.max(int(combine_size ** (1.0 / (len(axis) or 1))), 2) + combine_size = dict((ax, n) for ax in axis) + + times = 1 + for i, n in enumerate(tensor.chunk_shape): + if i in combine_size and combine_size[i] != 1: + times = int(builtins.max(times, ceil(log(n, combine_size[i])))) + + for i in range(times - 1): + [tensor] = cls._partial_reduction( + tensor, axis, op.dtype, True, combine_size, OperandStage.combine + ) + + return cls._partial_reduction( + tensor, axis, op.dtype, keepdims, combine_size, OperandStage.agg, kw + ) + + @classmethod + def _partial_reduction( + cls, tensor, axis, dtype, keepdims, combine_size, stage, kw=None + ): + from ..merge.concatenate import TensorConcatenate + + kw = kw or {} + axes = sorted(combine_size.keys()) + op_type = type(tensor.op) + + combine_blocks = [ + cls._combine_split(i, combine_size, tensor.chunk_shape) + for i in range(tensor.ndim) + ] + combine_blocks_idxes = [range(len(blocks)) for blocks in combine_blocks] + + chunks = [] + for combine_block_idx, combine_block in zip( + itertools.product(*combine_blocks_idxes), itertools.product(*combine_blocks) + ): + chks = [tensor.cix[idx] for idx in itertools.product(*combine_block)] + if len(chks) > 1: + op = TensorConcatenate(axis=axes, dtype=chks[0].dtype) + chk = op.new_chunk( + chks, + shape=cls._concatenate_shape(tensor, combine_block), + order=tensor.order, + ) + else: + chk = chks[0] + shape = tuple( + s if i not in combine_size else 1 + for i, s in enumerate(chk.shape) + if keepdims or i not in combine_size + ) + agg_op = op_type( + stage=stage, axis=axis, dtype=dtype, keepdims=keepdims, **kw + ) + chunk = agg_op.new_chunk( + [chk], + shape=shape, + index=tuple( + idx + for i, idx in enumerate(combine_block_idx) + if keepdims or i not in combine_size + ), + order=tensor.order, + ) + chunks.append(chunk) + + nsplits = [ + tuple( + c.shape[i] + for c in chunks + if builtins.all(idx == 0 for j, idx in enumerate(c.index) if j != i) + ) + for i in range(len(chunks[0].shape)) + ] + shape = tuple(builtins.sum(nsplit) for nsplit in nsplits) + agg_op = op_type( + stage=stage, + axis=axis, + dtype=dtype, + keepdims=keepdims, + combine_size=combine_size, + **kw, + ) + return agg_op.new_tensors( + [tensor], shape, order=tensor.order, chunks=chunks, nsplits=nsplits + ) + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + axis = tuple(range(in_tensor.ndim)) if op.axis is None else op.axis + if isinstance(axis, int): + axis = (axis,) + axis = tuple(validate_axis(in_tensor.ndim, ax) for ax in axis) + + if len(in_tensor.chunks) == 1: + c = in_tensor.chunks[0] + new_op = op.copy().reset_key() + setattr(new_op, "_axis", axis) + shape = list(cls._reduced_shape(c.shape, axis)) + nsplits = list(cls._reduced_nsplits(in_tensor.nsplits, axis)) + chunk_index = list(c.index) + if not op.keepdims and axis: + for ax in axis: + shape[ax] = None + nsplits[ax] = None + 
chunk_index[ax] = None + shape = tuple(s for s in shape if s is not None) + nsplits = tuple(ns for ns in nsplits if ns is not None) + chunk_index = tuple(i for i in chunk_index if i is not None) + + chunks = new_op.new_chunks( + [c], shape=shape, index=chunk_index, order=out_tensor.order + ) + return op.copy().new_tensors( + op.inputs, + op.outputs[0].shape, + order=out_tensor.order, + chunks=chunks, + nsplits=nsplits, + ) + + chunks = [] + kw = getattr(op, "_get_op_kw")() or {} + for c in in_tensor.chunks: + chunk_op = type(op)( + stage=OperandStage.map, + axis=axis, + dtype=op.dtype, + keepdims=True, + combine_size=op.combine_size, + **kw, + ) + chunks.append( + chunk_op.new_chunk( + [c], + shape=cls._reduced_shape(c.shape, axis), + order=out_tensor.order, + index=c.index, + ) + ) + + new_op = op.copy() + tensor = new_op.new_tensor( + op.inputs, + cls._reduced_shape(in_tensor.shape, axis), + order=out_tensor.order, + nsplits=cls._reduced_nsplits(in_tensor.nsplits, axis), + chunks=chunks, + ) + return cls._tree_reduction(tensor, axis) + + @classmethod + def execute_agg(cls, ctx, op): + (input_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + axis = cls.get_axis(op.axis) + func_name = getattr(cls, "_func_name", None) + reduce_func = getattr(xp, func_name) + out = op.outputs[0] + with device(device_id): + if input_chunk.size == 0 and op.keepdims: + # input chunk is empty, when keepdims is True, return itself + ret = input_chunk + elif "dtype" in inspect.getfullargspec(reduce_func).args: + ret = reduce_func( + input_chunk, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + else: + ret = reduce_func(input_chunk, axis=axis, keepdims=bool(op.keepdims)) + + if hasattr(ret, "astype"): + # for non-object dtype + ret = ret.astype(op.dtype, order=out.order.value, copy=False) + ctx[out.key] = ret + + @classmethod + def execute_one_chunk(cls, ctx, op): + cls.execute_agg(ctx, op) + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + return cls.execute_map(ctx, op) + elif op.stage == OperandStage.combine: + return cls.execute_combine(ctx, op) + elif op.stage == OperandStage.agg: + return cls.execute_agg(ctx, op) + else: + return cls.execute_one_chunk(ctx, op) + + +class TensorArgReductionMixin(TensorReductionMixin): + __slots__ = () + + @staticmethod + def _get_arg_axis(axis, ndim): + if axis is None: + axis = tuple(range(ndim)) + ravel = True + elif isinstance(axis, int): + axis = validate_axis(ndim, axis) + axis = (axis,) + ravel = ndim == 1 + else: + raise TypeError(f"axis must be either `None` or int, got '{axis}'") + return axis, ravel + + @staticmethod + def _get_offset(tensor, axis, chunk, ravel): + nsplits = tensor.nsplits + offset = tuple( + builtins.sum(split[:idx]) for split, idx in zip(nsplits, chunk.index) + ) + if not ravel: + offset = offset[axis[0]] + return offset + + @classmethod + def _calc_order(cls, a, out): + return out.order if out is not None else TensorOrder.C_ORDER + + @classmethod + def tile(cls, op): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + axis, ravel = cls._get_arg_axis(op.axis, in_tensor.ndim) + + chunks = [] + for c in in_tensor.chunks: + offset = cls._get_offset(in_tensor, axis, c, ravel) + chunk_op = type(op)( + stage=OperandStage.map, + axis=axis, + dtype=op.dtype, + offset=offset, + total_shape=in_tensor.shape, + combine_size=op.combine_size, + ) + chunk = chunk_op.new_chunk( + [c], + shape=cls._reduced_shape(c.shape, axis), + index=c.index, + 
order=out_tensor.order, + ) + chunks.append(chunk) + new_op = op.copy() + tensor = new_op.new_tensor( + op.inputs, + cls._reduced_shape(in_tensor.shape, axis), + order=out_tensor.order, + nsplits=cls._reduced_nsplits(in_tensor.nsplits, axis), + chunks=chunks, + ) + return cls._tree_reduction(tensor, axis) + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + (vals, arg), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + arg_func = getattr(xp, func_name) + + with device(device_id): + if xp.any(xp.isnan(vals)) and "nan" in func_name: + raise ValueError("All NaN slice encountered") + if axis is None: + local_args = arg_func(vals, axis=axis) + arg = arg.ravel()[local_args] + else: + local_args = arg_func(vals, axis=axis) + inds = np.ogrid[tuple(map(slice, local_args.shape))] + if xp != np: + inds = [xp.asarray(it) for it in inds] + inds.insert(axis, local_args) + arg = arg[tuple(inds)] + ctx[op.outputs[0].key] = arg + + @classmethod + def execute_map(cls, ctx, op): + arg_axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + agg_func_name = getattr(cls, "_agg_func_name") + arg_func = getattr(xp, func_name) + agg_func_name = getattr(xp, agg_func_name) + + offset = op.offset + chunk = op.outputs[0] + with device(device_id): + vals = agg_func_name(in_chunk, axis=arg_axis) + if hasattr(vals, "reshape"): + vals = vals.reshape(chunk.shape) + try: + arg = arg_func(in_chunk, axis=arg_axis) + if hasattr(arg, "reshape"): + arg = arg.reshape(chunk.shape) + except ValueError: + # handle all NaN + arg = arg_func( + xp.where(xp.isnan(in_chunk), np.inf, in_chunk), axis=arg_axis + ).reshape(chunk.shape) + + if arg_axis is None: + if xp == cp: + # we need to copy to do cpu computation, then copy back to gpu + # cuz unravel_index and ravel_multi_index are not implemented in cupy + in_chunk = in_chunk.get() + + total_shape = op.total_shape + ind = np.unravel_index(arg.ravel()[0], in_chunk.shape) + total_ind = tuple(o + i for (o, i) in zip(offset, ind)) + res = np.ravel_multi_index(total_ind, total_shape) + + if xp == cp: + # copy back + with xp.cuda.Device(in_chunk.device.id): + arg[:] = xp.asarray(res) + else: + arg[:] = res + else: + arg += offset + ctx[op.outputs[0].key] = (vals, arg) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + (vals, arg), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + arg_func = getattr(xp, func_name) + with device(device_id): + if axis is None: + local_args = arg_func(vals, axis=axis).reshape(op.outputs[0].shape) + vals = vals.ravel()[local_args] + arg = arg.ravel()[local_args] + else: + local_args = arg_func(vals, axis=axis) + inds = np.ogrid[tuple(map(slice, local_args.shape))] + if xp != np: + inds = [xp.asarray(it) for it in inds] + inds.insert(axis, local_args) + inds_tuple = tuple(inds) + vals = vals[inds_tuple].reshape(op.outputs[0].shape) + arg = arg[inds_tuple].reshape(op.outputs[0].shape) + ctx[op.outputs[0].key] = (vals, arg) + + +class TensorCumReductionMixin(TensorReductionMixin): + __slots__ = () + + @classmethod + def _is_cum(cls): + return True + + @staticmethod + def _get_op_types(): + raise NotImplementedError + + 
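+    # A rough sketch of what ``tile`` below does (illustrative NumPy analogy
+    # only, not code the operator executes): each chunk first applies the
+    # cumulative op locally, then for a chunk at index ``i`` along ``axis`` the
+    # trailing slice of every preceding chunk is folded in with the tree-wise
+    # binary op returned by ``_get_op_types``. For cumsum over two chunks:
+    #
+    #     import numpy as np
+    #     a, b = np.array([1, 2, 3]), np.array([4, 5, 6])  # two chunks
+    #     local_a, local_b = np.cumsum(a), np.cumsum(b)    # per-chunk pass
+    #     combined_b = local_b + local_a[-1:]              # add last slice of chunk 0
+    #     assert (np.concatenate([local_a, combined_b])
+    #             == np.cumsum(np.concatenate([a, b]))).all()
+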
@classmethod + def tile(cls, op): + from ..indexing.slice import TensorSlice + + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + axis = op.axis + if not isinstance(axis, int): + raise ValueError("axis must be a integer") + axis = validate_axis(in_tensor.ndim, axis) + if axis is None: + raise NotImplementedError + + op_type, bin_op_type = getattr(op, "_get_op_types")() + + chunks = [] + for c in in_tensor.chunks: + chunk_op = op_type(axis=op.axis, dtype=op.dtype) + chunks.append( + chunk_op.new_chunk( + [c], shape=c.shape, index=c.index, order=out_tensor.order + ) + ) + inter_tensor = copy.copy(in_tensor) + inter_tensor._chunks = chunks + + slc = [ + slice(None) if i != axis else slice(-1, None) for i in range(in_tensor.ndim) + ] + + output_chunks = [] + for chunk in chunks: + if chunk.index[axis] == 0: + output_chunks.append(chunk) + continue + + to_cum_chunks = [] + for i in range(chunk.index[axis]): + to_cum_index = chunk.index[:axis] + (i,) + chunk.index[axis + 1 :] + shape = chunk.shape[:axis] + (1,) + chunk.shape[axis + 1 :] + to_cum_chunk = inter_tensor.cix[to_cum_index] + slice_op = TensorSlice(slices=slc, dtype=chunk.dtype) + sliced_chunk = slice_op.new_chunk( + [to_cum_chunk], + shape=shape, + index=to_cum_index, + order=out_tensor.order, + ) + to_cum_chunks.append(sliced_chunk) + to_cum_chunks.append(chunk) + + # GH#3132: some chunks of to_cum_chunks may be empty, + # so we tell tree_add&tree_multiply to ignore them + bin_op = bin_op_type( + args=to_cum_chunks, dtype=chunk.dtype, ignore_empty_input=True + ) + output_chunk = bin_op.new_chunk( + to_cum_chunks, + shape=chunk.shape, + index=chunk.index, + order=out_tensor.order, + ) + output_chunks.append(output_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + in_tensor.shape, + order=out_tensor.order, + chunks=output_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + func_name = getattr(cls, "_func_name") + cum_func = getattr(xp, func_name) + if xp != np: + func = getattr(xp, cum_func.__name__) + else: + func = cum_func + + with device(device_id): + ctx[op.outputs[0].key] = func(x, axis=op.axis, dtype=op.dtype) + + +class TensorReduction(TensorHasInput): + _input = KeyField("input") + _out = KeyField("out") + _axis = AnyField("axis") # can be None or int or tuple of ints, just infer the data + _keepdims = BoolField("keepdims") + _combine_size = AnyField("combine_size") + + @property + def axis(self): + return getattr(self, "_axis", None) + + @property + def keepdims(self): + return getattr(self, "_keepdims", None) + + @property + def combine_size(self): + return getattr(self, "_combine_size", None) + + def _rewrite_stage(self, stage): + if stage == OperandStage.map and not hasattr(self, "execute_map"): + return OperandStage.agg + elif stage == OperandStage.combine and not hasattr(self, "execute_combine"): + return OperandStage.agg + return stage + + +class TensorCumReduction(TensorHasInput): + _input = KeyField("input") + _axis = Int32Field("axis") + + @property + def axis(self): + return getattr(self, "_axis", None) diff --git a/python/xorbits/_mars/tensor/reduction/count_nonzero.py b/python/xorbits/_mars/tensor/reduction/count_nonzero.py new file mode 100644 index 000000000..de04b084f --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/count_nonzero.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from .core import TensorReduction, TensorReductionMixin +from .sum import TensorSum + + +class TensorCountNonzero(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.COUNT_NONZERO + + def __init__( + self, axis=None, dtype=None, keepdims=None, combine_size=None, stage=None, **kw + ): + if dtype is None: + dtype = np.dtype(np.intp) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + dtype=dtype, + stage=stage, + **kw + ) + + @classmethod + def execute_map(cls, ctx, op): + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], op.device, ret_extra=True + ) + + axis = cls.get_arg_axis(op.axis, op.inputs[0].ndim) + keepdims = op.keepdims + with device(device_id): + nz = xp.count_nonzero(x, axis=axis) + if keepdims: + slcs = [slice(None)] * op.inputs[0].ndim + for ax in op.axis: + slcs[ax] = np.newaxis + nz = xp.asarray(nz)[tuple(slcs)] + + ctx[op.outputs[0].key] = nz + + @classmethod + def execute_agg(cls, ctx, op): + return TensorSum.execute_agg(ctx, op) + + @classmethod + def execute_one_chunk(cls, ctx, op): + a = ctx[op.inputs[0].key] + (inp,), device_id, xp = as_same_device([a], device=op.device, ret_extra=True) + with device(device_id): + ctx[op.outputs[0].key] = xp.count_nonzero(inp, axis=op.axis) + + +def count_nonzero(a, axis=None, combine_size=None): + """ + Counts the number of non-zero values in the tensor ``a``. + + The word "non-zero" is in reference to the Python 2.x + built-in method ``__nonzero__()`` (renamed ``__bool__()`` + in Python 3.x) of Python objects that tests an object's + "truthfulness". For example, any number is considered + truthful if it is nonzero, whereas any string is considered + truthful if it is not the empty string. Thus, this function + (recursively) counts how many elements in ``a`` (and in + sub-tensors thereof) have their ``__nonzero__()`` or ``__bool__()`` + method evaluated to ``True``. + + Parameters + ---------- + a : array_like + The tensor for which to count non-zeros. + axis : int or tuple, optional + Axis or tuple of axes along which to count non-zeros. + Default is None, meaning that non-zeros will be counted + along a flattened version of ``a``. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + count : int or tensor of int + Number of non-zero values in the array along a given axis. + Otherwise, the total number of non-zero values in the tensor + is returned. + + See Also + -------- + nonzero : Return the coordinates of all the non-zero values. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.count_nonzero(mt.eye(4)).execute() + 4 + >>> mt.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]]).execute() + 5 + >>> mt.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=0).execute() + array([1, 1, 1, 1, 1]) + >>> mt.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1).execute() + array([2, 3]) + + """ + op = TensorCountNonzero( + axis=axis, dtype=np.dtype(np.int_), keepdims=None, combine_size=combine_size + ) + return op(a) diff --git a/python/xorbits/_mars/tensor/reduction/cumprod.py b/python/xorbits/_mars/tensor/reduction/cumprod.py new file mode 100644 index 000000000..d2a5072bd --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/cumprod.py @@ -0,0 +1,101 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.multiply import TensorTreeMultiply +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorCumprod(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.CUMPROD + _func_name = "cumprod" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorCumprod, TensorTreeMultiply + + +def cumprod(a, axis=None, dtype=None, out=None): + """ + Return the cumulative product of elements along a given axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative product is computed. By default + the input is flattened. + dtype : dtype, optional + Type of the returned tensor, as well as of the accumulator in which + the elements are multiplied. If *dtype* is not specified, it + defaults to the dtype of `a`, unless `a` has an integer dtype with + a precision less than that of the default platform integer. In + that case, the default platform integer is used instead. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type of the resulting values will be cast if necessary. + + Returns + ------- + cumprod : Tensor + A new tensor holding the result is returned unless `out` is + specified, in which case a reference to out is returned. + + See Also + -------- + numpy.doc.ufuncs : Section "Output arguments" + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([1,2,3]) + >>> mt.cumprod(a).execute() # intermediate results 1, 1*2 + ... 
# total product 1*2*3 = 6 + array([1, 2, 6]) + >>> a = mt.array([[1, 2, 3], [4, 5, 6]]) + >>> mt.cumprod(a, dtype=float).execute() # specify type of output + array([ 1., 2., 6., 24., 120., 720.]) + + The cumulative product for each column (i.e., over the rows) of `a`: + + >>> mt.cumprod(a, axis=0).execute() + array([[ 1, 2, 3], + [ 4, 10, 18]]) + + The cumulative product for each row (i.e. over the columns) of `a`: + + >>> mt.cumprod(a,axis=1).execute() + array([[ 1, 2, 6], + [ 4, 20, 120]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.empty((1,), dtype=a.dtype).cumprod().dtype + op = TensorCumprod(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/cumsum.py b/python/xorbits/_mars/tensor/reduction/cumsum.py new file mode 100644 index 000000000..f7fb4d0f0 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/cumsum.py @@ -0,0 +1,105 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.add import TensorTreeAdd +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorCumsum(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.CUMSUM + _func_name = "cumsum" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorCumsum, TensorTreeAdd + + +def cumsum(a, axis=None, dtype=None, out=None): + """ + Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative sum is computed. The default + (None) is to compute the cumsum over the flattened tensor. + dtype : dtype, optional + Type of the returned tensor and of the accumulator in which the + elements are summed. If `dtype` is not specified, it defaults + to the dtype of `a`, unless `a` has an integer dtype with a + precision less than that of the default platform integer. In + that case, the default platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type will be cast if necessary. See `doc.ufuncs` + (Section "Output arguments") for more details. + + Returns + ------- + cumsum_along_axis : Tensor. + A new tensor holding the result is returned unless `out` is + specified, in which case a reference to `out` is returned. The + result has the same size as `a`, and the same shape as `a` if + `axis` is not None or `a` is a 1-d tensor. + + + See Also + -------- + sum : Sum tensor elements. + + trapz : Integration of tensor values using the composite trapezoidal rule. + + diff : Calculate the n-th discrete difference along given axis. + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. 
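+
+    When the input tensor is split into chunks, the cumulative sum is first
+    computed within each chunk, and the trailing value of every preceding
+    chunk along ``axis`` is then added in, so the result matches a cumulative
+    sum over the whole tensor.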
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1,2,3], [4,5,6]]) + >>> a.execute() + array([[1, 2, 3], + [4, 5, 6]]) + >>> mt.cumsum(a).execute() + array([ 1, 3, 6, 10, 15, 21]) + >>> mt.cumsum(a, dtype=float).execute() # specifies type of output value(s) + array([ 1., 3., 6., 10., 15., 21.]) + + >>> mt.cumsum(a,axis=0).execute() # sum over rows for each of the 3 columns + array([[1, 2, 3], + [5, 7, 9]]) + >>> mt.cumsum(a,axis=1).execute() # sum over columns for each of the 2 rows + array([[ 1, 3, 6], + [ 4, 9, 15]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.empty((1,), dtype=a.dtype).cumsum().dtype + op = TensorCumsum(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/max.py b/python/xorbits/_mars/tensor/reduction/max.py new file mode 100644 index 000000000..b7de28f6b --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/max.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorMax(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MAX + _func_name = "max" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def _is_sparse(cls, input_sparse, shape): + if input_sparse and len(shape) > 0: + return True + return False + + +def max(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return the maximum of an array or maximum along an axis. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Axis or axes along which to operate. By default, flattened input is + used. + + If this is a tuple of ints, the maximum is selected over multiple axes, + instead of a single axis or all the axes as before. + out : Tensor, optional + Alternative output tensor in which to place the result. Must + be of the same shape and buffer length as the expected output. + See `doc.ufuncs` (Section "Output arguments") for more details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `amax` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + amax : Tensor or scalar + Maximum of `a`. If `axis` is None, the result is a scalar value. 
+ If `axis` is given, the result is a tensor of dimension + ``a.ndim - 1``. + + See Also + -------- + amin : + The minimum value of a tensor along a given axis, propagating any NaNs. + nanmax : + The maximum value of a tensor along a given axis, ignoring any NaNs. + maximum : + Element-wise maximum of two tensors, propagating any NaNs. + fmax : + Element-wise maximum of two tensors, ignoring any NaNs. + argmax : + Return the indices of the maximum values. + + nanmin, minimum, fmin + + Notes + ----- + NaN values are propagated, that is if at least one item is NaN, the + corresponding max value will be NaN as well. To ignore NaN values + (MATLAB behavior), please use nanmax. + + Don't use `amax` for element-wise comparison of 2 arrays; when + ``a.shape[0]`` is 2, ``maximum(a[0], a[1])`` is faster than + ``amax(a, axis=0)``. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(4).reshape((2,2)) + >>> a.execute() + array([[0, 1], + [2, 3]]) + >>> mt.amax(a).execute() # Maximum of the flattened array + 3 + >>> mt.amax(a, axis=0).execute() # Maxima along the first axis + array([2, 3]) + >>> mt.amax(a, axis=1).execute() # Maxima along the second axis + array([1, 3]) + + >>> b = mt.arange(5, dtype=float) + >>> b[2] = mt.NaN + >>> mt.amax(b).execute() + nan + >>> mt.nanmax(b).execute() + 4.0 + + """ + a = astensor(a) + op = TensorMax( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/mean.py b/python/xorbits/_mars/tensor/reduction/mean.py new file mode 100644 index 000000000..0c4c941c9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/mean.py @@ -0,0 +1,199 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, numel + + +class TensorMean(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MEAN + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + + a = ctx[op.inputs[0].key] + if not isinstance(a, (list, tuple)): + (inp,), device_id, xp = as_same_device( + [a], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.mean( + inp, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + else: + (_data, _count), device_id, xp = as_same_device( + a, device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = xp.true_divide( + chunk_sum, chunk_count, dtype=op.dtype + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + + with device(device_id): + chunk_count = numel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + in_chunk, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_axis(op.axis) + (_data, _count), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count) + + +def mean(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Compute the arithmetic mean along the specified axis. + + Returns the average of the array elements. The average is taken over + the flattened tensor by default, otherwise over the specified axis. + `float64` intermediate and return values are used for integer inputs. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose mean is desired. If `a` is not an + tensor, a conversion is attempted. + axis : None or int or tuple of ints, optional + Axis or axes along which the means are computed. The default is to + compute the mean of the flattened array. + + If this is a tuple of ints, a mean is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default + is `float64`; for floating point inputs, it is the same as the + input dtype. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. + See `doc.ufuncs` for details. 
+ + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `mean` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + m : Tensor, see dtype parameter above + If `out=None`, returns a new tensor containing the mean values, + otherwise a reference to the output array is returned. + + See Also + -------- + average : Weighted average + std, var, nanmean, nanstd, nanvar + + Notes + ----- + The arithmetic mean is the sum of the elements along the axis divided + by the number of elements. + + Note that for floating-point input, the mean is computed using the + same precision the input has. Depending on the input data, this can + cause the results to be inaccurate, especially for `float32` (see + example below). Specifying a higher-precision accumulator using the + `dtype` keyword can alleviate this issue. + + By default, `float16` results are computed using `float32` intermediates + for extra precision. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> mt.mean(a).execute() + 2.5 + >>> mt.mean(a, axis=0).execute() + array([ 2., 3.]) + >>> mt.mean(a, axis=1).execute() + array([ 1.5, 3.5]) + + In single precision, `mean` can be inaccurate: + + >>> a = mt.zeros((2, 512*512), dtype=mt.float32) + >>> a[0, :] = 1.0 + >>> a[1, :] = 0.1 + >>> mt.mean(a).execute() + 0.54999924 + + Computing the mean in float64 is more accurate: + + >>> mt.mean(a, dtype=mt.float64).execute() + 0.55000000074505806 + + """ + a = astensor(a) + if dtype is None: + dtype = np.mean(np.empty((1,), dtype=a.dtype)).dtype + op = TensorMean( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/min.py b/python/xorbits/_mars/tensor/reduction/min.py new file mode 100644 index 000000000..5c4ef9c0a --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/min.py @@ -0,0 +1,132 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorMin(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MIN + _func_name = "min" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def _is_sparse(cls, input_sparse, shape): + if input_sparse and len(shape) > 0: + return True + return False + + +def min(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return the minimum of a tensor or minimum along an axis. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Axis or axes along which to operate. By default, flattened input is + used. + + If this is a tuple of ints, the minimum is selected over multiple axes, + instead of a single axis or all the axes as before. + out : Tensor, optional + Alternative output tensor in which to place the result. Must + be of the same shape and buffer length as the expected output. + See `doc.ufuncs` (Section "Output arguments") for more details. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `amin` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + amin : Tensor or scalar + Minimum of `a`. If `axis` is None, the result is a scalar value. + If `axis` is given, the result is an array of dimension + ``a.ndim - 1``. + + See Also + -------- + amax : + The maximum value of a tensor along a given axis, propagating any NaNs. + nanmin : + The minimum value of a tensor along a given axis, ignoring any NaNs. + minimum : + Element-wise minimum of two tensors, propagating any NaNs. + fmin : + Element-wise minimum of two tensors, ignoring any NaNs. + argmin : + Return the indices of the minimum values. + + nanmax, maximum, fmax + + Notes + ----- + NaN values are propagated, that is if at least one item is NaN, the + corresponding min value will be NaN as well. To ignore NaN values + (MATLAB behavior), please use nanmin. + + Don't use `amin` for element-wise comparison of 2 tensors; when + ``a.shape[0]`` is 2, ``minimum(a[0], a[1])`` is faster than + ``amin(a, axis=0)``. 
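+
+    When the input is split into multiple chunks, the minimum is computed per
+    chunk and the partial results are merged in a reduction tree;
+    ``combine_size`` controls how many partial results are merged at each step.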
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.arange(4).reshape((2,2)) + >>> a.execute() + array([[0, 1], + [2, 3]]) + >>> mt.amin(a).execute() # Minimum of the flattened array + 0 + >>> mt.amin(a, axis=0).execute() # Minima along the first axis + array([0, 1]) + >>> mt.amin(a, axis=1).execute() # Minima along the second axis + array([0, 2]) + + >>> b = mt.arange(5, dtype=float) + >>> b[2] = mt.NaN + >>> mt.amin(b).execute() + nan + >>> mt.nanmin(b).execute() + 0.0 + + """ + a = astensor(a) + op = TensorMin( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanargmax.py b/python/xorbits/_mars/tensor/reduction/nanargmax.py new file mode 100644 index 000000000..df8e9a673 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanargmax.py @@ -0,0 +1,108 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorNanArgmax(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.NANARGMAX + _func_name = "nanargmax" + _agg_func_name = "nanmax" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def nanargmax(a, axis=None, out=None, combine_size=None): + """ + Return the indices of the maximum values in the specified axis ignoring + NaNs. For all-NaN slices ``ValueError`` is raised. Warning: the + results cannot be trusted if a slice contains only NaNs and -Infs. + + + Parameters + ---------- + a : array_like + Input data. + axis : int, optional + Axis along which to operate. By default flattened input is used. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. + See `doc.ufuncs` for details. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor + An tensor of indices or a single index value. 
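+        For chunked input, per-chunk results are shifted by each chunk's
+        offset, so the returned indices always refer to positions in the
+        original tensor.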
+ + See Also + -------- + argmax, nanargmin + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[mt.nan, 4], [2, 3]]) + >>> mt.argmax(a).execute() + 0 + >>> mt.nanargmax(a).execute() + 1 + >>> mt.nanargmax(a, axis=0).execute() + array([1, 0]) + >>> mt.nanargmax(a, axis=1).execute() + array([1, 1]) + + """ + op = TensorNanArgmax(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanargmin.py b/python/xorbits/_mars/tensor/reduction/nanargmin.py new file mode 100644 index 000000000..2189d87d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanargmin.py @@ -0,0 +1,102 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import AnyField, TupleField +from .core import TensorArgReductionMixin, TensorReduction + + +class TensorNanArgmin(TensorReduction, TensorArgReductionMixin): + _op_type_ = OperandDef.NANARGMIN + _func_name = "nanargmin" + _agg_func_name = "nanmin" + + _offset = AnyField("offset") + _total_shape = TupleField("total_shape") + + def __init__( + self, + axis=None, + dtype=None, + combine_size=None, + offset=None, + total_shape=None, + stage=None, + **kw + ): + if dtype is None: + dtype = np.dtype(int) + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _combine_size=combine_size, + _offset=offset, + _total_shape=total_shape, + dtype=dtype, + stage=stage, + **kw + ) + + @property + def offset(self): + return getattr(self, "_offset", None) + + @property + def total_shape(self): + return getattr(self, "_total_shape", None) + + +def nanargmin(a, axis=None, out=None, combine_size=None): + """ + Return the indices of the minimum values in the specified axis ignoring + NaNs. For all-NaN slices ``ValueError`` is raised. Warning: the results + cannot be trusted if a slice contains only NaNs and Infs. + + Parameters + ---------- + a : array_like + Input data. + axis : int, optional + Axis along which to operate. By default flattened input is used. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + index_array : Tensor + A tensor of indices or a single index value. + + See Also + -------- + argmin, nanargmax + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[mt.nan, 4], [2, 3]]) + >>> mt.argmin(a).execute() + 0 + >>> mt.nanargmin(a).execute() + 2 + >>> mt.nanargmin(a, axis=0).execute() + array([1, 1]) + >>> mt.nanargmin(a, axis=1).execute() + array([1, 0]) + + """ + op = TensorNanArgmin(axis=axis, dtype=np.dtype(int), combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nancumprod.py b/python/xorbits/_mars/tensor/reduction/nancumprod.py new file mode 100644 index 000000000..3159ee3a5 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nancumprod.py @@ -0,0 +1,97 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.multiply import TensorTreeMultiply +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorNanCumprod(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.NANCUMPROD + _func_name = "nancumprod" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorNanCumprod, TensorTreeMultiply + + +def nancumprod(a, axis=None, dtype=None, out=None): + """ + Return the cumulative product of tensor elements over a given axis treating Not a + Numbers (NaNs) as one. The cumulative product does not change when NaNs are + encountered and leading NaNs are replaced by ones. + + Ones are returned for slices that are all-NaN or empty. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative product is computed. By default + the input is flattened. + dtype : dtype, optional + Type of the returned tensor, as well as of the accumulator in which + the elements are multiplied. If *dtype* is not specified, it + defaults to the dtype of `a`, unless `a` has an integer dtype with + a precision less than that of the default platform integer. In + that case, the default platform integer is used instead. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type of the resulting values will be cast if necessary. + + Returns + ------- + nancumprod : Tensor + A new array holding the result is returned unless `out` is + specified, in which case it is returned. + + See Also + -------- + mt.cumprod : Cumulative product across array propagating NaNs. + isnan : Show which elements are NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nancumprod(1).execute() + array([1]) + >>> mt.nancumprod([1]).execute() + array([1]) + >>> mt.nancumprod([1, mt.nan]).execute() + array([ 1., 1.]) + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nancumprod(a).execute() + array([ 1., 2., 6., 6.]) + >>> mt.nancumprod(a, axis=0).execute() + array([[ 1., 2.], + [ 3., 2.]]) + >>> mt.nancumprod(a, axis=1).execute() + array([[ 1., 2.], + [ 3., 3.]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nancumprod(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanCumprod(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nancumsum.py b/python/xorbits/_mars/tensor/reduction/nancumsum.py new file mode 100644 index 000000000..521292629 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nancumsum.py @@ -0,0 +1,100 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..arithmetic.add import TensorTreeAdd +from ..datasource import tensor as astensor +from .core import TensorCumReduction, TensorCumReductionMixin + + +class TensorNanCumsum(TensorCumReduction, TensorCumReductionMixin): + _op_type_ = OperandDef.NANCUMSUM + _func_name = "nancumsum" + + def __init__(self, axis=None, **kw): + super().__init__(_axis=axis, **kw) + + @staticmethod + def _get_op_types(): + return TensorNanCumsum, TensorTreeAdd + + +def nancumsum(a, axis=None, dtype=None, out=None): + """ + Return the cumulative sum of tensor elements over a given axis treating Not a + Numbers (NaNs) as zero. The cumulative sum does not change when NaNs are + encountered and leading NaNs are replaced by zeros. + + Zeros are returned for slices that are all-NaN or empty. + + Parameters + ---------- + a : array_like + Input tensor. + axis : int, optional + Axis along which the cumulative sum is computed. The default + (None) is to compute the cumsum over the flattened tensor. + dtype : dtype, optional + Type of the returned tensor and of the accumulator in which the + elements are summed. If `dtype` is not specified, it defaults + to the dtype of `a`, unless `a` has an integer dtype with a + precision less than that of the default platform integer. In + that case, the default platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output + but the type will be cast if necessary. See `doc.ufuncs` + (Section "Output arguments") for more details. + + Returns + ------- + nancumsum : Tensor. + A new tensor holding the result is returned unless `out` is + specified, in which it is returned. The result has the same + size as `a`, and the same shape as `a` if `axis` is not None + or `a` is a 1-d tensor. + + See Also + -------- + numpy.cumsum : Cumulative sum across tensor propagating NaNs. + isnan : Show which elements are NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nancumsum(1).execute() + array([1]) + >>> mt.nancumsum([1]).execute() + array([1]) + >>> mt.nancumsum([1, mt.nan]).execute() + array([ 1., 1.]) + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nancumsum(a).execute() + array([ 1., 3., 6., 6.]) + >>> mt.nancumsum(a, axis=0).execute() + array([[ 1., 2.], + [ 4., 2.]]) + >>> mt.nancumsum(a, axis=1).execute() + array([[ 1., 3.], + [ 3., 3.]]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nancumsum(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanCumsum(axis=axis, dtype=dtype) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanmax.py b/python/xorbits/_mars/tensor/reduction/nanmax.py new file mode 100644 index 000000000..4730e1e98 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanmax.py @@ -0,0 +1,123 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanMax(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMAX + _func_name = "nanmax" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nanmax(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return the maximum of an array or maximum along an axis, ignoring any + NaNs. When all-NaN slices are encountered a ``RuntimeWarning`` is + raised and NaN is returned for that slice. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose maximum is desired. If `a` is not a + tensor, a conversion is attempted. + axis : int, optional + Axis along which the maximum is computed. The default is to compute + the maximum of the flattened tensor. + out : ndarray, optional + Alternate output array in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If the value is anything but the default, then + `keepdims` will be passed through to the `max` method + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + nanmax : Tensor + A tensor with the same shape as `a`, with the specified axis removed. + If `a` is a 0-d tensor, or if axis is None, a Tensor scalar is + returned. The same dtype as `a` is returned. + + See Also + -------- + nanmin : + The minimum value of a tensor along a given axis, ignoring any NaNs. + amax : + The maximum value of a tensor along a given axis, propagating any NaNs. + fmax : + Element-wise maximum of two tensors, ignoring any NaNs. + maximum : + Element-wise maximum of two tensors, propagating any NaNs. + isnan : + Shows which elements are Not a Number (NaN). + isfinite: + Shows which elements are neither NaN nor infinity. + + amin, fmin, minimum + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. + + If the input has a integer type the function is equivalent to np.max. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nanmax(a).execute() + 3.0 + >>> mt.nanmax(a, axis=0).execute() + array([ 3., 2.]) + >>> mt.nanmax(a, axis=1).execute() + array([ 2., 3.]) + + When positive infinity and negative infinity are present: + + >>> mt.nanmax([1, 2, mt.nan, mt.NINF]).execute() + 2.0 + >>> mt.nanmax([1, 2, mt.nan, mt.inf]).execute() + inf + + """ + a = astensor(a) + op = TensorNanMax( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanmean.py b/python/xorbits/_mars/tensor/reduction/nanmean.py new file mode 100644 index 000000000..f9d5d6314 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanmean.py @@ -0,0 +1,171 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, nannumel +from .mean import TensorMean + + +class TensorNanMean(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMEAN + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + + with device(device_id): + chunk_count = nannumel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.nansum( + in_chunk, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count) + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + + a = ctx[op.inputs[0].key] + if not isinstance(a, (list, tuple)): + (inp,), device_id, xp = as_same_device( + [a], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.nanmean( + inp, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + else: + (_data, _count), device_id, xp = as_same_device( + a, device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=op.dtype, keepdims=bool(op.keepdims) + ) + ctx[op.outputs[0].key] = xp.true_divide( + chunk_sum, chunk_count, dtype=op.dtype + ) + + @classmethod + def execute_combine(cls, ctx, op): + TensorMean.execute_combine(ctx, op) + + +def nanmean(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Compute the arithmetic mean along the specified axis, ignoring NaNs. + + Returns the average of the tensor elements. 
The average is taken over + the flattened tensor by default, otherwise over the specified axis. + `float64` intermediate and return values are used for integer inputs. + + For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose mean is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the means are computed. The default is to compute + the mean of the flattened tensor. + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default + is `float64`; for inexact inputs, it is the same as the input + dtype. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If the value is anything but the default, then + `keepdims` will be passed through to the `mean` or `sum` methods + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + m : Tensor, see dtype parameter above + If `out=None`, returns a new array containing the mean values, + otherwise a reference to the output array is returned. Nan is + returned for slices that contain only NaNs. + + See Also + -------- + average : Weighted average + mean : Arithmetic mean taken while not ignoring NaNs + var, nanvar + + Notes + ----- + The arithmetic mean is the sum of the non-NaN elements along the axis + divided by the number of non-NaN elements. + + Note that for floating-point input, the mean is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for `float32`. Specifying a + higher-precision accumulator using the `dtype` keyword can alleviate + this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, mt.nan], [3, 4]]) + >>> mt.nanmean(a).execute() + 2.6666666666666665 + >>> mt.nanmean(a, axis=0).execute() + array([ 2., 4.]) + >>> mt.nanmean(a, axis=1).execute() + array([ 1., 3.5]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nanmean(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanMean( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanmin.py b/python/xorbits/_mars/tensor/reduction/nanmin.py new file mode 100644 index 000000000..ff6572ee1 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanmin.py @@ -0,0 +1,123 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanMin(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMIN + _func_name = "nanmin" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nanmin(a, axis=None, out=None, keepdims=None, combine_size=None): + """ + Return minimum of a tensor or minimum along an axis, ignoring any NaNs. + When all-NaN slices are encountered a ``RuntimeWarning`` is raised and + Nan is returned for that slice. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose minimum is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the minimum is computed. The default is to compute + the minimum of the flattened tensor. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If the value is anything but the default, then + `keepdims` will be passed through to the `min` method + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + nanmin : Tensor + An tensor with the same shape as `a`, with the specified axis + removed. If `a` is a 0-d tensor, or if axis is None, a tensor + scalar is returned. The same dtype as `a` is returned. + + See Also + -------- + nanmax : + The maximum value of an array along a given axis, ignoring any NaNs. + amin : + The minimum value of an array along a given axis, propagating any NaNs. + fmin : + Element-wise minimum of two arrays, ignoring any NaNs. + minimum : + Element-wise minimum of two arrays, propagating any NaNs. + isnan : + Shows which elements are Not a Number (NaN). + isfinite: + Shows which elements are neither NaN nor infinity. + + amax, fmax, maximum + + Notes + ----- + Mars uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. + + If the input has a integer type the function is equivalent to mt.min. 
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nanmin(a).execute() + 1.0 + >>> mt.nanmin(a, axis=0).execute() + array([ 1., 2.]) + >>> mt.nanmin(a, axis=1).execute() + array([ 1., 3.]) + + When positive infinity and negative infinity are present: + + >>> mt.nanmin([1, 2, mt.nan, mt.inf]).execute() + 1.0 + >>> mt.nanmin([1, 2, mt.nan, mt.NINF]).execute() + -inf + + """ + a = astensor(a) + op = TensorNanMin( + axis=axis, dtype=a.dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanprod.py b/python/xorbits/_mars/tensor/reduction/nanprod.py new file mode 100644 index 000000000..237ecfd97 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanprod.py @@ -0,0 +1,106 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanProd(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANPROD + _func_name = "nanprod" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nanprod(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Return the product of array elements over a given axis treating Not a + Numbers (NaNs) as ones. + + One is returned for slices that are all-NaN or empty. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose product is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the product is computed. The default is to compute + the product of the flattened tensor. + dtype : data-type, optional + The type of the returned tensor and of the accumulator in which the + elements are summed. By default, the dtype of `a` is used. An + exception is when `a` has an integer type with less precision than + the platform (u)intp. In that case, the default will be either + (u)int32 or (u)int64 depending on whether the platform is 32 or 64 + bits. For inexact inputs, dtype must be inexact. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``. If provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. The casting of NaN to integer can yield + unexpected results. + keepdims : bool, optional + If True, the axes which are reduced are left in the result as + dimensions with size one. With this option, the result will + broadcast correctly against the original `arr`. + combine_size: int, optional + The number of chunks to combine. 
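# Rough numpy-only sketch (chunking done by hand; a simplified view of the
# map/combine/agg reduction that Mars performs, whose real implementation
# lives in the reduction core) of why a chunked, combine_size-driven tree
# reduction is valid for nanprod: per-chunk partial products can simply be
# multiplied again to give the global result.
import numpy as np

data = np.array([1.0, 2.0, np.nan, 3.0, np.nan, 4.0])
chunks = np.array_split(data, 3)            # stand-in for Mars chunking
partials = [np.nanprod(c) for c in chunks]  # "map" stage
combined = np.prod(partials)                # "combine"/"agg" stage
assert combined == np.nanprod(data) == 24.0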
+ + Returns + ------- + nanprod : Tensor + A new tensor holding the result is returned unless `out` is + specified, in which case it is returned. + + See Also + -------- + mt.prod : Product across array propagating NaNs. + isnan : Show which elements are NaN. + + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nanprod(1).execute() + 1 + >>> mt.nanprod([1]).execute() + 1 + >>> mt.nanprod([1, mt.nan]).execute() + 1.0 + >>> a = mt.array([[1, 2], [3, mt.nan]]) + >>> mt.nanprod(a).execute() + 6.0 + >>> mt.nanprod(a, axis=0).execute() + array([ 3., 2.]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nanprod(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanProd( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanstd.py b/python/xorbits/_mars/tensor/reduction/nanstd.py new file mode 100644 index 000000000..f5828b1d0 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanstd.py @@ -0,0 +1,129 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..arithmetic.sqrt import sqrt +from .nanvar import nanvar + + +def nanstd( + a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None +): + """ + Compute the standard deviation along the specified axis, while + ignoring NaNs. + + Returns the standard deviation, a measure of the spread of a + distribution, of the non-NaN tensor elements. The standard deviation is + computed for the flattened tensor by default, otherwise over the + specified axis. + + For all-NaN slices or slices with zero degrees of freedom, NaN is + returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + a : array_like + Calculate the standard deviation of the non-NaN values. + axis : int, optional + Axis along which the standard deviation is computed. The default is + to compute the standard deviation of the flattened tensor. + dtype : dtype, optional + Type to use in computing the standard deviation. For tensors of + integer type the default is float64, for tensors of float types it + is the same as the tensor type. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output but the type (of the + calculated values) will be cast if necessary. + ddof : int, optional + Means Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of non-NaN + elements. By default `ddof` is zero. + + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + If this value is anything but the default it is passed through + as-is to the relevant functions of the sub-classes. If these + functions do not have a `keepdims` kwarg, a RuntimeError will + be raised. 
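# Small numpy-only check of the relationship the nanstd body below builds on:
# nanstd is just the square root of nanvar, including the ddof handling, so
# delegating to nanvar and wrapping the result in sqrt is sufficient.
import numpy as np

a = np.array([[1.0, np.nan], [3.0, 4.0]])
for ddof in (0, 1):
    assert np.isclose(np.nanstd(a, ddof=ddof), np.sqrt(np.nanvar(a, ddof=ddof)))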
+ combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + standard_deviation : ndarray, see dtype parameter above. + If `out` is None, return a new array containing the standard + deviation, otherwise return a reference to the output tensor. If + ddof is >= the number of non-NaN elements in a slice or the slice + contains only NaNs, then the result for that slice is NaN. + + See Also + -------- + var, mean, std + nanvar, nanmean + + Notes + ----- + The standard deviation is the square root of the average of the squared + deviations from the mean: ``std = sqrt(mean(abs(x - x.mean())**2))``. + + The average squared deviation is normally calculated as + ``x.sum() / N``, where ``N = len(x)``. If, however, `ddof` is + specified, the divisor ``N - ddof`` is used instead. In standard + statistical practice, ``ddof=1`` provides an unbiased estimator of the + variance of the infinite population. ``ddof=0`` provides a maximum + likelihood estimate of the variance for normally distributed variables. + The standard deviation computed in this function is the square root of + the estimated variance, so even with ``ddof=1``, it will not be an + unbiased estimate of the standard deviation per se. + + Note that, for complex numbers, `std` takes the absolute value before + squaring, so that the result is always real and nonnegative. + + For floating-point input, the *std* is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for float32 (see example + below). Specifying a higher-accuracy accumulator using the `dtype` + keyword can alleviate this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, mt.nan], [3, 4]]) + >>> mt.nanstd(a).execute() + 1.247219128924647 + >>> mt.nanstd(a, axis=0).execute() + array([ 1., 0.]) + >>> mt.nanstd(a, axis=1).execute() + array([ 0., 0.5]) + + """ + ret = sqrt( + nanvar( + a, + axis=axis, + dtype=dtype, + out=out, + ddof=ddof, + keepdims=keepdims, + combine_size=combine_size, + ) + ) + if dtype is not None and ret.dtype != dtype: + ret = ret.astype(dtype) + return ret diff --git a/python/xorbits/_mars/tensor/reduction/nansum.py b/python/xorbits/_mars/tensor/reduction/nansum.py new file mode 100644 index 000000000..32bf13a3b --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nansum.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorNanSum(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANSUM + _func_name = "nansum" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def nansum(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Return the sum of array elements over a given axis treating Not a + Numbers (NaNs) as zero. + + Zero is returned for slices that are all-NaN or + empty. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose sum is desired. If `a` is not an + tensor, a conversion is attempted. + axis : int, optional + Axis along which the sum is computed. The default is to compute the + sum of the flattened array. + dtype : data-type, optional + The type of the returned tensor and of the accumulator in which the + elements are summed. By default, the dtype of `a` is used. An + exception is when `a` has an integer type with less precision than + the platform (u)intp. In that case, the default will be either + (u)int32 or (u)int64 depending on whether the platform is 32 or 64 + bits. For inexact inputs, dtype must be inexact. + out : Tensor, optional + Alternate output tensor in which to place the result. The default + is ``None``. If provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. The casting of NaN to integer can yield + unexpected results. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + + + If the value is anything but the default, then + `keepdims` will be passed through to the `mean` or `sum` methods + of sub-classes of `Tensor`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + nansum : Tensor. + A new tensor holding the result is returned unless `out` is + specified, in which it is returned. The result has the same + size as `a`, and the same shape as `a` if `axis` is not None + or `a` is a 1-d array. + + See Also + -------- + mt.sum : Sum across tensor propagating NaNs. + isnan : Show which elements are NaN. + isfinite: Show which elements are not NaN or +/-inf. + + Notes + ----- + If both positive and negative infinity are present, the sum will be Not + A Number (NaN). 
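# Numpy-only illustration of the Notes above: NaNs are treated as zero, a
# single infinity propagates, and opposite infinities give inf - inf = NaN.
import numpy as np

assert np.nansum([1.0, np.nan]) == 1.0
assert np.nansum([1.0, np.nan, np.inf]) == np.inf
assert np.isnan(np.nansum([1.0, np.nan, np.inf, -np.inf]))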
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> mt.nansum(1).execute() + 1 + >>> mt.nansum([1]).execute() + 1 + >>> mt.nansum([1, mt.nan]).execute() + 1.0 + >>> a = mt.array([[1, 1], [1, mt.nan]]) + >>> mt.nansum(a).execute() + 3.0 + >>> mt.nansum(a, axis=0).execute() + array([ 2., 1.]) + >>> mt.nansum([1, mt.nan, mt.inf]).execute() + inf + >>> mt.nansum([1, mt.nan, mt.NINF]).execute() + -inf + >>> mt.nansum([1, mt.nan, mt.inf, -mt.inf]).execute() # both +/- infinity present + nan + + """ + a = astensor(a) + if dtype is None: + dtype = np.nansum(np.empty((1,), dtype=a.dtype)).dtype + op = TensorNanSum( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/nanvar.py b/python/xorbits/_mars/tensor/reduction/nanvar.py new file mode 100644 index 000000000..77e8a879c --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/nanvar.py @@ -0,0 +1,294 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ..array_utils import as_same_device, device, get_array_module +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, nannumel +from .var import reduce_var_square + + +class TensorNanMoment(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANMOMENT + + _moment = Int32Field("moment", default=2) + _ddof = Int32Field("ddof") + + def __init__( + self, + axis=None, + keepdims=None, + moment=None, + ddof=None, + combine_size=None, + stage=None, + **kw + ): + stage = self._rewrite_stage(stage) + if moment is not None: + kw["_moment"] = moment + super().__init__( + _axis=axis, + _keepdims=keepdims, + _ddof=ddof, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @property + def moment(self): + return getattr(self, "_moment", 2) + + @property + def ddof(self): + return self._ddof + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.nansum(_count, axis=axis, dtype=np.int64, keepdims=True) + chunk_sum = xp.nansum(_data, axis=axis, dtype=dtype, keepdims=True) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.nansum + ) + + ctx[op.outputs[0].key] = xp.true_divide( + var_square, + xp.nansum( + chunk_count, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + - op.ddof, + dtype=dtype, + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + moment = op.moment + 
dtype = op.dtype + empty = get_array_module(in_chunk, nosparse=True).empty + + with device(device_id): + chunk_count = nannumel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.nansum( + in_chunk, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count) + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + for i in range(2, moment + 1): + var_square[..., i - 2] = xp.nansum( + (in_chunk - avg) ** i, + axis=axis, + dtype=dtype, + keepdims=bool(op.keepdims), + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_axis(op.axis) + moment = op.moment + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + empty = get_array_module(_data, nosparse=True).empty + + with device(device_id): + chunk_count = xp.nansum( + _count, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.nansum( + _data, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + + for m in range(2, moment + 1): + var_square[..., m - 2] = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.nansum + ) + + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + +class TensorNanVar(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.NANVAR + + _ddof = Int32Field("ddof") + + def __new__(cls, *args, **kwargs): + if kwargs.get("stage") is not None: + return TensorNanMoment(*args, **kwargs) + return super().__new__(cls) + + def __init__( + self, axis=None, dtype=None, keepdims=None, ddof=0, combine_size=None, **kw + ): + super().__init__( + _axis=axis, + dtype=dtype, + _keepdims=keepdims, + _ddof=ddof, + _combine_size=combine_size, + **kw + ) + + @property + def ddof(self): + return self._ddof + + def _get_op_kw(self): + kw = dict() + kw["ddof"] = self.ddof + return kw + + @classmethod + def execute(cls, ctx, op): + axis = cls.get_axis(op.axis) + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.nanvar( + in_chunk, + axis=axis, + dtype=op.dtype, + ddof=op.ddof, + keepdims=bool(op.keepdims), + ) + + +def nanvar( + a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None +): + """ + Compute the variance along the specified axis, while ignoring NaNs. + + Returns the variance of the tensor elements, a measure of the spread of + a distribution. The variance is computed for the flattened tensor by + default, otherwise over the specified axis. + + For all-NaN slices or slices with zero degrees of freedom, NaN is + returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose variance is desired. If `a` is not a + tensor, a conversion is attempted. + axis : int, optional + Axis along which the variance is computed. The default is to compute + the variance of the flattened array. + dtype : data-type, optional + Type to use in computing the variance. For tensors of integer type + the default is `float32`; for tensors of float types it is the same as + the tensor type. 
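# Hand-rolled, numpy-only sketch of the parallel-variance merge that the
# map/combine/agg stages above implement in a more general (NaN-aware,
# higher-moment) form. The helper names below are illustrative only: each
# chunk contributes (count, sum, sum of squared deviations from its own
# mean), and those partials can be merged exactly into the global variance.
import numpy as np

def chunk_stats(x):
    m = x.mean()
    return x.size, x.sum(), ((x - m) ** 2).sum()

def merge_variance(stats, ddof=0):
    counts = np.array([n for n, _, _ in stats], dtype=float)
    sums = np.array([s for _, s, _ in stats])
    m2s = np.array([m2 for _, _, m2 in stats])
    total_n, total_sum = counts.sum(), sums.sum()
    grand_mean = total_sum / total_n
    chunk_means = sums / counts
    # per-chunk M2 plus the between-chunk correction term
    total_m2 = m2s.sum() + (counts * (chunk_means - grand_mean) ** 2).sum()
    return total_m2 / (total_n - ddof)

data = np.random.rand(1000)
chunks = np.array_split(data, 7)
assert np.isclose(
    merge_variance([chunk_stats(c) for c in chunks], ddof=1), data.var(ddof=1)
)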
+ out : Tensor, optional + Alternate output tensor in which to place the result. It must have + the same shape as the expected output, but the type is cast if + necessary. + ddof : int, optional + "Delta Degrees of Freedom": the divisor used in the calculation is + ``N - ddof``, where ``N`` represents the number of non-NaN + elements. By default `ddof` is zero. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + combine_size: int, optional + The number of chunks to combine. + + + Returns + ------- + variance : Tensor, see dtype parameter above + If `out` is None, return a new tensor containing the variance, + otherwise return a reference to the output tensor. If ddof is >= the + number of non-NaN elements in a slice or the slice contains only + NaNs, then the result for that slice is NaN. + + See Also + -------- + std : Standard deviation + mean : Average + var : Variance while not ignoring NaNs + nanstd, nanmean + + Notes + ----- + The variance is the average of the squared deviations from the mean, + i.e., ``var = mean(abs(x - x.mean())**2)``. + + The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``. + If, however, `ddof` is specified, the divisor ``N - ddof`` is used + instead. In standard statistical practice, ``ddof=1`` provides an + unbiased estimator of the variance of a hypothetical infinite + population. ``ddof=0`` provides a maximum likelihood estimate of the + variance for normally distributed variables. + + Note that for complex numbers, the absolute value is taken before + squaring, so that the result is always real and nonnegative. + + For floating-point input, the variance is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for `float32` (see example + below). Specifying a higher-accuracy accumulator using the ``dtype`` + keyword can alleviate this issue. + + For this function to work on sub-classes of Tensor, they must define + `sum` with the kwarg `keepdims` + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, mt.nan], [3, 4]]) + >>> mt.nanvar(a).execute() + 1.5555555555555554 + >>> mt.nanvar(a, axis=0).execute() + array([ 1., 0.]) + >>> mt.nanvar(a, axis=1).execute() + array([ 0., 0.25]) + + """ + a = astensor(a) + if dtype is None: + dtype = np.nanvar(np.ones((1,), dtype=a.dtype)).dtype + op = TensorNanVar( + axis=axis, dtype=dtype, keepdims=keepdims, ddof=ddof, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/prod.py b/python/xorbits/_mars/tensor/reduction/prod.py new file mode 100644 index 000000000..eeb692807 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/prod.py @@ -0,0 +1,142 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... 
import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorProd(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.PROD + _func_name = "prod" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def prod(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Return the product of tensor elements over a given axis. + + Parameters + ---------- + a : array_like + Input data. + axis : None or int or tuple of ints, optional + Axis or axes along which a product is performed. The default, + axis=None, will calculate the product of all the elements in the + input tensor. If axis is negative it counts from the last to the + first axis. + + If axis is a tuple of ints, a product is performed on all of the + axes specified in the tuple instead of a single axis or all the + axes as before. + dtype : dtype, optional + The type of the returned tensor, as well as of the accumulator in + which the elements are multiplied. The dtype of `a` is used by + default unless `a` has an integer dtype of less precision than the + default platform integer. In that case, if `a` is signed then the + platform integer is used while if `a` is unsigned then an unsigned + integer of the same precision as the platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output, but the type of the output + values will be cast if necessary. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `prod` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + product_along_axis : Tensor, see `dtype` parameter above. + An tensor shaped as `a` but with the specified axis removed. + Returns a reference to `out` if specified. + + See Also + -------- + Tensor.prod : equivalent method + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. 
That means that, on a 32-bit platform: + + >>> import mars.tensor as mt + + >>> x = mt.array([536870910, 536870910, 536870910, 536870910]) + >>> mt.prod(x).execute() # random + 16 + + The product of an empty array is the neutral element 1: + + >>> mt.prod([]).execute() + 1.0 + + Examples + -------- + By default, calculate the product of all elements: + + >>> mt.prod([1.,2.]).execute() + 2.0 + + Even when the input array is two-dimensional: + + >>> mt.prod([[1.,2.],[3.,4.]]).execute() + 24.0 + + But we can also specify the axis over which to multiply: + + >>> mt.prod([[1.,2.],[3.,4.]], axis=1).execute() + array([ 2., 12.]) + + If the type of `x` is unsigned, then the output type is + the unsigned platform integer: + + >>> x = mt.array([1, 2, 3], dtype=mt.uint8) + >>> mt.prod(x).dtype == mt.uint + True + + If `x` is of a signed integer type, then the output type + is the default platform integer: + + >>> x = mt.array([1, 2, 3], dtype=mt.int8) + >>> mt.prod(x).dtype == int + True + + """ + a = astensor(a) + if dtype is None: + dtype = np.empty((1,), dtype=a.dtype).prod().dtype + op = TensorProd( + axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/std.py b/python/xorbits/_mars/tensor/reduction/std.py new file mode 100644 index 000000000..10dc803a3 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/std.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..arithmetic.sqrt import sqrt +from .var import var + + +def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None): + """ + Compute the standard deviation along the specified axis. + + Returns the standard deviation, a measure of the spread of a distribution, + of the tensor elements. The standard deviation is computed for the + flattened tensor by default, otherwise over the specified axis. + + Parameters + ---------- + a : array_like + Calculate the standard deviation of these values. + axis : None or int or tuple of ints, optional + Axis or axes along which the standard deviation is computed. The + default is to compute the standard deviation of the flattened tensor. + + If this is a tuple of ints, a standard deviation is performed over + multiple axes, instead of a single axis or all the axes as before. + dtype : dtype, optional + Type to use in computing the standard deviation. For tensors of + integer type the default is float64, for tensors of float types it is + the same as the array type. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output but the type (of the calculated + values) will be cast if necessary. + ddof : int, optional + Means Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + By default `ddof` is zero. 
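# Numpy-only illustration of the ddof semantics just described (variable
# names are local to this sketch): the divisor is N - ddof, so ddof=1 gives
# the Bessel-corrected sample estimate.
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
n = x.size
m2 = ((x - x.mean()) ** 2).sum()
assert np.isclose(np.std(x, ddof=0), np.sqrt(m2 / n))
assert np.isclose(np.std(x, ddof=1), np.sqrt(m2 / (n - 1)))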
+ keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `std` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + standard_deviation : Tensor, see dtype parameter above. + If `out` is None, return a new tensor containing the standard deviation, + otherwise return a reference to the output array. + + See Also + -------- + var, mean, nanmean, nanstd, nanvar + + Notes + ----- + The standard deviation is the square root of the average of the squared + deviations from the mean, i.e., ``std = sqrt(mean(abs(x - x.mean())**2))``. + + The average squared deviation is normally calculated as + ``x.sum() / N``, where ``N = len(x)``. If, however, `ddof` is specified, + the divisor ``N - ddof`` is used instead. In standard statistical + practice, ``ddof=1`` provides an unbiased estimator of the variance + of the infinite population. ``ddof=0`` provides a maximum likelihood + estimate of the variance for normally distributed variables. The + standard deviation computed in this function is the square root of + the estimated variance, so even with ``ddof=1``, it will not be an + unbiased estimate of the standard deviation per se. + + Note that, for complex numbers, `std` takes the absolute + value before squaring, so that the result is always real and nonnegative. + + For floating-point input, the *std* is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for float32 (see example below). + Specifying a higher-accuracy accumulator using the `dtype` keyword can + alleviate this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> mt.std(a).execute() + 1.1180339887498949 + >>> mt.std(a, axis=0).execute() + array([ 1., 1.]) + >>> mt.std(a, axis=1).execute() + array([ 0.5, 0.5]) + + In single precision, std() can be inaccurate: + + >>> a = mt.zeros((2, 512*512), dtype=mt.float32) + >>> a[0, :] = 1.0 + >>> a[1, :] = 0.1 + >>> mt.std(a).execute() + 0.45000005 + + Computing the standard deviation in float64 is more accurate: + + >>> mt.std(a, dtype=mt.float64).execute() + 0.44999999925494177 + + """ + ret = sqrt( + var( + a, + axis=axis, + dtype=dtype, + out=out, + ddof=ddof, + keepdims=keepdims, + combine_size=combine_size, + ) + ) + if dtype is not None and ret.dtype != dtype: + ret = ret.astype(dtype) + return ret diff --git a/python/xorbits/_mars/tensor/reduction/sum.py b/python/xorbits/_mars/tensor/reduction/sum.py new file mode 100644 index 000000000..d50e6b584 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/sum.py @@ -0,0 +1,135 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
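# NumPy-only version of the float32 precision caveat shown in the std()
# examples above (the printed digits are illustrative and may vary slightly
# by platform): accumulating in float32 drifts, while requesting a float64
# accumulator stays close to the true value of 0.45.
import numpy as np

a = np.zeros((2, 512 * 512), dtype=np.float32)
a[0, :] = 1.0
a[1, :] = 0.1
print(np.std(a))                    # roughly 0.45000005
print(np.std(a, dtype=np.float64))  # roughly 0.4499999992549418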
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin + + +class TensorSum(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.SUM + _func_name = "sum" + + def __init__(self, axis=None, keepdims=None, combine_size=None, stage=None, **kw): + stage = self._rewrite_stage(stage) + super().__init__( + _axis=axis, + _keepdims=keepdims, + _combine_size=combine_size, + stage=stage, + **kw + ) + + +def sum(a, axis=None, dtype=None, out=None, keepdims=None, combine_size=None): + """ + Sum of tensor elements over a given axis. + + Parameters + ---------- + a : array_like + Elements to sum. + axis : None or int or tuple of ints, optional + Axis or axes along which a sum is performed. The default, + axis=None, will sum all of the elements of the input tensor. If + axis is negative it counts from the last to the first axis. + + If axis is a tuple of ints, a sum is performed on all of the axes + specified in the tuple instead of a single axis or all the axes as + before. + dtype : dtype, optional + The type of the returned tensor and of the accumulator in which the + elements are summed. The dtype of `a` is used by default unless `a` + has an integer dtype of less precision than the default platform + integer. In that case, if `a` is signed then the platform integer + is used while if `a` is unsigned then an unsigned integer of the + same precision as the platform integer is used. + out : Tensor, optional + Alternative output tensor in which to place the result. It must have + the same shape as the expected output, but the type of the output + values will be cast if necessary. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `sum` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + sum_along_axis : Tensor + An array with the same shape as `a`, with the specified + axis removed. If `a` is a 0-d tensor, or if `axis` is None, a scalar + is returned. If an output array is specified, a reference to + `out` is returned. + + See Also + -------- + Tensor.sum : Equivalent method. + + cumsum : Cumulative sum of tensor elements. + + trapz : Integration of tensor values using the composite trapezoidal rule. + + mean, average + + Notes + ----- + Arithmetic is modular when using integer types, and no error is + raised on overflow. 
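# Numpy-only look at the default accumulator behaviour described above: small
# integer dtypes are promoted to the platform integer, so wrap-around only
# bites when a narrow dtype is requested explicitly (compare the int8 example
# further below). The exact promoted dtype is typically int64, but can be
# int32 on some platforms.
import numpy as np

small = np.ones(128, dtype=np.int8)
print(small.sum().dtype)         # platform integer, usually int64
print(small.sum(dtype=np.int8))  # -128: the 8-bit accumulator wraps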
+ + The sum of an empty array is the neutral element 0: + + >>> import mars.tensor as mt + + >>> mt.sum([]).execute() + 0.0 + + Examples + -------- + >>> mt.sum([0.5, 1.5]).execute() + 2.0 + >>> mt.sum([0.5, 0.7, 0.2, 1.5], dtype=mt.int32).execute() + 1 + >>> mt.sum([[0, 1], [0, 5]]).execute() + 6 + >>> mt.sum([[0, 1], [0, 5]], axis=0).execute() + array([0, 6]) + >>> mt.sum([[0, 1], [0, 5]], axis=1).execute() + array([1, 5]) + + If the accumulator is too small, overflow occurs: + + >>> mt.ones(128, dtype=mt.int8).sum(dtype=mt.int8).execute() + -128 + + """ + a = astensor(a) + if dtype is None: + if a.dtype == object: + dtype = a.dtype + else: + dtype = np.empty((1,), dtype=a.dtype).sum().dtype + else: + dtype = np.dtype(dtype) + op = TensorSum(axis=axis, dtype=dtype, keepdims=keepdims, combine_size=combine_size) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reduction/tests/__init__.py b/python/xorbits/_mars/tensor/reduction/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/reduction/tests/test_reduction.py b/python/xorbits/_mars/tensor/reduction/tests/test_reduction.py new file mode 100644 index 000000000..85133206c --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/tests/test_reduction.py @@ -0,0 +1,211 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...datasource import ones, tensor +from ...merge import TensorConcatenate +from .. 
import TensorArgmax, TensorArgmin, TensorMean, all + + +def test_base_reduction(): + sum = lambda x, *args, **kwargs: tile(x.sum(*args, **kwargs)) + prod = lambda x, *args, **kwargs: tile(x.prod(*args, **kwargs)) + max = lambda x, *args, **kwargs: tile(x.max(*args, **kwargs)) + min = lambda x, *args, **kwargs: tile(x.min(*args, **kwargs)) + all = lambda x, *args, **kwargs: tile(x.all(*args, **kwargs)) + any = lambda x, *args, **kwargs: tile(x.any(*args, **kwargs)) + + for f in [sum, prod, max, min, all, any]: + res = f(ones((8, 8), chunk_size=8)) + assert res.shape == () + + res = f(ones((10, 8), chunk_size=3)) + assert res.dtype is not None + assert res.shape == () + + res = f(ones((10, 8), chunk_size=3), axis=0) + assert res.shape == (8,) + + res = f(ones((10, 8), chunk_size=3), axis=1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + f(ones((10, 8), chunk_size=3), axis=2) + + res = f(ones((10, 8), chunk_size=3), axis=-1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + f(ones((10, 8), chunk_size=3), axis=-3) + + res = f(ones((10, 8), chunk_size=3), keepdims=True) + assert res.shape == (1, 1) + + res = f(ones((10, 8), chunk_size=3), axis=0, keepdims=True) + assert res.shape == (1, 8) + + res = f(ones((10, 8), chunk_size=3), axis=1, keepdims=True) + assert res.shape == (10, 1) + + res = f(ones((10, 8, 10), chunk_size=3), axis=1) + assert res.shape == (10, 10) + + res = f(ones((10, 8, 10), chunk_size=3), axis=1, keepdims=True) + assert res.shape == (10, 1, 10) + + res = f(ones((10, 8, 10), chunk_size=3), axis=(0, 2)) + assert res.shape == (8,) + + res = f(ones((10, 8, 10), chunk_size=3), axis=(0, 2), keepdims=True) + assert res.shape == (1, 8, 1) + + +def test_mean_reduction(): + mean = lambda x, *args, **kwargs: tile(x.mean(*args, **kwargs)) + + res = mean(ones((10, 8), chunk_size=3)) + assert res.shape == () + assert res.dtype is not None + assert isinstance(res.chunks[0].op, TensorMean) + assert isinstance(res.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res.chunks[0].inputs[0].inputs[0].op, TensorMean) + assert res.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.combine + + res = mean(ones((8, 8), chunk_size=8)) + assert res.shape == () + + res = mean(ones((10, 8), chunk_size=3), axis=0) + assert res.shape == (8,) + + res = mean(ones((10, 8), chunk_size=3), axis=1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + mean(ones((10, 8), chunk_size=3), axis=2) + + res = mean(ones((10, 8), chunk_size=3), axis=-1) + assert res.shape == (10,) + + with pytest.raises(np.AxisError): + mean(ones((10, 8), chunk_size=3), axis=-3) + + res = mean(ones((10, 8), chunk_size=3), keepdims=True) + assert res.shape == (1, 1) + + res = mean(ones((10, 8), chunk_size=3), axis=0, keepdims=True) + assert res.shape == (1, 8) + + res = mean(ones((10, 8), chunk_size=3), axis=1, keepdims=True) + assert res.shape == (10, 1) + assert isinstance(res.chunks[0].op, TensorMean) + assert res.chunks[0].op.stage == OperandStage.agg + assert isinstance(res.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res.chunks[0].inputs[0].inputs[0].op, TensorMean) + assert res.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.map + + +def test_arg_reduction(): + argmax = lambda x, *args, **kwargs: tile(x.argmax(*args, **kwargs)) + argmin = lambda x, *args, **kwargs: tile(x.argmin(*args, **kwargs)) + + res1 = argmax(ones((10, 8, 10), chunk_size=3)) + res2 = argmin(ones((10, 8, 10), chunk_size=3)) + assert res1.shape == () + assert 
res1.dtype is not None + assert res2.shape == () + assert isinstance(res1.chunks[0].op, TensorArgmax) + assert res1.chunks[0].op.stage == OperandStage.agg + assert isinstance(res2.chunks[0].op, TensorArgmin) + assert res2.chunks[0].op.stage == OperandStage.agg + assert isinstance(res1.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res2.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res1.chunks[0].inputs[0].inputs[0].op, TensorArgmax) + assert res1.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.combine + assert isinstance(res2.chunks[0].inputs[0].inputs[0].op, TensorArgmin) + assert res2.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.combine + + res1 = argmax(ones((10, 8), chunk_size=3), axis=1) + res2 = argmin(ones((10, 8), chunk_size=3), axis=1) + assert res1.shape == (10,) + assert res2.shape == (10,) + assert isinstance(res1.chunks[0].op, TensorArgmax) + assert res1.chunks[0].op.stage == OperandStage.agg + assert isinstance(res2.chunks[0].op, TensorArgmin) + assert res2.chunks[0].op.stage == OperandStage.agg + assert isinstance(res1.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res2.chunks[0].inputs[0].op, TensorConcatenate) + assert isinstance(res1.chunks[0].inputs[0].inputs[0].op, TensorArgmax) + assert res1.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.map + assert isinstance(res2.chunks[0].inputs[0].inputs[0].op, TensorArgmin) + assert res2.chunks[0].inputs[0].inputs[0].op.stage == OperandStage.map + + pytest.raises( + TypeError, lambda: argmax(ones((10, 8, 10), chunk_size=3), axis=(0, 1)) + ) + pytest.raises( + TypeError, lambda: argmin(ones((10, 8, 10), chunk_size=3), axis=(0, 1)) + ) + pytest.raises(np.AxisError, lambda: argmin(ones((10, 8, 10), chunk_size=3), axis=3)) + pytest.raises( + np.AxisError, lambda: argmin(ones((10, 8, 10), chunk_size=3), axis=-4) + ) + + +def test_cum_reduction(): + cumsum = lambda x, *args, **kwargs: tile(x.cumsum(*args, **kwargs)) + cumprod = lambda x, *args, **kwargs: tile(x.cumprod(*args, **kwargs)) + + res1 = cumsum(ones((10, 8), chunk_size=3), axis=0) + res2 = cumprod(ones((10, 8), chunk_size=3), axis=0) + assert res1.shape == (10, 8) + assert res1.dtype is not None + assert res2.shape == (10, 8) + assert res2.dtype is not None + + res1 = cumsum(ones((10, 8, 8), chunk_size=3), axis=1) + res2 = cumprod(ones((10, 8, 8), chunk_size=3), axis=1) + assert res1.shape == (10, 8, 8) + assert res2.shape == (10, 8, 8) + + res1 = cumsum(ones((10, 8, 8), chunk_size=3), axis=-2) + res2 = cumprod(ones((10, 8, 8), chunk_size=3), axis=-2) + assert res1.shape == (10, 8, 8) + assert res2.shape == (10, 8, 8) + + with pytest.raises(np.AxisError): + cumsum(ones((10, 8), chunk_size=3), axis=2) + with pytest.raises(np.AxisError): + cumsum(ones((10, 8), chunk_size=3), axis=-3) + + +def test_all_reduction(): + o = tensor([False]) + + with pytest.raises(ValueError): + all([-1, 4, 5], out=o) + + +def test_var_reduction(): + var = lambda x, *args, **kwargs: tile(x.var(*args, **kwargs)) + + res1 = var(ones((10, 8), chunk_size=3), ddof=2) + assert res1.shape == () + assert res1.op.ddof == 2 + + res1 = var(ones((10, 8, 8), chunk_size=3), axis=1) + assert res1.shape == (10, 8) diff --git a/python/xorbits/_mars/tensor/reduction/tests/test_reduction_execution.py b/python/xorbits/_mars/tensor/reduction/tests/test_reduction_execution.py new file mode 100644 index 000000000..4d5924618 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/tests/test_reduction_execution.py @@ -0,0 +1,662 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....utils import ignore_warning +from ...datasource import ones, tensor +from .. import ( + allclose, + array_equal, + count_nonzero, + mean, + nanargmax, + nanargmin, + nancumprod, + nancumsum, + nanmax, + nanmean, + nanmin, + nanprod, + nanstd, + nansum, + nanvar, + std, + var, +) + + +def test_sum_prod_execution(setup): + arr = ones((10, 8), chunk_size=6) + assert 80 == arr.sum().execute().fetch() + np.testing.assert_array_equal( + arr.sum(axis=0).execute().fetch(), np.full((8,), fill_value=10) + ) + + arr = ones((3, 3), chunk_size=2) + assert 512 == (arr * 2).prod().execute().fetch() + np.testing.assert_array_equal( + (arr * 2).prod(axis=0).execute().fetch(), np.full((3,), fill_value=8) + ) + + raw = sps.random(10, 20, density=0.1) + arr = tensor(raw, chunk_size=3) + res = arr.sum().execute().fetch() + + assert pytest.approx(res) == raw.sum() + + # test order + raw = np.asfortranarray(np.random.rand(10, 20, 30)) + arr = tensor(raw, chunk_size=13) + arr2 = arr.sum(axis=-1) + + res = arr2.execute().fetch() + expected = raw.sum(axis=-1) + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert a.sum().execute().fetch() == "abcdefghi" + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert a.sum().execute().fetch() == "abcdefghi" + + +def test_max_min_execution(setup): + raw = np.random.randint(10000, size=(10, 10, 10)) + + arr = tensor(raw, chunk_size=3) + + assert raw.max() == arr.max().execute().fetch() + assert raw.min() == arr.min().execute().fetch() + + np.testing.assert_array_equal(raw.max(axis=0), arr.max(axis=0).execute().fetch()) + assert arr.max(axis=0).issparse() is False + np.testing.assert_array_equal(raw.min(axis=0), arr.min(axis=0).execute().fetch()) + assert arr.min(axis=0).issparse() is False + + np.testing.assert_array_equal( + raw.max(axis=(1, 2)), arr.max(axis=(1, 2)).execute().fetch() + ) + np.testing.assert_array_equal( + raw.min(axis=(1, 2)), arr.min(axis=(1, 2)).execute().fetch() + ) + + raw = sps.random(10, 10, density=0.5) + + arr = tensor(raw, chunk_size=3) + + assert raw.max() == arr.max().execute().fetch() + assert raw.min() == arr.min().execute().fetch() + + np.testing.assert_almost_equal( + raw.max(axis=1).A.ravel(), arr.max(axis=1).execute().fetch().toarray() + ) + assert arr.max(axis=1).issparse() is True + np.testing.assert_almost_equal( + raw.min(axis=1).A.ravel(), arr.min(axis=1).execute().fetch().toarray() + ) + assert arr.min(axis=1).issparse() is True + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert a.max().execute().fetch() == "i" + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert a.max().execute().fetch() == "i" + + # test empty chunks + raw = 
np.arange(3, 10) + arr = tensor(np.arange(0, 10), chunk_size=3) + arr = arr[arr >= 3] + assert raw.max() == arr.max().execute().fetch() + assert raw.min() == arr.min().execute().fetch() + + +def test_all_any_execution(setup): + raw1 = np.zeros((10, 15)) + raw2 = np.ones((10, 15)) + raw3 = np.array( + [ + [True, False, True, False], + [True, True, True, True], + [False, False, False, False], + [False, True, False, True], + ] + ) + + arr1 = tensor(raw1, chunk_size=3) + arr2 = tensor(raw2, chunk_size=3) + arr3 = tensor(raw3, chunk_size=4) + + assert not arr1.all().execute().fetch() + assert arr2.all().execute().fetch() + assert not arr1.any().execute().fetch() + np.testing.assert_array_equal(raw3.all(axis=1), arr3.all(axis=1).execute().fetch()) + np.testing.assert_array_equal(raw3.any(axis=0), arr3.any(axis=0).execute().fetch()) + + raw = sps.random(10, 10, density=0.5) > 0.5 + + arr = tensor(raw, chunk_size=3) + + assert raw.A.all() == arr.all().execute().fetch() + assert raw.A.any() == arr.any().execute().fetch() + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert a.all().execute().fetch() == "i" + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert a.any().execute().fetch() == "a" + + +def test_mean_execution(setup): + raw1 = np.random.random((20, 25)) + raw2 = np.random.randint(10, size=(20, 25)) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.mean().execute().fetch() + expected1 = raw1.mean() + np.testing.assert_allclose(res1, expected1) + + res2 = arr1.mean(axis=0).execute().fetch() + expected2 = raw1.mean(axis=0) + assert np.allclose(res2, expected2) is True + + res3 = arr1.mean(axis=1, keepdims=True).execute().fetch() + expected3 = raw1.mean(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + arr2 = tensor(raw2, chunk_size=6) + + res1 = arr2.mean().execute().fetch() + expected1 = raw2.mean() + assert res1 == expected1 + + res2 = arr2.mean(axis=0).execute().fetch() + expected2 = raw2.mean(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr2.mean(axis=1, keepdims=True).execute().fetch() + expected3 = raw2.mean(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + raw1 = sps.random(20, 25, density=0.1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.mean().execute().fetch() + expected1 = raw1.mean() + np.testing.assert_allclose(res1, expected1) + + arr2 = tensor(raw1, chunk_size=30) + + res1 = arr2.mean().execute().fetch() + expected1 = raw1.mean() + np.testing.assert_allclose(res1, expected1) + + arr = mean(1) + assert arr.execute().fetch() == 1 + + with pytest.raises(TypeError): + tensor(list("abcdefghi"), dtype=object).mean().execute() + + +def test_var_execution(setup): + raw1 = np.random.random((20, 25)) + raw2 = np.random.randint(10, size=(20, 25)) + + arr0 = tensor(raw1, chunk_size=25) + + res1 = arr0.var().execute().fetch() + expected1 = raw1.var() + np.testing.assert_allclose(res1, expected1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.var().execute().fetch() + expected1 = raw1.var() + np.testing.assert_allclose(res1, expected1) + + res2 = arr1.var(axis=0).execute().fetch() + expected2 = raw1.var(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr1.var(axis=1, keepdims=True).execute().fetch() + expected3 = raw1.var(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + arr2 = tensor(raw2, chunk_size=6) + + res1 = arr2.var().execute().fetch() + expected1 = raw2.var() + assert pytest.approx(res1) == expected1 + + res2 = 
arr2.var(axis=0).execute().fetch() + expected2 = raw2.var(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr2.var(axis=1, keepdims=True).execute().fetch() + expected3 = raw2.var(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + res4 = arr2.var(ddof=1).execute().fetch() + expected4 = raw2.var(ddof=1) + assert pytest.approx(res4) == expected4 + + raw1 = sps.random(20, 25, density=0.1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.var().execute().fetch() + expected1 = raw1.toarray().var() + np.testing.assert_allclose(res1, expected1) + + arr2 = tensor(raw1, chunk_size=30) + + res1 = arr2.var().execute().fetch() + expected1 = raw1.toarray().var() + np.testing.assert_allclose(res1, expected1) + + arr = var(1) + assert arr.execute().fetch() == 0 + + +def test_std_execution(setup): + raw1 = np.random.random((20, 25)) + raw2 = np.random.randint(10, size=(20, 25)) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.std().execute().fetch() + expected1 = raw1.std() + np.testing.assert_allclose(res1, expected1) + + res2 = arr1.std(axis=0).execute().fetch() + expected2 = raw1.std(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr1.std(axis=1, keepdims=True).execute().fetch() + expected3 = raw1.std(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + arr2 = tensor(raw2, chunk_size=6) + + res1 = arr2.std().execute().fetch() + expected1 = raw2.std() + assert pytest.approx(res1) == expected1 + + res2 = arr2.std(axis=0).execute().fetch() + expected2 = raw2.std(axis=0) + np.testing.assert_allclose(res2, expected2) + + res3 = arr2.std(axis=1, keepdims=True).execute().fetch() + expected3 = raw2.std(axis=1, keepdims=True) + np.testing.assert_allclose(res3, expected3) + + res4 = arr2.std(ddof=1).execute().fetch() + expected4 = raw2.std(ddof=1) + assert pytest.approx(res4) == expected4 + + raw1 = sps.random(20, 25, density=0.1) + + arr1 = tensor(raw1, chunk_size=6) + + res1 = arr1.std().execute().fetch() + expected1 = raw1.toarray().std() + np.testing.assert_allclose(res1, expected1) + + arr2 = tensor(raw1, chunk_size=30) + + res1 = arr2.std().execute().fetch() + expected1 = raw1.toarray().std() + np.testing.assert_allclose(res1, expected1) + + arr = std(1) + assert arr.execute().fetch() == 0 + + +def test_arg_reduction(setup): + raw = np.random.random((20, 20, 20)) + + arr = tensor(raw, chunk_size=6) + + assert raw.argmax() == arr.argmax().execute().fetch() + assert raw.argmin() == arr.argmin().execute().fetch() + + np.testing.assert_array_equal( + raw.argmax(axis=0), arr.argmax(axis=0).execute().fetch() + ) + np.testing.assert_array_equal( + raw.argmin(axis=0), arr.argmin(axis=0).execute().fetch() + ) + + raw_format = sps.random(20, 20, density=0.1, format="lil") + + random_min = np.random.randint(0, 200) + random_max = np.random.randint(200, 400) + raw_format[np.unravel_index(random_min, raw_format.shape)] = -1 + raw_format[np.unravel_index(random_max, raw_format.shape)] = 2 + + raw = raw_format.tocoo() + arr = tensor(raw, chunk_size=6) + + assert raw.argmax() == arr.argmax().execute().fetch() + assert raw.argmin() == arr.argmin().execute().fetch() + + # test order + raw = np.asfortranarray(np.random.rand(10, 20, 30)) + arr = tensor(raw, chunk_size=13) + arr2 = arr.argmax(axis=-1) + + res = arr2.execute().fetch() + expected = raw.argmax(axis=-1) + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == 
expected.flags["F_CONTIGUOUS"] + + with pytest.raises(TypeError): + tensor(list("abcdefghi"), dtype=object).argmax().execute() + + +@ignore_warning +def test_nan_reduction(setup): + raw = np.random.choice(a=[0, 1, np.nan], size=(10, 10), p=[0.3, 0.4, 0.3]) + + arr = tensor(raw, chunk_size=6) + + assert np.nansum(raw) == nansum(arr).execute().fetch() + assert np.nanprod(raw) == nanprod(arr).execute().fetch() + assert np.nanmax(raw) == nanmax(arr).execute().fetch() + assert np.nanmin(raw) == nanmin(arr).execute().fetch() + assert np.nanmean(raw) == nanmean(arr).execute().fetch() + assert pytest.approx(np.nanvar(raw)) == nanvar(arr).execute().fetch() + assert ( + pytest.approx(np.nanvar(raw, ddof=1)) == nanvar(arr, ddof=1).execute().fetch() + ) + assert pytest.approx(np.nanstd(raw)) == nanstd(arr).execute().fetch() + assert ( + pytest.approx(np.nanstd(raw, ddof=1)) == nanstd(arr, ddof=1).execute().fetch() + ) + + arr = tensor(raw, chunk_size=10) + + assert np.nansum(raw) == nansum(arr).execute().fetch() + assert np.nanprod(raw) == nanprod(arr).execute().fetch() + assert np.nanmax(raw) == nanmax(arr).execute().fetch() + assert np.nanmin(raw) == nanmin(arr).execute().fetch() + assert np.nanmean(raw) == nanmean(arr).execute().fetch() + assert pytest.approx(np.nanvar(raw)) == nanvar(arr).execute().fetch() + assert ( + pytest.approx(np.nanvar(raw, ddof=1)) == nanvar(arr, ddof=1).execute().fetch() + ) + assert pytest.approx(np.nanstd(raw)) == nanstd(arr).execute().fetch() + assert ( + pytest.approx(np.nanstd(raw, ddof=1)) == nanstd(arr, ddof=1).execute().fetch() + ) + + raw = np.random.random((10, 10)) + raw[:3, :3] = np.nan + arr = tensor(raw, chunk_size=6) + assert np.nanargmin(raw) == nanargmin(arr).execute().fetch() + assert np.nanargmax(raw) == nanargmax(arr).execute().fetch() + + raw = np.full((10, 10), np.nan) + arr = tensor(raw, chunk_size=6) + + assert 0 == nansum(arr).execute().fetch() + assert 1 == nanprod(arr).execute().fetch() + assert np.isnan(nanmax(arr).execute().fetch()) + assert np.isnan(nanmin(arr).execute().fetch()) + assert np.isnan(nanmean(arr).execute().fetch()) + with pytest.raises(ValueError): + _ = nanargmin(arr).execute() # noqa: F841 + with pytest.raises(ValueError): + _ = nanargmax(arr).execute() # noqa: F841 + + raw = sps.random(10, 10, density=0.1, format="csr") + raw[:3, :3] = np.nan + arr = tensor(raw, chunk_size=6) + + assert pytest.approx(np.nansum(raw.A)) == nansum(arr).execute().fetch() + assert pytest.approx(np.nanprod(raw.A)) == nanprod(arr).execute().fetch() + assert pytest.approx(np.nanmax(raw.A)) == nanmax(arr).execute().fetch() + assert pytest.approx(np.nanmin(raw.A)) == nanmin(arr).execute().fetch() + assert pytest.approx(np.nanmean(raw.A)) == nanmean(arr).execute().fetch() + assert pytest.approx(np.nanvar(raw.A)) == nanvar(arr).execute().fetch() + assert ( + pytest.approx(np.nanvar(raw.A, ddof=1)) == nanvar(arr, ddof=1).execute().fetch() + ) + assert pytest.approx(np.nanstd(raw.A)) == nanstd(arr).execute().fetch() + assert ( + pytest.approx(np.nanstd(raw.A, ddof=1)) == nanstd(arr, ddof=1).execute().fetch() + ) + + arr = nansum(1) + assert arr.execute().fetch() == 1 + + +def test_cum_reduction(setup): + raw = np.random.randint(5, size=(8, 8, 8)) + + arr = tensor(raw, chunk_size=6) + + res1 = arr.cumsum(axis=1).execute().fetch() + res2 = arr.cumprod(axis=1).execute().fetch() + expected1 = raw.cumsum(axis=1) + expected2 = raw.cumprod(axis=1) + np.testing.assert_array_equal(res1, expected1) + np.testing.assert_array_equal(res2, expected2) + + raw = 
sps.random(8, 8, density=0.1) + + arr = tensor(raw, chunk_size=6) + + res1 = arr.cumsum(axis=1).execute().fetch() + res2 = arr.cumprod(axis=1).execute().fetch() + expected1 = raw.A.cumsum(axis=1) + expected2 = raw.A.cumprod(axis=1) + assert np.allclose(res1, expected1) + assert np.allclose(res2, expected2) + + # test order + raw = np.asfortranarray(np.random.rand(10, 20, 30)) + arr = tensor(raw, chunk_size=13) + arr2 = arr.cumsum(axis=-1) + + res = arr2.execute().fetch() + expected = raw.cumsum(axis=-1) + np.testing.assert_allclose(res, expected) + assert res.flags["C_CONTIGUOUS"] == expected.flags["C_CONTIGUOUS"] + assert res.flags["F_CONTIGUOUS"] == expected.flags["F_CONTIGUOUS"] + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + np.testing.assert_array_equal( + a.cumsum().execute().fetch(), + np.cumsum(np.array(list("abcdefghi"), dtype=object)), + ) + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + np.testing.assert_array_equal( + a.cumsum().execute().fetch(), + np.cumsum(np.array(list("abcdefghi"), dtype=object)), + ) + + # test empty chunks + raw = np.random.rand(100) + arr = tensor(raw, chunk_size=((0, 100),)) + res = arr.cumsum().execute().fetch() + expected = raw.cumsum() + np.testing.assert_allclose(res, expected) + res = arr.cumprod().execute().fetch() + expected = raw.cumprod() + np.testing.assert_allclose(res, expected) + + +def test_nan_cum_reduction(setup): + raw = np.random.randint(5, size=(8, 8, 8)).astype(float) + raw[:2, 2:4, 4:6] = np.nan + + arr = tensor(raw, chunk_size=6) + + res1 = nancumsum(arr, axis=1).execute().fetch() + res2 = nancumprod(arr, axis=1).execute().fetch() + expected1 = np.nancumsum(raw, axis=1) + expected2 = np.nancumprod(raw, axis=1) + np.testing.assert_array_equal(res1, expected1) + np.testing.assert_array_equal(res2, expected2) + + raw = sps.random(8, 8, density=0.1, format="lil") + raw[:2, 2:4] = np.nan + + arr = tensor(raw, chunk_size=6) + + res1 = nancumsum(arr, axis=1).execute().fetch() + res2 = nancumprod(arr, axis=1).execute().fetch() + expected1 = np.nancumsum(raw.A, axis=1) + expected2 = np.nancumprod(raw.A, axis=1) + assert np.allclose(res1, expected1) is True + assert np.allclose(res2, expected2) is True + + +def test_out_reduction_execution(setup): + raw = np.random.randint(5, size=(8, 8, 8)) + + arr = tensor(raw, chunk_size=6) + arr2 = ones((8, 8), dtype="i8", chunk_size=6) + arr.sum(axis=1, out=arr2) + + res = arr2.execute().fetch() + expected = raw.sum(axis=1) + + np.testing.assert_array_equal(res, expected) + + +def test_out_cum_reduction_execution(setup): + raw = np.random.randint(5, size=(8, 8, 8)) + + arr = tensor(raw, chunk_size=6) + arr.cumsum(axis=0, out=arr) + + res = arr.execute().fetch() + expected = raw.cumsum(axis=0) + + np.testing.assert_array_equal(res, expected) + + +def test_count_nonzero_execution(setup): + raw = [[0, 1, 7, 0, 0], [3, 0, 0, 2, 19]] + + arr = tensor(raw, chunk_size=5) + t = count_nonzero(arr) + + res = t.execute().fetch() + expected = np.count_nonzero(raw) + np.testing.assert_equal(res, expected) + + arr = tensor(raw, chunk_size=2) + t = count_nonzero(arr) + + res = t.execute().fetch() + expected = np.count_nonzero(raw) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=0) + + res = t.execute().fetch() + expected = np.count_nonzero(raw, axis=0) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=1) + + res = t.execute().fetch() + expected = np.count_nonzero(raw, axis=1) + np.testing.assert_equal(res, expected) + + raw = 
sps.csr_matrix(raw) + + arr = tensor(raw, chunk_size=2) + t = count_nonzero(arr) + + res = t.execute().fetch() + expected = np.count_nonzero(raw.A) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=0) + + res = t.execute().fetch() + expected = np.count_nonzero(raw.A, axis=0) + np.testing.assert_equal(res, expected) + + t = count_nonzero(arr, axis=1) + + res = t.execute().fetch() + expected = np.count_nonzero(raw.A, axis=1) + np.testing.assert_equal(res, expected) + + # test string dtype + a = tensor(list("abcdefghi"), dtype=object) + assert count_nonzero(a).execute().fetch() == 9 + a = tensor(list("abcdefghi"), dtype=object, chunk_size=2) + assert count_nonzero(a).execute().fetch() == 9 + + +def test_allclose_execution(setup): + a = tensor([1e10, 1e-7], chunk_size=1) + b = tensor([1.00001e10, 1e-8], chunk_size=1) + + t = allclose(a, b) + + res = t.execute().fetch() + assert res is False + + a = tensor([1e10, 1e-8], chunk_size=1) + b = tensor([1.00001e10, 1e-9], chunk_size=1) + + t = allclose(a, b) + + res = t.execute().fetch() + assert res is True + + a = tensor([1.0, np.nan], chunk_size=1) + b = tensor([1.0, np.nan], chunk_size=1) + + t = allclose(a, b, equal_nan=True) + + res = t.execute().fetch() + assert res is True + + a = tensor(sps.csr_matrix([[1e10, 1e-7], [0, 0]]), chunk_size=1) + b = tensor(sps.csr_matrix([[1.00001e10, 1e-8], [0, 0]]), chunk_size=1) + + t = allclose(a, b) + + res = t.execute().fetch() + assert res is False + + # test string dtype + with pytest.raises(TypeError): + a = tensor(list("abcdefghi"), dtype=object) + allclose(a, a).execute() + + +def test_array_equal(setup): + a = ones((10, 5), chunk_size=4) + b = ones((10, 5), chunk_size=5) + + c = array_equal(a, b) + + assert c.execute().fetch() diff --git a/python/xorbits/_mars/tensor/reduction/var.py b/python/xorbits/_mars/tensor/reduction/var.py new file mode 100644 index 000000000..2d763bcc0 --- /dev/null +++ b/python/xorbits/_mars/tensor/reduction/var.py @@ -0,0 +1,312 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import factorial + +import numpy as np + +from ... 
import opcodes as OperandDef +from ...serialization.serializables import Int32Field +from ..array_utils import as_same_device, device, get_array_module +from ..datasource import tensor as astensor +from .core import TensorReduction, TensorReductionMixin, numel + + +def reduce_var_square(var_square, avg_diff, count, op, axis, sum_func): + moment = op.moment + dtype = op.dtype + kw = dict(axis=axis, dtype=dtype, keepdims=bool(op.keepdims)) + + reduced_var_square = var_square[..., moment - 2].sum(**kw) + sum_func( + count * avg_diff**moment, **kw + ) + for i in range(1, moment - 1): + coeff = factorial(moment) / float(factorial(i) * factorial(moment - i)) + reduced_var_square += coeff * sum_func( + var_square[..., moment - i - 2] * avg_diff**moment, **kw + ) + return reduced_var_square + + +class TensorMoment(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.MOMENT + + _moment = Int32Field("moment", default=2) + _ddof = Int32Field("ddof") + + def __init__( + self, + axis=None, + keepdims=None, + moment=None, + ddof=None, + combine_size=None, + stage=None, + **kw + ): + stage = self._rewrite_stage(stage) + if moment is not None: + kw["_moment"] = moment + super().__init__( + _axis=axis, + _keepdims=keepdims, + _ddof=ddof, + _combine_size=combine_size, + stage=stage, + **kw + ) + + @property + def moment(self): + return getattr(self, "_moment", 2) + + @property + def ddof(self): + return self._ddof + + @classmethod + def execute_agg(cls, ctx, op): + axis = cls.get_axis(op.axis) + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + + with device(device_id): + chunk_count = xp.sum(_count, axis=axis, dtype=np.int64, keepdims=True) + chunk_sum = xp.sum(_data, axis=axis, dtype=dtype, keepdims=True) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.sum + ) + + ctx[op.outputs[0].key] = xp.true_divide( + var_square, + xp.sum(chunk_count, axis=axis, dtype=dtype, keepdims=bool(op.keepdims)) + - op.ddof, + dtype=dtype, + ) + + @classmethod + def execute_map(cls, ctx, op): + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + axis = cls.get_axis(op.axis) + moment = op.moment + dtype = op.dtype + empty = get_array_module(in_chunk, nosparse=True).empty + + with device(device_id): + chunk_count = numel( + in_chunk, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + in_chunk, axis=axis, dtype=dtype, keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count) + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + for i in range(2, moment + 1): + var_square[..., i - 2] = xp.sum( + (in_chunk - avg) ** i, + axis=axis, + dtype=dtype, + keepdims=bool(op.keepdims), + ) + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + @classmethod + def execute_combine(cls, ctx, op): + axis = cls.get_axis(op.axis) + moment = op.moment + dtype = op.dtype + + (_data, _count, _var_square), device_id, xp = as_same_device( + ctx[op.inputs[0].key], device=op.device, ret_extra=True + ) + empty = get_array_module(_data, nosparse=True).empty + + with device(device_id): + chunk_count = xp.sum( + _count, axis=axis, dtype=np.int64, keepdims=bool(op.keepdims) + ) + chunk_sum = xp.sum( + _data, axis=axis, dtype=dtype, 
keepdims=bool(op.keepdims) + ) + avg = xp.true_divide(chunk_sum, chunk_count, dtype=dtype) + avg_diff = xp.true_divide(_data, _count, dtype=dtype) - avg + var_square = empty(chunk_count.shape + (moment - 1,), dtype=dtype) + + for m in range(2, moment + 1): + var_square[..., m - 2] = reduce_var_square( + _var_square, avg_diff, _count, op, axis, xp.sum + ) + + ctx[op.outputs[0].key] = (chunk_sum, chunk_count, var_square) + + +class TensorVar(TensorReduction, TensorReductionMixin): + _op_type_ = OperandDef.VAR + + _ddof = Int32Field("ddof") + + def __new__(cls, *args, **kwargs): + if kwargs.get("stage") is not None: + return TensorMoment(*args, **kwargs) + return super().__new__(cls) + + def __init__(self, axis=None, keepdims=None, ddof=0, combine_size=None, **kw): + super().__init__( + _axis=axis, _keepdims=keepdims, _ddof=ddof, _combine_size=combine_size, **kw + ) + + @property + def ddof(self): + return self._ddof + + def _get_op_kw(self): + kw = dict() + kw["ddof"] = self.ddof + return kw + + @classmethod + def execute(cls, ctx, op): + axis = cls.get_axis(op.axis) + (in_chunk,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + ctx[op.outputs[0].key] = xp.var( + in_chunk, + axis=axis, + dtype=op.dtype, + ddof=op.ddof, + keepdims=bool(op.keepdims), + ) + + +def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=None, combine_size=None): + """ + Compute the variance along the specified axis. + + Returns the variance of the tensor elements, a measure of the spread of a + distribution. The variance is computed for the flattened tensor by + default, otherwise over the specified axis. + + Parameters + ---------- + a : array_like + Tensor containing numbers whose variance is desired. If `a` is not a + tensor, a conversion is attempted. + axis : None or int or tuple of ints, optional + Axis or axes along which the variance is computed. The default is to + compute the variance of the flattened array. + + If this is a tuple of ints, a variance is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the variance. For arrays of integer type + the default is `float32`; for tensors of float types it is the same as + the tensor type. + out : Tensor, optional + Alternate output array in which to place the result. It must have + the same shape as the expected output, but the type is cast if + necessary. + ddof : int, optional + "Delta Degrees of Freedom": the divisor used in the calculation is + ``N - ddof``, where ``N`` represents the number of elements. By + default `ddof` is zero. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input tensor. + + If the default value is passed, then `keepdims` will not be + passed through to the `var` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + combine_size: int, optional + The number of chunks to combine. + + Returns + ------- + variance : Tensor, see dtype parameter above + If ``out=None``, returns a new tensor containing the variance; + otherwise, a reference to the output tensor is returned. 
+ + See Also + -------- + std , mean, nanmean, nanstd, nanvar + + Notes + ----- + The variance is the average of the squared deviations from the mean, + i.e., ``var = mean(abs(x - x.mean())**2)``. + + The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``. + If, however, `ddof` is specified, the divisor ``N - ddof`` is used + instead. In standard statistical practice, ``ddof=1`` provides an + unbiased estimator of the variance of a hypothetical infinite population. + ``ddof=0`` provides a maximum likelihood estimate of the variance for + normally distributed variables. + + Note that for complex numbers, the absolute value is taken before + squaring, so that the result is always real and nonnegative. + + For floating-point input, the variance is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for `float32` (see example + below). Specifying a higher-accuracy accumulator using the ``dtype`` + keyword can alleviate this issue. + + Examples + -------- + >>> import mars.tensor as mt + + >>> a = mt.array([[1, 2], [3, 4]]) + >>> mt.var(a).execute() + 1.25 + >>> mt.var(a, axis=0).execute() + array([ 1., 1.]) + >>> mt.var(a, axis=1).execute() + array([ 0.25, 0.25]) + + In single precision, var() can be inaccurate: + + >>> a = mt.zeros((2, 512*512), dtype=mt.float32) + >>> a[0, :] = 1.0 + >>> a[1, :] = 0.1 + >>> mt.var(a).execute() + 0.20250003 + + Computing the variance in float64 is more accurate: + + >>> mt.var(a, dtype=mt.float64).execute() + 0.20249999932944759 + >>> ((1-0.55)**2 + (0.1-0.55)**2)/2 + 0.2025 + + """ + a = astensor(a) + if dtype is None: + dtype = np.var(np.ones((1,), dtype=a.dtype)).dtype + op = TensorVar( + axis=axis, dtype=dtype, keepdims=keepdims, ddof=ddof, combine_size=combine_size + ) + return op(a, out=out) diff --git a/python/xorbits/_mars/tensor/reshape/__init__.py b/python/xorbits/_mars/tensor/reshape/__init__.py new file mode 100644 index 000000000..e5a45443f --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .reshape import reshape diff --git a/python/xorbits/_mars/tensor/reshape/reshape.py b/python/xorbits/_mars/tensor/reshape/reshape.py new file mode 100644 index 000000000..c38d5fa34 --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/reshape.py @@ -0,0 +1,634 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import FieldTypes, KeyField, StringField, TupleField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..datasource import tensor as astensor +from ..operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ..utils import decide_chunk_sizes, get_order + +logger = logging.getLogger(__name__) + + +class TensorReshape(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.RESHAPE + + _input = KeyField("input") + _newshape = TupleField("newshape", FieldTypes.int64) + _order = StringField("order") + + _axis_offsets = TupleField("axis_offsets", FieldTypes.uint64) + _oldshape = TupleField("oldshape", FieldTypes.uint64) + _new_chunk_size = TupleField("new_chunk_size", FieldTypes.uint64) + + def __init__( + self, + newshape=None, + order=None, + axis_offsets=None, + oldshape=None, + new_chunk_size=None, + **kw, + ): + super().__init__( + _newshape=newshape, + _order=order, + _axis_offsets=axis_offsets, + _oldshape=oldshape, + _new_chunk_size=new_chunk_size, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def newshape(self): + return self._newshape + + @property + def axis_offsets(self): + return self._axis_offsets + + @property + def oldshape(self): + return self._oldshape + + @property + def new_chunk_size(self): + return self._new_chunk_size + + @property + def order(self): + return self._order + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + + def on_output_modify(self, new_output): + return reshape(new_output, self._input.shape) + + def on_input_modify(self, new_input): + op = self.copy().reset_key() + return op(new_input) + + def __call__(self, a, order, out_shape): + return self.new_tensor([a], out_shape, order=order) + + @staticmethod + def _gen_reshape_rechunk_nsplits(old_shape, new_shape, nsplits): + old_idx = len(old_shape) - 1 + new_idx = len(new_shape) - 1 + rechunk_nsplists = [None for _ in old_shape] + reshape_nsplists = [None for _ in new_shape] + + while old_idx >= 0 or new_idx >= 0: + old_dim_size = old_shape[old_idx] + new_dim_size = new_shape[new_idx] + + if old_dim_size == new_dim_size: + # nothing need to do + rechunk_nsplists[old_idx] = nsplits[old_idx] + reshape_nsplists[new_idx] = nsplits[old_idx] + old_idx -= 1 + new_idx -= 1 + continue + + if old_dim_size == 1: + rechunk_nsplists[old_idx] = (1,) + old_idx -= 1 + elif new_dim_size == 1: + reshape_nsplists[new_idx] = (1,) + new_idx -= 1 + elif old_dim_size < new_dim_size: + left_old_idx = old_idx - 1 + while ( + left_old_idx >= 0 + and np.prod(old_shape[left_old_idx : old_idx + 1]) < new_dim_size + ): + left_old_idx -= 1 + if np.prod(old_shape[left_old_idx : old_idx + 1]) != new_dim_size: + raise ValueError("shapes not compatible") + + for i in range(left_old_idx + 1, old_idx + 1): + # rechunk the higher dimension into 1 chunk + # e.g. 
((2, 2, 2), [(3, 3), (4, 4))] -> [6, 8] + rechunk_nsplists[i] = (old_shape[i],) + + chunk_reduce = np.prod( + [len(c) for c in nsplits[left_old_idx + 1 : old_idx + 1]] + ).item() + # cause the higher dimension has been concatenated, + # the lowest dimension should be expanded to reduce size + rechunk_nsplists[left_old_idx] = TensorReshape._expand_nsplit_by_reduce( + nsplits[left_old_idx], chunk_reduce + ) + + size_reduce = np.prod(old_shape[left_old_idx + 1 : old_idx + 1]).item() + reshape_nsplists[new_idx] = tuple( + size_reduce * c for c in rechunk_nsplists[left_old_idx] + ) + + old_idx = left_old_idx - 1 + new_idx -= 1 + else: + assert old_dim_size > new_dim_size + lef_new_idx = new_idx - 1 + while ( + lef_new_idx >= 0 + and np.prod(new_shape[lef_new_idx : new_idx + 1]) < old_dim_size + ): + lef_new_idx -= 1 + if np.prod(new_shape[lef_new_idx : new_idx + 1]) != old_dim_size: + raise ValueError("shapes not compatible") + + chunk_expand = np.prod(new_shape[lef_new_idx + 1 : new_idx + 1]).item() + rechunk_nsplists[old_idx] = TensorReshape._reduce_nsplit_by_expand( + nsplits[old_idx], chunk_expand + ) + + for i in range(lef_new_idx + 1, new_idx + 1): + reshape_nsplists[i] = (new_shape[i],) + reshape_nsplists[lef_new_idx] = tuple( + c // chunk_expand for c in rechunk_nsplists[old_idx] + ) + + old_idx -= 1 + new_idx = lef_new_idx - 1 + + assert np.prod([len(s) for s in rechunk_nsplists]) == np.prod( + [len(s) for s in reshape_nsplists] + ) + return rechunk_nsplists, reshape_nsplists + + @staticmethod + def _expand_nsplit_by_reduce(splits, reduced): + if reduced == 1: + return splits + + out = [] + for s in splits: + x = s + part = max(x / reduced, 1) + while x >= 2 * part: + out.append(int(part)) + x -= int(part) + if x: + out.append(x) + assert sum(splits) == sum(out) + return tuple(out) + + @staticmethod + def _reduce_nsplit_by_expand(splits, expand): + assert sum(splits) % expand == 0 + + out = [] + residual = 0 + for chunk in splits: + chunk += residual + div = chunk // expand + residual = chunk % expand + good = expand * div + if good: + out.append(good) + return tuple(out) + + @staticmethod + def _tile_as_shuffle(op): + in_tensor = op.input + tensor = op.outputs[0] + new_shape = op.newshape + shuffle_inputs, shuffle_outputs = [], [] + axis_offsets = [[0] + np.cumsum(ns)[:-1].tolist() for ns in in_tensor.nsplits] + + max_chunk_size = max(max(tp) for tp in in_tensor.nsplits) + out_nsplits = decide_chunk_sizes( + new_shape, max_chunk_size, tensor.dtype.itemsize + ) + chunk_size_idxes = (range(len(size)) for size in out_nsplits) + + for inp in in_tensor.chunks: + offset = tuple( + axis_offsets[axis][idx] for axis, idx in enumerate(inp.index) + ) + chunk_op = TensorReshape( + stage=OperandStage.map, + axis_offsets=offset, + oldshape=in_tensor.shape, + newshape=new_shape, + new_chunk_size=(max_chunk_size,) * len(new_shape), + dtype=inp.dtype, + ) + shuffle_inputs.append( + chunk_op.new_chunk([inp], shape=(np.nan,), index=inp.index) + ) + + proxy_chunk = TensorShuffleProxy( + dtype=in_tensor.dtype, _tensor_keys=[in_tensor.op.key] + ).new_chunk(shuffle_inputs, shape=()) + + out_indices = list( + zip(itertools.product(*out_nsplits), itertools.product(*chunk_size_idxes)) + ) + for chunk_shape, chunk_idx in out_indices: + chunk_op = TensorReshape( + stage=OperandStage.reduce, + dtype=tensor.dtype, + n_reducers=len(out_indices), + ) + shuffle_outputs.append( + chunk_op.new_chunk( + [proxy_chunk], + shape=chunk_shape, + order=tensor.order, + index=chunk_idx, + ) + ) + + new_op = op.copy() + return 
new_op.new_tensors( + op.inputs, + new_shape, + order=tensor.order, + chunks=shuffle_outputs, + nsplits=out_nsplits, + ) + + @classmethod + def tile(cls, op): + in_tensor = op.input + tensor = op.outputs[0] + + # check unknown shape + if has_unknown_shape(*op.inputs): + yield + + if any(np.isnan(s) for s in tensor.shape): + # -1 exists in newshape and input tensor has unknown shape + # recalculate new shape + shape = tuple(-1 if np.isnan(s) else s for s in tensor.shape) + op._newshape = newshape = calc_shape(in_tensor.size, shape) + tensor._shape = newshape + + if op.order == "F": + # do transpose first, then do regular reshape, then transpose back + result = in_tensor.transpose().reshape(op.newshape[::-1]) + if getattr(op, "_reshape_with_shuffle", True): + result.op.extra_params["_reshape_with_shuffle"] = True + result = result.transpose() + return [(yield from recursive_tile(result))] + + if len(in_tensor.chunks) == 1: + # 1 chunk + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + in_tensor.chunks, + shape=tensor.shape, + order=tensor.order, + index=(0,) * tensor.ndim, + ) + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=tensor.shape, + order=tensor.order, + chunks=[chunk], + nsplits=tuple((s,) for s in tensor.shape), + ) + try: + rechunk_nsplits, reshape_nsplits = cls._gen_reshape_rechunk_nsplits( + in_tensor.shape, tensor.shape, in_tensor.nsplits + ) + rechunked_tensor = yield from recursive_tile( + in_tensor.rechunk(rechunk_nsplits) + ) + in_idxes = itertools.product(*[range(len(s)) for s in rechunk_nsplits]) + out_idxes = itertools.product(*[range(len(s)) for s in reshape_nsplits]) + out_shape = itertools.product(*[s for s in reshape_nsplits]) + out_chunks = [] + for input_idx, out_idx, out_shape in zip(in_idxes, out_idxes, out_shape): + in_chunk = rechunked_tensor.cix[input_idx] + chunk_op = op.copy().reset_key() + chunk_op._newshape = out_shape + out_chunk = chunk_op.new_chunk( + [in_chunk], shape=out_shape, order=tensor.order, index=out_idx + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=reshape_nsplits, + ) + except ValueError: + # TODO: make this as default when shuffle is mature + if getattr(op.extra_params, "_reshape_with_shuffle", False): + return cls._tile_as_shuffle(op) + + # shape incompatible, we will first do flatten, then reshape to the new shape + return [ + ( + yield from recursive_tile( + in_tensor.reshape(-1, order=tensor.op.order).reshape( + tensor.shape, order=tensor.op.order + ) + ) + ) + ] + + @classmethod + def estimate_size(cls, ctx, op): + chunk = op.outputs[0] + if op.stage == OperandStage.map: + inp_chunk = chunk.inputs[0] + inp_size, inp_calc = ctx[inp_chunk.key] + store_overhead = np.int64().itemsize * inp_chunk.ndim + calc_overhead = np.int64().itemsize * (inp_chunk.ndim + 2) + ctx[chunk.key] = (store_overhead + inp_size, calc_overhead + inp_calc) + elif op.stage == OperandStage.reduce: + sum_size = 0 + for shuffle_input in chunk.inputs[0].inputs or (): + key = (shuffle_input.key, chunk.index) + if ctx.get(key) is not None: + sum_size += ctx[key][0] + else: + ctx[key] = None + ctx[chunk.key] = (chunk.nbytes, max(sum_size, chunk.nbytes)) + else: + super().estimate_size(ctx, op) + + @classmethod + def _execute_map(cls, ctx, op): + chunk = op.outputs[0] + # todo this function is an experimental one making shuffle runnable. + # try elevate performance when needed. 
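+        # Outline of the map step: only the non-zero elements of this input
+        # chunk are shuffled.  Their local indices are shifted by the chunk's
+        # global offsets, flattened against the old shape, unravelled against
+        # the new shape, and grouped by the output chunk that owns each
+        # element; the reduce step then scatters every group into a
+        # zero-filled array of the output chunk's shape.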
+ old_shape = op.oldshape + new_shape = op.newshape + new_chunk_size = op.new_chunk_size + axis_offset = op.axis_offsets + + logger.debug("Reshape mapper: Start mapping step for %s", chunk.key) + + data = ctx[op.inputs[0].key] + indices = list(np.nonzero(data)) + nz_data = data[tuple(indices)] + + for idx in range(len(old_shape)): + indices[idx] = np.add(indices[idx], axis_offset[idx], out=indices[idx]) + rest_indices = indices[0] + indices[0] = None + for idx in range(1, len(old_shape)): + rest_indices = np.multiply(rest_indices, old_shape[idx], out=rest_indices) + rest_indices = np.add(rest_indices, indices[idx], out=rest_indices) + indices[idx] = None + del indices + + new_indices = [] + for dim_size in reversed(new_shape[1:]): + new_index = rest_indices % dim_size + new_indices.append(new_index) + rest_indices = np.floor_divide(rest_indices, dim_size, out=rest_indices) + new_indices.append(rest_indices) + new_indices.reverse() + del rest_indices + + logger.debug("Reshape mapper: remapping to new locations for %s", chunk.key) + + dim_chunk_counts = [ + int(np.ceil(dim_size * 1.0 / chunk_size)) + for dim_size, chunk_size in zip(new_shape, new_chunk_size) + ] + target = new_indices[0] // new_chunk_size[0] + for new_index, chunk_size, dim_chunk_count in zip( + new_indices[1:], new_chunk_size[1:], dim_chunk_counts[1:] + ): + target = np.multiply(target, dim_chunk_count, out=target) + target = np.add(target, new_index // chunk_size, out=target) + + for idx, chunk_size in enumerate(new_chunk_size): + new_indices[idx] = np.mod( + new_indices[idx], chunk_size, out=new_indices[idx] + ) + + logger.debug("Reshape mapper: sorting for %s", chunk.key) + + sort_idx = np.argsort(target) + target = target[sort_idx] + nz_data = nz_data[sort_idx] + for idx in range(len(new_indices)): + new_indices[idx] = new_indices[idx][sort_idx] + del sort_idx + + logger.debug("Reshape mapper: splitting for %s", chunk.key) + + mapper_outputs = {} + for t in np.unique(target): + data_slice = slice( + np.searchsorted(target, t), np.searchsorted(target, t, "right") + ) + group_indices = tuple( + new_indices[idx][data_slice] for idx in range(len(new_shape)) + ) + group_data = nz_data[data_slice] + + target_chunk_idx = [None] * len(dim_chunk_counts) + for idx, dim_chunk_count in enumerate(reversed(dim_chunk_counts)): + t, target_chunk_idx[idx] = divmod(t, dim_chunk_count) + target_chunk_idx.reverse() + + mapper_outputs[chunk.key, tuple(target_chunk_idx)] = group_indices + ( + group_data, + ) + + # ensure all mapper data are inserted context and fill missing partition with None + for target_chunk_idx in itertools.product( + *(range(dim_chunk_cnt) for dim_chunk_cnt in dim_chunk_counts) + ): + data_key = chunk.key, tuple(target_chunk_idx) + ctx[data_key] = mapper_outputs.get(data_key) + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorReshape"): + chunk = op.outputs[0] + try: + result_array = ctx[chunk.key] + except KeyError: + result_array = np.zeros( + chunk.shape, dtype=chunk.dtype, order=chunk.order.value + ) + for data_tuple in op.iter_mapper_data(ctx, skip_none=True): + if data_tuple is None: + # skip missing partition data + continue + result_array[data_tuple[:-1]] = data_tuple[-1] + ctx[chunk.key] = result_array + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + (x,), device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + 
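+            # regular (non-shuffle) path: the chunk data is reshaped in a
+            # single call on whatever device it currently lives on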
with device(device_id): + ctx[op.outputs[0].key] = x.reshape(op.newshape, order=op.order) + + +def calc_shape(size, newshape): + if isinstance(newshape, int): + newshape = (newshape,) + else: + newshape = tuple(int(s) for s in newshape) + + known_shape = [s for s in newshape if s >= 0] + missing_dim = len(newshape) - len(known_shape) + if missing_dim > 1: + raise ValueError("can only specify one unknown dimension") + if missing_dim == 1: + known_size = np.prod(known_shape) + newshape = tuple( + int(size / known_size) if s < 0 and known_size > 0 else s for s in newshape + ) + + return newshape + + +def reshape(a, newshape, order="C"): + """ + Gives a new shape to a tensor without changing its data. + + Parameters + ---------- + a : array_like + Tensor to be reshaped. + newshape : int or tuple of ints + The new shape should be compatible with the original shape. If + an integer, then the result will be a 1-D tensor of that length. + One shape dimension can be -1. In this case, the value is + inferred from the length of the tensor and remaining dimensions. + order : {'C', 'F', 'A'}, optional + Read the elements of `a` using this index order, and place the + elements into the reshaped array using this index order. 'C' + means to read / write the elements using C-like index order, + with the last axis index changing fastest, back to the first + axis index changing slowest. 'F' means to read / write the + elements using Fortran-like index order, with the first index + changing fastest, and the last index changing slowest. Note that + the 'C' and 'F' options take no account of the memory layout of + the underlying array, and only refer to the order of indexing. + 'A' means to read / write the elements in Fortran-like index + order if `a` is Fortran *contiguous* in memory, C-like order + otherwise. + + Returns + ------- + reshaped_array : Tensor + This will be a new view object if possible; otherwise, it will + be a copy. + + See Also + -------- + Tensor.reshape : Equivalent method. + + Notes + ----- + It is not always possible to change the shape of a tensor without + copying the data. If you want an error to be raised when the data is copied, + you should assign the new shape to the shape attribute of the array:: + + >>> import mars.tensor as mt + + >>> a = mt.arange(6).reshape((3, 2)) + >>> a.execute() + array([[0, 1], + [2, 3], + [4, 5]]) + + You can think of reshaping as first raveling the tensor (using the given + index order), then inserting the elements from the raveled tensor into the + new tensor using the same kind of index ordering as was used for the + raveling. 
+ + >>> mt.reshape(a, (2, 3)).execute() + array([[0, 1, 2], + [3, 4, 5]]) + >>> mt.reshape(mt.ravel(a), (2, 3)).execute() + array([[0, 1, 2], + [3, 4, 5]]) + + Examples + -------- + >>> a = mt.array([[1,2,3], [4,5,6]]) + >>> mt.reshape(a, 6).execute() + array([1, 2, 3, 4, 5, 6]) + + >>> mt.reshape(a, (3,-1)).execute() # the unspecified value is inferred to be 2 + array([[1, 2], + [3, 4], + [5, 6]]) + """ + a = astensor(a) + + if np.isnan(sum(a.shape)): + # some shape is nan + new_shape = [newshape] if isinstance(newshape, int) else list(newshape) + # if -1 exists in newshape, just treat it as unknown shape + new_shape = [s if s != -1 else np.nan for s in new_shape] + out_shape = tuple(new_shape) + else: + out_shape = newshape = calc_shape(a.size, newshape) + if a.size != np.prod(newshape): + raise ValueError( + f"cannot reshape array of size {a.size} into shape {newshape}" + ) + + tensor_order = get_order(order, a.order, available_options="CFA") + + if a.shape == newshape and ( + a.ndim <= 1 or (a.ndim > 1 and tensor_order == a.order) + ): + # does not need to reshape + return a + return _reshape( + a, newshape, order=order, tensor_order=tensor_order, out_shape=out_shape + ) + + +def _reshape(a, newshape, order="C", tensor_order=None, out_shape=None): + if tensor_order is None: + tensor_order = get_order(order, a.order, available_options="CFA") + op = TensorReshape( + newshape, order, dtype=a.dtype, create_view=tensor_order == a.order + ) + if out_shape is None: + out_shape = newshape + return op(a, tensor_order, out_shape) diff --git a/python/xorbits/_mars/tensor/reshape/tests/__init__.py b/python/xorbits/_mars/tensor/reshape/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/reshape/tests/test_reshape.py b/python/xorbits/_mars/tensor/reshape/tests/test_reshape.py new file mode 100644 index 000000000..94447fc47 --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/tests/test_reshape.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
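+# Tiling tests for ``TensorReshape``.  When the old and new nsplits can be
+# made compatible, ``tile`` only rechunks the input so that every output
+# chunk comes from exactly one input chunk; for instance, reshaping a (4, 6)
+# tensor chunked as ((2, 2), (3, 3)) into (4, 3, 2) only needs the last axis
+# rechunked to (2, 4), after which each chunk reshapes locally.  When the
+# shapes cannot be aligned and ``_reshape_with_shuffle`` is set, a map/reduce
+# shuffle is generated instead, which is what ``test_shuffle_reshape`` checks.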
+ +import numpy as np +import pytest + +from ....core import tile +from ....core.operand import OperandStage +from ...datasource import ones +from ..reshape import TensorReshape + + +def test_reshape(): + a = ones((10, 20, 30), chunk_size=5) + b = a.reshape(10, 600) + + b = tile(b) + + assert tuple(sum(s) for s in b.nsplits) == (10, 600) + + a = ones((10, 600), chunk_size=5) + b = a.reshape(10, 30, 20) + + b = tile(b) + + assert tuple(sum(s) for s in b.nsplits) == (10, 30, 20) + + a = ones((10, 600), chunk_size=5) + a.shape = [10, 30, 20] + + a = tile(a) + + assert tuple(sum(s) for s in a.nsplits) == (10, 30, 20) + + # test reshape unknown shape + c = a[a > 0] + d = c.reshape(10, 600) + assert d.shape == (10, 600) + d = c.reshape(-1, 10) + assert len(d.shape) == 2 + assert np.isnan(d.shape[0]) + assert d.shape[1] + + with pytest.raises(TypeError): + a.reshape((10, 30, 20), other_argument=True) + + +def test_shuffle_reshape(): + a = ones((31, 27), chunk_size=10) + b = a.reshape(27, 31) + b.op.extra_params["_reshape_with_shuffle"] = True + + b = tile(b) + + assert tuple(sum(s) for s in b.nsplits) == (27, 31) + assert isinstance(b.chunks[0].op, TensorReshape) + assert b.chunks[0].op.stage == OperandStage.reduce + + shuffle_map_sample = b.chunks[0].inputs[0].inputs[0] + assert isinstance(shuffle_map_sample.op, TensorReshape) + assert shuffle_map_sample.op.stage == OperandStage.map diff --git a/python/xorbits/_mars/tensor/reshape/tests/test_reshape_execution.py b/python/xorbits/_mars/tensor/reshape/tests/test_reshape_execution.py new file mode 100644 index 000000000..34b9360dc --- /dev/null +++ b/python/xorbits/_mars/tensor/reshape/tests/test_reshape_execution.py @@ -0,0 +1,98 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
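+# Execution tests for reshape: besides the resulting values, they check the
+# memory order of the output (C versus Fortran contiguity), reshaping of
+# tensors whose shape is only known at run time (after boolean indexing), and
+# the shuffle-based implementation enabled through ``_reshape_with_shuffle``.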
+ +import numpy as np + +from ...datasource import ones, tensor + + +def test_reshape_execution(setup): + x = ones((1, 2, 3), chunk_size=[4, 3, 5]) + y = x.reshape(3, 2) + res = y.execute().fetch() + assert y.shape == (3, 2) + np.testing.assert_equal(res, np.ones((3, 2))) + + data = np.random.rand(6, 4) + x2 = tensor(data, chunk_size=2) + y2 = x2.reshape(3, 8, order="F") + res = y2.execute().fetch() + expected = data.reshape((3, 8), order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + data2 = np.asfortranarray(np.random.rand(6, 4)) + x3 = tensor(data2, chunk_size=2) + y3 = x3.reshape(3, 8) + res = y3.execute().fetch() + expected = data2.reshape((3, 8)) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False + + data2 = np.asfortranarray(np.random.rand(6, 4)) + x3 = tensor(data2, chunk_size=2) + y3 = x3.reshape(3, 8, order="F") + res = y3.execute().fetch() + expected = data2.reshape((3, 8), order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + for chunk_size in [None, 3]: + rs = np.random.RandomState(0) + data = rs.rand(3, 4, 5) + x = tensor(data, chunk_size=chunk_size) + x = x[x[:, 0, 0] < 0.7] + y = x.reshape(-1, 20) + assert np.isnan(y.shape[0]) + res = y.execute().fetch() + expected = data[data[:, 0, 0] < 0.7].reshape(-1, 20) + np.testing.assert_array_equal(res, expected) + + +def test_shuffle_reshape_execution(setup): + a = ones((31, 27), chunk_size=10) + b = a.reshape(27, 31) + b.op.extra_params["_reshape_with_shuffle"] = True + + res = b.execute().fetch() + np.testing.assert_array_equal(res, np.ones((27, 31))) + + b2 = a.reshape(27, 31, order="F") + b.op.extra_params["_reshape_with_shuffle"] = True + res = b2.execute().fetch() + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + data = np.random.rand(6, 4) + x2 = tensor(data, chunk_size=2) + y2 = x2.reshape(4, 6, order="F") + y2.op.extra_params["_reshape_with_shuffle"] = True + res = y2.execute().fetch() + expected = data.reshape((4, 6), order="F") + np.testing.assert_array_equal(res, expected) + assert res.flags["F_CONTIGUOUS"] is True + assert res.flags["C_CONTIGUOUS"] is False + + data2 = np.asfortranarray(np.random.rand(6, 4)) + x3 = tensor(data2, chunk_size=2) + y3 = x3.reshape(4, 6) + y3.op.extra_params["_reshape_with_shuffle"] = True + res = y3.execute().fetch() + expected = data2.reshape((4, 6)) + np.testing.assert_array_equal(res, expected) + assert res.flags["C_CONTIGUOUS"] is True + assert res.flags["F_CONTIGUOUS"] is False diff --git a/python/xorbits/_mars/tensor/spatial/__init__.py b/python/xorbits/_mars/tensor/spatial/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/spatial/distance/__init__.py b/python/xorbits/_mars/tensor/spatial/distance/__init__.py new file mode 100644 index 000000000..0e31d9a3f --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cdist import cdist +from .pdist import pdist +from .squareform import squareform diff --git a/python/xorbits/_mars/tensor/spatial/distance/cdist.py b/python/xorbits/_mars/tensor/spatial/distance/cdist.py new file mode 100644 index 000000000..5b975b441 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/cdist.py @@ -0,0 +1,564 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import Tuple + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....core import recursive_tile +from ....serialization.serializables import AnyField, Float16Field, KeyField +from ....utils import ensure_own_data, has_unknown_shape, require_module +from ...array_utils import as_same_device, cp, device +from ...core import TensorOrder +from ...datasource import tensor as astensor +from ...operands import TensorOperand, TensorOperandMixin + + +class TensorCdist(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.CDIST + + _xa = KeyField("XA") + _xb = KeyField("XB") + _metric = AnyField("metric") + _p = Float16Field("p", on_serialize=lambda x: float(x) if x is not None else x) + _w = KeyField("w") + _v = KeyField("V") + _vi = KeyField("VI") + + def __init__(self, metric=None, p=None, w=None, v=None, vi=None, **kw): + super().__init__(_metric=metric, _p=p, _w=w, _v=v, _vi=vi, **kw) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._xa = next(inputs_iter) + self._xb = next(inputs_iter) + if self._w is not None: + self._w = next(inputs_iter) + if self._v is not None: + self._v = next(inputs_iter) + if self._vi is not None: + self._vi = next(inputs_iter) + + @property + def xa(self): + return self._xa + + @property + def xb(self): + return self._xb + + @property + def metric(self): + return self._metric + + @property + def p(self): + return self._p + + @property + def w(self): + return self._w + + @property + def v(self): + return self._v + + @property + def vi(self): + return self._vi + + def __call__(self, xa, xb, shape: Tuple): + inputs = [xa, xb] + for val in [self._w, self._v, self._vi]: + if val is not None: + inputs.append(val) + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op, xa, xb, w, v, vi): + out_tensor = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_inputs = [xa.chunks[0], xb.chunks[0]] + for val in [w, v, vi]: + if val is not None: + chunk_inputs.append(val.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=out_tensor.shape, + order=out_tensor.order, + index=(0,) * out_tensor.ndim, + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=tuple((s,) for s in out_tensor.shape), + chunks=[chunk], + ) + + @classmethod + def _tile_chunks(cls, op, xa, xb, w, v, vi): + out_tensor = op.outputs[0] + acs, bcs = xa.chunk_shape[0], xb.chunk_shape[0] + + out_chunks = [] + for idx in itertools.product(range(acs), range(bcs)): + ixa, ixb = idx + chunk_op = op.copy().reset_key() + + chunk_inputs = [] + xa_chunk = xa.cix[ixa, 0] + xb_chunk = xb.cix[ixb, 0] + chunk_inputs.extend([xa_chunk, xb_chunk]) + if w is not None: + w_chunk = chunk_op._w = w.chunks[0] + chunk_inputs.append(w_chunk) + if v is not None: + v_chunk = chunk_op._v = v.chunks[0] + chunk_inputs.append(v_chunk) + if vi is not None: + vi_chunk = chunk_op._vi = vi.chunks[0] + chunk_inputs.append(vi_chunk) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=(xa_chunk.shape[0], xb_chunk.shape[0]), + order=out_tensor.order, + index=idx, + ) + out_chunks.append(chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + chunks=out_chunks, + nsplits=(xa.nsplits[0], xb.nsplits[0]), + ) + + @classmethod + def tile(cls, op): + # make sure every inputs have known shape + if has_unknown_shape(*op.inputs): + yield + + xa = op.xa.rechunk({1: op.xa.shape[1]}) + xb = 
op.xb.rechunk({1: op.xb.shape[1]}) + xa, xb = yield from recursive_tile(xa, xb) + + # rechunk w, v, vi into one chunk if any of them has value + extra_inputs = [None] * 3 + for i, ei in enumerate([op.w, op.v, op.vi]): + if ei is None: + continue + new_ei = yield from recursive_tile(ei.rechunk(ei.shape)) + extra_inputs[i] = new_ei + w, v, vi = extra_inputs + + if len(xa.chunks) == 1 and len(xb.chunks) == 1: + # only 1 chunk + return cls._tile_one_chunk(op, xa, xb, w, v, vi) + else: + return cls._tile_chunks(op, xa, xb, w, v, vi) + + @classmethod + def execute(cls, ctx, op): + from scipy.spatial.distance import cdist + + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError("`cdist` does not support running on GPU yet") + + with device(device_id): + inputs_iter = iter(inputs) + xa = next(inputs_iter) + xb = next(inputs_iter) + kw = dict() + if op.p is not None: + kw["p"] = op.p + if op.w is not None: + kw["w"] = next(inputs_iter) + if op.v is not None: + kw["V"] = next(inputs_iter) + if op.vi is not None: + kw["VI"] = next(inputs_iter) + + ctx[op.outputs[0].key] = cdist( + ensure_own_data(xa), ensure_own_data(xb), metric=op.metric, **kw + ) + + +@require_module("scipy.spatial.distance") +def cdist(XA, XB, metric="euclidean", **kwargs): + """ + Compute distance between each pair of the two collections of inputs. + + See Notes for common calling conventions. + + Parameters + ---------- + XA : Tensor + An :math:`m_A` by :math:`n` tensor of :math:`m_A` + original observations in an :math:`n`-dimensional space. + Inputs are converted to float type. + XB : Tensor + An :math:`m_B` by :math:`n` tensor of :math:`m_B` + original observations in an :math:`n`-dimensional space. + Inputs are converted to float type. + metric : str or callable, optional + The distance metric to use. If a string, the distance function can be + 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', + 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', + 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'wminkowski', 'yule'. + **kwargs : dict, optional + Extra arguments to `metric`: refer to each metric documentation for a + list of all possible arguments. + + Some possible arguments: + + p : scalar + The p-norm to apply for Minkowski, weighted and unweighted. + Default: 2. + + w : Tensor + The weight vector for metrics that support weights (e.g., Minkowski). + + V : Tensor + The variance vector for standardized Euclidean. + Default: var(vstack([XA, XB]), axis=0, ddof=1) + + VI : Tensor + The inverse of the covariance matrix for Mahalanobis. + Default: inv(cov(vstack([XA, XB].T))).T + + out : Tensor + The output tensor + If not None, the distance matrix Y is stored in this tensor. + Note: metric independent, it will become a regular keyword arg in a + future scipy version + + Returns + ------- + Y : Tensor + A :math:`m_A` by :math:`m_B` distance matrix is returned. + For each :math:`i` and :math:`j`, the metric + ``dist(u=XA[i], v=XB[j])`` is computed and stored in the + :math:`ij` th entry. + + Raises + ------ + ValueError + An exception is thrown if `XA` and `XB` do not have + the same number of columns. + + Notes + ----- + The following are common calling conventions: + + 1. 
``Y = cdist(XA, XB, 'euclidean')`` + + Computes the distance between :math:`m` points using + Euclidean distance (2-norm) as the distance metric between the + points. The points are arranged as :math:`m` + :math:`n`-dimensional row vectors in the matrix X. + + 2. ``Y = cdist(XA, XB, 'minkowski', p=2.)`` + + Computes the distances using the Minkowski distance + :math:`||u-v||_p` (:math:`p`-norm) where :math:`p \\geq 1`. + + 3. ``Y = cdist(XA, XB, 'cityblock')`` + + Computes the city block or Manhattan distance between the + points. + + 4. ``Y = cdist(XA, XB, 'seuclidean', V=None)`` + + Computes the standardized Euclidean distance. The standardized + Euclidean distance between two n-vectors ``u`` and ``v`` is + + .. math:: + + \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}. + + V is the variance vector; V[i] is the variance computed over all + the i'th components of the points. If not passed, it is + automatically computed. + + 5. ``Y = cdist(XA, XB, 'sqeuclidean')`` + + Computes the squared Euclidean distance :math:`||u-v||_2^2` between + the vectors. + + 6. ``Y = cdist(XA, XB, 'cosine')`` + + Computes the cosine distance between vectors u and v, + + .. math:: + + 1 - \\frac{u \\cdot v} + {{||u||}_2 {||v||}_2} + + where :math:`||*||_2` is the 2-norm of its argument ``*``, and + :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`. + + 7. ``Y = cdist(XA, XB, 'correlation')`` + + Computes the correlation distance between vectors u and v. This is + + .. math:: + + 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})} + {{||(u - \\bar{u})||}_2 {||(v - \\bar{v})||}_2} + + where :math:`\\bar{v}` is the mean of the elements of vector v, + and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`. + + + 8. ``Y = cdist(XA, XB, 'hamming')`` + + Computes the normalized Hamming distance, or the proportion of + those vector elements between two n-vectors ``u`` and ``v`` + which disagree. To save memory, the matrix ``X`` can be of type + boolean. + + 9. ``Y = cdist(XA, XB, 'jaccard')`` + + Computes the Jaccard distance between the points. Given two + vectors, ``u`` and ``v``, the Jaccard distance is the + proportion of those elements ``u[i]`` and ``v[i]`` that + disagree where at least one of them is non-zero. + + 10. ``Y = cdist(XA, XB, 'chebyshev')`` + + Computes the Chebyshev distance between the points. The + Chebyshev distance between two n-vectors ``u`` and ``v`` is the + maximum norm-1 distance between their respective elements. More + precisely, the distance is given by + + .. math:: + + d(u,v) = \\max_i {|u_i-v_i|}. + + 11. ``Y = cdist(XA, XB, 'canberra')`` + + Computes the Canberra distance between the points. The + Canberra distance between two points ``u`` and ``v`` is + + .. math:: + + d(u,v) = \\sum_i \\frac{|u_i-v_i|} + {|u_i|+|v_i|}. + + 12. ``Y = cdist(XA, XB, 'braycurtis')`` + + Computes the Bray-Curtis distance between the points. The + Bray-Curtis distance between two points ``u`` and ``v`` is + + + .. math:: + + d(u,v) = \\frac{\\sum_i (|u_i-v_i|)} + {\\sum_i (|u_i+v_i|)} + + 13. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)`` + + Computes the Mahalanobis distance between the points. The + Mahalanobis distance between two points ``u`` and ``v`` is + :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI`` + variable) is the inverse covariance. If ``VI`` is not None, + ``VI`` will be used as the inverse covariance matrix. + + 14. ``Y = cdist(XA, XB, 'yule')`` + + Computes the Yule distance between the boolean + vectors. (see `yule` function documentation) + + 15. 
``Y = cdist(XA, XB, 'matching')`` + + Synonym for 'hamming'. + + 16. ``Y = cdist(XA, XB, 'dice')`` + + Computes the Dice distance between the boolean vectors. (see + `dice` function documentation) + + 17. ``Y = cdist(XA, XB, 'kulsinski')`` + + Computes the Kulsinski distance between the boolean + vectors. (see `kulsinski` function documentation) + + 18. ``Y = cdist(XA, XB, 'rogerstanimoto')`` + + Computes the Rogers-Tanimoto distance between the boolean + vectors. (see `rogerstanimoto` function documentation) + + 19. ``Y = cdist(XA, XB, 'russellrao')`` + + Computes the Russell-Rao distance between the boolean + vectors. (see `russellrao` function documentation) + + 20. ``Y = cdist(XA, XB, 'sokalmichener')`` + + Computes the Sokal-Michener distance between the boolean + vectors. (see `sokalmichener` function documentation) + + 21. ``Y = cdist(XA, XB, 'sokalsneath')`` + + Computes the Sokal-Sneath distance between the vectors. (see + `sokalsneath` function documentation) + + + 22. ``Y = cdist(XA, XB, 'wminkowski', p=2., w=w)`` + + Computes the weighted Minkowski distance between the + vectors. (see `wminkowski` function documentation) + + 23. ``Y = cdist(XA, XB, f)`` + + Computes the distance between all pairs of vectors in X + using the user supplied 2-arity function f. For example, + Euclidean distance between the vectors could be computed + as follows:: + + dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum())) + + Note that you should avoid passing a reference to one of + the distance functions defined in this library. For example,:: + + dm = cdist(XA, XB, sokalsneath) + + would calculate the pair-wise distances between the vectors in + X using the Python function `sokalsneath`. This would result in + sokalsneath being called :math:`{n \\choose 2}` times, which + is inefficient. Instead, the optimized C version is more + efficient, and we call it using the following syntax:: + + dm = cdist(XA, XB, 'sokalsneath') + + Examples + -------- + Find the Euclidean distances between four 2-D coordinates: + + >>> from mars.tensor.spatial import distance + >>> coords = [(35.0456, -85.2672), + ... (35.1174, -89.9711), + ... (35.9728, -83.9422), + ... (36.1667, -86.7833)] + >>> distance.cdist(coords, coords, 'euclidean').execute() + array([[ 0. , 4.7044, 1.6172, 1.8856], + [ 4.7044, 0. , 6.0893, 3.3561], + [ 1.6172, 6.0893, 0. , 2.8477], + [ 1.8856, 3.3561, 2.8477, 0. ]]) + + + Find the Manhattan distance from a 3-D point to the corners of the unit + cube: + + >>> import mars.tensor as mt + >>> a = mt.array([[0, 0, 0], + ... [0, 0, 1], + ... [0, 1, 0], + ... [0, 1, 1], + ... [1, 0, 0], + ... [1, 0, 1], + ... [1, 1, 0], + ... [1, 1, 1]]) + >>> b = mt.array([[ 0.1, 0.2, 0.4]]) + >>> distance.cdist(a, b, 'cityblock').execute() + array([[ 0.7], + [ 0.9], + [ 1.3], + [ 1.5], + [ 1.5], + [ 1.7], + [ 2.1], + [ 2.3]]) + + """ + XA = astensor(XA, order="C") + XB = astensor(XB, order="C") + + if XA.issparse() or XB.issparse(): + raise ValueError("Sparse tensors are not supported by this function.") + + s = XA.shape + sB = XB.shape + + if len(s) != 2: + raise ValueError("XA must be a 2-dimensional array.") + if len(sB) != 2: + raise ValueError("XB must be a 2-dimensional array.") + if s[1] != sB[1]: + raise ValueError( + "XA and XB must have the same number of columns " + "(i.e. 
feature dimension.)" + ) + + mA = s[0] + mB = sB[0] + out = kwargs.pop("out", None) + if out is not None: + if not hasattr(out, "shape"): + raise TypeError("return arrays must be a tensor") + if out.shape != (mA, mB): + raise ValueError("Output tensor has incorrect shape.") + if out.dtype != np.double: + raise ValueError("Output tensor must be double type.") + + if not isinstance(metric, str) and not callable(metric): + raise TypeError( + "3rd argument metric must be a string identifier or a function." + ) + + # scipy remove "wminkowski" since v1.8.0, use "minkowski" with `w=` + # keyword-argument for the given weight. + if metric == "wminkowski": + metric = "minkowski" + + p = kwargs.pop("p", None) + w = kwargs.pop("w", None) + if w is not None: + w = astensor(w) + v = kwargs.pop("V", None) + if v is not None: + v = astensor(v) + vi = kwargs.pop("VI", None) + if vi is not None: + vi = astensor(vi) + + if len(kwargs) > 0: + raise TypeError( + f"`cdist` got an unexpected keyword argument '{next(iter(kwargs))}'" + ) + + op = TensorCdist(metric=metric, p=p, w=w, v=v, vi=vi, dtype=np.dtype(float)) + shape = (XA.shape[0], XB.shape[0]) + ret = op(XA, XB, shape) + + if out is None: + return ret + else: + out.data = ret.data + return out diff --git a/python/xorbits/_mars/tensor/spatial/distance/pdist.py b/python/xorbits/_mars/tensor/spatial/distance/pdist.py new file mode 100644 index 000000000..ea630b907 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/pdist.py @@ -0,0 +1,740 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Tuple + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....config import options +from ....core import recursive_tile +from ....core.operand import OperandStage +from ....serialization.serializables import ( + AnyField, + FieldTypes, + Float16Field, + Int32Field, + KeyField, + TupleField, +) +from ....utils import ensure_own_data, has_unknown_shape, require_module +from ...array_utils import as_same_device, cp, device +from ...core import TensorOrder +from ...datasource.array import tensor as astensor +from ...operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy + + +class TensorPdist(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.PDIST + + _input = KeyField("input") + _metric = AnyField("metric") + _p = Float16Field("p", on_serialize=lambda x: float(x) if x is not None else x) + _w = KeyField("w") + _v = KeyField("V") + _vi = KeyField("VI") + _aggregate_size = Int32Field("aggregate_size") + + _a = KeyField("a") + _a_offset = Int32Field("a_offset") + _b = KeyField("b") + _b_offset = Int32Field("b_offset") + _out_sizes = TupleField("out_sizes", FieldTypes.int32) + _n = Int32Field("n") + + def __init__( + self, + metric=None, + p=None, + w=None, + v=None, + vi=None, + a=None, + a_offset=None, + b=None, + b_offset=None, + out_sizes=None, + n=None, + aggregate_size=None, + **kw, + ): + super().__init__( + _metric=metric, + _p=p, + _w=w, + _v=v, + _vi=vi, + _a=a, + _a_offset=a_offset, + _b=b, + _b_offset=b_offset, + _out_sizes=out_sizes, + _n=n, + _aggregate_size=aggregate_size, + **kw, + ) + + def _set_inputs(self, inputs: List) -> None: + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + + if self.stage == OperandStage.map: + self._a = next(inputs_iter) + if self._b is not None: + self._b = next(inputs_iter) + else: + self._input = next(inputs_iter) + + if self._w is not None: + self._w = next(inputs_iter) + if self._v is not None: + self._v = next(inputs_iter) + if self._vi is not None: + self._vi = next(inputs_iter) + + @property + def input(self): + return self._input + + @property + def metric(self): + return self._metric + + @property + def p(self): + return self._p + + @property + def w(self): + return self._w + + @property + def v(self): + return self._v + + @property + def vi(self): + return self._vi + + @property + def aggregate_size(self): + return self._aggregate_size + + @property + def a(self): + return self._a + + @property + def a_offset(self): + return self._a_offset + + @property + def b(self): + return self._b + + @property + def b_offset(self): + return self._b_offset + + @property + def out_sizes(self): + return self._out_sizes + + @property + def n(self): + return self._n + + def __call__(self, x, shape: Tuple): + inputs = [x] + for val in [self._w, self._v, self._vi]: + if val is not None: + inputs.append(val) + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def _tile_one_chunk(cls, op, in_tensor, w, v, vi): + out_tensor = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_inputs = [in_tensor.chunks[0]] + for val in [w, v, vi]: + if val is not None: + chunk_inputs.append(val.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=out_tensor.shape, + order=out_tensor.order, + index=(0,) * out_tensor.ndim, + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=tuple((s,) for s in out_tensor.shape), + chunks=[chunk], + ) + + @classmethod + def _tile_chunks(cls, op, in_tensor, w, v, vi): + out_tensor 
= op.outputs[0] + extra_inputs = [] + for val in [w, v, vi]: + if val is not None: + extra_inputs.append(val.chunks[0]) + + n = in_tensor.shape[0] + aggregate_size = op.aggregate_size + if aggregate_size is None: + aggregate_size = ( + np.ceil( + out_tensor.size + * out_tensor.dtype.itemsize + / options.chunk_store_limit + ) + .astype(int) + .item() + ) + out_sizes = [out_tensor.size // aggregate_size for _ in range(aggregate_size)] + for i in range(out_tensor.size % aggregate_size): + out_sizes[i] += 1 + + chunk_size = in_tensor.chunk_shape[0] + map_chunks = [] + axis_0_cum_size = np.cumsum(in_tensor.nsplits[0]).tolist() + for i in range(chunk_size): + for j in range(i, chunk_size): + kw = { + "stage": OperandStage.map, + "a": in_tensor.cix[i, 0], + "a_offset": axis_0_cum_size[i - 1] if i > 0 else 0, + "out_sizes": tuple(out_sizes), + "n": n, + "metric": op.metric, + "p": op.p, + "w": w.chunks[0] if w is not None else None, + "v": v.chunks[0] if v is not None else None, + "vi": vi.chunks[0] if vi is not None else None, + "dtype": out_tensor.dtype, + } + if i != j: + kw["b"] = in_tensor.cix[j, 0] + kw["b_offset"] = axis_0_cum_size[j - 1] if j > 0 else 0 + map_op = TensorPdist(**kw) + map_chunk_inputs = [kw["a"]] + if "b" in kw: + map_chunk_inputs.append(kw["b"]) + if kw["w"] is not None: + map_chunk_inputs.append(kw["w"]) + if kw["v"] is not None: + map_chunk_inputs.append(kw["v"]) + if kw["vi"] is not None: + map_chunk_inputs.append(kw["vi"]) + # calc chunk shape + if i == j: + a_axis_0_size = kw["a"].shape[0] + chunk_shape = (a_axis_0_size * (a_axis_0_size - 1) // 2,) + else: + chunk_shape = (kw["a"].shape[0] * kw["b"].shape[0],) + map_chunk = map_op.new_chunk( + map_chunk_inputs, + shape=chunk_shape, + order=out_tensor.order, + index=(i * chunk_size + j,), + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy(dtype=out_tensor.dtype).new_chunk( + map_chunks, shape=() + ) + + reduce_chunks = [] + for p in range(aggregate_size): + reduce_chunk_op = TensorPdist( + stage=OperandStage.reduce, + dtype=out_tensor.dtype, + n_reducers=aggregate_size, + ) + reduce_chunk = reduce_chunk_op.new_chunk( + [proxy_chunk], shape=(out_sizes[p],), order=out_tensor.order, index=(p,) + ) + reduce_chunks.append(reduce_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=(tuple(out_sizes),), + chunks=reduce_chunks, + ) + + @classmethod + def tile(cls, op): + # make sure every inputs have known shape + if has_unknown_shape(*op.inputs): + yield + + in_tensor = yield from recursive_tile(op.input.rechunk({1: op.input.shape[1]})) + # rechunk w, v, vi into one chunk if any of them has value + extra_inputs = [None] * 3 + for i, ei in enumerate([op.w, op.v, op.vi]): + if ei is None: + continue + new_ei = yield from recursive_tile(ei.rechunk(ei.shape)) + extra_inputs[i] = new_ei + w, v, vi = extra_inputs + + if len(in_tensor.chunks) == 1: + # only 1 chunk + return cls._tile_one_chunk(op, in_tensor, w, v, vi) + else: + return cls._tile_chunks(op, in_tensor, w, v, vi) + + @classmethod + def _execute_map(cls, ctx, op): + from scipy.spatial.distance import cdist, pdist + + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError("`pdist` does not support running on GPU yet") + + with device(device_id): + inputs_iter = iter(inputs) + a = next(inputs_iter) + if op.b is not None: + b = next(inputs_iter) + 
else: + b = None + kw = dict() + if op.p is not None: + kw["p"] = op.p + if op.w is not None: + kw["w"] = next(inputs_iter) + if op.v is not None: + kw["V"] = next(inputs_iter) + if op.vi is not None: + kw["VI"] = next(inputs_iter) + metric = op.metric + + if b is None: + # one input, pdist on same chunk + dists = pdist(ensure_own_data(a), metric=metric, **kw) + i_indices, j_indices = xp.triu_indices(a.shape[0], k=1) + i_indices += op.a_offset + j_indices += op.a_offset + else: + # two inputs, pdist on different chunks + dists = cdist( + ensure_own_data(a), ensure_own_data(b), metric=metric, **kw + ).ravel() + mgrid = xp.mgrid[ + op.a_offset : op.a_offset + a.shape[0], + op.b_offset : op.b_offset + b.shape[0], + ] + i_indices, j_indices = mgrid[0].ravel(), mgrid[1].ravel() + + out_row_sizes = xp.arange(op.n - 1, -1, -1) + out_row_cum_sizes = xp.empty((op.n + 1,), dtype=int) + out_row_cum_sizes[0] = 0 + xp.cumsum(out_row_sizes, out=out_row_cum_sizes[1:]) + indices = ( + out_row_cum_sizes[i_indices] + + j_indices + - (op.n - out_row_sizes[i_indices]) + ) + + # save as much memory as possible + del i_indices, j_indices, out_row_sizes, out_row_cum_sizes + + out_cum_size = xp.cumsum(op.out_sizes) + out = op.outputs[0] + for i in range(len(op.out_sizes)): + start_index = out_cum_size[i - 1] if i > 0 else 0 + end_index = out_cum_size[i] + to_filter = (indices >= start_index) & (indices < end_index) + downside_indices = indices[to_filter] - start_index + downside_dists = dists[to_filter] + ctx[out.key, (i,)] = (downside_indices, downside_dists) + + @classmethod + def _execute_single(cls, ctx, op): + from scipy.spatial.distance import pdist + + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError("`pdist` does not support running on GPU yet") + + with device(device_id): + inputs_iter = iter(inputs) + x = next(inputs_iter) + kw = dict() + if op.p is not None: + kw["p"] = op.p + if op.w is not None: + kw["w"] = next(inputs_iter) + if op.v is not None: + kw["V"] = next(inputs_iter) + if op.vi is not None: + kw["VI"] = next(inputs_iter) + + ctx[op.outputs[0].key] = pdist(ensure_own_data(x), metric=op.metric, **kw) + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorPdist"): + raw_inputs = list(op.iter_mapper_data(ctx)) + raw_indices = [inp[0] for inp in raw_inputs] + raw_dists = [inp[1] for inp in raw_inputs] + inputs, device_id, xp = as_same_device( + raw_indices + raw_dists, op.device, ret_extra=True + ) + raw_indices = inputs[: len(raw_indices)] + raw_dists = inputs[len(raw_indices) :] + output = op.outputs[0] + + with device(device_id): + indices = xp.concatenate(raw_indices) + dists = xp.concatenate(raw_dists) + out_dists = xp.empty(output.shape, dtype=float) + out_dists[indices] = dists + ctx[output.key] = out_dists + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + cls._execute_single(ctx, op) + + +@require_module("scipy.spatial.distance") +def pdist(X, metric="euclidean", **kwargs): + """ + Pairwise distances between observations in n-dimensional space. + + See Notes for common calling conventions. + + Parameters + ---------- + X : Tensor + An m by n tensor of m original observations in an + n-dimensional space. + metric : str or function, optional + The distance metric to use. 
The distance function can + be 'braycurtis', 'canberra', 'chebyshev', 'cityblock', + 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', + 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching', + 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', + 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'. + **kwargs : dict, optional + Extra arguments to `metric`: refer to each metric documentation for a + list of all possible arguments. + + Some possible arguments: + + p : scalar + The p-norm to apply for Minkowski, weighted and unweighted. + Default: 2. + + w : Tensor + The weight vector for metrics that support weights (e.g., Minkowski). + + V : Tensor + The variance vector for standardized Euclidean. + Default: var(X, axis=0, ddof=1) + + VI : Tensor + The inverse of the covariance matrix for Mahalanobis. + Default: inv(cov(X.T)).T + + out : Tensor. + The output tensor + If not None, condensed distance matrix Y is stored in this tensor. + Note: metric independent, it will become a regular keyword arg in a + future scipy version + + Returns + ------- + Y : Tensor + Returns a condensed distance matrix Y. For + each :math:`i` and :math:`j` (where :math:`i 0: + raise TypeError( + f"`pdist` got an unexpected keyword argument '{next(iter(kwargs))}'" + ) + + op = TensorPdist( + metric=metric, + p=p, + w=w, + v=v, + vi=vi, + aggregate_size=aggregate_size, + dtype=np.dtype(float), + ) + shape = (m * (m - 1) // 2,) + ret = op(X, shape) + + if out is None: + return ret + else: + out.data = ret.data + return out diff --git a/python/xorbits/_mars/tensor/spatial/distance/squareform.py b/python/xorbits/_mars/tensor/spatial/distance/squareform.py new file mode 100644 index 000000000..b55473fa7 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/squareform.py @@ -0,0 +1,452 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np + +from .... 
import opcodes as OperandDef +from ....config import options +from ....core import recursive_tile +from ....core.operand import OperandStage +from ....serialization.serializables import BoolField, FieldTypes, KeyField, TupleField +from ....utils import has_unknown_shape, require_module +from ...arithmetic import equal +from ...array_utils import as_same_device, cp, device +from ...core import TensorOrder +from ...datasource import array, ascontiguousarray, zeros +from ...operands import TensorMapReduceOperand, TensorOperandMixin, TensorShuffleProxy +from ...utils import decide_chunk_sizes + + +class TensorSquareform(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = OperandDef.SQUAREFORM + + _input = KeyField("input") + _checks = BoolField("checks") + + _checks_input = KeyField("checks_input") + _x_shape = TupleField("x_shape", FieldTypes.int32) + _reduce_sizes = TupleField("reduce_sizes", FieldTypes.tuple) + _start_positions = TupleField("start_positions", FieldTypes.int32) + + def __init__( + self, + checks=None, + checks_input=None, + x_shape=None, + reduce_sizes=None, + start_positions=None, + **kw + ): + super().__init__( + _checks=checks, + _checks_input=checks_input, + _x_shape=x_shape, + _reduce_sizes=reduce_sizes, + _start_positions=start_positions, + **kw + ) + + @property + def input(self): + return self._input + + @property + def checks(self): + return self._checks + + @property + def checks_input(self): + return self._checks_input + + @property + def x_shape(self): + return self._x_shape + + @property + def reduce_sizes(self): + return self._reduce_sizes + + @property + def start_positions(self): + return self._start_positions + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if self._checks_input is not None: + self._checks_input = self._inputs[-1] + + def __call__(self, X, force="no", chunk_size=None): + s = X.shape + + if force.lower() == "tomatrix": + if len(s) != 1: + raise ValueError( + "Forcing 'tomatrix' but input X is not a distance vector." + ) + elif force.lower() == "tovector": + if len(s) != 2: + raise ValueError( + "Forcing 'tovector' but input X is not a distance matrix." + ) + + # X = squareform(v) + if len(s) == 1: + if s[0] == 0: + return zeros((1, 1), dtype=X.dtype) + + # Grab the closest value to the square root of the number + # of elements times 2 to see if the number of elements + # is indeed a binomial coefficient. + d = int(np.ceil(np.sqrt(s[0] * 2))) + + # Check that v is of valid dimensions. + if d * (d - 1) != s[0] * 2: + raise ValueError( + "Incompatible vector size. It must be a binomial " + "coefficient n choose 2 for some integer n >= 2." + ) + + shape = (d, d) + elif len(s) == 2: + if s[0] != s[1]: + raise ValueError("The matrix argument must be square.") + + # One-side of the dimensions is set here. + d = s[0] + + if d <= 1: + return array([], dtype=X.dtype) + + shape = ((d * (d - 1)) // 2,) + else: + raise ValueError( + ( + "The first argument must be one or two dimensional " + "tensor. 
A %d-dimensional tensor is not " + "permitted" + ) + % len(s) + ) + + return self.new_tensor( + [X], shape=shape, order=TensorOrder.C_ORDER, raw_chunk_size=chunk_size + ) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + chunk_size = tensor.extra_params.raw_chunk_size or options.chunk_size + chunk_size = decide_chunk_sizes(tensor.shape, chunk_size, tensor.dtype.itemsize) + n_chunk = np.product([len(cs) for cs in chunk_size]) + + if len(op.input.chunks) == 1 and n_chunk == 1: + return cls._tile_one_chunk(op) + else: + return (yield from cls._tile_chunks(op, chunk_size)) + + @classmethod + def _tile_one_chunk(cls, op): + out = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk = chunk_op.new_chunk( + op.input.chunks, shape=out.shape, order=out.order, index=(0,) * out.ndim + ) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + chunks=[chunk], + nsplits=tuple((s,) for s in out.shape), + ) + + @classmethod + def _gen_checks_input(cls, op): + if op.input.ndim != 2 or not op.checks: + return + + x = op.input + ret = yield from recursive_tile(equal(x, x.T).all()) + return ret.chunks[0] + + @classmethod + def _tile_chunks(cls, op, chunk_size): + if has_unknown_shape(*op.inputs): + yield + out = op.outputs[0] + + checks_input = yield from cls._gen_checks_input(op) + + map_chunks = [] + cum_sizes = [[0] + np.cumsum(ns).tolist() for ns in op.input.nsplits] + to_vec = op.input.ndim == 2 + for in_chunk in op.input.chunks: + if to_vec and in_chunk.index[0] > in_chunk.index[1]: + # if apply squareform to 2-d tensor which is symmetric, + # we don't need to calculate for lower triangle chunks + continue + map_chunk_op = TensorSquareform( + stage=OperandStage.map, + checks_input=checks_input, + reduce_sizes=chunk_size, + x_shape=op.input.shape, + start_positions=tuple( + cum_sizes[ax][j] for ax, j in enumerate(in_chunk.index) + ), + dtype=out.dtype, + gpu=out.op.gpu, + ) + chunk_inputs = [in_chunk] + if checks_input is not None: + chunk_inputs.append(checks_input) + map_chunk = map_chunk_op.new_chunk( + chunk_inputs, shape=(2, np.nan), index=in_chunk.index, order=out.order + ) + map_chunks.append(map_chunk) + + proxy_chunk = TensorShuffleProxy(dtype=out.dtype).new_chunk( + map_chunks, shape=() + ) + + reduce_chunks = [] + out_shape_iter = itertools.product(*chunk_size) + out_indices = list(itertools.product(*(range(len(cs)) for cs in chunk_size))) + for out_idx, out_shape in zip(out_indices, out_shape_iter): + reduce_chunk_op = TensorSquareform( + stage=OperandStage.reduce, + dtype=out.dtype, + n_reducers=len(out_indices), + ) + reduce_chunk = reduce_chunk_op.new_chunk( + [proxy_chunk], shape=out_shape, index=out_idx, order=out.order + ) + reduce_chunks.append(reduce_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + shape=out.shape, + order=out.order, + nsplits=chunk_size, + chunks=reduce_chunks, + ) + + @classmethod + def _to_matrix(cls, ctx, xp, x, op): + assert x.ndim == 1 + out_chunk_size = op.reduce_sizes + out_shape = tuple(sum(ns) for ns in out_chunk_size) + d = out_shape[0] + + # calculate the index for the 1-d chunk + index = xp.arange(x.shape[0]) + index = xp.add(index, op.start_positions[0], out=index) + + # input length for each row + row_sizes = xp.arange(d, -1, -1) + row_sizes[0] = 0 + xp.cumsum(row_sizes[1:], out=row_sizes[1:]) + # calculate row for each element + rows = xp.searchsorted(row_sizes, index, side="right") + xp.subtract(rows, 1, out=rows) + # calculate col for each element + # offsets 
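# after the in-place cumsum above, row_sizes[i] holds the condensed-vector
# position where row i starts, so the searchsorted call maps each condensed
# index k back to its row i; the column is then recovered as
#   j = (k - row_sizes[i]) + (i + 1)
# because the first entry of row i corresponds to the pair (i, i + 1).
# cols_offsets[i] = i + 1 supplies that "+ (i + 1)" term element-wise.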
+ cols_offsets = xp.arange(1, d + 1) + cols = xp.empty(x.shape, dtype=np.int32) + xp.add( + xp.subtract(index, row_sizes[rows], out=cols), cols_offsets[rows], out=cols + ) + + cum_sizes = [[0] + np.cumsum(cs).tolist() for cs in out_chunk_size] + for idx in itertools.product(*(range(len(ns)) for ns in out_chunk_size)): + i, j = idx + row_range = cum_sizes[0][i], cum_sizes[0][i + 1] + col_range = cum_sizes[1][j], cum_sizes[1][j + 1] + # for upper + filtered = ( + (rows >= row_range[0]) + & (rows < row_range[1]) + & (cols >= col_range[0]) + & (cols < col_range[1]) + ) + inds_tup = rows[filtered] - row_range[0], cols[filtered] - col_range[0] + upper_inds = xp.ravel_multi_index( + inds_tup, (out_chunk_size[0][i], out_chunk_size[1][j]) + ) + upper_values = x[filtered] + # for lower + filtered = ( + (rows >= col_range[0]) + & (rows < col_range[1]) + & (cols >= row_range[0]) + & (cols < row_range[1]) + ) + inds_tup = cols[filtered] - row_range[0], rows[filtered] - col_range[0] + lower_inds = xp.ravel_multi_index( + inds_tup, (out_chunk_size[0][i], out_chunk_size[1][j]) + ) + lower_values = x[filtered] + + inds = xp.concatenate([upper_inds, lower_inds]) + values = xp.concatenate([upper_values, lower_values]) + + ctx[op.outputs[0].key, idx] = inds, values + + @classmethod + def _to_vector(cls, ctx, xp, x, op): + out_chunk_size = op.reduce_sizes + start_poses = op.start_positions + + i_indices, j_indices = xp.mgrid[ + start_poses[0] : start_poses[0] + x.shape[0], + start_poses[1] : start_poses[1] + x.shape[1], + ] + filtered = i_indices < j_indices + i_indices, j_indices, x = i_indices[filtered], j_indices[filtered], x[filtered] + + d = op.x_shape[0] + row_sizes = xp.arange(d - 1, -1, -1) + row_cum_sizes = xp.empty((d + 1,), dtype=int) + row_cum_sizes[0] = 0 + xp.cumsum(row_sizes, out=row_cum_sizes[1:]) + to_indices = row_cum_sizes[i_indices] + j_indices - (d - row_sizes[i_indices]) + + cum_chunk_size = [0] + np.cumsum(out_chunk_size).tolist() + for i in range(len(out_chunk_size[0])): + index_range = cum_chunk_size[i], cum_chunk_size[i + 1] + filtered = (to_indices >= index_range[0]) & (to_indices < index_range[1]) + out_indices = to_indices[filtered] - cum_chunk_size[i] + ctx[op.outputs[0].key, (i,)] = out_indices, x[filtered] + + @classmethod + def _execute_map(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if len(inputs) == 2 and not inputs[1]: + # check fail + raise ValueError("Distance matrix X must be symmetric.") + + if xp is cp: # pragma: no cover + raise NotImplementedError( + "`squareform` does not support running on GPU yet" + ) + + with device(device_id): + x = inputs[0] + if x.ndim == 1: + cls._to_matrix(ctx, xp, x, op) + else: + cls._to_vector(ctx, xp, x, op) + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorSquareform"): + raw_inputs = list(op.iter_mapper_data(ctx)) + raw_indices = [inp[0] for inp in raw_inputs] + raw_dists = [inp[1] for inp in raw_inputs] + inputs, device_id, xp = as_same_device( + raw_indices + raw_dists, op.device, ret_extra=True + ) + raw_indices = inputs[: len(raw_indices)] + raw_dists = inputs[len(raw_indices) :] + output = op.outputs[0] + + with device(device_id): + out_dists = xp.zeros(output.shape, dtype=output.dtype) + indices = xp.concatenate(raw_indices) + dists = xp.concatenate(raw_dists) + out_dists.flat[indices] = dists + ctx[output.key] = out_dists + + @classmethod + def execute(cls, ctx, op): + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) 
+ elif op.stage == OperandStage.reduce: + cls._execute_reduce(ctx, op) + else: + from scipy.spatial.distance import squareform + + (x,), device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + + if xp is cp: # pragma: no cover + raise NotImplementedError( + "`squareform` does not support running on GPU yet" + ) + + with device(device_id): + ctx[op.outputs[0].key] = squareform(x, checks=op.checks) + + +@require_module("scipy.spatial.distance") +def squareform(X, force="no", checks=True, chunk_size=None): + """ + Convert a vector-form distance vector to a square-form distance + matrix, and vice-versa. + + Parameters + ---------- + X : Tensor + Either a condensed or redundant distance matrix. + force : str, optional + As with MATLAB(TM), if force is equal to ``'tovector'`` or + ``'tomatrix'``, the input will be treated as a distance matrix or + distance vector respectively. + checks : bool, optional + If set to False, no checks will be made for matrix + symmetry nor zero diagonals. This is useful if it is known that + ``X - X.T1`` is small and ``diag(X)`` is close to zero. + These values are ignored any way so they do not disrupt the + squareform transformation. + + Returns + ------- + Y : Tensor + If a condensed distance matrix is passed, a redundant one is + returned, or if a redundant one is passed, a condensed distance + matrix is returned. + + Notes + ----- + 1. v = squareform(X) + + Given a square d-by-d symmetric distance matrix X, + ``v = squareform(X)`` returns a ``d * (d-1) / 2`` (or + :math:`{n \\choose 2}`) sized vector v. + + :math:`v[{n \\choose 2}-{n-i \\choose 2} + (j-i-1)]` is the distance + between points i and j. If X is non-square or asymmetric, an error + is returned. + + 2. X = squareform(v) + + Given a ``d*(d-1)/2`` sized v for some integer ``d >= 2`` encoding + distances as described, ``X = squareform(v)`` returns a d by d distance + matrix X. The ``X[i, j]`` and ``X[j, i]`` values are set to + :math:`v[{n \\choose 2}-{n-i \\choose 2} + (j-i-1)]` and all + diagonal elements are zero. + + """ + + X = ascontiguousarray(X) + + op = TensorSquareform(checks=checks, dtype=X.dtype, gpu=X.op.gpu) + return op(X, force=force, chunk_size=chunk_size) diff --git a/python/xorbits/_mars/tensor/spatial/distance/tests/__init__.py b/python/xorbits/_mars/tensor/spatial/distance/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance.py b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance.py new file mode 100644 index 000000000..a71e130a1 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance.py @@ -0,0 +1,160 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from .....core import tile +from ....datasource import tensor +from ... import distance + + +def test_pdist(): + raw = np.random.rand(100, 10) + + # test 1 chunk + a = tensor(raw, chunk_size=100) + dist = distance.pdist(a) + assert dist.shape == (100 * 99 // 2,) + + dist = tile(dist) + assert len(dist.chunks) == 1 + for c in dist.chunks: + assert c.shape == (dist.shape[0],) + + # test multiple chunks + a = tensor(raw, chunk_size=15) + dist = distance.pdist(a, aggregate_size=2) + assert dist.shape == (100 * 99 // 2,) + + dist = tile(dist) + assert len(dist.chunks) == 2 + for c in dist.chunks: + assert c.shape == (dist.shape[0] // 2,) + + # X cannot be sparse + raw = sps.csr_matrix(np.zeros((4, 3))) + a = tensor(raw) + with pytest.raises(ValueError): + distance.pdist(a) + + # X can only be 2-d + with pytest.raises(ValueError): + distance.pdist(np.random.rand(3, 3, 3)) + + # out type wrong + with pytest.raises(TypeError): + distance.pdist(np.random.rand(3, 3), out=2) + + # out shape wrong + with pytest.raises(ValueError): + distance.pdist(np.random.rand(3, 3), out=tensor(np.random.rand(2))) + + # out dtype wrong + with pytest.raises(ValueError): + distance.pdist( + np.random.rand(3, 3), out=tensor(np.random.randint(2, size=(3,))) + ) + + # test extra param + with pytest.raises(TypeError): + distance.pdist(np.random.rand(3, 3), unknown_kw="unknown_kw") + + +def test_cdist(): + raw_a = np.random.rand(100, 10) + raw_b = np.random.rand(90, 10) + + # test 1 chunk + a = tensor(raw_a, chunk_size=100) + b = tensor(raw_b, chunk_size=100) + dist = distance.cdist(a, b) + assert dist.shape == (100, 90) + + dist = tile(dist) + assert len(dist.chunks) == 1 + for c in dist.chunks: + assert c.shape == dist.shape + + # test multiple chunks + a = tensor(raw_a, chunk_size=15) + b = tensor(raw_b, chunk_size=16) + dist = distance.cdist(a, b) + assert dist.shape == (100, 90) + + ta, tb, dist = tile(a, b, dist) + assert len(dist.chunks) == (100 // 15 + 1) * (90 // 16 + 1) + assert dist.nsplits == (ta.nsplits[0], tb.nsplits[0]) + for c in dist.chunks: + assert c.shape == ( + ta.cix[c.index[0], 0].shape[0], + tb.cix[c.index[1], 0].shape[0], + ) + + # XA can only be 2-d + with pytest.raises(ValueError): + distance.cdist(np.random.rand(3, 3, 3), np.random.rand(3, 3)) + + # XB can only be 2-d + with pytest.raises(ValueError): + distance.cdist(np.random.rand(3, 3), np.random.rand(3, 3, 3)) + + # XA cannot be sparse + raw = sps.csr_matrix(np.zeros((4, 3))) + a = tensor(raw) + with pytest.raises(ValueError): + distance.cdist(a, np.random.rand(10, 3)) + + # XB cannot be sparse + raw = sps.csr_matrix(np.zeros((4, 3))) + b = tensor(raw) + with pytest.raises(ValueError): + distance.cdist(np.random.rand(10, 3), b) + + # out type wrong + with pytest.raises(TypeError): + distance.cdist(raw_a, raw_b, out=2) + + # out shape wrong + with pytest.raises(ValueError): + distance.cdist(raw_a, raw_b, 
out=tensor(np.random.rand(100, 91))) + + # out dtype wrong + with pytest.raises(ValueError): + distance.cdist(raw_a, raw_b, out=tensor(np.random.randint(2, size=(100, 90)))) + + # test extra param + with pytest.raises(TypeError): + distance.cdist(raw_a, raw_b, unknown_kw="unknown_kw") + + +def test_squareform(): + assert distance.squareform(np.array([], dtype=float)).shape == (1, 1) + assert distance.squareform(np.atleast_2d(np.random.rand())).shape == (0,) + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(3, 3), force="tomatrix") + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(3), force="tovector") + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(3, 3, 3)) + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(2, 4)) + + with pytest.raises(ValueError): + distance.squareform(np.random.rand(7)) diff --git a/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance_execution.py b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance_execution.py new file mode 100644 index 000000000..51ccfd0e1 --- /dev/null +++ b/python/xorbits/_mars/tensor/spatial/distance/tests/test_distance_execution.py @@ -0,0 +1,238 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .....core import tile +from ....datasource import tensor +from ... 
import distance + + +@pytest.mark.skipif(distance.pdist is None, reason="scipy not installed") +def test_pdist_execution(setup): + from scipy.spatial.distance import pdist as sp_pdist + + raw = np.random.rand(100, 10) + + # test 1 chunk + x = tensor(raw, chunk_size=100) + + dist = distance.pdist(x) + result = dist.execute().fetch() + expected = sp_pdist(raw) + np.testing.assert_array_equal(result, expected) + + dist = distance.pdist(x, metric="hamming") + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.pdist(x, metric=f) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric=f) + np.testing.assert_array_equal(result, expected) + + # test more than 1 chunk + x = tensor(raw, chunk_size=12) + + dist = distance.pdist(x) + tdist = tile(dist) + assert len(tdist.chunks) == 1 + result = dist.execute().fetch() + expected = sp_pdist(raw) + np.testing.assert_array_equal(result, expected) + + dist = distance.pdist(x, aggregate_size=3) + tdist = tile(dist) + assert len(tdist.chunks) == 3 + result = dist.execute().fetch() + expected = sp_pdist(raw) + np.testing.assert_array_equal(result, expected) + + dist = distance.pdist(x, metric="hamming", aggregate_size=2) + tdist = tile(dist) + assert len(tdist.chunks) == 2 + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.pdist(x, metric=f, aggregate_size=2) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric=f) + np.testing.assert_array_equal(result, expected) + + for x in [tensor(raw), tensor(raw, chunk_size=12)]: + # test w + weight = np.random.rand(10) + w = tensor(weight, chunk_size=7) + dist = distance.pdist(x, metric="wminkowski", p=3, w=w) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="minkowski", p=3, w=weight) + np.testing.assert_array_equal(result, expected) + + # test V + v = np.random.rand(10) + V = tensor(v, chunk_size=7) + dist = distance.pdist(x, metric="seuclidean", V=V) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="seuclidean", V=v) + np.testing.assert_array_equal(result, expected) + + # test VI + vi = np.random.rand(10, 10) + VI = tensor(vi, chunk_size=8) + dist = distance.pdist(x, metric="mahalanobis", VI=VI) + result = dist.execute().fetch() + expected = sp_pdist(raw, metric="mahalanobis", VI=vi) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.skipif(distance.cdist is None, reason="scipy not installed") +def test_cdist_execution(setup): + from scipy.spatial.distance import cdist as sp_cdist + + raw_a = np.random.rand(100, 10) + raw_b = np.random.rand(89, 10) + + # test 1 chunk + xa = tensor(raw_a, chunk_size=100) + xb = tensor(raw_b, chunk_size=100) + + dist = distance.cdist(xa, xb) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b) + np.testing.assert_array_equal(result, expected) + + dist = distance.cdist(xa, xb, metric="hamming") + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.cdist(xa, xb, metric=f) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric=f) + np.testing.assert_array_equal(result, expected) + + # test more than 1 chunk + xa = tensor(raw_a, 
chunk_size=12) + xb = tensor(raw_b, chunk_size=13) + + dist = distance.cdist(xa, xb) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b) + np.testing.assert_array_equal(result, expected) + + dist = distance.cdist(xa, xb, metric="hamming") + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="hamming") + np.testing.assert_array_equal(result, expected) + + f = lambda u, v: np.sqrt(((u - v) ** 2).sum()) + dist = distance.cdist(xa, xb, metric=f) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric=f) + np.testing.assert_array_equal(result, expected) + + for xa, xb in [ + (tensor(raw_a), tensor(raw_b)), + (tensor(raw_a, chunk_size=12), tensor(raw_b, chunk_size=13)), + ]: + # test w + weight = np.random.rand(10) + w = tensor(weight, chunk_size=7) + dist = distance.cdist(xa, xb, metric="wminkowski", p=3, w=w) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="minkowski", p=3, w=weight) + np.testing.assert_array_equal(result, expected) + + # test V + v = np.random.rand(10) + V = tensor(v, chunk_size=7) + dist = distance.cdist(xa, xb, metric="seuclidean", V=V) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="seuclidean", V=v) + np.testing.assert_array_equal(result, expected) + + # test VI + vi = np.random.rand(10, 10) + VI = tensor(vi, chunk_size=8) + dist = distance.cdist(xa, xb, metric="mahalanobis", VI=VI) + result = dist.execute().fetch() + expected = sp_cdist(raw_a, raw_b, metric="mahalanobis", VI=vi) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.skipif(distance.cdist is None, reason="scipy not installed") +def test_squareform_execution(setup): + from scipy.spatial.distance import pdist as sp_pdist + from scipy.spatial.distance import squareform as sp_squareform + + raw_a = np.random.rand(80, 10) + raw_pdsit = sp_pdist(raw_a) + raw_square = sp_squareform(raw_pdsit) + + # tomatrix, test 1 chunk + vec = tensor(raw_pdsit, chunk_size=raw_pdsit.shape[0]) + mat = distance.squareform(vec, chunk_size=100) + result = mat.execute().fetch() + np.testing.assert_array_equal(result, raw_square) + + # tomatrix, test more than 1 chunk + vec = tensor(raw_pdsit, chunk_size=33) + assert len(tile(vec).chunks) > 1 + mat = distance.squareform(vec, chunk_size=34) + result = mat.execute().fetch() + np.testing.assert_array_equal(result, raw_square) + + # tovec, test 1 chunk + mat = tensor(raw_square) + vec = distance.squareform(mat, chunk_size=raw_pdsit.shape[0]) + assert len(tile(mat).chunks) == 1 + assert len(tile(vec).chunks) == 1 + result = vec.execute().fetch() + np.testing.assert_array_equal(result, raw_pdsit) + + # tovec, test more than 1 chunk + mat = tensor(raw_square, chunk_size=31) + vec = distance.squareform(mat, chunk_size=40) + assert len(tile(vec).chunks) > 1 + result = vec.execute().fetch() + np.testing.assert_array_equal(result, raw_pdsit) + + # test checks + # generate non-symmetric matrix + non_sym_arr = np.random.RandomState(0).rand(10, 10) + + # 1 chunk + mat = tensor(non_sym_arr) + vec = distance.squareform(mat, checks=True, chunk_size=100) + with pytest.raises(ValueError): + _ = vec.execute().fetch() + # force checks=False + vec = distance.squareform(mat, checks=False, chunk_size=100) + _ = vec.execute().fetch() + + # more than 1 chunk + mat = tensor(non_sym_arr, chunk_size=6) + vec = distance.squareform(mat, checks=True, chunk_size=8) + assert len(tile(vec).chunks) > 1 + with pytest.raises(ValueError): + _ = vec.execute().fetch() + # force checks=False 
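# with checks=False no symmetry-check chunk (equal(mat, mat.T).all()) is
# attached to the graph, so execution succeeds even though non_sym_arr is
# not symmetric; this mirrors scipy.spatial.distance.squareform(..., checks=False)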
+ vec = distance.squareform(mat, checks=False, chunk_size=100) + _ = vec.execute().fetch() diff --git a/python/xorbits/_mars/tensor/special/__init__.py b/python/xorbits/_mars/tensor/special/__init__.py new file mode 100644 index 000000000..e1644e19b --- /dev/null +++ b/python/xorbits/_mars/tensor/special/__init__.py @@ -0,0 +1,167 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import scipy + + from .airy import TensorAiry, TensorAirye, TensorItairy, airy, airye, itairy + from .bessel import ( + TensorHankel1, + TensorHankel1e, + TensorHankel2, + TensorHankel2e, + TensorIV, + TensorIVE, + TensorJV, + TensorJVE, + TensorKN, + TensorKV, + TensorKVE, + TensorYN, + TensorYV, + TensorYVE, + hankel1, + hankel1e, + hankel2, + hankel2e, + iv, + ive, + jv, + jve, + kn, + kv, + kve, + yn, + yv, + yve, + ) + from .convenience import TensorXLogY, xlogy + from .ellip_func_integrals import ( + TensorEllipe, + TensorEllipeinc, + TensorEllipk, + TensorEllipkinc, + TensorEllipkm1, + TensorElliprc, + TensorElliprd, + TensorElliprf, + TensorElliprg, + TensorElliprj, + ellipe, + ellipeinc, + ellipk, + ellipkinc, + ellipkm1, + elliprc, + elliprd, + elliprf, + elliprg, + elliprj, + ) + from .ellip_harm import ( + TensorEllipHarm, + TensorEllipHarm2, + TensorEllipNormal, + ellip_harm, + ellip_harm_2, + ellip_normal, + ) + from .err_fresnel import ( + TensorDawsn, + TensorErf, + TensorErfc, + TensorErfcinv, + TensorErfcx, + TensorErfi, + TensorErfinv, + TensorFresnel, + TensorModFresnelM, + TensorModFresnelP, + TensorVoigtProfile, + TensorWofz, + dawsn, + erf, + erfc, + erfcinv, + erfcx, + erfi, + erfinv, + fresnel, + modfresnelm, + modfresnelp, + voigt_profile, + wofz, + ) + from .gamma_funcs import ( + TensorBeta, + TensorBetaInc, + TensorBetaIncInv, + TensorBetaLn, + TensorDiGamma, + TensorGamma, + TensorGammaInc, + TensorGammaIncc, + TensorGammaInccInv, + TensorGammaIncInv, + TensorGammaln, + TensorGammaSgn, + TensorLogGamma, + TensorMultiGammaLn, + TensorPoch, + TensorPolyGamma, + TensorPsi, + TensorRGamma, + beta, + betainc, + betaincinv, + betaln, + digamma, + gamma, + gammainc, + gammaincc, + gammainccinv, + gammaincinv, + gammaln, + gammasgn, + loggamma, + multigammaln, + poch, + polygamma, + psi, + rgamma, + ) + from .hypergeometric_funcs import ( + TensorHYP0F1, + TensorHYP1F1, + TensorHYP2F1, + TensorHYPERU, + hyp0f1, + hyp1f1, + hyp2f1, + hyperu, + ) + from .info_theory import ( + TensorEntr, + TensorKlDiv, + TensorRelEntr, + entr, + kl_div, + rel_entr, + ) +except ImportError: # pragma: no cover + pass + +_names_to_del = [_name for _name, _val in globals().items() if _val is None] +[globals().pop(_name) for _name in _names_to_del] +del _names_to_del diff --git a/python/xorbits/_mars/tensor/special/airy.py b/python/xorbits/_mars/tensor/special/airy.py new file mode 100644 index 000000000..6b9d916b7 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/airy.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..utils import implement_scipy, infer_dtype +from .core import TensorTupleOp, _register_special_op + + +@_register_special_op +class TensorAiry(TensorTupleOp): + _func_name = "airy" + _n_outputs = 4 + + +@implement_scipy(spspecial.airy) +@infer_dtype(spspecial.airy, multi_outputs=True) +def airy(z, out=None, **kwargs): + op = TensorAiry(**kwargs) + return op(z, out=out) + + +@_register_special_op +class TensorAirye(TensorTupleOp): + _func_name = "airye" + _n_outputs = 4 + + +@implement_scipy(spspecial.airye) +@infer_dtype(spspecial.airye, multi_outputs=True) +def airye(z, out=None, **kwargs): + op = TensorAirye(**kwargs) + return op(z, out=out) + + +@_register_special_op +class TensorItairy(TensorTupleOp): + _func_name = "itairy" + _n_outputs = 4 + + +@implement_scipy(spspecial.itairy) +@infer_dtype(spspecial.itairy, multi_outputs=True) +def itairy(x, out=None, **kwargs): + op = TensorItairy(**kwargs) + return op(x, out=out) diff --git a/python/xorbits/_mars/tensor/special/bessel.py b/python/xorbits/_mars/tensor/special/bessel.py new file mode 100644 index 000000000..edf64890e --- /dev/null +++ b/python/xorbits/_mars/tensor/special/bessel.py @@ -0,0 +1,201 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
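The Bessel wrappers in this module, like the Airy ones above, do no math themselves: each operand only stores a ``_func_name`` that the special-op mixin resolves to the same-named ``scipy.special`` routine at execution time (or to the ``cupyx.scipy.special`` / sparse variant on other devices). A minimal sketch of the eager SciPy calls these operands reproduce lazily, assuming only that NumPy and SciPy are installed::

    import numpy as np
    import scipy.special as spspecial

    v = np.array([0.0, 1.0, 2.0])
    z = np.array([0.5, 1.5, 2.5])

    # element-wise Bessel function of the first kind; TensorJV evaluates
    # exactly this, chunk by chunk, on tensor inputs
    jv_vals = spspecial.jv(v, z)

    # airy returns a 4-tuple (Ai, Ai', Bi, Bi'), which is why TensorAiry
    # declares _n_outputs = 4 and yields four output tensors
    ai, aip, bi, bip = spspecial.airy(z)
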
+ +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, _register_special_op + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorJV(TensorSpecialBinOp): + _func_name = "jv" + + +@implement_scipy(spspecial.jv) +@infer_dtype(spspecial.jv) +def jv(v, z, **kwargs): + op = TensorJV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorJVE(TensorSpecialBinOp): + _func_name = "jve" + + +@implement_scipy(spspecial.jve) +@infer_dtype(spspecial.jve) +def jve(v, z, **kwargs): + op = TensorJVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorYN(TensorSpecialBinOp): + _func_name = "yn" + + +@implement_scipy(spspecial.yn) +@infer_dtype(spspecial.yn) +def yn(n, x, **kwargs): + op = TensorYN(**kwargs) + return op(n, x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorYV(TensorSpecialBinOp): + _func_name = "yv" + + +@implement_scipy(spspecial.yv) +@infer_dtype(spspecial.yv) +def yv(v, z, **kwargs): + op = TensorYV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorYVE(TensorSpecialBinOp): + _func_name = "yve" + + +@implement_scipy(spspecial.yve) +@infer_dtype(spspecial.yve) +def yve(v, z, **kwargs): + op = TensorYVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorKN(TensorSpecialBinOp): + _func_name = "kn" + + +@implement_scipy(spspecial.kn) +@infer_dtype(spspecial.kn) +def kn(n, x, **kwargs): + op = TensorKN(**kwargs) + return op(n, x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorKV(TensorSpecialBinOp): + _func_name = "kv" + + +@implement_scipy(spspecial.kv) +@infer_dtype(spspecial.kv) +def kv(v, z, **kwargs): + op = TensorKV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorKVE(TensorSpecialBinOp): + _func_name = "kve" + + +@implement_scipy(spspecial.kve) +@infer_dtype(spspecial.kve) +def kve(v, z, **kwargs): + op = TensorKVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorIV(TensorSpecialBinOp): + _func_name = "iv" + + +@implement_scipy(spspecial.iv) +@infer_dtype(spspecial.iv) +def iv(v, z, **kwargs): + op = TensorIV(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorIVE(TensorSpecialBinOp): + _func_name = "ive" + + +@implement_scipy(spspecial.ive) +@infer_dtype(spspecial.ive) +def ive(v, z, **kwargs): + op = TensorIVE(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHankel1(TensorSpecialBinOp): + _func_name = "hankel1" + + +@implement_scipy(spspecial.hankel1) +@infer_dtype(spspecial.hankel1) +def hankel1(v, z, **kwargs): + op = TensorHankel1(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHankel1e(TensorSpecialBinOp): + _func_name = "hankel1e" + + +@implement_scipy(spspecial.hankel1e) +@infer_dtype(spspecial.hankel1e) +def hankel1e(v, z, **kwargs): + op = TensorHankel1e(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class 
TensorHankel2(TensorSpecialBinOp): + _func_name = "hankel2" + + +@implement_scipy(spspecial.hankel2) +@infer_dtype(spspecial.hankel2) +def hankel2(v, z, **kwargs): + op = TensorHankel2(**kwargs) + return op(v, z) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHankel2e(TensorSpecialBinOp): + _func_name = "hankel2e" + + +@implement_scipy(spspecial.hankel2e) +@infer_dtype(spspecial.hankel2e) +def hankel2e(v, z, **kwargs): + op = TensorHankel2e(**kwargs) + return op(v, z) diff --git a/python/xorbits/_mars/tensor/special/convenience.py b/python/xorbits/_mars/tensor/special/convenience.py new file mode 100644 index 000000000..a5ad51d01 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/convenience.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, _register_special_op + + +@_register_special_op +class TensorXLogY(TensorSpecialBinOp): + _func_name = "xlogy" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@implement_scipy(spspecial.xlogy) +@infer_dtype(spspecial.xlogy) +def xlogy(x1, x2, out=None, where=None, **kwargs): + op = TensorXLogY(**kwargs) + return op(x1, x2, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/special/core.py b/python/xorbits/_mars/tensor/special/core.py new file mode 100644 index 000000000..3f19a0f52 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/core.py @@ -0,0 +1,172 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ... 
import opcodes +from ...core import ExecutableTuple +from ..arithmetic.core import TensorBinOp, TensorMultiOp, TensorUnaryOp +from ..array_utils import ( + as_same_device, + convert_order, + cp, + device, + issparse, + np, + sparse, +) +from ..datasource import tensor as astensor + +_func_name_to_special_cls = {} + + +def _register_special_op(cls): + if cls._func_name is not None: + _func_name_to_special_cls[cls._func_name] = cls + return cls + + +class TensorSpecialOperandMixin: + _op_code_ = opcodes.SPECIAL + _func_name = None + + def __new__(cls, *args, **kwargs): + if cls._func_name is not None: + return object.__new__(_func_name_to_special_cls[cls._func_name]) + return super().__new__(cls, *args, **kwargs) + + @classmethod + def _get_func(cls, xp): + if xp is np: + from scipy import special + + return getattr(special, cls._func_name) + elif cp is not None and xp is cp: + from cupyx.scipy import special + + return getattr(special, cls._func_name) + else: + assert xp is sparse + return getattr(sparse, cls._func_name) + + +class TensorSpecialUnaryOp(TensorSpecialOperandMixin, TensorUnaryOp): + pass + + +class TensorSpecialBinOp(TensorSpecialOperandMixin, TensorBinOp): + pass + + +class TensorSpecialMultiOp(TensorSpecialOperandMixin, TensorMultiOp): + @classmethod + def _execute_gpu(cls, op, xp, *args, **kw): + if kw.get("out") is not None: + kw["out"] = xp.asarray(kw["out"]) + r = cls._get_func(xp)(*args, **kw) + return convert_order(r, op.outputs[0].order.value) + + @classmethod + def _execute_cpu(cls, op, xp, *args, **kw): + kw["order"] = op.order + if kw.get("out") is not None: + kw["out"] = np.asarray(kw["out"]) + try: + return cls._get_func(xp)(*args, **kw) + except TypeError: + kw.pop("order") + r = cls._get_func(xp)(*args, **kw) + if issparse(r): + return r + return convert_order(r, op.outputs[0].order.value) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + with device(device_id): + kw = {"casting": op.casting} if op.out is not None else {} + + inputs_iter = iter(inputs) + args = [a if np.isscalar(a) else next(inputs_iter) for a in op.args] + if op.out is not None: + kw["out"] = next(inputs_iter).copy() + + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, *args, **kw) + else: + ret = cls._execute_cpu(op, xp, *args, **kw) + + if ret.dtype != op.dtype: + ret = ret.astype(op.dtype) + ctx[op.outputs[0].key] = ret + + +class TensorTupleOp(TensorSpecialUnaryOp): + @property + def output_limit(self): + return self._n_outputs + + def __call__(self, x, out=None): + x = astensor(x) + + if out is not None: + if not isinstance(out, ExecutableTuple): + raise TypeError( + f"out should be ExecutableTuple object, got {type(out)} instead" + ) + if len(out) != self._n_outputs: + raise TypeError( + f"out should be an ExecutableTuple object with {self._n_outputs} elements, got {len(out)} instead" + ) + + func = getattr(spspecial, self._func_name) + res = func(np.ones(x.shape, dtype=x.dtype)) + res_tensors = self.new_tensors( + [x], + kws=[ + { + "side": f"{self._func_name}[{i}]", + "dtype": output.dtype, + "shape": output.shape, + } + for i, output in enumerate(res) + ], + ) + + if out is None: + return ExecutableTuple(res_tensors) + + for res_tensor, out_tensor in zip(res_tensors, out): + out_tensor.data = res_tensor.data + return out + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], 
device=op.device, ret_extra=True + ) + + with device(device_id): + with np.errstate(**op.err): + if op.is_gpu(): + ret = cls._execute_gpu(op, xp, inputs[0]) + else: + ret = cls._execute_cpu(op, xp, inputs[0]) + + for output, ret_element in zip(op.outputs, ret): + ctx[output.key] = ret_element diff --git a/python/xorbits/_mars/tensor/special/ellip_func_integrals.py b/python/xorbits/_mars/tensor/special/ellip_func_integrals.py new file mode 100644 index 000000000..761a20cf8 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/ellip_func_integrals.py @@ -0,0 +1,157 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import ( + TensorSpecialBinOp, + TensorSpecialMultiOp, + TensorSpecialUnaryOp, + _register_special_op, +) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEllipk(TensorSpecialUnaryOp): + _func_name = "ellipk" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEllipkm1(TensorSpecialUnaryOp): + _func_name = "ellipkm1" + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorEllipkinc(TensorSpecialBinOp): + _func_name = "ellipkinc" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEllipe(TensorSpecialUnaryOp): + _func_name = "ellipe" + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorEllipeinc(TensorSpecialBinOp): + _func_name = "ellipeinc" + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorElliprc(TensorSpecialBinOp): + _func_name = "elliprc" + + +@_register_special_op +class TensorElliprd(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "elliprd" + + +@_register_special_op +class TensorElliprf(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "elliprf" + + +@_register_special_op +class TensorElliprg(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "elliprg" + + +@_register_special_op +class TensorElliprj(TensorSpecialMultiOp): + _ARG_COUNT = 4 + _func_name = "elliprj" + + +@implement_scipy(spspecial.ellipk) +@infer_dtype(spspecial.ellipk) +def ellipk(x, **kwargs): + op = TensorEllipk(**kwargs) + return op(x) + + +@implement_scipy(spspecial.ellipkm1) +@infer_dtype(spspecial.ellipkm1) +def ellipkm1(x, **kwargs): + op = TensorEllipkm1(**kwargs) + return op(x) + + +@implement_scipy(spspecial.ellipkinc) +@infer_dtype(spspecial.ellipkinc) +def ellipkinc(phi, m, **kwargs): + op = TensorEllipkinc(**kwargs) + return op(phi, m) + + +@implement_scipy(spspecial.ellipe) +@infer_dtype(spspecial.ellipe) +def ellipe(x, **kwargs): + op = TensorEllipe(**kwargs) + return op(x) + + +@implement_scipy(spspecial.ellipeinc) +@infer_dtype(spspecial.ellipeinc) +def ellipeinc(phi, m, **kwargs): + op = TensorEllipeinc(**kwargs) + return op(phi, m) + + +try: + + @implement_scipy(spspecial.elliprc) + 
@infer_dtype(spspecial.elliprc) + def elliprc(x, y, **kwargs): + op = TensorElliprc(**kwargs) + return op(x, y) + + @implement_scipy(spspecial.elliprd) + @infer_dtype(spspecial.elliprd) + def elliprd(x, y, z, **kwargs): + op = TensorElliprd(**kwargs) + return op(x, y, z) + + @implement_scipy(spspecial.elliprf) + @infer_dtype(spspecial.elliprf) + def elliprf(x, y, z, **kwargs): + op = TensorElliprf(**kwargs) + return op(x, y, z) + + @implement_scipy(spspecial.elliprg) + @infer_dtype(spspecial.elliprg) + def elliprg(x, y, z, **kwargs): + op = TensorElliprg(**kwargs) + return op(x, y, z) + + @implement_scipy(spspecial.elliprj) + @infer_dtype(spspecial.elliprj) + def elliprj(x, y, z, p, **kwargs): + op = TensorElliprj(**kwargs) + return op(x, y, z, p) + +except AttributeError: + # These functions are not implemented before scipy v1.8 so + # spsecial.func may cause AttributeError + elliprc = elliprd = elliprf = elliprg = elliprj = None diff --git a/python/xorbits/_mars/tensor/special/ellip_harm.py b/python/xorbits/_mars/tensor/special/ellip_harm.py new file mode 100644 index 000000000..e53e2bbf6 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/ellip_harm.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialMultiOp, _register_special_op + + +@_register_special_op +class TensorEllipHarm(TensorSpecialMultiOp): + _ARG_COUNT = 5 + _func_name = "ellip_harm" + + +@implement_scipy(spspecial.ellip_harm) +@infer_dtype(spspecial.ellip_harm) +def ellip_harm(h2, k2, n, p, s, signm=1, signn=1, **kwargs): + op = TensorEllipHarm(**kwargs) + return op(h2, k2, n, p, s, signm, signn) + + +@_register_special_op +class TensorEllipHarm2(TensorSpecialMultiOp): + _ARG_COUNT = 5 + _func_name = "ellip_harm_2" + + +@implement_scipy(spspecial.ellip_harm_2) +@infer_dtype(spspecial.ellip_harm_2) +def ellip_harm_2(h2, k2, n, p, s, **kwargs): + op = TensorEllipHarm2(**kwargs) + return op(h2, k2, n, p, s) + + +@_register_special_op +class TensorEllipNormal(TensorSpecialMultiOp): + _ARG_COUNT = 4 + _func_name = "ellip_normal" + + +@implement_scipy(spspecial.ellip_normal) +@infer_dtype(spspecial.ellip_normal) +def ellip_normal(h2, k2, n, p, **kwargs): + op = TensorEllipNormal(**kwargs) + return op(h2, k2, n, p) diff --git a/python/xorbits/_mars/tensor/special/err_fresnel.py b/python/xorbits/_mars/tensor/special/err_fresnel.py new file mode 100644 index 000000000..53ca312eb --- /dev/null +++ b/python/xorbits/_mars/tensor/special/err_fresnel.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import ( + TensorSpecialMultiOp, + TensorSpecialUnaryOp, + TensorTupleOp, + _register_special_op, +) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErf(TensorSpecialUnaryOp): + _func_name = "erf" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfc(TensorSpecialUnaryOp): + _func_name = "erfc" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfcx(TensorSpecialUnaryOp): + _func_name = "erfcx" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfi(TensorSpecialUnaryOp): + _func_name = "erfi" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfinv(TensorSpecialUnaryOp): + _func_name = "erfinv" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorErfcinv(TensorSpecialUnaryOp): + _func_name = "erfcinv" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorWofz(TensorSpecialUnaryOp): + _func_name = "wofz" + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorDawsn(TensorSpecialUnaryOp): + _func_name = "dawsn" + + +@_register_special_op +class TensorFresnel(TensorTupleOp): + _func_name = "fresnel" + _n_outputs = 2 + + +@_register_special_op +class TensorModFresnelP(TensorTupleOp): + _func_name = "modfresnelp" + _n_outputs = 2 + + +@_register_special_op +class TensorModFresnelM(TensorTupleOp): + _func_name = "modfresnelm" + _n_outputs = 2 + + +@_register_special_op +class TensorVoigtProfile(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "voigt_profile" + + +@implement_scipy(spspecial.erf) +@infer_dtype(spspecial.erf) +def erf(x, out=None, where=None, **kwargs): + """ + Returns the error function of complex argument. + + It is defined as ``2/sqrt(pi)*integral(exp(-t**2), t=0..z)``. + + Parameters + ---------- + x : Tensor + Input tensor. + + Returns + ------- + res : Tensor + The values of the error function at the given points `x`. + + See Also + -------- + erfc, erfinv, erfcinv, wofz, erfcx, erfi + + Notes + ----- + The cumulative of the unit normal distribution is given by + ``Phi(z) = 1/2[1 + erf(z/sqrt(2))]``. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Error_function + .. [2] Milton Abramowitz and Irene A. Stegun, eds. + Handbook of Mathematical Functions with Formulas, + Graphs, and Mathematical Tables. New York: Dover, + 1972. http://www.math.sfu.ca/~cbm/aands/page_297.htm + .. [3] Steven G. Johnson, Faddeeva W function implementation. 
+ http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import mars.tensor as mt + >>> from mars.tensor import special + >>> import matplotlib.pyplot as plt + >>> x = mt.linspace(-3, 3) + >>> plt.plot(x, special.erf(x)) + >>> plt.xlabel('$x$') + >>> plt.ylabel('$erf(x)$') + >>> plt.show() + """ + op = TensorErf(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfc) +@infer_dtype(spspecial.erfc) +def erfc(x, out=None, where=None, **kwargs): + op = TensorErfc(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfcx) +@infer_dtype(spspecial.erfcx) +def erfcx(x, out=None, where=None, **kwargs): + op = TensorErfcx(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfi) +@infer_dtype(spspecial.erfi) +def erfi(x, out=None, where=None, **kwargs): + op = TensorErfi(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfinv) +@infer_dtype(spspecial.erfinv) +def erfinv(x, out=None, where=None, **kwargs): + op = TensorErfinv(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.erfcinv) +@infer_dtype(spspecial.erfcinv) +def erfcinv(x, out=None, where=None, **kwargs): + op = TensorErfcinv(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.wofz) +@infer_dtype(spspecial.wofz) +def wofz(x, out=None, where=None, **kwargs): + op = TensorWofz(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.dawsn) +@infer_dtype(spspecial.dawsn) +def dawsn(x, out=None, where=None, **kwargs): + op = TensorDawsn(**kwargs) + return op(x, out=out, where=where) + + +@implement_scipy(spspecial.fresnel) +@infer_dtype(spspecial.fresnel, multi_outputs=True) +def fresnel(x, out=None, **kwargs): + op = TensorFresnel(**kwargs) + return op(x, out=out) + + +@implement_scipy(spspecial.modfresnelp) +@infer_dtype(spspecial.modfresnelp, multi_outputs=True) +def modfresnelp(x, out=None, **kwargs): + op = TensorModFresnelP(**kwargs) + return op(x, out=out) + + +@implement_scipy(spspecial.modfresnelm) +@infer_dtype(spspecial.modfresnelm, multi_outputs=True) +def modfresnelm(x, out=None, **kwargs): + op = TensorModFresnelM(**kwargs) + return op(x, out=out) + + +@implement_scipy(spspecial.voigt_profile) +@infer_dtype(spspecial.voigt_profile) +def voigt_profile(x, sigma, gamma, **kwargs): + op = TensorVoigtProfile(**kwargs) + return op(x, sigma, gamma) diff --git a/python/xorbits/_mars/tensor/special/gamma_funcs.py b/python/xorbits/_mars/tensor/special/gamma_funcs.py new file mode 100644 index 000000000..2ebe19efa --- /dev/null +++ b/python/xorbits/_mars/tensor/special/gamma_funcs.py @@ -0,0 +1,305 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
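+# Illustrative overview (added commentary; names marked "hypothetical" are not
+# part of this module): every scipy.special routine wrapped below follows the
+# same pattern -- an operand class registered via ``_register_special_op`` plus
+# a thin user-facing function whose ``implement_scipy``/``infer_dtype``
+# decorators reuse the scipy routine for documentation and output-dtype
+# inference. Roughly:
+#
+#     @_register_special_op
+#     @arithmetic_operand(sparse_mode="unary")
+#     class TensorMyFunc(TensorSpecialUnaryOp):
+#         _func_name = "myfunc"              # hypothetical scipy.special name
+#
+#     @implement_scipy(spspecial.myfunc)     # hypothetical
+#     @infer_dtype(spspecial.myfunc)
+#     def myfunc(x, **kwargs):
+#         return TensorMyFunc(**kwargs)(x)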
+ +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import ( + TensorSpecialBinOp, + TensorSpecialMultiOp, + TensorSpecialUnaryOp, + _register_special_op, +) + + +class NoOrderSpecialMixin: + @classmethod + def _get_func(cls, xp): + func = super()._get_func(xp) + + def _wrapped(*args, **kw): + kw.pop("order", None) + return func(*args, **kw) + + return _wrapped + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorGamma(TensorSpecialUnaryOp): + _func_name = "gamma" + + +@implement_scipy(spspecial.gamma) +@infer_dtype(spspecial.gamma) +def gamma(x, **kwargs): + op = TensorGamma(**kwargs) + return op(x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorGammaln(TensorSpecialUnaryOp): + _func_name = "gammaln" + + +@implement_scipy(spspecial.gammaln) +@infer_dtype(spspecial.gammaln) +def gammaln(x, out=None, where=None, **kwargs): + """ + Logarithm of the absolute value of the Gamma function. + + Parameters + ---------- + x : array-like + Values on the real line at which to compute ``gammaln`` + out : Tensor, None, or tuple of Tensor and None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated tensor is returned. A tuple (possible only as a + keyword argument) must have length equal to the number of outputs. + where : array_like, optional + Values of True indicate to calculate the ufunc at that position, values + of False indicate to leave the value in the output alone. + **kwargs + + Returns + ------- + gammaln : Tensor + Values of ``gammaln`` at x. + + See Also + -------- + gammasgn : sign of the gamma function + loggamma : principal branch of the logarithm of the gamma function + + Notes + ----- + When used in conjunction with `gammasgn`, this function is useful + for working in logspace on the real axis without having to deal with + complex numbers, via the relation ``exp(gammaln(x)) = gammasgn(x)*gamma(x)``. + + For complex-valued log-gamma, use `loggamma` instead of `gammaln`. 
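+
+    Examples
+    --------
+    A small illustrative example (``gammaln(1) = 0`` and
+    ``gammaln(0.5) = log(sqrt(pi))``):
+
+    >>> import mars.tensor as mt
+    >>> from mars.tensor import special
+    >>> special.gammaln(mt.tensor([1.0, 0.5, 10.0])).execute()  # approx. [0., 0.5724, 12.8018]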
+ """ + op = TensorGammaln(**kwargs) + return op(x, out=out, where=where) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorLogGamma(TensorSpecialUnaryOp): + _func_name = "loggamma" + + +@implement_scipy(spspecial.loggamma) +@infer_dtype(spspecial.loggamma) +def loggamma(x, **kwargs): + op = TensorLogGamma(**kwargs) + return op(x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorGammaSgn(TensorSpecialUnaryOp): + _func_name = "gammasgn" + + +@implement_scipy(spspecial.gammasgn) +@infer_dtype(spspecial.gammasgn) +def gammasgn(x, **kwargs): + op = TensorGammaSgn(**kwargs) + return op(x) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaInc(TensorSpecialBinOp): + _func_name = "gammainc" + + +@implement_scipy(spspecial.gammainc) +@infer_dtype(spspecial.gammainc) +def gammainc(a, b, **kwargs): + op = TensorGammaInc(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaIncInv(TensorSpecialBinOp): + _func_name = "gammaincinv" + + +@implement_scipy(spspecial.gammaincinv) +@infer_dtype(spspecial.gammaincinv) +def gammaincinv(a, b, **kwargs): + op = TensorGammaIncInv(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaIncc(TensorSpecialBinOp): + _func_name = "gammaincc" + + +@implement_scipy(spspecial.gammainc) +@infer_dtype(spspecial.gammainc) +def gammaincc(a, b, **kwargs): + op = TensorGammaIncc(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorGammaInccInv(TensorSpecialBinOp): + _func_name = "gammainccinv" + + +@implement_scipy(spspecial.gammainccinv) +@infer_dtype(spspecial.gammainccinv) +def gammainccinv(a, b, **kwargs): + op = TensorGammaInccInv(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorBeta(TensorSpecialBinOp): + _func_name = "beta" + + +@implement_scipy(spspecial.beta) +@infer_dtype(spspecial.beta) +def beta(a, b, out=None, **kwargs): + op = TensorBeta(**kwargs) + return op(a, b, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorBetaLn(TensorSpecialBinOp): + _func_name = "betaln" + + +@implement_scipy(spspecial.betaln) +@infer_dtype(spspecial.betaln) +def betaln(a, b, out=None, **kwargs): + op = TensorBetaLn(**kwargs) + return op(a, b, out=out) + + +@_register_special_op +class TensorBetaInc(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "betainc" + + +@implement_scipy(spspecial.betainc) +@infer_dtype(spspecial.betainc) +def betainc(a, b, x, out=None, **kwargs): + op = TensorBetaInc(**kwargs) + return op(a, b, x, out=out) + + +@_register_special_op +class TensorBetaIncInv(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "betaincinv" + + +@implement_scipy(spspecial.betaincinv) +@infer_dtype(spspecial.betaincinv) +def betaincinv(a, b, y, out=None, **kwargs): + op = TensorBetaIncInv(**kwargs) + return op(a, b, y, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorPsi(TensorSpecialUnaryOp): + _func_name = "psi" + + +@implement_scipy(spspecial.psi) +@infer_dtype(spspecial.psi) +def psi(x, out=None, **kwargs): + op = TensorPsi(**kwargs) + return op(x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorRGamma(TensorSpecialUnaryOp): + _func_name = "rgamma" + + 
+@implement_scipy(spspecial.rgamma) +@infer_dtype(spspecial.rgamma) +def rgamma(x, out=None, **kwargs): + op = TensorRGamma(**kwargs) + return op(x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorPolyGamma(NoOrderSpecialMixin, TensorSpecialBinOp): + _func_name = "polygamma" + + +@implement_scipy(spspecial.polygamma) +@infer_dtype(spspecial.polygamma) +def polygamma(a, b, **kwargs): + op = TensorPolyGamma(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorMultiGammaLn(NoOrderSpecialMixin, TensorSpecialBinOp): + _func_name = "multigammaln" + + +@implement_scipy(spspecial.multigammaln) +@infer_dtype(spspecial.multigammaln) +def multigammaln(a, b, **kwargs): + op = TensorMultiGammaLn(**kwargs) + return op(a, b) + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorDiGamma(TensorSpecialUnaryOp): + _func_name = "digamma" + + +@implement_scipy(spspecial.digamma) +@infer_dtype(spspecial.digamma) +def digamma(x, out=None, **kwargs): + op = TensorDiGamma(**kwargs) + return op(x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorPoch(TensorSpecialBinOp): + _func_name = "poch" + + +@implement_scipy(spspecial.poch) +@infer_dtype(spspecial.poch) +def poch(a, b, **kwargs): + op = TensorPoch(**kwargs) + return op(a, b) diff --git a/python/xorbits/_mars/tensor/special/hypergeometric_funcs.py b/python/xorbits/_mars/tensor/special/hypergeometric_funcs.py new file mode 100644 index 000000000..af9774b94 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/hypergeometric_funcs.py @@ -0,0 +1,71 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
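+# Illustrative note (added commentary): hyp2f1, hyp1f1 and hyperu take more than
+# two arguments, so they are modeled on ``TensorSpecialMultiOp`` with an explicit
+# ``_ARG_COUNT``, while the two-argument hyp0f1 reuses the binary-operand
+# machinery. A minimal usage sketch, assuming a running Mars session:
+#
+#     >>> import mars.tensor as mt
+#     >>> from mars.tensor import special
+#     >>> a, b, x = mt.tensor([0.5]), mt.tensor([1.5]), mt.tensor([0.25])
+#     >>> special.hyp1f1(a, b, x).execute()   # matches scipy.special.hyp1f1(0.5, 1.5, 0.25)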
+ +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, TensorSpecialMultiOp, _register_special_op + + +@_register_special_op +class TensorHYP2F1(TensorSpecialMultiOp): + _ARG_COUNT = 4 + _func_name = "hyp2f1" + + +@implement_scipy(spspecial.hyp2f1) +@infer_dtype(spspecial.hyp2f1) +def hyp2f1(a, b, c, z, **kwargs): + op = TensorHYP2F1(**kwargs) + return op(a, b, c, z) + + +@_register_special_op +class TensorHYP1F1(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "hyp1f1" + + +@implement_scipy(spspecial.hyp1f1) +@infer_dtype(spspecial.hyp1f1) +def hyp1f1(a, b, x, out=None, **kwargs): + op = TensorHYP1F1(**kwargs) + return op(a, b, x, out=out) + + +@_register_special_op +class TensorHYPERU(TensorSpecialMultiOp): + _ARG_COUNT = 3 + _func_name = "hyperu" + + +@implement_scipy(spspecial.hyperu) +@infer_dtype(spspecial.hyperu) +def hyperu(a, b, x, out=None, **kwargs): + op = TensorHYPERU(**kwargs) + return op(a, b, x, out=out) + + +@_register_special_op +@arithmetic_operand(sparse_mode="binary_and") +class TensorHYP0F1(TensorSpecialBinOp): + _func_name = "hyp0f1" + + +@implement_scipy(spspecial.hyp0f1) +@infer_dtype(spspecial.hyp0f1) +def hyp0f1(v, z, out=None, **kwargs): + op = TensorHYP0F1(**kwargs) + return op(v, z, out=out) diff --git a/python/xorbits/_mars/tensor/special/info_theory.py b/python/xorbits/_mars/tensor/special/info_theory.py new file mode 100644 index 000000000..5cc67b1a1 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/info_theory.py @@ -0,0 +1,191 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import scipy.special as spspecial + +from ..arithmetic.utils import arithmetic_operand +from ..utils import implement_scipy, infer_dtype +from .core import TensorSpecialBinOp, TensorSpecialUnaryOp, _register_special_op + + +@_register_special_op +@arithmetic_operand(sparse_mode="unary") +class TensorEntr(TensorSpecialUnaryOp): + _func_name = "entr" + + +@implement_scipy(spspecial.entr) +@infer_dtype(spspecial.entr) +def entr(x, out=None, where=None, **kwargs): + r""" + Elementwise function for computing entropy. + + .. math:: \text{entr}(x) = \begin{cases} - x \log(x) & x > 0 \\ 0 & x = 0 \\ -\infty & \text{otherwise} \end{cases} + + Parameters + ---------- + x : Tensor + Input tensor. + + Returns + ------- + res : Tensor + The value of the elementwise entropy function at the given points `x`. + + See Also + -------- + kl_div, rel_entr + + Notes + ----- + This function is concave. 
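+
+    Examples
+    --------
+    A small illustrative example (values follow from ``-x*log(x)``):
+
+    >>> import mars.tensor as mt
+    >>> from mars.tensor import special
+    >>> special.entr(mt.tensor([0.5, 1.0, 2.0])).execute()  # approx. [0.3466, 0., -1.3863]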
+ """ + op = TensorEntr(**kwargs) + return op(x, out=out, where=where) + + +@_register_special_op +class TensorRelEntr(TensorSpecialBinOp): + _func_name = "rel_entr" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@implement_scipy(spspecial.rel_entr) +@infer_dtype(spspecial.rel_entr) +def rel_entr(x, y, out=None, where=None, **kwargs): + r""" + Elementwise function for computing relative entropy. + + .. math:: + + \mathrm{rel\_entr}(x, y) = + \begin{cases} + x \log(x / y) & x > 0, y > 0 \\ + 0 & x = 0, y \ge 0 \\ + \infty & \text{otherwise} + \end{cases} + + Parameters + ---------- + x, y : array_like + Input arrays + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Relative entropy of the inputs + + See Also + -------- + entr, kl_div + + Notes + ----- + This function is jointly convex in x and y. + + The origin of this function is in convex programming; see + [1]_. Given two discrete probability distributions :math:`p_1, + \ldots, p_n` and :math:`q_1, \ldots, q_n`, to get the relative + entropy of statistics compute the sum + + .. math:: + + \sum_{i = 1}^n \mathrm{rel\_entr}(p_i, q_i). + + See [2]_ for details. + + References + ---------- + .. [1] Grant, Boyd, and Ye, "CVX: Matlab Software for Disciplined Convex + Programming", http://cvxr.com/cvx/ + .. [2] Kullback-Leibler divergence, + https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence + """ + op = TensorRelEntr(**kwargs) + return op(x, y, out=out, where=where) + + +@_register_special_op +class TensorKlDiv(TensorSpecialBinOp): + _func_name = "kl_div" + + @classmethod + def _is_sparse(cls, x1, x2): + if hasattr(x1, "issparse") and x1.issparse(): + return True + return False + + +@implement_scipy(spspecial.kl_div) +@infer_dtype(spspecial.kl_div) +def kl_div(x, y, out=None, where=None, **kwargs): + r""" + Elementwise function for computing relative entropy. + + .. math:: + + \mathrm{rel\_entr}(x, y) = + \begin{cases} + x \log(x / y) & x > 0, y > 0 \\ + 0 & x = 0, y \ge 0 \\ + \infty & \text{otherwise} + \end{cases} + + Parameters + ---------- + x, y : array_like + Input arrays + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Relative entropy of the inputs + + See Also + -------- + entr, kl_div + + Notes + ----- + This function is jointly convex in x and y. + + The origin of this function is in convex programming; see + [1]_. Given two discrete probability distributions :math:`p_1, + \ldots, p_n` and :math:`q_1, \ldots, q_n`, to get the relative + entropy of statistics compute the sum + + .. math:: + + \sum_{i = 1}^n \mathrm{rel\_entr}(p_i, q_i). + + See [2]_ for details. + + References + ---------- + .. [1] Grant, Boyd, and Ye, "CVX: Matlab Software for Disciplined Convex + Programming", http://cvxr.com/cvx/ + .. [2] Kullback-Leibler divergence, + https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence + """ + op = TensorKlDiv(**kwargs) + return op(x, y, out=out, where=where) diff --git a/python/xorbits/_mars/tensor/special/tests/__init__.py b/python/xorbits/_mars/tensor/special/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/special/tests/test_special.py b/python/xorbits/_mars/tensor/special/tests/test_special.py new file mode 100644 index 000000000..a75f7bd44 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/tests/test_special.py @@ -0,0 +1,321 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy +import scipy.special as spsecial + +from ....core import ExecutableTuple, tile +from ....lib.version import parse as parse_version +from ... import special as mt_special +from ... import tensor +from ..airy import TensorAiry, TensorAirye, TensorItairy +from ..ellip_func_integrals import ( + TensorEllipe, + TensorEllipeinc, + TensorEllipk, + TensorEllipkinc, + TensorEllipkm1, + TensorElliprc, + TensorElliprd, + TensorElliprf, + TensorElliprg, + TensorElliprj, +) +from ..err_fresnel import ( + TensorDawsn, + TensorErf, + TensorErfc, + TensorErfcinv, + TensorErfcx, + TensorErfi, + TensorErfinv, + TensorFresnel, + TensorModFresnelM, + TensorModFresnelP, + TensorVoigtProfile, + TensorWofz, +) +from ..gamma_funcs import TensorBetaInc, TensorGammaln + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("gammaln", TensorGammaln), + ("erf", TensorErf), + ("erfinv", TensorErfinv), + ("erfcinv", TensorErfcinv), + ("wofz", TensorWofz), + ("dawsn", TensorDawsn), + ("ellipk", TensorEllipk), + ("ellipkm1", TensorEllipkm1), + ("ellipe", TensorEllipe), + ("erfc", TensorErfc), + ("erfcx", TensorErfcx), + ("erfi", TensorErfi), + ], +) +def test_unary_operand_no_out(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 5) + t = tensor(raw, chunk_size=3) + + r = mt_func(t) + expect = sp_func(raw) + + assert r.shape == raw.shape + assert r.dtype == expect.dtype + + t, r = tile(t, r) + + assert r.nsplits == t.nsplits + for c in r.chunks: + assert isinstance(c.op, tensor_cls) + assert c.index == c.inputs[0].index + assert c.shape == c.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("erfc", TensorErfc), + ("erfcx", TensorErfcx), + ("erfi", TensorErfi), + ], +) +def test_unary_operand_out(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 5) + t = tensor(raw, chunk_size=3) + + out = tensor(raw, chunk_size=3) + r_with_optional = mt_func(t, out) + expect = sp_func(raw) + + assert out.shape == raw.shape 
+ assert out.dtype == expect.dtype + + assert r_with_optional.shape == raw.shape + assert r_with_optional.dtype == expect.dtype + + t_optional_out, out = tile(t, out) + + assert out.nsplits == t_optional_out.nsplits + for c in out.chunks: + assert isinstance(c.op, tensor_cls) + assert c.index == c.inputs[0].index + assert c.shape == c.inputs[0].shape + + t_optional_r, r_with_optional = tile(t, r_with_optional) + + assert r_with_optional.nsplits == t_optional_r.nsplits + for c in r_with_optional.chunks: + assert isinstance(c.op, tensor_cls) + assert c.index == c.inputs[0].index + assert c.shape == c.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls,n_outputs", + [ + ("fresnel", TensorFresnel, 2), + ("modfresnelp", TensorModFresnelP, 2), + ("modfresnelm", TensorModFresnelM, 2), + ("airy", TensorAiry, 4), + ("airye", TensorAirye, 4), + ("itairy", TensorItairy, 4), + ], +) +def test_unary_tuple_operand(func, tensor_cls, n_outputs): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 5) + t = tensor(raw, chunk_size=3) + + r = mt_func(t) + expect = sp_func(raw) + + assert isinstance(r, ExecutableTuple) + + for r_i, expect_i in zip(r, expect): + assert r_i.shape == expect_i.shape + assert r_i.dtype == expect_i.dtype + assert isinstance(r_i.op, tensor_cls) + + non_tuple_out = tensor(raw, chunk_size=3) + with pytest.raises(TypeError): + r = mt_func(t, non_tuple_out) + + mismatch_size_tuple = ExecutableTuple([t]) + with pytest.raises(TypeError): + r = mt_func(t, mismatch_size_tuple) + + out = ExecutableTuple([t] * n_outputs) + r_out = mt_func(t, out=out) + + assert isinstance(out, ExecutableTuple) + assert isinstance(r_out, ExecutableTuple) + + for r_output, expected_output, out_output in zip(r, expect, out): + assert r_output.shape == expected_output.shape + assert r_output.dtype == expected_output.dtype + assert isinstance(r_output.op, tensor_cls) + + assert out_output.shape == expected_output.shape + assert out_output.dtype == expected_output.dtype + assert isinstance(out_output.op, tensor_cls) + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("betainc", TensorBetaInc), + ("voigt_profile", TensorVoigtProfile), + pytest.param( + "elliprd", + TensorElliprd, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprf", + TensorElliprf, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprg", + TensorElliprg, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_triple_operand(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + + r = mt_func(a, b, c) + expect = sp_func(raw1, raw2, raw3) + + assert r.shape == raw1.shape + assert r.dtype == expect.dtype + + tiled_a, r = tile(a, r) + + assert r.nsplits == tiled_a.nsplits + for chunk in r.chunks: + assert isinstance(chunk.op, tensor_cls) + assert chunk.index == chunk.inputs[0].index + assert chunk.shape == chunk.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + ("ellipkinc", 
TensorEllipkinc), + ("ellipeinc", TensorEllipeinc), + pytest.param( + "elliprc", + TensorElliprc, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_binary_operand(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + + r = mt_func(a, b) + expect = sp_func(raw1, raw2) + + assert r.shape == raw1.shape + assert r.dtype == expect.dtype + + tiled_a, r = tile(a, r) + + assert r.nsplits == tiled_a.nsplits + for chunk in r.chunks: + assert isinstance(chunk.op, tensor_cls) + assert chunk.index == chunk.inputs[0].index + assert chunk.shape == chunk.inputs[0].shape + + +@pytest.mark.parametrize( + "func,tensor_cls", + [ + pytest.param( + "elliprj", + TensorElliprj, + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_quadruple_operand(func, tensor_cls): + sp_func = getattr(spsecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + raw4 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + d = tensor(raw4, chunk_size=3) + + r = mt_func(a, b, c, d) + expect = sp_func(raw1, raw2, raw3, raw4) + + assert r.shape == raw1.shape + assert r.dtype == expect.dtype + + tiled_a, r = tile(a, r) + + assert r.nsplits == tiled_a.nsplits + for chunk in r.chunks: + assert isinstance(chunk.op, tensor_cls) + assert chunk.index == chunk.inputs[0].index + assert chunk.shape == chunk.inputs[0].shape diff --git a/python/xorbits/_mars/tensor/special/tests/test_special_execution.py b/python/xorbits/_mars/tensor/special/tests/test_special_execution.py new file mode 100644 index 000000000..afb1fb274 --- /dev/null +++ b/python/xorbits/_mars/tensor/special/tests/test_special_execution.py @@ -0,0 +1,325 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy +import scipy.sparse as sps +import scipy.special as spspecial + +from ....lib.version import parse as parse_version +from ... import special as mt_special +from ... 
import tensor + + +@pytest.mark.parametrize( + "func", + [ + "gamma", + "gammaln", + "loggamma", + "gammasgn", + "psi", + "rgamma", + "digamma", + "erf", + "erfc", + "erfcx", + "erfi", + "erfinv", + "erfcinv", + "wofz", + "dawsn", + "entr", + "ellipk", + "ellipkm1", + "ellipe", + ], +) +def test_unary_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 6) + a = tensor(raw, chunk_size=3) + + r = mt_func(a) + + result = r.execute().fetch() + expected = sp_func(raw) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan])) + a = tensor(raw, chunk_size=3) + + r = mt_func(a) + + result = r.execute().fetch() + + data = sp_func(raw.data) + expected = sps.csr_matrix((data, raw.indices, raw.indptr), raw.shape) + + np.testing.assert_array_equal(result.toarray(), expected.toarray()) + + +@pytest.mark.parametrize( + "func", + [ + "gammainc", + "gammaincinv", + "gammaincc", + "gammainccinv", + "beta", + "betaln", + "polygamma", + "poch", + "rel_entr", + "kl_div", + "xlogy", + "jv", + "jve", + "yn", + "yv", + "yve", + "kn", + "kv", + "kve", + "iv", + "ive", + "hankel1", + "hankel1e", + "hankel2", + "hankel2e", + "hyp0f1", + "ellipkinc", + "ellipeinc", + pytest.param( + "elliprc", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_binary_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + + r = mt_func(a, b) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + + r = mt_func(a, b) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize( + "func", + [ + "betainc", + "betaincinv", + "hyp1f1", + "hyperu", + "voigt_profile", + pytest.param( + "elliprd", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprf", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + pytest.param( + "elliprg", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_triple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + + r = mt_func(a, b, c) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2, raw3) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + raw3 = 
np.random.rand(4, 3) + c = tensor(raw3, chunk_size=3) + + r = mt_func(a, b, c) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2, raw3) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize( + "func", + [ + "hyp2f1", + "ellip_normal", + pytest.param( + "elliprj", + marks=pytest.mark.skipif( + parse_version(scipy.__version__) < parse_version("1.8.0"), + reason="function not implemented in scipy.", + ), + ), + ], +) +def test_quadruple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + raw4 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + d = tensor(raw4, chunk_size=3) + + r = mt_func(a, b, c, d) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2, raw3, raw4) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + raw3 = np.random.rand(4, 3) + c = tensor(raw3, chunk_size=3) + raw4 = np.random.rand(4, 3) + d = tensor(raw4, chunk_size=3) + + r = mt_func(a, b, c, d) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2, raw3, raw4) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize("func", ["ellip_harm", "ellip_harm_2"]) +def test_quintuple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw1 = np.random.rand(4, 3, 2) + raw2 = np.random.rand(4, 3, 2) + raw3 = np.random.rand(4, 3, 2) + raw4 = np.random.rand(4, 3, 2) + raw5 = np.random.rand(4, 3, 2) + a = tensor(raw1, chunk_size=3) + b = tensor(raw2, chunk_size=3) + c = tensor(raw3, chunk_size=3) + d = tensor(raw4, chunk_size=3) + e = tensor(raw5, chunk_size=3) + + r = mt_func(a, b, c, d, e) + + result = r.execute().fetch() + expected = sp_func(raw1, raw2, raw3, raw4, raw5) + + np.testing.assert_array_equal(result, expected) + + # test sparse + raw1 = sps.csr_matrix(np.array([0, 1.0, 1.01, np.nan] * 3).reshape(4, 3)) + a = tensor(raw1, chunk_size=3) + raw2 = np.random.rand(4, 3) + b = tensor(raw2, chunk_size=3) + raw3 = np.random.rand(4, 3) + c = tensor(raw3, chunk_size=3) + raw4 = np.random.rand(4, 3) + d = tensor(raw4, chunk_size=3) + raw5 = np.random.rand(4, 3) + e = tensor(raw5, chunk_size=3) + + r = mt_func(a, b, c, d, e) + + result = r.execute().fetch() + + expected = sp_func(raw1.toarray(), raw2, raw3, raw4, raw5) + np.testing.assert_array_equal(result.toarray(), expected) + + +@pytest.mark.parametrize( + "func", + ["fresnel", "modfresnelp", "modfresnelm", "airy", "airye", "itairy"], +) +def test_unary_tuple_execution(setup, func): + sp_func = getattr(spspecial, func) + mt_func = getattr(mt_special, func) + + raw = np.random.rand(10, 8, 6) + a = tensor(raw, chunk_size=3) + + r = mt_func(a) + + result = r.execute().fetch() + expected = sp_func(raw) + + for actual_output, expected_output in zip(result, expected): + np.testing.assert_array_equal(actual_output, expected_output) diff --git a/python/xorbits/_mars/tensor/statistics/__init__.py b/python/xorbits/_mars/tensor/statistics/__init__.py new file mode 100644 index 000000000..950de82c9 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/__init__.py @@ -0,0 +1,40 @@ +# Copyright 
2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .average import average +from .bincount import bincount +from .corrcoef import corrcoef +from .cov import cov +from .digitize import TensorDigitize, digitize +from .histogram import ( + TensorHistogram, + TensorHistogramBinEdges, + histogram, + histogram_bin_edges, +) +from .median import median +from .percentile import percentile +from .ptp import ptp +from .quantile import quantile + + +def _install(): + from ..core import Tensor, TensorData + + for cls in (Tensor, TensorData): + setattr(cls, "ptp", ptp) + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/statistics/average.py b/python/xorbits/_mars/tensor/statistics/average.py new file mode 100644 index 000000000..b4700d8cb --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/average.py @@ -0,0 +1,143 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..base.broadcast_to import broadcast_to +from ..base.swapaxes import swapaxes +from ..datasource import tensor as astensor + + +def average(a, axis=None, weights=None, returned=False): + """ + Compute the weighted average along the specified axis. + + Parameters + ---------- + a : array_like + Tensor containing data to be averaged. If `a` is not a tensor, a + conversion is attempted. + axis : None or int or tuple of ints, optional + Axis or axes along which to average `a`. The default, + axis=None, will average over all of the elements of the input tensor. + If axis is negative it counts from the last to the first axis. + + If axis is a tuple of ints, averaging is performed on all of the axes + specified in the tuple instead of a single axis or all the axes as + before. + weights : array_like, optional + A tensor of weights associated with the values in `a`. Each value in + `a` contributes to the average according to its associated weight. + The weights tensor can either be 1-D (in which case its length must be + the size of `a` along the given axis) or of the same shape as `a`. + If `weights=None`, then all data in `a` are assumed to have a + weight equal to one. + returned : bool, optional + Default is `False`. If `True`, the tuple (`average`, `sum_of_weights`) + is returned, otherwise only the average is returned. + If `weights=None`, `sum_of_weights` is equivalent to the number of + elements over which the average is taken. 
+ + + Returns + ------- + average, [sum_of_weights] : tensor_type or double + Return the average along the specified axis. When returned is `True`, + return a tuple with the average as the first element and the sum + of the weights as the second element. The return type is `Float` + if `a` is of integer type, otherwise it is of the same type as `a`. + `sum_of_weights` is of the same type as `average`. + + Raises + ------ + ZeroDivisionError + When all weights along axis are zero. See `numpy.ma.average` for a + version robust to this type of error. + TypeError + When the length of 1D `weights` is not the same as the shape of `a` + along axis. + + See Also + -------- + mean + + Examples + -------- + >>> import mars.tensor as mt + + >>> data = list(range(1,5)) + >>> data + [1, 2, 3, 4] + >>> mt.average(data).execute() + 2.5 + >>> mt.average(range(1,11), weights=range(10,0,-1)).execute() + 4.0 + + >>> data = mt.arange(6).reshape((3,2)) + >>> data.execute() + array([[0, 1], + [2, 3], + [4, 5]]) + >>> mt.average(data, axis=1, weights=[1./4, 3./4]).execute() + array([ 0.75, 2.75, 4.75]) + >>> mt.average(data, weights=[1./4, 3./4]).execute() + Traceback (most recent call last): + ... + TypeError: Axis must be specified when shapes of a and weights differ. + + """ + from ..arithmetic import multiply, truediv + + a = astensor(a) + + if weights is None: + avg = a.mean(axis) + scl = avg.dtype.type(a.size / avg.size) + else: + wgt = astensor(weights) + + if issubclass(a.dtype.type, (np.integer, np.bool_)): + result_dtype = np.result_type(a.dtype, wgt.dtype, "f8") + else: + result_dtype = np.result_type(a.dtype, wgt.dtype) + + # sanity checks + if a.shape != wgt.shape: + if axis is None: + raise TypeError( + "Axis must be specified when shapes of a and weights differ." + ) + if wgt.ndim != 1: + raise TypeError( + "1D weights expected when shapes of a and weights differ." + ) + if wgt.shape[0] != a.shape[axis]: + raise ValueError( + "Length of weights not compatible with specified axis." + ) + + # setup wgt to broadcast along axis + wgt = broadcast_to(wgt, (a.ndim - 1) * (1,) + wgt.shape) + wgt = swapaxes(wgt, -1, axis) + + scl = wgt.sum(axis=axis, dtype=result_dtype) + with np.errstate(divide="raise"): + avg = truediv(multiply(a, wgt, dtype=result_dtype).sum(axis), scl) + + if returned: + if scl.shape != avg.shape: + scl = broadcast_to(scl, avg.shape) + return avg, scl + else: + return avg diff --git a/python/xorbits/_mars/tensor/statistics/bincount.py b/python/xorbits/_mars/tensor/statistics/bincount.py new file mode 100644 index 000000000..257fb24de --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/bincount.py @@ -0,0 +1,301 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import itertools +from typing import Optional + +import numpy as np +import pandas as pd + +from ... 
import get_context, opcodes, options +from ...core import OutputType, recursive_tile +from ...core.operand import OperandStage +from ...serialization.serializables import Int64Field, ReferenceField +from ...utils import ceildiv, has_unknown_shape +from ..datasource import tensor as astensor +from ..operands import TensorMapReduceOperand, TensorOperandMixin + + +class TensorBinCount(TensorMapReduceOperand, TensorOperandMixin): + _op_type_ = opcodes.BINCOUNT + + weights = ReferenceField("weights", default=None) + minlength: Optional[int] = Int64Field("minlength", default=0) + chunk_size_limit: int = Int64Field("chunk_size_limit") + + chunk_count: Optional[int] = Int64Field("chunk_count") + tileable_right_bound: Optional[int] = Int64Field("tileable_right_bound") + + def __call__(self, x, weights=None): + inputs = [x] + self.weights = weights + dtype = np.dtype(np.int_) + if weights is not None: + inputs.append(weights) + dtype = weights.dtype + return self.new_tensor(inputs, dtype=dtype, shape=(np.nan,)) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if len(inputs) > 1: + self.weights = inputs[1] + + @classmethod + def _tile_single(cls, op: "TensorBinCount"): + out = op.outputs[0] + new_chunk_op = op.copy().reset_key() + chunk_inputs = [op.inputs[0].chunks[0]] + if op.weights is not None: + chunk_inputs.append(op.weights.chunks[0]) + new_chunk = new_chunk_op.new_chunk(chunk_inputs, index=(0,), **out.params) + + new_op = op.copy().reset_key() + return new_op.new_tileables( + op.inputs, chunks=[new_chunk], nsplits=((np.nan,),), **out.params + ) + + @classmethod + def tile(cls, op: "TensorBinCount"): + from ...dataframe.operands import DataFrameShuffleProxy + from ...dataframe.utils import parse_index + + if has_unknown_shape(*op.inputs): + yield + + ctx = get_context() + a = op.inputs[0] + out = op.outputs[0] + + if op.weights is not None and a.shape != op.weights.shape: + raise ValueError("The weights and list don't have the same length.") + + input_max = yield from recursive_tile(a.max()) + yield input_max.chunks + [c for inp in op.inputs for c in inp.chunks] + [max_val] = ctx.get_chunks_result([input_max.chunks[0].key]) + tileable_right_bound = max(op.minlength, int(max_val) + 1) + + chunk_count = max(1, ceildiv(tileable_right_bound, op.chunk_size_limit)) + + if ( + len(op.inputs[0].chunks) == 1 + and (op.weights is None or len(op.weights.chunks) == 1) + and chunk_count == 1 + ): + return cls._tile_single(op) + + if op.weights is not None: + weights = yield from recursive_tile(op.weights.rechunk(a.nsplits)) + weights_chunks = weights.chunks + else: + weights_chunks = itertools.repeat(None) + + map_chunks = [] + for a_chunk, weights_chunk in zip(a.chunks, weights_chunks): + new_op = op.copy().reset_key() + new_op.chunk_count = chunk_count + new_op.tileable_right_bound = tileable_right_bound + new_op.stage = OperandStage.map + new_op._output_types = [OutputType.series] + + inputs = [a_chunk] + if weights_chunk is not None: + inputs.append(weights_chunk) + map_chunks.append( + new_op.new_chunk( + inputs, + dtype=out.dtype, + shape=(np.nan,), + index=a_chunk.index, + index_value=parse_index(pd.Index([0], dtype=np.int64), a_chunk.key), + ) + ) + + shuffle_op = DataFrameShuffleProxy(output_types=[OutputType.tensor]).new_chunk( + map_chunks, dtype=out.dtype, shape=() + ) + + reduce_chunks = [] + reduce_nsplits = [] + left_offset = 0 + for chunk_idx in range(chunk_count): + right_offset = min(tileable_right_bound, left_offset + op.chunk_size_limit) + + new_op = 
op.copy().reset_key() + new_op.stage = OperandStage.reduce + new_op.reducer_ordinal = chunk_idx + new_op.n_reducers = chunk_count + new_op.chunk_count = chunk_count + new_op.tileable_right_bound = tileable_right_bound + + reduce_chunks.append( + new_op.new_chunk( + [shuffle_op], + dtype=out.dtype, + shape=(right_offset - left_offset,), + index=(chunk_idx,), + ) + ) + reduce_nsplits.append(right_offset - left_offset) + left_offset = right_offset + + new_op = op.copy().reset_key() + params = out.params.copy() + params["shape"] = (tileable_right_bound,) + return new_op.new_tileables( + op.inputs, + chunks=reduce_chunks, + nsplits=(tuple(reduce_nsplits),), + **params, + ) + + @classmethod + def _execute_map(cls, ctx, op: "TensorBinCount"): + input_val = ctx[op.inputs[0].key] + if op.weights is not None: + weights_val = ctx[op.weights.key] + df = pd.DataFrame({"data": input_val, "weights": weights_val}) + res = df.groupby("data")["weights"].sum() + else: + res = pd.Series(input_val).groupby(input_val).count() + + if res.index.min() < 0: + raise ValueError("'list' argument must have no negative elements") + + left_bound = 0 + for target_idx in range(op.chunk_count): + right_bound = res.index.searchsorted( + (1 + target_idx) * op.chunk_size_limit, "left" + ) + sliced = res.iloc[left_bound:right_bound] + if len(sliced) > 0: + ctx[op.outputs[0].key, (target_idx,)] = sliced + else: + # ensure all mapper data are inserted context + ctx[op.outputs[0].key, (target_idx,)] = None + left_bound = right_bound + + @classmethod + def _execute_reduce(cls, ctx, op: "TensorBinCount"): + out = op.outputs[0] + input_list = list( + d for d in op.iter_mapper_data(ctx, skip_none=True) if d is not None + ) + left_bound = op.chunk_size_limit * out.index[0] + right_bound = min(left_bound + op.chunk_size_limit, op.tileable_right_bound) + if not input_list: + ctx[op.outputs[0].key] = np.zeros(right_bound - left_bound) + else: + res = functools.reduce( + lambda a, b: a.add(b, fill_value=0), input_list + ).astype(out.dtype) + res = res.reindex(pd.RangeIndex(left_bound, right_bound), fill_value=0) + ctx[op.outputs[0].key] = res.values + + @classmethod + def execute(cls, ctx, op: "TensorBinCount"): + if op.stage == OperandStage.map: + op._execute_map(ctx, op) + elif op.stage == OperandStage.reduce: + op._execute_reduce(ctx, op) + else: + input_val = ctx[op.inputs[0].key] + weights_val = ctx[op.weights.key] if op.weights is not None else None + ctx[op.outputs[0].key] = np.bincount( + input_val, weights=weights_val, minlength=op.minlength + ) + + +def bincount(x, weights=None, minlength=0, chunk_size_limit=None): + """ + Count number of occurrences of each value in array of non-negative ints. + + The number of bins (of size 1) is one larger than the largest value in + `x`. If `minlength` is specified, there will be at least this number + of bins in the output array (though it will be longer if necessary, + depending on the contents of `x`). + Each bin gives the number of occurrences of its index value in `x`. + If `weights` is specified the input array is weighted by it, i.e. if a + value ``n`` is found at position ``i``, ``out[n] += weight[i]`` instead + of ``out[n] += 1``. + + Parameters + ---------- + x : tensor or array_like, 1 dimension, nonnegative ints + Input array. + weights : tensor or array_like, optional + Weights, array of the same shape as `x`. + minlength : int, optional + A minimum number of bins for the output array. + + Returns + ------- + out : tensor of ints + The result of binning the input array. 
+ The length of `out` is equal to ``np.amax(x)+1``. + + Raises + ------ + ValueError + If the input is not 1-dimensional, or contains elements with negative + values, or if `minlength` is negative. + TypeError + If the type of the input is float or complex. + + See Also + -------- + histogram, digitize, unique + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.bincount(mt.arange(5)).execute() + array([1, 1, 1, 1, 1]) + >>> mt.bincount(mt.tensor([0, 1, 1, 3, 2, 1, 7])).execute() + array([1, 3, 1, 1, 0, 0, 0, 1]) + + The input array needs to be of integer dtype, otherwise a + TypeError is raised: + + >>> mt.bincount(mt.arange(5, dtype=float)).execute() + Traceback (most recent call last): + ....execute() + TypeError: Cannot cast array data from dtype('float64') to dtype('int64') + according to the rule 'safe' + + A possible use of ``bincount`` is to perform sums over + variable-size chunks of an array, using the ``weights`` keyword. + + >>> w = mt.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights + >>> x = mt.array([0, 1, 1, 2, 2, 2]) + >>> mt.bincount(x, weights=w).execute() + array([ 0.3, 0.7, 1.1]) + """ + x = astensor(x) + weights = astensor(weights) if weights is not None else None + + if not np.issubdtype(x.dtype, np.int_): + raise TypeError(f"Cannot cast array data from {x.dtype} to {np.dtype(np.int_)}") + if x.ndim != 1: + raise ValueError("'x' must be 1 dimension") + if minlength < 0: + raise ValueError("'minlength' must not be negative") + + chunk_size_limit = ( + chunk_size_limit + if chunk_size_limit is not None + else options.bincount.chunk_size_limit + ) + op = TensorBinCount(minlength=minlength, chunk_size_limit=chunk_size_limit) + return op(x, weights=weights) diff --git a/python/xorbits/_mars/tensor/statistics/core.py b/python/xorbits/_mars/tensor/statistics/core.py new file mode 100644 index 000000000..d512ede9f --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/core.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import normalize_axis_tuple + + +def _ureduce(a, func, **kwargs): + """ + Internal Function. + Call `func` with `a` as first argument swapping the axes to use extended + axis on functions that don't support it natively. + + Returns result and a.shape with axis dims set to 1. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + func : callable + Reduction function capable of receiving a single axis argument. + It is called with `a` as first argument followed by `kwargs`. + kwargs : keyword arguments + additional keyword arguments to pass to `func`. + + Returns + ------- + result : tuple + Result of func(a, **kwargs) and a.shape with axis dims set to 1 + which can be used to reshape the result to the same shape a ufunc with + keepdims=True would produce. 
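A plain-numpy illustration of the axis handling described above, assuming the same swap-then-reshape strategy that `_ureduce` uses; this is only a sketch, not the module's helper itself:

```python
# Kept axes are moved to the front, the reduced axes are folded into one
# trailing axis, and `keepdim` records the broadcastable output shape.
import numpy as np

a = np.arange(24).reshape(2, 3, 4)
axis = (0, 2)                                   # reduce over axes 0 and 2, keep axis 1

keep = sorted(set(range(a.ndim)) - set(axis))
b = a
for i, s in enumerate(keep):
    b = np.swapaxes(b, i, s)                    # kept axes first
b = b.reshape(b.shape[:len(keep)] + (-1,))      # merge all reduced axes into one

keepdim = tuple(1 if ax in axis else s for ax, s in enumerate(a.shape))
r = np.median(b, axis=-1)                       # reduction now needs only a single axis
assert np.allclose(r, np.median(a, axis=axis))
assert r.reshape(keepdim).shape == np.median(a, axis=axis, keepdims=True).shape
```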
+ + """ + axis = kwargs.get("axis", None) + if axis is not None: + keepdim = list(a.shape) + nd = a.ndim + axis = normalize_axis_tuple(axis, nd) + + for ax in axis: + keepdim[ax] = 1 + + if len(axis) == 1: + kwargs["axis"] = axis[0] + else: + keep = set(range(nd)) - set(axis) + nkeep = len(keep) + # swap axis that should not be reduced to front + for i, s in enumerate(sorted(keep)): + a = a.swapaxes(i, s) + # merge reduced axis + a = a.reshape(a.shape[:nkeep] + (-1,)) + kwargs["axis"] = -1 + keepdim = tuple(keepdim) + else: + keepdim = (1,) * a.ndim + + r = func(a, **kwargs) + return r, keepdim diff --git a/python/xorbits/_mars/tensor/statistics/corrcoef.py b/python/xorbits/_mars/tensor/statistics/corrcoef.py new file mode 100644 index 000000000..461fd204e --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/corrcoef.py @@ -0,0 +1,77 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cov import cov + + +def corrcoef(x, y=None, rowvar=True): + r""" + Return Pearson product-moment correlation coefficients. + + Please refer to the documentation for `cov` for more detail. The + relationship between the correlation coefficient matrix, `R`, and the + covariance matrix, `C`, is + + .. math:: R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } } + + The values of `R` are between -1 and 1, inclusive. + + Parameters + ---------- + x : array_like + A 1-D or 2-D array containing multiple variables and observations. + Each row of `x` represents a variable, and each column a single + observation of all those variables. Also see `rowvar` below. + y : array_like, optional + An additional set of variables and observations. `y` has the same + shape as `x`. + rowvar : bool, optional + If `rowvar` is True (default), then each row represents a + variable, with observations in the columns. Otherwise, the relationship + is transposed: each column represents a variable, while the rows + contain observations. + + Returns + ------- + R : Tensor + The correlation coefficient matrix of the variables. + + See Also + -------- + cov : Covariance matrix + + Notes + ----- + Due to floating point rounding the resulting tensor may not be Hermitian, + the diagonal elements may not be 1, and the elements may not satisfy the + inequality abs(a) <= 1. The real and imaginary parts are clipped to the + interval [-1, 1] in an attempt to improve on that situation but is not + much help in the complex case. + + This function accepts but discards arguments `bias` and `ddof`. This is + for backwards compatibility with previous versions of this function. These + arguments had no effect on the return values of the function and can be + safely ignored in this and previous versions of numpy. 
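For concreteness, a plain-numpy rendering of the normalization described in these notes, including the clip that absorbs floating-point rounding; this is a sketch checked against `np.corrcoef`, not the tensor implementation itself:

```python
# R[i, j] = C[i, j] / sqrt(C[i, i] * C[j, j]), then clip to [-1, 1].
import numpy as np

x = np.array([[0.0, 1.0, 2.0], [2.0, 1.0, 0.0]])
c = np.cov(x)                      # covariance matrix C
d = np.sqrt(np.diag(c))            # sqrt(C_ii)
r = c / d[:, None] / d[None, :]    # divide by sqrt_d as a column, then as a row
r = np.clip(r, -1.0, 1.0)
assert np.allclose(r, np.corrcoef(x))
```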
+ + """ + from ..arithmetic import sqrt + from ..datasource import diag + + c = cov(x, y, rowvar) + if c.ndim == 0: + return c / c + d = diag(c) + d = d.reshape(d.shape[0], 1) + sqrt_d = sqrt(d) + return (c / sqrt_d) / sqrt_d.T diff --git a/python/xorbits/_mars/tensor/statistics/cov.py b/python/xorbits/_mars/tensor/statistics/cov.py new file mode 100644 index 000000000..7a838c095 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/cov.py @@ -0,0 +1,222 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import numpy as np + +from ..base.squeeze import squeeze +from ..base.where import where +from ..core import Tensor +from ..datasource import array +from ..datasource import tensor as astensor +from .average import average + + +def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None): + """ + Estimate a covariance matrix, given data and weights. + + Covariance indicates the level to which two variables vary together. + If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`, + then the covariance matrix element :math:`C_{ij}` is the covariance of + :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance + of :math:`x_i`. + + See the notes for an outline of the algorithm. + + Parameters + ---------- + m : array_like + A 1-D or 2-D array containing multiple variables and observations. + Each row of `m` represents a variable, and each column a single + observation of all those variables. Also see `rowvar` below. + y : array_like, optional + An additional set of variables and observations. `y` has the same form + as that of `m`. + rowvar : bool, optional + If `rowvar` is True (default), then each row represents a + variable, with observations in the columns. Otherwise, the relationship + is transposed: each column represents a variable, while the rows + contain observations. + bias : bool, optional + Default normalization (False) is by ``(N - 1)``, where ``N`` is the + number of observations given (unbiased estimate). If `bias` is True, + then normalization is by ``N``. These values can be overridden by using + the keyword ``ddof`` in numpy versions >= 1.5. + ddof : int, optional + If not ``None`` the default value implied by `bias` is overridden. + Note that ``ddof=1`` will return the unbiased estimate, even if both + `fweights` and `aweights` are specified, and ``ddof=0`` will return + the simple average. See the notes for the details. The default value + is ``None``. + fweights : array_like, int, optional + 1-D tensor of integer freguency weights; the number of times each + observation vector should be repeated. + aweights : array_like, optional + 1-D tensor of observation vector weights. These relative weights are + typically large for observations considered "important" and smaller for + observations considered less "important". If ``ddof=0`` the array of + weights can be used to assign probabilities to observation vectors. 
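The weighted normalization that `fweights` and `aweights` imply is spelled out in the Notes further below; as a sanity check, here is a hedged plain-numpy sketch of those steps compared against NumPy's own `cov` (the sample data and weights are arbitrary):

```python
# w = f * a, v1 = sum(w), v2 = sum(w * a); center by the weighted mean and
# normalize by v1 / (v1**2 - ddof * v2).
import numpy as np

m = np.array([[0.0, 1.0, 2.0, 3.0], [3.0, 1.0, 2.0, 0.0]])
f = np.array([1, 2, 1, 1])            # integer frequency weights
a = np.array([0.5, 1.0, 1.0, 2.0])    # observation (reliability) weights
ddof = 1

w = f * a
v1 = w.sum()
v2 = (w * a).sum()
centered = m - (m * w).sum(axis=1, keepdims=True) / v1
c = (centered * w) @ centered.T * v1 / (v1 ** 2 - ddof * v2)
assert np.allclose(c, np.cov(m, fweights=f, aweights=a, ddof=ddof))
```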
+ + Returns + ------- + out : Tensor + The covariance matrix of the variables. + + See Also + -------- + corrcoef : Normalized covariance matrix + + Notes + ----- + Assume that the observations are in the columns of the observation + array `m` and let ``f = fweights`` and ``a = aweights`` for brevity. The + steps to compute the weighted covariance are as follows:: + + >>> w = f * a + >>> v1 = mt.sum(w) + >>> v2 = mt.sum(w * a) + >>> m -= mt.sum(m * w, axis=1, keepdims=True) / v1 + >>> cov = mt.dot(m * w, m.T) * v1 / (v1**2 - ddof * v2) + + Note that when ``a == 1``, the normalization factor + ``v1 / (v1**2 - ddof * v2)`` goes over to ``1 / (np.sum(f) - ddof)`` + as it should. + + Examples + -------- + Consider two variables, :math:`x_0` and :math:`x_1`, which + correlate perfectly, but in opposite directions: + + >>> import mars.tensor as mt + + >>> x = mt.array([[0, 2], [1, 1], [2, 0]]).T + >>> x.execute() + array([[0, 1, 2], + [2, 1, 0]]) + + Note how :math:`x_0` increases while :math:`x_1` decreases. The covariance + matrix shows this clearly: + + >>> mt.cov(x).execute() + array([[ 1., -1.], + [-1., 1.]]) + + Note that element :math:`C_{0,1}`, which shows the correlation between + :math:`x_0` and :math:`x_1`, is negative. + + Further, note how `x` and `y` are combined: + + >>> x = [-2.1, -1, 4.3] + >>> y = [3, 1.1, 0.12] + >>> X = mt.stack((x, y), axis=0) + >>> print(mt.cov(X).execute()) + [[ 11.71 -4.286 ] + [ -4.286 2.14413333]] + >>> print(mt.cov(x, y).execute()) + [[ 11.71 -4.286 ] + [ -4.286 2.14413333]] + >>> print(mt.cov(x).execute()) + 11.71 + + """ + from ..linalg import dot + from ..merge import vstack + + if ddof is not None and ddof != int(ddof): + raise ValueError("ddof must be integer") + + m = astensor(m) + if m.ndim > 2: + raise ValueError("m has more than 2 dimensions") + + if y is None: + dtype = np.result_type(m.dtype, np.float64) + else: + y = astensor(y) + if y.ndim > 2: + raise ValueError("y has more than 2 dimensions") + dtype = np.result_type(m.dtype, y.dtype, np.float64) + + X = array(m, ndmin=2, dtype=dtype) + if not rowvar and X.shape[0] != 1: + X = X.T + if y is not None: + y = array(y, copy=False, ndmin=2, dtype=dtype) + if not rowvar and y.shape[0] != 1: + y = y.T + X = vstack((X, y)) + + if ddof is None: + if bias == 0: + ddof = 1 + else: + ddof = 0 + + # Get the product of frequencies and weights + w = None + if fweights is not None: + fweights = astensor(fweights, dtype=float) + if fweights.ndim > 1: + raise RuntimeError("cannot handle multidimensional fweights") + if fweights.shape[0] != X.shape[1]: + raise RuntimeError("incompatible numbers of samples and fweights") + if any(fweights < 0): + raise ValueError("fweights cannot be negative") + w = fweights + if aweights is not None: + aweights = astensor(aweights, dtype=float) + if aweights.ndim > 1: + raise RuntimeError("cannot handle multidimensional aweights") + if aweights.shape[0] != X.shape[1]: + raise RuntimeError("incompatible numbers of samples and aweights") + if any(aweights < 0): + raise ValueError("aweights cannot be negative") + if w is None: + w = aweights + else: + w *= aweights + + avg, w_sum = average(X, axis=1, weights=w, returned=True) + w_sum = w_sum[0] + + # Determine the normalization + if w is None: + fact = X.shape[1] - ddof + elif ddof == 0: + fact = w_sum + elif aweights is None: + fact = w_sum - ddof + else: + fact = w_sum - ddof * sum(w * aweights) / w_sum + + X -= avg[:, None] + if w is None: + X_T = X.T + else: + X_T = (X * w).T + c = dot(X, X_T.conj()) + if isinstance(fact, 
Tensor): + fact = where(fact <= 0, 0.0, fact) + fact = fact.astype(float) + else: + if fact <= 0: + warnings.warn( + "Degrees of freedom <= 0 for slice", RuntimeWarning, stacklevel=2 + ) + fact = 0.0 + fact = np.float64(fact) + c = c * (1.0 / fact) + return squeeze(c) diff --git a/python/xorbits/_mars/tensor/statistics/digitize.py b/python/xorbits/_mars/tensor/statistics/digitize.py new file mode 100644 index 000000000..39407a977 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/digitize.py @@ -0,0 +1,186 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import recursive_tile +from ...lib.sparse.core import get_array_module +from ...serialization.serializables import AnyField, BoolField, KeyField +from ...utils import has_unknown_shape +from ..array_utils import as_same_device, device +from ..core import Tensor, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorHasInput, TensorOperandMixin + + +class TensorDigitize(TensorHasInput, TensorOperandMixin): + _op_type_ = OperandDef.DIGITIZE + + _input = KeyField("input") + _bins = AnyField("bins") + _right = BoolField("right") + + def __init__(self, right=False, **kw): + super().__init__(_right=right, **kw) + + @property + def bins(self): + return self._bins + + @property + def right(self): + return self._right + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._input = self._inputs[0] + if len(inputs) > 1: + self._bins = self._inputs[1] + + def __call__(self, x, bins): + x = astensor(x) + inputs = [x] + if not isinstance(bins, Tensor): + bins = get_array_module(bins).asarray(bins) + self._bins = bins + else: + inputs.append(bins) + self.dtype = np.digitize( + [0], np.empty(1, dtype=bins.dtype), right=self._right + ).dtype + + return self.new_tensor(inputs, x.shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + tensor = op.outputs[0] + in_tensor = op.input + bins = op.bins + if len(op.inputs) == 2: + # bins is TensorData + if has_unknown_shape(bins): + yield + bins = (yield from recursive_tile(bins.rechunk(tensor.shape))).chunks[0] + + out_chunks = [] + for c in in_tensor.chunks: + input_chunks = [c] + if len(op.inputs) == 2: + input_chunks.append(bins) + out_chunk = ( + op.copy() + .reset_key() + .new_chunk( + input_chunks, shape=c.shape, index=c.index, order=tensor.order + ) + ) + out_chunks.append(out_chunk) + + new_op = op.copy() + return new_op.new_tensors( + op.inputs, + tensor.shape, + order=tensor.order, + chunks=out_chunks, + nsplits=in_tensor.nsplits, + ) + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True + ) + + x = inputs[0] + if len(inputs) > 1: + bins = inputs[1] + else: + bins = op.bins + + with device(device_id): + ctx[op.outputs[0].key] = xp.digitize(x, bins=bins, right=op.right) + + +def digitize(x, bins, right=False): + """ + 
Return the indices of the bins to which each value in input tensor belongs. + + Each index ``i`` returned is such that ``bins[i-1] <= x < bins[i]`` if + `bins` is monotonically increasing, or ``bins[i-1] > x >= bins[i]`` if + `bins` is monotonically decreasing. If values in `x` are beyond the + bounds of `bins`, 0 or ``len(bins)`` is returned as appropriate. If right + is True, then the right bin is closed so that the index ``i`` is such + that ``bins[i-1] < x <= bins[i]`` or ``bins[i-1] >= x > bins[i]`` if `bins` + is monotonically increasing or decreasing, respectively. + + Parameters + ---------- + x : array_like + Input tensor to be binned. + bins : array_like + Array of bins. It has to be 1-dimensional and monotonic. + right : bool, optional + Indicating whether the intervals include the right or the left bin + edge. Default behavior is (right==False) indicating that the interval + does not include the right edge. The left bin end is open in this + case, i.e., bins[i-1] <= x < bins[i] is the default behavior for + monotonically increasing bins. + + Returns + ------- + out : Tensor of ints + Output tensor of indices, of same shape as `x`. + + Raises + ------ + ValueError + If `bins` is not monotonic. + TypeError + If the type of the input is complex. + + See Also + -------- + bincount, histogram, unique, searchsorted + + Notes + ----- + If values in `x` are such that they fall outside the bin range, + attempting to index `bins` with the indices that `digitize` returns + will result in an IndexError. + + `mt.digitize` is implemented in terms of `mt.searchsorted`. This means + that a binary search is used to bin the values, which scales much better + for larger number of bins than the previous linear search. It also removes + the requirement for the input array to be 1-dimensional. + + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.array([0.2, 6.4, 3.0, 1.6]) + >>> bins = mt.array([0.0, 1.0, 2.5, 4.0, 10.0]) + >>> inds = mt.digitize(x, bins) + >>> inds.execute() + array([1, 4, 3, 2]) + + >>> x = mt.array([1.2, 10.0, 12.4, 15.5, 20.]) + >>> bins = mt.array([0, 5, 10, 15, 20]) + >>> mt.digitize(x,bins,right=True).execute() + array([1, 2, 3, 4, 4]) + >>> mt.digitize(x,bins,right=False).execute() + array([1, 3, 3, 4, 5]) + """ + op = TensorDigitize(right=right) + return op(x, bins) diff --git a/python/xorbits/_mars/tensor/statistics/histogram.py b/python/xorbits/_mars/tensor/statistics/histogram.py new file mode 100644 index 000000000..b990c46b1 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/histogram.py @@ -0,0 +1,1006 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +import warnings + +import numpy as np + +from ... import opcodes as OperandDef +from ... 
import tensor as mt +from ...core import recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, BoolField, KeyField, TupleField +from ...utils import has_unknown_shape +from ..arithmetic.utils import chunk_tree_add +from ..array_utils import as_same_device, device +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import tensor as astensor +from ..operands import TensorOperand, TensorOperandMixin +from ..utils import is_asc_sorted + +# note: some logic of this file were adopted from `numpy/lib/histograms` + + +def _ptp(range_): + """Peak-to-peak value of x. + + This implementation avoids the problem of signed integer arrays having a + peak-to-peak value that cannot be represented with the array's data type. + This function returns an unsigned value for signed integer arrays. + """ + return _unsigned_subtract(*range_[::-1]) + + +class HistBinSelector: + def __init__(self, histogram_bin_edges_op, x, range, raw_range): + self._op = histogram_bin_edges_op + self._x = x + self._range = range + self._raw_range = raw_range + self._width = None + + def check(self): + # not checked before + width = self() + if width is None: + return + self._width = width = yield from recursive_tile(width) + yield [c.data for c in width.chunks] + + def __call__(self): + return + + def get_result(self): + ctx = get_context() + width = ctx.get_chunks_result([self._width.chunks[0].key])[0] + return width + + +class HistBinSqrtSelector(HistBinSelector): + """ + Square root histogram bin estimator. + + Bin width is inversely proportional to the data size. Used by many + programs for its simplicity. + """ + + def get_result(self): + return _ptp(self._raw_range) / np.sqrt(self._x.size) + + +class HistBinSturgesSelector(HistBinSelector): + """ + Sturges histogram bin estimator. + + A very simplistic estimator based on the assumption of normality of + the data. This estimator has poor performance for non-normal data, + which becomes especially obvious for large data sets. The estimate + depends only on size of the data. + """ + + def get_result(self): + return _ptp(self._raw_range) / (np.log2(self._x.size) + 1.0) + + +class HistBinRiceSelector(HistBinSelector): + """ + Rice histogram bin estimator. + + Another simple estimator with no normality assumption. It has better + performance for large data than Sturges, but tends to overestimate + the number of bins. The number of bins is proportional to the cube + root of data size (asymptotically optimal). The estimate depends + only on size of the data. + """ + + def get_result(self): + return _ptp(self._raw_range) / (2.0 * self._x.size ** (1.0 / 3)) + + +class HistBinScottSelector(HistBinSelector): + """ + Scott histogram bin estimator. + + The binwidth is proportional to the standard deviation of the data + and inversely proportional to the cube root of data size + (asymptotically optimal). + """ + + def __call__(self): + return (24.0 * np.pi**0.5 / self._x.size) ** (1.0 / 3.0) * mt.std(self._x) + + +class HistBinStoneSelector(HistBinSelector): + """ + Histogram bin estimator based on minimizing the estimated integrated squared error (ISE). + + The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution. + The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule. + https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule + + This paper by Stone appears to be the origination of this rule. 
+ http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf + """ + + def __call__(self): + n = self._x.size + ptp_x = _ptp(self._raw_range) + + if n <= 1 or ptp_x == 0: + return + + nbins_upper_bound = max(100, int(np.sqrt(n))) + candidates = [] + for nbins in range(1, nbins_upper_bound + 1): + hh = ptp_x / nbins + p_k = histogram(self._x, bins=nbins, range=self._range)[0] / n + candidate = (2 - (n + 1) * p_k.dot(p_k)) / hh + candidates.append(candidate) + nbins = mt.stack(candidates).argmin() + 1 + return ptp_x / nbins + + def get_result(self): + ptp_x = _ptp(self._raw_range) + if self._x.size <= 1 or ptp_x == 0: + return 0.0 + else: + return super().get_result() + + +class HistBinDoaneSelector(HistBinSelector): + """ + Doane's histogram bin estimator. + + Improved version of Sturges' formula which works better for + non-normal data. See + stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning + """ + + def __call__(self): + x = self._x + if x.size <= 2: + return + + sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) + sigma = mt.std(x) + g1 = mt.mean(((x - mt.mean(x)) / sigma) ** 3) + ret = _ptp(self._raw_range) / ( + 1.0 + np.log2(x.size) + mt.log2(1.0 + mt.absolute(g1) / sg1) + ) + return mt.where(sigma > 0.0, ret, 0.0) + + def get_result(self): + if self._x.size <= 2: + return 0.0 + else: + return super().get_result() + + +class HistBinFdSelector(HistBinSelector): + """ + The Freedman-Diaconis histogram bin estimator. + + The Freedman-Diaconis rule uses interquartile range (IQR) to + estimate binwidth. It is considered a variation of the Scott rule + with more robustness as the IQR is less affected by outliers than + the standard deviation. However, the IQR depends on fewer points + than the standard deviation, so it is less accurate, especially for + long tailed distributions. + + If the IQR is 0, this function returns 1 for the number of bins. + Binwidth is inversely proportional to the cube root of data size + (asymptotically optimal). + """ + + def __call__(self): + iqr = mt.subtract(*mt.percentile(self._x, [75, 25])) + return 2.0 * iqr * self._x.size ** (-1.0 / 3.0) + + +class HistBinAutoSelector(HistBinSelector): + """ + Histogram bin estimator that uses the minimum width of the + Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero + and the Sturges estimator if the FD bandwidth is 0. + + The FD estimator is usually the most robust method, but its width + estimate tends to be too large for small `x` and bad for data with limited + variance. The Sturges estimator is quite good for small (<1000) datasets + and is the default in the R language. This method gives good off the shelf + behaviour. + + If there is limited variance the IQR can be 0, which results in the + FD bin width being 0 too. This is not a valid bin width, so + ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal. + If the IQR is 0, it's unlikely any variance based estimators will be of + use, so we revert to the sturges estimator, which only uses the size of the + dataset in its calculation. 
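Assuming the formulas quoted in these docstrings, the 'fd', 'sturges' and 'auto' rules can be sketched directly with plain numpy; this only illustrates the bin widths they would propose for a bimodal sample, not the estimator classes above:

```python
import numpy as np

rng = np.random.RandomState(10)
x = np.concatenate([rng.normal(size=1000), rng.normal(loc=5, scale=2, size=1000)])
n = x.size
data_range = x.max() - x.min()

iqr = np.subtract(*np.percentile(x, [75, 25]))
fd_width = 2.0 * iqr * n ** (-1.0 / 3.0)             # Freedman-Diaconis
sturges_width = data_range / (np.log2(n) + 1.0)      # Sturges
auto_width = min(fd_width, sturges_width) if fd_width else sturges_width

for name, width in [("fd", fd_width), ("sturges", sturges_width), ("auto", auto_width)]:
    print(name, int(np.ceil(data_range / width)), "bins")
```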
+ """ + + def __init__(self, histogram_bin_edges_op, x, range, raw_range): + super().__init__(histogram_bin_edges_op, x, range, raw_range) + self._bin_fd = HistBinFdSelector(histogram_bin_edges_op, x, range, raw_range) + self._bin_sturges = HistBinSturgesSelector( + histogram_bin_edges_op, x, range, raw_range + ) + + def __call__(self): + return self._bin_fd() + + def get_result(self): + fd_bw = super().get_result() + sturges_bw = self._bin_sturges.get_result() + if fd_bw: + return min(fd_bw, sturges_bw) + else: + # limited variance, so we return a len dependent bw estimator + return sturges_bw + + +# Private dict initialized at module load time +_hist_bin_selectors = { + "stone": HistBinStoneSelector, + "auto": HistBinAutoSelector, + "doane": HistBinDoaneSelector, + "fd": HistBinFdSelector, + "rice": HistBinRiceSelector, + "scott": HistBinScottSelector, + "sqrt": HistBinSqrtSelector, + "sturges": HistBinSturgesSelector, +} + + +def _ravel_and_check_weights(a, weights): + """Check a and weights have matching shapes, and ravel both""" + a = astensor(a) + + # Ensure that the array is a "subtractable" dtype + if a.dtype == np.bool_: + warnings.warn( + f"Converting input from {a.dtype} to {np.uint8} for compatibility.", + RuntimeWarning, + stacklevel=3, + ) + a = a.astype(np.uint8) + + if weights is not None: + weights = astensor(weights) + if weights.shape != a.shape: + raise ValueError("weights should have the same shape as a.") + weights = weights.ravel() + a = a.ravel() + return a, weights + + +def _check_range(range): + first_edge, last_edge = range + if first_edge > last_edge: + raise ValueError("max must be larger than min in range parameter.") + if not (np.isfinite(first_edge) and np.isfinite(last_edge)): + raise ValueError(f"supplied range of [{first_edge}, {last_edge}] is not finite") + return first_edge, last_edge + + +def _get_outer_edges(a, range): + """ + Determine the outer bin edges to use, from either the data or the range + argument + """ + if range is not None: + first_edge, last_edge = _check_range(range) + else: + assert a.size == 0 + # handle empty arrays. Can't determine range, so use 0-1. 
+ first_edge, last_edge = 0, 1 + + # expand empty range to avoid divide by zero + if first_edge == last_edge: + first_edge = first_edge - 0.5 + last_edge = last_edge + 0.5 + + return first_edge, last_edge + + +def _unsigned_subtract(a, b): + """ + Subtract two values where a >= b, and produce an unsigned result + + This is needed when finding the difference between the upper and lower + bound of an int16 histogram + """ + # coerce to a single type + signed_to_unsigned = { + np.byte: np.ubyte, + np.short: np.ushort, + np.intc: np.uintc, + np.int_: np.uint, + np.longlong: np.ulonglong, + } + dt = np.result_type(a, b) + try: + dt = signed_to_unsigned[dt.type] + except KeyError: # pragma: no cover + return np.subtract(a, b, dtype=dt) + else: + # we know the inputs are integers, and we are deliberately casting + # signed to unsigned + return np.subtract(a, b, casting="unsafe", dtype=dt) + + +def _get_bin_edges(op, a, bins, range, weights): + # parse the overloaded bins argument + n_equal_bins = None + bin_edges = None + first_edge = None + last_edge = None + + if isinstance(bins, str): + # when `bins` is str, x.min() and x.max() + # will be calculated in advance + bin_name = bins + if a.size > 0: + assert range is not None + + raw_range = range + first_edge, last_edge = _get_outer_edges(a, range) + + if a.size == 0: + n_equal_bins = 1 + else: + # Do not call selectors on empty arrays + selector = _hist_bin_selectors[bin_name]( + op, a, (first_edge, last_edge), raw_range + ) + yield from selector.check() + width = selector.get_result() + if width: + n_equal_bins = int( + np.ceil(_unsigned_subtract(last_edge, first_edge) / width) + ) + else: + # Width can be zero for some estimators, e.g. FD when + # the IQR of the data is zero. + n_equal_bins = 1 + + elif mt.ndim(bins) == 0: + first_edge, last_edge = _get_outer_edges(a, range) + n_equal_bins = bins + + else: + # cannot be Tensor, must be calculated first + assert mt.ndim(bins) == 1 and not isinstance(bins, TENSOR_TYPE) + bin_edges = np.asarray(bins) + if not is_asc_sorted(bin_edges): + raise ValueError("`bins` must increase monotonically, when an array") + + if n_equal_bins is not None: + # numpy gh-10322 means that type resolution rules are dependent on array + # shapes. To avoid this causing problems, we pick a type now and stick + # with it throughout. 
+ bin_type = np.result_type(first_edge, last_edge, a) + if np.issubdtype(bin_type, np.integer): + bin_type = np.result_type(bin_type, float) + + # bin edges must be computed + bin_edges = mt.linspace( + first_edge, + last_edge, + n_equal_bins + 1, + endpoint=True, + dtype=bin_type, + gpu=op.gpu, + ) + return bin_edges, (first_edge, last_edge, n_equal_bins) + else: + return mt.tensor(bin_edges), None + + +class TensorHistogramBinEdges(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.HISTOGRAM_BIN_EDGES + + _input = KeyField("input") + _bins = AnyField("bins") + _range = TupleField("range") + _weights = KeyField("weights") + _uniform_bins = TupleField("uniform_bins") + + def __init__( + self, + input=None, + bins=None, + range=None, + weights=None, + **kw, + ): + super().__init__(_input=input, _bins=bins, _range=range, _weights=weights, **kw) + + @property + def input(self): + return self._input + + @property + def bins(self): + return self._bins + + @property + def range(self): + return self._range + + @property + def weights(self): + return self._weights + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._bins, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._bins = next(inputs_iter) + if self._weights is not None: + self._weights = next(inputs_iter) + + def __call__(self, a, bins, range, weights): + if range is not None: + _check_range(range) + if isinstance(bins, str): + # string, 'auto', 'stone', ... + # shape is unknown + bin_name = bins + # if `bins` is a string for an automatic method, + # this will replace it with the number of bins calculated + if bin_name not in _hist_bin_selectors: + raise ValueError(f"{bin_name!r} is not a valid estimator for `bins`") + if weights is not None: + raise TypeError( + "Automated estimation of the number of " + "bins is not supported for weighted data" + ) + if isinstance(range, tuple) and len(range) == 2: + # if `bins` is a string, e.g. 
'auto', 'stone'..., + # and `range` provided as well, + # `a` should be trimmed first + first_edge, last_edge = _get_outer_edges(a, range) + a = a[(a >= first_edge) & (a <= last_edge)] + shape = (np.nan,) + elif mt.ndim(bins) == 0: + try: + n_equal_bins = operator.index(bins) + except TypeError: # pragma: no cover + raise TypeError("`bins` must be an integer, a string, or an array") + if n_equal_bins < 1: + raise ValueError("`bins` must be positive, when an integer") + shape = (bins + 1,) + elif mt.ndim(bins) == 1: + if not isinstance(bins, TENSOR_TYPE): + bins = np.asarray(bins) + if not is_asc_sorted(bins): + raise ValueError( + "`bins` must increase monotonically, when an array" + ) + shape = astensor(bins).shape + else: + raise ValueError("`bins` must be 1d, when an array") + + inputs = [a] + if isinstance(bins, TENSOR_TYPE): + inputs.append(bins) + if weights is not None: + inputs.append(weights) + + return self.new_tensor(inputs, shape=shape, order=TensorOrder.C_ORDER) + + @classmethod + def tile(cls, op): + ctx = get_context() + a = op.input + range_ = op.range + bins = op.bins + + if isinstance(bins, str): + if has_unknown_shape(a): + yield + if ( + (a.size > 0 or np.isnan(a.size)) + and (isinstance(bins, str) or mt.ndim(bins) == 0) + and not range_ + ): + input_min = a.min(keepdims=True) + input_max = a.max(keepdims=True) + input_min, input_max = yield from recursive_tile(input_min, input_max) + chunks = [input_min.chunks[0], input_max.chunks[0]] + yield chunks + a.chunks + range_results = ctx.get_chunks_result([c.key for c in chunks]) + # make sure returned bounds are valid + if all(x.size > 0 for x in range_results): + range_ = tuple(x[0] for x in range_results) + if isinstance(bins, TENSOR_TYPE): + # `bins` is a Tensor, needs to be calculated first + yield + bin_datas = ctx.get_chunks_result([c.key for c in bins.chunks]) + bins = np.concatenate(bin_datas) + else: + bins = op.bins + + bin_edges, _ = yield from _get_bin_edges(op, op.input, bins, range_, op.weights) + bin_edges = yield from recursive_tile(bin_edges) + return [bin_edges] + + +def histogram_bin_edges(a, bins=10, range=None, weights=None): + r""" + Function to calculate only the edges of the bins used by the `histogram` + function. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened tensor. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines the bin edges, including the rightmost + edge, allowing for non-uniform bin widths. + + If `bins` is a string from the list below, `histogram_bin_edges` will use + the method chosen to calculate the optimal bin width and + consequently the number of bins (see `Notes` for more detail on + the estimators) from the data that falls within the requested + range. While the bin width will be optimal for the actual data + in the range, the number of bins will be computed to fill the + entire range, including the empty portions. For visualisation, + using the 'auto' option is suggested. Weighted data is not + supported for automated bin size selection. + + 'auto' + Maximum of the 'sturges' and 'fd' estimators. Provides good + all around performance. + + 'fd' (Freedman Diaconis Estimator) + Robust (resilient to outliers) estimator that takes into + account data variability and data size. + + 'doane' + An improved version of Sturges' estimator that works better + with non-normal datasets. 
+ + 'scott' + Less robust estimator that that takes into account data + variability and data size. + + 'stone' + Estimator based on leave-one-out cross-validation estimate of + the integrated squared error. Can be regarded as a generalization + of Scott's rule. + + 'rice' + Estimator does not take variability into account, only data + size. Commonly overestimates number of bins required. + + 'sturges' + R's default method, only accounts for data size. Only + optimal for gaussian data and underestimates number of bins + for large non-gaussian datasets. + + 'sqrt' + Square root (of data size) estimator, used by Excel and + other programs for its speed and simplicity. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + + weights : array_like, optional + A tensor of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). This is currently not used by any of the bin estimators, + but may be in the future. + + Returns + ------- + bin_edges : tensor of dtype float + The edges to pass into `histogram` + + See Also + -------- + histogram + + Notes + ----- + The methods to estimate the optimal number of bins are well founded + in literature, and are inspired by the choices R provides for + histogram visualisation. Note that having the number of bins + proportional to :math:`n^{1/3}` is asymptotically optimal, which is + why it appears in most estimators. These are simply plug-in methods + that give good starting points for number of bins. In the equations + below, :math:`h` is the binwidth and :math:`n_h` is the number of + bins. All estimators that compute bin counts are recast to bin width + using the `ptp` of the data. The final bin count is obtained from + ``np.round(np.ceil(range / h))``. + + 'auto' (maximum of the 'sturges' and 'fd' estimators) + A compromise to get a good value. For small datasets the Sturges + value will usually be chosen, while larger datasets will usually + default to FD. Avoids the overly conservative behaviour of FD + and Sturges for small and large datasets respectively. + Switchover point is usually :math:`a.size \approx 1000`. + + 'fd' (Freedman Diaconis Estimator) + .. math:: h = 2 \frac{IQR}{n^{1/3}} + + The binwidth is proportional to the interquartile range (IQR) + and inversely proportional to cube root of a.size. Can be too + conservative for small datasets, but is quite good for large + datasets. The IQR is very robust to outliers. + + 'scott' + .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} + + The binwidth is proportional to the standard deviation of the + data and inversely proportional to cube root of ``x.size``. Can + be too conservative for small datasets, but is quite good for + large datasets. The standard deviation is not very robust to + outliers. Values are very similar to the Freedman-Diaconis + estimator in the absence of outliers. + + 'rice' + .. math:: n_h = 2n^{1/3} + + The number of bins is only proportional to cube root of + ``a.size``. It tends to overestimate the number of bins and it + does not take into account data variability. 
+ + 'sturges' + .. math:: n_h = \log _{2}n+1 + + The number of bins is the base 2 log of ``a.size``. This + estimator assumes normality of data and is too conservative for + larger, non-normal datasets. This is the default method in R's + ``hist`` method. + + 'doane' + .. math:: n_h = 1 + \log_{2}(n) + + \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) + + g_1 = mean[(\frac{x - \mu}{\sigma})^3] + + \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} + + An improved version of Sturges' formula that produces better + estimates for non-normal datasets. This estimator attempts to + account for the skew of the data. + + 'sqrt' + .. math:: n_h = \sqrt n + + The simplest and fastest estimator. Only takes into account the + data size. + + Examples + -------- + >>> import mars.tensor as mt + >>> arr = mt.array([0, 0, 0, 1, 2, 3, 3, 4, 5]) + >>> mt.histogram_bin_edges(arr, bins='auto', range=(0, 1)).execute() + array([0. , 0.25, 0.5 , 0.75, 1. ]) + >>> mt.histogram_bin_edges(arr, bins=2).execute() + array([0. , 2.5, 5. ]) + + For consistency with histogram, a tensor of pre-computed bins is + passed through unmodified: + + >>> mt.histogram_bin_edges(arr, [1, 2]).execute() + array([1, 2]) + + This function allows one set of bins to be computed, and reused across + multiple histograms: + + >>> shared_bins = mt.histogram_bin_edges(arr, bins='auto') + >>> shared_bins.execute() + array([0., 1., 2., 3., 4., 5.]) + + >>> group_id = mt.array([0, 1, 1, 0, 1, 1, 0, 1, 1]) + >>> a = arr[group_id == 0] + >>> a.execute() + array([0, 1, 3]) + >>> hist_0, _ = mt.histogram(a, bins=shared_bins).execute() + >>> b = arr[group_id == 1] + >>> b.execute() + array([0, 0, 2, 3, 4, 5]) + >>> hist_1, _ = mt.histogram(b, bins=shared_bins).execute() + + >>> hist_0; hist_1 + array([1, 1, 0, 1, 0]) + array([2, 0, 1, 1, 2]) + + Which gives more easily comparable results than using separate bins for + each histogram: + + >>> hist_0, bins_0 = mt.histogram(a, bins='auto').execute() + >>> hist_1, bins_1 = mt.histogram(b, bins='auto').execute() + >>> hist_0; hist_1 + array([1, 1, 1]) + array([2, 1, 1, 2]) + >>> bins_0; bins_1 + array([0., 1., 2., 3.]) + array([0. , 1.25, 2.5 , 3.75, 5. 
]) + + """ + a, weights = _ravel_and_check_weights(a, weights) + op = TensorHistogramBinEdges( + input=a, bins=bins, range=range, weights=weights, dtype=a.dtype + ) + return op(a, bins, range, weights) + + +class TensorHistogram(TensorOperand, TensorOperandMixin): + _op_type_ = OperandDef.HISTOGRAM + + _input = KeyField("input") + _bins = AnyField("bins") + _range = TupleField("range") + _weights = KeyField("weights") + _density = BoolField("density") + _ret_bins = BoolField("ret_bins") + + def __init__( + self, + input=None, + bins=None, + range=None, + weights=None, + density=None, + ret_bins=None, + **kw, + ): + super().__init__( + _input=input, + _bins=bins, + _range=range, + _weights=weights, + _density=density, + _ret_bins=ret_bins, + **kw, + ) + + @property + def input(self): + return self._input + + @property + def bins(self): + return self._bins + + @property + def range(self): + return self._range + + @property + def weights(self): + return self._weights + + @property + def density(self): + return self._density + + @property + def ret_bins(self): + return self._ret_bins + + @property + def output_limit(self): + return 1 if not self._ret_bins else 2 + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + inputs_iter = iter(self._inputs) + self._input = next(inputs_iter) + if isinstance(self._bins, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._bins = next(inputs_iter) + if self._weights is not None: + self._weights = next(inputs_iter) + + def __call__(self, a, bins, range, weights): + a, weights = _ravel_and_check_weights(a, weights) + histogram_bin_edges_op = TensorHistogramBinEdges( + input=a, bins=bins, range=range, weights=weights, dtype=np.dtype(np.float64) + ) + bins = self._bins = histogram_bin_edges_op(a, bins, range, weights) + + inputs = [histogram_bin_edges_op.input] + if isinstance(bins, TENSOR_TYPE): + inputs.append(bins) + # Histogram is an integer or a float array depending on the weights. 
+ if weights is None: + dtype = np.dtype(np.intp) + else: + inputs.append(weights) + dtype = weights.dtype + self.dtype = dtype + + hist = self.new_tensor( + inputs, shape=(bins.size - 1,), order=TensorOrder.C_ORDER + ) + return mt.ExecutableTuple([hist, bins]) + + @classmethod + def tile(cls, op): + bins = op.bins.rechunk(op.bins.shape) + shape = (bins.size - 1,) + out = op.outputs[0] + weights = None + if op.weights is not None: + # make input and weights have the same nsplits + weights = yield from recursive_tile(op.weights.rechunk(op.input.nsplits)) + + out_chunks = [] + for chunk in op.input.chunks: + chunk_op = op.copy().reset_key() + chunk_op._range = None + chunk_op._ret_bins = False + chunk_op._density = False + chunk_inputs = [chunk, bins.chunks[0]] + if weights is not None: + weights_chunk = weights.cix[chunk.index] + chunk_inputs.append(weights_chunk) + out_chunk = chunk_op.new_chunk( + chunk_inputs, shape=shape, index=chunk.index, order=out.order + ) + out_chunks.append(out_chunk) + + # merge chunks together + chunk = chunk_tree_add(out.dtype, out_chunks, (0,), shape) + new_op = op.copy() + n = new_op.new_tensor( + op.inputs, + shape=shape, + order=out.order, + chunks=[chunk], + nsplits=((shape[0],),), + ) + if op.density: + db = mt.array(mt.diff(bins), float) + hist = n / db / n.sum() + hist = yield from recursive_tile(hist) + return [hist] + else: + return [n] + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + bins = inputs[1] if isinstance(op.bins, TENSOR_CHUNK_TYPE) else op.bins + weights = None + if op.weights is not None: + weights = inputs[-1] + with device(device_id): + hist, bin_edges = xp.histogram( + a, bins=bins, range=op.range, weights=weights, density=op.density + ) + ctx[op.outputs[0].key] = hist + if op.ret_bins: + ctx[op.outputs[1].key] = bin_edges + + +def histogram(a, bins=10, range=None, weights=None, density=None): + r""" + Compute the histogram of a set of data. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened tensor. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines a monotonically increasing tensor of bin edges, + including the rightmost edge, allowing for non-uniform bin widths. + + If `bins` is a string, it defines the method used to calculate the + optimal bin width, as defined by `histogram_bin_edges`. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + + weights : array_like, optional + A tensor of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. 
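The tiling above relies on a simple property: with one shared set of bin edges, per-chunk counts can simply be added, and density normalization becomes a post-processing step on the merged counts. A plain-numpy sketch under a hypothetical 4-way chunking:

```python
import numpy as np

a = np.random.RandomState(0).normal(size=10_000)
edges = np.histogram_bin_edges(a, bins=10)

# per-chunk histograms against the shared edges, then summed together
counts = sum(np.histogram(chunk, bins=edges)[0] for chunk in np.array_split(a, 4))
assert (counts == np.histogram(a, bins=edges)[0]).all()

# density: divide by bin width and by the total count, as the tiled graph does
density = counts / np.diff(edges) / counts.sum()
assert np.allclose(density, np.histogram(a, bins=edges, density=True)[0])
```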
If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unity + width are chosen; it is not a probability *mass* function. + + Overrides the ``normed`` keyword if given. + + Returns + ------- + hist : tensor + The values of the histogram. See `density` and `weights` for a + description of the possible semantics. + bin_edges : tensor of dtype float + Return the bin edges ``(length(hist)+1)``. + + + See Also + -------- + histogramdd, bincount, searchsorted, digitize, histogram_bin_edges + + Notes + ----- + All but the last (righthand-most) bin is half-open. In other words, + if `bins` is:: + + [1, 2, 3, 4] + + then the first bin is ``[1, 2)`` (including 1, but excluding 2) and + the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which + *includes* 4. + + + Examples + -------- + >>> import mars.tensor as mt + >>> mt.histogram([1, 2, 1], bins=[0, 1, 2, 3]).execute() + (array([0, 2, 1]), array([0, 1, 2, 3])) + >>> mt.histogram(mt.arange(4), bins=mt.arange(5), density=True).execute() + (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) + >>> mt.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]).execute() + (array([1, 4, 1]), array([0, 1, 2, 3])) + + >>> a = mt.arange(5) + >>> hist, bin_edges = mt.histogram(a, density=True) + >>> hist.execute() + array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) + >>> hist.sum().execute() + 2.4999999999999996 + >>> mt.sum(hist * mt.diff(bin_edges)).execute() + 1.0 + + Automated Bin Selection Methods example, using 2 peak random data + with 2000 points: + + >>> import matplotlib.pyplot as plt + >>> rng = mt.random.RandomState(10) # deterministic random data + >>> a = mt.hstack((rng.normal(size=1000), + ... rng.normal(loc=5, scale=2, size=1000))) + >>> _ = plt.hist(np.asarray(a), bins='auto') # arguments are passed to np.histogram + >>> plt.title("Histogram with 'auto' bins") + Text(0.5, 1.0, "Histogram with 'auto' bins") + >>> plt.show() + + """ + a, weights = _ravel_and_check_weights(a, weights) + op = TensorHistogram( + input=a, bins=bins, range=range, weights=weights, density=density + ) + return op(a, bins, range, weights) diff --git a/python/xorbits/_mars/tensor/statistics/median.py b/python/xorbits/_mars/tensor/statistics/median.py new file mode 100644 index 000000000..a2d6789c6 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/median.py @@ -0,0 +1,85 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .quantile import quantile + + +def median(a, axis=None, out=None, overwrite_input=False, keepdims=False): + """ + Compute the median along the specified axis. + + Returns the median of the tensor elements. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + axis : {int, sequence of int, None}, optional + Axis or axes along which the medians are computed. 
The default + is to compute the median along a flattened version of the tensor. + A sequence of axes is supported since version 1.9.0. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, optional + Just for compatibility with Numpy, would not take effect. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `arr`. + + Returns + ------- + median : Tensor + A new tensor holding the result. If the input contains integers + or floats smaller than ``float64``, then the output data-type is + ``np.float64``. Otherwise, the data-type of the output is the + same as that of the input. If `out` is specified, that tensor is + returned instead. + + See Also + -------- + mean, percentile + + Notes + ----- + Given a vector ``V`` of length ``N``, the median of ``V`` is the + middle value of a sorted copy of ``V``, ``V_sorted`` - i + e., ``V_sorted[(N-1)/2]``, when ``N`` is odd, and the average of the + two middle values of ``V_sorted`` when ``N`` is even. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[10, 7, 4], [3, 2, 1]]) + >>> a.execute() + array([[10, 7, 4], + [ 3, 2, 1]]) + >>> mt.median(a).execute() + 3.5 + >>> mt.median(a, axis=0).execute() + array([6.5, 4.5, 2.5]) + >>> mt.median(a, axis=1).execute() + array([7., 2.]) + >>> m = mt.median(a, axis=0) + >>> out = mt.zeros_like(m) + >>> mt.median(a, axis=0, out=m).execute() + array([6.5, 4.5, 2.5]) + >>> m.execute() + array([6.5, 4.5, 2.5]) + """ + return quantile( + a, 0.5, axis=axis, out=out, overwrite_input=overwrite_input, keepdims=keepdims + ) diff --git a/python/xorbits/_mars/tensor/statistics/percentile.py b/python/xorbits/_mars/tensor/statistics/percentile.py new file mode 100644 index 000000000..aefcdf348 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/percentile.py @@ -0,0 +1,175 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ...core import ENTITY_TYPE +from ..arithmetic import truediv +from .quantile import _quantile_is_valid, _quantile_unchecked + +q_error_msg = "Percentiles must be in the range [0, 100]" + + +def percentile( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, +): + """ + Compute the q-th percentile of the data along the specified axis. + + Returns the q-th percentile(s) of the array elements. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + q : array_like of float + Percentile or sequence of percentiles to compute, which must be between + 0 and 100 inclusive. + axis : {int, tuple of int, None}, optional + Axis or axes along which the percentiles are computed. 
The + default is to compute the percentile(s) along a flattened + version of the tensor. + out : ndarray, optional + Alternative output array in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, optional + Just for compatibility with Numpy, would not take effect. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired percentile lies between two data points + ``i < j``: + + * 'linear': ``i + (j - i) * fraction``, where ``fraction`` + is the fractional part of the index surrounded by ``i`` + and ``j``. + * 'lower': ``i``. + * 'higher': ``j``. + * 'nearest': ``i`` or ``j``, whichever is nearest. + * 'midpoint': ``(i + j) / 2``. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in + the result as dimensions with size one. With this option, the + result will broadcast correctly against the original array `a`. + + Returns + ------- + percentile : scalar or ndarray + If `q` is a single percentile and `axis=None`, then the result + is a scalar. If multiple percentiles are given, first axis of + the result corresponds to the percentiles. The other axes are + the axes that remain after the reduction of `a`. If the input + contains integers or floats smaller than ``float64``, the output + data-type is ``float64``. Otherwise, the output data-type is the + same as that of the input. If `out` is specified, that array is + returned instead. + + See Also + -------- + mean + median : equivalent to ``percentile(..., 50)`` + nanpercentile + quantile : equivalent to percentile, except with q in the range [0, 1]. + + Notes + ----- + Given a vector ``V`` of length ``N``, the q-th percentile of + ``V`` is the value ``q/100`` of the way from the minimum to the + maximum in a sorted copy of ``V``. The values and distances of + the two nearest neighbors as well as the `interpolation` parameter + will determine the percentile if the normalized ranking does not + match the location of ``q`` exactly. This function is the same as + the median if ``q=50``, the same as the minimum if ``q=0`` and the + same as the maximum if ``q=100``. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[10, 7, 4], [3, 2, 1]]) + >>> a.execute() + array([[10, 7, 4], + [ 3, 2, 1]]) + >>> mt.percentile(a, 50).execute() + 3.5 + >>> mt.percentile(a, 50, axis=0).execute() + array([6.5, 4.5, 2.5]) + >>> mt.percentile(a, 50, axis=1).execute() + array([7., 2.]) + >>> mt.percentile(a, 50, axis=1, keepdims=True).execute() + array([[7.], + [2.]]) + + >>> m = mt.percentile(a, 50, axis=0) + >>> out = mt.zeros_like(m) + >>> mt.percentile(a, 50, axis=0, out=out).execute() + array([6.5, 4.5, 2.5]) + >>> m.execute() + array([6.5, 4.5, 2.5]) + + The different types of interpolation can be visualized graphically: + + .. 
plot:: + + import matplotlib.pyplot as plt + import mars.tensor as mt + import numpy as np + + a = mt.arange(4) + p = mt.linspace(0, 100, 6001) + ax = plt.gca() + lines = [ + ('linear', None), + ('higher', '--'), + ('lower', '--'), + ('nearest', '-.'), + ('midpoint', '-.'), + ] + for interpolation, style in lines: + ax.plot( + np.asarray(p), np.asarray(mt.percentile(a, p, interpolation=interpolation)), + label=interpolation, linestyle=style) + ax.set( + title='Interpolation methods for list: ' + str(a), + xlabel='Percentile', + ylabel='List item returned', + yticks=np.asarray(a)) + ax.legend() + plt.show() + + """ + if not isinstance(q, ENTITY_TYPE): + q = np.asanyarray(q) + q = np.true_divide(q, 100) + # do check instantly if q is not a tensor + if not _quantile_is_valid(q): + raise ValueError(q_error_msg) + else: + q = truediv(q, 100) + + return _quantile_unchecked( + a, + q, + axis=axis, + out=out, + overwrite_input=overwrite_input, + interpolation=interpolation, + keepdims=keepdims, + q_error_msg=q_error_msg, + ) diff --git a/python/xorbits/_mars/tensor/statistics/ptp.py b/python/xorbits/_mars/tensor/statistics/ptp.py new file mode 100644 index 000000000..08484397b --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/ptp.py @@ -0,0 +1,88 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..base.ravel import ravel +from ..core import Tensor +from ..datasource import tensor as astensor +from ..utils import check_out_param, validate_axis + + +def ptp(a, axis=None, out=None, keepdims=None): + """ + Range of values (maximum - minimum) along an axis. + + The name of the function comes from the acronym for 'peak to peak'. + + Parameters + ---------- + a : array_like + Input values. + axis : int, optional + Axis along which to find the peaks. By default, flatten the + array. + out : array_like + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type of the output values will be cast if necessary. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `ptp` method of sub-classes of + `Tensor`, however any non-default value will be. If the + sub-class' method does not implement `keepdims` any + exceptions will be raised. + + Returns + ------- + ptp : Tensor + A new tensor holding the result, unless `out` was + specified, in which case a reference to `out` is returned. 
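+ The result is computed lazily as ``a.max(axis=axis) - a.min(axis=axis)``.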
+ + Examples + -------- + >>> import mars.tensor as mt + + >>> x = mt.arange(4).reshape((2,2)) + >>> x.execute() + array([[0, 1], + [2, 3]]) + + >>> mt.ptp(x, axis=0).execute() + array([2, 2]) + + >>> mt.ptp(x, axis=1).execute() + array([1, 1]) + + """ + a = astensor(a) + + if axis is None: + a = ravel(a) + else: + validate_axis(a.ndim, axis) + + t = a.max(axis=axis, keepdims=keepdims) - a.min(axis=axis, keepdims=keepdims) + + if out is not None: + if not isinstance(out, Tensor): + raise TypeError(f"out should be Tensor object, got {type(out)} instead") + + check_out_param(out, t, "same_kind") + out.data = t.data + return out + + return t diff --git a/python/xorbits/_mars/tensor/statistics/quantile.py b/python/xorbits/_mars/tensor/statistics/quantile.py new file mode 100644 index 000000000..fa9f18f0b --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/quantile.py @@ -0,0 +1,566 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable + +import numpy as np + +from ... import opcodes as OperandDef +from ...core import ENTITY_TYPE, recursive_tile +from ...core.context import get_context +from ...serialization.serializables import AnyField, BoolField, KeyField, StringField +from ...utils import has_unknown_shape +from ..arithmetic import add, isnan +from ..array_utils import as_same_device, device +from ..base import moveaxis, where +from ..core import TENSOR_CHUNK_TYPE, TENSOR_TYPE, TensorOrder +from ..datasource import tensor as astensor +from ..indexing import take +from ..operands import TensorOperand, TensorOperandMixin +from ..reduction import any as tensor_any +from ..utils import check_out_param +from .core import _ureduce + + +def _quantile_is_valid(q): + # avoid expensive reductions, relevant for arrays with < O(1000) elements + if q.ndim == 1 and q.size < 10: + for i in range(q.size): + if q[i] < 0.0 or q[i] > 1.0: + return False + else: + # faster than any() + if np.count_nonzero(q < 0.0) or np.count_nonzero(q > 1.0): + return False + return True + + +def _quantile_ureduce_func( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, +): + a = astensor(a) + out = astensor(out) if out is not None else None + + if q.ndim == 0: + # Do not allow 0-d arrays because following code fails for scalar + zerod = True + q = q[None] + else: + zerod = False + + # prepare a for partitioning + if overwrite_input: + if axis is None: + ap = a.ravel() + else: + ap = a + else: + if axis is None: + ap = a.flatten() + else: + ap = a.copy() + + if axis is None: + axis = 0 + + Nx = ap.shape[axis] + indices = q * (Nx - 1) + + # round fractional indices according to interpolation method + if interpolation == "lower": + indices = np.floor(indices).astype(np.intp) + elif interpolation == "higher": + indices = np.ceil(indices).astype(np.intp) + elif interpolation == "midpoint": + indices = 0.5 * (np.floor(indices) + np.ceil(indices)) + elif interpolation == "nearest": + indices = 
np.around(indices).astype(np.intp) + else: + assert interpolation == "linear" + # keep index as fraction and interpolate + + n = np.array(False, dtype=bool) # check for nan's flag + if indices.dtype == np.intp: # take the points along axis + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices = np.concatenate((indices, [-1])) + + ap.partition(indices, axis=axis, need_align=True) + # ensure axis with q-th is first + ap = moveaxis(ap, axis, 0) + axis = 0 + + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices = indices[:-1] + n = isnan(ap[-1:, ...]) + + if zerod: + indices = indices[0] + r = take(ap, indices, axis=axis, out=out) + + else: # weight the points above and below the indices + indices_below = np.floor(indices).astype(np.intp) + indices_above = indices_below + 1 + indices_above[indices_above > Nx - 1] = Nx - 1 + + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices_above = np.concatenate((indices_above, [-1])) + + weights_above = indices - indices_below + weights_below = 1 - weights_above + + weights_shape = [1] * ap.ndim + weights_shape[axis] = len(indices) + weights_below.shape = weights_shape + weights_above.shape = weights_shape + + ap.partition( + np.concatenate((indices_below, indices_above)), axis=axis, need_align=True + ) + + # ensure axis with q-th is first + ap = moveaxis(ap, axis, 0) + weights_below = np.moveaxis(weights_below, axis, 0) + weights_above = np.moveaxis(weights_above, axis, 0) + axis = 0 + + # Check if the array contains any nan's + if np.issubdtype(a.dtype, np.inexact): + indices_above = indices_above[:-1] + n = isnan(ap[-1:, ...]) + + x1 = take(ap, indices_below, axis=axis) * weights_below + x2 = take(ap, indices_above, axis=axis) * weights_above + + # ensure axis with q-th is first + x1 = moveaxis(x1, axis, 0) + x2 = moveaxis(x2, axis, 0) + + if zerod: + x1 = x1.squeeze(0) + x2 = x2.squeeze(0) + + if out is not None: + r = add(x1, x2, out=out) + else: + r = add(x1, x2) + + if isinstance(n, TENSOR_TYPE): + if zerod: + if ap.ndim == 1: + r.data = where(tensor_any(n), a.dtype.type(np.nan), r).data + if out is not None: + out.data = r.data + else: + r[:] = where( + tensor_any(n), where(n.squeeze(0), a.dtype.type(np.nan), r), r + ) + else: + if r.ndim == 1: + r[:] = where(tensor_any(n), np.full(r.shape, a.dtype.type(np.nan)), r) + else: + r[:] = where( + tensor_any(n), + where(n.repeat(q.size, 0), a.dtype.type(np.nan), r), + r, + ) + + return r + + +q_error_msg = "Quantiles must be in the range [0, 1]" + + +class TensorQuantile(TensorOperand, TensorOperandMixin): + __slots__ = ("q_error_msg",) + _op_type_ = OperandDef.QUANTILE + + _a = KeyField("a") + _q = AnyField("q") + _axis = AnyField("axis") + _out = KeyField("out") + _overwrite_input = BoolField("overwrite_input") + _interpolation = StringField("interpolation") + _keepdims = BoolField("keepdims") + + def __init__( + self, + q=None, + axis=None, + out=None, + overwrite_input=None, + interpolation=None, + keepdims=None, + **kw, + ): + self.q_error_msg = kw.pop("q_error_msg", q_error_msg) + super().__init__( + _q=q, + _axis=axis, + _interpolation=interpolation, + _out=out, + _overwrite_input=overwrite_input, + _keepdims=keepdims, + **kw, + ) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + self._a = self._inputs[0] + if isinstance(self._q, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._q = self._inputs[1] + if isinstance(self._out, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self._out 
= self._inputs[-1] + + @property + def a(self): + return self._a + + @property + def q(self): + return self._q + + @property + def axis(self): + return self._axis + + @property + def out(self): + return self._out + + @property + def overwrite_input(self): + return self._overwrite_input + + @property + def interpolation(self): + return self._interpolation + + @property + def keepdims(self): + return self._keepdims + + def __call__(self, a, q=None, out=None): + shape = [self._q.size] if self._q.ndim > 0 else [] + if self._axis is None: + exclude_axes = set(range(a.ndim)) + elif isinstance(self._axis, tuple): + exclude_axes = set(self._axis) + else: + exclude_axes = {self._axis} + for ax, s in enumerate(a.shape): + if ax not in exclude_axes: + shape.append(s) + elif self._keepdims: + shape.append(1) + inputs = [a] if q is None else [a, q] + order = TensorOrder.C_ORDER + if out is not None: + inputs.append(out) + order = out.order + shape = out.shape + t = self.new_tensor(inputs, shape=tuple(shape), order=order) + if out is not None: + check_out_param(out, t, "same_kind") + out.data = t.data + return out + else: + return t + + @classmethod + def _tile(cls, op, q): + r, k = _ureduce( + op.a, + func=_quantile_ureduce_func, + q=q, + axis=op.axis, + out=op.out, + overwrite_input=op.overwrite_input, + interpolation=op.interpolation, + ) + if op.keepdims: + return r.reshape(q.shape + k) + else: + return r + + @classmethod + def _tile_one_chunk(cls, op, q): + in_tensor = op.inputs[0] + out_tensor = op.outputs[0] + chunk_op = op.copy().reset_key() + chunk_op._q = q + chunk_inputs = [in_tensor.chunks[0]] + if op.out is not None: + chunk_inputs.append(op.out.chunks[0]) + chunk = chunk_op.new_chunk( + chunk_inputs, + shape=out_tensor.shape, + index=(0,) * out_tensor.ndim, + order=out_tensor.order, + ) + op = op.copy() + return op.new_tensors( + op.inputs, + shape=out_tensor.shape, + order=out_tensor.order, + nsplits=tuple((s,) for s in out_tensor.shape), + chunks=[chunk], + ) + + @classmethod + def tile(cls, op): + if isinstance(op.q, TENSOR_TYPE): + ctx = get_context() + # get q's data + q_chunk_keys = [c.key for c in op.q.chunks] + try: + q_data = ctx.get_chunks_result(q_chunk_keys) + except KeyError: + # trigger execution of `q` + yield op.q.chunks + q_data = ctx.get_chunks_result(q_chunk_keys) + op._q = q = np.concatenate(q_data) + if not _quantile_is_valid(q): + raise ValueError(op.q_error_msg) + else: + if has_unknown_shape(*op.inputs): + yield + q = np.asarray(op.q) + + if len(op.a.chunks) == 1 and (op.out is None or len(op.out.chunks) == 1): + return cls._tile_one_chunk(op, q) + else: + tiled = yield from recursive_tile(cls._tile(op, q)) + return [tiled] + + @classmethod + def execute(cls, ctx, op): + inputs, device_id, xp = as_same_device( + [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True + ) + a = inputs[0] + out = inputs[-1].copy() if op.out is not None else None + + with device(device_id): + ctx[op.outputs[0].key] = xp.quantile( + a, + q=op.q, + axis=op.axis, + out=out, + interpolation=op.interpolation, + keepdims=op.keepdims, + ) + + +INTERPOLATION_TYPES = {"linear", "lower", "higher", "midpoint", "nearest"} + + +def _quantile_unchecked( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, + q_error_msg=None, + handle_non_numeric=None, +): + a = astensor(a) + raw_dtype = a.dtype + need_view_back = False + if handle_non_numeric and not np.issubdtype(a.dtype, np.number): + # enable handle_non_numeric is often used + # to 
handle the datetime-like dtype + a = a.astype("i8") + need_view_back = True + if isinstance(q, ENTITY_TYPE): + q = astensor(q) + # do check in tile + q_input = q + else: + q_input = None + + if isinstance(axis, Iterable): + axis = tuple(axis) + + if q.ndim > 1: + raise ValueError("`q` should be a scalar or array of float") + + if out is not None and not isinstance(out, TENSOR_TYPE): + raise TypeError(f"`out` should be a tensor, got {type(out)}") + + if interpolation not in INTERPOLATION_TYPES: + raise ValueError( + "interpolation can only be 'linear', 'lower' " + "'higher', 'midpoint', or 'nearest'" + ) + + # infer dtype + q_tiny = np.random.rand(2 if q.size % 2 == 0 else 1).astype(q.dtype) + if handle_non_numeric and not np.issubdtype(a.dtype, np.number): + dtype = a.dtype + else: + dtype = np.quantile( + np.empty(1, dtype=a.dtype), q_tiny, interpolation=interpolation + ).dtype + op = TensorQuantile( + q=q, + axis=axis, + out=out, + overwrite_input=overwrite_input, + interpolation=interpolation, + keepdims=keepdims, + handle_non_numeric=handle_non_numeric, + q_error_msg=q_error_msg, + dtype=dtype, + gpu=a.op.gpu, + ) + ret = op(a, q=q_input, out=out) + if need_view_back: + ret = ret.astype(raw_dtype) + return ret + + +def quantile( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False, + **kw, +): + """ + Compute the q-th quantile of the data along the specified axis. + + Parameters + ---------- + a : array_like + Input tensor or object that can be converted to a tensor. + q : array_like of float + Quantile or sequence of quantiles to compute, which must be between + 0 and 1 inclusive. + axis : {int, tuple of int, None}, optional + Axis or axes along which the quantiles are computed. The + default is to compute the quantile(s) along a flattened + version of the tensor. + out : Tensor, optional + Alternative output tensor in which to place the result. It must + have the same shape and buffer length as the expected output, + but the type (of the output) will be cast if necessary. + overwrite_input : bool, optional + Just for compatibility with Numpy, would not take effect. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired quantile lies between two data points + ``i < j``: + + * linear: ``i + (j - i) * fraction``, where ``fraction`` + is the fractional part of the index surrounded by ``i`` + and ``j``. + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in + the result as dimensions with size one. With this option, the + result will broadcast correctly against the original tensor `a`. + + Returns + ------- + quantile : scalar or Tensor + If `q` is a single quantile and `axis=None`, then the result + is a scalar. If multiple quantiles are given, first axis of + the result corresponds to the quantiles. The other axes are + the axes that remain after the reduction of `a`. If the input + contains integers or floats smaller than ``float64``, the output + data-type is ``float64``. Otherwise, the output data-type is the + same as that of the input. If `out` is specified, that tensor is + returned instead. + + See Also + -------- + mean + percentile : equivalent to quantile, but with q in the range [0, 100]. 
+ median : equivalent to ``quantile(..., 0.5)`` + nanquantile + + Notes + ----- + Given a vector ``V`` of length ``N``, the q-th quantile of + ``V`` is the value ``q`` of the way from the minimum to the + maximum in a sorted copy of ``V``. The values and distances of + the two nearest neighbors as well as the `interpolation` parameter + will determine the quantile if the normalized ranking does not + match the location of ``q`` exactly. This function is the same as + the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the + same as the maximum if ``q=1.0``. + + Examples + -------- + >>> import mars.tensor as mt + >>> a = mt.array([[10, 7, 4], [3, 2, 1]]) + >>> a.execute() + array([[10, 7, 4], + [ 3, 2, 1]]) + >>> mt.quantile(a, 0.5).execute() + 3.5 + >>> mt.quantile(a, 0.5, axis=0).execute() + array([6.5, 4.5, 2.5]) + >>> mt.quantile(a, 0.5, axis=1).execute() + array([7., 2.]) + >>> mt.quantile(a, 0.5, axis=1, keepdims=True).execute() + array([[7.], + [2.]]) + >>> m = mt.quantile(a, 0.5, axis=0) + >>> out = mt.zeros_like(m) + >>> mt.quantile(a, 0.5, axis=0, out=out).execute() + array([6.5, 4.5, 2.5]) + >>> m.execute() + array([6.5, 4.5, 2.5]) + """ + + handle_non_numeric = kw.pop("handle_non_numeric", None) + if len(kw) > 0: # pragma: no cover + raise TypeError( + f"quantile() got an unexpected keyword argument '{next(iter(kw))}'" + ) + + if not isinstance(q, ENTITY_TYPE): + q = np.asanyarray(q) + # do check instantly if q is not a tensor + if not _quantile_is_valid(q): + raise ValueError(q_error_msg) + + return _quantile_unchecked( + a, + q, + axis=axis, + out=out, + overwrite_input=overwrite_input, + interpolation=interpolation, + keepdims=keepdims, + handle_non_numeric=handle_non_numeric, + ) diff --git a/python/xorbits/_mars/tensor/statistics/tests/__init__.py b/python/xorbits/_mars/tensor/statistics/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/statistics/tests/test_statistics.py b/python/xorbits/_mars/tensor/statistics/tests/test_statistics.py new file mode 100644 index 000000000..aaf08cf4a --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/tests/test_statistics.py @@ -0,0 +1,159 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
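+
+# These tests only build the tensor graphs and check the inferred metadata
+# (shape, dtype, chunking) and argument validation; the execution-level
+# checks live in test_statistics_execution.py.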
+ +import numpy as np +import pytest + +from ....core import tile +from ...datasource import array, tensor +from .. import digitize, histogram_bin_edges, percentile, quantile +from ..quantile import INTERPOLATION_TYPES + + +def test_digitize(): + x = tensor(np.array([0.2, 6.4, 3.0, 1.6]), chunk_size=2) + bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0]) + inds = digitize(x, bins) + + assert inds.shape == (4,) + assert inds.dtype is not None + + inds = tile(inds) + + assert len(inds.chunks) == 2 + + +def test_histogram_bin_edges(): + a = array([0, 0, 0, 1, 2, 3, 3, 4, 5], chunk_size=3) + + with pytest.raises(ValueError): + histogram_bin_edges(a, bins="unknown") + + with pytest.raises(TypeError): + # bins is str, weights cannot be provided + histogram_bin_edges(a, bins="scott", weights=a) + + with pytest.raises(ValueError): + histogram_bin_edges(a, bins=-1) + + with pytest.raises(ValueError): + # not asc + histogram_bin_edges(a, bins=[3, 2, 1]) + + with pytest.raises(ValueError): + # bins cannot be 2d + histogram_bin_edges(a, bins=np.random.rand(2, 3)) + + with pytest.raises(ValueError): + histogram_bin_edges(a, range=(5, 0)) + + with pytest.raises(ValueError): + histogram_bin_edges(a, range=(np.nan, np.nan)) + + bins = histogram_bin_edges(a, bins=3, range=(0, 5)) + # if range specified, no error will occur + tile(bins) + + +def test_quantile(): + raw = np.random.rand(100) + q = np.random.rand(10) + + for dtype in [np.float32, np.int64, np.complex128]: + raw2 = raw.astype(dtype) + a = tensor(raw2, chunk_size=100) + + b = quantile(a, q, overwrite_input=True) + assert b.shape == (10,) + assert b.dtype == np.quantile(raw2, q).dtype + + b = tile(b) + assert len(b.chunks) == 1 + + raw = np.random.rand(20, 10) + q = np.random.rand(10) + + for dtype in [np.float32, np.int64, np.complex128]: + for axis in (None, 0, 1, [0, 1]): + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + raw2 = raw.astype(dtype) + a = tensor(raw2, chunk_size=(8, 6)) + + b = quantile( + a, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + expected = np.quantile( + raw2, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + assert b.shape == expected.shape + assert b.dtype == expected.dtype + + a = tensor(raw, chunk_size=10) + b = quantile(a, q) + + b = tile(b) + assert b.shape == (10,) + + b = quantile(a, 0.3) + assert b.ndim == 0 + + raw2 = np.random.rand(3, 4, 5) + a2 = tensor(raw2, chunk_size=3) + b2 = quantile(a2, q, axis=(0, 2)) + expected = np.quantile(raw2, q, axis=(0, 2)) + assert b2.shape == expected.shape + + b2 = tile(b2) + assert b2.shape == expected.shape + + # q has to be 1-d + with pytest.raises(ValueError): + quantile(a, q.reshape(5, 2)) + + # wrong out type + with pytest.raises(TypeError): + quantile(a, q, out=2) + + # wrong q + with pytest.raises(ValueError): + q2 = q.copy() + q2[0] = 1.1 + quantile(a, q2) + + # wrong q, with size < 10 + with pytest.raises(ValueError): + q2 = np.random.rand(5) + q2[0] = 1.1 + quantile(a, q2) + + # wrong interpolation + with pytest.raises(ValueError): + quantile(a, q, interpolation="unknown") + + +def test_percentile(): + raw = np.random.rand(100) + q = [101] + + a = tensor(raw, chunk_size=100) + + with pytest.raises(ValueError) as cm: + percentile(a, q) + the_exception = cm.value.args[0] + assert "Percentiles" in the_exception diff --git a/python/xorbits/_mars/tensor/statistics/tests/test_statistics_execution.py b/python/xorbits/_mars/tensor/statistics/tests/test_statistics_execution.py new file mode 100644 index 
000000000..1290fbe7c --- /dev/null +++ b/python/xorbits/_mars/tensor/statistics/tests/test_statistics_execution.py @@ -0,0 +1,563 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +import scipy.sparse as sps + +from ....utils import ignore_warning +from ...base import sort +from ...datasource import arange, empty, tensor +from ...merge import stack +from ...reduction import all as tall +from .. import ( + average, + bincount, + corrcoef, + cov, + digitize, + histogram, + histogram_bin_edges, + median, + percentile, + ptp, + quantile, +) +from ..quantile import INTERPOLATION_TYPES + + +def test_average_execution(setup): + data = arange(1, 5, chunk_size=1) + t = average(data) + + res = t.execute().fetch() + expected = np.average(np.arange(1, 5)) + assert res == expected + + t = average(arange(1, 11, chunk_size=2), weights=arange(10, 0, -1, chunk_size=2)) + + res = t.execute().fetch() + expected = np.average(range(1, 11), weights=range(10, 0, -1)) + assert res == expected + + data = arange(6, chunk_size=2).reshape((3, 2)) + t = average(data, axis=1, weights=tensor([1.0 / 4, 3.0 / 4], chunk_size=2)) + + res = t.execute().fetch() + expected = np.average( + np.arange(6).reshape(3, 2), axis=1, weights=(1.0 / 4, 3.0 / 4) + ) + np.testing.assert_equal(res, expected) + + with pytest.raises(TypeError): + average(data, weights=tensor([1.0 / 4, 3.0 / 4], chunk_size=2)) + + +def test_cov_execution(setup): + data = np.array([[0, 2], [1, 1], [2, 0]]).T + x = tensor(data, chunk_size=1) + + t = cov(x) + + res = t.execute().fetch() + expected = np.cov(data) + np.testing.assert_equal(res, expected) + + data_x = [-2.1, -1, 4.3] + data_y = [3, 1.1, 0.12] + x = tensor(data_x, chunk_size=1) + y = tensor(data_y, chunk_size=1) + + X = stack((x, y), axis=0) + t = cov(x, y) + r = tall(t == cov(X)) + assert r.execute().fetch() + + +def test_corrcoef_execution(setup): + data_x = [-2.1, -1, 4.3] + data_y = [3, 1.1, 0.12] + x = tensor(data_x, chunk_size=1) + y = tensor(data_y, chunk_size=1) + + t = corrcoef(x, y) + + res = t.execute().fetch() + expected = np.corrcoef(data_x, data_y) + np.testing.assert_equal(res, expected) + + +def test_ptp_execution(setup): + x = arange(4, chunk_size=1).reshape(2, 2) + + t = ptp(x, axis=0) + + res = t.execute().fetch() + expected = np.ptp(np.arange(4).reshape(2, 2), axis=0) + np.testing.assert_equal(res, expected) + + t = ptp(x, axis=1) + + res = t.execute().fetch() + expected = np.ptp(np.arange(4).reshape(2, 2), axis=1) + np.testing.assert_equal(res, expected) + + t = ptp(x) + + res = t.execute().fetch() + expected = np.ptp(np.arange(4).reshape(2, 2)) + np.testing.assert_equal(res, expected) + + +def test_digitize_execution(setup): + data = np.array([0.2, 6.4, 3.0, 1.6]) + x = tensor(data, chunk_size=2) + bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0]) + inds = digitize(x, bins) + + res = inds.execute().fetch() + expected = np.digitize(data, bins) + np.testing.assert_equal(res, expected) + + b = tensor(bins, chunk_size=2) 
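+ # bins passed as a tensor should give the same result as the ndarray bins above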
+ inds = digitize(x, b) + + res = inds.execute().fetch() + expected = np.digitize(data, bins) + np.testing.assert_equal(res, expected) + + data = np.array([1.2, 10.0, 12.4, 15.5, 20.0]) + x = tensor(data, chunk_size=2) + bins = np.array([0, 5, 10, 15, 20]) + inds = digitize(x, bins, right=True) + + res = inds.execute().fetch() + expected = np.digitize(data, bins, right=True) + np.testing.assert_equal(res, expected) + + inds = digitize(x, bins, right=False) + + res = inds.execute().fetch() + expected = np.digitize(data, bins, right=False) + np.testing.assert_equal(res, expected) + + data = sps.random(10, 1, density=0.1) * 12 + x = tensor(data, chunk_size=2) + bins = np.array([1.0, 2.0, 2.5, 4.0, 10.0]) + inds = digitize(x, bins) + + res = inds.execute().fetch() + expected = np.digitize(data.toarray(), bins, right=False) + np.testing.assert_equal(res.toarray(), expected) + + +@ignore_warning +def test_histogram_bin_edges_execution(setup): + rs = np.random.RandomState(0) + + raw = rs.randint(10, size=(20,)) + a = tensor(raw, chunk_size=6) + + # range provided + for range_ in [(0, 10), (3, 11), (3, 7)]: + bin_edges = histogram_bin_edges(a, range=range_) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(raw, range=range_) + np.testing.assert_array_equal(result, expected) + + raw2 = rs.randint(10, size=(1,)) + b = tensor(raw2) + raw3 = rs.randint(10, size=(0,)) + c = tensor(raw3) + for t, r in [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]: + test_bins = [ + 10, + "stone", + "auto", + "doane", + "fd", + "rice", + "scott", + "sqrt", + "sturges", + ] + for bins in test_bins: + bin_edges = histogram_bin_edges(t, bins=bins) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(r, bins=bins) + np.testing.assert_array_equal(result, expected) + + test_bins = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)] + for bins in test_bins: + bin_edges = histogram_bin_edges(t, bins=bins) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(r, bins=[0, 4, 8]) + np.testing.assert_array_equal(result, expected) + + raw = np.arange(5) + a = tensor(raw, chunk_size=3) + bin_edges = histogram_bin_edges(a) + result = bin_edges.execute().fetch() + expected = np.histogram_bin_edges(raw) + assert bin_edges.shape == expected.shape + np.testing.assert_array_equal(result, expected) + + +@ignore_warning +def test_histogram_execution(setup): + rs = np.random.RandomState(0) + + raw = rs.randint(10, size=(20,)) + a = tensor(raw, chunk_size=6) + raw_weights = rs.random(20) + weights = tensor(raw_weights, chunk_size=8) + + # range provided + for range_ in [(0, 10), (3, 11), (3, 7)]: + bin_edges = histogram(a, range=range_)[0] + result = bin_edges.execute().fetch() + expected = np.histogram(raw, range=range_)[0] + np.testing.assert_array_equal(result, expected) + + for wt in (raw_weights, weights): + for density in (True, False): + bins = [1, 4, 6, 9] + bin_edges = histogram(a, bins=bins, weights=wt, density=density)[0] + result = bin_edges.execute().fetch() + expected = np.histogram( + raw, bins=bins, weights=raw_weights, density=density + )[0] + np.testing.assert_almost_equal(result, expected) + + raw2 = rs.randint(10, size=(1,)) + b = tensor(raw2) + raw3 = rs.randint(10, size=(0,)) + c = tensor(raw3) + for t, r in [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]: + for density in (True, False): + test_bins = [ + 10, + "stone", + "auto", + "doane", + "fd", + "rice", + "scott", + "sqrt", + "sturges", + ] + for bins in test_bins: + hist = histogram(t, bins=bins, 
density=density)[0] + result = hist.execute().fetch() + expected = np.histogram(r, bins=bins, density=density)[0] + np.testing.assert_array_equal(result, expected) + + test_bins = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)] + for bins in test_bins: + hist = histogram(t, bins=bins, density=density)[0] + result = hist.execute().fetch() + expected = np.histogram(r, bins=[0, 4, 8], density=density)[0] + np.testing.assert_array_equal(result, expected) + + # test unknown shape + raw4 = rs.rand(10) + d = tensor(raw4, chunk_size=6) + d = d[d < 0.9] + hist = histogram(d) + result = hist.execute().fetch()[0] + expected = np.histogram(raw4[raw4 < 0.9])[0] + np.testing.assert_array_equal(result, expected) + + raw5 = np.arange(3, 10) + e = arange(10, chunk_size=6) + e = e[e >= 3] + hist = histogram(e) + result = hist.execute().fetch()[0] + expected = np.histogram(raw5)[0] + np.testing.assert_array_equal(result, expected) + + +def test_quantile_execution(setup): + # test 1 chunk, 1-d + raw = np.random.rand(20) + a = tensor(raw, chunk_size=20) + + raw2 = raw.copy() + raw2[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=20) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + r = quantile(a, q, interpolation=interpolation, keepdims=keepdims) + + result = r.execute().fetch() + expected = np.quantile( + raw, q, interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_array_equal(result, expected) + + r2 = quantile(a2, q, interpolation=interpolation, keepdims=keepdims) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, q, interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_array_equal(result, expected) + + # test 1 chunk, 2-d + raw = np.random.rand(20, 10) + a = tensor(raw, chunk_size=20) + + raw2 = raw.copy() + raw2.flat[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=20) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + for axis in [None, 0, 1]: + r = quantile( + a, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r.execute().fetch() + expected = np.quantile( + raw, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_array_equal(result, expected) + + r2 = quantile( + a2, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_array_equal(result, expected) + + # test multi chunks, 1-d + raw = np.random.rand(20) + a = tensor(raw, chunk_size=6) + + raw2 = raw.copy() + raw2[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=20) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + r = quantile(a, q, interpolation=interpolation, keepdims=keepdims) + + result = r.execute().fetch() + expected = np.quantile( + raw, q, interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_almost_equal(result, expected) + + r2 = quantile(a2, q, interpolation=interpolation, keepdims=keepdims) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, q, 
interpolation=interpolation, keepdims=keepdims + ) + + np.testing.assert_almost_equal(result, expected) + + # test multi chunk, 2-d + raw = np.random.rand(20, 10) + a = tensor(raw, chunk_size=(12, 6)) + + raw2 = raw.copy() + raw2.flat[np.random.RandomState(0).randint(raw.size, size=3)] = np.nan + a2 = tensor(raw2, chunk_size=(12, 6)) + + for q in [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]: + for interpolation in INTERPOLATION_TYPES: + for keepdims in [True, False]: + for axis in [None, 0, 1]: + r = quantile( + a, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r.execute().fetch() + expected = np.quantile( + raw, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_almost_equal(result, expected) + + r2 = quantile( + a2, q, axis=axis, interpolation=interpolation, keepdims=keepdims + ) + + result = r2.execute().fetch() + expected = np.quantile( + raw2, + q, + axis=axis, + interpolation=interpolation, + keepdims=keepdims, + ) + + np.testing.assert_almost_equal(result, expected) + + # test out, 1 chunk + raw = np.random.rand(20) + q = np.random.rand(11) + a = tensor(raw, chunk_size=20) + out = empty((5, 11)) + quantile(a, q, out=out) + + result = out.execute().fetch() + expected = np.quantile(raw, q, out=np.empty((5, 11))) + np.testing.assert_array_equal(result, expected) + + # test out, multi chunks + raw = np.random.rand(20) + q = np.random.rand(11) + a = tensor(raw, chunk_size=6) + out = empty((5, 11)) + quantile(a, q, out=out) + + result = out.execute().fetch() + expected = np.quantile(raw, q, out=np.empty((5, 11))) + np.testing.assert_almost_equal(result, expected) + + # test q which is a tensor + q_raw = np.random.RandomState(0).rand(5) + q = tensor(q_raw, chunk_size=6) + + r = quantile(a, q, axis=None) + + result = r.execute().fetch() + expected = np.quantile(raw, q_raw, axis=None) + + np.testing.assert_almost_equal(result, expected) + + with pytest.raises(ValueError): + q[0] = 1.1 + r = quantile(a, q, axis=None) + _ = r.execute() + + +def test_percentile_execution(setup): + raw = np.random.rand(20, 10) + q = np.random.RandomState(0).randint(100, size=11) + a = tensor(raw, chunk_size=7) + r = percentile(a, q) + + result = r.execute().fetch() + expected = np.percentile(raw, q) + np.testing.assert_almost_equal(result, expected) + + mq = tensor(q) + + r = percentile(a, mq) + result = r.execute().fetch() + + np.testing.assert_almost_equal(result, expected) + + +def test_median_execution(setup): + raw = np.random.rand(20, 10) + a = tensor(raw, chunk_size=7) + r = median(a) + + result = r.execute().fetch() + expected = np.median(raw) + + np.testing.assert_array_equal(result, expected) + + r = median(a, axis=1) + + result = r.execute().fetch() + expected = np.median(raw, axis=1) + + np.testing.assert_array_equal(result, expected) + + +def test_bincount_execution(setup): + rs = np.random.RandomState(0) + raw = rs.randint(0, 9, (100,)) + raw[raw == 3] = 0 + raw_weights = rs.rand(100) + + # test non-chunked + a = tensor(raw) + result = bincount(a).execute().fetch() + expected = np.bincount(raw) + np.testing.assert_array_equal(result, expected) + + weights = tensor(raw_weights) + result = bincount(a, weights=weights).execute().fetch() + expected = np.bincount(raw, weights=raw_weights) + np.testing.assert_array_equal(result, expected) + + # test chunked + a = tensor(raw, chunk_size=13) + result = bincount(a, chunk_size_limit=5).execute().fetch() + expected = np.bincount(raw) + 
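+ # even with chunking and a small chunk_size_limit, the result must match NumPy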
np.testing.assert_array_equal(result, expected) + + # test minlength + a = tensor(raw, chunk_size=13) + result = bincount(a, chunk_size_limit=5, minlength=15).execute().fetch() + expected = np.bincount(raw, minlength=15) + np.testing.assert_array_equal(result, expected) + + # test with gap + raw1 = np.concatenate([raw, [20]]) + a = tensor(raw1, chunk_size=13) + result = bincount(a, chunk_size_limit=5).execute().fetch() + expected = np.bincount(raw1) + np.testing.assert_array_equal(result, expected) + + # test with weights + a = tensor(raw, chunk_size=13) + weights = tensor(raw_weights, chunk_size=15) + result = bincount(a, chunk_size_limit=5, weights=weights).execute().fetch() + expected = np.bincount(raw, weights=raw_weights) + np.testing.assert_array_almost_equal(result, expected) + + # test errors + a = tensor(raw, chunk_size=13) + with pytest.raises(TypeError, match="cast array data"): + bincount(a.astype(float)).execute() + with pytest.raises(ValueError, match="1 dimension"): + bincount(np.array([[1, 2], [3, 4]])).execute() + with pytest.raises(ValueError, match="be negative"): + bincount(a, minlength=-1).execute() + with pytest.raises(ValueError, match="the same length"): + bincount([-1, 1, 2, 3], weights=[3, 4]).execute() + with pytest.raises(ValueError, match="negative elements"): + bincount(tensor([-1, 1, 2, 3], chunk_size=2)).execute() diff --git a/python/xorbits/_mars/tensor/stats/__init__.py b/python/xorbits/_mars/tensor/stats/__init__.py new file mode 100644 index 000000000..5906e2601 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .chisquare import chisquare +from .entropy import entropy +from .ks import ks_1samp, ks_2samp +from .power_divergence import power_divergence +from .rankdata import rankdata +from .ttest import ttest_1samp, ttest_ind, ttest_ind_from_stats, ttest_rel diff --git a/python/xorbits/_mars/tensor/stats/chisquare.py b/python/xorbits/_mars/tensor/stats/chisquare.py new file mode 100644 index 000000000..98d0c0bb5 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/chisquare.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .power_divergence import power_divergence + + +def chisquare(f_obs, f_exp=None, ddof=0, axis=0): + """ + Calculate a one-way chi-square test. + + The chi-square test tests the null hypothesis that the categorical data + has the given frequencies. 
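+ The statistic is ``sum((f_obs - f_exp) ** 2 / f_exp)``; this is
+ `power_divergence` with ``lambda_='pearson'``.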
+ + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + + Returns + ------- + chisq : float or ndarray + The chi-squared test statistic. The value is a float if `axis` is + None or `f_obs` and `f_exp` are 1-D. + p : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `chisq` are scalars. + + See Also + -------- + scipy.stats.power_divergence + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not chi-square, in which case this test + is not appropriate. + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. + https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html + .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test + + Examples + -------- + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. + + >>> import mars.tensor as mt + >>> from mars.tensor.stats import chisquare + >>> chisquare([16, 18, 16, 14, 12, 12]) + (2.0, 0.84914503608460956) + + With `f_exp` the expected frequencies can be given. + + >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]).execute() + (3.5, 0.62338762774958223) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = mt.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> chisquare(obs).execute() + (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. + + >>> chisquare(obs, axis=None).execute() + (23.31034482758621, 0.015975692534127565) + >>> chisquare(obs.ravel()).execute() + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1).execute() + (2.0, 0.73575888234288467) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we use ``axis=1``: + + >>> chisquare([16, 18, 16, 14, 12, 12], + ... 
f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], + ... axis=1).execute() + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + + """ + return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, lambda_="pearson") diff --git a/python/xorbits/_mars/tensor/stats/entropy.py b/python/xorbits/_mars/tensor/stats/entropy.py new file mode 100644 index 000000000..fea0d2f27 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/entropy.py @@ -0,0 +1,47 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import log + +try: + from scipy.stats import entropy as sp_entropy +except ImportError: + sp_entropy = None + +from ... import tensor as mt +from ...tensor import special as mt_special +from ..core import TENSOR_TYPE +from ..datasource import tensor as astensor +from ..utils import implement_scipy + + +@implement_scipy(sp_entropy) +def entropy(pk, qk=None, base=None): + pk = astensor(pk) + pk = 1.0 * pk / mt.sum(pk, axis=0) + if qk is None: + vec = mt_special.entr(pk) + else: + qk = astensor(qk) + if len(qk) != len(pk): + raise ValueError("qk and pk must have same length.") + qk = 1.0 * qk / mt.sum(qk, axis=0) + vec = mt_special.rel_entr(pk, qk) + S = mt.sum(vec, axis=0) + if base is not None: + if isinstance(base, TENSOR_TYPE): + S /= mt.log(base) + else: + S /= log(base) + return S diff --git a/python/xorbits/_mars/tensor/stats/ks.py b/python/xorbits/_mars/tensor/stats/ks.py new file mode 100644 index 000000000..3ae58959b --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/ks.py @@ -0,0 +1,689 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from collections import namedtuple +from math import gcd +from typing import Callable, Tuple, Union + +import numpy as np +from scipy import special +from scipy.stats import distributions + +from ... import tensor as mt +from ...core import ExecutableTuple +from ...typing import TileableType + +KstestResult = namedtuple("KstestResult", ("statistic", "pvalue")) +Ks_2sampResult = KstestResult + + +def _compute_prob_inside_method(m, n, g, h): # pragma: no cover + """ + Count the proportion of paths that stay strictly inside two diagonal lines. + + Parameters + ---------- + m : integer + m > 0 + n : integer + n > 0 + g : integer + g is greatest common divisor of m and n + h : integer + 0 <= h <= lcm(m,n) + + Returns + ------- + p : float + The proportion of paths that stay inside the two lines. 
+ + + Count the integer lattice paths from (0, 0) to (m, n) which satisfy + |x/m - y/n| < h / lcm(m, n). + The paths make steps of size +1 in either positive x or positive y directions. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. + Hodges, J.L. Jr., + "The Significance Probability of the Smirnov Two-Sample Test," + Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + """ + # Probability is symmetrical in m, n. Computation below uses m >= n. + if m < n: + m, n = n, m + mg = m // g + ng = n // g + + # Count the integer lattice paths from (0, 0) to (m, n) which satisfy + # |nx/g - my/g| < h. + # Compute matrix A such that: + # A(x, 0) = A(0, y) = 1 + # A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that + # A(x, y) = 0 if |x/m - y/n|>= h + # Probability is A(m, n)/binom(m+n, n) + # Optimizations exist for m==n, m==n*p. + # Only need to preserve a single column of A, and only a sliding window of it. + # minj keeps track of the slide. + minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1) + curlen = maxj - minj + # Make a vector long enough to hold maximum window needed. + lenA = min(2 * maxj + 2, n + 1) + # This is an integer calculation, but the entries are essentially + # binomial coefficients, hence grow quickly. + # Scaling after each column is computed avoids dividing by a + # large binomial coefficient at the end, but is not sufficient to avoid + # the large dynamic range which appears during the calculation. + # Instead we rescale based on the magnitude of the right most term in + # the column and keep track of an exponent separately and apply + # it at the end of the calculation. Similarly when multiplying by + # the binomial coefficient + dtype = np.float64 + A = np.zeros(lenA, dtype=dtype) + # Initialize the first column + A[minj:maxj] = 1 + expnt = 0 + for i in range(1, m + 1): + # Generate the next column. + # First calculate the sliding window + lastminj, lastlen = minj, curlen + minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0) + minj = min(minj, n) + maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1) + if maxj <= minj: + return 0 + # Now fill in the values + A[0 : maxj - minj] = np.cumsum(A[minj - lastminj : maxj - lastminj]) + curlen = maxj - minj + if lastlen > curlen: + # Set some carried-over elements to 0 + A[maxj - minj : maxj - minj + (lastlen - curlen)] = 0 + # Rescale if the right most value is over 2**900 + val = A[maxj - minj - 1] + _, valexpt = math.frexp(val) + if valexpt > 900: + # Scaling to bring down to about 2**800 appears + # sufficient for sizes under 10000. + valexpt -= 800 + A = np.ldexp(A, -valexpt) + expnt += valexpt + + val = A[maxj - minj - 1] + # Now divide by the binomial (m+n)!/m!/n! + for i in range(1, n + 1): + val = (val * i) / (m + i) + _, valexpt = math.frexp(val) + if valexpt < -128: + val = np.ldexp(val, -valexpt) + expnt += valexpt + # Finally scale if needed. + return np.ldexp(val, expnt) + + +def _compute_prob_outside_square(n, h): # pragma: no cover + """ + Compute the proportion of paths that pass outside the two diagonal lines. + + Parameters + ---------- + n : integer + n > 0 + h : integer + 0 <= h <= n + + Returns + ------- + p : float + The proportion of paths that pass outside the lines x-y = +/-h. + + """ + # Compute Pr(D_{n,n} >= h/n) + # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2a) + binom(2n, n-3a) - ... ) / binom(2n, n) + # This formulation exhibits subtractive cancellation. 
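+ # (the binomial terms are large and nearly equal, so the alternating sum
+ # loses most of its significant digits in floating point)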
+ # Instead divide each term by binom(2n, n), then factor common terms + # and use a Horner-like algorithm + # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...))))) + + P = 0.0 + k = int(np.floor(n / h)) + while k >= 0: + p1 = 1.0 + # Each of the Ai terms has numerator and denominator with h simple terms. + for j in range(h): + p1 = (n - k * h - j) * p1 / (n + k * h + j + 1) + P = p1 * (1.0 - P) + k -= 1 + return 2 * P + + +def _count_paths_outside_method(m, n, g, h): # pragma: no cover + """ + Count the number of paths that pass outside the specified diagonal. + + Parameters + ---------- + m : integer + m > 0 + n : integer + n > 0 + g : integer + g is greatest common divisor of m and n + h : integer + 0 <= h <= lcm(m,n) + + Returns + ------- + p : float + The number of paths that go low. + The calculation may overflow - check for a finite answer. + + Raises + ------ + FloatingPointError: Raised if the intermediate computation goes outside + the range of a float. + + Notes + ----- + Count the integer lattice paths from (0, 0) to (m, n), which at some + point (x, y) along the path, satisfy: + m*y <= n*x - h*g + The paths make steps of size +1 in either positive x or positive y directions. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. + Hodges, J.L. Jr., + "The Significance Probability of the Smirnov Two-Sample Test," + Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + """ + # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n) + # B(x, y) = #{paths from (0,0) to (x,y) without previously crossing the boundary} + # = binom(x, y) - #{paths which already reached the boundary} + # Multiply by the number of path extensions going from (x, y) to (m, n) + # Sum. + + # Probability is symmetrical in m, n. Computation below assumes m >= n. + if m < n: + m, n = n, m + mg = m // g + ng = n // g + + # Not every x needs to be considered. + # xj holds the list of x values to be checked. + # Wherever n*x/m + ng*h crosses an integer + lxj = n + (mg - h) // mg + xj = [(h + mg * j + ng - 1) // ng for j in range(lxj)] + # B is an array just holding a few values of B(x,y), the ones needed. + # B[j] == B(x_j, j) + if lxj == 0: + return np.round(special.binom(m + n, n)) + B = np.zeros(lxj) + B[0] = 1 + # Compute the B(x, y) terms + # The binomial coefficient is an integer, but special.binom() may return a float. + # Round it to the nearest integer. + for j in range(1, lxj): + Bj = np.round(special.binom(xj[j] + j, j)) + if not np.isfinite(Bj): + raise FloatingPointError() + for i in range(j): + bin = np.round( + special.binom(xj[j] - xj[i] + j - i, j - i) + ) # pylint: disable=redefined-builtin + Bj -= bin * B[i] + B[j] = Bj + if not np.isfinite(Bj): + raise FloatingPointError() + # Compute the number of path extensions... + num_paths = 0 + for j in range(lxj): + bin = np.round(special.binom((m - xj[j]) + (n - j), n - j)) + term = B[j] * bin + if not np.isfinite(term): + raise FloatingPointError() + num_paths += term + return np.round(num_paths) + + +def _attempt_exact_2kssamp(n1, n2, g, d, alternative): # pragma: no cover + """Attempts to compute the exact 2sample probability. 
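+ Returns ``success=False`` so the caller can fall back to the asymptotic
+ formula when the exact computation overflows or yields an invalid
+ probability.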
+ + n1, n2 are the sample sizes + g is the gcd(n1, n2) + d is the computed max difference in ECDFs + + Returns (success, d, probability) + """ + lcm = (n1 // g) * n2 + h = int(np.round(d * lcm)) + d = h * 1.0 / lcm + if h == 0: + return True, d, 1.0 + saw_fp_error, prob = False, np.nan + try: + if alternative == "two-sided": + if n1 == n2: + prob = _compute_prob_outside_square(n1, h) + else: + prob = 1 - _compute_prob_inside_method(n1, n2, g, h) + else: + if n1 == n2: + # prob = binom(2n, n-h) / binom(2n, n) + # Evaluating in that form incurs roundoff errors + # from special.binom. Instead calculate directly + jrange = np.arange(h) + prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0)) + else: + num_paths = _count_paths_outside_method(n1, n2, g, h) + bin = special.binom(n1 + n2, n1) # pylint: disable=redefined-builtin + if ( + not np.isfinite(bin) + or not np.isfinite(num_paths) + or num_paths > bin + ): + saw_fp_error = True + else: + prob = num_paths / bin + + except FloatingPointError: + saw_fp_error = True + + if saw_fp_error: + return False, d, np.nan + if not (0 <= prob <= 1): + return False, d, prob + return True, d, prob + + +def _calc_prob_2samp(d, n1, n2, alternative, mode): # pragma: no cover + MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N + + g = gcd(n1, n2) + n1g = n1 // g + n2g = n2 // g + prob = -mt.inf + original_mode = mode + if mode == "auto": + mode = "exact" if max(n1, n2) <= MAX_AUTO_N else "asymp" + elif mode == "exact": + # If lcm(n1, n2) is too big, switch from exact to asymp + if n1g >= np.iinfo(np.int_).max / n2g: + mode = "asymp" + warnings.warn( + f"Exact ks_2samp calculation not possible with samples sizes " + f"{n1} and {n2}. Switching to 'asymp'.", + RuntimeWarning, + ) + + if mode == "exact": + success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative) + if not success: + mode = "asymp" + if original_mode == "exact": + warnings.warn( + f"ks_2samp: Exact calculation unsuccessful. " + f"Switching to mode={mode}.", + RuntimeWarning, + ) + + if mode == "asymp": + # The product n1*n2 is large. Use Smirnov's asymptotic formula. + # Ensure float to avoid overflow in multiplication + # sorted because the one-sided formula is not symmetric in n1, n2 + m, n = sorted([float(n1), float(n2)], reverse=True) + en = m * n / (m + n) + if alternative == "two-sided": + prob = distributions.kstwo.sf(d, np.round(en)) + else: + z = np.sqrt(en) * d + # Use Hodges' suggested approximation Eqn 5.3 + # Requires m to be the larger of (n1, n2) + expt = -2 * z**2 - 2 * z * (m + 2 * n) / np.sqrt(m * n * (m + n)) / 3.0 + prob = np.exp(expt) + + return np.clip(prob, 0, 1) + + +def _compute_dplus(cdfvals, n): + """Computes D+ as used in the Kolmogorov-Smirnov test. + + Parameters + ---------- + cdfvals: array_like + Sorted array of CDF values between 0 and 1 + + Returns + ------- + Maximum distance of the CDF values below Uniform(0, 1) + """ + return (mt.arange(1.0, n + 1) / n - cdfvals).max() + + +def _compute_dminus(cdfvals, n): + """Computes D- as used in the Kolmogorov-Smirnov test. + + Parameters + ---------- + cdfvals: array_like + Sorted array of CDF values between 0 and 1 + + Returns + ------- + Maximum distance of the CDF values above Uniform(0, 1) + """ + return (cdfvals - mt.arange(0.0, n) / n).max() + + +def ks_1samp( + x: Union[np.ndarray, list, TileableType], + cdf: Callable, + args: Tuple = (), + alternative: str = "two-sided", + mode: str = "auto", +): + """ + Performs the one-sample Kolmogorov-Smirnov test for goodness of fit. 
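+
+ The statistic is the maximum distance between the empirical distribution
+ function of `x` and ``cdf`` evaluated at the sample points (``D``, or the
+ one-sided ``D+`` / ``D-`` variants, depending on `alternative`).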
+ + This test compares the underlying distribution F(x) of a sample + against a given continuous distribution G(x). See Notes for a description + of the available null and alternative hypotheses. + + Parameters + ---------- + x : array_like + a 1-D array of observations of iid random variables. + cdf : callable + callable used to calculate the cdf. + args : tuple, sequence, optional + Distribution parameters, used with `cdf`. + alternative : {'two-sided', 'less', 'greater'}, optional + Defines the null and alternative hypotheses. Default is 'two-sided'. + Please see explanations in the Notes below. + mode : {'auto', 'exact', 'approx', 'asymp'}, optional + Defines the distribution used for calculating the p-value. + The following options are available (default is 'auto'): + + * 'auto' : selects one of the other options. + * 'exact' : uses the exact distribution of test statistic. + * 'approx' : approximates the two-sided probability with twice + the one-sided probability + * 'asymp': uses asymptotic distribution of test statistic + + Returns + ------- + statistic : float + KS test statistic, either D, D+ or D- (depending on the value + of 'alternative') + pvalue : float + One-tailed or two-tailed p-value. + + See Also + -------- + ks_2samp, kstest + + Notes + ----- + There are three options for the null and corresponding alternative + hypothesis that can be selected using the `alternative` parameter. + + - `two-sided`: The null hypothesis is that the two distributions are + identical, F(x)=G(x) for all x; the alternative is that they are not + identical. + + - `less`: The null hypothesis is that F(x) >= G(x) for all x; the + alternative is that F(x) < G(x) for at least one x. + + - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the + alternative is that F(x) > G(x) for at least one x. + + Note that the alternative hypotheses describe the *CDFs* of the + underlying distributions, not the observed values. For example, + suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in + x1 tend to be less than those in x2. + + Examples + -------- + >>> import numpy as np + >>> from scipy import stats + >>> import mars.tensor as mt + >>> from mars.tensor.stats import ks_1samp + + >>> np.random.seed(12345678) #fix random seed to get the same result + >>> x = mt.linspace(-15, 15, 9, chunk_size=5) + >>> ks_1samp(x, stats.norm.cdf).execute() + (0.44435602715924361, 0.038850142705171065) + + >>> ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.165471391799..., pvalue=0.007331283245...) + + *Test against one-sided alternative hypothesis* + + Shift distribution to larger values, so that `` CDF(x) < norm.cdf(x)``: + + >>> x = stats.norm.rvs(loc=0.2, size=100) + >>> ks_1samp(x, stats.norm.cdf, alternative='less').execute() + KstestResult(statistic=0.235488541678..., pvalue=1.158315030683...) + + Reject null hypothesis in favor of alternative hypothesis: less + + >>> ks_1samp(x, stats.norm.cdf, alternative='greater').execute() + KstestResult(statistic=0.010167165616..., pvalue=0.972494973653...) + + Reject null hypothesis in favor of alternative hypothesis: greater + + >>> ks_1samp(x, stats.norm.cdf).execute() + KstestResult(statistic=0.235488541678..., pvalue=2.316630061366...) 
+ + Don't reject null hypothesis in favor of alternative hypothesis: two-sided + + *Testing t distributed random variables against normal distribution* + + With 100 degrees of freedom the t distribution looks close to the normal + distribution, and the K-S test does not reject the hypothesis that the + sample came from the normal distribution: + + >>> ks_1samp(stats.t.rvs(100, size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.077844250253..., pvalue=0.553155412513...) + + With 3 degrees of freedom the t distribution looks sufficiently different + from the normal distribution, that we can reject the hypothesis that the + sample came from the normal distribution at the 10% level: + + >>> ks_1samp(stats.t.rvs(3, size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.118967105356..., pvalue=0.108627114578...) + """ + alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get( + alternative.lower()[0], alternative + ) + if alternative not in ["two-sided", "greater", "less"]: + raise ValueError("Unexpected alternative %s" % alternative) + + x = mt.asarray(x) + N = x.shape[0] + x = mt.sort(x) + cdfvals = x.map_chunk(cdf, args=args, elementwise=True) + + if alternative == "greater": + Dplus = _compute_dplus(cdfvals, N) + return ExecutableTuple( + KstestResult(Dplus, Dplus.map_chunk(distributions.ksone.sf, args=(N,))) + ) + + if alternative == "less": + Dminus = _compute_dminus(cdfvals, N) + return ExecutableTuple( + KstestResult(Dminus, Dminus.map_chunk(distributions.ksone.sf, args=(N,))) + ) + + # alternative == 'two-sided': + Dplus = _compute_dplus(cdfvals, N) + Dminus = _compute_dminus(cdfvals, N) + D = mt.stack([Dplus, Dminus]).max() + if mode == "auto": # Always select exact + mode = "exact" + if mode == "exact": + prob = D.map_chunk(distributions.kstwo.sf, args=(N,), elementwise=True) + elif mode == "asymp": + prob = (D * np.sqrt(N)).map_chunk(distributions.kstwobign.sf, elementwise=True) + else: + # mode == 'approx' + prob = 2 * D.map_chunk(distributions.ksone.sf, args=(N,), elementwise=True) + prob = mt.clip(prob, 0, 1) + return ExecutableTuple(KstestResult(D, prob)) + + +def ks_2samp( + data1: Union[np.ndarray, list, TileableType], + data2: Union[np.ndarray, list, TileableType], + alternative: str = "two-sided", + mode: str = "auto", +): + """ + Compute the Kolmogorov-Smirnov statistic on 2 samples. + + This is a two-sided test for the null hypothesis that 2 independent samples + are drawn from the same continuous distribution. The alternative hypothesis + can be either 'two-sided' (default), 'less' or 'greater'. + + Parameters + ---------- + data1, data2 : array_like, 1-Dimensional + Two arrays of sample observations assumed to be drawn from a continuous + distribution, sample sizes can be different. + alternative : {'two-sided', 'less', 'greater'}, optional + Defines the alternative hypothesis. + The following options are available (default is 'two-sided'): + + * 'two-sided' + * 'less': one-sided, see explanation in Notes + * 'greater': one-sided, see explanation in Notes + mode : {'auto', 'exact', 'asymp'}, optional + Defines the method used for calculating the p-value. + The following options are available (default is 'auto'): + + * 'auto' : use 'exact' for small size arrays, 'asymp' for large + * 'exact' : use exact distribution of test statistic + * 'asymp' : use asymptotic distribution of test statistic + + Returns + ------- + statistic : float + KS statistic. + pvalue : float + Two-tailed p-value. 
+ + See Also + -------- + kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp + + Notes + ----- + This tests whether 2 samples are drawn from the same distribution. Note + that, like in the case of the one-sample KS test, the distribution is + assumed to be continuous. + + In the one-sided test, the alternative is that the empirical + cumulative distribution function F(x) of the data1 variable is "less" + or "greater" than the empirical cumulative distribution function G(x) + of the data2 variable, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``. + + If the KS statistic is small or the p-value is high, then we cannot + reject the hypothesis that the distributions of the two samples + are the same. + + If the mode is 'auto', the computation is exact if the sample sizes are + less than 10000. For larger sizes, the computation uses the + Kolmogorov-Smirnov distributions to compute an approximate value. + + The 'two-sided' 'exact' computation computes the complementary probability + and then subtracts from 1. As such, the minimum probability it can return + is about 1e-16. While the algorithm itself is exact, numerical + errors may accumulate for large sample sizes. It is most suited to + situations in which one of the sample sizes is only a few thousand. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_. + + References + ---------- + .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov + Two-Sample Test," Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + + Examples + -------- + >>> import numpy as np + >>> from scipy import stats + >>> import mars.tensor as mt + >>> from mars.tensor.stats import ks_2samp + >>> np.random.seed(12345678) #fix random seed to get the same result + >>> n1 = 200 # size of first sample + >>> n2 = 300 # size of second sample + + For a different distribution, we can reject the null hypothesis since the + pvalue is below 1%: + + >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1) + >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5) + >>> ks_2samp(rvs1, rvs2).execute() + KstestResult(statistic=0.20833333333333337, pvalue=5.1292795978041816e-05) + + For a slightly different distribution, we cannot reject the null hypothesis + at a 10% or lower alpha since the p-value at 0.144 is higher than 10% + + >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0) + >>> ks_2samp(rvs1, rvs3).execute() + KstestResult(statistic=0.10333333333333333, pvalue=0.14691437867433788) + + For an identical distribution, we cannot reject the null hypothesis since + the p-value is high, 41%: + + >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0) + >>> ks_2samp(rvs1, rvs4).execute() + KstestResult(statistic=0.07999999999999996, pvalue=0.4115432028915931) + + """ + + if mode not in ["auto", "exact", "asymp"]: + raise ValueError(f"Invalid value for mode: {mode}") + alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get( + alternative.lower()[0], alternative + ) + if alternative not in ["two-sided", "less", "greater"]: + raise ValueError(f"Invalid value for alternative: {alternative}") + data1 = mt.asarray(data1) + data2 = mt.asarray(data2) + data1 = mt.sort(data1) + data2 = mt.sort(data2) + n1 = data1.shape[0] + n2 = data2.shape[0] + if min(n1, n2) == 0: + raise ValueError("Data passed to ks_2samp must not be empty") + + data_all = mt.concatenate([data1, data2]) + # using searchsorted solves equal data problem + cdf1 = mt.searchsorted(data1, data_all, side="right") / n1 + cdf2 = mt.searchsorted(data2, data_all, side="right") / n2 + cddiffs = cdf1 
- cdf2 + minS = mt.clip(-mt.min(cddiffs), 0, 1) # Ensure sign of minS is not negative. + maxS = mt.max(cddiffs) + alt2Dvalue = {"less": minS, "greater": maxS, "two-sided": mt.maximum(minS, maxS)} + d = alt2Dvalue[alternative] + prob = d.map_chunk( + _calc_prob_2samp, + args=(n1, n2, alternative, mode), + elementwise=True, + dtype=d.dtype, + ) + + return ExecutableTuple(Ks_2sampResult(d, prob)) diff --git a/python/xorbits/_mars/tensor/stats/power_divergence.py b/python/xorbits/_mars/tensor/stats/power_divergence.py new file mode 100644 index 000000000..7b50c4b0a --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/power_divergence.py @@ -0,0 +1,243 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +import numpy as np + +try: + from scipy.stats import distributions as sp_distributions +except ImportError: + sp_distributions = None + +from ...core import ExecutableTuple +from ...utils import require_not_none +from .. import special +from ..datasource import asarray + +# Map from names to lambda_ values used in power_divergence(). +_power_div_lambda_names = { + "pearson": 1, + "log-likelihood": 0, + "freeman-tukey": -0.5, + "mod-log-likelihood": -1, + "neyman": -2, + "cressie-read": 2 / 3, +} + + +def _count(a, axis=None): + if axis is None: + return a.size + else: + return a.shape[axis] + + +Power_divergenceResult = namedtuple("Power_divergenceResult", ("statistic", "pvalue")) + + +@require_not_none(sp_distributions) +def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): + """ + Cressie-Read power divergence statistic and goodness of fit test. + + This function tests the null hypothesis that the categorical data + has the given frequencies, using the Cressie-Read power divergence + statistic. + + Parameters + ---------- + f_obs : array_like + Observed frequencies in each category. + f_exp : array_like, optional + Expected frequencies in each category. By default the categories are + assumed to be equally likely. + ddof : int, optional + "Delta degrees of freedom": adjustment to the degrees of freedom + for the p-value. The p-value is computed using a chi-squared + distribution with ``k - 1 - ddof`` degrees of freedom, where `k` + is the number of observed frequencies. The default value of `ddof` + is 0. + axis : int or None, optional + The axis of the broadcast result of `f_obs` and `f_exp` along which to + apply the test. If axis is None, all values in `f_obs` are treated + as a single data set. Default is 0. + lambda_ : float or str, optional + The power in the Cressie-Read power divergence statistic. The default + is 1. For convenience, `lambda_` may be assigned one of the following + strings, in which case the corresponding numerical value is used:: + + String Value Description + "pearson" 1 Pearson's chi-squared statistic. + In this case, the function is + equivalent to `stats.chisquare`. + "log-likelihood" 0 Log-likelihood ratio. Also known as + the G-test [3]_. 
+ "freeman-tukey" -1/2 Freeman-Tukey statistic. + "mod-log-likelihood" -1 Modified log-likelihood ratio. + "neyman" -2 Neyman's statistic. + "cressie-read" 2/3 The power recommended in [5]_. + + Returns + ------- + statistic : float or ndarray + The Cressie-Read power divergence test statistic. The value is + a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. + pvalue : float or ndarray + The p-value of the test. The value is a float if `ddof` and the + return value `stat` are scalars. + + See Also + -------- + chisquare + + Notes + ----- + This test is invalid when the observed or expected frequencies in each + category are too small. A typical rule is that all of the observed + and expected frequencies should be at least 5. + + When `lambda_` is less than zero, the formula for the statistic involves + dividing by `f_obs`, so a warning or error may be generated if any value + in `f_obs` is 0. + + Similarly, a warning or error may be generated if any value in `f_exp` is + zero when `lambda_` >= 0. + + The default degrees of freedom, k-1, are for the case when no parameters + of the distribution are estimated. If p parameters are estimated by + efficient maximum likelihood then the correct degrees of freedom are + k-1-p. If the parameters are estimated in a different way, then the + dof can be between k-1-p and k-1. However, it is also possible that + the asymptotic distribution is not a chisquare, in which case this + test is not appropriate. + + This function handles masked arrays. If an element of `f_obs` or `f_exp` + is masked, then data at that position is ignored, and does not count + towards the size of the data set. + + .. versionadded:: 0.13.0 + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 8. + https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html + .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test + .. [3] "G-test", https://en.wikipedia.org/wiki/G-test + .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and + practice of statistics in biological research", New York: Freeman + (1981) + .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit + Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), + pp. 440-464. + + Examples + -------- + (See `chisquare` for more examples.) + + When just `f_obs` is given, it is assumed that the expected frequencies + are uniform and given by the mean of the observed frequencies. Here we + perform a G-test (i.e. use the log-likelihood ratio statistic): + + >>> import mars.tensor as mt + >>> from mars.tensor.stats import power_divergence + >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood').execute() + (2.006573162632538, 0.84823476779463769) + + The expected frequencies can be given with the `f_exp` argument: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[16, 16, 16, 16, 16, 8], + ... lambda_='log-likelihood').execute() + (3.3281031458963746, 0.6495419288047497) + + When `f_obs` is 2-D, by default the test is applied to each column. + + >>> obs = mt.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T + >>> obs.shape + (6, 2) + >>> power_divergence(obs, lambda_="log-likelihood").execute() + (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) + + By setting ``axis=None``, the test is applied to all data in the array, + which is equivalent to applying the test to the flattened array. 
+ + >>> power_divergence(obs, axis=None).execute() + (23.31034482758621, 0.015975692534127565) + >>> power_divergence(obs.ravel()).execute() + (23.31034482758621, 0.015975692534127565) + + `ddof` is the change to make to the default degrees of freedom. + + >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1).execute() + (2.0, 0.73575888234288467) + + `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has + shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting + `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared + statistics, we must use ``axis=1``: + + >>> power_divergence([16, 18, 16, 14, 12, 12], + ... f_exp=[[16, 16, 16, 16, 16, 8], + ... [8, 20, 20, 16, 12, 12]], + ... axis=1) + (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) + """ + # Convert the input argument `lambda_` to a numerical value. + if isinstance(lambda_, str): + if lambda_ not in _power_div_lambda_names: + names = repr(list(_power_div_lambda_names.keys()))[1:-1] + raise ValueError( + "invalid string for lambda_: {0!r}. Valid strings " + "are {1}".format(lambda_, names) + ) + lambda_ = _power_div_lambda_names[lambda_] + elif lambda_ is None: + lambda_ = 1 + + f_obs = asarray(f_obs) + + if f_exp is not None: + f_exp = asarray(f_exp) + else: + f_exp = f_obs.mean(axis=axis, keepdims=True) + + # `terms` is the array of terms that are summed along `axis` to create + # the test statistic. We use some specialized code for a few special + # cases of lambda_. + if lambda_ == 1: + # Pearson's chi-squared statistic + terms = (f_obs.astype(np.float64) - f_exp) ** 2 / f_exp + elif lambda_ == 0: + # Log-likelihood ratio (i.e. G-test) + terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) + elif lambda_ == -1: + # Modified log-likelihood ratio + terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) + else: + # General Cressie-Read power divergence. + terms = f_obs * ((f_obs / f_exp) ** lambda_ - 1) + terms /= 0.5 * lambda_ * (lambda_ + 1) + + stat = terms.sum(axis=axis) + + num_obs = _count(terms, axis=axis) + # we decide not to support ddof for multiple dimensions + # ddof = asarray(ddof) + p = stat.map_chunk( + sp_distributions.chi2.sf, (num_obs - 1 - ddof,), elementwise=True + ) + + return ExecutableTuple(Power_divergenceResult(stat, p)) diff --git a/python/xorbits/_mars/tensor/stats/rankdata.py b/python/xorbits/_mars/tensor/stats/rankdata.py new file mode 100644 index 000000000..799b98c77 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/rankdata.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import tensor as mt + + +def rankdata(a, method="average", *, axis=None): + """Assign ranks to data, dealing with ties appropriately. + By default (``axis=None``), the data array is first flattened, and a flat + array of ranks is returned. Separately reshape the rank array to the + shape of the data array if desired (see Examples). + Ranks begin at 1. 
The `method` argument controls how ranks are assigned + to equal values. See [1]_ for further discussion of ranking methods. + Parameters + ---------- + a : array_like + The array of values to be ranked. + method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional + The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + * 'average': The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + * 'min': The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also + referred to as "competition" ranking.) + * 'max': The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + * 'dense': Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + * 'ordinal': All values are given a distinct rank, corresponding to + the order that the values occur in `a`. + axis : {None, int}, optional + Axis along which to perform the ranking. If ``None``, the data array + is first flattened. + Returns + ------- + ranks : ndarray + An array of size equal to the size of `a`, containing rank + scores. + References + ---------- + .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking + Examples + -------- + >>> from mars.tensor.stats import rankdata + >>> rankdata([0, 2, 3, 2]).execute() + array([ 1. , 2.5, 4. , 2.5]) + >>> rankdata([0, 2, 3, 2], method='min').execute() + array([ 1, 2, 4, 2]) + >>> rankdata([0, 2, 3, 2], method='max').execute() + array([ 1, 3, 4, 3]) + >>> rankdata([0, 2, 3, 2], method='dense').execute() + array([ 1, 2, 3, 2]) + >>> rankdata([0, 2, 3, 2], method='ordinal').execute() + array([ 1, 2, 4, 3]) + >>> rankdata([[0, 2], [3, 2]]).reshape(2,2).execute() + array([[1. , 2.5], + [4. , 2.5]]) + >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1).execute() + array([[1. , 2.5, 2.5], + [2. , 1. , 3. ]]) + """ + if method not in ("average", "min", "max", "dense", "ordinal"): + raise ValueError('unknown method "{0}"'.format(method)) + + if axis is not None: + a = np.asarray(a) + if a.size == 0: + np.core.multiarray.normalize_axis_index(axis, a.ndim) + dt = np.float64 if method == "average" else np.int_ + return mt.empty(a.shape, dtype=dt) + return mt.tensor(np.apply_along_axis(rankdata, axis, a, method)) + + arr = mt.ravel(mt.asarray(a)) + algo = "mergesort" if method == "ordinal" else "quicksort" + sorter = mt.argsort(arr, kind=algo) + + inv = mt.empty(sorter.size, dtype=np.intp) + inv[sorter] = mt.arange(sorter.size, dtype=np.intp) + + if method == "ordinal": + return inv + 1 + + arr = arr[sorter] + obs = mt.r_[True, arr[1:] != arr[:-1]] + dense = obs.cumsum()[inv] + + if method == "dense": + return dense + + count = mt.r_[mt.nonzero(obs)[0], len(obs)] + + if method == "max": + return count[dense] + + if method == "min": + return count[dense - 1] + 1 + + return 0.5 * (count[dense] + count[dense - 1] + 1) diff --git a/python/xorbits/_mars/tensor/stats/tests/__init__.py b/python/xorbits/_mars/tensor/stats/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/stats/tests/test_stats_execution.py b/python/xorbits/_mars/tensor/stats/tests/test_stats_execution.py new file mode 100644 index 000000000..93bb66bc7 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/tests/test_stats_execution.py @@ -0,0 +1,343 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools + +import numpy as np +import pytest +import scipy +from scipy.stats import chisquare as sp_chisquare +from scipy.stats import entropy as sp_entropy +from scipy.stats import ks_1samp as sp_ks_1samp +from scipy.stats import ks_2samp as sp_ks_2samp +from scipy.stats import norm as sp_norm +from scipy.stats import power_divergence as sp_power_divergence +from scipy.stats import rankdata as sp_rankdata +from scipy.stats import ttest_1samp as sp_ttest_1samp +from scipy.stats import ttest_ind as sp_ttest_ind +from scipy.stats import ttest_ind_from_stats as sp_ttest_ind_from_stats +from scipy.stats import ttest_rel as sp_ttest_rel + +from ....lib.version import parse as parse_version +from ... import tensor +from .. 
import ( + chisquare, + entropy, + ks_1samp, + ks_2samp, + power_divergence, + rankdata, + ttest_1samp, + ttest_ind, + ttest_ind_from_stats, + ttest_rel, +) + + +def test_entropy_execution(setup): + rs = np.random.RandomState(0) + a = rs.rand(10) + + t1 = tensor(a, chunk_size=4) + r = entropy(t1) + + result = r.execute().fetch() + expected = sp_entropy(a) + np.testing.assert_array_almost_equal(result, expected) + + b = rs.rand(10) + base = 3.1 + + t2 = tensor(b, chunk_size=4) + r = entropy(t1, t2, base) + + result = r.execute().fetch() + expected = sp_entropy(a, b, base) + np.testing.assert_array_almost_equal(result, expected) + + b = rs.rand(10) + base = 3.1 + + t2 = tensor(b, chunk_size=4) + r = entropy(t1, t2, base) + + result = r.execute().fetch() + expected = sp_entropy(a, b, base) + np.testing.assert_array_almost_equal(result, expected) + + r = entropy(t1, t2, t1.sum()) + + result = r.execute().fetch() + expected = sp_entropy(a, b, a.sum()) + np.testing.assert_array_almost_equal(result, expected) + + with pytest.raises(ValueError): + entropy(t1, t2[:7]) + + +def test_power_divergence_execution(setup): + f_obs_raw = np.array([16, 18, 16, 14, 12, 12]) + f_exp_raw = np.array([16, 16, 16, 16, 16, 8]) + + f_obs = tensor(f_obs_raw, chunk_size=4) + f_exp = tensor(f_exp_raw, chunk_size=4) + + with pytest.raises(ValueError): + power_divergence(f_obs, f_exp, lambda_="non-exist-lambda") + + r = power_divergence(f_obs, lambda_="pearson") + result = r.execute().fetch() + + expected = sp_power_divergence(f_obs_raw, lambda_="pearson") + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + modes = [ + None, + "pearson", + "log-likelihood", + "mod-log-likelihood", + "neyman", + ] + + for mode in modes: + r = power_divergence(f_obs, f_exp, lambda_=mode) + result = r.execute().fetch() + + expected = sp_power_divergence(f_obs_raw, f_exp_raw, lambda_=mode) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + +def test_chisquare_execution(setup): + f_obs_raw = np.array([16, 18, 16, 14, 12, 12]) + f_exp_raw = np.array([16, 16, 16, 16, 16, 8]) + + f_obs = tensor(f_obs_raw, chunk_size=4) + f_exp = tensor(f_exp_raw, chunk_size=4) + + r = chisquare(f_obs, f_exp) + result = r.execute().fetch() + + expected = sp_chisquare(f_obs_raw, f_exp_raw) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + +def test_t_test_execution(setup): + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + alternatives = ["less", "greater", "two-sided"] + + mt_from_stats = ( + lambda a, b, alternative=None, equal_var=True: ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + alternative=alternative, + equal_var=equal_var, + ) + ) + sp_from_stats = ( + lambda a, b, alternative=None, equal_var=True: sp_ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + alternative=alternative, + equal_var=equal_var, + ) + ) + else: + alternatives = ["two-sided"] + + mt_from_stats = lambda a, b, equal_var=True: ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + equal_var=equal_var, + ) + sp_from_stats = lambda a, b, equal_var=True: sp_ttest_ind_from_stats( + a.mean(), + a.std(), + a.shape[0], + b.mean(), + b.std(), + b.shape[0], + equal_var=equal_var, + ) + + funcs = [ + (ttest_rel, sp_ttest_rel), + ( + 
functools.partial(ttest_ind, equal_var=True), + functools.partial(sp_ttest_ind, equal_var=True), + ), + ( + functools.partial(ttest_ind, equal_var=False), + functools.partial(sp_ttest_ind, equal_var=False), + ), + ( + functools.partial(mt_from_stats, equal_var=True), + functools.partial(sp_from_stats, equal_var=True), + ), + ( + functools.partial(mt_from_stats, equal_var=False), + functools.partial(sp_from_stats, equal_var=False), + ), + ] + + fa_raw = np.array([16, 18, 16, 14, 12, 12]) + fb_raw = np.array([16, 16, 16, 16, 16, 8]) + + fa = tensor(fa_raw, chunk_size=4) + fb = tensor(fb_raw, chunk_size=4) + + for mt_func, sp_func in funcs: + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + with pytest.raises(ValueError): + mt_func(fa, fb, alternative="illegal-alternative") + + for alt in alternatives: + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + r = mt_func(fa, fb, alternative=alt) + else: + r = mt_func(fa, fb) + result = r.execute().fetch() + + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + expected = sp_func(fa_raw, fb_raw, alternative=alt) + else: + expected = sp_func(fa_raw, fb_raw) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + # second param size must be 1 for ttest_1samp + fb_raw = np.array([16]) + fb = tensor(fb_raw) + for alt in alternatives: + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + r = ttest_1samp(fa, fb, alternative=alt) + else: + r = ttest_1samp(fa, fb) + result = r.execute().fetch() + + if parse_version(scipy.__version__) >= parse_version("1.6.0"): + expected = sp_ttest_1samp(fa_raw, fb_raw, alternative=alt) + else: + expected = sp_ttest_1samp(fa_raw, fb_raw) + np.testing.assert_almost_equal(expected[0], result[0]) + np.testing.assert_almost_equal(expected[1], result[1]) + + +@pytest.mark.parametrize("chunk_size", [5, 15]) +@pytest.mark.parametrize( + "mode, alternative", + [ + ("auto", "greater"), + ("auto", "less"), + ("auto", "two-sided"), + ("asymp", "two-sided"), + ("approx", "two-sided"), + ], +) +def test_ks_1samp(setup, chunk_size, mode, alternative): + x = tensor(np.linspace(-15, 15, 9), chunk_size=5) + + result = ks_1samp(x, sp_norm.cdf, mode=mode).execute().fetch() + expected = sp_ks_1samp(x, sp_norm.cdf, mode=mode) + assert result == expected + + with pytest.raises(ValueError): + ks_1samp(x, sp_norm.cdf, alternative="unknown") + + +@pytest.mark.parametrize("chunk_size", [5, 15]) +def test_ks_2samp(setup, chunk_size): + n1 = 10 + n2 = 15 + rs = np.random.RandomState(0) + rvs1 = sp_norm.rvs(size=n1, loc=0.0, scale=1, random_state=rs) + rvs2 = sp_norm.rvs(size=n2, loc=0.5, scale=1.5, random_state=rs) + + d1 = tensor(rvs1, chunk_size=chunk_size) + d2 = tensor(rvs2, chunk_size=chunk_size) + + result = ks_2samp(d1, d2).execute().fetch() + expected = sp_ks_2samp(rvs1, rvs2) + assert result == expected + + with pytest.raises(ValueError): + ks_2samp(d1, d2, alternative="unknown") + + with pytest.raises(ValueError): + ks_2samp(d1, d2, mode="unknown") + + with pytest.raises(ValueError): + ks_2samp(d1, []) + + +def test_rankdata_execution(setup): + rs = np.random.RandomState(0) + a = rs.rand(4) + + t1 = tensor(a, chunk_size=5) + r = rankdata(t1) + + result = r.execute().fetch() + expected = sp_rankdata(a) + np.testing.assert_array_almost_equal(result, expected) + + b = rs.rand(4, 4) + + t2 = tensor(b, chunk_size=5) + r2 = rankdata(t2, axis=1) + + result = r2.execute().fetch() + expected = sp_rankdata(b, axis=1) + 
np.testing.assert_array_almost_equal(result, expected) + + c = rs.rand(0, 4) + + t3 = tensor(c, chunk_size=5) + r3 = rankdata(t3, axis=1) + + result = r3.execute().fetch() + expected = sp_rankdata(c, axis=1) + np.testing.assert_array_almost_equal(result, expected) + + methods = [ + "average", + "min", + "max", + "dense", + "ordinal", + ] + + for method in methods: + r = rankdata(t1, method=method) + result = r.execute().fetch() + + expected = sp_rankdata(a, method=method) + np.testing.assert_almost_equal(result, expected) + + with pytest.raises(ValueError): + r = rankdata(t1, method="unknown") diff --git a/python/xorbits/_mars/tensor/stats/ttest.py b/python/xorbits/_mars/tensor/stats/ttest.py new file mode 100644 index 000000000..f593cc505 --- /dev/null +++ b/python/xorbits/_mars/tensor/stats/ttest.py @@ -0,0 +1,165 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +import numpy as np +from scipy import __version__ as sp_version +from scipy.stats import distributions as sp_distributions +from scipy.stats import ttest_1samp as sp_ttest_1samp +from scipy.stats import ttest_ind as sp_ttest_ind +from scipy.stats import ttest_ind_from_stats as sp_ttest_ind_from_stats +from scipy.stats import ttest_rel as sp_ttest_rel + +from ...core import ExecutableTuple +from ...lib.version import parse as parse_version +from ..arithmetic import absolute as mt_abs +from ..arithmetic import divide as mt_divide +from ..arithmetic import isnan as mt_isnan +from ..arithmetic import sqrt as mt_sqrt +from ..base import where as mt_where +from ..reduction import mean as mt_mean +from ..reduction import var as mt_var +from ..utils import implement_scipy + + +def _equal_var_ttest_denom(v1, n1, v2, n2): + df = n1 + n2 - 2.0 + svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df + denom = mt_sqrt(svar * (1.0 / n1 + 1.0 / n2)) # XXX: np -> da + return df, denom + + +def _unequal_var_ttest_denom(v1, n1, v2, n2): + vn1 = v1 / n1 + vn2 = v2 / n2 + with np.errstate(divide="ignore", invalid="ignore"): + df = (vn1 + vn2) ** 2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1)) + + # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). + # Hence it doesn't matter what df is as long as it's not NaN. 
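+    # The df formula above is the Welch-Satterthwaite approximation used by
+    # Welch's t-test; substitute a harmless placeholder where it came out NaN.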
+ df = mt_where(mt_isnan(df), 1, df) + denom = mt_sqrt(vn1 + vn2) + return df, denom + + +def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative): + d = mean1 - mean2 + with np.errstate(divide="ignore", invalid="ignore"): + t = mt_divide(d, denom) + t, prob = _ttest_finish(df, t, alternative) + + return t, prob + + +def _ttest_finish(df, t, alternative): + """Common code between all 3 t-test functions.""" + if alternative != "two-sided" and parse_version(sp_version) < parse_version( + "1.6.0" + ): # pragma: no cover + raise ValueError("alternative must be 'two-sided' with scipy prior to 1.6.0") + + if alternative == "less": + prob = t.map_chunk(sp_distributions.t.cdf, args=(df,)) + elif alternative == "greater": + prob = t.map_chunk(sp_distributions.t.sf, args=(df,)) + elif alternative == "two-sided": + prob = mt_abs(t).map_chunk(sp_distributions.t.sf, args=(df,)) * 2 + else: + raise ValueError("alternative must be 'less', 'greater' or 'two-sided'") + if t.ndim == 0: + t = t[()] + return t, prob + + +Ttest_1sampResult = namedtuple("Ttest_1sampResult", ("statistic", "pvalue")) + + +@implement_scipy(sp_ttest_1samp) +def ttest_1samp(a, popmean, axis=0, nan_policy="propagate", alternative="two-sided"): + if nan_policy != "propagate": + raise NotImplementedError( + "`nan_policy` other than 'propagate' have not been implemented." + ) + n = a.shape[axis] + df = n - 1 + + d = a.mean(axis=axis) - popmean + v = a.var(axis=axis, ddof=1) + denom = mt_sqrt(v / float(n)) + + with np.errstate(divide="ignore", invalid="ignore"): + t = mt_divide(d, denom) + t, prob = _ttest_finish(df, t, alternative) + return ExecutableTuple(Ttest_1sampResult(t, prob)) + + +Ttest_indResult = namedtuple("Ttest_indResult", ("statistic", "pvalue")) + + +@implement_scipy(sp_ttest_ind) +def ttest_ind(a, b, axis=0, equal_var=True, alternative="two-sided"): + v1 = mt_var(a, axis, ddof=1) + v2 = mt_var(b, axis, ddof=1) + n1 = a.shape[axis] + n2 = b.shape[axis] + + if equal_var: + df, denom = _equal_var_ttest_denom(v1, n1, v2, n2) + else: + df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2) + + res = _ttest_ind_from_stats( + mt_mean(a, axis), mt_mean(b, axis), denom, df, alternative + ) + + return ExecutableTuple(Ttest_indResult(*res)) + + +@implement_scipy(sp_ttest_ind_from_stats) +def ttest_ind_from_stats( + mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True, alternative="two-sided" +): + if equal_var: + df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2) + else: + df, denom = _unequal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2) + + res = _ttest_ind_from_stats(mean1, mean2, denom, df, alternative) + return ExecutableTuple(Ttest_indResult(*res)) + + +Ttest_relResult = namedtuple("Ttest_relResult", ("statistic", "pvalue")) + + +@implement_scipy(sp_ttest_rel) +def ttest_rel(a, b, axis=0, nan_policy="propagate", alternative="two-sided"): + if nan_policy != "propagate": + raise NotImplementedError( + "`nan_policy` other than 'propagate' have not been implemented." 
+ ) + + n = a.shape[axis] + df = float(n - 1) + + d = (a - b).astype(np.float64) + v = mt_var(d, axis, ddof=1) + dm = mt_mean(d, axis) + denom = mt_sqrt(v / float(n)) + + with np.errstate(divide="ignore", invalid="ignore"): + t = mt_divide(dm, denom) + t, prob = _ttest_finish(df, t, alternative) + + return ExecutableTuple(Ttest_relResult(t, prob)) diff --git a/python/xorbits/_mars/tensor/tests/__init__.py b/python/xorbits/_mars/tensor/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/tests/test_core.py b/python/xorbits/_mars/tensor/tests/test_core.py new file mode 100644 index 000000000..f0d867272 --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/test_core.py @@ -0,0 +1,36 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ... import tensor as mt +from ...core import tile + + +def test_params(): + raw = np.random.rand(10, 10) + a = mt.tensor(raw) + a = a[a[0] < 0.5] + a = tile(a) + c = a.chunks[0] + + assert any(np.isnan(s) for s in c.params["shape"]) + c.params = c.get_params_from_data(raw[raw[0] < 0.5]) + assert not any(np.isnan(s) for s in c.params["shape"]) + + params = c.params.copy() + params.pop("index", None) + a.params = params + assert np.prod(a.shape) > 0 + a.refresh_params() diff --git a/python/xorbits/_mars/tensor/tests/test_core_execution.py b/python/xorbits/_mars/tensor/tests/test_core_execution.py new file mode 100644 index 000000000..f53ed763e --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/test_core_execution.py @@ -0,0 +1,282 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from .. 
import ( + add, + atleast_1d, + atleast_2d, + atleast_3d, + moveaxis, + ones, + squeeze, + swapaxes, + tensor, +) + + +def test_array_function(setup): + a = ones((10, 20), chunk_size=8) + + # test sum + np.testing.assert_equal(np.sum(a).execute().fetch(), 200) + + # test qr + q, r = np.linalg.qr(a) + np.testing.assert_array_almost_equal(np.dot(q, r).execute().fetch(), a) + + +def test_view_data_on_slice(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=8) + b = a[:5, 5:10] + b[:3, :3] = 3 + + npa = data.copy() + npb = npa[:5, 5:10] + npb[:3, :3] = 3 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=8) + b = a[:7] + b += 1 + + npa = data.copy() + npb = npa[:7] + npb += 1 + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + + +def test_set_item_on_view(setup): + a = ones((5, 8), dtype=int) + b = a[:3] + b[0, 0] = 2 + c = b.ravel() # create view + c[1] = 4 + + npa = np.ones((5, 8), dtype=int) + npb = npa[:3] + npb[0, 0] = 2 + npc = npb.ravel() # create view + npc[1] = 4 + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(c.execute(), npc) + + +def test_view_data_on_transpose(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = a.T + add(b, 1, out=b) + + np.testing.assert_array_equal(b.execute(), data.T + 1) + np.testing.assert_array_equal(a.execute(), data + 1) + + +def test_view_data_on_swapaxes(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = swapaxes(a, 1, 0) + a[1] = 10 + + npa = data.copy() + npb = np.swapaxes(npa, 1, 0) + npa[1] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_view_data_on_moveaxis(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = moveaxis(a, 1, 0) + a[0][1] = 10 + + npa = data.copy() + npb = np.moveaxis(npa, 1, 0) + npa[0][1] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_view_data_on_atleast1d(setup): + a = atleast_1d(1) + b = a[:] + b[0] = 10 + + np.testing.assert_array_equal(b.execute(), np.array([10])) + np.testing.assert_array_equal(a.execute(), np.array([10])) + + +def test_view_data_on_atleast2d(setup): + data = np.random.rand(10) + a = atleast_2d(tensor(data, chunk_size=5)) + b = add(a[:, :5], 1, out=a[:, 5:]) + + npa = np.atleast_2d(data.copy()) + npb = np.add(npa[:, :5], 1, out=npa[:, 5:]) + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + + +def test_view_data_on_atleast3d(setup): + data = np.random.rand(10, 20) + a = atleast_3d(tensor(data, chunk_size=5)) + b = a[:, :5, :10][0] + c = add(b[:4], b[1:], out=a[0, 16:]) + + npa = np.atleast_3d(data.copy()) + npb = npa[:, :5, :10][0] + npc = np.add(npb[:4], npb[1:], out=npa[0, 16:]) + + np.testing.assert_array_equal(a.execute(), npa) + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(c.execute(), npc) + + +def test_view_data_on_squeeze(setup): + data = np.random.rand(1, 4, 1) + a = tensor(data, chunk_size=2) + b = squeeze(a, axis=0) + b[:3] = 10 + + npa = data.copy() + npb = np.squeeze(npa, axis=0) + npb[:3] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def 
test_view_data_on_reshape(setup): + data = np.random.RandomState(0).random((3, 4, 5)) + a = tensor(data.copy(), chunk_size=2) + b = a.reshape((5, 4, 3)) + b[:3] = 10 + + npa = data.copy() + npb = npa.reshape((5, 4, 3)) + npb[:3] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + data = np.random.RandomState(0).random((4, 5)) + a2 = tensor(data.copy(), chunk_size=2) + b2 = a2.reshape((5, 4), order="F") + b2[:3] = 10 + + npa = data.copy() + npb = npa.reshape((5, 4), order="F") + npb[:3] = 10 + + b2_result = b2.execute() + np.testing.assert_array_equal(a2.execute(), npa) + np.testing.assert_array_equal(b2_result, npb) + + +def test_view_data_on_ravel(setup): + # ravel creates a view + data = np.random.rand(3, 4, 5) + a = tensor(data, chunk_size=2) + b = a.ravel() + b[:10] = 10 + + npa = data.copy() + npb = npa.ravel() + npb[:10] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + # flatten creates a copy + data = np.random.rand(3, 4, 5) + a = tensor(data, chunk_size=2) + b = a.flatten() + b[:10] = 10 + + npa = data.copy() + npb = npa.flatten() + npb[:10] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_copy_and_view(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=6) + b = a.view() + b[:5] = 10 + + npa = data.copy() + npb = npa.view() + npb[:5] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + data = np.random.rand(10, 20) + a = tensor(data.copy(), chunk_size=6) + b = a.copy() + b[:5] = 10 + + npa = data.copy() + npb = npa.copy() + npb[:5] = 10 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) + + a = tensor(data.copy(), chunk_size=6) + b = a[:5, :4] + c = b.copy() + c[0, 0] = 10 + + npa = data.copy() + npb = npa[:5, :4] + npc = npb.copy() + npc[0, 0] = 10 + + np.testing.assert_array_equal(c.execute(), npc) + np.testing.assert_array_equal(a.execute(), npa) + + +def test_flat(setup): + data = np.random.rand(10, 20) + a = tensor(data, chunk_size=4) + fl = a.flat + fl[1:10] = 10 + b = fl[10:20] + b[0:4] = 20 + + npa = data.copy() + npfl = npa.flat + npfl[1:10] = 10 + npb = npfl[10:20] + npb[0:4] = 20 + + np.testing.assert_array_equal(b.execute(), npb) + np.testing.assert_array_equal(a.execute(), npa) diff --git a/python/xorbits/_mars/tensor/tests/test_utils.py b/python/xorbits/_mars/tensor/tests/test_utils.py new file mode 100644 index 000000000..890806ca0 --- /dev/null +++ b/python/xorbits/_mars/tensor/tests/test_utils.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from ... 
import tensor as mt +from ...lib.mmh3 import hash_from_buffer as mmh3_hash_from_buffer +from ..utils import fetch_corner_data, hash_on_axis, normalize_axis_tuple + + +def test_hash_on_axis(): + hash_from_buffer = lambda x: mmh3_hash_from_buffer(memoryview(x)) + + a = np.random.rand(10) + + result = hash_on_axis(a, 0, 3) + expected = np.array([mmh3_hash_from_buffer(element) % 3 for element in a]) + + np.testing.assert_array_equal(result, expected) + + result = hash_on_axis(a, 0, 1) + expected = np.array([0 for _ in a]) + + np.testing.assert_array_equal(result, expected) + + a = np.random.rand(10, 5) + + result = hash_on_axis(a, 0, 3) + expected = np.array([hash_from_buffer(a[i, :]) % 3 for i in range(a.shape[0])]) + + np.testing.assert_array_equal(result, expected) + + result = hash_on_axis(a, 1, 3) + expected = np.array([hash_from_buffer(a[:, i]) % 3 for i in range(a.shape[1])]) + + np.testing.assert_array_equal(result, expected) + + a = np.random.rand(10, 5, 4) + + result = hash_on_axis(a, 2, 3) + expected = np.array([hash_from_buffer(a[:, :, i]) % 3 for i in range(a.shape[2])]) + + np.testing.assert_array_equal(result, expected) + + +def test_normalize_axis_tuple(): + assert normalize_axis_tuple(-1, 3) == (2,) + assert normalize_axis_tuple([0, -2], 3) == (0, 1) + assert sorted(normalize_axis_tuple({0, -2}, 3)) == [0, 1] + + with pytest.raises(ValueError) as cm: + normalize_axis_tuple((1, -2), 3, argname="axes") + assert "axes" in str(cm.value) + + with pytest.raises(ValueError): + normalize_axis_tuple((1, -2), 3) + + +def test_fetch_tensor_corner_data(setup): + print_options = np.get_printoptions() + + # make sure numpy default option + assert print_options["edgeitems"] == 3 + assert print_options["threshold"] == 1000 + + size = 12 + for i in (2, 4, size - 3, size, size + 3): + arr = np.random.rand(i, i, i) + t = mt.tensor(arr, chunk_size=size // 2) + t.execute() + + corner_data = fetch_corner_data(t) + corner_threshold = 1000 if t.size < 1000 else corner_data.size - 1 + with np.printoptions(threshold=corner_threshold, suppress=True): + # when we repr corner data, we need to limit threshold that + # it's exactly less than the size + repr_corner_data = repr(corner_data) + with np.printoptions(suppress=True): + repr_result = repr(arr) + assert repr_corner_data == repr_result diff --git a/python/xorbits/_mars/tensor/ufunc/__init__.py b/python/xorbits/_mars/tensor/ufunc/__init__.py new file mode 100644 index 000000000..a8027eea4 --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
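+
+# Installing ``_array_ufunc`` as ``Tensor.__array_ufunc__`` (see ``_install``
+# below) makes NumPy ufuncs applied to Mars tensors dispatch to the
+# corresponding lazy tensor operations instead of coercing to ndarrays.
+# A rough usage sketch (import path as used in the docstrings of this
+# package)::
+#
+#     import numpy as np
+#     import mars.tensor as mt
+#
+#     t = mt.ones((4, 4), chunk_size=2)
+#     np.add(t, 1)              # a lazy Mars tensor; evaluate with .execute()
+#     np.add.reduce(t, axis=1)  # dispatched to the tensor sum reduction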
+ + +def _install(): + from ..core import Tensor + from .ufunc import _array_ufunc + + Tensor.__array_ufunc__ = _array_ufunc + + +_install() +del _install diff --git a/python/xorbits/_mars/tensor/ufunc/tests/__init__.py b/python/xorbits/_mars/tensor/ufunc/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/_mars/tensor/ufunc/tests/test_ufunc_execution.py b/python/xorbits/_mars/tensor/ufunc/tests/test_ufunc_execution.py new file mode 100644 index 000000000..3ef36850b --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/tests/test_ufunc_execution.py @@ -0,0 +1,80 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest + +from .... 
import tensor as mt +from ...core import Tensor + + +@pytest.mark.parametrize("ufunc_name", ["negative"]) +def test_unary_ufunc(setup, ufunc_name): + raw_data = np.random.rand(100, 100) + t = mt.tensor(raw_data.copy(), chunk_size=20) + + ufunc_obj = getattr(np, ufunc_name) + + res = ufunc_obj(t) + expected = ufunc_obj(raw_data) + assert isinstance(res, Tensor) + np.testing.assert_array_equal(res.execute().fetch(), expected) + + ufunc_obj.at(t, 3) + ufunc_obj.at(raw_data, 3) + np.testing.assert_array_equal(t.execute().fetch(), raw_data) + + +@pytest.mark.parametrize("ufunc_name", ["add", "multiply", "logaddexp", "logaddexp2"]) +def test_binary_ufunc(setup, ufunc_name): + raw_data1 = np.random.rand(100, 100) + t1 = mt.tensor(raw_data1.copy(), chunk_size=50) + raw_data2 = np.random.rand(100, 100) + t2 = mt.tensor(raw_data2.copy(), chunk_size=50) + + ufunc_obj = getattr(np, ufunc_name) + + res = ufunc_obj(t1, t2) + expected = ufunc_obj(raw_data1, raw_data2) + assert isinstance(res, Tensor) + np.testing.assert_array_equal(res.execute().fetch(), expected) + + ufunc_obj.at(t1, (3, 4), 2) + ufunc_obj.at(raw_data1, (3, 4), 2) + np.testing.assert_array_equal(t1.execute().fetch(), raw_data1) + + res = ufunc_obj.reduce(t1, axis=1) + expected = ufunc_obj.reduce(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + res = t1.copy() + ufunc_obj.reduce(t1, axis=1, out=res) + expected = ufunc_obj.reduce(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + res = ufunc_obj.accumulate(t1, axis=1) + expected = ufunc_obj.accumulate(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + res = t1.copy() + ufunc_obj.accumulate(t1, axis=1, out=res) + expected = ufunc_obj.accumulate(raw_data1, axis=1) + assert isinstance(res, Tensor) + np.testing.assert_almost_equal(res.execute().fetch(), expected) + + with pytest.raises(TypeError): + ufunc_obj.reduceat(t1, [(3, 4)]) diff --git a/python/xorbits/_mars/tensor/ufunc/ufunc.py b/python/xorbits/_mars/tensor/ufunc/ufunc.py new file mode 100644 index 000000000..00a079c04 --- /dev/null +++ b/python/xorbits/_mars/tensor/ufunc/ufunc.py @@ -0,0 +1,198 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numbers import Number + +import numpy as np + +from .. import arithmetic as arith +from .. 
import reduction +from ..datasource import tensor as astensor + + +class TensorUfuncDef: + def __init__( + self, method, aggregator=None, accumulator=None, pre_agg=None, post_agg=None + ): + self._method = method + self._aggregator = aggregator + self._accumulator = accumulator + self._pre_agg = pre_agg + self._post_agg = post_agg + + def __call__(self, *args, **kwargs): + return self._method(*args, **kwargs) + + def at(self, a, indices, b=None): + # todo handle setting duplicated keys, a separate operand may be needed + if b is None: + a[indices] = self(a[indices]) + else: + a[indices] = self(a[indices], b) + + def accumulate(self, array, axis=0, dtype=None, out=None): + if self._accumulator is None: + raise NotImplementedError + data = array if self._pre_agg is None else self._pre_agg(array) + result = self._accumulator(data, axis=axis, dtype=dtype) + result = result if self._post_agg is None else self._post_agg(result) + if out is not None: + out[0]._data = result._data + else: + return result + + def reduce(self, array, axis=0, dtype=None, out=None, keepdims=False): + if self._aggregator is None: + raise NotImplementedError + data = array if self._pre_agg is None else self._pre_agg(array) + result = self._aggregator(data, axis=axis, dtype=dtype, keepdims=keepdims) + result = result if self._post_agg is None else self._post_agg(result) + if out is not None: + out[0]._data = result._data + else: + return result + + +UFUNC_TO_TENSOR_FUNCS = { + np.add: TensorUfuncDef( + arith.add, + accumulator=reduction.cumsum, + aggregator=reduction.sum, + ), + np.subtract: TensorUfuncDef(arith.subtract), + np.multiply: TensorUfuncDef( + arith.multiply, + accumulator=reduction.cumprod, + aggregator=reduction.prod, + ), + np.divide: TensorUfuncDef(arith.divide), + np.logaddexp: TensorUfuncDef( + arith.logaddexp, + accumulator=reduction.cumsum, + aggregator=reduction.sum, + pre_agg=arith.exp, + post_agg=arith.log, + ), + np.logaddexp2: TensorUfuncDef( + arith.logaddexp2, + accumulator=reduction.cumsum, + aggregator=reduction.sum, + pre_agg=lambda x: arith.power(2, x), + post_agg=arith.log2, + ), + np.true_divide: TensorUfuncDef(arith.truediv), + np.floor_divide: TensorUfuncDef(arith.floordiv), + # unary + np.negative: TensorUfuncDef(arith.negative), + np.power: TensorUfuncDef(arith.power), + np.float_power: TensorUfuncDef(arith.float_power), + np.remainder: TensorUfuncDef(arith.remainder), + np.mod: TensorUfuncDef(arith.mod), + np.fmod: TensorUfuncDef(arith.fmod), + np.conj: TensorUfuncDef(arith.conj), + np.conjugate: TensorUfuncDef(arith.conjugate), + np.exp: TensorUfuncDef(arith.exp), + np.exp2: TensorUfuncDef(arith.exp2), + np.log: TensorUfuncDef(arith.log), + np.log2: TensorUfuncDef(arith.log2), + np.log10: TensorUfuncDef(arith.log10), + np.log1p: TensorUfuncDef(arith.log1p), + np.expm1: TensorUfuncDef(arith.expm1), + np.sqrt: TensorUfuncDef(arith.sqrt), + np.square: TensorUfuncDef(arith.square), + np.cbrt: TensorUfuncDef(arith.cbrt), + np.reciprocal: TensorUfuncDef(arith.reciprocal), + # trigonometric functions + np.sin: TensorUfuncDef(arith.sin), + np.cos: TensorUfuncDef(arith.cos), + np.tan: TensorUfuncDef(arith.tan), + np.arcsin: TensorUfuncDef(arith.arcsin), + np.arccos: TensorUfuncDef(arith.arccos), + np.arctan: TensorUfuncDef(arith.arctan), + np.arctan2: TensorUfuncDef(arith.arctan2), + np.hypot: TensorUfuncDef(arith.hypot), + np.sinh: TensorUfuncDef(arith.sinh), + np.cosh: TensorUfuncDef(arith.cosh), + np.tanh: TensorUfuncDef(arith.tanh), + np.arcsinh: TensorUfuncDef(arith.arcsinh), + 
np.arccosh: TensorUfuncDef(arith.arccosh), + np.arctanh: TensorUfuncDef(arith.arctanh), + np.deg2rad: TensorUfuncDef(arith.deg2rad), + np.rad2deg: TensorUfuncDef(arith.rad2deg), + # comparison functions + np.greater: TensorUfuncDef(arith.greater), + np.greater_equal: TensorUfuncDef(arith.greater_equal), + np.less: TensorUfuncDef(arith.less), + np.less_equal: TensorUfuncDef(arith.less_equal), + np.not_equal: TensorUfuncDef(arith.not_equal), + np.equal: TensorUfuncDef(arith.equal), + np.logical_and: TensorUfuncDef(arith.logical_and), + np.logical_or: TensorUfuncDef(arith.logical_or), + np.logical_xor: TensorUfuncDef(arith.logical_xor), + np.logical_not: TensorUfuncDef(arith.logical_not), + np.maximum: TensorUfuncDef(arith.maximum), + np.minimum: TensorUfuncDef(arith.minimum), + np.fmax: TensorUfuncDef(arith.fmax), + np.fmin: TensorUfuncDef(arith.fmin), + # floating functions + np.isfinite: TensorUfuncDef(arith.isfinite), + np.isinf: TensorUfuncDef(arith.isinf), + np.isnan: TensorUfuncDef(arith.isnan), + np.signbit: TensorUfuncDef(arith.signbit), + np.copysign: TensorUfuncDef(arith.copysign), + np.nextafter: TensorUfuncDef(arith.nextafter), + np.spacing: TensorUfuncDef(arith.spacing), + np.modf: TensorUfuncDef(arith.modf), + np.ldexp: TensorUfuncDef(arith.ldexp), + np.frexp: TensorUfuncDef(arith.frexp), + np.floor: TensorUfuncDef(arith.floor), + np.ceil: TensorUfuncDef(arith.ceil), + np.trunc: TensorUfuncDef(arith.trunc), + # more math functions + np.degrees: TensorUfuncDef(arith.degrees), + np.radians: TensorUfuncDef(arith.radians), + np.rint: TensorUfuncDef(arith.rint), + np.fabs: TensorUfuncDef(arith.fabs), + np.sign: TensorUfuncDef(arith.sign), + np.absolute: TensorUfuncDef(arith.absolute), +} + + +def _check_arg(arg): + if isinstance(arg, Number): + return True + + try: + astensor(arg) + return True + except ValueError: + return False + + +def _array_ufunc(_, ufunc, method, *inputs, **kwargs): + out = kwargs.get("out", tuple()) + for x in inputs + out: + if not _check_arg(x): + return NotImplemented + + if ufunc.signature is not None: + return NotImplemented + if ufunc not in UFUNC_TO_TENSOR_FUNCS: + return NotImplemented + + try: + tensor_func = getattr(UFUNC_TO_TENSOR_FUNCS[ufunc], method) + return tensor_func(*inputs, **kwargs) + except (AttributeError, NotImplementedError): + return NotImplemented diff --git a/python/xorbits/_mars/tensor/utils.py b/python/xorbits/_mars/tensor/utils.py new file mode 100644 index 000000000..111edf3c3 --- /dev/null +++ b/python/xorbits/_mars/tensor/utils.py @@ -0,0 +1,835 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
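Editor's note: the `_array_ufunc` hook above is what routes plain NumPy ufunc calls onto Mars tensor implementations: arguments are screened by `_check_arg`, generalized ufuncs (those with a `signature`) and unregistered ufuncs return `NotImplemented`, and everything else is looked up in `UFUNC_TO_TENSOR_FUNCS` and dispatched by method name. A minimal usage sketch, mirroring the tests above; the import path and the explicit `new_session()` call are assumptions, not part of this diff:

import numpy as np

from xorbits._mars import new_session, tensor as mt  # assumed import path

new_session()  # assumed: create a local default session for execution

t = mt.tensor(np.arange(6).reshape(2, 3), chunk_size=2)

# np.add goes through Tensor.__array_ufunc__ and yields a lazy Mars tensor
s = np.add(t, 1)
print(s.execute().fetch())

# reduce/accumulate work because np.add registers reduction.sum as its
# aggregator and reduction.cumsum as its accumulator
print(np.add.reduce(t, axis=0).execute().fetch())
print(np.add.accumulate(t, axis=1).execute().fetch())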
+ +import inspect +import itertools +import operator +from collections import OrderedDict +from collections.abc import Iterable +from functools import lru_cache, reduce, wraps +from math import ceil +from numbers import Integral +from typing import Dict, List, Union + +import numpy as np + +try: + import tiledb +except (ImportError, OSError): # pragma: no cover + tiledb = None + +from ..core import ExecutableTuple, recursive_tile +from ..lib.mmh3 import hash_from_buffer +from ..utils import lazy_import + +cp = lazy_import("cupy", rename="cp") + + +def normalize_shape(shape): + if isinstance(shape, Iterable): + return tuple(shape) + else: + return (shape,) + + +def normalize_chunk_sizes(shape, chunk_size): + shape = normalize_shape(shape) + if not isinstance(chunk_size, tuple): + if isinstance(chunk_size, Iterable): + chunk_size = tuple(chunk_size) + elif isinstance(chunk_size, int): + chunk_size = (chunk_size,) * len(shape) + + if len(shape) != len(chunk_size): + raise ValueError( + "Chunks must have the same dimension, " + f"got shape: {shape}, chunks: {chunk_size}" + ) + + chunk_sizes = [] + for size, chunk in zip(shape, chunk_size): + if isinstance(chunk, Iterable): + if not isinstance(chunk, tuple): + chunk = tuple(chunk) + + # if chunk is (np.nan,), it means we need to concat + # all chunks together. + if chunk == (np.nan,): + chunk = (size,) + + if sum(chunk) != size: + raise ValueError( + "chunks shape should be of the same length, " + f"got shape: {size}, chunks: {chunk}" + ) + chunk_sizes.append(chunk) + else: + assert isinstance(chunk, int) + + if size == 0: + sizes = (0,) + else: + sizes = tuple(chunk for _ in range(int(size / chunk))) + ( + tuple() if size % chunk == 0 else (size % chunk,) + ) + chunk_sizes.append(sizes) + + return tuple(chunk_sizes) + + +def broadcast_shape(*shapes): + if len(shapes) == 1: + return shapes[0] + + out_shapes = [] + for ss in itertools.zip_longest(*[reversed(s) for s in shapes], fillvalue=-1): + shape = max(s for s in ss if s != -1) + if any(i != -1 and i != 1 and i != shape and not np.isnan(i) for i in ss): + raise ValueError( + "Operands could not be broadcast together " + "with shape {0}".format(" ".join(map(str, shapes))) + ) + out_shapes.append(shape) + return tuple(reversed(out_shapes)) + + +def get_chunk_slices(nsplits, idx): + return tuple( + slice(sum(nsplit[:idx]), sum(nsplit[: idx + 1])) + for idx, nsplit in zip(idx, nsplits) + ) + + +def gen_random_seeds(n, random_state): + assert isinstance(random_state, np.random.RandomState) + return tuple(np.frombuffer(random_state.bytes(n * 4), dtype=np.uint32).tolist()) + + +def validate_axis(ndim, axis, argname=None): + if axis >= ndim or axis < -ndim: + raise np.AxisError(axis, ndim=ndim, msg_prefix=argname) + + return axis if axis >= 0 else ndim + axis + + +def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False): + """ + Normalizes an axis argument into a tuple of non-negative integer axes. + + This handles shorthands such as ``1`` and converts them to ``(1,)``, + as well as performing the handling of negative indices covered by + `normalize_axis_index`. + + By default, this forbids axes from being specified multiple times. + + Used internally by multi-axis-checking logic. + + Parameters + ---------- + axis : int, iterable of int + The un-normalized index or indices of the axis. + ndim : int + The number of dimensions of the array that `axis` should be normalized + against.
+ argname : str, optional + A prefix to put before the error message, typically the name of the + argument. + allow_duplicate : bool, optional + If False, the default, disallow an axis from being specified twice. + + Returns + ------- + normalized_axes : tuple of int + The normalized axis index, such that `0 <= normalized_axis < ndim` + + Raises + ------ + AxisError + If any axis provided is out of range + ValueError + If an axis is repeated + + See also + -------- + normalize_axis_index : normalizing a single scalar axis + """ + # Optimization to speed-up the most common cases. + if type(axis) not in (tuple, list): + try: + axis = [operator.index(axis)] + except TypeError: + pass + # Going via an iterator directly is slower than via list comprehension. + axis = tuple([validate_axis(ndim, ax, argname) for ax in axis]) + if not allow_duplicate and len(set(axis)) != len(axis): + if argname: + raise ValueError(f"repeated axis in `{argname}` argument") + else: + raise ValueError("repeated axis") + return axis + + +def validate_order(dtype, order): + if getattr(dtype, "fields", None) is None: + if order is not None: + raise ValueError("Cannot specify order when the array has no fields") + else: + return + + need_check = True + if order is None: + order = list(dtype.names) + need_check = False + elif isinstance(order, (list, tuple)): + order = list(order) + else: + order = [order] + if need_check: + for o in order: + if o not in dtype.fields: + raise ValueError(f"unknown field name: {o}") + return order + + +def inject_dtype(dtype): + def inner(func): + @wraps(func) + def call(*tensors, **kw): + kw["dtype"] = np.dtype(dtype) + ret = func(*tensors, **kw) + if ret is NotImplemented: + reverse_func = getattr( + inspect.getmodule(func), f"r{func.__name__}", None + ) + if reverse_func is not None: + ret = reverse_func(*tensors[::-1], **kw) + if ret is NotImplemented: + raise TypeError( + "unsupported operand type(s) for {0}: '{1}' and '{2}".format( + func.__name__, *[type(t) for t in tensors] + ) + ) + return ret + + return call + + return inner + + +def infer_dtype(np_func, multi_outputs=False, empty=True, reverse=False, check=True): + def make_arg(arg): + if empty: + return np.empty((1,) * max(1, arg.ndim), dtype=arg.dtype) + else: + if hasattr(arg, "op") and hasattr(arg.op, "data"): + arg = arg.op.data + return arg[(0,) * max(1, arg.ndim)] + + tensor_ufunc = "__tensor_ufunc__" + + def is_arg(arg): + if hasattr(arg, tensor_ufunc): + return False + return hasattr(arg, "ndim") and hasattr(arg, "dtype") + + def inner(func): + @wraps(func) + def h(*tensors, **kw): + usr_dtype = np.dtype(kw.pop("dtype")) if "dtype" in kw else None + args = [make_arg(t) if is_arg(t) else t for t in tensors] + if reverse: + args = args[::-1] + np_kw = dict( + (k, make_arg(v) if hasattr(v, "op") else v) + for k, v in kw.items() + if is_arg(v) and k != "out" + ) + + dtype = None + if not any( + hasattr(arg, tensor_ufunc) + for arg in itertools.chain(args, np_kw.values()) + ): + # skip infer if encounter mars DataFrame etc + # that implements __tensor_ufunc__ + try: + with np.errstate(all="ignore"): + if multi_outputs: + dtype = np_func(*args, **np_kw)[0].dtype + else: + dtype = np_func(*args, **np_kw).dtype + except: # noqa: E722 + dtype = None + + if usr_dtype and dtype: + can_cast_kwargs = {} + if kw.get("casting") is not None: + can_cast_kwargs["casting"] = kw.get("casting") + if check and not np.can_cast(dtype, usr_dtype, **can_cast_kwargs): + raise TypeError( + "No loop matching the specified signature " + f"and 
casting was found for ufunc {np_func}" + ) + kw["dtype"] = usr_dtype + else: + kw["dtype"] = dtype + + ret = func(*tensors, **kw) + if ret is NotImplemented: + reverse_func = ( + getattr(inspect.getmodule(func), f"r{func.__name__}", None) + if not reverse + else None + ) + if reverse_func is not None: + ret = reverse_func(*tensors[::-1], **kw) + if ret is NotImplemented: + raise TypeError( + "unsupported operand type(s) for {0}: '{1}' and '{2}".format( + func.__name__, *[type(t) for t in tensors] + ) + ) + return ret + + return h + + return inner + + +def index_ndim(index): + from .core import Tensor + + if isinstance(index, Tensor) and index.dtype == np.bool_: + # boolean indexing will occupy the ndim + return index.ndim + + return 1 if index is not None else 0 + + +def replace_ellipsis(index, ndim): + all_illipsis = list(i for i, idx in enumerate(index) if idx is Ellipsis) + if len(all_illipsis) > 1: + raise IndexError("an index can only have a single ellipsis ('...')") + if not all_illipsis: + return index + + illipsis_index = all_illipsis[0] + n_extra = ndim - sum([index_ndim(i) for i in index]) + 1 + return ( + index[:illipsis_index] + (slice(None),) * n_extra + index[illipsis_index + 1 :] + ) + + +def calc_sliced_size(size: int, sliceobj: slice) -> int: + if np.isnan(size): + return np.nan + + start, stop, step = sliceobj.indices(size) + return int(ceil(abs((stop - start) / float(step)))) + + +def calc_object_length(obj, size=None): + if np.isscalar(obj): + return 1 + elif isinstance(obj, slice): + return calc_sliced_size(size, obj) + else: + return len(obj) + + +def slice_split( + index: Union[int, slice], sizes: List[int] +) -> Dict[int, Union[int, slice]]: + size = sum(sizes) + + if isinstance(index, Integral): + index = index if index >= 0 else size + index + i = 0 + ind = index + lens = list(sizes) + while ind >= lens[0]: + i += 1 + ind -= lens.pop(0) + return {i: ind} + + assert isinstance(index, slice) + start, stop, step = index.indices(size) + + slice_all = slice(None) + + if index == slice_all: + return dict((k, slice_all) for k in range(len(sizes))) + + d = dict() + if step > 0: + for i, length in enumerate(sizes): + if start < length and stop > 0: + d[i] = slice(start, min(stop, length), step) + start = (start - length) % step + else: + start = start - length + stop -= length + else: + rstart = start # running start + chunk_boundaries = np.cumsum(sizes) + for i, chunk_stop in reversed(list(enumerate(chunk_boundaries))): + # create a chunk start and stop + if i == 0: + chunk_start = 0 + else: + chunk_start = chunk_boundaries[i - 1] + + # if our slice is in this chunk + if (chunk_start <= rstart < chunk_stop) and (rstart > stop): + d[i] = slice( + rstart - chunk_stop, + max(chunk_start - chunk_stop - 1, stop - chunk_stop), + step, + ) + + # compute the next running start point, + offset = (rstart - (chunk_start - 1)) % step + rstart = chunk_start + offset - 1 + + # replace 0:20:1 with : if appropriate + for k, v in d.items(): + if v == slice(0, sizes[k], 1): + d[k] = slice(None, None, None) + + if not d: # special case x[:0] + d[0] = slice(0, 0, 1) + + return d + + +def is_asc_sorted(arr): + arr = np.asarray(arr) + if len(arr) == 0: + return True + return np.all(arr[:-1] <= arr[1:]) + + +def split_indexes_into_chunks(nsplits, indexes, ret_is_asc=True): + indexes = np.asarray(indexes) + chunk_idxes = np.empty_like(indexes) + cum_nsplits = [np.cumsum(nsplit) for nsplit in nsplits] + for i, cum_nsplit, index in zip(itertools.count(0), cum_nsplits, indexes): + # handle negative 
value in index + if hasattr(index, "flags") and not index.flags.writeable: + index = index.copy() + index = np.add(index, cum_nsplit[-1], out=index, where=index < 0) + sorted_idx = np.argsort(index) + + if np.any(index >= cum_nsplit[-1]): + idx = index[index >= cum_nsplit[-1]][0] + raise IndexError(f"index {idx} is out of bounds with size {cum_nsplit[-1]}") + + chunk_idx = np.searchsorted(cum_nsplit, index[sorted_idx], side="right") + chunk_idxes[i, sorted_idx] = chunk_idx + + chunk_idxes_asc = False + if ret_is_asc: + chunk_idxes_asc = is_asc_sorted(np.lexsort(chunk_idxes[::-1])) + + chunk_index_to_indexes = OrderedDict() + chunk_index_to_poses = OrderedDict() + poses = np.arange(len(indexes[0])) + for idx in itertools.product(*(range(len(nsplit)) for nsplit in nsplits)): + cond = (chunk_idxes == np.array(idx).reshape((len(idx), 1))).all(axis=0) + filtered = indexes[:, cond] + for i in range(len(indexes)): + filtered[i] = filtered[i] - ( + cum_nsplits[i][idx[i] - 1] if idx[i] > 0 else 0 + ) + chunk_index_to_indexes[idx] = filtered + chunk_index_to_poses[idx] = poses[cond] + + if ret_is_asc: + return chunk_index_to_indexes, chunk_index_to_poses, chunk_idxes_asc + return chunk_index_to_indexes, chunk_index_to_poses + + +def calc_pos(fancy_index_shape, pos, xp=np): + if isinstance(pos, dict): + pos = xp.concatenate(list(pos.values())) + select_pos = xp.empty(fancy_index_shape, dtype=int) + select_pos.flat[pos] = xp.arange(select_pos.size) + return select_pos + + +def decide_unify_split(*splits): + # TODO (jisheng): In the future, we need more sophisticated way to decide the rechunk split + # right now, for (2, 2) and (3, 1), we get the rechunk split as (2, 1, 1) + if not splits: + return () + raw_splits = splits + # support broadcasting rules + # decide_unify_splits((1,), (5,)) --> (5,) + splits = set(s for s in splits if ((len(s) > 1) or (len(s) == 1 and s[0] != 1))) + if len(splits) == 1: + return splits.pop() + if len(splits) == 0: + return raw_splits[0] + + if any(np.isnan(sum(s)) for s in splits): + raise ValueError(f"Tensor chunk sizes are unknown: {splits}") + if len(set(sum(s) for s in splits)) > 1: + raise ValueError(f"Splits not of same size: {splits}") + + q = [list(s) for s in splits] + size = sum(q[0]) + cum = 0 + + res = [] + while cum < size: + m = min(s[0] for s in q) + res.append(m) + for s in q: + s[0] -= m + if s[0] == 0: + s.pop(0) + + cum += m + + return tuple(res) + + +def unify_nsplits(*tensor_axes): + tensor_splits = [ + dict((a, split) for a, split in zip(axes, t.nsplits) if split != (1,)) + for t, axes in tensor_axes + if t.nsplits + ] + common_axes = ( + reduce(operator.and_, [set(ts.keys()) for ts in tensor_splits]) + if tensor_splits + else set() + ) + axes_unified_splits = dict( + (ax, decide_unify_split(*(t[ax] for t in tensor_splits))) for ax in common_axes + ) + + if len(common_axes) == 0: + return tuple(t[0] for t in tensor_axes) + + res = [] + for t, axes in tensor_axes: + new_chunk = dict( + (i, axes_unified_splits[ax]) + for ax, i in zip(axes, range(t.ndim)) + if ax in axes_unified_splits + ) + t = yield from recursive_tile(t.rechunk(new_chunk)) + res.append(t) + + return tuple(res) + + +def unify_chunks(*tensors): + tensor_axes = [ + (t, range(t.ndim)) if not isinstance(t, tuple) else t for t in tensors + ] + + if len(tensor_axes) < 2: + return tuple(t[0] if isinstance(t, tuple) else t for t in tensors) + + return (yield from unify_nsplits(*tensor_axes)) + + +def check_out_param(out, t, casting): + from .base import broadcast_to + + if not hasattr(out, 
"shape"): + raise TypeError("return arrays must be a tensor") + + try: + broadcast_to(t, out.shape) + except ValueError: + raise ValueError( + "operands could not be broadcast together " + "with shapes ({0}) ({1})".format( + ",".join(str(s) for s in t.shape), ",".join(str(s) for s in out.shape) + ) + ) + + if not np.can_cast(t.dtype, out.dtype, casting): + raise TypeError( + f"output (typecode '{t.dtype.char}') could not be coerced " + f"to provided output parameter (typecode '{out.dtype.char}') " + f"according to the casting rule ''{casting}''" + ) + + +def dictify_chunk_size(shape, chunk_size): + """ + Given chunk_size which may be a tuple or dict, return a dict type all the same. + + :param shape: tensor's shape + :param chunk_size: if dict provided, it's dimension id to chunk size; + if provided, it's the chunk size for each dimension. + :return: dict form of chunk_size + """ + if chunk_size is not None: + if isinstance(chunk_size, Iterable): + if not isinstance(chunk_size, dict): + chunk_size = {i: c for i, c in enumerate(chunk_size)} + elif isinstance(chunk_size, int): + chunk_size = {i: chunk_size for i in range(len(shape))} + else: + raise TypeError(f"chunks must be iterable, got {type(chunk_size)}") + + if chunk_size is None: + chunk_size = dict() + + return chunk_size + + +def decide_chunk_sizes(shape, chunk_size, itemsize): + """ + Decide how a given tensor can be split into chunk. + + :param shape: tensor's shape + :param chunk_size: if dict provided, it's dimension id to chunk size; + if provided, it's the chunk size for each dimension. + :param itemsize: element size + :return: the calculated chunk size for each dimension + :rtype: tuple + """ + + from ..config import options + + chunk_size = dictify_chunk_size(shape, chunk_size) + nleft = len(shape) - len(chunk_size) + if nleft < 0: + raise ValueError("chunks have more dimensions than input tensor") + if nleft == 0: + return normalize_chunk_sizes( + shape, tuple(chunk_size[j] for j in range(len(shape))) + ) + + max_chunk_size = options.chunk_store_limit + + # normalize the dimension which specified first + dim_to_normalized = { + i: normalize_chunk_sizes((shape[i],), (c,))[0] for i, c in chunk_size.items() + } + + left = {j: [] for j in range(len(shape)) if j not in dim_to_normalized} + left_unsplit = {j: shape[j] for j in left} + while True: + nbytes_occupied = ( + np.prod([max(c) for c in dim_to_normalized.values()]) * itemsize + ) + dim_size = np.maximum( + int(np.power(max_chunk_size / nbytes_occupied, 1 / float(len(left)))), 1 + ) + for j, ns in left.copy().items(): + unsplit = left_unsplit[j] + ns.append(int(np.minimum(unsplit, dim_size))) + left_unsplit[j] -= ns[-1] + if left_unsplit[j] <= 0: + dim_to_normalized[j] = tuple(ns) + del left[j] + + if len(left) == 0: + break + + return tuple(dim_to_normalized[i] for i in range(len(dim_to_normalized))) + + +def check_random_state(seed): + """ + Turn seed into a mt.random.RandomState instance + + :param seed: + If seed is None, return the RandomState singleton used by mt.random. + If seed is an int, return a new RandomState instance seeded with seed. + If seed is already a RandomState instance, return it. + Otherwise raise ValueError. + :return: + """ + from numpy import random as np_mtrand + + from . 
import random as mtrand + + if seed is None or seed is mtrand or seed is np_mtrand: + return mtrand._random_state + if isinstance(seed, (Integral, np.integer)): + return mtrand.RandomState(seed) + if isinstance(seed, np.random.RandomState): + return mtrand.RandomState.from_numpy(seed) + if isinstance(seed, mtrand.RandomState): + return seed + raise ValueError(f"{seed} cannot be used to seed a mt.random.RandomState instance") + + +def filter_inputs(inputs): + from ..core import ENTITY_TYPE + + return [inp for inp in inputs if isinstance(inp, ENTITY_TYPE)] + + +# As TileDB Ctx's creation is a bit time-consuming, +# we just cache the Ctx +# also remember the arguments should be hashable +@lru_cache(10) +def _create_tiledb_ctx(conf_tuple): + if conf_tuple is not None: + return tiledb.Ctx(dict(conf_tuple)) + return tiledb.Ctx() + + +def get_tiledb_ctx(conf): + key = tuple(conf.items()) if conf is not None else None + return _create_tiledb_ctx(key) + + +# this function is only used for pandas' compatibility +def to_numpy(pdf): + try: + return pdf.to_numpy() + except AttributeError: # pragma: no cover + return pdf.values + + +def check_order(order_str, available_options="KACF", err_msg="order not understood"): + order_str = order_str.upper() + if order_str not in available_options: + raise TypeError(err_msg) + + +def get_order( + order_str, to_keep_order, available_options="KACF", err_msg="order not understood" +): + from .core import TensorOrder + + check_order(order_str, available_options=available_options, err_msg=err_msg) + + if order_str in "KA": + return to_keep_order + elif order_str == "C": + return TensorOrder.C_ORDER + else: + return TensorOrder.F_ORDER + + +def reverse_order(old_order): + from .core import TensorOrder + + assert isinstance(old_order, TensorOrder) + return ( + TensorOrder.C_ORDER if old_order == TensorOrder.F_ORDER else TensorOrder.F_ORDER + ) + + +def hash_on_axis(ar, axis, n_dest): + ar = np.asarray(ar) + # cannot be scalar + assert ar.ndim > 0 + axis = validate_axis(ar.ndim, axis) + + if n_dest == 1: + return np.zeros(ar.shape[axis], dtype=np.uint32) + + if ar.ndim > 2: + ret = np.empty(ar.shape[axis], dtype=np.uint32) + + def _hash_to_dest(data): + i = data[0] + idx = (slice(None),) * axis + (i,) + ret[i] = hash_from_buffer(memoryview(ar[idx])) % n_dest + + np.apply_along_axis(_hash_to_dest, 0, np.arange(ar.shape[axis])[np.newaxis, :]) + return ret + else: + + def _hash_to_dest(data): + return hash_from_buffer(memoryview(data)) % n_dest + + if ar.ndim == 1: + ar = ar.reshape(ar.size, 1) + return np.apply_along_axis(_hash_to_dest, 1 - axis, ar) + + +def fetch_corner_data(tensor, session=None): + print_option = np.get_printoptions() + # only fetch corner data when data > threshold + threshold = print_option["threshold"] + # number of edge items to print + edgeitems = print_option["edgeitems"] + + # we fetch corner data based on the fact that + # the tensor must have been executed, + # thus the size could not be NaN + if tensor.size > threshold: + # two edges for each exis + indices_iter = list(itertools.product(*(range(2) for _ in range(tensor.ndim)))) + corners = np.empty(shape=(2,) * tensor.ndim, dtype=object) + shape = [0 for _ in range(tensor.ndim)] + for indices in indices_iter: + slc = [] + for ax, i in enumerate(indices): + size = tensor.shape[ax] + if size > edgeitems * 2 + 2: + # fetch two more elements + if i == 0: + slc.append(slice(edgeitems + 1)) + else: + slc.append(slice(-edgeitems - 1, None)) + shape[ax] += edgeitems + 1 + else: + i_sep = size // 2 + 
if i == 0: + slc.append(slice(i_sep)) + shape[ax] += i_sep + else: + slc.append(slice(i_sep, None)) + shape[ax] += size - i_sep + corners[indices] = tensor[tuple(slc)] + # fetch together + fetched = ExecutableTuple(corners.flat).fetch(session=session) + for indices, f in zip(indices_iter, fetched): + corners[indices] = f + return np.block(corners.tolist()) + else: + return tensor.fetch(session=session) + + +def implement_scipy(scipy_fun): + import re + import textwrap + + def wrapper(fun): + if scipy_fun is None: + return None + if not fun.__doc__: + doc_str = textwrap.dedent(scipy_fun.__doc__) + lines = [] + for line in doc_str.splitlines(keepends=False): + # skip function headers + if line.startswith(scipy_fun.__name__ + "("): + continue + # skip version marks + if line.strip().startswith(".. versionadded::"): + continue + # skip examples + if line.strip() == "Examples": + break + lines.append(line) + doc_str = "\n".join(lines).strip() + # remove trailing empty sections + fun.__doc__ = re.sub(r"[A-Za-z]+\n-+$", "", doc_str).strip() + return fun + + return wrapper diff --git a/python/xorbits/_mars/tests/__init__.py b/python/xorbits/_mars/tests/__init__.py new file mode 100644 index 000000000..ce3fa5c45 --- /dev/null +++ b/python/xorbits/_mars/tests/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .core import flaky diff --git a/python/xorbits/_mars/tests/core.py b/python/xorbits/_mars/tests/core.py new file mode 100644 index 000000000..a76952088 --- /dev/null +++ b/python/xorbits/_mars/tests/core.py @@ -0,0 +1,585 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
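Editor's note: the chunking helpers in `tensor/utils.py` above are pure functions, so their behaviour is easy to check in isolation. A small sketch (the import path is an assumption; the expected values follow directly from the definitions above):

from xorbits._mars.tensor.utils import normalize_chunk_sizes, slice_split  # assumed path

# A 10 x 8 tensor split with chunk_size (3, 4): the remainder of each
# dimension ends up in a smaller trailing chunk.
assert normalize_chunk_sizes((10, 8), (3, 4)) == ((3, 3, 3, 1), (4, 4))

# slice_split maps a global slice onto per-chunk slices keyed by chunk index;
# a chunk that is fully covered collapses to slice(None).
assert slice_split(slice(2, 7), [3, 3, 3]) == {
    0: slice(2, 3, 1),
    1: slice(None, None, None),
    2: slice(0, 1, 1),
}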
+ +import fnmatch +import functools +import inspect +import itertools +import logging +import os +import sys +import time +import types +from typing import Dict + +import numpy as np +import pandas as pd +import pytest + +try: + from flaky import flaky as _raw_flaky +except ImportError: + _raw_flaky = None +try: + import mock +except ImportError: + from unittest import mock +_mock = mock + +from ..core.operand import OperandStage +from ..utils import lazy_import + +cupy = lazy_import("cupy") +cudf = lazy_import("cudf") +ray = lazy_import("ray") +ucx = lazy_import("ucp") + +logger = logging.getLogger(__name__) + + +def flaky(o=None, *args, **kwargs): + platform = kwargs.pop("platform", "") + if _raw_flaky is None or not sys.platform.startswith(platform): + if o is not None: + return o + + def ident(x): + return x + + return ident + elif o is not None: + return _raw_flaky(o, *args, **kwargs) + else: + return _raw_flaky(*args, **kwargs) + + +def patch_method(method, *args, **kwargs): + if hasattr(method, "__qualname__"): + return mock.patch( + method.__module__ + "." + method.__qualname__, *args, **kwargs + ) + elif hasattr(method, "im_class"): + return mock.patch( + ".".join( + [method.im_class.__module__, method.im_class.__name__, method.__name__] + ), + *args, + **kwargs, + ) + else: + return mock.patch(method.__module__ + "." + method.__name__, *args, **kwargs) + + +def patch_cls(target_cls): + def _wrapper(cls): + class Super(cls.__bases__[0]): + pass + + cls.__patch_super__ = Super + + target = target_cls.__module__ + "." + target_cls.__qualname__ + for name, obj in cls.__dict__.items(): + if name.startswith("__") and name != "__init__": + continue + p = mock.patch(target + "." + name, obj, create=True) + original, local = p.get_original() + setattr(cls.__patch_super__, name, original) + p.start() + + return cls + + return _wrapper + + +def patch_super(): + back = inspect.currentframe().f_back + if not back or "__class__" not in back.f_locals: + raise RuntimeError("Calling super() in the incorrect context.") + + patch_super_cls = back.f_locals["__class__"].__patch_super__ + patch_self = back.f_locals.get("self") + + class _SuperAccessor: + def __getattribute__(self, item): + func = getattr(patch_super_cls, item) + if func == mock.DEFAULT: + raise AttributeError(f"super object has no attribute '{item}'") + if patch_self: + return types.MethodType(func, patch_self) + return func + + return _SuperAccessor() + + +def print_entrance(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + print( + f"Start to execute function {func} with args {args} and kwargs {kwargs}" + ) + result = func(*args, **kwargs) + print( + f"Finished executing function {func} with args {args} and kwargs {kwargs}" + ) + return result + except NotImplementedError: + return NotImplemented + + return wrapper + + +def print_async_entrance(func): + @functools.wraps(func) + async def wrapper(*args, **kwargs): + try: + print( + f"Start to execute function {func} with args {args} and kwargs {kwargs}" + ) + result = await func(*args, **kwargs) + print( + f"Finished executing function {func} with args {args} and kwargs {kwargs}" + ) + return result + except NotImplementedError: + return NotImplemented + + return wrapper + + +def require_cupy(func): + if pytest: + func = pytest.mark.cuda(func) + func = pytest.mark.skipif(cupy is None, reason="cupy not installed")(func) + return func + + +def require_cudf(func): + if pytest: + func = pytest.mark.cuda(func) + func = pytest.mark.skipif(cudf is None, reason="cudf 
not installed")(func) + return func + + +def require_ray(func): + if pytest: + func = pytest.mark.ray(func) + func = pytest.mark.skipif(ray is None, reason="ray not installed")(func) + return func + + +def require_ucx(func): + if pytest: + func = pytest.mark.ucx(func) + func = pytest.mark.skipif(ucx is None, reason="ucx not installed")(func) + return func + + +def require_hadoop(func): + if pytest: + func = pytest.mark.hadoop(func) + func = pytest.mark.skipif( + not os.environ.get("WITH_HADOOP"), reason="Only run when hadoop is installed" + )(func) + return func + + +def assert_groupby_equal( + left, right, sort_keys=False, sort_index=True, with_selection=False +): + if hasattr(left, "groupby_obj"): + left = left.groupby_obj + if hasattr(right, "groupby_obj"): + right = right.groupby_obj + + if type(left) is not type(right): + raise AssertionError( + f"Type of groupby not consistent: {type(left)} != {type(right)}" + ) + + left_selection = getattr(left, "_selection", None) + right_selection = getattr(right, "_selection", None) + if sort_keys: + left = sorted(left, key=lambda p: p[0]) + right = sorted(right, key=lambda p: p[0]) + else: + left, right = list(left), list(right) + if sort_index: + left = [(k, v.sort_index()) for k, v in left] + right = [(k, v.sort_index()) for k, v in right] + + if len(left) != len(right): + raise AssertionError( + f"Count of groupby keys not consistent: {len(left)} != {len(right)}" + ) + + left_keys = [p[0] for p in left] + right_keys = [p[0] for p in right] + if left_keys != right_keys: + raise AssertionError( + f"Group keys not consistent: {left_keys!r} != {right_keys!r}" + ) + for (left_key, left_frame), (right_key, right_frame) in zip(left, right): + if with_selection: + if left_selection and isinstance(left_frame, pd.DataFrame): + left_frame = left_frame[left_selection] + if right_selection and isinstance(right_frame, pd.DataFrame): + right_frame = right_frame[right_selection] + + if isinstance(left_frame, pd.DataFrame): + pd.testing.assert_frame_equal(left_frame, right_frame) + else: + pd.testing.assert_series_equal(left_frame, right_frame) + + +_check_options = dict() +_check_args = [ + "check_all", + "check_series_name", + "check_index_name", + "check_dtypes", + "check_dtype", + "check_shape", + "check_nsplits", + "check_index_value", + "check_columns_value", +] + + +class ObjectCheckMixin: + _check_options: Dict + + @staticmethod + def adapt_index_value(value): + if hasattr(value, "to_pandas"): + return value.to_pandas() + return value + + def assert_shape_consistent(self, expected_shape, real_shape): + if not self._check_options["check_shape"] or not expected_shape: + return + + if len(expected_shape) != len(real_shape): + raise AssertionError( + f"ndim in metadata {len(expected_shape)} is not consistent " + f"with real ndim {len(real_shape)}" + ) + for e, r in zip(expected_shape, real_shape): + if not np.isnan(e) and e != r: + raise AssertionError( + f"shape in metadata {expected_shape!r} is not consistent " + f"with real shape {real_shape!r}" + ) + + @staticmethod + def assert_dtype_consistent(expected_dtype, real_dtype): + cate_dtypes = [pd.CategoricalDtype] + if cudf: + cate_dtypes.append(cudf.CategoricalDtype) + cate_dtypes = tuple(cate_dtypes) + + if isinstance(real_dtype, pd.DatetimeTZDtype): + real_dtype = real_dtype.base + if expected_dtype != real_dtype: + if expected_dtype == np.dtype("O") and real_dtype.type is np.str_: + # real dtype is string, this matches expectation + return + if expected_dtype is None: + raise AssertionError("Expected 
dtype cannot be None") + if isinstance(real_dtype, cate_dtypes) and isinstance( + expected_dtype, cate_dtypes + ): + return + if not np.can_cast(real_dtype, expected_dtype) and not np.can_cast( + expected_dtype, real_dtype + ): + raise AssertionError( + f"cannot cast between dtype of real dtype {real_dtype} " + f"and dtype {expected_dtype} defined in metadata" + ) + + def assert_tensor_consistent(self, expected, real): + from ..lib.sparse import SparseNDArray + + np_types = (np.generic, np.ndarray, pd.Timestamp, SparseNDArray) + if cupy is not None: + np_types += (cupy.ndarray,) + + if isinstance(real, tuple): + # allow returning a batch of chunks for some operands + real = real[0] + if isinstance(real, (str, int, bool, float, complex)): + real = np.array([real])[0] + if not isinstance(real, np_types): + raise AssertionError( + f"Type of real value ({type(real)}) not one of {np_types!r}" + ) + if not hasattr(expected, "dtype"): + return + if self._check_options["check_dtypes"]: + try: + self.assert_dtype_consistent(expected.dtype, real.dtype) + except AssertionError as ex: + if hasattr(expected, "op"): + raise AssertionError( + f"dtype assertion error: {ex}, source operand {expected.op}" + ) + else: + raise + if self._check_options["check_shape"]: + self.assert_shape_consistent(expected.shape, real.shape) + + @classmethod + def assert_index_value_consistent(cls, expected_index_value, real_index): + if expected_index_value is not None and expected_index_value.has_value(): + expected_index = expected_index_value.to_pandas() + try: + pd.testing.assert_index_equal( + expected_index, cls.adapt_index_value(real_index) + ) + except AssertionError as e: + raise AssertionError( + f"Index of real value ({real_index}) not equal to ({expected_index})" + ) from e + + def assert_dataframe_consistent(self, expected, real): + dataframe_types = (pd.DataFrame,) + if cudf is not None: + dataframe_types += (cudf.DataFrame,) + + if isinstance(real, tuple): + # allow returning a batch of chunks for some operands + real = real[0] + if not isinstance(real, dataframe_types): + raise AssertionError(f"Type of real value ({type(real)}) not DataFrame") + if expected.shape is None: + return + self.assert_shape_consistent(expected.shape, real.shape) + if not np.isnan(expected.shape[1]) and expected.dtypes is not None: + if self._check_options["check_dtypes"]: + # ignore check when columns length is nan or dtypes undefined + pd.testing.assert_index_equal( + expected.dtypes.index, self.adapt_index_value(real.dtypes.index) + ) + + try: + for expected_dtype, real_dtype in zip(expected.dtypes, real.dtypes): + self.assert_dtype_consistent(expected_dtype, real_dtype) + except AssertionError: + raise AssertionError( + f"dtypes in metadata {expected.dtype} cannot cast " + f"to real dtype {real.dtype}" + ) + + if self._check_options["check_columns_value"] and not np.isnan( + expected.shape[1] + ): + self.assert_index_value_consistent(expected.columns_value, real.columns) + if self._check_options["check_index_value"] and not np.isnan(expected.shape[0]): + self.assert_index_value_consistent(expected.index_value, real.index) + + def assert_series_consistent(self, expected, real): + series_types = (pd.Series,) + if cudf is not None: + series_types += (cudf.Series,) + + if not isinstance(real, series_types): + raise AssertionError(f"Type of real value ({type(real)}) not Series") + self.assert_shape_consistent(expected.shape, real.shape) + + if self._check_options["check_series_name"]: + if expected.name is not None and expected.name 
!= real.name: + raise AssertionError( + f"series name in metadata {expected.name} " + f"is not equal to real name {real.name}" + ) + + self.assert_dtype_consistent(expected.dtype, real.dtype) + if self._check_options["check_index_value"]: + self.assert_index_value_consistent(expected.index_value, real.index) + + def assert_groupby_consistent(self, expected, real): + from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy + + from ..dataframe.core import ( + DATAFRAME_GROUPBY_CHUNK_TYPE, + DATAFRAME_GROUPBY_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_TYPE, + ) + from ..lib.groupby_wrapper import GroupByWrapper + + df_groupby_types = (DataFrameGroupBy,) + series_groupby_types = (SeriesGroupBy,) + + try: + from cudf.core.groupby.groupby import DataFrameGroupBy as CUDataFrameGroupBy + from cudf.core.groupby.groupby import SeriesGroupBy as CUSeriesGroupBy + + df_groupby_types += (CUDataFrameGroupBy,) + series_groupby_types += (CUSeriesGroupBy,) + except ImportError: + pass + + if isinstance(real, GroupByWrapper): + real = real.groupby_obj + + if isinstance( + expected, (DATAFRAME_GROUPBY_TYPE, DATAFRAME_GROUPBY_CHUNK_TYPE) + ) and isinstance(real, df_groupby_types): + selection = getattr(real, "_selection", None) + if not selection: + self.assert_dataframe_consistent(expected, real.obj) + else: + self.assert_dataframe_consistent(expected, real.obj[selection]) + elif isinstance( + expected, (SERIES_GROUPBY_TYPE, SERIES_GROUPBY_CHUNK_TYPE) + ) and isinstance(real, series_groupby_types): + self.assert_series_consistent(expected, real.obj) + else: + raise AssertionError( + "GroupBy type not consistent. Expecting %r but receive %r" + % (type(expected), type(real)) + ) + + def assert_index_consistent(self, expected, real): + index_types = (pd.Index,) + if cudf is not None: + index_types += (cudf.Index,) + + if not isinstance(real, index_types): + raise AssertionError(f"Type of real value ({type(real)}) not Index") + self.assert_shape_consistent(expected.shape, real.shape) + + if self._check_options["check_series_name"] and expected.name != real.name: + raise AssertionError( + f"series name in metadata {expected.name} is not equal to " + f"real name {real.name}" + ) + + self.assert_dtype_consistent(expected.dtype, real.dtype) + self.assert_index_value_consistent(expected.index_value, real) + + def assert_categorical_consistent(self, expected, real): + if not isinstance(real, pd.Categorical): + raise AssertionError(f"Type of real value ({type(real)}) not Categorical") + self.assert_dtype_consistent(expected.dtype, real.dtype) + self.assert_shape_consistent(expected.shape, real.shape) + self.assert_index_value_consistent(expected.categories_value, real.categories) + + def assert_object_consistent(self, expected, real): + from ..dataframe.core import ( + CATEGORICAL_CHUNK_TYPE, + CATEGORICAL_TYPE, + DATAFRAME_CHUNK_TYPE, + DATAFRAME_TYPE, + GROUPBY_CHUNK_TYPE, + GROUPBY_TYPE, + INDEX_CHUNK_TYPE, + INDEX_TYPE, + SERIES_CHUNK_TYPE, + SERIES_TYPE, + ) + from ..tensor.core import TENSOR_CHUNK_TYPE, TENSOR_TYPE + + op = getattr(expected, "op", None) + if op and getattr(op, "stage", None) == OperandStage.map: + return + + if isinstance(expected, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)): + self.assert_tensor_consistent(expected, real) + elif isinstance(expected, (DATAFRAME_TYPE, DATAFRAME_CHUNK_TYPE)): + self.assert_dataframe_consistent(expected, real) + elif isinstance(expected, (SERIES_TYPE, SERIES_CHUNK_TYPE)): + self.assert_series_consistent(expected, real) + elif isinstance(expected, 
(GROUPBY_TYPE, GROUPBY_CHUNK_TYPE)): + self.assert_groupby_consistent(expected, real) + elif isinstance(expected, (INDEX_TYPE, INDEX_CHUNK_TYPE)): + self.assert_index_consistent(expected, real) + elif isinstance(expected, (CATEGORICAL_TYPE, CATEGORICAL_CHUNK_TYPE)): + self.assert_categorical_consistent(expected, real) + + +DICT_NOT_EMPTY = type("DICT_NOT_EMPTY", (object,), {}) # is check works for deepcopy + + +def check_dict_structure_same(a, b, prefix=None): + def _p(k): + if prefix is None: + return k + return ".".join(str(i) for i in prefix + [k]) + + for ai, bi in itertools.zip_longest( + a.items(), b.items(), fillvalue=("_KEY_NOT_EXISTS_", None) + ): + if ai[0] != bi[0]: + if "*" in ai[0]: + pattern, target = ai[0], bi[0] + elif "*" in bi[0]: + pattern, target = bi[0], ai[0] + else: + raise KeyError(f"Key {_p(ai[0])} != {_p(bi[0])}") + if not fnmatch.fnmatch(target, pattern): + raise KeyError(f"Key {_p(target)} not match {_p(pattern)}") + + if ai[1] is DICT_NOT_EMPTY: + target = bi[1] + elif bi[1] is DICT_NOT_EMPTY: + target = ai[1] + else: + target = None + if target is not None: + if not isinstance(target, dict): + raise TypeError(f"Value type of {_p(ai[0])} is not a dict.") + if not target: + raise TypeError(f"Value of {_p(ai[0])} empty.") + continue + + if type(ai[1]) is not type(bi[1]): + raise TypeError(f"Value type of {_p(ai[0])} mismatch {ai[1]} != {bi[1]}") + if isinstance(ai[1], dict): + check_dict_structure_same( + ai[1], bi[1], [ai[0]] if prefix is None else prefix + [ai[0]] + ) + + +async def wait_for_condition( + condition_predictor, timeout=10, retry_interval_ms=100, **kwargs +): # pragma: no cover + """Wait until a condition is met or time out with an exception. + + Args: + condition_predictor: A function that predicts the condition. + timeout: Maximum timeout in seconds. + retry_interval_ms: Retry interval in milliseconds. + + Raises: + RuntimeError: If the condition is not met before the timeout expires. + """ + start = time.time() + last_ex = None + while time.time() - start <= timeout: + try: + pred = condition_predictor(**kwargs) + if inspect.isawaitable(pred): + pred = await pred + if pred: + return + except Exception as ex: + last_ex = ex + time.sleep(retry_interval_ms / 1000.0) + message = "The condition wasn't met before the timeout expired." + if last_ex is not None: + message += f" Last exception: {last_ex}" + raise RuntimeError(message) diff --git a/python/xorbits/_mars/tests/test_cluster.py b/python/xorbits/_mars/tests/test_cluster.py new file mode 100644 index 000000000..71cada2cd --- /dev/null +++ b/python/xorbits/_mars/tests/test_cluster.py @@ -0,0 +1,120 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import subprocess +import sys +import tempfile + +import psutil +import pytest + +from .. import new_session +from .. 
import tensor as mt +from ..services.cluster import NodeRole, WebClusterAPI +from ..utils import get_next_port + +CONFIG_CONTENT = """\ +"@inherits": "@mars/config.yml" +scheduling: + mem_hard_limit: null""" + + +def _terminate(pid: int): + proc = psutil.Process(pid) + sub_pids = [p.pid for p in proc.children(recursive=True)] + proc.terminate() + proc.wait(5) + for p in sub_pids: + try: + proc = psutil.Process(p) + proc.kill() + except psutil.NoSuchProcess: + continue + + +@pytest.mark.asyncio +async def test_cluster(): + port = get_next_port() + web_port = get_next_port() + supervisor_addr = f"127.0.0.1:{port}" + web_addr = f"http://127.0.0.1:{web_port}" + + # gen config file + fd, path = tempfile.mkstemp() + with os.fdopen(fd, mode="w") as f: + f.write(CONFIG_CONTENT) + + w = subprocess.Popen( + [sys.executable, "-m", "mars.worker", "-s", supervisor_addr, "-f", path] + ) + r = subprocess.Popen( + [ + sys.executable, + "-m", + "mars.supervisor", + "-H", + "127.0.0.1", + "-p", + str(port), + "-w", + str(web_port), + "-f", + path, + ], + stderr=subprocess.PIPE, + ) + + for p in [r, w]: + try: + retcode = p.wait(1) + except subprocess.TimeoutExpired: + # supervisor & worker will run forever, + # timeout means everything goes well, at least looks well, + continue + else: + if retcode: + std_err = p.communicate()[1].decode() + _terminate(r.pid) + _terminate(w.pid) + raise RuntimeError("Start cluster failed, stderr: \n" + std_err) + + try: + cluster_api = WebClusterAPI(web_addr) + while True: + try: + jsn = await cluster_api.get_nodes_info(role=NodeRole.WORKER) + except ConnectionError: + await asyncio.sleep(0.5) + continue + if not jsn: + await asyncio.sleep(0.5) + continue + if len(jsn) > 0: + break + + sess = new_session(web_addr, default=True) + a = mt.arange(10) + assert a.sum().to_numpy(show_progress=False) == 45 + + sess2 = new_session(web_addr, session_id=sess.session_id) + sess2.close() + finally: + _terminate(w.pid) + _terminate(r.pid) + + # test stderr + out = r.communicate()[1].decode() + assert f"Supervisor started at {supervisor_addr}, web address: {web_addr}" in out diff --git a/python/xorbits/_mars/tests/test_config.py b/python/xorbits/_mars/tests/test_config.py new file mode 100644 index 000000000..28c67d2cb --- /dev/null +++ b/python/xorbits/_mars/tests/test_config.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
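Editor's note: `wait_for_condition` in `tests/core.py` above polls a plain or async predicate until it returns something truthy, and raises `RuntimeError` (carrying the last swallowed exception, if any) once the timeout expires. A hypothetical usage sketch; the import path is assumed:

import asyncio

from xorbits._mars.tests.core import wait_for_condition  # assumed path

attempts = {"count": 0}

async def becomes_ready():
    # pretend the condition converges after a few polls
    attempts["count"] += 1
    return attempts["count"] >= 3

async def main():
    # polls every 50 ms and raises RuntimeError if still falsy after 5 s
    await wait_for_condition(becomes_ready, timeout=5, retry_interval_ms=50)
    print("condition met after", attempts["count"], "attempts")

asyncio.run(main())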
+ +import pickle +import threading + +import pytest + +from ..config import Config, is_integer, is_string, option_context, options + + +def test_config_context(): + with pytest.raises(AttributeError): + _ = options.a.b.c + + options.register_option("c.d.e", "a", is_string) + assert "c" in dir(options) + assert "d" in dir(options.c) + + try: + with option_context() as ctx: + ctx.register_option("a.b.c", 1, validator=is_integer) + assert ctx.a.b.c == 1 + + ctx.a.b.c = 2 + assert ctx.a.b.c == 2 + + with pytest.raises(ValueError): + ctx.a.b.c = "a" + + assert ctx.c.d.e == "a" + + ctx.c.d.e = "b" + + assert options.c.d.e == "a" + + options.c.d.e = "c" + + assert options.c.d.e == "c" + + with pytest.raises(AttributeError): + _ = options.a.b.c # noqa: F841 + finally: + options.unregister_option("c.d.e") + + +def test_multi_thread_config(): + options.register_option("a.b.c", 1) + + class T(threading.Thread): + def __init__(self, is_first, condition): + super().__init__() + self.is_first = is_first + self.condition = condition + + def run(self): + self.condition.acquire() + if self.is_first: + options.a.b.c = 2 + self.condition.notify() + else: + self.condition.wait() + assert options.a.b.c == 1 + self.condition.release() + + try: + cond = threading.Condition() + a = T(True, cond) + b = T(False, cond) + b.start() + a.start() + a.join() + b.join() + finally: + options.unregister_option("a.b.c") + + +def test_config_copy(): + cfg = Config() + cfg.register_option("a.b.c", 1) + cfg.redirect_option("a.c", "a.b.c") + + target_cfg = Config() + target_cfg.register_option("a.b.c", -1) + target_cfg.redirect_option("a.c", "a.b.c") + + src_cfg_dict = cfg.to_dict() + assert src_cfg_dict == {"a.b.c": 1} + + target_cfg.update(src_cfg_dict) + assert target_cfg.a.b.c == 1 + + +def test_pickle_config(): + cfg = Config() + cfg.register_option("a.b.c", 1) + cfg.redirect_option("a.c", "a.b.c") + + s = pickle.dumps(cfg) + new_cfg = pickle.loads(s) + assert new_cfg.a.b.c == 1 + assert new_cfg.a.c == 1 diff --git a/python/xorbits/_mars/tests/test_eager_mode.py b/python/xorbits/_mars/tests/test_eager_mode.py new file mode 100644 index 000000000..5d1513c4e --- /dev/null +++ b/python/xorbits/_mars/tests/test_eager_mode.py @@ -0,0 +1,177 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from .. import dataframe as md +from .. 
import tensor as mt +from ..config import option_context +from ..dataframe.datasource.dataframe import from_pandas + + +def test_base_execute(setup): + with option_context({"eager_mode": True}): + a_data = np.random.rand(10, 10) + a = mt.tensor(a_data, chunk_size=6) + np.testing.assert_array_equal(a.fetch(), a_data) + + r1 = a + 1 + np.testing.assert_array_equal(r1.fetch(), a_data + 1) + + r2 = 2 * r1 + np.testing.assert_array_equal(r2.fetch(), (a_data + 1) * 2) + + # test add with out + b = mt.ones((10, 10), chunk_size=6) + np.testing.assert_array_equal(b.fetch(), np.ones((10, 10))) + + mt.add(a, b, out=b) + np.testing.assert_array_equal(b.fetch(), a_data + 1) + + # test tensor dot + c_data1 = np.random.rand(10, 10) + c_data2 = np.random.rand(10, 10) + c1 = mt.tensor(c_data1, chunk_size=6) + c2 = mt.tensor(c_data2, chunk_size=6) + r3 = c1.dot(c2) + np.testing.assert_array_almost_equal(r3.fetch(), c_data1.dot(c_data2)) + + +def test_multiple_output_execute(setup): + with option_context({"eager_mode": True}): + data = np.random.random((5, 9)) + + arr1 = mt.tensor(data.copy(), chunk_size=3) + result = mt.modf(arr1) + expected = np.modf(data) + + np.testing.assert_array_equal(result[0].fetch(), expected[0]) + np.testing.assert_array_equal(result[1].fetch(), expected[1]) + + arr3 = mt.tensor(data.copy(), chunk_size=3) + result1, result2, result3 = mt.split(arr3, 3, axis=1) + expected = np.split(data, 3, axis=1) + + np.testing.assert_array_equal(result1.fetch(), expected[0]) + np.testing.assert_array_equal(result2.fetch(), expected[1]) + np.testing.assert_array_equal(result3.fetch(), expected[2]) + + +def test_mixed_config(setup): + a = mt.ones((10, 10), chunk_size=6) + with pytest.raises(ValueError): + a.fetch() + + with option_context({"eager_mode": True}): + b = mt.ones((10, 10), chunk_size=(6, 8)) + np.testing.assert_array_equal(b.fetch(), np.ones((10, 10))) + + r = b + 1 + np.testing.assert_array_equal(r.fetch(), np.ones((10, 10)) * 2) + + r2 = b.dot(b) + np.testing.assert_array_equal(r2.fetch(), np.ones((10, 10)) * 10) + + c = mt.ones((10, 10), chunk_size=6) + with pytest.raises(ValueError): + c.fetch() + np.testing.assert_array_equal(c.execute(), np.ones((10, 10))) + + r = c.dot(c) + with pytest.raises(ValueError): + r.fetch() + np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10) + + +@pytest.mark.ray_dag +def test_index(setup): + with option_context({"eager_mode": True}): + a = mt.random.rand(10, 5, chunk_size=5) + idx = slice(0, 5), slice(0, 5) + a[idx] = 1 + np.testing.assert_array_equal(a.fetch()[idx], np.ones((5, 5))) + + split1, split2 = mt.split(a, 2) + np.testing.assert_array_equal(split1.fetch(), np.ones((5, 5))) + + # test bool indexing + a = mt.random.rand(8, 8, chunk_size=4) + set_value = mt.ones((2, 2)) * 2 + a[4:6, 4:6] = set_value + b = a[a > 1] + assert b.shape == (4,) + np.testing.assert_array_equal(b.fetch(), np.ones((4,)) * 2) + + c = b.reshape((2, 2)) + assert c.shape == (2, 2) + np.testing.assert_array_equal(c.fetch(), np.ones((2, 2)) * 2) + + +def test_repr_tensor(setup): + a = mt.ones((10, 10), chunk_size=3) + assert a.key in repr(a) + + assert repr(np.ones((10, 10))) not in repr(a) + assert str(np.ones((10, 10))) not in str(a) + + with option_context({"eager_mode": True}): + a = mt.ones((10, 10)) + assert repr(np.ones((10, 10))) == repr(a) + assert str(np.ones((10, 10))) == str(a) + + +def test_repr_dataframe(setup): + x = pd.DataFrame(np.ones((10, 10))) + + with option_context({"eager_mode": True}): + a = md.DataFrame(np.ones((10, 10)), 
chunk_size=3) + assert repr(x) in repr(a) + assert str(x) in str(a) + + a = md.DataFrame(np.ones((10, 10)), chunk_size=3) + assert repr(x) not in repr(a) + assert str(x) not in str(a) + + +def test_view(setup): + with option_context({"eager_mode": True}): + data = np.random.rand(10, 20) + a = mt.tensor(data, chunk_size=5) + b = a[0][1:4] + b[1] = 10 + + npa = data.copy() + npb = npa[0][1:4] + npb[1] = 10 + + np.testing.assert_array_equal(a.fetch(), npa) + np.testing.assert_array_equal(b.fetch(), npb) + + +def test_dataframe(setup): + with option_context({"eager_mode": True}): + from ..dataframe.arithmetic import add + + data1 = pd.DataFrame(np.random.rand(10, 10)) + df1 = from_pandas(data1, chunk_size=5) + pd.testing.assert_frame_equal(df1.fetch(), data1) + + data2 = pd.DataFrame(np.random.rand(10, 10)) + df2 = from_pandas(data2, chunk_size=6) + pd.testing.assert_frame_equal(df2.fetch(), data2) + + df3 = add(df1, df2) + pd.testing.assert_frame_equal(df3.fetch(), data1 + data2) diff --git a/python/xorbits/_mars/tests/test_resource.py b/python/xorbits/_mars/tests/test_resource.py new file mode 100644 index 000000000..a0f35aeb9 --- /dev/null +++ b/python/xorbits/_mars/tests/test_resource.py @@ -0,0 +1,225 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
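+# The fixtures below mirror the two cgroup layouts the resource module reads:
+# the v1 CPU accounting file reports cumulative CPU time in nanoseconds while
+# the v2 cpu.stat `usage_usec` field is in microseconds, which is why the v1
+# sample values are roughly 1000x the v2 ones. As a rough sanity check of the
+# 50 < cpu_percent < 150 assertions in test_use_c_group_stats (assuming the
+# ~0.5 s sleep between the two samples):
+#
+#     delta_ns = 8679429771672 - 8678870951786  # ~5.59e8 ns of CPU time
+#     delta_s = delta_ns / 1e9                  # ~0.56 s
+#     cpu_percent ~ 100 * delta_s / 0.5         # ~112, i.e. between 50 and 150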
+
+import importlib
+import os
+import tempfile
+import time
+
+import pytest
+
+from ..resource import Resource, ZeroResource
+
+_v1_cpu_stat_first = "8678870951786"
+_v1_cpu_stat_last = "8679429771672"
+
+# just a fragment of real cpu.stat
+_v2_cpu_stat_first = """
+usage_usec 8678870951
+"""
+_v2_cpu_stat_last = """
+usage_usec 8679429771
+"""
+
+# just a fragment of real memory.stat
+_v1_memory_stat_content = """
+cache 489275392
+rss 218181632
+mapped_file 486768640
+swap 0
+inactive_anon 486744064
+active_anon 218103808
+inactive_file 2457600
+active_file 73728
+hierarchical_memory_limit 1073741824
+"""
+
+_v2_memory_current_content = "218181632\n"
+_v2_memory_max_content = "1073741824\n"
+
+
+def test_stats():
+    from .. import resource
+
+    resource = importlib.reload(resource)
+    resource.cpu_percent()
+
+    mem_stats = resource.virtual_memory()
+    assert mem_stats.available >= 0
+    assert mem_stats.total >= 0
+    assert mem_stats.percent >= 0
+    assert mem_stats.used >= 0
+    assert mem_stats.free >= 0
+
+    cpu_usage = resource.cpu_percent()
+    time.sleep(0.1)
+    assert cpu_usage >= 0
+
+    resource.disk_io_usage()
+    time.sleep(0.1)
+    recv_speed, send_speed = resource.disk_io_usage()
+    assert recv_speed >= 0
+    assert send_speed >= 0
+
+    curdir = os.path.dirname(os.path.abspath(__file__))
+    resource.disk_io_usage(curdir)
+    time.sleep(0.1)
+    usage = resource.disk_io_usage(curdir)
+    if usage is not None:
+        assert usage.reads >= 0
+        assert usage.writes >= 0
+
+    resource.net_io_usage()
+    time.sleep(0.1)
+    recv_speed, send_speed = resource.net_io_usage()
+    assert recv_speed >= 0
+    assert send_speed >= 0
+
+
+def test_use_process_stats():
+    from .. import resource
+
+    cpu_total = resource.cpu_count()
+    mem_total = resource.virtual_memory().total
+    try:
+        os.environ["MARS_USE_PROCESS_STAT"] = "1"
+        os.environ["MARS_CPU_TOTAL"] = str(cpu_total)
+        os.environ["MARS_MEMORY_TOTAL"] = str(mem_total)
+
+        resource = importlib.reload(resource)
+        resource.cpu_percent()
+        time.sleep(0.5)
+
+        mem_stats = resource.virtual_memory()
+        assert mem_stats.available >= 0
+        assert mem_stats.total >= 0
+        assert mem_stats.percent >= 0
+        assert mem_stats.used >= 0
+        assert mem_stats.free >= 0
+
+        cpu_usage = resource.cpu_percent()
+        assert cpu_usage >= 0
+        cpu_usage = resource.cpu_percent()
+        assert cpu_usage >= 0
+    finally:
+        del os.environ["MARS_USE_PROCESS_STAT"]
+        del os.environ["MARS_CPU_TOTAL"]
+        del os.environ["MARS_MEMORY_TOTAL"]
+        importlib.reload(resource)
+
+
+@pytest.mark.parametrize("cgroup_ver", ["v1", "v2"])
+def test_use_c_group_stats(cgroup_ver):
+    from .. import resource
+
+    def write_tmp_text_file(prefix, content):
+        fd, file_name = tempfile.mkstemp(prefix)
+        with os.fdopen(fd, "w") as f:
+            f.write(content)
+        return file_name
+
+    v1_cpu_acct_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v1-cpu-", _v1_cpu_stat_first
+    )
+    v1_mem_stat_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v1-mem-", _v1_memory_stat_content
+    )
+    v2_cpu_stat_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v2-cpu-", _v2_cpu_stat_first
+    )
+    v2_mem_cur_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v2-cpu-", _v2_memory_current_content
+    )
+    v2_mem_max_path = write_tmp_text_file(
+        "test-mars-res-cgroup-v2-cpu-", _v2_memory_max_content
+    )
+
+    old_is_cgroup_v2 = resource._is_cgroup_v2
+    old_v1_cpu_acct_file = resource.CGROUP_V1_CPU_ACCT_FILE
+    old_v1_mem_stat_file = resource.CGROUP_V1_MEM_STAT_FILE
+    old_v2_cpu_stat_file = resource.CGROUP_V2_CPU_STAT_FILE
+    old_v2_mem_current_file =
resource.CGROUP_V2_MEM_CURRENT_FILE + old_v2_mem_max_file = resource.CGROUP_V2_MEM_MAX_FILE + old_shm_path = resource._shm_path + try: + os.environ["MARS_USE_CGROUP_STAT"] = "1" + + resource = importlib.reload(resource) + if cgroup_ver == "v1": + resource.CGROUP_V1_CPU_ACCT_FILE = v1_cpu_acct_path + resource.CGROUP_V1_MEM_STAT_FILE = v1_mem_stat_path + resource._is_cgroup_v2 = False + else: + resource.CGROUP_V2_CPU_STAT_FILE = v2_cpu_stat_path + resource.CGROUP_V2_MEM_CURRENT_FILE = v2_mem_cur_path + resource.CGROUP_V2_MEM_MAX_FILE = v2_mem_max_path + resource._is_cgroup_v2 = True + resource._shm_path = None + + assert resource.cpu_percent() is None + time.sleep(0.5) + with open(v1_cpu_acct_path, "w") as f: + f.write(_v1_cpu_stat_last) + with open(v2_cpu_stat_path, "w") as f: + f.write(_v2_cpu_stat_last) + assert resource.cpu_percent() > 50 + assert resource.cpu_percent() < 150 + + mem_stats = resource.virtual_memory() + assert mem_stats.total == 1073741824 + assert mem_stats.used == 218181632 + finally: + resource._is_cgroup_v2 = old_is_cgroup_v2 + resource._shm_path = old_shm_path + resource.CGROUP_V1_CPU_ACCT_FILE = old_v1_cpu_acct_file + resource.CGROUP_V1_MEM_STAT_FILE = old_v1_mem_stat_file + resource.CGROUP_V2_CPU_STAT_FILE = old_v2_cpu_stat_file + resource.CGROUP_V2_MEM_CURRENT_FILE = old_v2_mem_current_file + resource.CGROUP_V2_MEM_MAX_FILE = old_v2_mem_max_file + + del os.environ["MARS_USE_CGROUP_STAT"] + + os.unlink(v1_cpu_acct_path) + os.unlink(v1_mem_stat_path) + os.unlink(v2_cpu_stat_path) + os.unlink(v2_mem_cur_path) + os.unlink(v2_mem_max_path) + + importlib.reload(resource) + + +def test_resource(): + assert Resource(num_cpus=1) + Resource(num_cpus=1) == Resource(num_cpus=2) + assert Resource(num_cpus=1) + Resource(num_gpus=1) + Resource( + mem_bytes=1024**3 + ) == Resource(num_cpus=1, num_gpus=1, mem_bytes=1024**3) + assert -Resource(num_cpus=1, num_gpus=1, mem_bytes=1024**3) == Resource( + num_cpus=-1, num_gpus=-1, mem_bytes=-(1024**3) + ) + assert Resource(num_cpus=-1) < ZeroResource + assert Resource(num_gpus=-1) < ZeroResource + assert Resource(mem_bytes=-1) < ZeroResource + assert Resource(num_cpus=1, num_gpus=1, mem_bytes=-(1024**3)) < ZeroResource + assert Resource(num_cpus=1, num_gpus=1, mem_bytes=1024**3) > Resource( + num_cpus=10, num_gpus=1, mem_bytes=1024 + ) + assert Resource(num_cpus=1, num_gpus=10, mem_bytes=1024**3) > Resource( + num_cpus=10, num_gpus=1, mem_bytes=1024**3 + ) + assert Resource(num_cpus=100, num_gpus=10, mem_bytes=1024**3) > Resource( + num_cpus=10, num_gpus=10, mem_bytes=1024**3 + ) + assert Resource(num_cpus=100, num_gpus=10, mem_bytes=1024) - Resource( + num_cpus=10, num_gpus=20, mem_bytes=512 + ) == Resource(num_cpus=90, num_gpus=-10, mem_bytes=512) diff --git a/python/xorbits/_mars/tests/test_session.py b/python/xorbits/_mars/tests/test_session.py new file mode 100644 index 000000000..b0de521b1 --- /dev/null +++ b/python/xorbits/_mars/tests/test_session.py @@ -0,0 +1,529 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import os +import re +import sys +import tempfile +from collections import namedtuple + +import numpy as np +import pandas as pd +import pytest + +try: + import pyarrow as pa +except ImportError: # pragma: no cover + pa = None + +from .. import dataframe as md +from .. import remote as mr +from .. import tensor as mt +from ..config import option_context +from ..deploy.utils import load_service_config_file +from ..session import execute, fetch, fetch_log + +test_namedtuple_type = namedtuple("TestNamedTuple", "a b") + + +@pytest.fixture +def setup(): + from ..deploy.oscar.tests.session import new_test_session + + sess = new_test_session(address="127.0.0.1", init_local=True, default=True) + with option_context({"show_progress": False}): + try: + from .. import __version__ as mars_version + + assert sess.get_cluster_versions() == [mars_version] + yield sess + finally: + sess.stop_server() + + +def test_session_async_execute(setup): + raw_a = np.random.RandomState(0).rand(10, 20) + a = mt.tensor(raw_a) + + expected = raw_a.sum() + res = a.sum().to_numpy(wait=False).result() + assert expected == res + res = a.sum().execute(wait=False) + res = res.result().fetch() + assert expected == res + + raw_df = pd.DataFrame(raw_a) + + expected = raw_df.skew() + df = md.DataFrame(a) + res = df.skew().to_pandas(wait=False).result() + pd.testing.assert_series_equal(expected, res) + res = df.skew().execute(wait=False) + res = res.result().fetch() + pd.testing.assert_series_equal(expected, res) + + t = [df.sum(), a.sum()] + res = mt.ExecutableTuple(t).to_object(wait=False).result() + pd.testing.assert_series_equal(raw_df.sum(), res[0]) + assert raw_a.sum() == res[1] + res = mt.ExecutableTuple(t).execute(wait=False) + res = fetch(*res.result()) + pd.testing.assert_series_equal(raw_df.sum(), res[0]) + assert raw_a.sum() == res[1] + + +def test_executable_tuple_execute(setup): + raw_a = np.random.RandomState(0).rand(10, 20) + a = mt.tensor(raw_a) + + raw_df = pd.DataFrame(raw_a) + df = md.DataFrame(raw_df) + + tp = test_namedtuple_type(a, df) + executable_tp = mt.ExecutableTuple(tp) + + assert "a" in dir(executable_tp) + assert executable_tp.a is a + assert test_namedtuple_type.__name__ in repr(executable_tp) + with pytest.raises(AttributeError): + getattr(executable_tp, "c") + + res = mt.ExecutableTuple(tp).execute().fetch() + assert test_namedtuple_type is type(res) + + np.testing.assert_array_equal(raw_a, res.a) + pd.testing.assert_frame_equal(raw_df, res.b) + + +def test_multiple_output_execute(setup): + data = np.random.random((5, 9)) + + # test multiple outputs + arr1 = mt.tensor(data.copy(), chunk_size=3) + result = mt.modf(arr1).execute().fetch() + expected = np.modf(data) + + np.testing.assert_array_equal(result[0], expected[0]) + np.testing.assert_array_equal(result[1], expected[1]) + + # test 1 output + arr2 = mt.tensor(data.copy(), chunk_size=3) + result = ((arr2 + 1) * 2).to_numpy() + expected = (data + 1) * 2 + + np.testing.assert_array_equal(result, expected) + + # test multiple outputs, but only execute 1 + arr3 = mt.tensor(data.copy(), chunk_size=3) + arrs = mt.split(arr3, 3, axis=1) + result = arrs[0].to_numpy() + expected = np.split(data, 3, axis=1)[0] + + np.testing.assert_array_equal(result, expected) + + # test multiple outputs, but only execute 1 + data = np.random.randint(0, 10, (5, 5)) + arr3 = (mt.tensor(data) + 1) * 2 + arrs = mt.linalg.qr(arr3) + result = (arrs[0] + 
1).to_numpy() + expected = np.linalg.qr((data + 1) * 2)[0] + 1 + + np.testing.assert_array_almost_equal(result, expected) + + result = (arrs[0] + 2).to_numpy() + expected = np.linalg.qr((data + 1) * 2)[0] + 2 + + np.testing.assert_array_almost_equal(result, expected) + + s = mt.shape(0) + + result = s.execute().fetch() + expected = np.shape(0) + assert result == expected + + +def test_closed_session(): + from ..deploy.oscar.tests.session import new_test_session + + session = new_test_session(default=True) + with option_context({"show_progress": False}): + arr = mt.ones((10, 10)) + try: + result = session.execute(arr) + + np.testing.assert_array_equal(result, np.ones((10, 10))) + + # close session + session.close() + + with pytest.raises(RuntimeError): + session.execute(arr) + + with pytest.raises(RuntimeError): + session.execute(arr + 1) + finally: + session.stop_server() + + +def test_array_protocol(setup): + arr = mt.ones((10, 20)) + + result = np.asarray(arr) + np.testing.assert_array_equal(result, np.ones((10, 20))) + + arr2 = mt.ones((10, 20)) + + result = np.asarray(arr2, mt.bool) + np.testing.assert_array_equal(result, np.ones((10, 20), dtype=np.bool_)) + + arr3 = mt.ones((10, 20)).sum() + + result = np.asarray(arr3) + np.testing.assert_array_equal(result, np.asarray(200)) + + arr4 = mt.ones((10, 20)).sum() + + result = np.asarray(arr4, dtype=np.float_) + np.testing.assert_array_equal(result, np.asarray(200, dtype=np.float_)) + + +def test_without_fuse(setup): + arr1 = (mt.ones((10, 10), chunk_size=6) + 1) * 2 + r1 = arr1.execute(fuse_enabled=False).fetch() + arr2 = (mt.ones((10, 10), chunk_size=5) + 1) * 2 + r2 = arr2.execute(fuse_enabled=False).fetch() + np.testing.assert_array_equal(r1, r2) + + +@pytest.mark.ray_dag +def test_fetch_slices(setup): + arr1 = mt.random.rand(10, 8, chunk_size=3) + r1 = arr1.execute().fetch() + + r2 = arr1[:2, 3:9].fetch() + np.testing.assert_array_equal(r2, r1[:2, 3:9]) + + r3 = arr1[0].fetch() + np.testing.assert_array_equal(r3, r1[0]) + + +def test_fetch_dataframe_slices(setup): + arr1 = mt.random.rand(10, 8, chunk_size=3) + df1 = md.DataFrame(arr1) + r1 = df1.execute().fetch() + + r2 = df1.iloc[:, :].fetch() + pd.testing.assert_frame_equal(r2, r1.iloc[:, :]) + + r3 = df1.iloc[1].fetch(extra_config={"check_series_name": False}) + pd.testing.assert_series_equal(r3, r1.iloc[1]) + + r4 = df1.iloc[0, 2].fetch() + assert r4 == r1.iloc[0, 2] + + arr2 = mt.random.rand(10, 3, chunk_size=3) + df2 = md.DataFrame(arr2) + r5 = df2.execute().fetch() + + r6 = df2.iloc[:4].fetch(batch_size=3) + pd.testing.assert_frame_equal(r5.iloc[:4], r6) + + +def test_repr(setup): + # test tensor repr + with np.printoptions(threshold=100): + arr = np.random.randint(1000, size=(11, 4, 13)) + + t = mt.tensor(arr, chunk_size=3) + + result = repr(t.execute()) + expected = repr(arr) + assert result == expected + + for size in (5, 58, 60, 62, 64): + pdf = pd.DataFrame(np.random.randint(1000, size=(size, 10))) + + # test DataFrame repr + df = md.DataFrame(pdf, chunk_size=size // 2) + + result = repr(df.execute()) + expected = repr(pdf) + assert result == expected + + # test DataFrame _repr_html_ + result = df.execute()._repr_html_() + expected = pdf._repr_html_() + assert result == expected + + # test Series repr + ps = pdf[0] + s = md.Series(ps, chunk_size=size // 2) + + result = repr(s.execute()) + expected = repr(ps) + assert result == expected + + # test Index repr + pind = pd.date_range("2020-1-1", periods=10) + ind = md.Index(pind, chunk_size=5) + + assert "DatetimeIndex" in 
repr(ind.execute()) + + # test groupby repr + df = md.DataFrame(pd.DataFrame(np.random.rand(100, 3), columns=list("abc"))) + grouped = df.groupby(["a", "b"]).execute() + + assert "DataFrameGroupBy" in repr(grouped) + + # test Categorical repr + c = md.qcut(range(5), 3) + assert "Categorical" in repr(c) + assert "Categorical" in str(c) + assert repr(c.execute()) == repr(pd.qcut(range(5), 3)) + + +def test_iter(setup): + raw_data = pd.DataFrame(np.random.randint(1000, size=(20, 10))) + df = md.DataFrame(raw_data, chunk_size=5) + + for col, series in df.iteritems(): + pd.testing.assert_series_equal(series.execute().fetch(), raw_data[col]) + + for i, batch in enumerate(df.iterbatch(batch_size=15)): + pd.testing.assert_frame_equal(batch, raw_data.iloc[i * 15 : (i + 1) * 15]) + + i = 0 + for result_row, expect_row in zip(df.iterrows(batch_size=15), raw_data.iterrows()): + assert result_row[0] == expect_row[0] + pd.testing.assert_series_equal(result_row[1], expect_row[1]) + i += 1 + + assert i == len(raw_data) + + i = 0 + for result_tup, expect_tup in zip( + df.itertuples(batch_size=10), raw_data.itertuples() + ): + assert result_tup == expect_tup + i += 1 + + assert i == len(raw_data) + + raw_data = pd.Series(np.random.randint(1000, size=(20,))) + s = md.Series(raw_data, chunk_size=5) + + for i, batch in enumerate(s.iterbatch(batch_size=15)): + pd.testing.assert_series_equal(batch, raw_data.iloc[i * 15 : (i + 1) * 15]) + + i = 0 + for result_item, expect_item in zip( + s.iteritems(batch_size=15), raw_data.iteritems() + ): + assert result_item[0] == expect_item[0] + assert result_item[1] == expect_item[1] + i += 1 + + assert i == len(raw_data) + + # test to_dict + assert s.to_dict() == raw_data.to_dict() + + +CONFIG = """ +"@inherits": '@default' +session: + custom_log_dir: '{custom_log_dir}' +""" + + +@pytest.fixture +def fetch_log_setup(): + from ..deploy.oscar.tests.session import new_test_session + + with tempfile.TemporaryDirectory() as temp_dir: + config = io.StringIO(CONFIG.format(custom_log_dir=temp_dir)) + sess = new_test_session( + default=True, config=load_service_config_file(config), n_cpu=8 + ) + with option_context({"show_progress": False}): + try: + yield sess + finally: + sess.stop_server() + + +def test_fetch_log(fetch_log_setup): + def f(): + print("test") + + r = mr.spawn(f) + r.execute() + + log = r.fetch_log() + assert str(log).strip() == "test" + + # test multiple functions + def f1(size): + print("f1" * size) + sys.stdout.flush() + + fs = mr.ExecutableTuple([mr.spawn(f1, 30), mr.spawn(f1, 40)]) + execute(*fs) + log = fetch_log(*fs, offsets=20, sizes=10) + assert str(log[0]).strip() == ("f1" * 30)[20:30] + assert str(log[1]).strip() == ("f1" * 40)[20:30] + assert len(log[0].offsets) > 0 + assert all(s > 0 for s in log[0].offsets) + assert len(log[1].offsets) > 0 + assert all(s > 0 for s in log[1].offsets) + assert len(log[0].chunk_op_keys) > 0 + + # test negative offsets + log = fs.fetch_log(offsets=-20, sizes=10) + assert str(log[0]).strip() == ("f1" * 30 + os.linesep)[-20:-10] + assert str(log[1]).strip() == ("f1" * 40 + os.linesep)[-20:-10] + assert all(s > 0 for s in log[0].offsets) is True + assert len(log[1].offsets) > 0 + assert all(s > 0 for s in log[1].offsets) is True + assert len(log[0].chunk_op_keys) > 0 + + # test negative offsets which represented in string + log = fetch_log(*fs, offsets="-0.02K", sizes="0.01K") + assert str(log[0]).strip() == ("f1" * 30 + os.linesep)[-20:-10] + assert str(log[1]).strip() == ("f1" * 40 + os.linesep)[-20:-10] + assert all(s > 0 
for s in log[0].offsets) is True + assert len(log[1].offsets) > 0 + assert all(s > 0 for s in log[1].offsets) is True + assert len(log[0].chunk_op_keys) > 0 + + def test_nested(): + print("level0") + fr = mr.spawn(f1, 1) + fr.execute() + print(fr.fetch_log()) + + r = mr.spawn(test_nested) + r.execute() + log = str(r.fetch_log()) + assert "level0" in log + assert "f1" in log + + df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5) + + def df_func(c): + print("df func") + return c + + df2 = df.map_chunk(df_func) + df2.execute() + log = df2.fetch_log() + assert "Chunk op key:" in str(log) + assert "df func" in repr(log) + assert len(str(df.fetch_log())) == 0 + + def test_host(rndf): + rm = mr.spawn(nested, rndf) + rm.execute() + print(rm.fetch_log()) + + def nested(_rndf): + print("log_content") + + ds = [mr.spawn(test_host, n, retry_when_fail=False) for n in np.random.rand(4)] + xtp = execute(ds) + for log in fetch_log(xtp): + assert str(log).strip() == "log_content" + + def test_threaded(): + import threading + + exc_info = None + + def print_fun(): + nonlocal exc_info + try: + print("inner") + except: # noqa: E722 # nosec # pylint: disable=bare-except + exc_info = sys.exc_info() + + print_thread = threading.Thread(target=print_fun) + print_thread.start() + print_thread.join() + + if exc_info is not None: + raise exc_info[1].with_traceback(exc_info[-1]) + + print("after") + + rm = mr.spawn(test_threaded) + rm.execute() + logs = str(rm.fetch_log()).strip() + assert logs == "inner\nafter" + + +def test_align_series(setup): + t = np.random.rand(10, 3) + pdf = pd.DataFrame(t) + df = md.DataFrame(pdf, chunk_size=(5, 3)) + r = df[0] != df.sort_index()[0].shift(-1) + expected = pdf[0] != pdf.sort_index()[0].shift(-1) + pd.testing.assert_series_equal(r.execute().fetch(), expected) + + +def test_cache_tileable(setup): + raw = np.random.rand(10, 3) + t = mt.tensor(raw) + t.cache = True + t2 = t + 1 + result = t2.execute().fetch() + np.testing.assert_array_equal(result, raw + 1) + np.testing.assert_array_equal(t.fetch(), raw) + + with option_context({"warn_duplicated_execution": True}): + t = mt.tensor(raw) + with pytest.warns( + RuntimeWarning, + match=re.escape(f"Tileable {repr(t)} has been submitted before"), + ): + (t + 1).execute() + (t + 2).execute() + + # should have no warning + t = mt.tensor(raw) + with pytest.raises(BaseException, match="DID NOT WARN"): + with pytest.warns( + RuntimeWarning, + match=re.escape(f"Tileable {repr(t)} has been submitted before"), + ): + (t + 1).execute() + + +@pytest.mark.parametrize("method", ["shuffle", "broadcast", None]) +@pytest.mark.parametrize("auto_merge", ["after", "before"]) +def test_merge_groupby(setup, method, auto_merge): + rs = np.random.RandomState(0) + raw1 = pd.DataFrame({"a": rs.randint(3, size=100), "b": rs.rand(100)}) + raw2 = pd.DataFrame({"a": rs.randint(3, size=10), "c": rs.rand(10)}) + df1 = md.DataFrame(raw1, chunk_size=10).execute() + df2 = md.DataFrame(raw2, chunk_size=10).execute() + # do not trigger auto merge + df3 = df1.merge( + df2, on="a", auto_merge_threshold=8, method=method, auto_merge=auto_merge + ) + df4 = df3.groupby("a").sum() + + result = df4.execute().fetch() + expected = raw1.merge(raw2, on="a").groupby("a").sum() + pd.testing.assert_frame_equal(result, expected) diff --git a/python/xorbits/_mars/tests/test_utils.py b/python/xorbits/_mars/tests/test_utils.py new file mode 100644 index 000000000..4fb9d9912 --- /dev/null +++ b/python/xorbits/_mars/tests/test_utils.py @@ -0,0 +1,669 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import copy +import logging +import multiprocessing +import os +import shutil +import sys +import tempfile +import textwrap +import time +from concurrent.futures import ThreadPoolExecutor +from enum import Enum +from functools import partial +from io import BytesIO + +import numpy as np +import pandas as pd +import pytest + +from .. import dataframe as md +from .. import tensor as mt +from .. import utils +from ..core import TileableGraph, tile +from .core import require_cudf, require_ray + + +def test_string_conversion(): + s = None + assert utils.to_binary(s) is None + assert utils.to_str(s) is None + assert utils.to_text(s) is None + + s = "abcdefg" + assert isinstance(utils.to_binary(s), bytes) + assert utils.to_binary(s) == b"abcdefg" + assert isinstance(utils.to_str(s), str) + assert utils.to_str(s) == "abcdefg" + assert isinstance(utils.to_text(s), str) + assert utils.to_text(s) == "abcdefg" + + ustr = type("ustr", (str,), {}) + assert isinstance(utils.to_str(ustr(s)), str) + assert utils.to_str(ustr(s)) == "abcdefg" + + s = b"abcdefg" + assert isinstance(utils.to_binary(s), bytes) + assert utils.to_binary(s) == b"abcdefg" + assert isinstance(utils.to_str(s), str) + assert utils.to_str(s) == "abcdefg" + assert isinstance(utils.to_text(s), str) + assert utils.to_text(s) == "abcdefg" + + ubytes = type("ubytes", (bytes,), {}) + assert isinstance(utils.to_binary(ubytes(s)), bytes) + assert utils.to_binary(ubytes(s)) == b"abcdefg" + + s = "abcdefg" + assert isinstance(utils.to_binary(s), bytes) + assert utils.to_binary(s) == b"abcdefg" + assert isinstance(utils.to_str(s), str) + assert utils.to_str(s) == "abcdefg" + assert isinstance(utils.to_text(s), str) + assert utils.to_text(s) == "abcdefg" + + uunicode = type("uunicode", (str,), {}) + assert isinstance(utils.to_text(uunicode(s)), str) + assert utils.to_text(uunicode(s)) == "abcdefg" + + with pytest.raises(TypeError): + utils.to_binary(utils) + with pytest.raises(TypeError): + utils.to_str(utils) + with pytest.raises(TypeError): + utils.to_text(utils) + + +def test_tokenize(): + import shutil + import tempfile + + class TestEnum(Enum): + VAL1 = "val1" + + tempdir = tempfile.mkdtemp("mars_test_utils_") + try: + filename = os.path.join(tempdir, "test_npa.dat") + mmp_array = np.memmap(filename, dtype=float, mode="w+", shape=(3, 4)) + mmp_array[:] = np.random.random((3, 4)).astype(float) + mmp_array.flush() + del mmp_array + + mmp_array1 = np.memmap(filename, dtype=float, shape=(3, 4)) + mmp_array2 = np.memmap(filename, dtype=float, shape=(3, 4)) + + try: + v = [ + 1, + 2.3, + "456", + "789", + b"101112", + 2147483649, + None, + np.ndarray, + [912, "uvw"], + np.arange(0, 10), + np.array(10), + np.array([b"\x01\x32\xff"]), + np.int64, + TestEnum.VAL1, + ] + copy_v = copy.deepcopy(v) + assert utils.tokenize(v + [mmp_array1], ext_data=1234) == utils.tokenize( + copy_v + [mmp_array2], ext_data=1234 + ) + finally: + del mmp_array1, mmp_array2 + finally: + shutil.rmtree(tempdir) + 
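+    # tokenize is content-based rather than identity-based: structurally equal
+    # values (including deep copies) are expected to hash to the same token,
+    # which is what the remaining checks in this test assert for sets, dicts,
+    # nested containers, pandas objects and functions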
+ v = {"a", "xyz", "uvw"} + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + v = dict(x="abcd", y=98765) + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + v = dict(x=dict(a=1, b=[1, 2, 3]), y=12345) + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + # pandas relative + if pd is not None: + df = pd.DataFrame( + [[utils.to_binary("测试"), utils.to_text("数据")]], + index=["a"], + columns=["中文", "data"], + ) + v = [df, df.index, df.columns, df["data"], pd.Categorical(list("ABCD"))] + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + class NonTokenizableCls: + def __getstate__(self): + raise SystemError + + with pytest.raises(TypeError): + utils.tokenize(NonTokenizableCls()) + + class CustomizedTokenize(object): + def __mars_tokenize__(self): + return id(type(self)), id(NonTokenizableCls) + + assert utils.tokenize(CustomizedTokenize()) == utils.tokenize(CustomizedTokenize()) + + v = lambda x: x + 1 + assert utils.tokenize(v) == utils.tokenize(copy.deepcopy(v)) + + def f(a, b): + return np.add(a, b) + + assert utils.tokenize(f) == utils.tokenize(copy.deepcopy(f)) + + partial_f = partial(f, 1, k=0) + partial_f2 = partial(f, 1, k=1) + assert utils.tokenize(partial_f) == utils.tokenize(copy.deepcopy(partial_f)) + assert utils.tokenize(partial_f) != utils.tokenize(partial_f2) + + +def test_lazy_import(): + old_sys_path = sys.path + mock_mod = textwrap.dedent( + """ + __version__ = '0.1.0b1' + """.strip() + ) + mock_mod2 = textwrap.dedent( + """ + from mars.utils import lazy_import + mock_mod = lazy_import("mock_mod") + + def get_version(): + return mock_mod.__version__ + """ + ) + + temp_dir = tempfile.mkdtemp(prefix="mars-utils-test-") + sys.path += [temp_dir] + try: + with open(os.path.join(temp_dir, "mock_mod.py"), "w") as outf: + outf.write(mock_mod) + with open(os.path.join(temp_dir, "mock_mod2.py"), "w") as outf: + outf.write(mock_mod2) + + non_exist_mod = utils.lazy_import("non_exist_mod", locals=locals()) + assert non_exist_mod is None + + non_exist_mod1 = utils.lazy_import("non_exist_mod1", placeholder=True) + with pytest.raises(AttributeError) as ex_data: + non_exist_mod1.meth() + assert "required" in str(ex_data.value) + + mod = utils.lazy_import( + "mock_mod", globals=globals(), locals=locals(), rename="mod" + ) + assert mod is not None + assert mod.__version__ == "0.1.0b1" + + glob = globals().copy() + mod = utils.lazy_import("mock_mod", globals=glob, locals=locals(), rename="mod") + glob["mod"] = mod + assert mod is not None + assert mod.__version__ == "0.1.0b1" + assert type(glob["mod"]).__name__ == "module" + + import mock_mod2 as mod2 + + assert type(mod2.mock_mod).__name__ != "module" + assert mod2.get_version() == "0.1.0b1" + assert type(mod2.mock_mod).__name__ == "module" + finally: + shutil.rmtree(temp_dir) + sys.path = old_sys_path + sys.modules.pop("mock_mod", None) + sys.modules.pop("mock_mod2", None) + + +def test_chunks_indexer(): + a = mt.ones((3, 4, 5), chunk_size=2) + a = tile(a) + + assert a.chunk_shape == (2, 2, 3) + + with pytest.raises(ValueError): + _ = a.cix[1] + with pytest.raises(ValueError): + _ = a.cix[1, :] + + chunk_key = a.cix[0, 0, 0].key + expected = a.chunks[0].key + assert chunk_key == expected + + # as chunks[9] and chunks[10] shares the same shape, + # their keys should be equal. 
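+    # (the flat position of index (1, 1, 1) in the (2, 2, 3) chunk grid is
+    # 1 * (2 * 3) + 1 * 3 + 1 = 10; chunks[9] at index (1, 1, 0) has the same
+    # (1, 2, 2) shape, hence the identical key)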
+ chunk_key = a.cix[1, 1, 1].key + expected = a.chunks[9].key + assert chunk_key == expected + + chunk_key = a.cix[1, 1, 2].key + expected = a.chunks[11].key + assert chunk_key == expected + + chunk_key = a.cix[0, -1, -1].key + expected = a.chunks[5].key + assert chunk_key == expected + + chunk_key = a.cix[0, -1, -1].key + expected = a.chunks[5].key + assert chunk_key == expected + + chunk_keys = [c.key for c in a.cix[0, 0, :]] + expected = [c.key for c in [a.cix[0, 0, 0], a.cix[0, 0, 1], a.cix[0, 0, 2]]] + assert chunk_keys == expected + + chunk_keys = [c.key for c in a.cix[:, 0, :]] + expected = [ + c.key + for c in [ + a.cix[0, 0, 0], + a.cix[0, 0, 1], + a.cix[0, 0, 2], + a.cix[1, 0, 0], + a.cix[1, 0, 1], + a.cix[1, 0, 2], + ] + ] + assert chunk_keys == expected + + chunk_keys = [c.key for c in a.cix[:, :, :]] + expected = [c.key for c in a.chunks] + assert chunk_keys == expected + + +def test_require_not_none(): + @utils.require_not_none(1) + def should_exist(): + pass + + assert should_exist is not None + + @utils.require_not_none(None) + def should_not_exist(): + pass + + assert should_not_exist is None + + @utils.require_module("numpy.fft") + def should_exist_np(): + pass + + assert should_exist_np is not None + + @utils.require_module("numpy.fft_error") + def should_not_exist_np(): + pass + + assert should_not_exist_np is None + + +def test_type_dispatcher(): + dispatcher = utils.TypeDispatcher() + + type1 = type("Type1", (), {}) + type2 = type("Type2", (type1,), {}) + type3 = type("Type3", (), {}) + type4 = type("Type4", (type2,), {}) + type5 = type("Type5", (type4,), {}) + + dispatcher.register(object, lambda x: "Object") + dispatcher.register(type1, lambda x: "Type1") + dispatcher.register(type4, lambda x: "Type4") + dispatcher.register("pandas.DataFrame", lambda x: "DataFrame") + dispatcher.register(utils.NamedType("ray", type1), lambda x: "RayType1") + + assert "Type1" == dispatcher(type2()) + assert "DataFrame" == dispatcher(pd.DataFrame()) + assert "Object" == dispatcher(type3()) + + tp = utils.NamedType("ray", type1) + assert dispatcher.get_handler(tp)(tp) == "RayType1" + tp = utils.NamedType("ray", type2) + assert dispatcher.get_handler(tp)(tp) == "RayType1" + tp = utils.NamedType("xxx", type2) + assert dispatcher.get_handler(tp)(tp) == "Type1" + assert "Type1" == dispatcher(type2()) + tp = utils.NamedType("ray", type5) + assert dispatcher.get_handler(tp)(tp) == "Type4" + + dispatcher.unregister(object) + with pytest.raises(KeyError): + dispatcher(type3()) + + +def test_fixed_size_file_object(): + arr = [str(i).encode() * 20 for i in range(10)] + bts = os.linesep.encode().join(arr) + bio = BytesIO(bts) + + ref_bio = BytesIO(bio.read(100)) + bio.seek(0) + ref_bio.seek(0) + fix_bio = utils.FixedSizeFileObject(bio, 100) + + assert ref_bio.readline() == fix_bio.readline() + assert ref_bio.tell() == fix_bio.tell() + pos = ref_bio.tell() + 10 + assert ref_bio.seek(pos) == fix_bio.seek(pos) + assert ref_bio.read(5) == fix_bio.read(5) + assert ref_bio.readlines(25) == fix_bio.readlines(25) + assert list(ref_bio) == list(fix_bio) + + +def test_timer(): + with utils.Timer() as timer: + time.sleep(0.1) + + assert timer.duration >= 0.1 + + +def test_quiet_stdio(): + old_stdout, old_stderr = sys.stdout, sys.stderr + + class _IOWrapper: + def __init__(self, name=None): + self.name = name + self.content = "" + + @staticmethod + def writable(): + return True + + def write(self, d): + self.content += d + return len(d) + + stdout_w = _IOWrapper("stdout") + stderr_w = _IOWrapper("stderr") + 
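+    # the single-worker executor prints from a separate thread; quiet_stdio is
+    # expected to silence only the thread that entered it, which is why the
+    # assertion below expects "LINE T" (printed by the worker) to reach the
+    # captured stdout while the main thread's prints inside the context do not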
executor = ThreadPoolExecutor(1) + try: + sys.stdout = stdout_w + sys.stderr = stderr_w + + with utils.quiet_stdio(): + with utils.quiet_stdio(): + assert sys.stdout.writable() + assert sys.stderr.writable() + + print("LINE 1", end="\n") + print("LINE 2", file=sys.stderr, end="\n") + executor.submit(print, "LINE T").result() + print("LINE 3", end="\n") + + print("LINE 1", end="\n") + print("LINE 2", file=sys.stderr, end="\n") + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + executor.shutdown(False) + + assert stdout_w.content == "LINE T\nLINE 1\n" + assert stderr_w.content == "LINE 2\n" + + +@pytest.mark.asyncio +@pytest.mark.skipif( + sys.version_info[:2] < (3, 7), + reason="asyncio task timeout detector is not supported on python versions below 3.7", +) +async def test_asyncio_task_timeout_detector(): + log_file_name = "test_asyncio_task_timeout_detector.log" + try: + os.environ["MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL"] = "1" + p = multiprocessing.Process( + target=_run_task_timeout_detector, args=(log_file_name,) + ) + p.start() + while p.is_alive(): + await asyncio.sleep(0.1) + with open(log_file_name, "r") as f: + detector_log = f.read() + assert "timeout_func" in detector_log + finally: + os.environ.pop("MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL") + if os.path.exists(log_file_name): + os.remove(log_file_name) + + +def _run_task_timeout_detector(log_file_name): + from ..utils import logger, register_asyncio_task_timeout_detector + + fh = logging.FileHandler(log_file_name) + fh.setLevel(logging.INFO) + logger.addHandler(fh) + + async def timeout_func(): + await asyncio.sleep(2) + + async def main(): + task = register_asyncio_task_timeout_detector() + await asyncio.create_task(timeout_func()) + task.cancel() + + asyncio.run(main()) + + +def test_module_placeholder(): + required_module = utils.ModulePlaceholder("required_module") + + with pytest.raises(AttributeError): + required_module() + with pytest.raises(AttributeError) as e: + required_module.method() + msg = e.value.args[0] + assert msg == "required_module is required but not installed." 
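+# A minimal usage sketch (module and function names purely illustrative) of how
+# the placeholder behaviour asserted above is typically relied on for optional
+# dependencies: the import site never fails, only the first attribute access
+# does.
+#
+#     from xorbits._mars.utils import lazy_import
+#
+#     cupy = lazy_import("cupy", placeholder=True)
+#
+#     def to_gpu(arr):
+#         # raises AttributeError("cupy is required but not installed.")
+#         # only when the optional dependency is actually touched
+#         return cupy.asarray(arr)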
+ + +def test_merge_dict(): + from ..utils import merge_dict + + assert merge_dict({}, {1: 2}) == {1: 2} + assert merge_dict({1: 2}, {}) == {1: 2} + assert merge_dict( + {"a": {1: 2}, "b": {2: 3}, "c": {1: {2: 3}}}, + {"a": {1: 3}, "b": {2: 3}, "c": {1: {2: 4}}}, + ) == {"a": {1: 3}, "b": {2: 3}, "c": {1: {2: 4}}} + with pytest.raises(ValueError): + merge_dict({"a": {1: 2}, "b": {2: 3}}, {"a": {1: 3}}, overwrite=False) + + +def test_flatten_dict_to_nested_dict(): + from ..utils import flatten_dict_to_nested_dict + + assert flatten_dict_to_nested_dict({}) == {} + with pytest.raises(ValueError): + flatten_dict_to_nested_dict({"a.b.c": 1, "a.b": 2}) + assert flatten_dict_to_nested_dict({"a.b.c": 1, "a.b.d": 2}) == { + "a": {"b": {"c": 1, "d": 2}} + } + + +def test_readable_size(): + assert utils.readable_size(32) == "32.00" + assert utils.readable_size(14354) == "14.02K" + assert utils.readable_size(14354000) == "13.69M" + assert utils.readable_size(14354000000) == "13.37G" + assert utils.readable_size(14354000000000) == "13.05T" + + +def test_estimate_pandas_size(): + df1 = pd.DataFrame(np.random.rand(50, 10)) + assert utils.estimate_pandas_size(df1) == sys.getsizeof(df1) + + df2 = pd.DataFrame(np.random.rand(1000, 10)) + assert utils.estimate_pandas_size(df2) == sys.getsizeof(df2) + + df3 = pd.DataFrame( + { + "A": np.random.choice(["abcd", "def", "gh"], size=(1000,)), + "B": np.random.rand(1000), + "C": np.random.rand(1000), + } + ) + assert utils.estimate_pandas_size(df3) != sys.getsizeof(df3) + + s1 = pd.Series(np.random.rand(1000)) + assert utils.estimate_pandas_size(s1) == sys.getsizeof(s1) + + from ..dataframe.arrays import ArrowStringArray + + array = ArrowStringArray(np.random.choice(["abcd", "def", "gh"], size=(1000,))) + s2 = pd.Series(array) + assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2) + + s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,))) + assert utils.estimate_pandas_size(s3) != sys.getsizeof(s3) + assert ( + pytest.approx(utils.estimate_pandas_size(s3) / sys.getsizeof(s3), abs=0.5) == 1 + ) + + idx1 = pd.MultiIndex.from_arrays( + [np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))] + ) + assert utils.estimate_pandas_size(idx1) == sys.getsizeof(idx1) + + string_idx = pd.Index(np.random.choice(["a", "bb", "cc"], size=(1000,))) + assert utils.estimate_pandas_size(string_idx) != sys.getsizeof(string_idx) + assert ( + pytest.approx( + utils.estimate_pandas_size(string_idx) / sys.getsizeof(string_idx), abs=0.5 + ) + == 1 + ) + + # dataframe with multi index + idx2 = pd.MultiIndex.from_arrays( + [np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))] + ) + df4 = pd.DataFrame( + { + "A": np.random.choice(["abcd", "def", "gh"], size=(1000,)), + "B": np.random.rand(1000), + "C": np.random.rand(1000), + }, + index=idx2, + ) + assert utils.estimate_pandas_size(df4) != sys.getsizeof(df4) + assert ( + pytest.approx(utils.estimate_pandas_size(df4) / sys.getsizeof(df4), abs=0.5) + == 1 + ) + + # series with multi index + idx3 = pd.MultiIndex.from_arrays( + [ + np.random.choice(["a1", "a2", "a3"], size=(1000,)), + np.random.choice(["abcd", "def", "gh"], size=(1000,)), + ] + ) + s4 = pd.Series(np.arange(1000), index=idx3) + + assert utils.estimate_pandas_size(s4) == sys.getsizeof(s4) + + +@require_ray +def test_web_serialize_lambda(): + df = md.DataFrame( + mt.random.rand(10_0000, 4, chunk_size=1_0000), columns=list("abcd") + ) + r = df.apply(lambda x: x) + graph = TileableGraph([r]) + s = 
utils.serialize_serializable(graph) + f = utils.deserialize_serializable(s) + assert isinstance(f, TileableGraph) + + +def test_get_func_token_values(): + from ..utils import _get_func_token_values + + assert _get_func_token_values(test_get_func_token_values) == [ + test_get_func_token_values.__code__.co_code + ] + captured_vars = [1, 2, 3] + + def closure_func(a, b): + return captured_vars + + assert _get_func_token_values(closure_func)[1][0] == captured_vars + assert _get_func_token_values(partial(closure_func, 1))[0][0] == 1 + assert _get_func_token_values(partial(closure_func, 1))[-1][0] == captured_vars + + from .._utils import ceildiv + + assert _get_func_token_values(ceildiv) == [ceildiv.__module__, ceildiv.__name__] + + class Func: + def __call__(self, *args, **kwargs): + pass + + func = Func() + assert _get_func_token_values(func) == [func] + + +@pytest.mark.parametrize("id_length", [0, 5, 32, 63]) +def test_gen_random_id(id_length): + rnd_id = utils.new_random_id(id_length) + assert len(rnd_id) == id_length + + +@pytest.mark.asyncio +async def test_retry_callable(): + assert utils.retry_callable(lambda x: x)(1) == 1 + assert utils.retry_callable(lambda x: 0)(1) == 0 + + class CustomException(BaseException): + pass + + def f1(x): + nonlocal num_retried + num_retried += 1 + if num_retried == 3: + return x + raise CustomException + + num_retried = 0 + with pytest.raises(CustomException): + utils.retry_callable(f1)(1) + assert utils.retry_callable(f1, ex_type=CustomException)(1) == 1 + num_retried = 0 + with pytest.raises(CustomException): + utils.retry_callable(f1, max_retries=2, ex_type=CustomException)(1) + num_retried = 0 + assert utils.retry_callable(f1, max_retries=3, ex_type=CustomException)(1) == 1 + + async def f2(x): + return f1(x) + + num_retried = 0 + with pytest.raises(CustomException): + await utils.retry_callable(f2)(1) + assert await utils.retry_callable(f2, ex_type=CustomException)(1) == 1 + + +@require_cudf +def test_calc_data_size_gpu(): + import cudf + + df = pd.DataFrame({"a": ["a", "b", "a"]}, dtype="category") + df = cudf.from_pandas(df) + assert utils.calc_data_size(df) > 0 diff --git a/python/xorbits/_mars/typing.py b/python/xorbits/_mars/typing.py new file mode 100644 index 000000000..9c987f655 --- /dev/null +++ b/python/xorbits/_mars/typing.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from typing import Tuple, TypeVar +except ImportError: # pragma: no cover + # in some scenario (for instance, pycharm debug), `mars.typing` + # could be mistakenly imported as builtin typing. Code below + # resolves this issue. 
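+    # (concretely: temporarily strip this source tree's paths from sys.path,
+    # evict the wrongly cached "typing" entry from sys.modules, re-import the
+    # standard-library module, then restore sys.path)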
+ import os + import sys + + _orig_sys_path = list(sys.path) + _mars_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + try: + sys.path = [p for p in sys.path if not p.startswith(_mars_path)] + sys.modules.pop("typing", None) + from typing import Tuple, TypeVar + finally: + sys.path = _orig_sys_path + del _orig_sys_path, _mars_path + +OperandType = TypeVar("OperandType") +TileableType = TypeVar("TileableType") +ChunkType = TypeVar("ChunkType") +EntityType = TypeVar("EntityType") +SessionType = TypeVar("SessionType") + +ClusterType = TypeVar("ClusterType") +ClientType = TypeVar("ClientType") + +BandType = Tuple[str, str] # (band address, resource_type) diff --git a/python/xorbits/_mars/utils.py b/python/xorbits/_mars/utils.py new file mode 100644 index 000000000..dc7f6ee8b --- /dev/null +++ b/python/xorbits/_mars/utils.py @@ -0,0 +1,1891 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import dataclasses +import datetime +import enum +import functools +import importlib +import inspect +import io +import itertools +import logging +import numbers +import operator +import os +import pkgutil +import random +import shutil +import socket +import struct +import sys +import threading +import time +import types +import uuid +import warnings +import weakref +import zlib +from abc import ABC +from contextlib import contextmanager +from types import TracebackType +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + NamedTuple, + Optional, + Set, + Tuple, + Type, + Union, +) +from urllib.parse import urlparse + +import cloudpickle as pickle +import numpy as np +import pandas as pd + +from ._utils import ( # noqa: F401 # pylint: disable=unused-import + NamedType, + Timer, + TypeDispatcher, + ceildiv, + new_random_id, + register_tokenizer, + reset_id_random_seed, + to_binary, + to_str, + to_text, + tokenize, + tokenize_int, +) +from .constants import MARS_LOG_PATH_KEY +from .lib.version import parse as parse_version +from .typing import ChunkType, EntityType, OperandType, TileableType + +logger = logging.getLogger(__name__) +random.seed(int(time.time()) * os.getpid()) +pd_release_version: Tuple[int] = parse_version(pd.__version__).release + +OBJECT_FIELD_OVERHEAD = 50 + +# make flake8 happy by referencing these imports +NamedType = NamedType +TypeDispatcher = TypeDispatcher +tokenize = tokenize +register_tokenizer = register_tokenizer +ceildiv = ceildiv +reset_id_random_seed = reset_id_random_seed +new_random_id = new_random_id +_create_task = asyncio.create_task +_is_ci = (os.environ.get("CI") or "0").lower() in ("1", "true") + + +# fix encoding conversion problem under windows +if sys.platform.startswith("win"): + + def _replace_default_encoding(func): + def _fun(s, encoding=None): + encoding = encoding or getattr(sys.stdout, "encoding", None) or "mbcs" + return func(s, encoding=encoding) + + _fun.__name__ = func.__name__ + _fun.__doc__ = func.__doc__ + return _fun + + to_binary = _replace_default_encoding(to_binary) 
+ to_text = _replace_default_encoding(to_text) + to_str = _replace_default_encoding(to_str) + + +try: + from pandas._libs import lib as _pd__libs_lib + from pandas._libs.lib import NoDefault, no_default + + _raw__reduce__ = type(NoDefault).__reduce__ + + def _no_default__reduce__(self): + if self is not NoDefault: + return _raw__reduce__(self) + else: # pragma: no cover + return getattr, (_pd__libs_lib, "NoDefault") + + if hasattr(_pd__libs_lib, "_NoDefault"): # pragma: no cover + # need to patch __reduce__ to make sure it can be properly unpickled + type(NoDefault).__reduce__ = _no_default__reduce__ + else: + # introduced in pandas 1.5.0 : register for pickle compatibility + _pd__libs_lib._NoDefault = NoDefault +except ImportError: # pragma: no cover + + class NoDefault(enum.Enum): + no_default = "NO_DEFAULT" + + def __repr__(self) -> str: + return "" + + no_default = NoDefault.no_default + + try: + # register for pickle compatibility + from pandas._libs import lib as _pd__libs_lib + + _pd__libs_lib.NoDefault = NoDefault + except (ImportError, AttributeError): + pass + + +class AttributeDict(dict): + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"'AttributeDict' object has no attribute {item}") + + +def get_bool_environ(var_name: str) -> Optional[bool]: + var_value = os.environ.get(var_name) + if not var_value: + return None + return bool(int(var_value)) + + +def on_serialize_shape(shape: Tuple[int]): + if shape: + return tuple(s if not np.isnan(s) else -1 for s in shape) + return shape + + +def on_deserialize_shape(shape: Tuple[int]): + if shape: + return tuple(s if s != -1 else np.nan for s in shape) + return shape + + +def on_serialize_numpy_type(value: np.dtype): + if value is pd.NaT: + value = None + return value.item() if isinstance(value, np.generic) else value + + +def on_serialize_nsplits(value: Tuple[Tuple[int]]): + if value is None: + return None + new_nsplits = [] + for dim_splits in value: + new_nsplits.append(tuple(None if pd.isna(v) else v for v in dim_splits)) + return tuple(new_nsplits) + + +_memory_size_indices = {"": 0, "k": 1, "m": 2, "g": 3, "t": 4} + + +def calc_size_by_str( + value: Union[str, int, None], total: Union[int, None] +) -> Optional[int]: + if value is None: + return None + if isinstance(value, int): + return value + mem_limit, is_percent = parse_readable_size(value) + if is_percent: + return int(total * mem_limit) if total is not None else None + else: + return int(mem_limit) + + +def parse_readable_size(value: Union[str, int, float]) -> Tuple[float, bool]: + if isinstance(value, numbers.Number): + return float(value), False + + value = value.strip().lower() + num_pos = 0 + while num_pos < len(value) and value[num_pos] in "0123456789.-": + num_pos += 1 + + value, suffix = value[:num_pos], value[num_pos:] + suffix = suffix.strip() + if suffix.endswith("%"): + return float(value) / 100, True + + try: + return float(value) * (1024 ** _memory_size_indices[suffix[:1]]), False + except (ValueError, KeyError): + raise ValueError(f"Unknown limitation value: {value}") + + +def readable_size(size: int, trunc: bool = False) -> str: + if size < 1024: + ret_size = size + size_unit = "" + elif 1024 <= size < 1024**2: + ret_size = size * 1.0 / 1024 + size_unit = "K" + elif 1024**2 <= size < 1024**3: + ret_size = size * 1.0 / (1024**2) + size_unit = "M" + elif 1024**3 <= size < 1024**4: + ret_size = size * 1.0 / (1024**3) + size_unit = "G" + else: + ret_size = size * 1.0 / (1024**4) + size_unit = "T" + + if not trunc: 
+ return "{0:.2f}{1}".format(ret_size, size_unit) + else: + return f"{int(ret_size)}{size_unit}" + + +_git_info = None + + +class GitInfo(NamedTuple): + commit_hash: str + commit_ref: str + + +def git_info(): + from ._version import get_versions + + global _git_info + + if _git_info is not None: + return _git_info + + versions = get_versions() + _git_info = GitInfo(versions["full-revisionid"], versions["version"]) + return _git_info + + +LOW_PORT_BOUND = 10000 +HIGH_PORT_BOUND = 65535 +_local_occupied_ports = set() + + +def _get_ports_from_netstat() -> Set[int]: + import subprocess + + while True: + p = subprocess.Popen("netstat -a -n -p tcp".split(), stdout=subprocess.PIPE) + try: + outs, _ = p.communicate(timeout=5) + outs = outs.split(to_binary(os.linesep)) + occupied = set() + for line in outs: + if b"." not in line: + continue + line = to_str(line) + for part in line.split(): + # in windows, netstat uses ':' to separate host and port + part = part.replace(":", ".") + if "." in part: + _, port_str = part.rsplit(".", 1) + if port_str == "*": + continue + port = int(port_str) + if LOW_PORT_BOUND <= port <= HIGH_PORT_BOUND: + occupied.add(int(port_str)) + break + return occupied + except subprocess.TimeoutExpired: + p.kill() + continue + + +def get_next_port(typ: int = None, occupy: bool = True) -> int: + import psutil + + if sys.platform.lower().startswith("win"): + occupied = _get_ports_from_netstat() + else: + try: + conns = psutil.net_connections() + typ = typ or socket.SOCK_STREAM + occupied = set( + sc.laddr.port + for sc in conns + if sc.type == typ and LOW_PORT_BOUND <= sc.laddr.port <= HIGH_PORT_BOUND + ) + except psutil.AccessDenied: + occupied = _get_ports_from_netstat() + + occupied.update(_local_occupied_ports) + random.seed(uuid.uuid1().bytes) + randn = random.randint(0, 100000000) + + idx = int(randn % (1 + HIGH_PORT_BOUND - LOW_PORT_BOUND - len(occupied))) + for i in range(LOW_PORT_BOUND, HIGH_PORT_BOUND + 1): + if i in occupied: + continue + if idx == 0: + if occupy: + _local_occupied_ports.add(i) + return i + idx -= 1 + raise SystemError("No ports available.") + + +@functools.lru_cache(200) +def mod_hash(val: Any, modulus: int): + return tokenize_int(val) % modulus + + +class classproperty: + def __init__(self, f): + self.f = f + + def __get__(self, obj, owner): + return self.f(owner) + + +def lazy_import( + name: str, + package: str = None, + globals: Dict = None, # pylint: disable=redefined-builtin + locals: Dict = None, # pylint: disable=redefined-builtin + rename: str = None, + placeholder: bool = False, +): + rename = rename or name + prefix_name = name.split(".", 1)[0] + globals = globals or inspect.currentframe().f_back.f_globals + + class LazyModule: + def __init__(self): + self._on_loads = [] + + def __getattr__(self, item): + if item.startswith("_pytest") or item in ("__bases__", "__test__"): + raise AttributeError(item) + + real_mod = importlib.import_module(name, package=package) + if rename in globals: + globals[rename] = real_mod + elif locals is not None: + locals[rename] = real_mod + ret = getattr(real_mod, item) + for on_load_func in self._on_loads: + on_load_func() + # make sure on_load hooks only executed once + self._on_loads = [] + return ret + + def add_load_handler(self, func: Callable): + self._on_loads.append(func) + return func + + if pkgutil.find_loader(prefix_name) is not None: + return LazyModule() + elif placeholder: + return ModulePlaceholder(prefix_name) + else: + return None + + +def lazy_import_on_load(lazy_mod): + def wrapper(fun): + if 
lazy_mod is not None and hasattr(lazy_mod, "add_load_handler"): + lazy_mod.add_load_handler(fun) + return fun + + return wrapper + + +class ModulePlaceholder: + def __init__(self, mod_name: str): + self._mod_name = mod_name + + def _raises(self): + raise AttributeError(f"{self._mod_name} is required but not installed.") + + def __getattr__(self, key): + self._raises() + + def __call__(self, *_args, **_kwargs): + self._raises() + + +def serialize_serializable(serializable, compress: bool = False): + from .serialization import serialize + + bio = io.BytesIO() + header, buffers = serialize(serializable) + buf_sizes = [getattr(buf, "nbytes", len(buf)) for buf in buffers] + header[0]["buf_sizes"] = buf_sizes + s_header = pickle.dumps(header) + bio.write(struct.pack("= 1.0 + try: + from ray.worker import global_worker + + global_worker.check_connected() + context = global_worker.get_serialization_context() + context.register_custom_serializer( + obj_type, serializer=serializer, deserializer=deserializer + ) + except AttributeError: # ray >= 1.2.0 + ray.util.register_serializer( + obj_type, serializer=serializer, deserializer=deserializer + ) + except ImportError: + pass + + +cudf = lazy_import("cudf") + + +def _get_dtype_itemsize(dt: Union[np.dtype, pd.api.extensions.ExtensionDtype]) -> int: + try: + return dt.itemsize + except AttributeError: + if cudf and isinstance(dt, cudf.CategoricalDtype): + return dt.to_pandas().itemsize + raise + + +def calc_data_size(dt: Any, shape: Tuple[int] = None) -> int: + from .dataframe.core import IndexValue + + if dt is None: + return 0 + + if isinstance(dt, tuple): + return sum(calc_data_size(c) for c in dt) + + shape = getattr(dt, "shape", None) or shape + if isinstance(dt, (pd.DataFrame, pd.Series)): + return estimate_pandas_size(dt) + if hasattr(dt, "estimate_size"): + return dt.estimate_size() + if hasattr(dt, "nbytes"): + return max(sys.getsizeof(dt), dt.nbytes) + if hasattr(dt, "shape") and len(dt.shape) == 0: + return 0 + if hasattr(dt, "dtypes") and shape is not None: + size = shape[0] * sum(_get_dtype_itemsize(dtype) for dtype in dt.dtypes) + try: + index_value_value = dt.index_value.value + if hasattr(index_value_value, "dtype") and not isinstance( + index_value_value, IndexValue.RangeIndex + ): + size += calc_data_size(index_value_value, shape=shape) + except AttributeError: + pass + return size + if hasattr(dt, "dtype") and shape is not None: + return shape[0] * dt.dtype.itemsize + + # object chunk + return sys.getsizeof(dt) + + +def estimate_pandas_size( + pd_obj, max_samples: int = 10, min_sample_rows: int = 100 +) -> int: + if len(pd_obj) <= min_sample_rows or isinstance(pd_obj, pd.RangeIndex): + return sys.getsizeof(pd_obj) + if isinstance(pd_obj, pd.MultiIndex): + # MultiIndex's sample size can't be used to estimate + return sys.getsizeof(pd_obj) + + from .dataframe.arrays import ArrowDtype + + def _is_fast_dtype(dtype): + if isinstance(dtype, np.dtype): + return np.issubdtype(dtype, np.number) + else: + return isinstance(dtype, ArrowDtype) + + dtypes = [] + is_series = False + if isinstance(pd_obj, pd.DataFrame): + dtypes.extend(pd_obj.dtypes) + index_obj = pd_obj.index + elif isinstance(pd_obj, pd.Series): + dtypes.append(pd_obj.dtype) + index_obj = pd_obj.index + is_series = True + else: + index_obj = pd_obj + + # handling possible MultiIndex + if hasattr(index_obj, "dtypes"): + dtypes.extend(index_obj.dtypes) + else: + dtypes.append(index_obj.dtype) + + if all(_is_fast_dtype(dtype) for dtype in dtypes): + return sys.getsizeof(pd_obj) + + 
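+    # slow path: object-like dtypes are involved, so estimate by sampling up to
+    # max_samples rows, measuring their deep memory usage and scaling by
+    # len(pd_obj) / max_samples; a MultiIndex is measured separately because
+    # its sampled per-row size would otherwise over-estimate the footprint
+    # (e.g. with max_samples=10 on a 1000-row frame the estimate is roughly
+    # sys.getsizeof(ten_sampled_rows) * 100)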
indices = np.sort(np.random.choice(len(pd_obj), size=max_samples, replace=False)) + iloc = pd_obj if isinstance(pd_obj, pd.Index) else pd_obj.iloc + if isinstance(index_obj, pd.MultiIndex): + # MultiIndex's sample size is much greater than expected, thus we calculate + # the size separately. + index_size = sys.getsizeof(pd_obj.index) + if is_series: + sample_frame_size = iloc[indices].memory_usage(deep=True, index=False) + else: + sample_frame_size = iloc[indices].memory_usage(deep=True, index=False).sum() + return index_size + sample_frame_size * len(pd_obj) // max_samples + else: + sample_size = sys.getsizeof(iloc[indices]) + return sample_size * len(pd_obj) // max_samples + + +def build_fetch_shuffle( + chunk: ChunkType, n_reducers=None, shuffle_fetch_type=None +) -> ChunkType: + from .core.operand import ShuffleFetchType, ShuffleProxy + + chunk_op = chunk.op + assert isinstance(chunk_op, ShuffleProxy) + params = chunk.params.copy() + n_mappers = len(chunk.inputs) + assert n_reducers > 0, n_reducers + # for shuffle nodes, we build FetchShuffle chunks + # to replace ShuffleProxy + if shuffle_fetch_type is ShuffleFetchType.FETCH_BY_INDEX: + # skip data keys info for `FETCH_BY_INDEX` + source_keys = None + else: + source_keys = [pinp.key for pinp in chunk.inputs] + op = chunk_op.get_fetch_op_cls(chunk)( + source_keys=source_keys, + n_mappers=n_mappers, + n_reducers=n_reducers, + shuffle_fetch_type=shuffle_fetch_type, + gpu=chunk.op.gpu, + ) + return op.new_chunk( + None, + is_broadcaster=chunk.is_broadcaster, + kws=[params], + _key=chunk.key, + _id=chunk.id, + ) + + +def build_fetch_chunk(chunk: ChunkType, **kwargs) -> ChunkType: + from .core.operand import ShuffleProxy + + chunk_op = chunk.op + params = chunk.params.copy() + assert not isinstance(chunk_op, ShuffleProxy) + # for non-shuffle nodes, we build Fetch chunks + # to replace original chunk + op = chunk_op.get_fetch_op_cls(chunk)(sparse=chunk.op.sparse, gpu=chunk.op.gpu) + return op.new_chunk( + None, + is_broadcaster=chunk.is_broadcaster, + kws=[params], + _key=chunk.key, + **kwargs, + ) + + +def build_fetch_tileable(tileable: TileableType) -> TileableType: + if tileable.is_coarse(): + chunks = None + else: + chunks = [] + for c in tileable.chunks: + fetch_chunk = build_fetch_chunk(c, index=c.index) + chunks.append(fetch_chunk) + + tileable_op = tileable.op + params = tileable.params.copy() + + new_op = tileable_op.get_fetch_op_cls(tileable)(_id=tileable_op.id) + return new_op.new_tileables( + None, + chunks=chunks, + nsplits=tileable.nsplits, + _key=tileable.key, + _id=tileable.id, + **params, + )[0] + + +def build_fetch(entity: EntityType) -> EntityType: + from .core import CHUNK_TYPE, ENTITY_TYPE + + if isinstance(entity, CHUNK_TYPE): + return build_fetch_chunk(entity) + elif isinstance(entity, ENTITY_TYPE): + return build_fetch_tileable(entity) + else: + raise TypeError(f"Type {type(entity)} not supported") + + +def get_chunk_reducer_index(chunk: ChunkType) -> Tuple[int]: + op = chunk.op + try: + return op.reducer_index + except AttributeError: + from .core.operand import Fuse + + if isinstance(op, Fuse): + return chunk.composed[0].op.reducer_index + else: # pragma: no cover + raise + + +def merge_chunks(chunk_results: List[Tuple[Tuple[int], Any]]) -> Any: + """ + Concatenate chunk results according to index. 
+ + Parameters + ---------- + chunk_results : list of tuple, {(chunk_idx, chunk_result), ...,} + + Returns + ------- + Data + """ + from sklearn.base import BaseEstimator + + from .dataframe.utils import ( + concat_on_columns, + get_xdf, + is_dataframe, + is_index, + is_series, + ) + from .lib.groupby_wrapper import GroupByWrapper + from .tensor.array_utils import get_array_module, is_array + + chunk_results = sorted(chunk_results, key=operator.itemgetter(0)) + v = chunk_results[0][1] + if len(chunk_results) == 1 and not (chunk_results[0][0]): + return v + if is_array(v): + xp = get_array_module(v) + ndim = v.ndim + for i in range(ndim - 1): + new_chunks = [] + for idx, cs in itertools.groupby(chunk_results, key=lambda t: t[0][:-1]): + new_chunks.append( + (idx, xp.concatenate([c[1] for c in cs], axis=ndim - i - 1)) + ) + chunk_results = new_chunks + to_concat = [c[1] for c in chunk_results] + if len(to_concat) == 1: + return to_concat[0] + concat_result = xp.concatenate(to_concat) + return concat_result + elif is_dataframe(v): + xdf = get_xdf(v) + concats = [] + for _, cs in itertools.groupby(chunk_results, key=lambda t: t[0][0]): + concats.append(concat_on_columns([c[1] for c in cs])) + return xdf.concat(concats, axis=0) + elif is_series(v): + xdf = get_xdf(v) + return xdf.concat([c[1] for c in chunk_results]) + elif is_index(v): + xdf = get_xdf(v) + df = xdf.concat([xdf.DataFrame(index=r[1]) for r in chunk_results]) + return df.index + elif isinstance(v, pd.Categorical): + categories = [r[1] for r in chunk_results] + arrays = [np.asarray(r) for r in categories] + array = np.concatenate(arrays) + return pd.Categorical( + array, categories=categories[0].categories, ordered=categories[0].ordered + ) + elif isinstance(v, GroupByWrapper): + df = pd.concat([r[1].obj for r in chunk_results], axis=0) + if not isinstance(v.keys, list): + keys = v.keys + else: + keys = [] + for idx, k in enumerate(v.keys): + if isinstance(k, pd.Series): + keys.append(pd.concat([r[1].keys[idx] for r in chunk_results])) + else: + keys.append(k) + grouped = GroupByWrapper( + df, + None, + keys=keys, + axis=v.axis, + level=v.level, + exclusions=v.exclusions, + selection=v.selection, + as_index=v.as_index, + sort=v.sort, + group_keys=v.group_keys, + squeeze=v.squeeze, + observed=v.observed, + mutated=v.mutated, + ) + return grouped.groupby_obj + elif isinstance(v, (str, bytes, memoryview, BaseEstimator)): + result = [r[1] for r in chunk_results] + if len(result) == 1: + return result[0] + return result + else: + result = None + for cr in chunk_results: + if cr[1] is None: + continue + if isinstance(cr[1], dict) and not cr[1]: + continue + if result is None: + result = cr[1] + result = result.item() if hasattr(result, "item") else result + else: + raise TypeError(f"unsupported type {type(v)}") + return result + + +def merged_chunk_as_tileable_type(merged, tileable: TileableType): + from .tensor.array_utils import get_array_module + from .tensor.core import TensorOrder + + if hasattr(tileable, "order") and tileable.ndim > 0: + module = get_array_module(merged) + if tileable.order == TensorOrder.F_ORDER and hasattr(module, "asfortranarray"): + merged = module.asfortranarray(merged) + elif tileable.order == TensorOrder.C_ORDER and hasattr( + module, "ascontiguousarray" + ): + merged = module.ascontiguousarray(merged) + if ( + hasattr(tileable, "isscalar") + and tileable.isscalar() + and getattr(merged, "size", None) == 1 + ): + merged = merged.item() + return merged + + +def calc_nsplits(chunk_idx_to_shape: 
Dict[Tuple[int], Tuple[int]]) -> Tuple[Tuple[int]]: + """ + Calculate a tiled entity's nsplits. + + Parameters + ---------- + chunk_idx_to_shape : Dict type, {chunk_idx: chunk_shape} + + Returns + ------- + nsplits + """ + ndim = len(next(iter(chunk_idx_to_shape))) + tileable_nsplits = [] + # for each dimension, record chunk shape whose index is zero on other dimensions + for i in range(ndim): + splits = [] + for index, shape in chunk_idx_to_shape.items(): + if all(idx == 0 for j, idx in enumerate(index) if j != i): + splits.append(shape[i]) + tileable_nsplits.append(tuple(splits)) + return tuple(tileable_nsplits) + + +def has_unknown_shape(*tiled_tileables: TileableType) -> bool: + for tileable in tiled_tileables: + if getattr(tileable, "shape", None) is None: + continue + if any(pd.isnull(s) for s in tileable.shape): + return True + if any(pd.isnull(s) for s in itertools.chain(*tileable.nsplits)): + return True + return False + + +def sbytes(x: Any) -> bytes: + # NB: bytes() in Python 3 has different semantic with Python 2, see: help(bytes) + from numbers import Number + + if x is None or isinstance(x, Number): + return bytes(str(x), encoding="ascii") + elif isinstance(x, list): + return bytes("[" + ", ".join([str(k) for k in x]) + "]", encoding="utf-8") + elif isinstance(x, tuple): + return bytes("(" + ", ".join([str(k) for k in x]) + ")", encoding="utf-8") + elif isinstance(x, str): + return bytes(x, encoding="utf-8") + else: + return bytes(x) + + +def copy_tileables(tileables: List[TileableType], **kwargs): + inputs = kwargs.pop("inputs", None) + copy_key = kwargs.pop("copy_key", True) + copy_id = kwargs.pop("copy_id", True) + if kwargs: + raise TypeError(f"got un unexpected keyword argument '{next(iter(kwargs))}'") + if len(tileables) > 1: + # cannot handle tileables with different operands here + # try to copy separately if so + if len({t.op for t in tileables}) != 1: + raise TypeError("All tileables' operands should be same.") + + op = tileables[0].op.copy().reset_key() + if copy_key: + op._key = tileables[0].op.key + kws = [] + for t in tileables: + params = t.params.copy() + if copy_key: + params["_key"] = t.key + if copy_id: + params["_id"] = t.id + params.update(t.extra_params) + kws.append(params) + inputs = inputs or op.inputs + return op.new_tileables(inputs, kws=kws, output_limit=len(kws)) + + +def require_not_none(obj: Any): + def wrap(func): + if obj is not None: + return func + else: + return + + return wrap + + +def require_module(module: str): + def wrap(func): + try: + importlib.import_module(module) + + @functools.wraps(func) + def inner(*args, **kwargs): + return func(*args, **kwargs) + + return inner + except ImportError: + return + + return wrap + + +def ignore_warning(func: Callable): + @functools.wraps(func) + def inner(*args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return func(*args, **kwargs) + + return inner + + +def flatten(nested_iterable: Union[List, Tuple]) -> List: + """ + Flatten a nested iterable into a list. 
+ + Parameters + ---------- + nested_iterable : list or tuple + an iterable which can contain other iterables + + Returns + ------- + flattened : list + + Examples + -------- + >>> flatten([[0, 1], [2, 3]]) + [0, 1, 2, 3] + >>> flatten([[0, 1], [[3], [4, 5]]]) + [0, 1, 3, 4, 5] + """ + + flattened = [] + stack = list(nested_iterable)[::-1] + while len(stack) > 0: + inp = stack.pop() + if isinstance(inp, (tuple, list)): + stack.extend(inp[::-1]) + else: + flattened.append(inp) + return flattened + + +def stack_back(flattened: List, raw: Union[List, Tuple]) -> Union[List, Tuple]: + """ + Organize a new iterable from a flattened list according to raw iterable. + + Parameters + ---------- + flattened : list + flattened list + raw: list + raw iterable + + Returns + ------- + ret : list + + Examples + -------- + >>> raw = [[0, 1], [2, [3, 4]]] + >>> flattened = flatten(raw) + >>> flattened + [0, 1, 2, 3, 4] + >>> a = [f + 1 for f in flattened] + >>> a + [1, 2, 3, 4, 5] + >>> stack_back(a, raw) + [[1, 2], [3, [4, 5]]] + """ + flattened_iter = iter(flattened) + result = list() + + def _stack(container, items): + for item in items: + if not isinstance(item, (list, tuple)): + container.append(next(flattened_iter)) + else: + new_container = list() + container.append(new_container) + _stack(new_container, item) + + return container + + return _stack(result, raw) + + +def build_fuse_chunk( + fused_chunks: List[ChunkType], + fuse_op_cls: Type[OperandType], + op_kw: Dict = None, + chunk_kw: Dict = None, +) -> ChunkType: + from .core.graph import ChunkGraph + + fuse_graph = ChunkGraph(fused_chunks) + for i, fuse_chunk in enumerate(fused_chunks): + fuse_graph.add_node(fuse_chunk) + if i > 0: + fuse_graph.add_edge(fused_chunks[i - 1], fuse_chunk) + + head_chunk = fused_chunks[0] + tail_chunk = fused_chunks[-1] + tail_chunk_op = tail_chunk.op + fuse_op = fuse_op_cls( + sparse=tail_chunk_op.sparse, + gpu=tail_chunk_op.gpu, + _key=tail_chunk_op.key, + fuse_graph=fuse_graph, + **(op_kw or dict()), + ) + return fuse_op.new_chunk( + head_chunk.inputs, + kws=[tail_chunk.params], + _key=tail_chunk.key, + _chunk=tail_chunk, + **(chunk_kw or dict()), + ) + + +def adapt_mars_docstring(doc: str) -> str: + """ + Adapt numpy-style docstrings to Mars docstring. + + This util function will add Mars imports, replace object references + and add execute calls. Note that check is needed after replacement. + """ + if doc is None: + return None + + lines = [] + first_prompt = True + prev_prompt = False + has_numpy = "np." in doc + has_pandas = "pd." 
in doc + + for line in doc.splitlines(): + sp = line.strip() + if sp.startswith(">>>") or sp.startswith("..."): + prev_prompt = True + if first_prompt: + first_prompt = False + indent = "".join(itertools.takewhile(lambda x: x in (" ", "\t"), line)) + if has_numpy: + lines.extend([indent + ">>> import mars.tensor as mt"]) + if has_pandas: + lines.extend([indent + ">>> import mars.dataframe as md"]) + line = line.replace("np.", "mt.").replace("pd.", "md.") + elif prev_prompt: + prev_prompt = False + if sp: + lines[-1] += ".execute()" + lines.append(line) + return "\n".join(lines) + + +class FixedSizeFileObject: + def __init__(self, file_obj, fixed_size): + self._file_obj = file_obj + self._cur = self._file_obj.tell() + self._size = fixed_size + self._end = self._cur + self._size + + def _get_size(self, size): + max_size = self._end - self._cur + if size is None: + return max_size + else: + return min(max_size, size) + + def read(self, size=None): + result = self._file_obj.read(self._get_size(size)) + self._cur = self._file_obj.tell() + return result + + def read1(self, size=None): + return self.read(size) + + def readline(self, size=None): + result = self._file_obj.readline(self._get_size(size)) + self._cur = self._file_obj.tell() + return result + + def readlines(self, size=None): + result = self._file_obj.readlines(self._get_size(size)) + self._cur = self._file_obj.tell() + return result + + def seek(self, offset): + self._cur = offset + return self._file_obj.seek(offset) + + def tell(self): + return self._file_obj.tell() + + def __next__(self): + while True: + result = self.readline() + if len(result) == 0: + raise StopIteration + else: + return result + + def __iter__(self): + while True: + try: + yield next(self) + except StopIteration: + return + + def __getattr__(self, item): # pragma: no cover + return getattr(self._file_obj, item) + + +def is_object_dtype(dtype: np.dtype) -> bool: + try: + return ( + np.issubdtype(dtype, np.object_) + or np.issubdtype(dtype, np.unicode_) + or np.issubdtype(dtype, np.bytes_) + ) + except TypeError: # pragma: no cover + return False + + +def get_dtype(dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]): + if pd.api.types.is_extension_array_dtype(dtype): + return dtype + elif dtype is pd.Timestamp or dtype is datetime.datetime: + return np.dtype("datetime64[ns]") + elif dtype is pd.Timedelta or dtype is datetime.timedelta: + return np.dtype("timedelta64[ns]") + else: + return np.dtype(dtype) + + +def calc_object_overhead(chunk: ChunkType, shape: Tuple[int]) -> int: + from .dataframe.core import ( + DATAFRAME_CHUNK_TYPE, + INDEX_CHUNK_TYPE, + SERIES_CHUNK_TYPE, + ) + + if not shape or np.isnan(shape[0]) or getattr(chunk, "dtypes", None) is None: + return 0 + + if isinstance(chunk, DATAFRAME_CHUNK_TYPE) and chunk.dtypes is not None: + n_strings = len([dt for dt in chunk.dtypes if is_object_dtype(dt)]) + if chunk.index_value and is_object_dtype( + getattr(chunk.index_value.value, "dtype", None) + ): + n_strings += 1 + elif isinstance(chunk, SERIES_CHUNK_TYPE) and chunk.dtype is not None: + n_strings = 1 if is_object_dtype(chunk.dtype) else 0 + if chunk.index_value and is_object_dtype( + getattr(chunk.index_value.value, "dtype", None) + ): + n_strings += 1 + elif isinstance(chunk, INDEX_CHUNK_TYPE) and chunk.dtype is not None: + n_strings = 1 if is_object_dtype(chunk.dtype) else 0 + else: + n_strings = 0 + return n_strings * shape[0] * OBJECT_FIELD_OVERHEAD + + +def arrow_array_to_objects( + obj: Union[pd.DataFrame, pd.Series] +) -> 
Union[pd.DataFrame, pd.Series]: + from .dataframe.arrays import ArrowDtype + + if isinstance(obj, pd.DataFrame): + if any(isinstance(dt, ArrowDtype) for dt in obj.dtypes): + # ArrowDtype exists + result = pd.DataFrame(columns=obj.columns) + for i, dtype in enumerate(obj.dtypes): + if isinstance(dtype, ArrowDtype): + result.iloc[:, i] = pd.Series( + obj.iloc[:, i].to_numpy(), index=obj.index + ) + else: + result.iloc[:, i] = obj.iloc[:, i] + obj = result + elif isinstance(obj, pd.Series): + if isinstance(obj.dtype, ArrowDtype): + obj = pd.Series(obj.to_numpy(), index=obj.index, name=obj.name) + return obj + + +_enter_counter = 0 +_initial_session = None + + +def enter_current_session(func: Callable): + @functools.wraps(func) + def wrapped(cls, ctx, op): + from .deploy.oscar.session import AbstractSession, get_default_session + + global _enter_counter, _initial_session + # skip in some test cases + if not hasattr(ctx, "get_current_session"): + return func(cls, ctx, op) + + with AbstractSession._lock: + if _enter_counter == 0: + # to handle nested call, only set initial session + # in first call + session = ctx.get_current_session() + _initial_session = get_default_session() + session.as_default() + _enter_counter += 1 + + try: + result = func(cls, ctx, op) + finally: + with AbstractSession._lock: + _enter_counter -= 1 + if _enter_counter == 0: + # set previous session when counter is 0 + if _initial_session: + _initial_session.as_default() + else: + AbstractSession.reset_default() + return result + + return wrapped + + +_io_quiet_local = threading.local() +_io_quiet_lock = threading.Lock() + + +class _QuietIOWrapper: + def __init__(self, wrapped): + self.wrapped = wrapped + + def __getattr__(self, item): + return getattr(self.wrapped, item) + + def write(self, d): + if getattr(_io_quiet_local, "is_wrapped", False): + return 0 + return self.wrapped.write(d) + + +@contextmanager +def quiet_stdio(): + """Quiets standard outputs when inferring types of functions""" + with _io_quiet_lock: + _io_quiet_local.is_wrapped = True + sys.stdout = _QuietIOWrapper(sys.stdout) + sys.stderr = _QuietIOWrapper(sys.stderr) + + try: + yield + finally: + with _io_quiet_lock: + sys.stdout = sys.stdout.wrapped + sys.stderr = sys.stderr.wrapped + if not isinstance(sys.stdout, _QuietIOWrapper): + _io_quiet_local.is_wrapped = False + + +def implements(f: Callable): + def decorator(g): + g.__doc__ = f.__doc__ + return g + + return decorator + + +def stringify_path(path: Union[str, os.PathLike]) -> str: + """ + Convert *path* to a string or unicode path if possible. 
+ """ + if isinstance(path, str): + return path + + # checking whether path implements the filesystem protocol + try: + return path.__fspath__() + except AttributeError: + raise TypeError("not a path-like object") + + +def find_objects(nested: Union[List, Dict], types: Union[Type, Tuple[Type]]) -> List: + found = [] + stack = [nested] + + while len(stack) > 0: + it = stack.pop() + if isinstance(it, types): + found.append(it) + continue + + if isinstance(it, (list, tuple, set)): + stack.extend(list(it)[::-1]) + elif isinstance(it, dict): + stack.extend(list(it.values())[::-1]) + + return found + + +def replace_objects(nested: Union[List, Dict], mapping: Mapping) -> Union[List, Dict]: + if not mapping: + return nested + + if isinstance(nested, dict): + vals = list(nested.values()) + else: + vals = list(nested) + + new_vals = [] + for val in vals: + if isinstance(val, (dict, list, tuple, set)): + new_val = replace_objects(val, mapping) + else: + try: + new_val = mapping.get(val, val) + except TypeError: + new_val = val + new_vals.append(new_val) + + if isinstance(nested, dict): + return type(nested)((k, v) for k, v in zip(nested.keys(), new_vals)) + else: + return type(nested)(new_vals) + + +# from https://github.com/ericvsmith/dataclasses/blob/master/dataclass_tools.py +# released under Apache License 2.0 +def dataslots(cls): + # Need to create a new class, since we can't set __slots__ + # after a class has been created. + + # Make sure __slots__ isn't already set. + if "__slots__" in cls.__dict__: # pragma: no cover + raise TypeError(f"{cls.__name__} already specifies __slots__") + + # Create a new dict for our new class. + cls_dict = dict(cls.__dict__) + field_names = tuple(f.name for f in dataclasses.fields(cls)) + cls_dict["__slots__"] = field_names + for field_name in field_names: + # Remove our attributes, if present. They'll still be + # available in _MARKER. + cls_dict.pop(field_name, None) + # Remove __dict__ itself. + cls_dict.pop("__dict__", None) + # And finally create the class. 
+ qualname = getattr(cls, "__qualname__", None) + cls = type(cls)(cls.__name__, cls.__bases__, cls_dict) + if qualname is not None: + cls.__qualname__ = qualname + return cls + + +def get_chunk_params(chunk): + from .dataframe.core import ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + ) + + params = chunk.params.copy() + if isinstance( + chunk, + ( + DATAFRAME_CHUNK_TYPE, + DATAFRAME_GROUPBY_CHUNK_TYPE, + SERIES_GROUPBY_CHUNK_TYPE, + ), + ): + # dataframe chunk needs some special process for now + params.pop("columns_value", None) + params.pop("dtypes", None) + params.pop("key_dtypes", None) + return params + + +# Please refer to https://bugs.python.org/issue41451 +try: + + class _Dummy(ABC): + __slots__ = ("__weakref__",) + + abc_type_require_weakref_slot = True +except TypeError: + abc_type_require_weakref_slot = False + + +def patch_asyncio_task_create_time(): # pragma: no cover + new_loop = False + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + new_loop = True + loop_class = loop.__class__ + # Save raw loop_class.create_task and make multiple apply idempotent + loop_create_task = getattr( + patch_asyncio_task_create_time, "loop_create_task", loop_class.create_task + ) + patch_asyncio_task_create_time.loop_create_task = loop_create_task + + def new_loop_create_task(*args, **kwargs): + task = loop_create_task(*args, **kwargs) + task.__mars_asyncio_task_create_time__ = time.time() + return task + + if loop_create_task is not new_loop_create_task: + loop_class.create_task = new_loop_create_task + if not new_loop and loop.create_task is not new_loop_create_task: + loop.create_task = functools.partial(new_loop_create_task, loop) + + +async def asyncio_task_timeout_detector( + check_interval: int, task_timeout_seconds: int, task_exclude_filters: List[str] +): + task_exclude_filters.append("asyncio_task_timeout_detector") + while True: # pragma: no cover + await asyncio.sleep(check_interval) + loop = asyncio.get_running_loop() + current_time = ( + time.time() + ) # avoid invoke `time.time()` frequently if we have plenty of unfinished tasks. + for task in asyncio.all_tasks(loop=loop): + # Some task may be create before `patch_asyncio_task_create_time` applied, take them as never timeout. + create_time = getattr( + task, "__mars_asyncio_task_create_time__", current_time + ) + if current_time - create_time >= task_timeout_seconds: + stack = io.StringIO() + task.print_stack(file=stack) + task_str = str(task) + if any( + excluded_task in task_str for excluded_task in task_exclude_filters + ): + continue + logger.warning( + """Task %s in event loop %s doesn't finish in %s seconds. %s""", + task, + loop, + time.time() - create_time, + stack.getvalue(), + ) + + +def register_asyncio_task_timeout_detector( + check_interval: int = None, + task_timeout_seconds: int = None, + task_exclude_filters: List[str] = None, +) -> Optional[asyncio.Task]: # pragma: no cover + """Register a asyncio task which print timeout task periodically.""" + check_interval = check_interval or int( + os.environ.get("MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL", -1) + ) + if check_interval > 0: + patch_asyncio_task_create_time() + task_timeout_seconds = task_timeout_seconds or int( + os.environ.get("MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_SECONDS", check_interval) + ) + if not task_exclude_filters: + # Ignore mars/oscar by default since it has some long-running coroutines. 
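+# The timeout detector above is opt-in; a sketch of enabling it through the
+# environment variables read in this function (values are illustrative, and
+# the call must be made from a running event loop):
+#
+#     >>> os.environ["MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_CHECK_INTERVAL"] = "30"
+#     >>> os.environ["MARS_DEBUG_ASYNCIO_TASK_TIMEOUT_SECONDS"] = "300"
+#     >>> register_asyncio_task_timeout_detector()  # logs tasks running > 300s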
+ task_exclude_filters = os.environ.get( + "MARS_DEBUG_ASYNCIO_TASK_EXCLUDE_FILTERS", "mars/oscar" + ) + task_exclude_filters = task_exclude_filters.split(";") + if sys.version_info[:2] < (3, 7): + logger.warning( + "asyncio tasks timeout detector is not supported under python %s", + sys.version, + ) + else: + loop = asyncio.get_running_loop() + logger.info( + "Create asyncio tasks timeout detector with check_interval %s task_timeout_seconds %s " + "task_exclude_filters %s", + check_interval, + task_timeout_seconds, + task_exclude_filters, + ) + return loop.create_task( + asyncio_task_timeout_detector( + check_interval, task_timeout_seconds, task_exclude_filters + ) + ) + else: + return None + + +def ensure_own_data(data: np.ndarray) -> np.ndarray: + if not isinstance(data, np.ndarray): + return data + if not data.flags["OWNDATA"]: + return data.copy() + else: + return data + + +def get_chunk_key_to_data_keys(chunk_graph): + from .core.operand import FetchShuffle, MapReduceOperand, OperandStage + + chunk_key_to_data_keys = dict() + for chunk in chunk_graph: + if chunk.key in chunk_key_to_data_keys: + continue + if not isinstance(chunk.op, FetchShuffle): + chunk_key_to_data_keys[chunk.key] = [chunk.key] + else: + keys = [] + for succ in chunk_graph.iter_successors(chunk): + if ( + isinstance(succ.op, MapReduceOperand) + and succ.op.stage == OperandStage.reduce + ): + for key in succ.op.get_dependent_data_keys(): + if key not in keys: + keys.append(key) + chunk_key_to_data_keys[chunk.key] = keys + return chunk_key_to_data_keys + + +def merge_dict(dest: Dict, src: Dict, path=None, overwrite=True): + """ + Merges src dict into dest dict. + + Parameters + ---------- + dest: Dict + dest dict + src: Dict + source dict + path: List + merge path + overwrite: bool + Whether overwrite dest dict when where is a conflict + Returns + ------- + Dict + Updated dest dict + """ + if path is None: + path = [] + for key in src: + if key in dest: + if isinstance(dest[key], Dict) and isinstance(src[key], Dict): + merge_dict(dest[key], src[key], path + [str(key)], overwrite=overwrite) + elif dest[key] == src[key]: + pass # same leaf value + elif overwrite: + dest[key] = src[key] + else: + raise ValueError("Conflict at %s" % ".".join(path + [str(key)])) + else: + dest[key] = src[key] + return dest + + +def flatten_dict_to_nested_dict(flatten_dict: Dict, sep=".") -> Dict: + """ + Return nested dict from flatten dict. + + Parameters + ---------- + flatten_dict: Dict + sep: str + flatten key separator + + Returns + ------- + Dict + Nested dict + """ + assert all(isinstance(k, str) for k in flatten_dict.keys()) + nested_dict = dict() + # longest path first to avoid shorter path has a leaf key with value dict + # as sub dict by mistake. 
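+# merge_dict defined above merges recursively; a hypothetical example:
+#
+#     >>> dest = {"a": {"x": 1}, "b": 2}
+#     >>> merge_dict(dest, {"a": {"y": 3}, "b": 20})
+#     {'a': {'x': 1, 'y': 3}, 'b': 20}
+#     >>> merge_dict(dest, {"b": 99}, overwrite=False)
+#     Traceback (most recent call last):
+#         ...
+#     ValueError: Conflict at b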
+ keys = sorted(flatten_dict.keys(), key=lambda k: -len(k.split(sep))) + for k in keys: + sub_keys = k.split(sep) + sub_nested_dict = nested_dict + for i, sub_key in enumerate(sub_keys): + if i == len(sub_keys) - 1: + if sub_key in sub_nested_dict: + raise ValueError(f"Key {k} conflict in sub key {sub_key}.") + sub_nested_dict[sub_key] = flatten_dict[k] + else: + if sub_key not in sub_nested_dict: + new_sub_nested_dict = dict() + sub_nested_dict[sub_key] = new_sub_nested_dict + sub_nested_dict = new_sub_nested_dict + else: + sub_nested_dict = sub_nested_dict[sub_key] + return nested_dict + + +def is_full_slice(slc: Any) -> bool: + """Check if the input is a full slice ((:) or (0:))""" + return ( + isinstance(slc, slice) + and (slc.start == 0 or slc.start is None) + and slc.stop is None + and slc.step is None + ) + + +def wrap_exception( + exc: Exception, + bases: Tuple[Type] = None, + wrap_name: str = None, + message: str = None, + traceback: Optional[TracebackType] = None, + attr_dict: dict = None, +): + """Generate an exception wraps the cause exception.""" + + def __init__(self): + pass + + def __getattr__(self, item): + return getattr(exc, item) + + def __str__(self): + return message or super(type(self), self).__str__() + + traceback = traceback or exc.__traceback__ + bases = bases or () + attr_dict = attr_dict or {} + attr_dict.update( + { + "__init__": __init__, + "__getattr__": __getattr__, + "__str__": __str__, + "__wrapname__": wrap_name, + "__wrapped__": exc, + "__module__": type(exc).__module__, + "__cause__": exc.__cause__, + "__context__": exc.__context__, + "__suppress_context__": exc.__suppress_context__, + "args": exc.args, + } + ) + new_exc_type = type(type(exc).__name__, bases + (type(exc),), attr_dict) + return new_exc_type().with_traceback(traceback) + + +_func_token_cache = weakref.WeakKeyDictionary() + + +def get_func_token(func): + try: + token = _func_token_cache.get(func) + if token is None: + fields = _get_func_token_values(func) + token = tokenize(*fields) + _func_token_cache[func] = token + return token + except TypeError: # cannot create weak reference to func like 'numpy.ufunc' + return tokenize(*_get_func_token_values(func)) + + +def _get_func_token_values(func): + if hasattr(func, "__code__"): + tokens = [func.__code__.co_code] + if func.__closure__ is not None: + cvars = tuple([x.cell_contents for x in func.__closure__]) + tokens.append(cvars) + return tokens + else: + tokens = [] + while isinstance(func, functools.partial): + tokens.extend([func.args, func.keywords]) + func = func.func + if hasattr(func, "__code__"): + tokens.extend(_get_func_token_values(func)) + elif isinstance(func, types.BuiltinFunctionType): + tokens.extend([func.__module__, func.__name__]) + else: + tokens.append(func) + return tokens + + +async def _run_task_with_error_log( + coro, call_site=None, exit_if_exception=False +): # pragma: no cover + try: + return await coro + except asyncio.CancelledError: + raise + except Exception as e: + logger.exception( + "Coroutine %r at call_site %s execution got exception %s.", + coro, + call_site, + e, + ) + if exit_if_exception: + logger.error("Exit because exit_if_exception=%s.", exit_if_exception) + os._exit(-1) # Use os._exit to ensure exit in non-main thread. 
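+# is_full_slice defined above accepts both spellings of a full slice:
+#
+#     >>> is_full_slice(slice(None))       # [:]
+#     True
+#     >>> is_full_slice(slice(0, None))    # [0:]
+#     True
+#     >>> is_full_slice(slice(1, None))
+#     False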
+ raise + + +def create_task_with_error_log(coro, *args, **kwargs): # pragma: no cover + frame = inspect.currentframe() + if frame and frame.f_back: + call_site = frame.f_back.f_code + else: + call_site = None + return _create_task(_run_task_with_error_log(coro, call_site), *args, **kwargs) + + +def aiotask_wrapper(_f=None, exit_if_exception=False): + def _wrapper(func): + @functools.wraps(func) + def _aiotask_wrapper(*args, **kwargs): + frame = inspect.currentframe() + if frame and frame.f_back: + call_site = frame.f_back.f_code + else: + call_site = None + return _run_task_with_error_log( + func(*args, **kwargs), + call_site=call_site, + exit_if_exception=exit_if_exception, + ) + + return _aiotask_wrapper + + if inspect.iscoroutinefunction(_f): + return _wrapper(_f) + else: + assert _f is None + return _wrapper + + +def is_ray_address(address: str) -> bool: + from .oscar.backends.ray.communication import RayServer + + if urlparse(address).scheme == RayServer.scheme: + return True + else: + return False + + +# TODO: clean_up_func, is_on_ray and restore_func functions may be +# removed or refactored in the future to calculate func size +# with more accuracy as well as address some serialization issues. +def is_on_ray(ctx): + from .services.task.execution.ray.context import ( + RayExecutionContext, + RayExecutionWorkerContext, + ) + + # There are three conditions + # a. mars backend + # b. ray backend(oscar), c. ray backend(dag) + # When a. or b. is selected, ctx is an instance of ThreadedServiceContext. + # The main difference between them is whether worker address matches ray scheme. + # To avoid duplicated checks, here we choose the first worker address. + # When c. is selected, ctx is an instance of RayExecutionContext or RayExecutionWorkerContext, + # while get_worker_addresses method isn't currently implemented in RayExecutionWorkerContext. 
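+# is_ray_address defined above keys off the URL scheme (assuming the Ray
+# backend's ``ray://`` scheme; addresses are hypothetical):
+#
+#     >>> is_ray_address("ray://ray-cluster/0")
+#     True
+#     >>> is_ray_address("127.0.0.1:16931")
+#     False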
+ try: + worker_addresses = ctx.get_worker_addresses() + except AttributeError: # pragma: no cover + assert isinstance(ctx, RayExecutionWorkerContext) + return True + return isinstance(ctx, RayExecutionContext) or is_ray_address(worker_addresses[0]) + + +def cache_tileables(*tileables): + from .core import ENTITY_TYPE + + if len(tileables) == 1 and isinstance(tileables[0], (tuple, list)): + tileables = tileables[0] + for t in tileables: + if isinstance(t, ENTITY_TYPE): + t.cache = True + + +class TreeReductionBuilder: + def __init__(self, combine_size=None): + from .config import options + + self._combine_size = combine_size or options.combine_size + + def _build_reduction(self, inputs, final=False): + raise NotImplementedError + + def build(self, inputs): + combine_size = self._combine_size + while len(inputs) > self._combine_size: + new_inputs = [] + for i in range(0, len(inputs), combine_size): + objs = inputs[i : i + combine_size] + if len(objs) == 1: + obj = objs[0] + else: + obj = self._build_reduction(objs, final=False) + new_inputs.append(obj) + inputs = new_inputs + + if len(inputs) == 1: + return inputs[0] + return self._build_reduction(inputs, final=True) + + +def ensure_coverage(): + # make sure coverage is handled when starting with subprocess.Popen + if ( + not sys.platform.startswith("win") and "COV_CORE_SOURCE" in os.environ + ): # pragma: no cover + try: + from pytest_cov.embed import cleanup_on_sigterm + except ImportError: + pass + else: + cleanup_on_sigterm() + + +@functools.lru_cache(100) +def sync_to_async(func): + if inspect.iscoroutinefunction(func): + return func + else: + # Wrap the sync call to thread to avoid blocking the + # asyncio event loop. e.g. acquiring a threading.Lock() + # in the sync call. + return functools.partial(asyncio.to_thread, func) + + +def retry_callable( + callable_, + ex_type: type = Exception, + wait_interval=1, + max_retries=-1, + sync: bool = None, +): + if inspect.iscoroutinefunction(callable_) or sync is False: + + @functools.wraps(callable) + async def retry_call(*args, **kwargs): + num_retried = 0 + while max_retries < 0 or num_retried < max_retries: + num_retried += 1 + try: + return await callable_(*args, **kwargs) + except ex_type: + await asyncio.sleep(wait_interval) + + else: + + @functools.wraps(callable) + def retry_call(*args, **kwargs): + num_retried = 0 + ex = None + while max_retries < 0 or num_retried < max_retries: + num_retried += 1 + try: + return callable_(*args, **kwargs) + except ex_type as e: + ex = e + time.sleep(wait_interval) + assert ex is not None + raise ex # pylint: disable-msg=E0702 + + return retry_call + + +def clean_mars_tmp_dir(): + # clean Mars log file and Mars tmp dir + filename = os.environ.get(MARS_LOG_PATH_KEY) + if filename is not None: + os.environ.pop(MARS_LOG_PATH_KEY) + if os.path.exists(filename): + mars_tmp_dir = os.path.dirname(filename) + if os.path.exists(mars_tmp_dir): + # on windows platform, raise Permission Error + _windows: bool = sys.platform.startswith("win") + shutil.rmtree(mars_tmp_dir, ignore_errors=_windows) diff --git a/python/xorbits/_mars/worker.py b/python/xorbits/_mars/worker.py new file mode 100644 index 000000000..3bb850485 --- /dev/null +++ b/python/xorbits/_mars/worker.py @@ -0,0 +1,23 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# shortcut to support +# python -m mars.worker + +from .deploy.oscar.worker import main +from .utils import ensure_coverage + +if __name__ == "__main__": + ensure_coverage() + main() diff --git a/python/xorbits/core/adapter.py b/python/xorbits/core/adapter.py index 2089035fd..a192226fe 100644 --- a/python/xorbits/core/adapter.py +++ b/python/xorbits/core/adapter.py @@ -243,7 +243,7 @@ def to_mars(inp: Union[DataRef, Tuple, List, Dict]): if isinstance(inp, DataRef): mars_entity = getattr(inp.data, "_mars_entity", None) if mars_entity is None: # pragma: no cover - raise TypeError(f"Can't covert {inp} to mars entity") + raise TypeError(f"Can't convert {inp} to mars entity") conditions = _TO_MARS_EXECUTION_CONDITION[type(mars_entity).__name__] for cond in conditions: if cond(mars_entity): diff --git a/python/xorbits/core/data.py b/python/xorbits/core/data.py index 47a8ed6ec..877b15547 100644 --- a/python/xorbits/core/data.py +++ b/python/xorbits/core/data.py @@ -185,7 +185,7 @@ def _own_data(self): def __iter__(self): # Mars entity hasn't implemented __iter__, however `iter(mars_entity)` # still works, it's because iteration is supported by `__getitem__` that - # accepts intergers 0,1,.., it can be seen as a "legacy feature" that not + # accepts integers 0,1,.., it can be seen as a "legacy feature" that not # recommended. Here we implement __iter__ for some data types, others keep # behaviors with Mars. if self._own_data(): diff --git a/python/xorbits/deploy/__init__.py b/python/xorbits/deploy/__init__.py index b43f33025..7f3e5d391 100644 --- a/python/xorbits/deploy/__init__.py +++ b/python/xorbits/deploy/__init__.py @@ -51,7 +51,7 @@ def init( session_id: str, optional Session ID, if not specified, a new ID will be auto generated. timeout: float - Timeout about creating a new runtime or connecting to an exising cluster. + Timeout about creating a new runtime or connecting to an existing cluster. n_worker: int, optional How many workers to start when creating a local runtime. diff --git a/third_party/_mars b/third_party/_mars deleted file mode 160000 index a99b5a1d2..000000000 --- a/third_party/_mars +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a99b5a1d2e1183a58d771d56a3aa57196417cb5c